public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	alan@lxorguk.ukuu.org.uk, Peng Tao <tao.peng@emc.com>,
	Trond Myklebust <Trond.Myklebust@netapp.com>
Subject: [ 02/76] pnfsblock: fix partial page buffer write
Date: Thu, 18 Oct 2012 19:46:26 -0700	[thread overview]
Message-ID: <20121019024350.466211922@linuxfoundation.org> (raw)
In-Reply-To: <20121019024350.087156547@linuxfoundation.org>

3.6-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Peng Tao <bergwolf@gmail.com>

commit fe6e1e8d9fad86873eb74a26e80a8f91f9e870b5 upstream.

If an application uses flock to protect its write range, generic NFS
will not do a read-modify-write cycle at the page cache level. Therefore
the LD should know how to handle non-sector-aligned writes; otherwise
there will be data corruption.

Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 fs/nfs/blocklayout/blocklayout.c |  177 ++++++++++++++++++++++++++++++++++++---
 fs/nfs/blocklayout/blocklayout.h |    1 
 2 files changed, 166 insertions(+), 12 deletions(-)

--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
 				      sector_t isect, struct page *page,
 				      struct pnfs_block_extent *be,
 				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par)
+				      struct parallel_io *par,
+				      unsigned int offset, int len)
 {
+	isect = isect + (offset >> SECTOR_SHIFT);
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, len);
 retry:
 	if (!bio) {
 		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, len, offset) < len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+				  end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -461,6 +475,106 @@ map_block(struct buffer_head *bh, sector
 	return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		bio_put(bio);
+		return -EIO;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+	__free_page(shadow_page);
+	bio_put(bio);
+
+	return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
@@ -494,7 +608,6 @@ init_page_for_write(struct page *page, s
 	SetPageUptodate(page);
 
 cleanup:
-	bl_put_extent(cow_read);
 	if (bh)
 		free_buffer_head(bh);
 	if (ret) {
@@ -566,6 +679,7 @@ bl_write_pagelist(struct nfs_write_data
 	struct parallel_io *par = NULL;
 	loff_t offset = wdata->args.offset;
 	size_t count = wdata->args.count;
+	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = wdata->args.pages;
 	struct page *page;
 	pgoff_t index;
@@ -674,10 +788,11 @@ next_page:
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
+			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, NULL);
+					     isect, &cow_read);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -694,7 +809,26 @@ next_page:
 			extent_length = be->be_length -
 			    (isect - be->be_f_offset);
 		}
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+		pg_offset = offset & ~PAGE_CACHE_MASK;
+		if (pg_offset + count > PAGE_CACHE_SIZE)
+			pg_len = PAGE_CACHE_SIZE - pg_offset;
+		else
+			pg_len = count;
+
+		saved_len = pg_len;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    !bl_is_sector_init(be->be_inval, isect)) {
+			ret = bl_read_partial_page_sync(pages[i], cow_read,
+							pg_offset, pg_len, true);
+			if (ret) {
+				dprintk("%s bl_read_partial_page_sync fail %d\n",
+					__func__, ret);
+				header->pnfs_error = ret;
+				goto out;
+			}
+
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -703,15 +837,35 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
+
+			/* Expand to full page write */
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		} else if  ((pg_offset & (SECTOR_SIZE - 1)) ||
+			    (pg_len & (SECTOR_SIZE - 1))){
+			/* ahh, nasty case. We have to do sync full sector
+			 * read-modify-write cycles.
+			 */
+			unsigned int saved_offset = pg_offset;
+			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+							pg_len, false);
+			pg_offset = round_down(pg_offset, SECTOR_SIZE);
+			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+				 - pg_offset;
 		}
-		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
-					 bl_end_io_write, par);
+					 bl_end_io_write, par,
+					 pg_offset, pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
+		offset += saved_len;
+		count -= saved_len;
 		isect += PAGE_CACHE_SECTORS;
 		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
@@ -729,17 +883,16 @@ next_page:
 	}
 
 write_done:
-	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-	if (count < wdata->res.count) {
-		wdata->res.count = count;
-	}
+	wdata->res.count = wdata->args.count;
 out:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 out_mds:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
 
 #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
 struct block_mount_id {
 	spinlock_t			bm_lock;    /* protects list */



  parent reply	other threads:[~2012-10-19  3:09 UTC|newest]

Thread overview: 83+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-19  2:46 [ 00/76] 3.6.3-stable review Greg Kroah-Hartman
2012-10-19  2:46 ` [ 01/76] ARM: vfp: fix saving d16-d31 vfp registers on v6+ kernels Greg Kroah-Hartman
2012-10-19  2:46 ` Greg Kroah-Hartman [this message]
2012-10-19  2:46 ` [ 03/76] NFS41: fix error of setting blocklayoutdriver Greg Kroah-Hartman
2012-10-19  2:46 ` [ 04/76] NFS: Remove bad delegations during open recovery Greg Kroah-Hartman
2012-10-19  2:46 ` [ 05/76] nfsd4: dont pin clientids to pseudoflavors Greg Kroah-Hartman
2012-10-19  2:46 ` [ 06/76] nfsd4: fix nfs4 stateid leak Greg Kroah-Hartman
2012-10-19  2:46 ` [ 07/76] NFSD: pass null terminated buf to kstrtouint() Greg Kroah-Hartman
2012-10-19  2:46 ` [ 08/76] lockd: per-net NSM client creation and destruction helpers introduced Greg Kroah-Hartman
2012-10-19  2:46 ` [ 09/76] lockd: use rpc clients cl_nodename for id encoding Greg Kroah-Hartman
2012-10-19 22:10   ` Ben Hutchings
2012-10-19 22:13     ` Ben Hutchings
2012-10-19  2:46 ` [ 10/76] lockd: create and use per-net NSM RPC clients on MON/UNMON requests Greg Kroah-Hartman
2012-10-19  2:46 ` [ 11/76] ACPI: EC: Make the GPE storm threshold a module parameter Greg Kroah-Hartman
2012-10-19  2:46 ` [ 12/76] ACPI: EC: Add a quirk for CLEVO M720T/M730T laptop Greg Kroah-Hartman
2012-10-19  2:46 ` [ 13/76] ALSA: hda - Add missing hda_gen_spec to struct via_spec Greg Kroah-Hartman
2012-10-19  2:46 ` [ 14/76] ALSA: hda - do not detect jack on internal speakers for Realtek Greg Kroah-Hartman
2012-10-19  2:46 ` [ 15/76] ALSA: hda - Fix memory leaks at error path in patch_cirrus.c Greg Kroah-Hartman
2012-10-19  2:46 ` [ 16/76] mips,kgdb: fix recursive page fault with CONFIG_KPROBES Greg Kroah-Hartman
2012-10-19  2:46 ` [ 17/76] tmpfs,ceph,gfs2,isofs,reiserfs,xfs: fix fh_len checking Greg Kroah-Hartman
2012-10-19  2:46 ` [ 18/76] iscsi-target: Correctly set 0xffffffff field within ISCSI_OP_REJECT PDU Greg Kroah-Hartman
2012-10-19  2:46 ` [ 19/76] iscsit: remove incorrect unlock in iscsit_build_sendtargets_resp Greg Kroah-Hartman
2012-10-19  2:46 ` [ 20/76] iscsi-target: Add explicit set of cache_dynamic_acls=1 for TPG demo-mode Greg Kroah-Hartman
2012-10-19  2:46 ` [ 21/76] iscsi-target: Bump defaults for nopin_timeout + nopin_response_timeout values Greg Kroah-Hartman
2012-10-19  2:46 ` [ 22/76] SCSI: storvsc: Account for in-transit packets in the RESET path Greg Kroah-Hartman
2012-10-19  2:46 ` [ 23/76] SCSI: scsi_debug: Fix off-by-one bug when unmapping region Greg Kroah-Hartman
2012-10-19  2:46 ` [ 24/76] SCSI: virtio-scsi: initialize scatterlist structure Greg Kroah-Hartman
2012-10-19  2:46 ` [ 25/76] ARM: 7541/1: Add ARM ERRATA 775420 workaround Greg Kroah-Hartman
2012-10-19  2:46 ` [ 26/76] ARM: OMAP: counter: add locking to read_persistent_clock Greg Kroah-Hartman
2012-10-19  2:46 ` [ 27/76] firewire: cdev: fix user memory corruption (i386 userland on amd64 kernel) Greg Kroah-Hartman
2012-10-19  2:46 ` [ 28/76] SUNRPC: Ensure that the TCP socket is closed when in CLOSE_WAIT Greg Kroah-Hartman
2012-10-19  2:46 ` [ 29/76] target: support zero allocation length in INQUIRY Greg Kroah-Hartman
2012-10-19  2:46 ` [ 30/76] target: fix truncation of mode data, support zero allocation length Greg Kroah-Hartman
2012-10-19  2:46 ` [ 31/76] target: fix return code in target_core_init_configfs error path Greg Kroah-Hartman
2012-10-19  2:46 ` [ 32/76] target/file: Re-enable optional fd_buffered_io=1 operation Greg Kroah-Hartman
2012-10-19  2:46 ` [ 33/76] qla2xxx: Fix endianness of task management response code Greg Kroah-Hartman
2012-10-19  2:46 ` [ 34/76] vfio: Move PCI INTx eventfd setting earlier Greg Kroah-Hartman
2012-10-19  2:46 ` [ 35/76] vfio: Fix PCI INTx disable consistency Greg Kroah-Hartman
2012-10-19  2:47 ` [ 36/76] xen/pv-on-hvm kexec: add quirk for Xen 3.4 and shutdown watches Greg Kroah-Hartman
2012-10-19  2:47 ` [ 37/76] xen/bootup: allow {read|write}_cr8 pvops call Greg Kroah-Hartman
2012-10-19  2:47 ` [ 38/76] xen/bootup: allow read_tscp call for Xen PV guests Greg Kroah-Hartman
2012-10-19  2:47 ` [ 39/76] block: fix request_queue->flags initialization Greg Kroah-Hartman
2012-10-19  2:47 ` [ 40/76] autofs4 - fix reset pending flag on mount fail Greg Kroah-Hartman
2012-10-19  2:47 ` [ 41/76] module: taint kernel when lve module is loaded Greg Kroah-Hartman
2012-10-19  2:47 ` [ 42/76] video/udlfb: fix line counting in fb_write Greg Kroah-Hartman
2012-10-19  2:47 ` [ 43/76] viafb: dont touch clock state on OLPC XO-1.5 Greg Kroah-Hartman
2012-10-19  2:47 ` [ 44/76] timekeeping: Cast raw_interval to u64 to avoid shift overflow Greg Kroah-Hartman
2012-10-19  2:47 ` [ 45/76] timers: Fix endless looping between cascade() and internal_add_timer() Greg Kroah-Hartman
2012-10-19  2:47 ` [ 46/76] nohz: Fix one jiffy count too far in idle cputime Greg Kroah-Hartman
2012-10-19  2:47 ` [ 47/76] ath9k: use ieee80211_free_txskb Greg Kroah-Hartman
2012-10-19  2:47 ` [ 48/76] mac80211: use ieee80211_free_txskb to fix possible skb leaks Greg Kroah-Hartman
2012-10-19  2:47 ` [ 49/76] md/raid10: use correct limit variable Greg Kroah-Hartman
2012-10-19  2:47 ` [ 50/76] kdb,vt_console: Fix missed data due to pager overruns Greg Kroah-Hartman
2012-10-19  2:47 ` [ 51/76] pktgen: fix crash when generating IPv6 packets Greg Kroah-Hartman
2012-10-19  2:47 ` [ 52/76] MIPS: ath79: Fix CPU/DDR frequency calculation for SRIF PLLs Greg Kroah-Hartman
2012-10-19  2:47 ` [ 53/76] kbuild: Fix accidental revert in commit fe04ddf Greg Kroah-Hartman
2012-10-19  2:47 ` [ 54/76] Add CDC-ACM support for the CX93010-2x UCMxx USB Modem Greg Kroah-Hartman
2012-10-19  2:47 ` [ 55/76] fs: handle failed audit_log_start properly Greg Kroah-Hartman
2012-10-19  2:47 ` [ 56/76] fs: prevent use after free in auditing when symlink following was denied Greg Kroah-Hartman
2012-10-19  2:47 ` [ 57/76] drm/radeon: Dont destroy I2C Bus Rec in radeon_ext_tmds_enc_destroy() Greg Kroah-Hartman
2012-10-19  2:47 ` [ 58/76] drm/i915: remove useless BUG_ON which caused a regression in 3.5 Greg Kroah-Hartman
2012-10-19  2:47 ` [ 59/76] drm/i915: Set guardband clipping workaround bit in the right register Greg Kroah-Hartman
2012-10-19  2:47 ` [ 60/76] drm/nouveau/bios: fix shadowing of ACPI ROMs larger than 64KiB Greg Kroah-Hartman
2012-10-19 16:01   ` Heinz Diehl
2012-10-19 17:48     ` Greg Kroah-Hartman
2012-10-19 19:11       ` Heinz Diehl
2012-10-21 16:31         ` Greg Kroah-Hartman
2012-10-19  2:47 ` [ 61/76] drm/i915: use adjusted_mode instead of mode for checking the 6bpc force flag Greg Kroah-Hartman
2012-10-19  2:47 ` [ 62/76] mcs7830: Fix link state detection Greg Kroah-Hartman
2012-10-19  2:47 ` [ 63/76] jbd: Fix assertion failure in commit code due to lacking transaction credits Greg Kroah-Hartman
2012-10-19  2:47 ` [ 64/76] mtd: nand: allow NAND_NO_SUBPAGE_WRITE to be set from driver Greg Kroah-Hartman
2012-10-19  2:47 ` [ 65/76] e1000e: Change wthresh to 1 to avoid possible Tx stalls Greg Kroah-Hartman
2012-10-19  2:47 ` [ 66/76] tpm: Propagate error from tpm_transmit to fix a timeout hang Greg Kroah-Hartman
2012-10-19  2:47 ` [ 67/76] usb: gadget: at91_udc: fix dt support Greg Kroah-Hartman
2012-10-19  2:47 ` [ 68/76] ALSA: hda - Fix registration race of VGA switcheroo Greg Kroah-Hartman
2012-10-19  2:47 ` [ 69/76] ALSA: hda - Stop LPIB delay counting on broken hardware Greg Kroah-Hartman
2012-10-19  2:47 ` [ 70/76] ALSA: hda - Always check array bounds in alc_get_line_out_pfx Greg Kroah-Hartman
2012-10-19  2:47 ` [ 71/76] ASoC: fsi: dont reschedule DMA from an atomic context Greg Kroah-Hartman
2012-10-19  2:47 ` [ 72/76] ASoC: wm2200: Use rev A register patches on rev B Greg Kroah-Hartman
2012-10-19  2:47 ` [ 73/76] ASoC: wm2200: Fix non-inverted OUT2 mute control Greg Kroah-Hartman
2012-10-19  2:47 ` [ 74/76] ASoC: omap-abe-twl6040: Fix typo of Vibrator Greg Kroah-Hartman
2012-10-19  2:47 ` [ 75/76] ALSA: ac97 - Fix missing NULL check in snd_ac97_cvol_new() Greg Kroah-Hartman
2012-10-19  2:47 ` [ 76/76] ALSA: emu10k1: add chip details for E-mu 1010 PCIe card Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20121019024350.466211922@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=Trond.Myklebust@netapp.com \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=tao.peng@emc.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.