* [PATCH 0/4] pnfsblock: fix IO alignment bug
@ 2012-08-08 1:54 Peng Tao
2012-08-08 1:54 ` [PATCH 1/4] Revert "pnfsblock: bail out partial page IO" Peng Tao
2012-08-08 1:54 ` [PATCH 2/4] pnfsblock: fix partial page buffer write Peng Tao
0 siblings, 2 replies; 4+ messages in thread
From: Peng Tao @ 2012-08-08 1:54 UTC
To: Trond.Myklebust; +Cc: linux-nfs
Peng Tao (4):
Revert "pnfsblock: bail out partial page IO"
pnfsblock: fix partial page buffer write
pnfsblock: fix non-aligned DIO read
pnfsblock: fix non-aligned DIO write
fs/nfs/blocklayout/blocklayout.c | 270 ++++++++++++++++++++++++++++++++-----
fs/nfs/blocklayout/blocklayout.h | 1 +
2 files changed, 234 insertions(+), 37 deletions(-)
* [PATCH 1/4] Revert "pnfsblock: bail out partial page IO"
2012-08-08 1:54 [PATCH 0/4] pnfsblock: fix IO alignment bug Peng Tao
@ 2012-08-08 1:54 ` Peng Tao
2012-08-08 1:54 ` [PATCH 2/4] pnfsblock: fix partial page buffer write Peng Tao
1 sibling, 0 replies; 4+ messages in thread
From: Peng Tao @ 2012-08-08 1:54 UTC
To: Trond.Myklebust; +Cc: linux-nfs
This reverts commit 159e0561e322dd8008fff59e36efff8d2bdd0b0e
("pnfsblock: bail out partial page IO") in favor of a complete fix
for the IO alignment problem.
Signed-off-by: Peng Tao <tao.peng@emc.com>
---
fs/nfs/blocklayout/blocklayout.c | 39 ++-----------------------------------
1 files changed, 3 insertions(+), 36 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index dd392ed..7ae8a60 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -228,14 +228,6 @@ bl_end_par_io_read(void *data, int unused)
schedule_work(&rdata->task.u.tk_work);
}
-static bool
-bl_check_alignment(u64 offset, u32 len, unsigned long blkmask)
-{
- if ((offset & blkmask) || (len & blkmask))
- return false;
- return true;
-}
-
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
@@ -252,9 +244,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
- if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK))
- goto use_mds;
-
par = alloc_parallel(rdata);
if (!par)
goto use_mds;
@@ -563,7 +552,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
struct bio *bio = NULL;
struct pnfs_block_extent *be = NULL, *cow_read = NULL;
sector_t isect, last_isect = 0, extent_length = 0;
- struct parallel_io *par = NULL;
+ struct parallel_io *par;
loff_t offset = wdata->args.offset;
size_t count = wdata->args.count;
struct page **pages = wdata->args.pages;
@@ -574,10 +563,6 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
- /* Check for alignment first */
- if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK))
- goto out_mds;
-
/* At this point, wdata->pages is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
@@ -1011,32 +996,14 @@ bl_clear_layoutdriver(struct nfs_server *server)
return 0;
}
-static void
-bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
- nfs_pageio_reset_read_mds(pgio);
- else
- pnfs_generic_pg_init_read(pgio, req);
-}
-
-static void
-bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
- nfs_pageio_reset_write_mds(pgio);
- else
- pnfs_generic_pg_init_write(pgio, req);
-}
-
static const struct nfs_pageio_ops bl_pg_read_ops = {
- .pg_init = bl_pg_init_read,
+ .pg_init = pnfs_generic_pg_init_read,
.pg_test = pnfs_generic_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
};
static const struct nfs_pageio_ops bl_pg_write_ops = {
- .pg_init = bl_pg_init_write,
+ .pg_init = pnfs_generic_pg_init_write,
.pg_test = pnfs_generic_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
};
--
1.7.1.262.g5ef3d
* [PATCH 2/4] pnfsblock: fix partial page buffer write
2012-08-08 1:54 [PATCH 0/4] pnfsblock: fix IO alignment bug Peng Tao
2012-08-08 1:54 ` [PATCH 1/4] Revert "pnfsblock: bail out partial page IO" Peng Tao
@ 2012-08-08 1:54 ` Peng Tao
2012-08-08 1:58 ` Peng Tao
1 sibling, 1 reply; 4+ messages in thread
From: Peng Tao @ 2012-08-08 1:54 UTC
To: Trond.Myklebust; +Cc: linux-nfs, stable
If an application uses flock to protect its write range, generic NFS
will not do a read-modify-write cycle at the page cache level. The
layout driver therefore has to handle non-sector-aligned writes
itself, otherwise data corruption results.
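As a rough standalone sketch of the idea (not the kernel code:
read_sector() and write_sector() below are hypothetical stand-ins for
the bio path in the patch, and SECTOR_SIZE matches the #define it
adds): expand the dirty byte range to sector boundaries, read back the
untouched head and tail of any partially covered sector, merge in the
new data, and only ever write whole sectors.

/* Minimal user-space sketch of a sector-granularity read-modify-write.
 * Assumes 512-byte sectors; read_sector()/write_sector() are
 * hypothetical stand-ins for the bio-based IO done in the patch.
 */
#include <string.h>

#define SECTOR_SIZE 512u

int read_sector(unsigned long long sect, unsigned char *buf);
int write_sector(unsigned long long sect, const unsigned char *buf);

/* Write `len` bytes at byte offset `off` without corrupting the
 * surrounding bytes of partially covered sectors.
 */
static int rmw_write(unsigned long long off, const unsigned char *data,
		     unsigned int len)
{
	unsigned long long first, last, sect;
	unsigned char buf[SECTOR_SIZE];
	int ret;

	if (len == 0)
		return 0;

	first = off / SECTOR_SIZE;
	last = (off + len - 1) / SECTOR_SIZE;

	for (sect = first; sect <= last; sect++) {
		unsigned long long base = sect * SECTOR_SIZE;
		unsigned int start = off > base ? off - base : 0;
		unsigned int end = SECTOR_SIZE;

		if (off + len < base + SECTOR_SIZE)
			end = off + len - base;

		if (start != 0 || end != SECTOR_SIZE) {
			/* Partially covered sector: read it first. */
			ret = read_sector(sect, buf);
			if (ret)
				return ret;
		}
		memcpy(buf + start, data + (base + start - off), end - start);
		ret = write_sector(sect, buf);
		if (ret)
			return ret;
	}
	return 0;
}

The patch does the same per page: bl_read_partial_page_sync() fills in
the clean head and tail of the page, and pg_offset/pg_len are then
rounded with round_down()/round_up() so that do_add_page_to_bio() only
ever issues whole-sector IO.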
Cc: stable <stable@verg.kernel.org>
Signed-off-by: Peng Tao <tao.peng@emc.com>
---
fs/nfs/blocklayout/blocklayout.c | 177 +++++++++++++++++++++++++++++++++++---
fs/nfs/blocklayout/blocklayout.h | 1 +
2 files changed, 166 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7ae8a60..39fa002 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
return bio;
}
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
sector_t isect, struct page *page,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+ struct parallel_io *par,
+ unsigned int offset, int len)
{
+ isect = isect + (offset >> SECTOR_SHIFT);
+ dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+ npg, rw, (unsigned long long)isect, offset, len);
retry:
if (!bio) {
bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ if (bio_add_page(bio, page, len, offset) < len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+ sector_t isect, struct page *page,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par)
+{
+ return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+ end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
@@ -450,6 +464,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
return;
}
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct page *page = bvec->bv_page;
+
+ /* Only one page in bvec */
+ unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+ unsigned int offset, unsigned int len)
+{
+ struct bio *bio;
+ struct page *shadow_page;
+ sector_t isect;
+ char *kaddr, *kshadow_addr;
+ int ret = 0;
+
+ dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+ shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (shadow_page == NULL)
+ return -ENOMEM;
+
+ bio = bio_alloc(GFP_NOIO, 1);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+ (offset / SECTOR_SIZE);
+
+ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_bdev = be->be_mdev;
+ bio->bi_end_io = bl_read_single_end_io;
+
+ lock_page(shadow_page);
+ if (bio_add_page(bio, shadow_page,
+ SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+ unlock_page(shadow_page);
+ bio_put(bio);
+ return -EIO;
+ }
+
+ submit_bio(READ, bio);
+ wait_on_page_locked(shadow_page);
+ if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+ ret = -EIO;
+ } else {
+ kaddr = kmap_atomic(page);
+ kshadow_addr = kmap_atomic(shadow_page);
+ memcpy(kaddr + offset, kshadow_addr + offset, len);
+ kunmap_atomic(kshadow_addr);
+ kunmap_atomic(kaddr);
+ }
+ __free_page(shadow_page);
+ bio_put(bio);
+
+ return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+ unsigned int dirty_offset, unsigned int dirty_len,
+ bool full_page)
+{
+ int ret = 0;
+ unsigned int start, end;
+
+ if (full_page) {
+ start = 0;
+ end = PAGE_CACHE_SIZE;
+ } else {
+ start = round_down(dirty_offset, SECTOR_SIZE);
+ end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+ }
+
+ dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+ if (!be) {
+ zero_user_segments(page, start, dirty_offset,
+ dirty_offset + dirty_len, end);
+ if (start == 0 && end == PAGE_CACHE_SIZE &&
+ trylock_page(page)) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ return ret;
+ }
+
+ if (start != dirty_offset)
+ ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+ if (!ret && (dirty_offset + dirty_len < end))
+ ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+ end - dirty_offset - dirty_len);
+
+ return ret;
+}
+
/* Given an unmapped page, zero it or read in page for COW, page is locked
* by caller.
*/
@@ -483,7 +597,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
SetPageUptodate(page);
cleanup:
- bl_put_extent(cow_read);
if (bh)
free_buffer_head(bh);
if (ret) {
@@ -555,6 +668,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
struct parallel_io *par;
loff_t offset = wdata->args.offset;
size_t count = wdata->args.count;
+ unsigned int pg_offset, pg_len, saved_len;
struct page **pages = wdata->args.pages;
struct page *page;
pgoff_t index;
@@ -659,10 +773,11 @@ next_page:
if (!extent_length) {
/* We've used up the previous extent */
bl_put_extent(be);
+ bl_put_extent(cow_read);
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, NULL);
+ isect, &cow_read);
if (!be || !is_writable(be, isect)) {
header->pnfs_error = -EINVAL;
goto out;
@@ -679,7 +794,26 @@ next_page:
extent_length = be->be_length -
(isect - be->be_f_offset);
}
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+ dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+ pg_offset = offset & ~PAGE_CACHE_MASK;
+ if (pg_offset + count > PAGE_CACHE_SIZE)
+ pg_len = PAGE_CACHE_SIZE - pg_offset;
+ else
+ pg_len = count;
+
+ saved_len = pg_len;
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+ !bl_is_sector_init(be->be_inval, isect)) {
+ ret = bl_read_partial_page_sync(pages[i], cow_read,
+ pg_offset, pg_len, true);
+ if (ret) {
+ dprintk("%s bl_read_partial_page_sync fail %d\n",
+ __func__, ret);
+ header->pnfs_error = ret;
+ goto out;
+ }
+
ret = bl_mark_sectors_init(be->be_inval, isect,
PAGE_CACHE_SECTORS);
if (unlikely(ret)) {
@@ -688,15 +822,35 @@ next_page:
header->pnfs_error = ret;
goto out;
}
+
+ /* Expand to full page write */
+ pg_offset = 0;
+ pg_len = PAGE_CACHE_SIZE;
+ } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+ (pg_len & (SECTOR_SIZE - 1))) {
+ /* ahh, nasty case. We have to do sync full sector
+ * read-modify-write cycles.
+ */
+ unsigned int saved_offset = pg_offset;
+ ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+ pg_len, false);
+ pg_offset = round_down(pg_offset, SECTOR_SIZE);
+ pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+ - pg_offset;
}
- bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+ bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
isect, pages[i], be,
- bl_end_io_write, par);
+ bl_end_io_write, par,
+ pg_offset, pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
+ offset += saved_len;
+ count -= saved_len;
isect += PAGE_CACHE_SECTORS;
last_isect = isect;
extent_length -= PAGE_CACHE_SECTORS;
@@ -714,17 +868,16 @@ next_page:
}
write_done:
- wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
- if (count < wdata->res.count) {
- wdata->res.count = count;
- }
+ wdata->res.count = wdata->args.count;
out:
bl_put_extent(be);
+ bl_put_extent(cow_read);
bl_submit_bio(WRITE, bio);
put_parallel(par);
return PNFS_ATTEMPTED;
out_mds:
bl_put_extent(be);
+ bl_put_extent(cow_read);
kfree(par);
return PNFS_NOT_ATTEMPTED;
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 0335069..39bb51a 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct block_mount_id {
spinlock_t bm_lock; /* protects list */
--
1.7.1.262.g5ef3d
* Re: [PATCH 2/4] pnfsblock: fix partial page buffer write
2012-08-08 1:54 ` [PATCH 2/4] pnfsblock: fix partial page buffer write Peng Tao
@ 2012-08-08 1:58 ` Peng Tao
0 siblings, 0 replies; 4+ messages in thread
From: Peng Tao @ 2012-08-08 1:58 UTC
To: Trond.Myklebust; +Cc: linux-nfs, stable
On Wed, Aug 8, 2012 at 9:54 AM, Peng Tao <bergwolf@gmail.com> wrote:
> If an application uses flock to protect its write range, generic NFS
> will not do a read-modify-write cycle at the page cache level. The
> layout driver therefore has to handle non-sector-aligned writes
> itself, otherwise data corruption results.
>
> Cc: stable <stable@verg.kernel.org>
~~~typo here... should be.
Cc: stable <stable@vger.kernel.org>
> Signed-off-by: Peng Tao <tao.peng@emc.com>
> [...]
--
Thanks,
Tao