public inbox for linux-nfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH-v2 0/7] pnfsblock cleanup and fixes
@ 2011-11-10 15:25 Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 1/7] pnfsblock: cleanup bl_mark_sectors_init Peng Tao
                   ` (7 more replies)
  0 siblings, 8 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy

Hi, Trond and Benny,

These are updated v2 of the pnfsblock patches and all Benny's comments are addressed.

Thanks,
Tao

Peng Tao (8):
  pnfsblock: cleanup bl_mark_sectors_init
  pnfsblock: acquire im_lock in _preload_range
  pnfsblock: move find lock page logic out of bl_write_pagelist
  pnfsblock: set read/write tk_status to pnfs_error
  pnfsblock: remove rpc_call_ops from struct parallel_io
  pnfsblock: clean up _add_entry
  pnfsblock: alloc short extent before submit bio
  pnfsblock: release lock when freeing block_dev

 fs/nfs/blocklayout/blocklayout.c |  176 +++++++++++++++++++++++++-------------
 fs/nfs/blocklayout/blocklayout.h |   12 ++-
 fs/nfs/blocklayout/extents.c     |  157 ++++++++++++++--------------------
 3 files changed, 192 insertions(+), 153 deletions(-)


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH-v2 1/7] pnfsblock: cleanup bl_mark_sectors_init
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 3/7] pnfsblock: move find lock page logic out of bl_write_pagelist Peng Tao
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy, Peng Tao

It does not need to manipulate on partial initialized blocks.
Writeback code takes care of it.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |    6 +--
 fs/nfs/blocklayout/blocklayout.h |    3 +-
 fs/nfs/blocklayout/extents.c     |   76 ++-----------------------------------
 3 files changed, 8 insertions(+), 77 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95..4ced0b0 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -571,8 +571,7 @@ fill_invalid_ext:
 			unlock_page(page);
 
 			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS,
-						       NULL);
+						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
 				dprintk("%s bl_mark_sectors_init fail %d\n",
 					__func__, ret);
@@ -621,8 +620,7 @@ next_page:
 		}
 		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
 			ret = bl_mark_sectors_init(be->be_inval, isect,
-						       PAGE_CACHE_SECTORS,
-						       NULL);
+						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
 				dprintk("%s bl_mark_sectors_init fail %d\n",
 					__func__, ret);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7e..60728ac 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -186,8 +186,7 @@ struct pnfs_block_extent *
 bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
 		struct pnfs_block_extent **cow_read);
 int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length,
-			     sector_t **pages);
+			     sector_t offset, sector_t length);
 void bl_put_extent(struct pnfs_block_extent *be);
 struct pnfs_block_extent *bl_alloc_extent(void);
 int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0..0fc1321 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -179,33 +179,6 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
 	return status;
 }
 
-static void set_needs_init(sector_t *array, sector_t offset)
-{
-	sector_t *p = array;
-
-	dprintk("%s enter\n", __func__);
-	if (!p)
-		return;
-	while (*p < offset)
-		p++;
-	if (*p == offset)
-		return;
-	else if (*p == ~0) {
-		*p++ = offset;
-		*p = ~0;
-		return;
-	} else {
-		sector_t *save = p;
-		dprintk("%s Adding %llu\n", __func__, (u64)offset);
-		while (*p != ~0)
-			p++;
-		p++;
-		memmove(save + 1, save, (char *)p - (char *)save);
-		*save = offset;
-		return;
-	}
-}
-
 /* We are relying on page lock to serialize this */
 int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
 {
@@ -261,28 +234,15 @@ static int is_range_written(struct pnfs_inval_markings *marks,
 
 /* Marks sectors in [offest, offset_length) as having been initialized.
  * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Notes where partial block is initialized, and helps prepare it for
- * complete initialization later.
+ * Currently assumes offset is page-aligned
  */
-/* Currently assumes offset is page-aligned */
 int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
-			     sector_t offset, sector_t length,
-			     sector_t **pages)
+			     sector_t offset, sector_t length)
 {
-	sector_t s, start, end;
-	sector_t *array = NULL; /* Pages to mark */
+	sector_t start, end;
 
 	dprintk("%s(offset=%llu,len=%llu) enter\n",
 		__func__, (u64)offset, (u64)length);
-	s = max((sector_t) 3,
-		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
-	dprintk("%s set max=%llu\n", __func__, (u64)s);
-	if (pages) {
-		array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
-		if (!array)
-			goto outerr;
-		array[0] = ~0;
-	}
 
 	start = normalize(offset, marks->im_block_size);
 	end = normalize_up(offset + length, marks->im_block_size);
@@ -290,41 +250,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
 		goto outerr;
 
 	spin_lock(&marks->im_lock);
-
-	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
-	     s < offset; s += PAGE_CACHE_SECTORS) {
-		dprintk("%s pre-area pages\n", __func__);
-		/* Portion of used block is not initialized */
-		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
-			set_needs_init(array, s);
-	}
 	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
 		goto out_unlock;
-	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
-	     s < end; s += PAGE_CACHE_SECTORS) {
-		dprintk("%s post-area pages\n", __func__);
-		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
-			set_needs_init(array, s);
-	}
-
 	spin_unlock(&marks->im_lock);
 
-	if (pages) {
-		if (array[0] == ~0) {
-			kfree(array);
-			*pages = NULL;
-		} else
-			*pages = array;
-	}
 	return 0;
 
- out_unlock:
+out_unlock:
 	spin_unlock(&marks->im_lock);
- outerr:
-	if (pages) {
-		kfree(array);
-		*pages = NULL;
-	}
+outerr:
 	return -ENOMEM;
 }
 
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH-v2 3/7] pnfsblock: move find lock page logic out of bl_write_pagelist
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 1/7] pnfsblock: cleanup bl_mark_sectors_init Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 4/7] pnfsblock: set read/write tk_status to pnfs_error Peng Tao
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy, Peng Tao

Also avoid unnecessary lock_page if page is handled by others.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |   78 ++++++++++++++++++++++++++------------
 1 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 4ced0b0..e84bd9e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -484,6 +484,55 @@ cleanup:
 	return ret;
 }
 
+/* Find or create a zeroing page marked being writeback.
+ * Return ERR_PTR on error, NULL to indicate skip this page and page itself
+ * to indicate write out.
+ */
+static struct page *
+bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
+			struct pnfs_block_extent *cow_read)
+{
+	struct page *page;
+	bool locked = false;
+	page = find_get_page(inode->i_mapping, index);
+	if (page)
+		goto check_page;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (unlikely(!page)) {
+		dprintk("%s oom\n", __func__);
+		return ERR_PTR(-ENOMEM);
+	}
+	locked = true;
+
+check_page:
+	/* PageDirty: Other will write this out
+	 * PageWriteback: Other is writing this out
+	 * PageUptodate: It was read before
+	 */
+	if (PageDirty(page) || PageWriteback(page)) {
+		print_page(page);
+		if (locked)
+			unlock_page(page);
+		page_cache_release(page);
+		return NULL;
+	}
+
+	if (!locked) {
+		lock_page(page);
+		locked = true;
+		goto check_page;
+	}
+	if (!PageUptodate(page)) {
+		/* New page, readin or zero it */
+		init_page_for_write(page, cow_read);
+	}
+	set_page_writeback(page);
+	unlock_page(page);
+
+	return page;
+}
+
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 {
@@ -543,32 +592,13 @@ fill_invalid_ext:
 			dprintk("%s zero %dth page: index %lu isect %llu\n",
 				__func__, npg_zero, index,
 				(unsigned long long)isect);
-			page =
-			    find_or_create_page(wdata->inode->i_mapping, index,
-						GFP_NOFS);
-			if (!page) {
-				dprintk("%s oom\n", __func__);
-				wdata->pnfs_error = -ENOMEM;
+			page = bl_find_get_zeroing_page(wdata->inode, index,
+							cow_read);
+			if (unlikely(IS_ERR(page))) {
+				wdata->pnfs_error = PTR_ERR(page);
 				goto out;
-			}
-
-			/* PageDirty: Other will write this out
-			 * PageWriteback: Other is writing this out
-			 * PageUptodate: It was read before
-			 * sector_initialized: already written out
-			 */
-			if (PageDirty(page) || PageWriteback(page)) {
-				print_page(page);
-				unlock_page(page);
-				page_cache_release(page);
+			} else if (page == NULL)
 				goto next_page;
-			}
-			if (!PageUptodate(page)) {
-				/* New page, readin or zero it */
-				init_page_for_write(page, cow_read);
-			}
-			set_page_writeback(page);
-			unlock_page(page);
 
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH-v2 4/7] pnfsblock: set read/write tk_status to pnfs_error
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 1/7] pnfsblock: cleanup bl_mark_sectors_init Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 3/7] pnfsblock: move find lock page logic out of bl_write_pagelist Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 5/7] pnfsblock: remove rpc_call_ops from struct parallel_io Peng Tao
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy, Peng Tao

To pass the IO status to upper layer.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e84bd9e..8ad8014 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -216,6 +216,7 @@ bl_end_par_io_read(void *data)
 {
 	struct nfs_read_data *rdata = data;
 
+	rdata->task.tk_status = rdata->pnfs_error;
 	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
 	schedule_work(&rdata->task.u.tk_work);
 }
@@ -405,7 +406,7 @@ static void bl_end_par_io_write(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	wdata->task.tk_status = 0;
+	wdata->task.tk_status = wdata->pnfs_error;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
 	schedule_work(&wdata->task.u.tk_work);
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH-v2 5/7] pnfsblock: remove rpc_call_ops from struct parallel_io
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
                   ` (2 preceding siblings ...)
  2011-11-10 15:25 ` [PATCH-v2 4/7] pnfsblock: set read/write tk_status to pnfs_error Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 6/7] pnfsblock: clean up _add_entry Peng Tao
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy

block layout can just make use of generic read/write_done.

Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 fs/nfs/blocklayout/blocklayout.c |   13 -------------
 1 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 8ad8014..3eaea2b 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,7 +90,6 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	struct rpc_call_ops call_ops;
 	void (*pnfs_callback) (void *data);
 	void *data;
 };
@@ -221,14 +220,6 @@ bl_end_par_io_read(void *data)
 	schedule_work(&rdata->task.u.tk_work);
 }
 
-/* We don't want normal .rpc_call_done callback used, so we replace it
- * with this stub.
- */
-static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
-{
-	return;
-}
-
 static enum pnfs_try_status
 bl_read_pagelist(struct nfs_read_data *rdata)
 {
@@ -248,8 +239,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 	par = alloc_parallel(rdata);
 	if (!par)
 		goto use_mds;
-	par->call_ops = *rdata->mds_ops;
-	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
 	par->pnfs_callback = bl_end_par_io_read;
 	/* At this point, we can no longer jump to use_mds */
 
@@ -559,8 +548,6 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	par = alloc_parallel(wdata);
 	if (!par)
 		return PNFS_NOT_ATTEMPTED;
-	par->call_ops = *wdata->mds_ops;
-	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
 	par->pnfs_callback = bl_end_par_io_write;
 	/* At this point, have to be more careful with error handling */
 
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH-v2 6/7] pnfsblock: clean up _add_entry
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
                   ` (3 preceding siblings ...)
  2011-11-10 15:25 ` [PATCH-v2 5/7] pnfsblock: remove rpc_call_ops from struct parallel_io Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 15:25 ` [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio Peng Tao
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy, Peng Tao

It is wrong to kmalloc in _add_entry() as it is inside
spinlock. memory should be already allocated _add_entry() is called.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/extents.c |    8 +-------
 1 files changed, 1 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index f383524..d0f52ed 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
 		return 0;
 	} else {
 		struct pnfs_inval_tracking *new;
-		if (storage)
-			new = storage;
-		else {
-			new = kmalloc(sizeof(*new), GFP_NOFS);
-			if (!new)
-				return -ENOMEM;
-		}
+		new = storage;
 		new->it_sector = s;
 		new->it_tags = (1 << tag);
 		list_add(&new->it_link, &pos->it_link);
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
                   ` (4 preceding siblings ...)
  2011-11-10 15:25 ` [PATCH-v2 6/7] pnfsblock: clean up _add_entry Peng Tao
@ 2011-11-10 15:25 ` Peng Tao
  2011-11-10 16:43   ` Bryan Schumaker
  2011-11-10 15:37 ` [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
  2011-11-10 15:52 ` Peng Tao
  7 siblings, 1 reply; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:25 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy, Peng Tao

As discussed earlier, it is better for block client to allocate memory for
tracking extents state before submitting bio. So the patch does it by allocating
a short_extent for every INVALID extent touched by write pagelist and for
every zeroing page we created, saving them in layout header. Then in end_io we
can just use them to create commit list items and avoid memory allocation there.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |   74 +++++++++++++++++++++++++++++--------
 fs/nfs/blocklayout/blocklayout.h |    9 ++++-
 fs/nfs/blocklayout/extents.c     |   62 +++++++++++++++++++++++++++-----
 3 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3eaea2b..3fdfb29 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	void (*pnfs_callback) (void *data);
+	void (*pnfs_callback) (void *data, int num_se);
 	void *data;
+	int bse_count;
 };
 
 static inline struct parallel_io *alloc_parallel(void *data)
@@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
 	if (rv) {
 		rv->data = data;
 		kref_init(&rv->refcnt);
+		rv->bse_count = 0;
 	}
 	return rv;
 }
@@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
 
 	dprintk("%s enter\n", __func__);
-	p->pnfs_callback(p->data);
+	p->pnfs_callback(p->data, p->bse_count);
 	kfree(p);
 }
 
@@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }
 
 static void
-bl_end_par_io_read(void *data)
+bl_end_par_io_read(void *data, int unused)
 {
 	struct nfs_read_data *rdata = data;
 
@@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 {
 	sector_t isect, end;
 	struct pnfs_block_extent *be;
+	struct pnfs_block_short_extent *se;
 
 	dprintk("%s(%llu, %u)\n", __func__, offset, count);
 	if (count == 0)
@@ -324,8 +327,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 		be = bl_find_get_extent(bl, isect, NULL);
 		BUG_ON(!be); /* FIXME */
 		len = min(end, be->be_f_offset + be->be_length) - isect;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-			bl_mark_for_commit(be, isect, len); /* What if fails? */
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+			se = bl_pop_one_short_extent(be->be_inval);
+			BUG_ON(!se);
+			bl_mark_for_commit(be, isect, len, se);
+		}
 		isect += len;
 		bl_put_extent(be);
 	}
@@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 		end_page_writeback(page);
 		page_cache_release(page);
 	} while (bvec >= bio->bi_io_vec);
-	if (!uptodate) {
+
+	if (unlikely(!uptodate)) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
 		pnfs_set_lo_fail(wdata->lseg);
@@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	put_parallel(par);
 }
 
-/* This is basically copied from mpage_end_io_read */
 static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
@@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
 	wdata = container_of(task, struct nfs_write_data, task);
-	if (!wdata->pnfs_error) {
+	if (likely(!wdata->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
 		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
 				     wdata->args.offset, wdata->args.count);
@@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data, int num_se)
 {
 	struct nfs_write_data *wdata = data;
 
+	if (unlikely(wdata->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+					num_se);
+	}
+
 	wdata->task.tk_status = wdata->pnfs_error;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
@@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	 */
 	par = alloc_parallel(wdata);
 	if (!par)
-		return PNFS_NOT_ATTEMPTED;
+		goto out_mds;
 	par->pnfs_callback = bl_end_par_io_write;
 	/* At this point, have to be more careful with error handling */
 
@@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
 	if (!be || !is_writable(be, isect)) {
 		dprintk("%s no matching extents!\n", __func__);
-		wdata->pnfs_error = -EINVAL;
-		goto out;
+		goto out_mds;
 	}
 
 	/* First page inside INVALID extent */
 	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (likely(!bl_push_one_short_extent(be->be_inval)))
+			par->bse_count++;
+		else
+			goto out_mds;
 		temp = offset >> PAGE_CACHE_SHIFT;
 		npg_zero = do_div(temp, npg_per_block);
 		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -598,6 +612,19 @@ fill_invalid_ext:
 				wdata->pnfs_error = ret;
 				goto out;
 			}
+			if (likely(!bl_push_one_short_extent(be->be_inval)))
+				par->bse_count++;
+			else {
+				end_page_writeback(page);
+				page_cache_release(page);
+				wdata->pnfs_error = -ENOMEM;
+				goto out;
+			}
+			/* FIXME: This should be done in bi_end_io */
+			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+					     page->index << PAGE_CACHE_SHIFT,
+					     PAGE_CACHE_SIZE);
+
 			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
 						 isect, page, be,
 						 bl_end_io_write_zero, par);
@@ -606,10 +633,6 @@ fill_invalid_ext:
 				bio = NULL;
 				goto out;
 			}
-			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
-					     page->index << PAGE_CACHE_SHIFT,
-					     PAGE_CACHE_SIZE);
 next_page:
 			isect += PAGE_CACHE_SECTORS;
 			extent_length -= PAGE_CACHE_SECTORS;
@@ -633,6 +656,15 @@ next_page:
 				wdata->pnfs_error = -EINVAL;
 				goto out;
 			}
+			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+				if (likely(!bl_push_one_short_extent(
+								be->be_inval)))
+					par->bse_count++;
+				else {
+					wdata->pnfs_error = -ENOMEM;
+					goto out;
+				}
+			}
 			extent_length = be->be_length -
 			    (isect - be->be_f_offset);
 		}
@@ -680,6 +712,10 @@ out:
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
+out_mds:
+	bl_put_extent(be);
+	kfree(par);
+	return PNFS_NOT_ATTEMPTED;
 }
 
 /* FIXME - range ignored */
@@ -706,11 +742,17 @@ static void
 release_inval_marks(struct pnfs_inval_markings *marks)
 {
 	struct pnfs_inval_tracking *pos, *temp;
+	struct pnfs_block_short_extent *se, *stemp;
 
 	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
 		list_del(&pos->it_link);
 		kfree(pos);
 	}
+
+	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+		list_del(&se->bse_node);
+		kfree(se);
+	}
 	return;
 }
 
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 60728ac..e31a2df 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
 	spinlock_t	im_lock;
 	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
 	sector_t	im_block_size;	/* Server blocksize in sectors */
+	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
 };
 
 struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
 {
 	spin_lock_init(&marks->im_lock);
 	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+	INIT_LIST_HEAD(&marks->im_extents);
 	marks->im_block_size = blocksize;
 	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
 					   blocksize);
@@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 int bl_add_merge_extent(struct pnfs_block_layout *bl,
 			 struct pnfs_block_extent *new);
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-			sector_t offset, sector_t length);
+			sector_t offset, sector_t length,
+			struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index d0f52ed..db66e68 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
 
 /* Note the range described by offset, length is guaranteed to be contained
  * within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
  */
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-		    sector_t offset, sector_t length)
+		    sector_t offset, sector_t length,
+		    struct pnfs_block_short_extent *new)
 {
 	sector_t new_end, end = offset + length;
-	struct pnfs_block_short_extent *new;
 	struct pnfs_block_layout *bl = container_of(be->be_inval,
 						    struct pnfs_block_layout,
 						    bl_inval);
 
-	new = kmalloc(sizeof(*new), GFP_NOFS);
-	if (!new)
-		return -ENOMEM;
-
 	mark_written_sectors(be->be_inval, offset, length);
 	/* We want to add the range to commit list, but it must be
 	 * block-normalized, and verified that the normalized range has
@@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
 	new->bse_mdev = be->be_mdev;
 
 	spin_lock(&bl->bl_ext_lock);
-	/* new will be freed, either by add_to_commitlist if it decides not
-	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
-	 */
 	add_to_commitlist(bl, new);
 	spin_unlock(&bl->bl_ext_lock);
 	return 0;
@@ -862,3 +857,52 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 		}
 	}
 }
+
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_block_short_extent *new;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (unlikely(!new))
+		return -ENOMEM;
+
+	spin_lock(&marks->im_lock);
+	list_add(&new->bse_node, &marks->im_extents);
+	spin_unlock(&marks->im_lock);
+
+	return 0;
+}
+
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_block_short_extent *rv = NULL;
+
+	spin_lock(&marks->im_lock);
+	if (!list_empty(&marks->im_extents)) {
+		rv = list_entry((&marks->im_extents)->next,
+				struct pnfs_block_short_extent, bse_node);
+		list_del_init(&rv->bse_node);
+	}
+	spin_unlock(&marks->im_lock);
+
+	return rv;
+}
+
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+	struct pnfs_block_short_extent *se = NULL, *tmp;
+
+	BUG_ON(num_to_free <= 0);
+
+	spin_lock(&marks->im_lock);
+	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+		list_del(&se->bse_node);
+		kfree(se);
+		if (--num_to_free == 0)
+			break;
+	}
+	spin_unlock(&marks->im_lock);
+
+	BUG_ON(num_to_free > 0);
+}
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH-v2 0/7] pnfsblock cleanup and fixes
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
                   ` (5 preceding siblings ...)
  2011-11-10 15:25 ` [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio Peng Tao
@ 2011-11-10 15:37 ` Peng Tao
  2011-11-10 15:52 ` Peng Tao
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:37 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy

[-- Attachment #1: Type: text/plain, Size: 1117 bytes --]

On Thu, Nov 10, 2011 at 11:25 PM, Peng Tao <bergwolf@gmail.com> wrote:
> Hi, Trond and Benny,
>
> These are updated v2 of the pnfsblock patches and all Benny's comments are addressed.
>
> Thanks,
> Tao
>
> Peng Tao (8):
>  pnfsblock: cleanup bl_mark_sectors_init
>  pnfsblock: acquire im_lock in _preload_range
Just checked. This one is dropped by vger again... even thought I
resent it once again separately.

So I have to attach it here....

Thanks,
Tao

>  pnfsblock: move find lock page logic out of bl_write_pagelist
>  pnfsblock: set read/write tk_status to pnfs_error
>  pnfsblock: remove rpc_call_ops from struct parallel_io
>  pnfsblock: clean up _add_entry
>  pnfsblock: alloc short extent before submit bio
>  pnfsblock: release lock when freeing block_dev
>
>  fs/nfs/blocklayout/blocklayout.c |  176 +++++++++++++++++++++++++-------------
>  fs/nfs/blocklayout/blocklayout.h |   12 ++-
>  fs/nfs/blocklayout/extents.c     |  157 ++++++++++++++--------------------
>  3 files changed, 192 insertions(+), 153 deletions(-)
>
>



-- 
Thanks,
-Bergwolf

[-- Attachment #2: 0002-pnfsblock-acquire-im_lock-in-_preload_range.patch --]
[-- Type: application/octet-stream, Size: 2020 bytes --]

From 6b919e0d97846de104bb4481541d8b1d0612a8a7 Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Wed, 2 Nov 2011 01:11:14 -0400
Subject: [PATCH-v2 2/7] pnfsblock: acquire im_lock in _preload_range

When calling _add_entry, we should take the im_lock to protect
agains other modifiers.

Cc: stable@kernel.org [3.1]
Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/extents.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 0fc1321..f383524 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -139,11 +139,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
 }
 
 /* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+static int _preload_range(struct pnfs_inval_markings *marks,
+		u64 offset, u64 length)
 {
 	u64 start, end, s;
 	int count, i, used = 0, status = -ENOMEM;
 	struct pnfs_inval_tracking **storage;
+	struct my_tree  *tree = &marks->im_tree;
 
 	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
 	start = normalize(offset, tree->mtt_step_size);
@@ -161,12 +163,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
 			goto out_cleanup;
 	}
 
-	/* Now need lock - HOW??? */
-
+	spin_lock(&marks->im_lock);
 	for (s = start; s < end; s += tree->mtt_step_size)
 		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+	spin_unlock(&marks->im_lock);
 
-	/* Unlock - HOW??? */
 	status = 0;
 
  out_cleanup:
@@ -246,7 +247,7 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
 
 	start = normalize(offset, marks->im_block_size);
 	end = normalize_up(offset + length, marks->im_block_size);
-	if (_preload_range(&marks->im_tree, start, end - start))
+	if (_preload_range(marks, start, end - start))
 		goto outerr;
 
 	spin_lock(&marks->im_lock);
-- 
1.7.1.262.g5ef3d


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH-v2 0/7] pnfsblock cleanup and fixes
  2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
                   ` (6 preceding siblings ...)
  2011-11-10 15:37 ` [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
@ 2011-11-10 15:52 ` Peng Tao
  7 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-10 15:52 UTC (permalink / raw)
  To: linux-nfs; +Cc: Trond.Myklebust, bhalevy

On Thu, Nov 10, 2011 at 11:25 PM, Peng Tao <bergwolf@gmail.com> wrote:
> Hi, Trond and Benny,
>
> These are updated v2 of the pnfsblock patches and all Benny's comments are addressed.
>
> Thanks,
> Tao
>
> Peng Tao (8):
>  pnfsblock: cleanup bl_mark_sectors_init
>  pnfsblock: acquire im_lock in _preload_range
>  pnfsblock: move find lock page logic out of bl_write_pagelist
>  pnfsblock: set read/write tk_status to pnfs_error
>  pnfsblock: remove rpc_call_ops from struct parallel_io
>  pnfsblock: clean up _add_entry
>  pnfsblock: alloc short extent before submit bio
>  pnfsblock: release lock when freeing block_dev
And sorry for mistyping... The patchset has only 7 patches. The last
one is still under test and is not included here...

-- 
Thanks,
Tao

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
  2011-11-10 15:25 ` [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio Peng Tao
@ 2011-11-10 16:43   ` Bryan Schumaker
  2011-11-11  2:05     ` tao.peng
  0 siblings, 1 reply; 13+ messages in thread
From: Bryan Schumaker @ 2011-11-10 16:43 UTC (permalink / raw)
  To: Peng Tao; +Cc: linux-nfs, Trond.Myklebust, bhalevy, Peng Tao

On 11/10/2011 10:25 AM, Peng Tao wrote:
> As discussed earlier, it is better for block client to allocate memory for
> tracking extents state before submitting bio. So the patch does it by allocating
> a short_extent for every INVALID extent touched by write pagelist and for
> every zeroing page we created, saving them in layout header. Then in end_io we
> can just use them to create commit list items and avoid memory allocation there.
> 
> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/blocklayout/blocklayout.c |   74 +++++++++++++++++++++++++++++--------
>  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
>  fs/nfs/blocklayout/extents.c     |   62 +++++++++++++++++++++++++++-----
>  3 files changed, 119 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 3eaea2b..3fdfb29 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
>   */
>  struct parallel_io {
>  	struct kref refcnt;
> -	void (*pnfs_callback) (void *data);
> +	void (*pnfs_callback) (void *data, int num_se);
>  	void *data;
> +	int bse_count;
>  };
>  
>  static inline struct parallel_io *alloc_parallel(void *data)
> @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
>  	if (rv) {
>  		rv->data = data;
>  		kref_init(&rv->refcnt);
> +		rv->bse_count = 0;
>  	}
>  	return rv;
>  }
> @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
>  	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
>  
>  	dprintk("%s enter\n", __func__);
> -	p->pnfs_callback(p->data);
> +	p->pnfs_callback(p->data, p->bse_count);
>  	kfree(p);
>  }
>  
> @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
>  }
>  
>  static void
> -bl_end_par_io_read(void *data)
> +bl_end_par_io_read(void *data, int unused)
>  {
>  	struct nfs_read_data *rdata = data;
>  
> @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>  {
>  	sector_t isect, end;
>  	struct pnfs_block_extent *be;
> +	struct pnfs_block_short_extent *se;
>  
>  	dprintk("%s(%llu, %u)\n", __func__, offset, count);
>  	if (count == 0)
> @@ -324,8 +327,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>  		be = bl_find_get_extent(bl, isect, NULL);
>  		BUG_ON(!be); /* FIXME */
>  		len = min(end, be->be_f_offset + be->be_length) - isect;
> -		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
> -			bl_mark_for_commit(be, isect, len); /* What if fails? */
> +		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +			se = bl_pop_one_short_extent(be->be_inval);
> +			BUG_ON(!se);
> +			bl_mark_for_commit(be, isect, len, se);
> +		}
>  		isect += len;
>  		bl_put_extent(be);
>  	}
> @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>  		end_page_writeback(page);
>  		page_cache_release(page);
>  	} while (bvec >= bio->bi_io_vec);
> -	if (!uptodate) {
> +
> +	if (unlikely(!uptodate)) {
>  		if (!wdata->pnfs_error)
>  			wdata->pnfs_error = -EIO;
>  		pnfs_set_lo_fail(wdata->lseg);
> @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>  	put_parallel(par);
>  }
>  
> -/* This is basically copied from mpage_end_io_read */
>  static void bl_end_io_write(struct bio *bio, int err)
>  {
>  	struct parallel_io *par = bio->bi_private;
> @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
>  	dprintk("%s enter\n", __func__);
>  	task = container_of(work, struct rpc_task, u.tk_work);
>  	wdata = container_of(task, struct nfs_write_data, task);
> -	if (!wdata->pnfs_error) {
> +	if (likely(!wdata->pnfs_error)) {
>  		/* Marks for LAYOUTCOMMIT */
>  		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>  				     wdata->args.offset, wdata->args.count);
> @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
>  }
>  
>  /* Called when last of bios associated with a bl_write_pagelist call finishes */
> -static void bl_end_par_io_write(void *data)
> +static void bl_end_par_io_write(void *data, int num_se)
>  {
>  	struct nfs_write_data *wdata = data;
>  
> +	if (unlikely(wdata->pnfs_error)) {
> +		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
> +					num_se);
> +	}
> +
>  	wdata->task.tk_status = wdata->pnfs_error;
>  	wdata->verf.committed = NFS_FILE_SYNC;
>  	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
> @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>  	 */
>  	par = alloc_parallel(wdata);
>  	if (!par)
> -		return PNFS_NOT_ATTEMPTED;
> +		goto out_mds;
>  	par->pnfs_callback = bl_end_par_io_write;
>  	/* At this point, have to be more careful with error handling */
>  
> @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>  	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
>  	if (!be || !is_writable(be, isect)) {
>  		dprintk("%s no matching extents!\n", __func__);
> -		wdata->pnfs_error = -EINVAL;
> -		goto out;
> +		goto out_mds;
>  	}
>  
>  	/* First page inside INVALID extent */
>  	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +		if (likely(!bl_push_one_short_extent(be->be_inval)))
> +			par->bse_count++;
> +		else
> +			goto out_mds;
>  		temp = offset >> PAGE_CACHE_SHIFT;
>  		npg_zero = do_div(temp, npg_per_block);
>  		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
> @@ -598,6 +612,19 @@ fill_invalid_ext:
>  				wdata->pnfs_error = ret;
>  				goto out;
>  			}
> +			if (likely(!bl_push_one_short_extent(be->be_inval)))
> +				par->bse_count++;
> +			else {
> +				end_page_writeback(page);
> +				page_cache_release(page);
> +				wdata->pnfs_error = -ENOMEM;
> +				goto out;
> +			}
> +			/* FIXME: This should be done in bi_end_io */
> +			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> +					     page->index << PAGE_CACHE_SHIFT,
> +					     PAGE_CACHE_SIZE);
> +
>  			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
>  						 isect, page, be,
>  						 bl_end_io_write_zero, par);
> @@ -606,10 +633,6 @@ fill_invalid_ext:
>  				bio = NULL;
>  				goto out;
>  			}
> -			/* FIXME: This should be done in bi_end_io */
> -			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> -					     page->index << PAGE_CACHE_SHIFT,
> -					     PAGE_CACHE_SIZE);
>  next_page:
>  			isect += PAGE_CACHE_SECTORS;
>  			extent_length -= PAGE_CACHE_SECTORS;
> @@ -633,6 +656,15 @@ next_page:
>  				wdata->pnfs_error = -EINVAL;
>  				goto out;
>  			}
> +			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +				if (likely(!bl_push_one_short_extent(
> +								be->be_inval)))
> +					par->bse_count++;
> +				else {
> +					wdata->pnfs_error = -ENOMEM;
> +					goto out;
> +				}
> +			}
>  			extent_length = be->be_length -
>  			    (isect - be->be_f_offset);
>  		}
> @@ -680,6 +712,10 @@ out:
>  	bl_submit_bio(WRITE, bio);
>  	put_parallel(par);
>  	return PNFS_ATTEMPTED;
> +out_mds:
> +	bl_put_extent(be);
> +	kfree(par);
> +	return PNFS_NOT_ATTEMPTED;
>  }
>  
>  /* FIXME - range ignored */
> @@ -706,11 +742,17 @@ static void
>  release_inval_marks(struct pnfs_inval_markings *marks)
>  {
>  	struct pnfs_inval_tracking *pos, *temp;
> +	struct pnfs_block_short_extent *se, *stemp;
>  
>  	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
>  		list_del(&pos->it_link);
>  		kfree(pos);
>  	}
> +
> +	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
> +		list_del(&se->bse_node);
> +		kfree(se);
> +	}
>  	return;
>  }
>  
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index 60728ac..e31a2df 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
>  	spinlock_t	im_lock;
>  	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
>  	sector_t	im_block_size;	/* Server blocksize in sectors */
> +	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
>  };
>  
>  struct pnfs_inval_tracking {
> @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
>  {
>  	spin_lock_init(&marks->im_lock);
>  	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
> +	INIT_LIST_HEAD(&marks->im_extents);
>  	marks->im_block_size = blocksize;
>  	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
>  					   blocksize);
> @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>  int bl_add_merge_extent(struct pnfs_block_layout *bl,
>  			 struct pnfs_block_extent *new);
>  int bl_mark_for_commit(struct pnfs_block_extent *be,
> -			sector_t offset, sector_t length);
> +			sector_t offset, sector_t length,
> +			struct pnfs_block_short_extent *new);
> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
> +struct pnfs_block_short_extent *
> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
>  
>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
> index d0f52ed..db66e68 100644
> --- a/fs/nfs/blocklayout/extents.c
> +++ b/fs/nfs/blocklayout/extents.c
> @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
>  
>  /* Note the range described by offset, length is guaranteed to be contained
>   * within be.
> + * new will be freed, either by this function or add_to_commitlist if they
> + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>   */
>  int bl_mark_for_commit(struct pnfs_block_extent *be,
> -		    sector_t offset, sector_t length)
> +		    sector_t offset, sector_t length,
> +		    struct pnfs_block_short_extent *new)
>  {
>  	sector_t new_end, end = offset + length;
> -	struct pnfs_block_short_extent *new;
>  	struct pnfs_block_layout *bl = container_of(be->be_inval,
>  						    struct pnfs_block_layout,
>  						    bl_inval);
>  
> -	new = kmalloc(sizeof(*new), GFP_NOFS);
> -	if (!new)
> -		return -ENOMEM;
> -
>  	mark_written_sectors(be->be_inval, offset, length);
>  	/* We want to add the range to commit list, but it must be
>  	 * block-normalized, and verified that the normalized range has
> @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
>  	new->bse_mdev = be->be_mdev;
>  
>  	spin_lock(&bl->bl_ext_lock);
> -	/* new will be freed, either by add_to_commitlist if it decides not
> -	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
> -	 */
>  	add_to_commitlist(bl, new);
>  	spin_unlock(&bl->bl_ext_lock);
>  	return 0;
> @@ -862,3 +857,52 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>  		}
>  	}
>  }
> +
> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
> +{
> +	struct pnfs_block_short_extent *new;
> +
> +	new = kmalloc(sizeof(*new), GFP_NOFS);
> +	if (unlikely(!new))
> +		return -ENOMEM;
> +
> +	spin_lock(&marks->im_lock);
> +	list_add(&new->bse_node, &marks->im_extents);
> +	spin_unlock(&marks->im_lock);
> +
> +	return 0;
> +}
> +
> +struct pnfs_block_short_extent *
> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
> +{
> +	struct pnfs_block_short_extent *rv = NULL;
> +
> +	spin_lock(&marks->im_lock);
> +	if (!list_empty(&marks->im_extents)) {
> +		rv = list_entry((&marks->im_extents)->next,
> +				struct pnfs_block_short_extent, bse_node);
> +		list_del_init(&rv->bse_node);
> +	}
> +	spin_unlock(&marks->im_lock);
> +
> +	return rv;
> +}
> +
> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
> +{
> +	struct pnfs_block_short_extent *se = NULL, *tmp;
> +
> +	BUG_ON(num_to_free <= 0);

Why is it a bug if num_to_free is <= 0?  Couldn't you do:

if (num_to_free <= 0)
	return;

instead?

- Bryan

> +
> +	spin_lock(&marks->im_lock);
> +	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
> +		list_del(&se->bse_node);
> +		kfree(se);
> +		if (--num_to_free == 0)
> +			break;
> +	}
> +	spin_unlock(&marks->im_lock);
> +
> +	BUG_ON(num_to_free > 0);
> +}


^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
  2011-11-10 16:43   ` Bryan Schumaker
@ 2011-11-11  2:05     ` tao.peng
  2011-11-14 10:26       ` Benny Halevy
  0 siblings, 1 reply; 13+ messages in thread
From: tao.peng @ 2011-11-11  2:05 UTC (permalink / raw)
  To: bjschuma, bergwolf; +Cc: linux-nfs, Trond.Myklebust, bhalevy

Hi, Bryan,

> -----Original Message-----
> From: Bryan Schumaker [mailto:bjschuma@netapp.com]
> Sent: Friday, November 11, 2011 12:44 AM
> To: Peng Tao
> Cc: linux-nfs@vger.kernel.org; Trond.Myklebust@netapp.com; bhalevy@tonian.com;
> Peng, Tao
> Subject: Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
>
> On 11/10/2011 10:25 AM, Peng Tao wrote:
> > As discussed earlier, it is better for block client to allocate memory for
> > tracking extents state before submitting bio. So the patch does it by allocating
> > a short_extent for every INVALID extent touched by write pagelist and for
> > every zeroing page we created, saving them in layout header. Then in end_io we
> > can just use them to create commit list items and avoid memory allocation there.
> >
> > Signed-off-by: Peng Tao <peng_tao@emc.com>
> > ---
> >  fs/nfs/blocklayout/blocklayout.c |   74
> +++++++++++++++++++++++++++++--------
> >  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
> >  fs/nfs/blocklayout/extents.c     |   62 +++++++++++++++++++++++++++-----
> >  3 files changed, 119 insertions(+), 26 deletions(-)
> >
> > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> > index 3eaea2b..3fdfb29 100644
> > --- a/fs/nfs/blocklayout/blocklayout.c
> > +++ b/fs/nfs/blocklayout/blocklayout.c
> > @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t
> isect)
> >   */
> >  struct parallel_io {
> >     struct kref refcnt;
> > -   void (*pnfs_callback) (void *data);
> > +   void (*pnfs_callback) (void *data, int num_se);
> >     void *data;
> > +   int bse_count;
> >  };
> >
> >  static inline struct parallel_io *alloc_parallel(void *data)
> > @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
> >     if (rv) {
> >             rv->data = data;
> >             kref_init(&rv->refcnt);
> > +           rv->bse_count = 0;
> >     }
> >     return rv;
> >  }
> > @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
> >     struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
> >
> >     dprintk("%s enter\n", __func__);
> > -   p->pnfs_callback(p->data);
> > +   p->pnfs_callback(p->data, p->bse_count);
> >     kfree(p);
> >  }
> >
> > @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
> >  }
> >
> >  static void
> > -bl_end_par_io_read(void *data)
> > +bl_end_par_io_read(void *data, int unused)
> >  {
> >     struct nfs_read_data *rdata = data;
> >
> > @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout
> *bl,
> >  {
> >     sector_t isect, end;
> >     struct pnfs_block_extent *be;
> > +   struct pnfs_block_short_extent *se;
> >
> >     dprintk("%s(%llu, %u)\n", __func__, offset, count);
> >     if (count == 0)
> > @@ -324,8 +327,11 @@ static void mark_extents_written(struct
> pnfs_block_layout *bl,
> >             be = bl_find_get_extent(bl, isect, NULL);
> >             BUG_ON(!be); /* FIXME */
> >             len = min(end, be->be_f_offset + be->be_length) - isect;
> > -           if (be->be_state == PNFS_BLOCK_INVALID_DATA)
> > -                   bl_mark_for_commit(be, isect, len); /* What if fails? */
> > +           if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> > +                   se = bl_pop_one_short_extent(be->be_inval);
> > +                   BUG_ON(!se);
> > +                   bl_mark_for_commit(be, isect, len, se);
> > +           }
> >             isect += len;
> >             bl_put_extent(be);
> >     }
> > @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
> >             end_page_writeback(page);
> >             page_cache_release(page);
> >     } while (bvec >= bio->bi_io_vec);
> > -   if (!uptodate) {
> > +
> > +   if (unlikely(!uptodate)) {
> >             if (!wdata->pnfs_error)
> >                     wdata->pnfs_error = -EIO;
> >             pnfs_set_lo_fail(wdata->lseg);
> > @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
> >     put_parallel(par);
> >  }
> >
> > -/* This is basically copied from mpage_end_io_read */
> >  static void bl_end_io_write(struct bio *bio, int err)
> >  {
> >     struct parallel_io *par = bio->bi_private;
> > @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
> >     dprintk("%s enter\n", __func__);
> >     task = container_of(work, struct rpc_task, u.tk_work);
> >     wdata = container_of(task, struct nfs_write_data, task);
> > -   if (!wdata->pnfs_error) {
> > +   if (likely(!wdata->pnfs_error)) {
> >             /* Marks for LAYOUTCOMMIT */
> >             mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> >                                  wdata->args.offset, wdata->args.count);
> > @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
> >  }
> >
> >  /* Called when last of bios associated with a bl_write_pagelist call finishes */
> > -static void bl_end_par_io_write(void *data)
> > +static void bl_end_par_io_write(void *data, int num_se)
> >  {
> >     struct nfs_write_data *wdata = data;
> >
> > +   if (unlikely(wdata->pnfs_error)) {
> > +           bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
> > +                                   num_se);
> > +   }
> > +
> >     wdata->task.tk_status = wdata->pnfs_error;
> >     wdata->verf.committed = NFS_FILE_SYNC;
> >     INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
> > @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
> >      */
> >     par = alloc_parallel(wdata);
> >     if (!par)
> > -           return PNFS_NOT_ATTEMPTED;
> > +           goto out_mds;
> >     par->pnfs_callback = bl_end_par_io_write;
> >     /* At this point, have to be more careful with error handling */
> >
> > @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int
> sync)
> >     be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
> >     if (!be || !is_writable(be, isect)) {
> >             dprintk("%s no matching extents!\n", __func__);
> > -           wdata->pnfs_error = -EINVAL;
> > -           goto out;
> > +           goto out_mds;
> >     }
> >
> >     /* First page inside INVALID extent */
> >     if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> > +           if (likely(!bl_push_one_short_extent(be->be_inval)))
> > +                   par->bse_count++;
> > +           else
> > +                   goto out_mds;
> >             temp = offset >> PAGE_CACHE_SHIFT;
> >             npg_zero = do_div(temp, npg_per_block);
> >             isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
> > @@ -598,6 +612,19 @@ fill_invalid_ext:
> >                             wdata->pnfs_error = ret;
> >                             goto out;
> >                     }
> > +                   if (likely(!bl_push_one_short_extent(be->be_inval)))
> > +                           par->bse_count++;
> > +                   else {
> > +                           end_page_writeback(page);
> > +                           page_cache_release(page);
> > +                           wdata->pnfs_error = -ENOMEM;
> > +                           goto out;
> > +                   }
> > +                   /* FIXME: This should be done in bi_end_io */
> > +                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> > +                                        page->index << PAGE_CACHE_SHIFT,
> > +                                        PAGE_CACHE_SIZE);
> > +
> >                     bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
> >                                              isect, page, be,
> >                                              bl_end_io_write_zero, par);
> > @@ -606,10 +633,6 @@ fill_invalid_ext:
> >                             bio = NULL;
> >                             goto out;
> >                     }
> > -                   /* FIXME: This should be done in bi_end_io */
> > -                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> > -                                        page->index << PAGE_CACHE_SHIFT,
> > -                                        PAGE_CACHE_SIZE);
> >  next_page:
> >                     isect += PAGE_CACHE_SECTORS;
> >                     extent_length -= PAGE_CACHE_SECTORS;
> > @@ -633,6 +656,15 @@ next_page:
> >                             wdata->pnfs_error = -EINVAL;
> >                             goto out;
> >                     }
> > +                   if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> > +                           if (likely(!bl_push_one_short_extent(
> > +                                                           be->be_inval)))
> > +                                   par->bse_count++;
> > +                           else {
> > +                                   wdata->pnfs_error = -ENOMEM;
> > +                                   goto out;
> > +                           }
> > +                   }
> >                     extent_length = be->be_length -
> >                         (isect - be->be_f_offset);
> >             }
> > @@ -680,6 +712,10 @@ out:
> >     bl_submit_bio(WRITE, bio);
> >     put_parallel(par);
> >     return PNFS_ATTEMPTED;
> > +out_mds:
> > +   bl_put_extent(be);
> > +   kfree(par);
> > +   return PNFS_NOT_ATTEMPTED;
> >  }
> >
> >  /* FIXME - range ignored */
> > @@ -706,11 +742,17 @@ static void
> >  release_inval_marks(struct pnfs_inval_markings *marks)
> >  {
> >     struct pnfs_inval_tracking *pos, *temp;
> > +   struct pnfs_block_short_extent *se, *stemp;
> >
> >     list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
> >             list_del(&pos->it_link);
> >             kfree(pos);
> >     }
> > +
> > +   list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
> > +           list_del(&se->bse_node);
> > +           kfree(se);
> > +   }
> >     return;
> >  }
> >
> > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> > index 60728ac..e31a2df 100644
> > --- a/fs/nfs/blocklayout/blocklayout.h
> > +++ b/fs/nfs/blocklayout/blocklayout.h
> > @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
> >     spinlock_t      im_lock;
> >     struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
> >     sector_t        im_block_size;  /* Server blocksize in sectors */
> > +   struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
> >  };
> >
> >  struct pnfs_inval_tracking {
> > @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings
> *marks, sector_t blocksize)
> >  {
> >     spin_lock_init(&marks->im_lock);
> >     INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
> > +   INIT_LIST_HEAD(&marks->im_extents);
> >     marks->im_block_size = blocksize;
> >     marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
> >                                        blocksize);
> > @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct
> pnfs_block_layout *bl,
> >  int bl_add_merge_extent(struct pnfs_block_layout *bl,
> >                      struct pnfs_block_extent *new);
> >  int bl_mark_for_commit(struct pnfs_block_extent *be,
> > -                   sector_t offset, sector_t length);
> > +                   sector_t offset, sector_t length,
> > +                   struct pnfs_block_short_extent *new);
> > +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
> > +struct pnfs_block_short_extent *
> > +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
> > +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
> >
> >  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> > diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
> > index d0f52ed..db66e68 100644
> > --- a/fs/nfs/blocklayout/extents.c
> > +++ b/fs/nfs/blocklayout/extents.c
> > @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout
> *bl,
> >
> >  /* Note the range described by offset, length is guaranteed to be contained
> >   * within be.
> > + * new will be freed, either by this function or add_to_commitlist if they
> > + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
> >   */
> >  int bl_mark_for_commit(struct pnfs_block_extent *be,
> > -               sector_t offset, sector_t length)
> > +               sector_t offset, sector_t length,
> > +               struct pnfs_block_short_extent *new)
> >  {
> >     sector_t new_end, end = offset + length;
> > -   struct pnfs_block_short_extent *new;
> >     struct pnfs_block_layout *bl = container_of(be->be_inval,
> >                                                 struct pnfs_block_layout,
> >                                                 bl_inval);
> >
> > -   new = kmalloc(sizeof(*new), GFP_NOFS);
> > -   if (!new)
> > -           return -ENOMEM;
> > -
> >     mark_written_sectors(be->be_inval, offset, length);
> >     /* We want to add the range to commit list, but it must be
> >      * block-normalized, and verified that the normalized range has
> > @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
> >     new->bse_mdev = be->be_mdev;
> >
> >     spin_lock(&bl->bl_ext_lock);
> > -   /* new will be freed, either by add_to_commitlist if it decides not
> > -    * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
> > -    */
> >     add_to_commitlist(bl, new);
> >     spin_unlock(&bl->bl_ext_lock);
> >     return 0;
> > @@ -862,3 +857,52 @@ clean_pnfs_block_layoutupdate(struct
> pnfs_block_layout *bl,
> >             }
> >     }
> >  }
> > +
> > +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
> > +{
> > +   struct pnfs_block_short_extent *new;
> > +
> > +   new = kmalloc(sizeof(*new), GFP_NOFS);
> > +   if (unlikely(!new))
> > +           return -ENOMEM;
> > +
> > +   spin_lock(&marks->im_lock);
> > +   list_add(&new->bse_node, &marks->im_extents);
> > +   spin_unlock(&marks->im_lock);
> > +
> > +   return 0;
> > +}
> > +
> > +struct pnfs_block_short_extent *
> > +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
> > +{
> > +   struct pnfs_block_short_extent *rv = NULL;
> > +
> > +   spin_lock(&marks->im_lock);
> > +   if (!list_empty(&marks->im_extents)) {
> > +           rv = list_entry((&marks->im_extents)->next,
> > +                           struct pnfs_block_short_extent, bse_node);
> > +           list_del_init(&rv->bse_node);
> > +   }
> > +   spin_unlock(&marks->im_lock);
> > +
> > +   return rv;
> > +}
> > +
> > +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
> > +{
> > +   struct pnfs_block_short_extent *se = NULL, *tmp;
> > +
> > +   BUG_ON(num_to_free <= 0);
>
> Why is it a bug if num_to_free is <= 0?  Couldn't you do:
>
> if (num_to_free <= 0)
>       return;
>
> instead?
It used to be a sanity check but was changed to BUG_ON in the second version.
Yeah, after giving it a second thought, I think it necessary to keep the sanity check. There are cases that num_to_free can be zero. E.g., a write_pagelist touches no INVALID extents, and one bio fails with disk failure. Then we will come to calling bl_free_short_extents() with num_to_free == 0. So the BUG_ON is really a bug itself... I will change it back to a normal sanity check.

Thanks,
Tao

>From d4f49b042411da140c95559895c57a00a1920fdd Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Thu, 10 Nov 2011 03:57:00 -0500
Subject: [PATCH 7/7] pnfsblock: alloc short extent before submit bio

As discussed earlier, it is better for block client to allocate memory for
tracking extents state before submitting bio. So the patch does it by allocating
a short_extent for every INVALID extent touched by write pagelist and for
every zeroing page we created, saving them in layout header. Then in end_io we
can just use them to create commit list items and avoid memory allocation there.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |   74 +++++++++++++++++++++++++++++--------
 fs/nfs/blocklayout/blocklayout.h |    9 ++++-
 fs/nfs/blocklayout/extents.c     |   63 +++++++++++++++++++++++++++-----
 3 files changed, 120 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7fc69c9..1dd2983 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
        struct kref refcnt;
-       void (*pnfs_callback) (void *data);
+       void (*pnfs_callback) (void *data, int num_se);
        void *data;
+       int bse_count;
 };

 static inline struct parallel_io *alloc_parallel(void *data)
@@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
        if (rv) {
                rv->data = data;
                kref_init(&rv->refcnt);
+               rv->bse_count = 0;
        }
        return rv;
 }
@@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
        struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

        dprintk("%s enter\n", __func__);
-       p->pnfs_callback(p->data);
+       p->pnfs_callback(p->data, p->bse_count);
        kfree(p);
 }

@@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }

 static void
-bl_end_par_io_read(void *data)
+bl_end_par_io_read(void *data, int unused)
 {
        struct nfs_read_data *rdata = data;

@@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 {
        sector_t isect, end;
        struct pnfs_block_extent *be;
+       struct pnfs_block_short_extent *se;

        dprintk("%s(%llu, %u)\n", __func__, offset, count);
        if (count == 0)
@@ -324,8 +327,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
                be = bl_find_get_extent(bl, isect, NULL);
                BUG_ON(!be); /* FIXME */
                len = min(end, be->be_f_offset + be->be_length) - isect;
-               if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-                       bl_mark_for_commit(be, isect, len); /* What if fails? */
+               if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                       se = bl_pop_one_short_extent(be->be_inval);
+                       BUG_ON(!se);
+                       bl_mark_for_commit(be, isect, len, se);
+               }
                isect += len;
                bl_put_extent(be);
        }
@@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
                end_page_writeback(page);
                page_cache_release(page);
        } while (bvec >= bio->bi_io_vec);
-       if (!uptodate) {
+
+       if (unlikely(!uptodate)) {
                if (!wdata->pnfs_error)
                        wdata->pnfs_error = -EIO;
                pnfs_set_lo_fail(wdata->lseg);
@@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
        put_parallel(par);
 }

-/* This is basically copied from mpage_end_io_read */
 static void bl_end_io_write(struct bio *bio, int err)
 {
        struct parallel_io *par = bio->bi_private;
@@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
        wdata = container_of(task, struct nfs_write_data, task);
-       if (!wdata->pnfs_error) {
+       if (likely(!wdata->pnfs_error)) {
                /* Marks for LAYOUTCOMMIT */
                mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
                                     wdata->args.offset, wdata->args.count);
@@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
 }

 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data, int num_se)
 {
        struct nfs_write_data *wdata = data;

+       if (unlikely(wdata->pnfs_error)) {
+               bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+                                       num_se);
+       }
+
        wdata->task.tk_status = wdata->pnfs_error;
        wdata->verf.committed = NFS_FILE_SYNC;
        INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
@@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
         */
        par = alloc_parallel(wdata);
        if (!par)
-               return PNFS_NOT_ATTEMPTED;
+               goto out_mds;
        par->pnfs_callback = bl_end_par_io_write;
        /* At this point, have to be more careful with error handling */

@@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
        be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
        if (!be || !is_writable(be, isect)) {
                dprintk("%s no matching extents!\n", __func__);
-               wdata->pnfs_error = -EINVAL;
-               goto out;
+               goto out_mds;
        }

        /* First page inside INVALID extent */
        if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+               if (likely(!bl_push_one_short_extent(be->be_inval)))
+                       par->bse_count++;
+               else
+                       goto out_mds;
                temp = offset >> PAGE_CACHE_SHIFT;
                npg_zero = do_div(temp, npg_per_block);
                isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -598,6 +612,19 @@ fill_invalid_ext:
                                wdata->pnfs_error = ret;
                                goto out;
                        }
+                       if (likely(!bl_push_one_short_extent(be->be_inval)))
+                               par->bse_count++;
+                       else {
+                               end_page_writeback(page);
+                               page_cache_release(page);
+                               wdata->pnfs_error = -ENOMEM;
+                               goto out;
+                       }
+                       /* FIXME: This should be done in bi_end_io */
+                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+                                            page->index << PAGE_CACHE_SHIFT,
+                                            PAGE_CACHE_SIZE);
+
                        bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
                                                 isect, page, be,
                                                 bl_end_io_write_zero, par);
@@ -606,10 +633,6 @@ fill_invalid_ext:
                                bio = NULL;
                                goto out;
                        }
-                       /* FIXME: This should be done in bi_end_io */
-                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
-                                            page->index << PAGE_CACHE_SHIFT,
-                                            PAGE_CACHE_SIZE);
 next_page:
                        isect += PAGE_CACHE_SECTORS;
                        extent_length -= PAGE_CACHE_SECTORS;
@@ -633,6 +656,15 @@ next_page:
                                wdata->pnfs_error = -EINVAL;
                                goto out;
                        }
+                       if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                               if (likely(!bl_push_one_short_extent(
+                                                               be->be_inval)))
+                                       par->bse_count++;
+                               else {
+                                       wdata->pnfs_error = -ENOMEM;
+                                       goto out;
+                               }
+                       }
                        extent_length = be->be_length -
                            (isect - be->be_f_offset);
                }
@@ -680,6 +712,10 @@ out:
        bl_submit_bio(WRITE, bio);
        put_parallel(par);
        return PNFS_ATTEMPTED;
+out_mds:
+       bl_put_extent(be);
+       kfree(par);
+       return PNFS_NOT_ATTEMPTED;
 }

 /* FIXME - range ignored */
@@ -706,11 +742,17 @@ static void
 release_inval_marks(struct pnfs_inval_markings *marks)
 {
        struct pnfs_inval_tracking *pos, *temp;
+       struct pnfs_block_short_extent *se, *stemp;

        list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
                list_del(&pos->it_link);
                kfree(pos);
        }
+
+       list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+               list_del(&se->bse_node);
+               kfree(se);
+       }
        return;
 }

diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 60728ac..e31a2df 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
        spinlock_t      im_lock;
        struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
        sector_t        im_block_size;  /* Server blocksize in sectors */
+       struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
 };

 struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
 {
        spin_lock_init(&marks->im_lock);
        INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+       INIT_LIST_HEAD(&marks->im_extents);
        marks->im_block_size = blocksize;
        marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
                                           blocksize);
@@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 int bl_add_merge_extent(struct pnfs_block_layout *bl,
                         struct pnfs_block_extent *new);
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-                       sector_t offset, sector_t length);
+                       sector_t offset, sector_t length,
+                       struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);

 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index d0f52ed..6ce5a8d 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,

 /* Note the range described by offset, length is guaranteed to be contained
  * within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
  */
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-                   sector_t offset, sector_t length)
+                   sector_t offset, sector_t length,
+                   struct pnfs_block_short_extent *new)
 {
        sector_t new_end, end = offset + length;
-       struct pnfs_block_short_extent *new;
        struct pnfs_block_layout *bl = container_of(be->be_inval,
                                                    struct pnfs_block_layout,
                                                    bl_inval);

-       new = kmalloc(sizeof(*new), GFP_NOFS);
-       if (!new)
-               return -ENOMEM;
-
        mark_written_sectors(be->be_inval, offset, length);
        /* We want to add the range to commit list, but it must be
         * block-normalized, and verified that the normalized range has
@@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
        new->bse_mdev = be->be_mdev;

        spin_lock(&bl->bl_ext_lock);
-       /* new will be freed, either by add_to_commitlist if it decides not
-        * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
-        */
        add_to_commitlist(bl, new);
        spin_unlock(&bl->bl_ext_lock);
        return 0;
@@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
                }
        }
 }
+
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+       struct pnfs_block_short_extent *new;
+
+       new = kmalloc(sizeof(*new), GFP_NOFS);
+       if (unlikely(!new))
+               return -ENOMEM;
+
+       spin_lock(&marks->im_lock);
+       list_add(&new->bse_node, &marks->im_extents);
+       spin_unlock(&marks->im_lock);
+
+       return 0;
+}
+
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+       struct pnfs_block_short_extent *rv = NULL;
+
+       spin_lock(&marks->im_lock);
+       if (!list_empty(&marks->im_extents)) {
+               rv = list_entry((&marks->im_extents)->next,
+                               struct pnfs_block_short_extent, bse_node);
+               list_del_init(&rv->bse_node);
+       }
+       spin_unlock(&marks->im_lock);
+
+       return rv;
+}
+
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+       struct pnfs_block_short_extent *se = NULL, *tmp;
+
+       if (num_to_free <= 0)
+               return;
+
+       spin_lock(&marks->im_lock);
+       list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+               list_del(&se->bse_node);
+               kfree(se);
+               if (--num_to_free == 0)
+                       break;
+       }
+       spin_unlock(&marks->im_lock);
+
+       BUG_ON(num_to_free > 0);
+}
--
1.7.4.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
  2011-11-11  2:05     ` tao.peng
@ 2011-11-14 10:26       ` Benny Halevy
  2011-11-14 14:18         ` Peng Tao
  0 siblings, 1 reply; 13+ messages in thread
From: Benny Halevy @ 2011-11-14 10:26 UTC (permalink / raw)
  To: tao.peng; +Cc: bjschuma, bergwolf, linux-nfs, Trond.Myklebust

Tao, The patch below doesn't apply.
Can you please re-send a clean patch, with tab indents (vs. spaces).

Thanks,

Benny

On 2011-11-11 04:05, tao.peng@emc.com wrote:
> Hi, Bryan,
> 
>> -----Original Message-----
>> From: Bryan Schumaker [mailto:bjschuma@netapp.com]
>> Sent: Friday, November 11, 2011 12:44 AM
>> To: Peng Tao
>> Cc: linux-nfs@vger.kernel.org; Trond.Myklebust@netapp.com; bhalevy@tonian.com;
>> Peng, Tao
>> Subject: Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
>>
>> On 11/10/2011 10:25 AM, Peng Tao wrote:
>>> As discussed earlier, it is better for block client to allocate memory for
>>> tracking extents state before submitting bio. So the patch does it by allocating
>>> a short_extent for every INVALID extent touched by write pagelist and for
>>> every zeroing page we created, saving them in layout header. Then in end_io we
>>> can just use them to create commit list items and avoid memory allocation there.
>>>
>>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>>> ---
>>>  fs/nfs/blocklayout/blocklayout.c |   74
>> +++++++++++++++++++++++++++++--------
>>>  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
>>>  fs/nfs/blocklayout/extents.c     |   62 +++++++++++++++++++++++++++-----
>>>  3 files changed, 119 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>>> index 3eaea2b..3fdfb29 100644
>>> --- a/fs/nfs/blocklayout/blocklayout.c
>>> +++ b/fs/nfs/blocklayout/blocklayout.c
>>> @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t
>> isect)
>>>   */
>>>  struct parallel_io {
>>>     struct kref refcnt;
>>> -   void (*pnfs_callback) (void *data);
>>> +   void (*pnfs_callback) (void *data, int num_se);
>>>     void *data;
>>> +   int bse_count;
>>>  };
>>>
>>>  static inline struct parallel_io *alloc_parallel(void *data)
>>> @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
>>>     if (rv) {
>>>             rv->data = data;
>>>             kref_init(&rv->refcnt);
>>> +           rv->bse_count = 0;
>>>     }
>>>     return rv;
>>>  }
>>> @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
>>>     struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
>>>
>>>     dprintk("%s enter\n", __func__);
>>> -   p->pnfs_callback(p->data);
>>> +   p->pnfs_callback(p->data, p->bse_count);
>>>     kfree(p);
>>>  }
>>>
>>> @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
>>>  }
>>>
>>>  static void
>>> -bl_end_par_io_read(void *data)
>>> +bl_end_par_io_read(void *data, int unused)
>>>  {
>>>     struct nfs_read_data *rdata = data;
>>>
>>> @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout
>> *bl,
>>>  {
>>>     sector_t isect, end;
>>>     struct pnfs_block_extent *be;
>>> +   struct pnfs_block_short_extent *se;
>>>
>>>     dprintk("%s(%llu, %u)\n", __func__, offset, count);
>>>     if (count == 0)
>>> @@ -324,8 +327,11 @@ static void mark_extents_written(struct
>> pnfs_block_layout *bl,
>>>             be = bl_find_get_extent(bl, isect, NULL);
>>>             BUG_ON(!be); /* FIXME */
>>>             len = min(end, be->be_f_offset + be->be_length) - isect;
>>> -           if (be->be_state == PNFS_BLOCK_INVALID_DATA)
>>> -                   bl_mark_for_commit(be, isect, len); /* What if fails? */
>>> +           if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>> +                   se = bl_pop_one_short_extent(be->be_inval);
>>> +                   BUG_ON(!se);
>>> +                   bl_mark_for_commit(be, isect, len, se);
>>> +           }
>>>             isect += len;
>>>             bl_put_extent(be);
>>>     }
>>> @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>>             end_page_writeback(page);
>>>             page_cache_release(page);
>>>     } while (bvec >= bio->bi_io_vec);
>>> -   if (!uptodate) {
>>> +
>>> +   if (unlikely(!uptodate)) {
>>>             if (!wdata->pnfs_error)
>>>                     wdata->pnfs_error = -EIO;
>>>             pnfs_set_lo_fail(wdata->lseg);
>>> @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>>     put_parallel(par);
>>>  }
>>>
>>> -/* This is basically copied from mpage_end_io_read */
>>>  static void bl_end_io_write(struct bio *bio, int err)
>>>  {
>>>     struct parallel_io *par = bio->bi_private;
>>> @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
>>>     dprintk("%s enter\n", __func__);
>>>     task = container_of(work, struct rpc_task, u.tk_work);
>>>     wdata = container_of(task, struct nfs_write_data, task);
>>> -   if (!wdata->pnfs_error) {
>>> +   if (likely(!wdata->pnfs_error)) {
>>>             /* Marks for LAYOUTCOMMIT */
>>>             mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>>                                  wdata->args.offset, wdata->args.count);
>>> @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
>>>  }
>>>
>>>  /* Called when last of bios associated with a bl_write_pagelist call finishes */
>>> -static void bl_end_par_io_write(void *data)
>>> +static void bl_end_par_io_write(void *data, int num_se)
>>>  {
>>>     struct nfs_write_data *wdata = data;
>>>
>>> +   if (unlikely(wdata->pnfs_error)) {
>>> +           bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
>>> +                                   num_se);
>>> +   }
>>> +
>>>     wdata->task.tk_status = wdata->pnfs_error;
>>>     wdata->verf.committed = NFS_FILE_SYNC;
>>>     INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
>>> @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>>>      */
>>>     par = alloc_parallel(wdata);
>>>     if (!par)
>>> -           return PNFS_NOT_ATTEMPTED;
>>> +           goto out_mds;
>>>     par->pnfs_callback = bl_end_par_io_write;
>>>     /* At this point, have to be more careful with error handling */
>>>
>>> @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int
>> sync)
>>>     be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
>>>     if (!be || !is_writable(be, isect)) {
>>>             dprintk("%s no matching extents!\n", __func__);
>>> -           wdata->pnfs_error = -EINVAL;
>>> -           goto out;
>>> +           goto out_mds;
>>>     }
>>>
>>>     /* First page inside INVALID extent */
>>>     if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>> +           if (likely(!bl_push_one_short_extent(be->be_inval)))
>>> +                   par->bse_count++;
>>> +           else
>>> +                   goto out_mds;
>>>             temp = offset >> PAGE_CACHE_SHIFT;
>>>             npg_zero = do_div(temp, npg_per_block);
>>>             isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
>>> @@ -598,6 +612,19 @@ fill_invalid_ext:
>>>                             wdata->pnfs_error = ret;
>>>                             goto out;
>>>                     }
>>> +                   if (likely(!bl_push_one_short_extent(be->be_inval)))
>>> +                           par->bse_count++;
>>> +                   else {
>>> +                           end_page_writeback(page);
>>> +                           page_cache_release(page);
>>> +                           wdata->pnfs_error = -ENOMEM;
>>> +                           goto out;
>>> +                   }
>>> +                   /* FIXME: This should be done in bi_end_io */
>>> +                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>> +                                        page->index << PAGE_CACHE_SHIFT,
>>> +                                        PAGE_CACHE_SIZE);
>>> +
>>>                     bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
>>>                                              isect, page, be,
>>>                                              bl_end_io_write_zero, par);
>>> @@ -606,10 +633,6 @@ fill_invalid_ext:
>>>                             bio = NULL;
>>>                             goto out;
>>>                     }
>>> -                   /* FIXME: This should be done in bi_end_io */
>>> -                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>> -                                        page->index << PAGE_CACHE_SHIFT,
>>> -                                        PAGE_CACHE_SIZE);
>>>  next_page:
>>>                     isect += PAGE_CACHE_SECTORS;
>>>                     extent_length -= PAGE_CACHE_SECTORS;
>>> @@ -633,6 +656,15 @@ next_page:
>>>                             wdata->pnfs_error = -EINVAL;
>>>                             goto out;
>>>                     }
>>> +                   if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>> +                           if (likely(!bl_push_one_short_extent(
>>> +                                                           be->be_inval)))
>>> +                                   par->bse_count++;
>>> +                           else {
>>> +                                   wdata->pnfs_error = -ENOMEM;
>>> +                                   goto out;
>>> +                           }
>>> +                   }
>>>                     extent_length = be->be_length -
>>>                         (isect - be->be_f_offset);
>>>             }
>>> @@ -680,6 +712,10 @@ out:
>>>     bl_submit_bio(WRITE, bio);
>>>     put_parallel(par);
>>>     return PNFS_ATTEMPTED;
>>> +out_mds:
>>> +   bl_put_extent(be);
>>> +   kfree(par);
>>> +   return PNFS_NOT_ATTEMPTED;
>>>  }
>>>
>>>  /* FIXME - range ignored */
>>> @@ -706,11 +742,17 @@ static void
>>>  release_inval_marks(struct pnfs_inval_markings *marks)
>>>  {
>>>     struct pnfs_inval_tracking *pos, *temp;
>>> +   struct pnfs_block_short_extent *se, *stemp;
>>>
>>>     list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
>>>             list_del(&pos->it_link);
>>>             kfree(pos);
>>>     }
>>> +
>>> +   list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
>>> +           list_del(&se->bse_node);
>>> +           kfree(se);
>>> +   }
>>>     return;
>>>  }
>>>
>>> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
>>> index 60728ac..e31a2df 100644
>>> --- a/fs/nfs/blocklayout/blocklayout.h
>>> +++ b/fs/nfs/blocklayout/blocklayout.h
>>> @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
>>>     spinlock_t      im_lock;
>>>     struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
>>>     sector_t        im_block_size;  /* Server blocksize in sectors */
>>> +   struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
>>>  };
>>>
>>>  struct pnfs_inval_tracking {
>>> @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings
>> *marks, sector_t blocksize)
>>>  {
>>>     spin_lock_init(&marks->im_lock);
>>>     INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
>>> +   INIT_LIST_HEAD(&marks->im_extents);
>>>     marks->im_block_size = blocksize;
>>>     marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
>>>                                        blocksize);
>>> @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct
>> pnfs_block_layout *bl,
>>>  int bl_add_merge_extent(struct pnfs_block_layout *bl,
>>>                      struct pnfs_block_extent *new);
>>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>>> -                   sector_t offset, sector_t length);
>>> +                   sector_t offset, sector_t length,
>>> +                   struct pnfs_block_short_extent *new);
>>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
>>> +struct pnfs_block_short_extent *
>>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
>>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
>>>
>>>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
>>> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
>>> index d0f52ed..db66e68 100644
>>> --- a/fs/nfs/blocklayout/extents.c
>>> +++ b/fs/nfs/blocklayout/extents.c
>>> @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout
>> *bl,
>>>
>>>  /* Note the range described by offset, length is guaranteed to be contained
>>>   * within be.
>>> + * new will be freed, either by this function or add_to_commitlist if they
>>> + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>>>   */
>>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>>> -               sector_t offset, sector_t length)
>>> +               sector_t offset, sector_t length,
>>> +               struct pnfs_block_short_extent *new)
>>>  {
>>>     sector_t new_end, end = offset + length;
>>> -   struct pnfs_block_short_extent *new;
>>>     struct pnfs_block_layout *bl = container_of(be->be_inval,
>>>                                                 struct pnfs_block_layout,
>>>                                                 bl_inval);
>>>
>>> -   new = kmalloc(sizeof(*new), GFP_NOFS);
>>> -   if (!new)
>>> -           return -ENOMEM;
>>> -
>>>     mark_written_sectors(be->be_inval, offset, length);
>>>     /* We want to add the range to commit list, but it must be
>>>      * block-normalized, and verified that the normalized range has
>>> @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
>>>     new->bse_mdev = be->be_mdev;
>>>
>>>     spin_lock(&bl->bl_ext_lock);
>>> -   /* new will be freed, either by add_to_commitlist if it decides not
>>> -    * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>>> -    */
>>>     add_to_commitlist(bl, new);
>>>     spin_unlock(&bl->bl_ext_lock);
>>>     return 0;
>>> @@ -862,3 +857,52 @@ clean_pnfs_block_layoutupdate(struct
>> pnfs_block_layout *bl,
>>>             }
>>>     }
>>>  }
>>> +
>>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
>>> +{
>>> +   struct pnfs_block_short_extent *new;
>>> +
>>> +   new = kmalloc(sizeof(*new), GFP_NOFS);
>>> +   if (unlikely(!new))
>>> +           return -ENOMEM;
>>> +
>>> +   spin_lock(&marks->im_lock);
>>> +   list_add(&new->bse_node, &marks->im_extents);
>>> +   spin_unlock(&marks->im_lock);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +struct pnfs_block_short_extent *
>>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
>>> +{
>>> +   struct pnfs_block_short_extent *rv = NULL;
>>> +
>>> +   spin_lock(&marks->im_lock);
>>> +   if (!list_empty(&marks->im_extents)) {
>>> +           rv = list_entry((&marks->im_extents)->next,
>>> +                           struct pnfs_block_short_extent, bse_node);
>>> +           list_del_init(&rv->bse_node);
>>> +   }
>>> +   spin_unlock(&marks->im_lock);
>>> +
>>> +   return rv;
>>> +}
>>> +
>>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
>>> +{
>>> +   struct pnfs_block_short_extent *se = NULL, *tmp;
>>> +
>>> +   BUG_ON(num_to_free <= 0);
>>
>> Why is it a bug if num_to_free is <= 0?  Couldn't you do:
>>
>> if (num_to_free <= 0)
>>       return;
>>
>> instead?
> It used to be a sanity check but was changed to BUG_ON in the second version.
> Yeah, after giving it a second thought, I think it necessary to keep the sanity check. There are cases that num_to_free can be zero. E.g., a write_pagelist touches no INVALID extents, and one bio fails with disk failure. Then we will come to calling bl_free_short_extents() with num_to_free == 0. So the BUG_ON is really a bug itself... I will change it back to a normal sanity check.
> 
> Thanks,
> Tao
> 
> From d4f49b042411da140c95559895c57a00a1920fdd Mon Sep 17 00:00:00 2001
> From: Peng Tao <bergwolf@gmail.com>
> Date: Thu, 10 Nov 2011 03:57:00 -0500
> Subject: [PATCH 7/7] pnfsblock: alloc short extent before submit bio
> 
> As discussed earlier, it is better for block client to allocate memory for
> tracking extents state before submitting bio. So the patch does it by allocating
> a short_extent for every INVALID extent touched by write pagelist and for
> every zeroing page we created, saving them in layout header. Then in end_io we
> can just use them to create commit list items and avoid memory allocation there.
> 
> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/blocklayout/blocklayout.c |   74 +++++++++++++++++++++++++++++--------
>  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
>  fs/nfs/blocklayout/extents.c     |   63 +++++++++++++++++++++++++++-----
>  3 files changed, 120 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 7fc69c9..1dd2983 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
>   */
>  struct parallel_io {
>         struct kref refcnt;
> -       void (*pnfs_callback) (void *data);
> +       void (*pnfs_callback) (void *data, int num_se);
>         void *data;
> +       int bse_count;
>  };
> 
>  static inline struct parallel_io *alloc_parallel(void *data)
> @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
>         if (rv) {
>                 rv->data = data;
>                 kref_init(&rv->refcnt);
> +               rv->bse_count = 0;
>         }
>         return rv;
>  }
> @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
>         struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
> 
>         dprintk("%s enter\n", __func__);
> -       p->pnfs_callback(p->data);
> +       p->pnfs_callback(p->data, p->bse_count);
>         kfree(p);
>  }
> 
> @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
>  }
> 
>  static void
> -bl_end_par_io_read(void *data)
> +bl_end_par_io_read(void *data, int unused)
>  {
>         struct nfs_read_data *rdata = data;
> 
> @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>  {
>         sector_t isect, end;
>         struct pnfs_block_extent *be;
> +       struct pnfs_block_short_extent *se;
> 
>         dprintk("%s(%llu, %u)\n", __func__, offset, count);
>         if (count == 0)
> @@ -324,8 +327,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>                 be = bl_find_get_extent(bl, isect, NULL);
>                 BUG_ON(!be); /* FIXME */
>                 len = min(end, be->be_f_offset + be->be_length) - isect;
> -               if (be->be_state == PNFS_BLOCK_INVALID_DATA)
> -                       bl_mark_for_commit(be, isect, len); /* What if fails? */
> +               if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +                       se = bl_pop_one_short_extent(be->be_inval);
> +                       BUG_ON(!se);
> +                       bl_mark_for_commit(be, isect, len, se);
> +               }
>                 isect += len;
>                 bl_put_extent(be);
>         }
> @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>                 end_page_writeback(page);
>                 page_cache_release(page);
>         } while (bvec >= bio->bi_io_vec);
> -       if (!uptodate) {
> +
> +       if (unlikely(!uptodate)) {
>                 if (!wdata->pnfs_error)
>                         wdata->pnfs_error = -EIO;
>                 pnfs_set_lo_fail(wdata->lseg);
> @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>         put_parallel(par);
>  }
> 
> -/* This is basically copied from mpage_end_io_read */
>  static void bl_end_io_write(struct bio *bio, int err)
>  {
>         struct parallel_io *par = bio->bi_private;
> @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
>         dprintk("%s enter\n", __func__);
>         task = container_of(work, struct rpc_task, u.tk_work);
>         wdata = container_of(task, struct nfs_write_data, task);
> -       if (!wdata->pnfs_error) {
> +       if (likely(!wdata->pnfs_error)) {
>                 /* Marks for LAYOUTCOMMIT */
>                 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>                                      wdata->args.offset, wdata->args.count);
> @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
>  }
> 
>  /* Called when last of bios associated with a bl_write_pagelist call finishes */
> -static void bl_end_par_io_write(void *data)
> +static void bl_end_par_io_write(void *data, int num_se)
>  {
>         struct nfs_write_data *wdata = data;
> 
> +       if (unlikely(wdata->pnfs_error)) {
> +               bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
> +                                       num_se);
> +       }
> +
>         wdata->task.tk_status = wdata->pnfs_error;
>         wdata->verf.committed = NFS_FILE_SYNC;
>         INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
> @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>          */
>         par = alloc_parallel(wdata);
>         if (!par)
> -               return PNFS_NOT_ATTEMPTED;
> +               goto out_mds;
>         par->pnfs_callback = bl_end_par_io_write;
>         /* At this point, have to be more careful with error handling */
> 
> @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>         be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
>         if (!be || !is_writable(be, isect)) {
>                 dprintk("%s no matching extents!\n", __func__);
> -               wdata->pnfs_error = -EINVAL;
> -               goto out;
> +               goto out_mds;
>         }
> 
>         /* First page inside INVALID extent */
>         if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +               if (likely(!bl_push_one_short_extent(be->be_inval)))
> +                       par->bse_count++;
> +               else
> +                       goto out_mds;
>                 temp = offset >> PAGE_CACHE_SHIFT;
>                 npg_zero = do_div(temp, npg_per_block);
>                 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
> @@ -598,6 +612,19 @@ fill_invalid_ext:
>                                 wdata->pnfs_error = ret;
>                                 goto out;
>                         }
> +                       if (likely(!bl_push_one_short_extent(be->be_inval)))
> +                               par->bse_count++;
> +                       else {
> +                               end_page_writeback(page);
> +                               page_cache_release(page);
> +                               wdata->pnfs_error = -ENOMEM;
> +                               goto out;
> +                       }
> +                       /* FIXME: This should be done in bi_end_io */
> +                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> +                                            page->index << PAGE_CACHE_SHIFT,
> +                                            PAGE_CACHE_SIZE);
> +
>                         bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
>                                                  isect, page, be,
>                                                  bl_end_io_write_zero, par);
> @@ -606,10 +633,6 @@ fill_invalid_ext:
>                                 bio = NULL;
>                                 goto out;
>                         }
> -                       /* FIXME: This should be done in bi_end_io */
> -                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
> -                                            page->index << PAGE_CACHE_SHIFT,
> -                                            PAGE_CACHE_SIZE);
>  next_page:
>                         isect += PAGE_CACHE_SECTORS;
>                         extent_length -= PAGE_CACHE_SECTORS;
> @@ -633,6 +656,15 @@ next_page:
>                                 wdata->pnfs_error = -EINVAL;
>                                 goto out;
>                         }
> +                       if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
> +                               if (likely(!bl_push_one_short_extent(
> +                                                               be->be_inval)))
> +                                       par->bse_count++;
> +                               else {
> +                                       wdata->pnfs_error = -ENOMEM;
> +                                       goto out;
> +                               }
> +                       }
>                         extent_length = be->be_length -
>                             (isect - be->be_f_offset);
>                 }
> @@ -680,6 +712,10 @@ out:
>         bl_submit_bio(WRITE, bio);
>         put_parallel(par);
>         return PNFS_ATTEMPTED;
> +out_mds:
> +       bl_put_extent(be);
> +       kfree(par);
> +       return PNFS_NOT_ATTEMPTED;
>  }
> 
>  /* FIXME - range ignored */
> @@ -706,11 +742,17 @@ static void
>  release_inval_marks(struct pnfs_inval_markings *marks)
>  {
>         struct pnfs_inval_tracking *pos, *temp;
> +       struct pnfs_block_short_extent *se, *stemp;
> 
>         list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
>                 list_del(&pos->it_link);
>                 kfree(pos);
>         }
> +
> +       list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
> +               list_del(&se->bse_node);
> +               kfree(se);
> +       }
>         return;
>  }
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index 60728ac..e31a2df 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
>         spinlock_t      im_lock;
>         struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
>         sector_t        im_block_size;  /* Server blocksize in sectors */
> +       struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
>  };
> 
>  struct pnfs_inval_tracking {
> @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
>  {
>         spin_lock_init(&marks->im_lock);
>         INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
> +       INIT_LIST_HEAD(&marks->im_extents);
>         marks->im_block_size = blocksize;
>         marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
>                                            blocksize);
> @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>  int bl_add_merge_extent(struct pnfs_block_layout *bl,
>                          struct pnfs_block_extent *new);
>  int bl_mark_for_commit(struct pnfs_block_extent *be,
> -                       sector_t offset, sector_t length);
> +                       sector_t offset, sector_t length,
> +                       struct pnfs_block_short_extent *new);
> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
> +struct pnfs_block_short_extent *
> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
> 
>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
> index d0f52ed..6ce5a8d 100644
> --- a/fs/nfs/blocklayout/extents.c
> +++ b/fs/nfs/blocklayout/extents.c
> @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
> 
>  /* Note the range described by offset, length is guaranteed to be contained
>   * within be.
> + * new will be freed, either by this function or add_to_commitlist if they
> + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>   */
>  int bl_mark_for_commit(struct pnfs_block_extent *be,
> -                   sector_t offset, sector_t length)
> +                   sector_t offset, sector_t length,
> +                   struct pnfs_block_short_extent *new)
>  {
>         sector_t new_end, end = offset + length;
> -       struct pnfs_block_short_extent *new;
>         struct pnfs_block_layout *bl = container_of(be->be_inval,
>                                                     struct pnfs_block_layout,
>                                                     bl_inval);
> 
> -       new = kmalloc(sizeof(*new), GFP_NOFS);
> -       if (!new)
> -               return -ENOMEM;
> -
>         mark_written_sectors(be->be_inval, offset, length);
>         /* We want to add the range to commit list, but it must be
>          * block-normalized, and verified that the normalized range has
> @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
>         new->bse_mdev = be->be_mdev;
> 
>         spin_lock(&bl->bl_ext_lock);
> -       /* new will be freed, either by add_to_commitlist if it decides not
> -        * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
> -        */
>         add_to_commitlist(bl, new);
>         spin_unlock(&bl->bl_ext_lock);
>         return 0;
> @@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>                 }
>         }
>  }
> +
> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
> +{
> +       struct pnfs_block_short_extent *new;
> +
> +       new = kmalloc(sizeof(*new), GFP_NOFS);
> +       if (unlikely(!new))
> +               return -ENOMEM;
> +
> +       spin_lock(&marks->im_lock);
> +       list_add(&new->bse_node, &marks->im_extents);
> +       spin_unlock(&marks->im_lock);
> +
> +       return 0;
> +}
> +
> +struct pnfs_block_short_extent *
> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
> +{
> +       struct pnfs_block_short_extent *rv = NULL;
> +
> +       spin_lock(&marks->im_lock);
> +       if (!list_empty(&marks->im_extents)) {
> +               rv = list_entry((&marks->im_extents)->next,
> +                               struct pnfs_block_short_extent, bse_node);
> +               list_del_init(&rv->bse_node);
> +       }
> +       spin_unlock(&marks->im_lock);
> +
> +       return rv;
> +}
> +
> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
> +{
> +       struct pnfs_block_short_extent *se = NULL, *tmp;
> +
> +       if (num_to_free <= 0)
> +               return;
> +
> +       spin_lock(&marks->im_lock);
> +       list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
> +               list_del(&se->bse_node);
> +               kfree(se);
> +               if (--num_to_free == 0)
> +                       break;
> +       }
> +       spin_unlock(&marks->im_lock);
> +
> +       BUG_ON(num_to_free > 0);
> +}
> --
> 1.7.4.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
  2011-11-14 10:26       ` Benny Halevy
@ 2011-11-14 14:18         ` Peng Tao
  0 siblings, 0 replies; 13+ messages in thread
From: Peng Tao @ 2011-11-14 14:18 UTC (permalink / raw)
  To: Benny Halevy; +Cc: tao.peng, bjschuma, linux-nfs, Trond.Myklebust

On Mon, Nov 14, 2011 at 6:26 PM, Benny Halevy <bhalevy@tonian.com> wrote:
> Tao, The patch below doesn't apply.
> Can you please re-send a clean patch, with tab indents (vs. spaces).
oops, I will resend it.

Thanks,
Tao
>
> Thanks,
>
> Benny
>
> On 2011-11-11 04:05, tao.peng@emc.com wrote:
>> Hi, Bryan,
>>
>>> -----Original Message-----
>>> From: Bryan Schumaker [mailto:bjschuma@netapp.com]
>>> Sent: Friday, November 11, 2011 12:44 AM
>>> To: Peng Tao
>>> Cc: linux-nfs@vger.kernel.org; Trond.Myklebust@netapp.com; bhalevy@tonian.com;
>>> Peng, Tao
>>> Subject: Re: [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio
>>>
>>> On 11/10/2011 10:25 AM, Peng Tao wrote:
>>>> As discussed earlier, it is better for block client to allocate memory for
>>>> tracking extents state before submitting bio. So the patch does it by allocating
>>>> a short_extent for every INVALID extent touched by write pagelist and for
>>>> every zeroing page we created, saving them in layout header. Then in end_io we
>>>> can just use them to create commit list items and avoid memory allocation there.
>>>>
>>>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>>>> ---
>>>>  fs/nfs/blocklayout/blocklayout.c |   74
>>> +++++++++++++++++++++++++++++--------
>>>>  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
>>>>  fs/nfs/blocklayout/extents.c     |   62 +++++++++++++++++++++++++++-----
>>>>  3 files changed, 119 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>>>> index 3eaea2b..3fdfb29 100644
>>>> --- a/fs/nfs/blocklayout/blocklayout.c
>>>> +++ b/fs/nfs/blocklayout/blocklayout.c
>>>> @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t
>>> isect)
>>>>   */
>>>>  struct parallel_io {
>>>>     struct kref refcnt;
>>>> -   void (*pnfs_callback) (void *data);
>>>> +   void (*pnfs_callback) (void *data, int num_se);
>>>>     void *data;
>>>> +   int bse_count;
>>>>  };
>>>>
>>>>  static inline struct parallel_io *alloc_parallel(void *data)
>>>> @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
>>>>     if (rv) {
>>>>             rv->data = data;
>>>>             kref_init(&rv->refcnt);
>>>> +           rv->bse_count = 0;
>>>>     }
>>>>     return rv;
>>>>  }
>>>> @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
>>>>     struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
>>>>
>>>>     dprintk("%s enter\n", __func__);
>>>> -   p->pnfs_callback(p->data);
>>>> +   p->pnfs_callback(p->data, p->bse_count);
>>>>     kfree(p);
>>>>  }
>>>>
>>>> @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
>>>>  }
>>>>
>>>>  static void
>>>> -bl_end_par_io_read(void *data)
>>>> +bl_end_par_io_read(void *data, int unused)
>>>>  {
>>>>     struct nfs_read_data *rdata = data;
>>>>
>>>> @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout
>>> *bl,
>>>>  {
>>>>     sector_t isect, end;
>>>>     struct pnfs_block_extent *be;
>>>> +   struct pnfs_block_short_extent *se;
>>>>
>>>>     dprintk("%s(%llu, %u)\n", __func__, offset, count);
>>>>     if (count == 0)
>>>> @@ -324,8 +327,11 @@ static void mark_extents_written(struct
>>> pnfs_block_layout *bl,
>>>>             be = bl_find_get_extent(bl, isect, NULL);
>>>>             BUG_ON(!be); /* FIXME */
>>>>             len = min(end, be->be_f_offset + be->be_length) - isect;
>>>> -           if (be->be_state == PNFS_BLOCK_INVALID_DATA)
>>>> -                   bl_mark_for_commit(be, isect, len); /* What if fails? */
>>>> +           if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>>> +                   se = bl_pop_one_short_extent(be->be_inval);
>>>> +                   BUG_ON(!se);
>>>> +                   bl_mark_for_commit(be, isect, len, se);
>>>> +           }
>>>>             isect += len;
>>>>             bl_put_extent(be);
>>>>     }
>>>> @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>>>             end_page_writeback(page);
>>>>             page_cache_release(page);
>>>>     } while (bvec >= bio->bi_io_vec);
>>>> -   if (!uptodate) {
>>>> +
>>>> +   if (unlikely(!uptodate)) {
>>>>             if (!wdata->pnfs_error)
>>>>                     wdata->pnfs_error = -EIO;
>>>>             pnfs_set_lo_fail(wdata->lseg);
>>>> @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>>>     put_parallel(par);
>>>>  }
>>>>
>>>> -/* This is basically copied from mpage_end_io_read */
>>>>  static void bl_end_io_write(struct bio *bio, int err)
>>>>  {
>>>>     struct parallel_io *par = bio->bi_private;
>>>> @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
>>>>     dprintk("%s enter\n", __func__);
>>>>     task = container_of(work, struct rpc_task, u.tk_work);
>>>>     wdata = container_of(task, struct nfs_write_data, task);
>>>> -   if (!wdata->pnfs_error) {
>>>> +   if (likely(!wdata->pnfs_error)) {
>>>>             /* Marks for LAYOUTCOMMIT */
>>>>             mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>>>                                  wdata->args.offset, wdata->args.count);
>>>> @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
>>>>  }
>>>>
>>>>  /* Called when last of bios associated with a bl_write_pagelist call finishes */
>>>> -static void bl_end_par_io_write(void *data)
>>>> +static void bl_end_par_io_write(void *data, int num_se)
>>>>  {
>>>>     struct nfs_write_data *wdata = data;
>>>>
>>>> +   if (unlikely(wdata->pnfs_error)) {
>>>> +           bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
>>>> +                                   num_se);
>>>> +   }
>>>> +
>>>>     wdata->task.tk_status = wdata->pnfs_error;
>>>>     wdata->verf.committed = NFS_FILE_SYNC;
>>>>     INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
>>>> @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>>>>      */
>>>>     par = alloc_parallel(wdata);
>>>>     if (!par)
>>>> -           return PNFS_NOT_ATTEMPTED;
>>>> +           goto out_mds;
>>>>     par->pnfs_callback = bl_end_par_io_write;
>>>>     /* At this point, have to be more careful with error handling */
>>>>
>>>> @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int
>>> sync)
>>>>     be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
>>>>     if (!be || !is_writable(be, isect)) {
>>>>             dprintk("%s no matching extents!\n", __func__);
>>>> -           wdata->pnfs_error = -EINVAL;
>>>> -           goto out;
>>>> +           goto out_mds;
>>>>     }
>>>>
>>>>     /* First page inside INVALID extent */
>>>>     if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>>> +           if (likely(!bl_push_one_short_extent(be->be_inval)))
>>>> +                   par->bse_count++;
>>>> +           else
>>>> +                   goto out_mds;
>>>>             temp = offset >> PAGE_CACHE_SHIFT;
>>>>             npg_zero = do_div(temp, npg_per_block);
>>>>             isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
>>>> @@ -598,6 +612,19 @@ fill_invalid_ext:
>>>>                             wdata->pnfs_error = ret;
>>>>                             goto out;
>>>>                     }
>>>> +                   if (likely(!bl_push_one_short_extent(be->be_inval)))
>>>> +                           par->bse_count++;
>>>> +                   else {
>>>> +                           end_page_writeback(page);
>>>> +                           page_cache_release(page);
>>>> +                           wdata->pnfs_error = -ENOMEM;
>>>> +                           goto out;
>>>> +                   }
>>>> +                   /* FIXME: This should be done in bi_end_io */
>>>> +                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>>> +                                        page->index << PAGE_CACHE_SHIFT,
>>>> +                                        PAGE_CACHE_SIZE);
>>>> +
>>>>                     bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
>>>>                                              isect, page, be,
>>>>                                              bl_end_io_write_zero, par);
>>>> @@ -606,10 +633,6 @@ fill_invalid_ext:
>>>>                             bio = NULL;
>>>>                             goto out;
>>>>                     }
>>>> -                   /* FIXME: This should be done in bi_end_io */
>>>> -                   mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>>> -                                        page->index << PAGE_CACHE_SHIFT,
>>>> -                                        PAGE_CACHE_SIZE);
>>>>  next_page:
>>>>                     isect += PAGE_CACHE_SECTORS;
>>>>                     extent_length -= PAGE_CACHE_SECTORS;
>>>> @@ -633,6 +656,15 @@ next_page:
>>>>                             wdata->pnfs_error = -EINVAL;
>>>>                             goto out;
>>>>                     }
>>>> +                   if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>>>> +                           if (likely(!bl_push_one_short_extent(
>>>> +                                                           be->be_inval)))
>>>> +                                   par->bse_count++;
>>>> +                           else {
>>>> +                                   wdata->pnfs_error = -ENOMEM;
>>>> +                                   goto out;
>>>> +                           }
>>>> +                   }
>>>>                     extent_length = be->be_length -
>>>>                         (isect - be->be_f_offset);
>>>>             }
>>>> @@ -680,6 +712,10 @@ out:
>>>>     bl_submit_bio(WRITE, bio);
>>>>     put_parallel(par);
>>>>     return PNFS_ATTEMPTED;
>>>> +out_mds:
>>>> +   bl_put_extent(be);
>>>> +   kfree(par);
>>>> +   return PNFS_NOT_ATTEMPTED;
>>>>  }
>>>>
>>>>  /* FIXME - range ignored */
>>>> @@ -706,11 +742,17 @@ static void
>>>>  release_inval_marks(struct pnfs_inval_markings *marks)
>>>>  {
>>>>     struct pnfs_inval_tracking *pos, *temp;
>>>> +   struct pnfs_block_short_extent *se, *stemp;
>>>>
>>>>     list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
>>>>             list_del(&pos->it_link);
>>>>             kfree(pos);
>>>>     }
>>>> +
>>>> +   list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
>>>> +           list_del(&se->bse_node);
>>>> +           kfree(se);
>>>> +   }
>>>>     return;
>>>>  }
>>>>
>>>> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
>>>> index 60728ac..e31a2df 100644
>>>> --- a/fs/nfs/blocklayout/blocklayout.h
>>>> +++ b/fs/nfs/blocklayout/blocklayout.h
>>>> @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
>>>>     spinlock_t      im_lock;
>>>>     struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
>>>>     sector_t        im_block_size;  /* Server blocksize in sectors */
>>>> +   struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
>>>>  };
>>>>
>>>>  struct pnfs_inval_tracking {
>>>> @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings
>>> *marks, sector_t blocksize)
>>>>  {
>>>>     spin_lock_init(&marks->im_lock);
>>>>     INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
>>>> +   INIT_LIST_HEAD(&marks->im_extents);
>>>>     marks->im_block_size = blocksize;
>>>>     marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
>>>>                                        blocksize);
>>>> @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct
>>> pnfs_block_layout *bl,
>>>>  int bl_add_merge_extent(struct pnfs_block_layout *bl,
>>>>                      struct pnfs_block_extent *new);
>>>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>>>> -                   sector_t offset, sector_t length);
>>>> +                   sector_t offset, sector_t length,
>>>> +                   struct pnfs_block_short_extent *new);
>>>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
>>>> +struct pnfs_block_short_extent *
>>>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
>>>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
>>>>
>>>>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
>>>> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
>>>> index d0f52ed..db66e68 100644
>>>> --- a/fs/nfs/blocklayout/extents.c
>>>> +++ b/fs/nfs/blocklayout/extents.c
>>>> @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout
>>> *bl,
>>>>
>>>>  /* Note the range described by offset, length is guaranteed to be contained
>>>>   * within be.
>>>> + * new will be freed, either by this function or add_to_commitlist if they
>>>> + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>>>>   */
>>>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>>>> -               sector_t offset, sector_t length)
>>>> +               sector_t offset, sector_t length,
>>>> +               struct pnfs_block_short_extent *new)
>>>>  {
>>>>     sector_t new_end, end = offset + length;
>>>> -   struct pnfs_block_short_extent *new;
>>>>     struct pnfs_block_layout *bl = container_of(be->be_inval,
>>>>                                                 struct pnfs_block_layout,
>>>>                                                 bl_inval);
>>>>
>>>> -   new = kmalloc(sizeof(*new), GFP_NOFS);
>>>> -   if (!new)
>>>> -           return -ENOMEM;
>>>> -
>>>>     mark_written_sectors(be->be_inval, offset, length);
>>>>     /* We want to add the range to commit list, but it must be
>>>>      * block-normalized, and verified that the normalized range has
>>>> @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
>>>>     new->bse_mdev = be->be_mdev;
>>>>
>>>>     spin_lock(&bl->bl_ext_lock);
>>>> -   /* new will be freed, either by add_to_commitlist if it decides not
>>>> -    * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>>>> -    */
>>>>     add_to_commitlist(bl, new);
>>>>     spin_unlock(&bl->bl_ext_lock);
>>>>     return 0;
>>>> @@ -862,3 +857,52 @@ clean_pnfs_block_layoutupdate(struct
>>> pnfs_block_layout *bl,
>>>>             }
>>>>     }
>>>>  }
>>>> +
>>>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
>>>> +{
>>>> +   struct pnfs_block_short_extent *new;
>>>> +
>>>> +   new = kmalloc(sizeof(*new), GFP_NOFS);
>>>> +   if (unlikely(!new))
>>>> +           return -ENOMEM;
>>>> +
>>>> +   spin_lock(&marks->im_lock);
>>>> +   list_add(&new->bse_node, &marks->im_extents);
>>>> +   spin_unlock(&marks->im_lock);
>>>> +
>>>> +   return 0;
>>>> +}
>>>> +
>>>> +struct pnfs_block_short_extent *
>>>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
>>>> +{
>>>> +   struct pnfs_block_short_extent *rv = NULL;
>>>> +
>>>> +   spin_lock(&marks->im_lock);
>>>> +   if (!list_empty(&marks->im_extents)) {
>>>> +           rv = list_entry((&marks->im_extents)->next,
>>>> +                           struct pnfs_block_short_extent, bse_node);
>>>> +           list_del_init(&rv->bse_node);
>>>> +   }
>>>> +   spin_unlock(&marks->im_lock);
>>>> +
>>>> +   return rv;
>>>> +}
>>>> +
>>>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
>>>> +{
>>>> +   struct pnfs_block_short_extent *se = NULL, *tmp;
>>>> +
>>>> +   BUG_ON(num_to_free <= 0);
>>>
>>> Why is it a bug if num_to_free is <= 0?  Couldn't you do:
>>>
>>> if (num_to_free <= 0)
>>>       return;
>>>
>>> instead?
>> It used to be a sanity check but was changed to BUG_ON in the second version.
>> Yeah, after giving it a second thought, I think it necessary to keep the sanity check. There are cases that num_to_free can be zero. E.g., a write_pagelist touches no INVALID extents, and one bio fails with disk failure. Then we will come to calling bl_free_short_extents() with num_to_free == 0. So the BUG_ON is really a bug itself... I will change it back to a normal sanity check.
>>
>> Thanks,
>> Tao
>>
>> From d4f49b042411da140c95559895c57a00a1920fdd Mon Sep 17 00:00:00 2001
>> From: Peng Tao <bergwolf@gmail.com>
>> Date: Thu, 10 Nov 2011 03:57:00 -0500
>> Subject: [PATCH 7/7] pnfsblock: alloc short extent before submit bio
>>
>> As discussed earlier, it is better for block client to allocate memory for
>> tracking extents state before submitting bio. So the patch does it by allocating
>> a short_extent for every INVALID extent touched by write pagelist and for
>> every zeroing page we created, saving them in layout header. Then in end_io we
>> can just use them to create commit list items and avoid memory allocation there.
>>
>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>> ---
>>  fs/nfs/blocklayout/blocklayout.c |   74 +++++++++++++++++++++++++++++--------
>>  fs/nfs/blocklayout/blocklayout.h |    9 ++++-
>>  fs/nfs/blocklayout/extents.c     |   63 +++++++++++++++++++++++++++-----
>>  3 files changed, 120 insertions(+), 26 deletions(-)
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>> index 7fc69c9..1dd2983 100644
>> --- a/fs/nfs/blocklayout/blocklayout.c
>> +++ b/fs/nfs/blocklayout/blocklayout.c
>> @@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
>>   */
>>  struct parallel_io {
>>         struct kref refcnt;
>> -       void (*pnfs_callback) (void *data);
>> +       void (*pnfs_callback) (void *data, int num_se);
>>         void *data;
>> +       int bse_count;
>>  };
>>
>>  static inline struct parallel_io *alloc_parallel(void *data)
>> @@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
>>         if (rv) {
>>                 rv->data = data;
>>                 kref_init(&rv->refcnt);
>> +               rv->bse_count = 0;
>>         }
>>         return rv;
>>  }
>> @@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
>>         struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
>>
>>         dprintk("%s enter\n", __func__);
>> -       p->pnfs_callback(p->data);
>> +       p->pnfs_callback(p->data, p->bse_count);
>>         kfree(p);
>>  }
>>
>> @@ -211,7 +213,7 @@ static void bl_read_cleanup(struct work_struct *work)
>>  }
>>
>>  static void
>> -bl_end_par_io_read(void *data)
>> +bl_end_par_io_read(void *data, int unused)
>>  {
>>         struct nfs_read_data *rdata = data;
>>
>> @@ -312,6 +314,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>>  {
>>         sector_t isect, end;
>>         struct pnfs_block_extent *be;
>> +       struct pnfs_block_short_extent *se;
>>
>>         dprintk("%s(%llu, %u)\n", __func__, offset, count);
>>         if (count == 0)
>> @@ -324,8 +327,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
>>                 be = bl_find_get_extent(bl, isect, NULL);
>>                 BUG_ON(!be); /* FIXME */
>>                 len = min(end, be->be_f_offset + be->be_length) - isect;
>> -               if (be->be_state == PNFS_BLOCK_INVALID_DATA)
>> -                       bl_mark_for_commit(be, isect, len); /* What if fails? */
>> +               if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>> +                       se = bl_pop_one_short_extent(be->be_inval);
>> +                       BUG_ON(!se);
>> +                       bl_mark_for_commit(be, isect, len, se);
>> +               }
>>                 isect += len;
>>                 bl_put_extent(be);
>>         }
>> @@ -347,7 +353,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>                 end_page_writeback(page);
>>                 page_cache_release(page);
>>         } while (bvec >= bio->bi_io_vec);
>> -       if (!uptodate) {
>> +
>> +       if (unlikely(!uptodate)) {
>>                 if (!wdata->pnfs_error)
>>                         wdata->pnfs_error = -EIO;
>>                 pnfs_set_lo_fail(wdata->lseg);
>> @@ -356,7 +363,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
>>         put_parallel(par);
>>  }
>>
>> -/* This is basically copied from mpage_end_io_read */
>>  static void bl_end_io_write(struct bio *bio, int err)
>>  {
>>         struct parallel_io *par = bio->bi_private;
>> @@ -382,7 +388,7 @@ static void bl_write_cleanup(struct work_struct *work)
>>         dprintk("%s enter\n", __func__);
>>         task = container_of(work, struct rpc_task, u.tk_work);
>>         wdata = container_of(task, struct nfs_write_data, task);
>> -       if (!wdata->pnfs_error) {
>> +       if (likely(!wdata->pnfs_error)) {
>>                 /* Marks for LAYOUTCOMMIT */
>>                 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>>                                      wdata->args.offset, wdata->args.count);
>> @@ -391,10 +397,15 @@ static void bl_write_cleanup(struct work_struct *work)
>>  }
>>
>>  /* Called when last of bios associated with a bl_write_pagelist call finishes */
>> -static void bl_end_par_io_write(void *data)
>> +static void bl_end_par_io_write(void *data, int num_se)
>>  {
>>         struct nfs_write_data *wdata = data;
>>
>> +       if (unlikely(wdata->pnfs_error)) {
>> +               bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
>> +                                       num_se);
>> +       }
>> +
>>         wdata->task.tk_status = wdata->pnfs_error;
>>         wdata->verf.committed = NFS_FILE_SYNC;
>>         INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
>> @@ -547,7 +558,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>>          */
>>         par = alloc_parallel(wdata);
>>         if (!par)
>> -               return PNFS_NOT_ATTEMPTED;
>> +               goto out_mds;
>>         par->pnfs_callback = bl_end_par_io_write;
>>         /* At this point, have to be more careful with error handling */
>>
>> @@ -555,12 +566,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
>>         be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
>>         if (!be || !is_writable(be, isect)) {
>>                 dprintk("%s no matching extents!\n", __func__);
>> -               wdata->pnfs_error = -EINVAL;
>> -               goto out;
>> +               goto out_mds;
>>         }
>>
>>         /* First page inside INVALID extent */
>>         if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>> +               if (likely(!bl_push_one_short_extent(be->be_inval)))
>> +                       par->bse_count++;
>> +               else
>> +                       goto out_mds;
>>                 temp = offset >> PAGE_CACHE_SHIFT;
>>                 npg_zero = do_div(temp, npg_per_block);
>>                 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
>> @@ -598,6 +612,19 @@ fill_invalid_ext:
>>                                 wdata->pnfs_error = ret;
>>                                 goto out;
>>                         }
>> +                       if (likely(!bl_push_one_short_extent(be->be_inval)))
>> +                               par->bse_count++;
>> +                       else {
>> +                               end_page_writeback(page);
>> +                               page_cache_release(page);
>> +                               wdata->pnfs_error = -ENOMEM;
>> +                               goto out;
>> +                       }
>> +                       /* FIXME: This should be done in bi_end_io */
>> +                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>> +                                            page->index << PAGE_CACHE_SHIFT,
>> +                                            PAGE_CACHE_SIZE);
>> +
>>                         bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
>>                                                  isect, page, be,
>>                                                  bl_end_io_write_zero, par);
>> @@ -606,10 +633,6 @@ fill_invalid_ext:
>>                                 bio = NULL;
>>                                 goto out;
>>                         }
>> -                       /* FIXME: This should be done in bi_end_io */
>> -                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
>> -                                            page->index << PAGE_CACHE_SHIFT,
>> -                                            PAGE_CACHE_SIZE);
>>  next_page:
>>                         isect += PAGE_CACHE_SECTORS;
>>                         extent_length -= PAGE_CACHE_SECTORS;
>> @@ -633,6 +656,15 @@ next_page:
>>                                 wdata->pnfs_error = -EINVAL;
>>                                 goto out;
>>                         }
>> +                       if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
>> +                               if (likely(!bl_push_one_short_extent(
>> +                                                               be->be_inval)))
>> +                                       par->bse_count++;
>> +                               else {
>> +                                       wdata->pnfs_error = -ENOMEM;
>> +                                       goto out;
>> +                               }
>> +                       }
>>                         extent_length = be->be_length -
>>                             (isect - be->be_f_offset);
>>                 }
>> @@ -680,6 +712,10 @@ out:
>>         bl_submit_bio(WRITE, bio);
>>         put_parallel(par);
>>         return PNFS_ATTEMPTED;
>> +out_mds:
>> +       bl_put_extent(be);
>> +       kfree(par);
>> +       return PNFS_NOT_ATTEMPTED;
>>  }
>>
>>  /* FIXME - range ignored */
>> @@ -706,11 +742,17 @@ static void
>>  release_inval_marks(struct pnfs_inval_markings *marks)
>>  {
>>         struct pnfs_inval_tracking *pos, *temp;
>> +       struct pnfs_block_short_extent *se, *stemp;
>>
>>         list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
>>                 list_del(&pos->it_link);
>>                 kfree(pos);
>>         }
>> +
>> +       list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
>> +               list_del(&se->bse_node);
>> +               kfree(se);
>> +       }
>>         return;
>>  }
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
>> index 60728ac..e31a2df 100644
>> --- a/fs/nfs/blocklayout/blocklayout.h
>> +++ b/fs/nfs/blocklayout/blocklayout.h
>> @@ -70,6 +70,7 @@ struct pnfs_inval_markings {
>>         spinlock_t      im_lock;
>>         struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
>>         sector_t        im_block_size;  /* Server blocksize in sectors */
>> +       struct list_head im_extents;    /* Short extents for INVAL->RW conversion */
>>  };
>>
>>  struct pnfs_inval_tracking {
>> @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
>>  {
>>         spin_lock_init(&marks->im_lock);
>>         INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
>> +       INIT_LIST_HEAD(&marks->im_extents);
>>         marks->im_block_size = blocksize;
>>         marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
>>                                            blocksize);
>> @@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>>  int bl_add_merge_extent(struct pnfs_block_layout *bl,
>>                          struct pnfs_block_extent *new);
>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>> -                       sector_t offset, sector_t length);
>> +                       sector_t offset, sector_t length,
>> +                       struct pnfs_block_short_extent *new);
>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
>> +struct pnfs_block_short_extent *
>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
>>
>>  #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
>> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
>> index d0f52ed..6ce5a8d 100644
>> --- a/fs/nfs/blocklayout/extents.c
>> +++ b/fs/nfs/blocklayout/extents.c
>> @@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
>>
>>  /* Note the range described by offset, length is guaranteed to be contained
>>   * within be.
>> + * new will be freed, either by this function or add_to_commitlist if they
>> + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>>   */
>>  int bl_mark_for_commit(struct pnfs_block_extent *be,
>> -                   sector_t offset, sector_t length)
>> +                   sector_t offset, sector_t length,
>> +                   struct pnfs_block_short_extent *new)
>>  {
>>         sector_t new_end, end = offset + length;
>> -       struct pnfs_block_short_extent *new;
>>         struct pnfs_block_layout *bl = container_of(be->be_inval,
>>                                                     struct pnfs_block_layout,
>>                                                     bl_inval);
>>
>> -       new = kmalloc(sizeof(*new), GFP_NOFS);
>> -       if (!new)
>> -               return -ENOMEM;
>> -
>>         mark_written_sectors(be->be_inval, offset, length);
>>         /* We want to add the range to commit list, but it must be
>>          * block-normalized, and verified that the normalized range has
>> @@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
>>         new->bse_mdev = be->be_mdev;
>>
>>         spin_lock(&bl->bl_ext_lock);
>> -       /* new will be freed, either by add_to_commitlist if it decides not
>> -        * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
>> -        */
>>         add_to_commitlist(bl, new);
>>         spin_unlock(&bl->bl_ext_lock);
>>         return 0;
>> @@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
>>                 }
>>         }
>>  }
>> +
>> +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
>> +{
>> +       struct pnfs_block_short_extent *new;
>> +
>> +       new = kmalloc(sizeof(*new), GFP_NOFS);
>> +       if (unlikely(!new))
>> +               return -ENOMEM;
>> +
>> +       spin_lock(&marks->im_lock);
>> +       list_add(&new->bse_node, &marks->im_extents);
>> +       spin_unlock(&marks->im_lock);
>> +
>> +       return 0;
>> +}
>> +
>> +struct pnfs_block_short_extent *
>> +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
>> +{
>> +       struct pnfs_block_short_extent *rv = NULL;
>> +
>> +       spin_lock(&marks->im_lock);
>> +       if (!list_empty(&marks->im_extents)) {
>> +               rv = list_entry((&marks->im_extents)->next,
>> +                               struct pnfs_block_short_extent, bse_node);
>> +               list_del_init(&rv->bse_node);
>> +       }
>> +       spin_unlock(&marks->im_lock);
>> +
>> +       return rv;
>> +}
>> +
>> +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
>> +{
>> +       struct pnfs_block_short_extent *se = NULL, *tmp;
>> +
>> +       if (num_to_free <= 0)
>> +               return;
>> +
>> +       spin_lock(&marks->im_lock);
>> +       list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
>> +               list_del(&se->bse_node);
>> +               kfree(se);
>> +               if (--num_to_free == 0)
>> +                       break;
>> +       }
>> +       spin_unlock(&marks->im_lock);
>> +
>> +       BUG_ON(num_to_free > 0);
>> +}
>> --
>> 1.7.4.2
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
Thanks,
-Bergwolf

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2011-11-14 14:18 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-11-10 15:25 [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
2011-11-10 15:25 ` [PATCH-v2 1/7] pnfsblock: cleanup bl_mark_sectors_init Peng Tao
2011-11-10 15:25 ` [PATCH-v2 3/7] pnfsblock: move find lock page logic out of bl_write_pagelist Peng Tao
2011-11-10 15:25 ` [PATCH-v2 4/7] pnfsblock: set read/write tk_status to pnfs_error Peng Tao
2011-11-10 15:25 ` [PATCH-v2 5/7] pnfsblock: remove rpc_call_ops from struct parallel_io Peng Tao
2011-11-10 15:25 ` [PATCH-v2 6/7] pnfsblock: clean up _add_entry Peng Tao
2011-11-10 15:25 ` [PATCH-v2 7/7] pnfsblock: alloc short extent before submit bio Peng Tao
2011-11-10 16:43   ` Bryan Schumaker
2011-11-11  2:05     ` tao.peng
2011-11-14 10:26       ` Benny Halevy
2011-11-14 14:18         ` Peng Tao
2011-11-10 15:37 ` [PATCH-v2 0/7] pnfsblock cleanup and fixes Peng Tao
2011-11-10 15:52 ` Peng Tao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox