[PATCH] ext4: Rework the ext4_da_writepages

linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] ext4: Rework the ext4_da_writepages
@ 2008-07-31 17:33 Aneesh Kumar K.V
  2008-07-31 17:47 ` Aneesh Kumar K.V
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-07-31 17:33 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

With the below changes we reserve the credits needed to insert only one extent
resulting from a single get_block call. That makes sure we don't take
too many journal credits during writeout. We also don't limit the pages
to write. That means we loop through the dirty pages building the largest
possible contiguous block request. Then we issue a single get_block request.
We may get fewer blocks than we requested. If so we would end up not mapping
some of the buffer_heads. That means those buffer_heads are still marked delay.
Later in the writepage callback via __mpage_writepage we redirty those pages.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |  128 +++++++++++++++++++++++++++++-------------------------
 1 files changed, 69 insertions(+), 59 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5665bec..465108b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -1580,6 +1582,8 @@ static void ext4_da_page_release_reservation(struct page *page,
 	unsigned long first_page, next_page;	/* extent of pages */
 	get_block_t *get_block;
 	struct writeback_control *wbc;
+	int io_done;
+	long pages_written;
 };
 
 /*
@@ -1629,6 +1633,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 			index++;
 
 			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+			if (!err)
+				mpd->pages_written++;
 
 			/*
 			 * In error case, we have to continue because
@@ -1748,8 +1754,8 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
 	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
 
@@ -1759,35 +1765,25 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
 		return;
 
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err)
+		return;
+	BUG_ON(new.b_size == 0);
 
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
 
-		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
-		 */
-		if (buffer_delay(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
+	/*
+	 * If blocks are delayed marked, we need to
+	 * put actual blocknr and drop delayed bit
+	 */
+	if (buffer_delay(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
 
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
-	}
+	return;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1832,13 +1828,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * need to flush current  extent and start new one
 	 */
 	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
+	return;
 }
 
 /*
@@ -1858,6 +1850,17 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head, fake;
 	sector_t logical;
 
+	if (mpd->io_done) {
+		/*
+		 * Rest of the page in the page_vec
+		 * redirty then and skip then. We will
+		 * try to to write them again after
+		 * starting a new transaction
+		 */
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return MPAGE_DA_EXTENT_TAIL;
+	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
@@ -1869,6 +1872,13 @@ static int __mpage_da_writepage(struct page *page,
 		if (mpd->next_page != mpd->first_page) {
 			mpage_da_map_blocks(mpd);
 			mpage_da_submit_io(mpd);
+			/*
+			 * skip rest of the page in the page_vec
+			 */
+			mpd->io_done = 1;
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return MPAGE_DA_EXTENT_TAIL;
 		}
 
 		/*
@@ -1899,6 +1909,8 @@ static int __mpage_da_writepage(struct page *page,
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
 		mpage_add_bh_to_extent(mpd, logical, bh);
+		if (mpd->io_done)
+			return MPAGE_DA_EXTENT_TAIL;
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
@@ -1907,8 +1919,11 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
+			if (buffer_dirty(bh)) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
+				if (mpd->io_done)
+					return MPAGE_DA_EXTENT_TAIL;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
 	}
@@ -1943,6 +1958,7 @@ static int mpage_da_writepages(struct address_space *mapping,
 			       get_block_t get_block)
 {
 	struct mpage_da_data mpd;
+	long to_write;
 	int ret;
 
 	if (!get_block)
@@ -1956,17 +1972,22 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd.first_page = 0;
 	mpd.next_page = 0;
 	mpd.get_block = get_block;
+	mpd.io_done = 0;
+	mpd.pages_written = 0;
+
+	to_write = wbc->nr_to_write;
 
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
 	/*
 	 * Handle last extent of pages
 	 */
-	if (mpd.next_page != mpd.first_page) {
+	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
 		mpage_da_map_blocks(&mpd);
 		mpage_da_submit_io(&mpd);
 	}
 
+	wbc->nr_to_write = to_write - mpd.pages_written;
 	return ret;
 }
 
@@ -2178,10 +2199,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int ret = 0;
 	long to_write;
 	loff_t range_start = 0;
-	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
-	int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
-	int need_credits_per_page =  ext4_writepages_trans_blocks(inode, 1);
-	int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2205,25 +2222,11 @@ static int ext4_da_writepages(struct address_space *mapping,
 		range_start =  wbc->range_start;
 	}
 
-	while (!ret && to_write) {
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > max_writeback_pages)
-			wbc->nr_to_write = max_writeback_pages;
+	while (!ret && to_write > 0) {
+
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
 
-		/*
-		 * Estimate the worse case needed credits to write out
-		 * to_write pages
-		 */
-		needed_blocks = ext4_writepages_trans_blocks(inode,
-							     wbc->nr_to_write);
-		while (needed_blocks > max_credit_blocks) {
-			wbc->nr_to_write --;
-			needed_blocks = ext4_writepages_trans_blocks(inode,
-							     wbc->nr_to_write);
-		}
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
@@ -2251,7 +2254,14 @@ static int ext4_da_writepages(struct address_space *mapping,
 		ret = mpage_da_writepages(mapping, wbc,
 						ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
+			 * got one extent now try with
+			 * rest of the pages
+			 */
+			to_write += wbc->nr_to_write;
+			ret = 0;
+		} else if (wbc->nr_to_write) {
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
-- 
1.6.0.rc0.42.g186458.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-07-31 17:33 [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V
@ 2008-07-31 17:47 ` Aneesh Kumar K.V
  2008-07-31 20:10 ` Andreas Dilger
  2008-08-01  3:08 ` Theodore Tso
  2 siblings, 0 replies; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-07-31 17:47 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4

On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Tested with 
a) fsstress with falloc
b) fsxlinux with falloc
c) fs_inode
d) ffsb
e) cp -ax / .

with fsck after each test

-aneesh

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-07-31 17:33 [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V
  2008-07-31 17:47 ` Aneesh Kumar K.V
@ 2008-07-31 20:10 ` Andreas Dilger
  2008-08-01  4:54   ` Aneesh Kumar K.V
  2008-08-01  3:08 ` Theodore Tso
  2 siblings, 1 reply; 8+ messages in thread
From: Andreas Dilger @ 2008-07-31 20:10 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: cmm, tytso, sandeen, linux-ext4

On Jul 31, 2008  23:03 +0530, Aneesh Kumar wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.

Can you please clarify this?  Does this mean we take one pass through the
dirty pages, but possibly do not allocate some subset of the pages?  Then,
at some later time these holes are written out separately?  This seems
like it would produce fragmentation if we do not work to ensure the pages
are allocated in sequence.  Maybe I'm misunderstanding your comment and
the unmapped pages are immediately mapped on the next loop?

It is great that this will potentially allocate huge amounts of space
(up to 128MB ideally) in a single call if the pages are contiguous.

The only danger I can see of having many smaller transactions instead
of a single larger one is if this is causing many more transactions
in the case of e.g. O_SYNC or similar, but AFAIK that is handled at
a higher level and we should be OK.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-07-31 20:10 ` Andreas Dilger
@ 2008-08-01  4:54   ` Aneesh Kumar K.V
  2008-08-01  5:07     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-08-01  4:54 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: cmm, tytso, sandeen, linux-ext4

On Thu, Jul 31, 2008 at 02:10:55PM -0600, Andreas Dilger wrote:
> On Jul 31, 2008  23:03 +0530, Aneesh Kumar wrote:
> > With the below changes we reserve credit needed to insert only one extent
> > resulting from a call to single get_block. That make sure we don't take
> > too much journal credits during writeout. We also don't limit the pages
> > to write. That means we loop through the dirty pages building largest
> > possible contiguous block request. Then we issue a single get_block request.
> > We may get less block that we requested. If so we would end up not mapping
> > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > Later in the writepage callback via __mpage_writepage we redirty those pages.
> 
> Can you please clarify this?  Does this mean we take one pass through the
> dirty pages, but possibly do not allocate some subset of the pages.  Then,
> at some later time these holes are written out separately?  This seems
> like it would produce fragmentation if we do not work to ensure the pages
> are allocated in sequence.  Maybe I'm misunderstanding your comment and
> the unmapped pages are immediately mapped on the next loop?

We take multiple passes through the dirty pages until wbc->nr_to_write is
<= 0 or we don't have anything more to write. But if get_block doesn't
return the requested number of blocks we may possibly not write out
some of the pages. Whether this can result in a disk layout worse than
the current one, I am not sure. I haven't looked at the layout yet.
But the pages which are skipped are redirtied via
redirty_page_for_writepage and will be forced for writeout. Well,
we can do better by setting wbc->encountered_congestion = 1, even
though we are not really congested. That would cause most of the pdflush
work functions to retry writeback_inodes.

for(;;) {
...
wbc.pages_skipped = 0;
writeback_inodes(&wbc);
...

if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
	/* Wrote less than expected */
	if (wbc.encountered_congestion || wbc.more_io)
		congestion_wait(WRITE, HZ/10);
	else
		break;
}

}

> 
> It is great that this will potentially allocate huge amounts of space
> (up to 128MB ideally) in a single call if the pages are contiguous.
> 
> The only danger I can see of having many smaller transactions instead
> of a single larger one is if this is causing many more transactions
> in the case of e.g. O_SYNC or similar, but AFAIK that is handled at
> a higher level and we should be OK.
> 
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
> 

-aneesh

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-08-01  4:54   ` Aneesh Kumar K.V
@ 2008-08-01  5:07     ` Aneesh Kumar K.V
  0 siblings, 0 replies; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-08-01  5:07 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: cmm, tytso, sandeen, linux-ext4

On Fri, Aug 01, 2008 at 10:24:12AM +0530, Aneesh Kumar K.V wrote:
> On Thu, Jul 31, 2008 at 02:10:55PM -0600, Andreas Dilger wrote:
> > On Jul 31, 2008  23:03 +0530, Aneesh Kumar wrote:
> > > With the below changes we reserve credit needed to insert only one extent
> > > resulting from a call to single get_block. That make sure we don't take
> > > too much journal credits during writeout. We also don't limit the pages
> > > to write. That means we loop through the dirty pages building largest
> > > possible contiguous block request. Then we issue a single get_block request.
> > > We may get less block that we requested. If so we would end up not mapping
> > > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > > Later in the writepage callback via __mpage_writepage we redirty those pages.
> > 
> > Can you please clarify this?  Does this mean we take one pass through the
> > dirty pages, but possibly do not allocate some subset of the pages.  Then,
> > at some later time these holes are written out separately?  This seems
> > like it would produce fragmentation if we do not work to ensure the pages
> > are allocated in sequence.  Maybe I'm misunderstanding your comment and
> > the unmapped pages are immediately mapped on the next loop?
> 
> We take multiple pass through the dirty pages until wbc->nr_to_write is
> <= 0 or we don't have anything more to write. But if get_block doesn't
> return the requested number of blocks we may possibly not writeout
> some of the pages. Whether this can result in a disk layout worse than
> the current, I am not sure. I haven't looked at the layout yet.
> But these pages which are skipped are redirtied again via
> reditry_pages_for_writepage and will be forced for writeout. Well
> we can do better by setting  wbc->encountered_congestion = 1; even
> though we are not really congested. That would cause most of the pdflush
> work func to retry writeback_indoes.
> 
> for(;;) {
> ...
> wbc.pages_skipped = 0;
> writeback_inodes(&wbc);
> ...
> 
> if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> 	/* Wrote less than expected */
> 	if (wbc.encountered_congestion || wbc.more_io)
> 		congestion_wait(WRITE, HZ/10);
> 	else
> 		break;
> }
> 
> }
> 

like below ?

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 53a8fc7..6fd527c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1773,6 +1773,14 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 		return;
 	BUG_ON(new.b_size == 0);
 
+	if (new.b_size < lbh->b_size) {
+		/*
+		 * allocated less blocks. force writepages
+		 * to be called again
+		 */
+		mpd->wbc->more_io = 1;
+	}
+
 	if (buffer_new(&new))
 		__unmap_underlying_blocks(mpd->inode, &new);
 
@@ -1876,6 +1884,8 @@ static int __mpage_da_writepage(struct page *page,
 			 * skip rest of the page in the page_vec
 			 */
 			mpd->io_done = 1;
+			/* We want writepages to be called again */
+			wbc->more_io = 1;
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
 			return MPAGE_DA_EXTENT_TAIL;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-07-31 17:33 [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V
  2008-07-31 17:47 ` Aneesh Kumar K.V
  2008-07-31 20:10 ` Andreas Dilger
@ 2008-08-01  3:08 ` Theodore Tso
  2008-08-01  4:06   ` Aneesh Kumar K.V
  2 siblings, 1 reply; 8+ messages in thread
From: Theodore Tso @ 2008-08-01  3:08 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: cmm, sandeen, linux-ext4

On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.

If you're only redirtying the pages in the callback, that means they
are left clean but with the delayed flag set; is that going to be
enough to keep the mm from dropping the pages because they are clean?
Or is the mechanism which prevents this that you've kept the
refcount on the pages bumped until after the callback?

	    	      	     	   	     - Ted

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ext4: Rework the ext4_da_writepages
  2008-08-01  3:08 ` Theodore Tso
@ 2008-08-01  4:06   ` Aneesh Kumar K.V
  0 siblings, 0 replies; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-08-01  4:06 UTC (permalink / raw)
  To: Theodore Tso; +Cc: cmm, sandeen, linux-ext4

On Thu, Jul 31, 2008 at 11:08:25PM -0400, Theodore Tso wrote:
> On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> > With the below changes we reserve credit needed to insert only one extent
> > resulting from a call to single get_block. That make sure we don't take
> > too much journal credits during writeout. We also don't limit the pages
> > to write. That means we loop through the dirty pages building largest
> > possible contiguous block request. Then we issue a single get_block request.
> > We may get less block that we requested. If so we would end up not mapping
> > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > Later in the writepage callback via __mpage_writepage we redirty those pages.
> 
> If you're only redirtying the pages in the callback, that means they
> are left clean but with the delayed flag set; is that going to be
> enough to keep the mm from dropping the pages because they are clean?
> Or is the mechanism which prevents this is that you've kept the
> refcount on the pages bumped until after the callback?

writepages can use redirty_page_for_writepage to skip the pages during
writeout. We do that in most of the writepage callbacks.
So I guess they would be properly marked dirty.

-aneesh

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] ext4: Handle unwritten extent properly with delayed allocation.
@ 2008-08-11 10:01 Aneesh Kumar K.V
  2008-08-11 10:01 ` [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V
  0 siblings, 1 reply; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-08-11 10:01 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

When using fallocate the buffer_heads are marked unwritten
and unmapped. We need to map them in the writepages after
a get_block. Otherwise we split the uninit extents,
but never write the content to disk.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c96cc0b..7a66bba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1732,6 +1732,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				if (buffer_delay(bh)) {
 					bh->b_blocknr = pblock;
 					clear_buffer_delay(bh);
+					bh->b_bdev = inode->i_sb->s_bdev;
+				} else if (buffer_unwritten(bh)) {
+					bh->b_blocknr = pblock;
+					clear_buffer_unwritten(bh);
+					set_buffer_mapped(bh);
+					set_buffer_new(bh);
+					bh->b_bdev = inode->i_sb->s_bdev;
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
@@ -1805,7 +1812,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 		 * If blocks are delayed marked, we need to
 		 * put actual blocknr and drop delayed bit
 		 */
-		if (buffer_delay(lbh))
+		if (buffer_delay(lbh) || buffer_unwritten(lbh))
 			mpage_put_bnr_to_bhs(mpd, next, &new);
 
 		/* go for the remaining blocks */
@@ -1814,7 +1821,8 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 }
 
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+		(1 << BH_Delay) | (1 << BH_Unwritten))
 
 /*
  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
-- 
1.6.0.rc0.42.g186458.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] ext4: Rework the ext4_da_writepages
  2008-08-11 10:01 [PATCH] ext4: Handle unwritten extent properly with delayed allocation Aneesh Kumar K.V
@ 2008-08-11 10:01 ` Aneesh Kumar K.V
  0 siblings, 0 replies; 8+ messages in thread
From: Aneesh Kumar K.V @ 2008-08-11 10:01 UTC (permalink / raw)
  To: cmm, tytso, sandeen; +Cc: linux-ext4, Aneesh Kumar K.V

With the below changes we reserve the credits needed to insert only one extent
resulting from a single get_block call. That makes sure we don't take
too many journal credits during writeout. We also don't limit the pages
to write. That means we loop through the dirty pages building the largest
possible contiguous block request. Then we issue a single get_block request.
We may get fewer blocks than we requested. If so we would end up not mapping
some of the buffer_heads. That means those buffer_heads are still marked delay.
Later in the writepage callback via __mpage_writepage we redirty those pages.

We should also not limit/throttle wbc->nr_to_write in the filesystem writepages
callback. That causes wrong behaviour in generic_sync_sb_inodes, because
wbc->nr_to_write ends up being <= 0.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c |  175 +++++++++++++++++++++++++++++++------------------------
 1 files changed, 99 insertions(+), 76 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7a66bba..ea1a8db 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -1604,6 +1606,8 @@ static void ext4_da_page_release_reservation(struct page *page,
 	unsigned long first_page, next_page;	/* extent of pages */
 	get_block_t *get_block;
 	struct writeback_control *wbc;
+	int io_done;
+	long pages_written;
 };
 
 /*
@@ -1623,12 +1627,6 @@ static void ext4_da_page_release_reservation(struct page *page,
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
 	struct address_space *mapping = mpd->inode->i_mapping;
-	struct mpage_data mpd_pp = {
-		.bio = NULL,
-		.last_block_in_bio = 0,
-		.get_block = mpd->get_block,
-		.use_writepage = 1,
-	};
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
@@ -1652,7 +1650,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 				break;
 			index++;
 
-			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+			err = mapping->a_ops->writepage(page, mpd->wbc);
+			if (!err)
+				mpd->pages_written++;
 
 			/*
 			 * In error case, we have to continue because
@@ -1664,8 +1664,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		}
 		pagevec_release(&pvec);
 	}
-	if (mpd_pp.bio)
-		mpage_bio_submit(WRITE, mpd_pp.bio);
 
 	return ret;
 }
@@ -1779,8 +1777,8 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
 	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
 
@@ -1790,35 +1788,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
 		return;
 
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
 
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+	/*
+	 * If we didn't accumulate anything
+	 * to write simply return
+	 */
+	if (!new.b_size)
+		return;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err)
+		return;
+	BUG_ON(new.b_size == 0);
 
-		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
-		 */
-		if (buffer_delay(lbh) || buffer_unwritten(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
 
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
-	}
+	/*
+	 * If blocks are delayed marked, we need to
+	 * put actual blocknr and drop delayed bit
+	 */
+	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
+
+	return;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1864,13 +1859,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * need to flush current  extent and start new one
 	 */
 	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
+	return;
 }
 
 /*
@@ -1890,6 +1881,17 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head, fake;
 	sector_t logical;
 
+	if (mpd->io_done) {
+		/*
+		 * Rest of the page in the page_vec
+		 * redirty then and skip then. We will
+		 * try to to write them again after
+		 * starting a new transaction
+		 */
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return MPAGE_DA_EXTENT_TAIL;
+	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
@@ -1901,6 +1903,13 @@ static int __mpage_da_writepage(struct page *page,
 		if (mpd->next_page != mpd->first_page) {
 			mpage_da_map_blocks(mpd);
 			mpage_da_submit_io(mpd);
+			/*
+			 * skip rest of the page in the page_vec
+			 */
+			mpd->io_done = 1;
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return MPAGE_DA_EXTENT_TAIL;
 		}
 
 		/*
@@ -1931,6 +1940,8 @@ static int __mpage_da_writepage(struct page *page,
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
 		mpage_add_bh_to_extent(mpd, logical, bh);
+		if (mpd->io_done)
+			return MPAGE_DA_EXTENT_TAIL;
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
@@ -1939,8 +1950,12 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
+			if (buffer_dirty(bh) &&
+				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
+				if (mpd->io_done)
+					return MPAGE_DA_EXTENT_TAIL;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
 	}
@@ -1975,6 +1990,7 @@ static int mpage_da_writepages(struct address_space *mapping,
 			       get_block_t get_block)
 {
 	struct mpage_da_data mpd;
+	long to_write;
 	int ret;
 
 	if (!get_block)
@@ -1988,17 +2004,22 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd.first_page = 0;
 	mpd.next_page = 0;
 	mpd.get_block = get_block;
+	mpd.io_done = 0;
+	mpd.pages_written = 0;
+
+	to_write = wbc->nr_to_write;
 
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
 	/*
 	 * Handle last extent of pages
 	 */
-	if (mpd.next_page != mpd.first_page) {
+	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
 		mpage_da_map_blocks(&mpd);
 		mpage_da_submit_io(&mpd);
 	}
 
+	wbc->nr_to_write = to_write - mpd.pages_written;
 	return ret;
 }
 
@@ -2210,10 +2231,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int ret = 0;
 	long to_write;
 	loff_t range_start = 0;
-	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
-	int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
-	int need_credits_per_page =  ext4_writepages_trans_blocks(inode, 1);
-	int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;
+	long pages_skipped = 0;
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2223,39 +2241,29 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	if (wbc->nr_to_write > mapping->nrpages)
-		wbc->nr_to_write = mapping->nrpages;
-
-	to_write = wbc->nr_to_write;
-
-	if (!wbc->range_cyclic) {
+	if (!wbc->range_cyclic)
 		/*
 		 * If range_cyclic is not set force range_cont
 		 * and save the old writeback_index
 		 */
 		wbc->range_cont = 1;
-		range_start =  wbc->range_start;
-	}
 
-	while (!ret && to_write) {
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > max_writeback_pages)
-			wbc->nr_to_write = max_writeback_pages;
+	range_start =  wbc->range_start;
+	pages_skipped = wbc->pages_skipped;
+
+restart_loop:
+	to_write = wbc->nr_to_write;
+	while (!ret && to_write > 0) {
 
 		/*
-		 * Estimate the worse case needed credits to write out
-		 * to_write pages
+		 * we  insert one extent at a time. So we need
+		 * credit needed for single extent allocation.
+		 * journalled mode is currently not supported
+		 * by delalloc
 		 */
-		needed_blocks = ext4_writepages_trans_blocks(inode,
-							     wbc->nr_to_write);
-		while (needed_blocks > max_credit_blocks) {
-			wbc->nr_to_write--;
-			needed_blocks = ext4_writepages_trans_blocks(inode,
-							     wbc->nr_to_write);
-		}
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
@@ -2284,7 +2292,14 @@ static int ext4_da_writepages(struct address_space *mapping,
 		ret = mpage_da_writepages(mapping, wbc,
 						ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
+			 * got one extent now try with
+			 * rest of the pages
+			 */
+			to_write += wbc->nr_to_write;
+			ret = 0;
+		} else if (wbc->nr_to_write) {
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
@@ -2296,10 +2311,18 @@ static int ext4_da_writepages(struct address_space *mapping,
 		wbc->nr_to_write = to_write;
 	}
 
+	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+		/* We skipped pages in this loop */
+		wbc->range_start = range_start;
+		wbc->nr_to_write = to_write +
+					wbc->pages_skipped - pages_skipped;
+		wbc->pages_skipped = pages_skipped;
+		goto restart_loop;
+	}
+
 out_writepages:
 	wbc->nr_to_write = to_write;
-	if (range_start)
-		wbc->range_start = range_start;
+	wbc->range_start = range_start;
 	return ret;
 }
 
-- 
1.6.0.rc0.42.g186458.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2008-08-11 10:02 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-31 17:33 [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V
2008-07-31 17:47 ` Aneesh Kumar K.V
2008-07-31 20:10 ` Andreas Dilger
2008-08-01  4:54   ` Aneesh Kumar K.V
2008-08-01  5:07     ` Aneesh Kumar K.V
2008-08-01  3:08 ` Theodore Tso
2008-08-01  4:06   ` Aneesh Kumar K.V
  -- strict thread matches above, loose matches on Subject: below --
2008-08-11 10:01 [PATCH] ext4: Handle unwritten extent properly with delayed allocation Aneesh Kumar K.V
2008-08-11 10:01 ` [PATCH] ext4: Rework the ext4_da_writepages Aneesh Kumar K.V

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).