public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] sluggish writeback fixes
       [not found] <20071002084143.110486039@mail.ustc.edu.cn>
@ 2007-10-02  8:41 ` Fengguang Wu
  2007-10-03 11:04   ` Martin Knoblauch
       [not found] ` <20071002090254.489150786@mail.ustc.edu.cn>
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Michael Rubin

Andrew,

The following patches fix the sluggish writeback behavior.
They are well understood and well tested - but not yet widely tested.

The first patch reverts the debugging -mm only check_dirty_inode_list.patch -
which is no longer necessary.

The following 4 patches do the real jobs:

[PATCH 2/5] writeback: fix time ordering of the per superblock inode lists 8
[PATCH 3/5] writeback: fix ntfs with sb_has_dirty_inodes()
[PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
[PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io

They share the same goal as the following patches in -mm. Therefore I'd
recommend putting the last 4 new ones after them:

writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-2.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-3.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-4.patch
writeback-fix-comment-use-helper-function.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-5.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-6.patch
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-7.patch
writeback-fix-periodic-superblock-dirty-inode-flushing.patch

Regards,
Fengguang
--

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/5] revert check_dirty_inode_list.patch
       [not found] ` <20071002090254.489150786@mail.ustc.edu.cn>
@ 2007-10-02  8:41   ` Fengguang Wu
  0 siblings, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Michael Rubin

[-- Attachment #1: revert-check_dirty_inode_list.patch --]
[-- Type: text/plain, Size: 3766 bytes --]

Revert the check_dirty_inode_list.patch.
I'm pretty sure the time ordering problem has gone.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/fs-writeback.c |   62 --------------------------------------------
 kernel/sysctl.c   |    8 -----
 2 files changed, 1 insertion(+), 69 deletions(-)

--- linux-2.6.23-rc8-mm2.orig/fs/fs-writeback.c
+++ linux-2.6.23-rc8-mm2/fs/fs-writeback.c
@@ -25,57 +25,6 @@
 #include <linux/buffer_head.h>
 #include "internal.h"
 
-int sysctl_inode_debug __read_mostly;
-
-static int __check(struct super_block *sb, int print_stuff)
-{
-	struct list_head *cursor = &sb->s_dirty;
-	unsigned long dirtied_when = 0;
-
-	while ((cursor = cursor->prev) != &sb->s_dirty) {
-		struct inode *inode = list_entry(cursor, struct inode, i_list);
-		if (print_stuff) {
-			printk("%p:%lu\n", inode, inode->dirtied_when);
-		} else {
-			if (dirtied_when &&
-			    time_before(inode->dirtied_when, dirtied_when))
-				return 1;
-			dirtied_when = inode->dirtied_when;
-		}
-	}
-	return 0;
-}
-
-static void __check_dirty_inode_list(struct super_block *sb,
-			struct inode *inode, const char *file, int line)
-{
-	if (!sysctl_inode_debug)
-		return;
-
-	if (__check(sb, 0)) {
-		sysctl_inode_debug = 0;
-		if (inode)
-			printk("%s:%d: s_dirty got screwed up.  inode=%p:%lu\n",
-					file, line, inode, inode->dirtied_when);
-		else
-			printk("%s:%d: s_dirty got screwed up\n", file, line);
-		__check(sb, 1);
-	}
-}
-
-#define check_dirty_inode_list(sb)					\
-	do {								\
-		if (unlikely(sysctl_inode_debug))			\
-		__check_dirty_inode_list(sb, NULL, __FILE__, __LINE__);	\
-	} while (0)
-
-#define check_dirty_inode(inode)					\
-	do {								\
-		if (unlikely(sysctl_inode_debug))			\
-			__check_dirty_inode_list(inode->i_sb, inode,	\
-						__FILE__, __LINE__);	\
-	} while (0)
-
 /**
  *	__mark_inode_dirty -	internal function
  *	@inode: inode to mark
@@ -174,10 +123,8 @@ void __mark_inode_dirty(struct inode *in
 		 * reposition it (that would break s_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			check_dirty_inode(inode);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_list, &sb->s_dirty);
-			check_dirty_inode(inode);
 		}
 	}
 out:
@@ -206,7 +153,6 @@ static void redirty_tail(struct inode *i
 {
 	struct super_block *sb = inode->i_sb;
 
-	check_dirty_inode(inode);
 	if (!list_empty(&sb->s_dirty)) {
 		struct inode *tail_inode;
 
@@ -216,7 +162,6 @@ static void redirty_tail(struct inode *i
 			inode->dirtied_when = jiffies;
 	}
 	list_move(&inode->i_list, &sb->s_dirty);
-	check_dirty_inode(inode);
 }
 
 /*
@@ -430,11 +375,8 @@ int generic_sync_sb_inodes(struct super_
 
 	spin_lock(&inode_lock);
 
-	if (!wbc->for_kupdate || list_empty(&sb->s_io)) {
-		check_dirty_inode_list(sb);
+	if (!wbc->for_kupdate || list_empty(&sb->s_io))
 		list_splice_init(&sb->s_dirty, &sb->s_io);
-		check_dirty_inode_list(sb);
-	}
 
 	while (!list_empty(&sb->s_io)) {
 		int err;
@@ -499,10 +441,8 @@ int generic_sync_sb_inodes(struct super_
 		if (!ret)
 			ret = err;
 		if (wbc->sync_mode == WB_SYNC_HOLD) {
-			check_dirty_inode(inode);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_list, &sb->s_dirty);
-			check_dirty_inode(inode);
 		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
--- linux-2.6.23-rc8-mm2.orig/kernel/sysctl.c
+++ linux-2.6.23-rc8-mm2/kernel/sysctl.c
@@ -1206,14 +1206,6 @@ static struct ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "inode_debug",
-		.data		= &sysctl_inode_debug,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 	{
 		.ctl_name	= CTL_UNNUMBERED,

-- 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/5] writeback: fix time ordering of the per superblock inode lists 8
       [not found] ` <20071002090254.596842343@mail.ustc.edu.cn>
@ 2007-10-02  8:41   ` Fengguang Wu
  0 siblings, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Ken Chen, Andrew Morton, Michael Rubin

[-- Attachment #1: inode-dirty-time-ordering-fix.patch --]
[-- Type: text/plain, Size: 5448 bytes --]

Streamline the management of dirty inode lists and fix time ordering bugs.

The writeback logic used to move not-yet-expired dirty inodes from s_dirty to
s_io, *only to* move them back. The move-inodes-back-and-forth thing is a mess,
which is eliminated by this patch.

The new scheme is:
- s_dirty acts as a time ordered io delaying queue;
- s_io/s_more_io together acts as an io dispatching queue.

On kupdate writeback, we pull some inodes from s_dirty to s_io at the start of
every full scan of s_io.  Otherwise (i.e. for sync/throttle/background
writeback), we always pull from s_dirty on each run (a partial scan).

Note that the line
	list_splice_init(&sb->s_more_io, &sb->s_io);
is moved to queue_io() to leave s_io empty. Otherwise a big dirtied file will
sit in s_io for a long time, preventing newly expired inodes from getting in.

Cc: Ken Chen <kenchen@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/fs-writeback.c |   59 ++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 21 deletions(-)

--- linux-2.6.23-rc8-mm2.orig/fs/fs-writeback.c
+++ linux-2.6.23-rc8-mm2/fs/fs-writeback.c
@@ -119,7 +119,7 @@ void __mark_inode_dirty(struct inode *in
 			goto out;
 
 		/*
-		 * If the inode was already on s_dirty or s_io, don't
+		 * If the inode was already on s_dirty/s_io/s_more_io, don't
 		 * reposition it (that would break s_dirty time-ordering).
 		 */
 		if (!was_dirty) {
@@ -182,6 +182,33 @@ static void inode_sync_complete(struct i
 }
 
 /*
+ * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
+ */
+static void move_expired_inodes(struct list_head *delaying_queue,
+			       struct list_head *dispatch_queue,
+				unsigned long *older_than_this)
+{
+	while (!list_empty(delaying_queue)) {
+		struct inode *inode = list_entry(delaying_queue->prev,
+						struct inode, i_list);
+		if (older_than_this &&
+			time_after(inode->dirtied_when, *older_than_this))
+			break;
+		list_move(&inode->i_list, dispatch_queue);
+	}
+}
+
+/*
+ * Queue all expired dirty inodes for io, eldest first.
+ */
+static void queue_io(struct super_block *sb,
+				unsigned long *older_than_this)
+{
+	list_splice_init(&sb->s_more_io, sb->s_io.prev);
+	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+}
+
+/*
  * Write a single inode's dirty pages and inode data out to disk.
  * If `wait' is set, wait on the writeout.
  *
@@ -231,7 +258,7 @@ __sync_single_inode(struct inode *inode,
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
-			 * the inode.  It is moved from s_io onto s_dirty.
+			 * the inode; Move it from s_io onto s_more_io/s_dirty.
 			 */
 			/*
 			 * akpm: if the caller was the kupdate function we put
@@ -244,10 +271,9 @@ __sync_single_inode(struct inode *inode,
 			 */
 			if (wbc->for_kupdate) {
 				/*
-				 * For the kupdate function we leave the inode
-				 * at the head of sb_dirty so it will get more
-				 * writeout as soon as the queue becomes
-				 * uncongested.
+				 * For the kupdate function we move the inode
+				 * to s_more_io so it will get more writeout as
+				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
 				requeue_io(inode);
@@ -305,10 +331,10 @@ __writeback_single_inode(struct inode *i
 
 		/*
 		 * We're skipping this inode because it's locked, and we're not
-		 * doing writeback-for-data-integrity.  Move it to the head of
-		 * s_dirty so that writeback can proceed with the other inodes
-		 * on s_io.  We'll have another go at writing back this inode
-		 * when the s_dirty iodes get moved back onto s_io.
+		 * doing writeback-for-data-integrity.  Move it to s_more_io so
+		 * that writeback can proceed with the other inodes on s_io.
+		 * We'll have another go at writing back this inode when we
+		 * completed a full scan of s_io.
 		 */
 		requeue_io(inode);
 
@@ -376,7 +402,7 @@ int generic_sync_sb_inodes(struct super_
 	spin_lock(&inode_lock);
 
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		list_splice_init(&sb->s_dirty, &sb->s_io);
+		queue_io(sb, wbc->older_than_this);
 
 	while (!list_empty(&sb->s_io)) {
 		int err;
@@ -423,13 +449,6 @@ int generic_sync_sb_inodes(struct super_
 		if (time_after(inode->dirtied_when, start))
 			break;
 
-		/* Was this inode dirtied too recently? */
-		if (wbc->older_than_this && time_after(inode->dirtied_when,
-						*wbc->older_than_this)) {
-			list_splice_init(&sb->s_io, sb->s_dirty.prev);
-			break;
-		}
-
 		/* Is another pdflush already flushing this queue? */
 		if (current_is_pdflush() && !writeback_acquire(bdi))
 			break;
@@ -461,8 +480,6 @@ int generic_sync_sb_inodes(struct super_
 			break;
 	}
 
-	if (list_empty(&sb->s_io))
-		list_splice_init(&sb->s_more_io, &sb->s_io);
 	spin_unlock(&inode_lock);
 	return ret;		/* Leave any unwritten inodes on s_io */
 }
@@ -482,7 +499,7 @@ static int sync_sb_inodes(struct super_b
  * Note:
  * We don't need to grab a reference to superblock here. If it has non-empty
  * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
+ * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
  * empty. Since __sync_single_inode() regains inode_lock before it finally moves
  * inode from superblock lists we are OK.
  *

-- 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 3/5] writeback: fix ntfs with sb_has_dirty_inodes()
       [not found] ` <20071002090254.728493507@mail.ustc.edu.cn>
@ 2007-10-02  8:41   ` Fengguang Wu
  0 siblings, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, Anton Altaparmakov, Ken Chen, Andrew Morton,
	Michael Rubin

[-- Attachment #1: nfs-dirty-inodes.patch --]
[-- Type: text/plain, Size: 2520 bytes --]

NTFS's if-condition on dirty inodes is not complete.
Fix it with sb_has_dirty_inodes().

Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Ken Chen <kenchen@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
---
 fs/fs-writeback.c  |   10 +++++++++-
 fs/ntfs/super.c    |    4 ++--
 include/linux/fs.h |    1 +
 3 files changed, 12 insertions(+), 3 deletions(-)

--- linux-2.6.23-rc8-mm2.orig/fs/ntfs/super.c
+++ linux-2.6.23-rc8-mm2/fs/ntfs/super.c
@@ -2381,14 +2381,14 @@ static void ntfs_put_super(struct super_
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (!list_empty(&sb->s_dirty)) {
+	if (sb_has_dirty_inodes(sb)) {
 		const char *s1, *s2;
 
 		mutex_lock(&vol->mft_ino->i_mutex);
 		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
 		mutex_unlock(&vol->mft_ino->i_mutex);
 		write_inode_now(vol->mft_ino, 1);
-		if (!list_empty(&sb->s_dirty)) {
+		if (sb_has_dirty_inodes(sb)) {
 			static const char *_s1 = "inodes";
 			static const char *_s2 = "";
 			s1 = _s1;
--- linux-2.6.23-rc8-mm2.orig/include/linux/fs.h
+++ linux-2.6.23-rc8-mm2/include/linux/fs.h
@@ -1794,6 +1794,7 @@ extern int bdev_read_only(struct block_d
 extern int set_blocksize(struct block_device *, int);
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
+extern int sb_has_dirty_inodes(struct super_block *);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
--- linux-2.6.23-rc8-mm2.orig/fs/fs-writeback.c
+++ linux-2.6.23-rc8-mm2/fs/fs-writeback.c
@@ -208,6 +208,14 @@ static void queue_io(struct super_block 
 	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
 }
 
+int sb_has_dirty_inodes(struct super_block *sb)
+{
+	return !list_empty(&sb->s_dirty) ||
+	       !list_empty(&sb->s_io) ||
+	       !list_empty(&sb->s_more_io);
+}
+EXPORT_SYMBOL(sb_has_dirty_inodes);
+
 /*
  * Write a single inode's dirty pages and inode data out to disk.
  * If `wait' is set, wait on the writeout.
@@ -522,7 +530,7 @@ int writeback_inodes(struct writeback_co
 restart:
 	sb = sb_entry(super_blocks.prev);
 	for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
-		if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
+		if (sb_has_dirty_inodes(sb)) {
 			/* we're making our own get_super here */
 			sb->s_count++;
 			spin_unlock(&sb_lock);

-- 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
       [not found] ` <20071002090254.873023041@mail.ustc.edu.cn>
@ 2007-10-02  8:41   ` Fengguang Wu
  2007-10-04 21:26     ` Andrew Morton
  2007-10-02 21:55   ` David Chinner
  1 sibling, 1 reply; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, David Chinner, Ken Chen, Andrew Morton,
	Michael Rubin

[-- Attachment #1: no-skipped.patch --]
[-- Type: text/plain, Size: 4428 bytes --]

Miklos Szeredi <miklos@szeredi.hu> and me identified a writeback bug:

> The following strange behavior can be observed:
>
> 1. large file is written
> 2. after 30 seconds, nr_dirty goes down by 1024
> 3. then for some time (< 30 sec) nothing happens (disk idle)
> 4. then nr_dirty again goes down by 1024
> 5. repeat from 3. until whole file is written
>
> So basically a 4Mbyte chunk of the file is written every 30 seconds.
> I'm quite sure this is not the intended behavior.

It can be produced by the following test scheme:

# cat bin/test-writeback.sh 
grep nr_dirty /proc/vmstat
echo 1 > /proc/sys/fs/inode_debug
dd if=/dev/zero of=/var/x bs=1K count=204800&
while true; do grep nr_dirty /proc/vmstat; sleep 1; done

# bin/test-writeback.sh
nr_dirty 19207
nr_dirty 19207
nr_dirty 30924
204800+0 records in
204800+0 records out
209715200 bytes (210 MB) copied, 1.58363 seconds, 132 MB/s
nr_dirty 47150
nr_dirty 47141
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47205
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47215
nr_dirty 47216
nr_dirty 47216
nr_dirty 47216
nr_dirty 47154
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47134
nr_dirty 47134
nr_dirty 47135
nr_dirty 47135
nr_dirty 47135
nr_dirty 46097 <== -1038
nr_dirty 46098
nr_dirty 46098
nr_dirty 46098
[...]
nr_dirty 46091
nr_dirty 46092
nr_dirty 46092
nr_dirty 45069 <== -1023
nr_dirty 45056
nr_dirty 45056
nr_dirty 45056
[...]
nr_dirty 37822
nr_dirty 36799 <== -1023
[...]
nr_dirty 36781
nr_dirty 35758 <== -1023
[...]
nr_dirty 34708
nr_dirty 33672 <== -1024
[...]
nr_dirty 33692
nr_dirty 32669 <== -1023


% ls -li /var/x
847824 -rw-r--r-- 1 root root 200M 2007-08-12 04:12 /var/x

% dmesg|grep 847824  # generated by a debug printk
[  529.263184] redirtied inode 847824 line 548
[  564.250872] redirtied inode 847824 line 548
[  594.272797] redirtied inode 847824 line 548
[  629.231330] redirtied inode 847824 line 548
[  659.224674] redirtied inode 847824 line 548
[  689.219890] redirtied inode 847824 line 548
[  724.226655] redirtied inode 847824 line 548
[  759.198568] redirtied inode 847824 line 548

# line 548 in fs/fs-writeback.c:
543                 if (wbc->pages_skipped != pages_skipped) {
544                         /*
545                          * writeback is not making progress due to locked
546                          * buffers.  Skip this inode for now.
547                          */
548                         redirty_tail(inode);
549                 }

More debug efforts show that __block_write_full_page()
never has the chance to call submit_bh() for that big dirty file:
the buffer head is *clean*. So basically no page io is issued by
__block_write_full_page(), hence pages_skipped goes up.

Also the comment in generic_sync_sb_inodes():

544                         /*
545                          * writeback is not making progress due to locked
546                          * buffers.  Skip this inode for now.
547                          */

and the comment in __block_write_full_page():

1713                 /*
1714                  * The page was marked dirty, but the buffers were
1715                  * clean.  Someone wrote them back by hand with
1716                  * ll_rw_block/submit_bh.  A rare case.
1717                  */

do not quite agree with each other. The page writeback should be skipped for
'locked buffer', but here it is 'clean buffer'!

This patch fixes this bug. Though I'm not sure why __block_write_full_page()
is called only to do nothing and who actually issued the writeback for us.

This is the two possible new behaviors after the patch:

1) pretty nice: wait 30s and write ALL:)
2) not so good:
	- during the dd: ~16M 
	- after 30s:      ~4M
	- after 5s:       ~4M
	- after 5s:     ~176M

The next patch will fix case (2).

Cc: David Chinner <dgc@sgi.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/buffer.c |    1 -
 1 file changed, 1 deletion(-)

--- linux-2.6.23-rc8-mm2.orig/fs/buffer.c
+++ linux-2.6.23-rc8-mm2/fs/buffer.c
@@ -1737,7 +1737,6 @@ done:
 		 * The page and buffer_heads can be released at any time from
 		 * here on.
 		 */
-		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 

-- 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found] ` <20071002090254.987182999@mail.ustc.edu.cn>
@ 2007-10-02  8:41   ` Fengguang Wu
  2007-10-02 21:47   ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-02  8:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, David Chinner, Ken Chen, Andrew Morton,
	Michael Rubin

[-- Attachment #1: writeback-more-data.patch --]
[-- Type: text/plain, Size: 3892 bytes --]

After making dirty a 100M file, the normal behavior is to
start the writeback for all data after 30s delays. But
sometimes the following happens instead:

	- after 30s:    ~4M
	- after 5s:     ~4M
	- after 5s:     all remaining 92M

Some analysis shows that the internal io dispatch queues go like this:

		s_io            s_more_io
		-------------------------
	1)	100M,1K         0
	2)	1K              96M
	3)	0               96M

1) initial state with a 100M file and a 1K file
2) 4M written, nr_to_write <= 0, so write more
3) 1K written, nr_to_write > 0, no more writes(BUG)

nr_to_write > 0 in (3) fools the upper layer to think that data have all been
written out. The big dirty file is actually still sitting in s_more_io. We
cannot simply splice s_more_io back to s_io as soon as s_io becomes empty, and
let the loop in generic_sync_sb_inodes() continue: this may starve newly
expired inodes in s_dirty.  It is also not an option to draw inodes from both
s_more_io and s_dirty, and let the loop go on: this might lead to livelocks,
and might also starve other superblocks in sync time(well kupdate may still
starve some superblocks, that's another bug).

We have to return when a full scan of s_io completes. So nr_to_write > 0 does
not necessarily mean that "all data are written". This patch introduces a flag
writeback_control.more_io to indicate this situation. With it the big dirty file
no longer has to wait for the next kupdate invocation 5s later.

Cc: David Chinner <dgc@sgi.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/fs-writeback.c         |    2 ++
 include/linux/writeback.h |    1 +
 mm/page-writeback.c       |    9 ++++++---
 3 files changed, 9 insertions(+), 3 deletions(-)

--- linux-2.6.23-rc8-mm2.orig/include/linux/writeback.h
+++ linux-2.6.23-rc8-mm2/include/linux/writeback.h
@@ -62,6 +62,7 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
+	unsigned more_io:1;		/* more io to be dispatched */
 	void *fs_private;               /* For use by ->writepages() */
 };
 
--- linux-2.6.23-rc8-mm2.orig/fs/fs-writeback.c
+++ linux-2.6.23-rc8-mm2/fs/fs-writeback.c
@@ -487,6 +487,8 @@ int generic_sync_sb_inodes(struct super_
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+	if (!list_empty(&sb->s_more_io))
+		wbc->more_io = 1;
 
 	spin_unlock(&inode_lock);
 	return ret;		/* Leave any unwritten inodes on s_io */
--- linux-2.6.23-rc8-mm2.orig/mm/page-writeback.c
+++ linux-2.6.23-rc8-mm2/mm/page-writeback.c
@@ -553,6 +553,7 @@ static void background_writeout(unsigned
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
 			break;
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
@@ -560,8 +561,9 @@ static void background_writeout(unsigned
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			congestion_wait(WRITE, HZ/10);
-			if (!wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
+				congestion_wait(WRITE, HZ/10);
+			else
 				break;
 		}
 	}
@@ -626,11 +628,12 @@ static void wb_kupdate(unsigned long arg
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
 				congestion_wait(WRITE, HZ/10);
 			else
 				break;	/* All the old data is written */

-- 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found] ` <20071002090254.987182999@mail.ustc.edu.cn>
  2007-10-02  8:41   ` [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io Fengguang Wu
@ 2007-10-02 21:47   ` David Chinner
       [not found]     ` <20071003013439.GA6501@mail.ustc.edu.cn>
  1 sibling, 1 reply; 19+ messages in thread
From: David Chinner @ 2007-10-02 21:47 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: Andrew Morton, linux-kernel, David Chinner, Ken Chen,
	Andrew Morton, Michael Rubin

On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
>  		wbc.pages_skipped = 0;
> @@ -560,8 +561,9 @@ static void background_writeout(unsigned
>  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
>  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
>  			/* Wrote less than expected */
> -			congestion_wait(WRITE, HZ/10);
> -			if (!wbc.encountered_congestion)
> +			if (wbc.encountered_congestion || wbc.more_io)
> +				congestion_wait(WRITE, HZ/10);
> +			else
>  				break;
>  		}

Why do you call congestion_wait() if there is more I/O to issue?  If
we have a fast filesystem, this might cause the device queues to
fill, then drain on congestion_wait(), then fill again, etc. i.e. we
will have trouble keeping the queues full, right?

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
       [not found] ` <20071002090254.873023041@mail.ustc.edu.cn>
  2007-10-02  8:41   ` [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page() Fengguang Wu
@ 2007-10-02 21:55   ` David Chinner
       [not found]     ` <20071003014333.GB6501@mail.ustc.edu.cn>
  1 sibling, 1 reply; 19+ messages in thread
From: David Chinner @ 2007-10-02 21:55 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: Andrew Morton, linux-kernel, David Chinner, Ken Chen,
	Andrew Morton, Michael Rubin

> 
> do not quite agree with each other. The page writeback should be skipped for
> 'locked buffer', but here it is 'clean buffer'!

Ok, so that means we need an equivalent fix in xfs_start_page_writeback()
as it will skip pages with clean buffers just like this. Something like
this (untested)?

---
 fs/xfs/linux-2.6/xfs_aops.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/linux-2.6/xfs_aops.c	2007-10-02 16:12:56.000000000 +1000
+++ 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_aops.c	2007-10-03 07:53:27.866602431 +1000
@@ -420,10 +420,9 @@ xfs_start_page_writeback(
 		clear_page_dirty_for_io(page);
 	set_page_writeback(page);
 	unlock_page(page);
-	if (!buffers) {
+	/* If no buffers on the page are to be written, finish it here */
+	if (!buffers)
 		end_page_writeback(page);
-		wbc->pages_skipped++;	/* We didn't write this page */
-	}
 }
 
 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)

-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]     ` <20071003013439.GA6501@mail.ustc.edu.cn>
@ 2007-10-03  1:34       ` Fengguang Wu
  2007-10-03  2:41       ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-03  1:34 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, linux-kernel, Ken Chen, Andrew Morton,
	Michael Rubin

On Wed, Oct 03, 2007 at 07:47:45AM +1000, David Chinner wrote:
> On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
> >  		wbc.pages_skipped = 0;
> > @@ -560,8 +561,9 @@ static void background_writeout(unsigned
> >  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> >  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> >  			/* Wrote less than expected */
> > -			congestion_wait(WRITE, HZ/10);
> > -			if (!wbc.encountered_congestion)
> > +			if (wbc.encountered_congestion || wbc.more_io)
> > +				congestion_wait(WRITE, HZ/10);
> > +			else
> >  				break;
> >  		}
> 
> Why do you call congestion_wait() if there is more I/O to issue?  If
> we have a fast filesystem, this might cause the device queues to
> fill, then drain on congestion_wait(), then fill again, etc. i.e. we
> will have trouble keeping the queues full, right?

You mean slow writers and fast RAID? That would be exactly the case
these patches try to improve.

The old writeback behaviors are sluggish when there is
        - single big dirty file;
        - single congested device
the queues may well build up slowly, hit background_limit, and
continue to build up, until hit dirty_limit. That means:
        - kupdate writeback could leave behind many expired dirty data
        - background writeback used to return prematurely
        - eventually it relies on balance_dirty_pages() to do the job,
          which means
          - writers get throttled unnecessarily
          - dirty_limit pages are pinned unnecessarily

This patchset makes kupdate/background writeback more responsible,
so that if (avg-write-speed < device-capabilities), the dirty data are
synced timely, and we don't have to go for balance_dirty_pages().

So for your question of queue depth, the answer is: the queue length
will not build up in the first place. 

Also the name of congestion_wait() could be misleading:
- when not congested, congestion_wait() will wakeup on write
  completions;
- when congested, congestion_wait() could also wakeup on write
  completions on other non-congested devices.
So congestion_wait(100ms) normally only takes 0.1-10ms.

For the more_io case, congestion_wait() serves more like 'to take a
breath'. Tests show that the system could go mad without it.

Regards,
Fengguang


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
       [not found]     ` <20071003014333.GB6501@mail.ustc.edu.cn>
@ 2007-10-03  1:43       ` Fengguang Wu
  2007-10-03  2:22       ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-03  1:43 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, linux-kernel, Ken Chen, Andrew Morton,
	Michael Rubin

On Wed, Oct 03, 2007 at 07:55:18AM +1000, David Chinner wrote:
> > 
> > do not quite agree with each other. The page writeback should be skipped for
> > 'locked buffer', but here it is 'clean buffer'!
> 
> Ok, so that means we need an equivalent fix in xfs_start_page_writeback()
> as it will skip pages with clean buffers just like this. Something like
> this (untested)?

Sure OK - as long as it is 'no write because of clean buffer'.
The only user of pages_skipped is obviously using that semantics.

Andrew, here is the expanded patch:
---
writeback: remove pages_skipped accounting in __block_write_full_page()

Miklos Szeredi <miklos@szeredi.hu> and me identified a writeback bug:

> The following strange behavior can be observed:
>
> 1. large file is written
> 2. after 30 seconds, nr_dirty goes down by 1024
> 3. then for some time (< 30 sec) nothing happens (disk idle)
> 4. then nr_dirty again goes down by 1024
> 5. repeat from 3. until whole file is written
>
> So basically a 4Mbyte chunk of the file is written every 30 seconds.
> I'm quite sure this is not the intended behavior.

It can be produced by the following test scheme:

# cat bin/test-writeback.sh 
grep nr_dirty /proc/vmstat
echo 1 > /proc/sys/fs/inode_debug
dd if=/dev/zero of=/var/x bs=1K count=204800&
while true; do grep nr_dirty /proc/vmstat; sleep 1; done

# bin/test-writeback.sh
nr_dirty 19207
nr_dirty 19207
nr_dirty 30924
204800+0 records in
204800+0 records out
209715200 bytes (210 MB) copied, 1.58363 seconds, 132 MB/s
nr_dirty 47150
nr_dirty 47141
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47205
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47214
nr_dirty 47215
nr_dirty 47216
nr_dirty 47216
nr_dirty 47216
nr_dirty 47154
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47143
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47142
nr_dirty 47134
nr_dirty 47134
nr_dirty 47135
nr_dirty 47135
nr_dirty 47135
nr_dirty 46097 <== -1038
nr_dirty 46098
nr_dirty 46098
nr_dirty 46098
[...]
nr_dirty 46091
nr_dirty 46092
nr_dirty 46092
nr_dirty 45069 <== -1023
nr_dirty 45056
nr_dirty 45056
nr_dirty 45056
[...]
nr_dirty 37822
nr_dirty 36799 <== -1023
[...]
nr_dirty 36781
nr_dirty 35758 <== -1023
[...]
nr_dirty 34708
nr_dirty 33672 <== -1024
[...]
nr_dirty 33692
nr_dirty 32669 <== -1023


% ls -li /var/x
847824 -rw-r--r-- 1 root root 200M 2007-08-12 04:12 /var/x

% dmesg|grep 847824  # generated by a debug printk
[  529.263184] redirtied inode 847824 line 548
[  564.250872] redirtied inode 847824 line 548
[  594.272797] redirtied inode 847824 line 548
[  629.231330] redirtied inode 847824 line 548
[  659.224674] redirtied inode 847824 line 548
[  689.219890] redirtied inode 847824 line 548
[  724.226655] redirtied inode 847824 line 548
[  759.198568] redirtied inode 847824 line 548

# line 548 in fs/fs-writeback.c:
543                 if (wbc->pages_skipped != pages_skipped) {
544                         /*
545                          * writeback is not making progress due to locked
546                          * buffers.  Skip this inode for now.
547                          */
548                         redirty_tail(inode);
549                 }

More debug efforts show that __block_write_full_page()
never has the chance to call submit_bh() for that big dirty file:
the buffer head is *clean*. So basically no page I/O is issued by
__block_write_full_page(), hence pages_skipped goes up.

Also the comment in generic_sync_sb_inodes():

544                         /*
545                          * writeback is not making progress due to locked
546                          * buffers.  Skip this inode for now.
547                          */

and the comment in __block_write_full_page():

1713                 /*
1714                  * The page was marked dirty, but the buffers were
1715                  * clean.  Someone wrote them back by hand with
1716                  * ll_rw_block/submit_bh.  A rare case.
1717                  */

do not quite agree with each other. The page writeback should be skipped for
'locked buffer', but here it is 'clean buffer'!

This patch fixes this bug. Though I'm not sure why __block_write_full_page()
is called only to do nothing and who actually issued the writeback for us.

This is the two possible new behaviors after the patch:

1) pretty nice: wait 30s and write ALL:)
2) not so good:
	- during the dd: ~16M 
	- after 30s:      ~4M
	- after 5s:       ~4M
	- after 5s:     ~176M

The next patch will fix case (2).

Cc: Ken Chen <kenchen@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Signed-off-by: David Chinner <dgc@sgi.com>
---
 fs/buffer.c                 |    1 -
 fs/xfs/linux-2.6/xfs_aops.c |    5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

--- linux-2.6.23-rc8-mm2.orig/fs/buffer.c
+++ linux-2.6.23-rc8-mm2/fs/buffer.c
@@ -1737,7 +1737,6 @@ done:
 		 * The page and buffer_heads can be released at any time from
 		 * here on.
 		 */
-		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 
--- linux-2.6.23-rc8-mm2.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ linux-2.6.23-rc8-mm2/fs/xfs/linux-2.6/xfs_aops.c
@@ -420,10 +420,9 @@ xfs_start_page_writeback(
 		clear_page_dirty_for_io(page);
 	set_page_writeback(page);
 	unlock_page(page);
-	if (!buffers) {
+	/* If no buffers on the page are to be written, finish it here */
+	if (!buffers)
 		end_page_writeback(page);
-		wbc->pages_skipped++;	/* We didn't write this page */
-	}
 }
 
 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
       [not found]     ` <20071003014333.GB6501@mail.ustc.edu.cn>
  2007-10-03  1:43       ` Fengguang Wu
@ 2007-10-03  2:22       ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: David Chinner @ 2007-10-03  2:22 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: David Chinner, Andrew Morton, linux-kernel, Ken Chen,
	Andrew Morton, Michael Rubin

On Wed, Oct 03, 2007 at 09:43:33AM +0800, Fengguang Wu wrote:
> On Wed, Oct 03, 2007 at 07:55:18AM +1000, David Chinner wrote:
> > > 
> > > do not quite agree with each other. The page writeback should be skipped for
> > > 'locked buffer', but here it is 'clean buffer'!
> > 
> > Ok, so that means we need an equivalent fix in xfs_start_page_writeback()
> > as it will skip pages with clean buffers just like this. Something like
> > this (untested)?
> 
> Sure OK - as long as it is 'no write because of clean buffer'.

Yes, that's the case here.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]     ` <20071003013439.GA6501@mail.ustc.edu.cn>
  2007-10-03  1:34       ` Fengguang Wu
@ 2007-10-03  2:41       ` David Chinner
       [not found]         ` <20071004022133.GA6244@mail.ustc.edu.cn>
  1 sibling, 1 reply; 19+ messages in thread
From: David Chinner @ 2007-10-03  2:41 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: David Chinner, Andrew Morton, linux-kernel, Ken Chen,
	Andrew Morton, Michael Rubin

On Wed, Oct 03, 2007 at 09:34:39AM +0800, Fengguang Wu wrote:
> On Wed, Oct 03, 2007 at 07:47:45AM +1000, David Chinner wrote:
> > On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
> > >  		wbc.pages_skipped = 0;
> > > @@ -560,8 +561,9 @@ static void background_writeout(unsigned
> > >  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> > >  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> > >  			/* Wrote less than expected */
> > > -			congestion_wait(WRITE, HZ/10);
> > > -			if (!wbc.encountered_congestion)
> > > +			if (wbc.encountered_congestion || wbc.more_io)
> > > +				congestion_wait(WRITE, HZ/10);
> > > +			else
> > >  				break;
> > >  		}
> > 
> > Why do you call congestion_wait() if there is more I/O to issue?  If
> > we have a fast filesystem, this might cause the device queues to
> > fill, then drain on congestion_wait(), then fill again, etc. i.e. we
> > will have trouble keeping the queues full, right?
> 
> You mean slow writers and fast RAID? That would be exactly the case
> these patches try to improve.

I mean any writers and a fast block device (raid or otherwise).

> This patchset makes kupdate/background writeback more responsible,
> so that if (avg-write-speed < device-capabilities), the dirty data are
> synced timely, and we don't have to go for balance_dirty_pages().

Sure, but I'm asking about the effect of the patches on the
(avg-write-speed == device-capabilities) case. I agree that
they are necessary for timely syncing of data but I'm trying
to understand what effect they have on the normal write case
(i.e. keeping the disk at full write throughput).

> So for your question of queue depth, the answer is: the queue length
> will not build up in the first place. 

Which queue are you talking about here? The block device queue?

> Also the name of congestion_wait() could be misleading:
> - when not congested, congestion_wait() will wakeup on write
>   completions;
> - when congested, congestion_wait() could also wakeup on write
>   completions on other non-congested devices.
> So congestion_wait(100ms) normally only takes 0.1-10ms.

True, but if we know we are not congested and have more work
to do, why sleep at all?

> For the more_io case, congestion_wait() serves more like 'to take a
> breath'. Tests show that the system could go mad without it.

I'm interested to know what tests show that pushing more I/O when
you don't have block device congestion makes the system go mad (and
what mad means).  It sounds to me like it's hiding (yet another)
bug in the writeback code......

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 0/5] sluggish writeback fixes
  2007-10-02  8:41 ` [PATCH 0/5] sluggish writeback fixes Fengguang Wu
@ 2007-10-03 11:04   ` Martin Knoblauch
  0 siblings, 0 replies; 19+ messages in thread
From: Martin Knoblauch @ 2007-10-03 11:04 UTC (permalink / raw)
  To: Fengguang Wu, Andrew Morton; +Cc: linux-kernel, Michael Rubin


--- Fengguang Wu <wfg@mail.ustc.edu.cn> wrote:

> Andrew,
> 
> The following patches fix the sluggish writeback behavior.
> They are well understood and well tested - but not yet widely tested.
> 
> The first patch reverts the debugging -mm only
> check_dirty_inode_list.patch -
> which is no longer necessary.
> 
> The following 4 patches do the real jobs:
> 
> [PATCH 2/5] writeback: fix time ordering of the per superblock inode
> lists 8
> [PATCH 3/5] writeback: fix ntfs with sb_has_dirty_inodes()
> [PATCH 4/5] writeback: remove pages_skipped accounting in
> __block_write_full_page()
> [PATCH 5/5] writeback: introduce writeback_control.more_io to
> indicate more io
> 
> They share the same goal as the following patches in -mm. Therefore
> I'd
> recommend to put the last 4 new ones after them:
> 
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-2.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-3.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-4.patch
> writeback-fix-comment-use-helper-function.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-5.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-6.patch
>
writeback-fix-time-ordering-of-the-per-superblock-dirty-inode-lists-7.patch
> writeback-fix-periodic-superblock-dirty-inode-flushing.patch
> 
> Regards,
> Fengguang
Hi Fenguang,

 now that Peters stuff seems to make it into mainline, do you think
your fixes should go in as well? Would definitely help to broaden the
tester base. Definitely by one very interested tester :-)

Keep on the good work
Martin

------------------------------------------------------
Martin Knoblauch
email: k n o b i AT knobisoft DOT de
www:   http://www.knobisoft.de

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]         ` <20071004022133.GA6244@mail.ustc.edu.cn>
@ 2007-10-04  2:21           ` Fengguang Wu
  2007-10-04  5:03           ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-04  2:21 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, linux-kernel, Ken Chen, Andrew Morton,
	Michael Rubin

On Wed, Oct 03, 2007 at 12:41:19PM +1000, David Chinner wrote:
> On Wed, Oct 03, 2007 at 09:34:39AM +0800, Fengguang Wu wrote:
> > On Wed, Oct 03, 2007 at 07:47:45AM +1000, David Chinner wrote:
> > > On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
> > > >  		wbc.pages_skipped = 0;
> > > > @@ -560,8 +561,9 @@ static void background_writeout(unsigned
> > > >  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> > > >  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> > > >  			/* Wrote less than expected */
> > > > -			congestion_wait(WRITE, HZ/10);
> > > > -			if (!wbc.encountered_congestion)
> > > > +			if (wbc.encountered_congestion || wbc.more_io)
> > > > +				congestion_wait(WRITE, HZ/10);
> > > > +			else
> > > >  				break;
> > > >  		}
> > > 
> > > Why do you call congestion_wait() if there is more I/O to issue?  If
> > > we have a fast filesystem, this might cause the device queues to
> > > fill, then drain on congestion_wait(), then fill again, etc. i.e. we
> > > will have trouble keeping the queues full, right?
> > 
> > You mean slow writers and fast RAID? That would be exactly the case
> > these patches try to improve.
> 
> I mean any writers and a fast block device (raid or otherwise).
> 
> > This patchset makes kupdate/background writeback more responsible,
> > so that if (avg-write-speed < device-capabilities), the dirty data are
> > synced timely, and we don't have to go for balance_dirty_pages().
> 
> Sure, but I'm asking about the effect of the patches on the
> (avg-write-speed == device-capabilities) case. I agree that
> they are necessary for timely syncing of data but I'm trying
> to understand what effect they have on the normal write case

> (i.e. keeping the disk at full write throughput).

OK, I guess it is the focus of all your questions: Why should we sleep
in congestion_wait() and possibly hurt the write throughput? I'll try
to summary it:

- congestion_wait() is necessary
Besides device congestions, there may be other blockades we have to
wait on, e.g. temporary page locks, NFS/journal issues(I guess).

- congestion_wait() is called only when necessary
congestion_wait() will only be called when we see blockades:
        if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                congestion_wait(WRITE, HZ/10);
        }
So in normal case, it may well write 128MB data without any waiting.

- congestion_wait() won't hurt write throughput
When not congested, congestion_wait() will be wake up on each write
completion. Note that MAX_WRITEBACK_PAGES=1024 and
/sys/block/sda/queue/max_sectors_kb=512(for me),
which means we are given the chance to sync 4MB on every 512KB written,
which means we are able to submit write IOs 8 times faster than the
device capability. congestion_wait() is a magical timer :-)

> > So for your question of queue depth, the answer is: the queue length
> > will not build up in the first place. 
> 
> Which queue are you talking about here? The block deivce queue?

Yes, the elevator's queues.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]         ` <20071004022133.GA6244@mail.ustc.edu.cn>
  2007-10-04  2:21           ` Fengguang Wu
@ 2007-10-04  5:03           ` David Chinner
       [not found]             ` <20071005033652.GA6448@mail.ustc.edu.cn>
  1 sibling, 1 reply; 19+ messages in thread
From: David Chinner @ 2007-10-04  5:03 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: David Chinner, Andrew Morton, linux-kernel, Ken Chen,
	Andrew Morton, Michael Rubin

On Thu, Oct 04, 2007 at 10:21:33AM +0800, Fengguang Wu wrote:
> On Wed, Oct 03, 2007 at 12:41:19PM +1000, David Chinner wrote:
> > On Wed, Oct 03, 2007 at 09:34:39AM +0800, Fengguang Wu wrote:
> > > On Wed, Oct 03, 2007 at 07:47:45AM +1000, David Chinner wrote:
> > > > On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
> > > > >  		wbc.pages_skipped = 0;
> > > > > @@ -560,8 +561,9 @@ static void background_writeout(unsigned
> > > > >  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> > > > >  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> > > > >  			/* Wrote less than expected */
> > > > > -			congestion_wait(WRITE, HZ/10);
> > > > > -			if (!wbc.encountered_congestion)
> > > > > +			if (wbc.encountered_congestion || wbc.more_io)
> > > > > +				congestion_wait(WRITE, HZ/10);
> > > > > +			else
> > > > >  				break;
> > > > >  		}
> > > > 
> > > > Why do you call congestion_wait() if there is more I/O to issue?  If
> > > > we have a fast filesystem, this might cause the device queues to
> > > > fill, then drain on congestion_wait(), then fill again, etc. i.e. we
> > > > will have trouble keeping the queues full, right?
> > > 
> > > You mean slow writers and fast RAID? That would be exactly the case
> > > these patches try to improve.
> > 
> > I mean any writers and a fast block device (raid or otherwise).
> > 
> > > This patchset makes kupdate/background writeback more responsible,
> > > so that if (avg-write-speed < device-capabilities), the dirty data are
> > > synced timely, and we don't have to go for balance_dirty_pages().
> > 
> > Sure, but I'm asking about the effect of the patches on the
> > (avg-write-speed == device-capabilities) case. I agree that
> > they are necessary for timely syncing of data but I'm trying
> > to understand what effect they have on the normal write case
> 
> > (i.e. keeping the disk at full write throughput).
> 
> OK, I guess it is the focus of all your questions: Why should we sleep
> in congestion_wait() and possibly hurt the write throughput? I'll try
> to summary it:
> 
> - congestion_wait() is necessary
> Besides device congestions, there may be other blockades we have to
> wait on, e.g. temporary page locks, NFS/journal issues(I guess).

We skip locked pages in writeback, and if some filesystems have
blocking issues that require non-blocking writeback waits for some
I/O to complete before re-entering writeback, then perhaps they should be
setting wbc->encountered_congestion to tell writeback to back off.

The question I'm asking is that if more_io tells us we have more
work to do, why do we have to sleep first if the block dev is
able to take more I/O?

> 
> - congestion_wait() is called only when necessary
> congestion_wait() will only be called we saw blockades:
>         if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
>                 congestion_wait(WRITE, HZ/10);
>         }
> So in normal case, it may well write 128MB data without any waiting.

Sure, but wbc.more_io doesn't indicate a blockade - just that there
is more work to do, right?

> - congestion_wait() won't hurt write throughput
> When not congested, congestion_wait() will be wake up on each write
> completion.

What happens if the I/O we issued has already completed before we
got back up to the congestion_wait() call? We'll spend 100ms
sleeping when we shouldn't have and throughput goes down by 10% on
every occurrence....

if we've got more work to do, then we should do it without an
arbitrary, non-deterministic delay being inserted. If the delay is
needed to prevent the system from "going mad" (whatever that means),
then what's the explanation for the system "going mad"?

> Note that MAX_WRITEBACK_PAGES=1024 and
> /sys/block/sda/queue/max_sectors_kb=512(for me),
> which means we are gave the chance to sync 4MB on every 512KB written,
> which means we are able to submit write IOs 8 times faster than the
> device capability. congestion_wait() is a magical timer :-)

So, with Jens Axboe's sglist chaining, that single I/O could now
be up to 32MB on some hardware. IOWs, we push 1024 pages, and that
could end up as a single I/O being issued to disk.

Your magic just broke. :/

> > > So for your question of queue depth, the answer is: the queue length
> > > will not build up in the first place. 
> > 
> > Which queue are you talking about here? The block deivce queue?
> 
> Yes, the elevator's queues.

I think this is the wrong thing to be doing and is detrimental
to I/O performance because it will reduce elevator efficiency.

The elevator can only work efficiently if we allow the queues to
build up. The deeper the queue, the better the elevator can sort the
I/O requests and keep the device at maximum efficiency.  If we don't
push enough I/O into the queues then we miss opportunities to combine
adjacent I/Os and reduce the seek load of writeback. Also, a shallow
queue will run dry if we don't get back to it in time, which is
possible if we wait for I/O to complete before we go and flush
more....

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page()
  2007-10-02  8:41   ` [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page() Fengguang Wu
@ 2007-10-04 21:26     ` Andrew Morton
  0 siblings, 0 replies; 19+ messages in thread
From: Andrew Morton @ 2007-10-04 21:26 UTC (permalink / raw)
  To: Fengguang Wu; +Cc: linux-kernel, dgc, kenchen, mrubin

On Tue, 02 Oct 2007 16:41:47 +0800
Fengguang Wu <wfg@mail.ustc.edu.cn> wrote:

> This patch fixes this bug. Though I'm not sure why __block_write_full_page()
> is called only to do nothing and who actually issued the writeback for us.

kjourald wrote the page's buffers back (ext3 in ordered-data mode).  The VM
didn't know about that, so we have a PageDirty page which has clean
buffers.

We rely upon the VFS writeback code to "discover" that this dirty page has
clean buffers: the VFS will attempt to write the dirty page and will end up
marking the page clean without performing any IO.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]             ` <20071005033652.GA6448@mail.ustc.edu.cn>
@ 2007-10-05  3:36               ` Fengguang Wu
  2007-10-05  7:41               ` David Chinner
  1 sibling, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-05  3:36 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, linux-kernel, Ken Chen, Andrew Morton,
	Michael Rubin

On Thu, Oct 04, 2007 at 03:03:44PM +1000, David Chinner wrote:
> On Thu, Oct 04, 2007 at 10:21:33AM +0800, Fengguang Wu wrote:
> > On Wed, Oct 03, 2007 at 12:41:19PM +1000, David Chinner wrote:
> > > On Wed, Oct 03, 2007 at 09:34:39AM +0800, Fengguang Wu wrote:
> > > > On Wed, Oct 03, 2007 at 07:47:45AM +1000, David Chinner wrote:
> > > > > On Tue, Oct 02, 2007 at 04:41:48PM +0800, Fengguang Wu wrote:
> > > > > >  		wbc.pages_skipped = 0;
> > > > > > @@ -560,8 +561,9 @@ static void background_writeout(unsigned
> > > > > >  		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> > > > > >  		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> > > > > >  			/* Wrote less than expected */
> > > > > > -			congestion_wait(WRITE, HZ/10);
> > > > > > -			if (!wbc.encountered_congestion)
> > > > > > +			if (wbc.encountered_congestion || wbc.more_io)
> > > > > > +				congestion_wait(WRITE, HZ/10);
> > > > > > +			else
> > > > > >  				break;
> > > > > >  		}
> > > > > 
> > > > > Why do you call congestion_wait() if there is more I/O to issue?  If
> > > > > we have a fast filesystem, this might cause the device queues to
> > > > > fill, then drain on congestion_wait(), then fill again, etc. i.e. we
> > > > > will have trouble keeping the queues full, right?
> > > > 
> > > > You mean slow writers and fast RAID? That would be exactly the case
> > > > these patches try to improve.
> > > 
> > > I mean any writers and a fast block device (raid or otherwise).
> > > 
> > > > This patchset makes kupdate/background writeback more responsible,
> > > > so that if (avg-write-speed < device-capabilities), the dirty data are
> > > > synced timely, and we don't have to go for balance_dirty_pages().
> > > 
> > > Sure, but I'm asking about the effect of the patches on the
> > > (avg-write-speed == device-capabilities) case. I agree that
> > > they are necessary for timely syncing of data but I'm trying
> > > to understand what effect they have on the normal write case
> > 
> > > (i.e. keeping the disk at full write throughput).
> > 
> > OK, I guess it is the focus of all your questions: Why should we sleep
> > in congestion_wait() and possibly hurt the write throughput? I'll try
> > to summary it:
> > 
> > - congestion_wait() is necessary
> > Besides device congestions, there may be other blockades we have to
> > wait on, e.g. temporary page locks, NFS/journal issues(I guess).
> 
> We skip locked pages in writeback, and if some filesystems have
> blocking issues that require non-blocking writeback waits for some
> I/O to complete before re-entering writeback, then perhaps they should be
> setting wbc->encountered_congestion to tell writeback to back off.

We have wbc->pages_skipped for that :-)

> The question I'm asking is that if more_io tells us we have more
> work to do, why do we have to sleep first if the block dev is
> able to take more I/O?

See below.

> > 
> > - congestion_wait() is called only when necessary
> > congestion_wait() will only be called we saw blockades:
> >         if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> >                 congestion_wait(WRITE, HZ/10);
> >         }
> > So in normal case, it may well write 128MB data without any waiting.
> 
> Sure, but wbc.more_io doesn't indicate a blockade - just that there
> is more work to do, right?
 
It's not wbc.more_io, but the context (wbc.pages_skipped > 0) indicates
a blockade:
        
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {    /* all-written or blockade... */
        if (wbc.encountered_congestion || wbc.more_io) /* blockade! */
                congestion_wait(WRITE, HZ/10);
        else                                           /* all-written! */
                break;
}

We can also read the whole background_writeout() logic as

while (!done) {
        /* sync _all_ sync-able data */
        congestion_wait(100ms);
}

And an example run could be:

sync 1000MB, skipped 100MB
congestion_wait(100ms);
sync 100MB, skipped 10MB
congestion_wait(100ms);
sync 10MB, all done

Note that it's far from "wait 100ms for every 4MB" (which is merely
the worst possible case).

> > - congestion_wait() won't hurt write throughput
> > When not congested, congestion_wait() will be wake up on each write
> > completion.
> 
> What happens if the I/O we issued has already completed before we
> got back up to the congestion_wait() call? We'll spend 100ms
> sleeping when we shouldn't have and throughput goes down by 10% on
> every occurrence....

Ah, that was out of my imagination. Maybe we could do with

        if (wbc.more_io)
                congestion_wait(WRITE, 1);

It's at least 10 times better.

> if we've got more work to do, then we should do it without an
> arbitrary, non-deterministic delay being inserted. If the delay is
> needed to prevent he system from "going mad" (whatever tht means),
> then what's the explaination for the system "going mad"?

"going mad" means "busy waiting".

> > Note that MAX_WRITEBACK_PAGES=1024 and
> > /sys/block/sda/queue/max_sectors_kb=512(for me),
> > which means we are gave the chance to sync 4MB on every 512KB written,
> > which means we are able to submit write IOs 8 times faster than the
> > device capability. congestion_wait() is a magical timer :-)
> 
> So, with Jens Axboe's sglist chaining, that single I/O could now
> be up to 32MB on some hardware. IOWs, we push 1024 pages, and that
> could end up as a single I/O being issued to disk.
> 
> Your magic just broke. :/

Hmm, congestion_wait(WRITE, 1) could re-establish the balance ;-)
Which waits <10ms for HZ=100.
 
> > > > So for your question of queue depth, the answer is: the queue length
> > > > will not build up in the first place. 
> > > 
> > > Which queue are you talking about here? The block deivce queue?
> > 
> > Yes, the elevator's queues.
> 
> I think this is the wrong thing to be doing and is detrimental
> to I/o perfomrance because it wil reduce elevator efficiency.
> 
> The elevator can only work efficiently if we allow the queues to
> build up. The deeper the queue, the better the elevator can sort the
> I/o requests and keep the device at maximum efficiency.  If we don't
> push enough I/O into the queues the we miss opportunities to combine
> adjacent I/Os and reduce the seek load of writeback. Also, a shallow
> queue will run dry if we don't get back to it in time which is
> possible if we wait for I/o to complete before we go and flush
> more....

Sure, the queues should be filled as fast as possible.
How fast can we fill the queue? Let's measure it:

//generated by the patch below

[  871.430700] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 54289 global 29911 0 0 wc _M tw -12 sk 0
[  871.444718] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 53253 global 28857 0 0 wc _M tw -12 sk 0
[  871.458764] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 52217 global 27834 0 0 wc _M tw -12 sk 0
[  871.472797] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 51181 global 26780 0 0 wc _M tw -12 sk 0
[  871.486825] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 50145 global 25757 0 0 wc _M tw -12 sk 0
[  871.500857] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 49109 global 24734 0 0 wc _M tw -12 sk 0
[  871.514864] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 48073 global 23680 0 0 wc _M tw -12 sk 0
[  871.528889] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 47037 global 22657 0 0 wc _M tw -12 sk 0
[  871.542894] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 46001 global 21603 0 0 wc _M tw -12 sk 0
[  871.556927] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 44965 global 20580 0 0 wc _M tw -12 sk 0
[  871.570961] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 43929 global 19557 0 0 wc _M tw -12 sk 0
[  871.584992] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 42893 global 18503 0 0 wc _M tw -12 sk 0
[  871.599005] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 41857 global 17480 0 0 wc _M tw -12 sk 0
[  871.613027] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 40821 global 16426 0 0 wc _M tw -12 sk 0
[  871.628626] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 39785 global 15403 961 0 wc _M tw -12 sk 0
[  871.644439] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 38749 global 14380 1550 0 wc _M tw -12 sk 0
[  871.660267] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 37713 global 13326 2573 0 wc _M tw -12 sk 0
[  871.676236] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 36677 global 12303 3224 0 wc _M tw -12 sk 0
[  871.692021] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 35641 global 11280 4154 0 wc _M tw -12 sk 0
[  871.707824] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 34605 global 10226 4929 0 wc _M tw -12 sk 0
[  871.723638] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 33569 global 9203 5735 0 wc _M tw -12 sk 0
[  871.739708] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 32533 global 8149 6603 0 wc _M tw -12 sk 0
[  871.756407] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 31497 global 7126 7409 0 wc _M tw -12 sk 0
[  871.772165] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 30461 global 6103 8246 0 wc _M tw -12 sk 0
[  871.788035] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 29425 global 5049 9052 0 wc _M tw -12 sk 0
[  871.803896] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 28389 global 4026 9982 0 wc _M tw -12 sk 0
[  871.820427] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 27353 global 2972 10757 0 wc _M tw -12 sk 0
[  871.836728] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 26317 global 1949 11656 0 wc _M tw -12 sk 0
[  871.853286] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 25281 global 895 12431 0 wc _M tw -12 sk 0
[  871.868762] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 24245 global 58 13051 0 wc __ tw 168 sk 0

It's an Intel Core 2 2.93GHz CPU and a SATA disk.
The trace shows that
- there's no congestion_wait() called in wb_kupdate()
- it takes wb_kupdate() ~15ms to sync every 4MB 

So I guess congestion_wait(WRITE, 1) will be more than enough.

However, wb_kupdate() is syncing the data a bit slow(4*1000/15=266MB/s),
could it be because of a lot of cond_resched()?


Fengguang
---
 mm/page-writeback.c |   23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

--- linux-2.6.23-rc8-mm2.orig/mm/page-writeback.c
+++ linux-2.6.23-rc8-mm2/mm/page-writeback.c
@@ -98,6 +98,26 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
+#define writeback_debug_report(n, wbc) do {                               \
+	__writeback_debug_report(n, wbc, __FILE__, __LINE__, __FUNCTION__); \
+} while (0)
+
+void __writeback_debug_report(long n, struct writeback_control *wbc,
+		const char *file, int line, const char *func)
+{
+	printk("%s %d %s: %s(%d) %ld "
+			"global %lu %lu %lu "
+			"wc %c%c tw %ld sk %ld\n",
+			file, line, func,
+			current->comm, current->pid, n,
+			global_page_state(NR_FILE_DIRTY),
+			global_page_state(NR_WRITEBACK),
+			global_page_state(NR_UNSTABLE_NFS),
+			wbc->encountered_congestion ? 'C':'_',
+			wbc->more_io ? 'M':'_',
+			wbc->nr_to_write,
+			wbc->pages_skipped);
+}
 
 static void background_writeout(unsigned long _min_pages);
 
@@ -404,6 +424,7 @@ static void balance_dirty_pages(struct a
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			writeback_debug_report(pages_written, &wbc);
 		}
 
 		/*
@@ -568,6 +589,7 @@ static void background_writeout(unsigned
 		wbc.pages_skipped = 0;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		writeback_debug_report(min_pages, &wbc);
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
 			if (wbc.encountered_congestion)
@@ -643,6 +665,7 @@ static void wb_kupdate(unsigned long arg
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
+		writeback_debug_report(nr_to_write, &wbc);
 		if (wbc.nr_to_write > 0) {
 			if (wbc.encountered_congestion)
 				congestion_wait(WRITE, HZ/10);


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]             ` <20071005033652.GA6448@mail.ustc.edu.cn>
  2007-10-05  3:36               ` Fengguang Wu
@ 2007-10-05  7:41               ` David Chinner
       [not found]                 ` <20071005115508.GA9998@mail.ustc.edu.cn>
  1 sibling, 1 reply; 19+ messages in thread
From: David Chinner @ 2007-10-05  7:41 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: David Chinner, Andrew Morton, linux-kernel, Ken Chen,
	Andrew Morton, Michael Rubin

On Fri, Oct 05, 2007 at 11:36:52AM +0800, Fengguang Wu wrote:
> On Thu, Oct 04, 2007 at 03:03:44PM +1000, David Chinner wrote:
> > On Thu, Oct 04, 2007 at 10:21:33AM +0800, Fengguang Wu wrote:
> > > OK, I guess it is the focus of all your questions: Why should we sleep
> > > in congestion_wait() and possibly hurt the write throughput? I'll try
> > > to summary it:
> > > 
> > > - congestion_wait() is necessary
> > > Besides device congestions, there may be other blockades we have to
> > > wait on, e.g. temporary page locks, NFS/journal issues(I guess).
> > 
> > We skip locked pages in writeback, and if some filesystems have
> > blocking issues that require non-blocking writeback waits for some
> > I/O to complete before re-entering writeback, then perhaps they should be
> > setting wbc->encountered_congestion to tell writeback to back off.
> 
> We have wbc->pages_skipped for that :-)

I walked right into that one ;)

> if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {    /* all-written or blockade... */
>         if (wbc.encountered_congestion || wbc.more_io) /* blockade! */
>                 congestion_wait(WRITE, HZ/10);
>         else                                           /* all-written! */
>                 break;
> }

From this, if we have more_io on one superblock and we skip pages on a
different superblock, the combination of the two will cause us to stop
writeback for a while. Is this the right thing to do?

> We can also read the whole background_writeout() logic as
> 
> while (!done) {
>         /* sync _all_ sync-able data */
>         congestion_wait(100ms);
> }

To me it reads as:

	while (!done) {
		/* sync all data or until one inode skips */
		congestion_wait(up to 100ms);
	}

and it ignores that we might have more superblocks with dirty data
on them that we haven't flushed because we skipped pages on
an inode on a different block device.


> Note that it's far from "wait 100ms for every 4MB" (which is merely
> the worst possible case).

If that's the worst case, then it's far better than the current
"wait 30s for every 4MB".  ;)

Still, if it can be improved....

> > > - congestion_wait() won't hurt write throughput
> > > When not congested, congestion_wait() will be wake up on each write
> > > completion.
> > 
> > What happens if the I/O we issued has already completed before we
> > got back up to the congestion_wait() call? We'll spend 100ms
> > sleeping when we shouldn't have and throughput goes down by 10% on
> > every occurrence....
> 
> Ah, that was out of my imagination. Maybe we could do with
> 
>         if (wbc.more_io)
>                 congestion_wait(WRITE, 1);
> 
> It's at least 10 times better.

And probably good enough to make it unnoticable.

> "going mad" means "busy waiting".

Ah, ok. that I understand ;)

> > > > > So for your question of queue depth, the answer is: the queue length
> > > > > will not build up in the first place. 
> > > > 
> > > > Which queue are you talking about here? The block device queue?
> > > 
> > > Yes, the elevator's queues.
> > 
> > I think this is the wrong thing to be doing and is detrimental
> > to I/O performance because it will reduce elevator efficiency.
> > 
> > The elevator can only work efficiently if we allow the queues to
> > build up. The deeper the queue, the better the elevator can sort the
> > I/O requests and keep the device at maximum efficiency.  If we don't
> > push enough I/O into the queues then we miss opportunities to combine
> > adjacent I/Os and reduce the seek load of writeback. Also, a shallow
> > queue will run dry if we don't get back to it in time which is
> > possible if we wait for I/O to complete before we go and flush
> > more....
> 
> Sure, the queues should be filled as fast as possible.
> How fast can we fill the queue? Let's measure it:
> 
> //generated by the patch below
> 
> [  871.430700] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 54289 global 29911 0 0 wc _M tw -12 sk 0
> [  871.444718] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 53253 global 28857 0 0 wc _M tw -12 sk 0
> [  871.458764] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 52217 global 27834 0 0 wc _M tw -12 sk 0
> [  871.472797] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 51181 global 26780 0 0 wc _M tw -12 sk 0
> [  871.486825] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 50145 global 25757 0 0 wc _M tw -12 sk 0
> [  871.500857] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 49109 global 24734 0 0 wc _M tw -12 sk 0
> [  871.514864] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 48073 global 23680 0 0 wc _M tw -12 sk 0
> [  871.528889] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 47037 global 22657 0 0 wc _M tw -12 sk 0
> [  871.542894] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 46001 global 21603 0 0 wc _M tw -12 sk 0
> [  871.556927] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 44965 global 20580 0 0 wc _M tw -12 sk 0
> [  871.570961] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 43929 global 19557 0 0 wc _M tw -12 sk 0
> [  871.584992] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 42893 global 18503 0 0 wc _M tw -12 sk 0
> [  871.599005] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 41857 global 17480 0 0 wc _M tw -12 sk 0
> [  871.613027] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 40821 global 16426 0 0 wc _M tw -12 sk 0
> [  871.628626] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 39785 global 15403 961 0 wc _M tw -12 sk 0
> [  871.644439] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 38749 global 14380 1550 0 wc _M tw -12 sk 0
> [  871.660267] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 37713 global 13326 2573 0 wc _M tw -12 sk 0
> [  871.676236] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 36677 global 12303 3224 0 wc _M tw -12 sk 0
> [  871.692021] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 35641 global 11280 4154 0 wc _M tw -12 sk 0
> [  871.707824] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 34605 global 10226 4929 0 wc _M tw -12 sk 0
> [  871.723638] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 33569 global 9203 5735 0 wc _M tw -12 sk 0
> [  871.739708] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 32533 global 8149 6603 0 wc _M tw -12 sk 0
> [  871.756407] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 31497 global 7126 7409 0 wc _M tw -12 sk 0
> [  871.772165] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 30461 global 6103 8246 0 wc _M tw -12 sk 0
> [  871.788035] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 29425 global 5049 9052 0 wc _M tw -12 sk 0
> [  871.803896] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 28389 global 4026 9982 0 wc _M tw -12 sk 0
> [  871.820427] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 27353 global 2972 10757 0 wc _M tw -12 sk 0
> [  871.836728] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 26317 global 1949 11656 0 wc _M tw -12 sk 0
> [  871.853286] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 25281 global 895 12431 0 wc _M tw -12 sk 0
> [  871.868762] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 24245 global 58 13051 0 wc __ tw 168 sk 0
> 
> It's an Intel Core 2 2.93GHz CPU and a SATA disk.
> The trace shows that
> - there's no congestion_wait() called in wb_kupdate()
> - it takes wb_kupdate() ~15ms to sync every 4MB 

But it takes a modern SATA disk ~40-50ms to write 4MB (80-100MB/s).
IOWs, what you've timed above is a burst workload, not a steady
state behaviour. And it actually shows that the elevator queues
are growing in contrast to your goal of preventing them from
growing.

In more detail, the first half of the trace indicates no pages under
writeback, that tends to imply that all I/O is complete by the
time wb_kupdate is woken - it's been sucked into the drive
cache as fast as possible.

About half way through we start to see windup of the number of
pages under writeback of about 800-900 pages per printk.  That's
1024 pages minus 1 or 2 512k I/Os. This implies that the disk cache
is now full and the disk has reached saturation. I/O is now
being queued in the elevator. The last trace has 13051 pages under
writeback, which at 128 pages per I/O is ~100 queued 512k I/Os.

The default queue depth with cfq is 128 requests, and IIRC it
congests at 7/8s full, or 112 requests. IOWs, the file that you
wrote was about 10MB short of what is needed to see congestion on
your test rig.

So the trace shows we slept on neither congestion or more_io
and it points towards congestion being the thing that will typically
block us on large file I/O. Before drawing any conclusions on
whether wbc.more_io is needed or not, do you have any way of
producing skipped pages when more_io is set?

> However, wb_kupdate() is syncing the data a bit slow(4*1000/15=266MB/s),
> could it be because of a lot of cond_resched()?

You are using ext3? That would be my guess based simply on the write
rate - ext3 has long been stuck at about that speed for buffered
writes even on much faster block devices.  If I'm right, try using
XFS and see how much differently it behaves. I bet you hit
congestion much sooner than you expect. ;)

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io
       [not found]                 ` <20071005115508.GA9998@mail.ustc.edu.cn>
@ 2007-10-05 11:55                   ` Fengguang Wu
  0 siblings, 0 replies; 19+ messages in thread
From: Fengguang Wu @ 2007-10-05 11:55 UTC (permalink / raw)
  To: David Chinner
  Cc: Andrew Morton, linux-kernel, Ken Chen, Andrew Morton,
	Michael Rubin

On Fri, Oct 05, 2007 at 05:41:03PM +1000, David Chinner wrote:
> On Fri, Oct 05, 2007 at 11:36:52AM +0800, Fengguang Wu wrote:
> > On Thu, Oct 04, 2007 at 03:03:44PM +1000, David Chinner wrote:
> > > On Thu, Oct 04, 2007 at 10:21:33AM +0800, Fengguang Wu wrote:

> > if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {    /* all-written or blockade... */
> >         if (wbc.encountered_congestion || wbc.more_io) /* blockade! */
> >                 congestion_wait(WRITE, HZ/10);
> >         else                                           /* all-written! */
> >                 break;
> > }
> 
> From this, if we have more_io on one superblock and we skip pages on a
> different superblock, the combination of the two will cause us to stop
> writeback for a while. Is this the right thing to do?

No, the two cases will occur at the same time to a super_block.
See below.

> > We can also read the whole background_writeout() logic as
> > 
> > while (!done) {
> >         /* sync _all_ sync-able data */
> >         congestion_wait(100ms);
> > }
> 
> To me it reads as:
> 
> 	while (!done) {
> 		/* sync all data or until one inode skips */
> 		congestion_wait(up to 100ms);
> 	}
> 
> and it ignores that we might have more superblocks with dirty data
> on them that we haven't flushed because we skipped pages on
> an inode on a different block device.

AFAIK, generic_sync_sb_inodes() will simply skip the inode in trouble
and _continue_ to sync other inodes:

                if (wbc->pages_skipped != pages_skipped) {
                        /*
                         * writeback is not making progress due to locked
                         * buffers.  Skip this inode for now.
                         */
                        redirty_tail(inode);
                }

Note that there's no "break" here.

> > Sure, the queues should be filled as fast as possible.
> > How fast can we fill the queue? Let's measure it:
> > 
> > //generated by the patch below
> > 
> > [  871.430700] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 54289 global 29911 0 0 wc _M tw -12 sk 0
> > [  871.444718] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 53253 global 28857 0 0 wc _M tw -12 sk 0
> > [  871.458764] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 52217 global 27834 0 0 wc _M tw -12 sk 0
> > [  871.472797] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 51181 global 26780 0 0 wc _M tw -12 sk 0
> > [  871.486825] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 50145 global 25757 0 0 wc _M tw -12 sk 0
> > [  871.500857] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 49109 global 24734 0 0 wc _M tw -12 sk 0
> > [  871.514864] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 48073 global 23680 0 0 wc _M tw -12 sk 0
> > [  871.528889] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 47037 global 22657 0 0 wc _M tw -12 sk 0
> > [  871.542894] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 46001 global 21603 0 0 wc _M tw -12 sk 0
> > [  871.556927] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 44965 global 20580 0 0 wc _M tw -12 sk 0
> > [  871.570961] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 43929 global 19557 0 0 wc _M tw -12 sk 0
> > [  871.584992] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 42893 global 18503 0 0 wc _M tw -12 sk 0
> > [  871.599005] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 41857 global 17480 0 0 wc _M tw -12 sk 0
> > [  871.613027] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 40821 global 16426 0 0 wc _M tw -12 sk 0
> > [  871.628626] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 39785 global 15403 961 0 wc _M tw -12 sk 0
> > [  871.644439] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 38749 global 14380 1550 0 wc _M tw -12 sk 0
> > [  871.660267] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 37713 global 13326 2573 0 wc _M tw -12 sk 0
> > [  871.676236] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 36677 global 12303 3224 0 wc _M tw -12 sk 0
> > [  871.692021] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 35641 global 11280 4154 0 wc _M tw -12 sk 0
> > [  871.707824] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 34605 global 10226 4929 0 wc _M tw -12 sk 0
> > [  871.723638] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 33569 global 9203 5735 0 wc _M tw -12 sk 0
> > [  871.739708] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 32533 global 8149 6603 0 wc _M tw -12 sk 0
> > [  871.756407] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 31497 global 7126 7409 0 wc _M tw -12 sk 0
> > [  871.772165] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 30461 global 6103 8246 0 wc _M tw -12 sk 0
> > [  871.788035] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 29425 global 5049 9052 0 wc _M tw -12 sk 0
> > [  871.803896] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 28389 global 4026 9982 0 wc _M tw -12 sk 0
> > [  871.820427] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 27353 global 2972 10757 0 wc _M tw -12 sk 0
> > [  871.836728] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 26317 global 1949 11656 0 wc _M tw -12 sk 0
> > [  871.853286] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 25281 global 895 12431 0 wc _M tw -12 sk 0
> > [  871.868762] mm/page-writeback.c 668 wb_kupdate: pdflush(202) 24245 global 58 13051 0 wc __ tw 168 sk 0
> > 
> > It's an Intel Core 2 2.93GHz CPU and a SATA disk.
> > The trace shows that
> > - there's no congestion_wait() called in wb_kupdate()
> > - it takes wb_kupdate() ~15ms to sync every 4MB 
> 
> But it takes a modern SATA disk ~40-50ms to write 4MB (80-100MB/s).
> IOWs, what you've timed above is a burst workload, not a steady
> state behaviour. And it actually shows that the elevator queues
> are growing in constrast to your goal of preventing them from
> growing.

My goal really? ;-)
 
> In more detail, the first half of the trace indicates no pages under
> writeback, that tends to imply that all I/O is complete by the
> time wb_kupdate is woken - it's been sucked into the drive
> cache as fast as possible.

Right.
 
> About half way through we start to see windup of the number of
> pages under writeback of about 800-900 pages per printk.  That's
> 1024 pages minus 1 or 2 512k I/Os. This implies that the disk cache
> is now full and the disk has reached saturation. I/O is now
> being queued in the elevator. The last trace has 13051 pages under
> writeback, which at 128 pages per I/O is ~100 queued 512k I/Os.
> 
> The default queue depth with cfq is 128 requests, and IIRC it
> congests at 7/8s full, or 112 requests. IOWs, the file that you
> wrote was about 10MB short of what is needed to see congestion on
> your test rig.

Exactly.

wfg ~% cat /sys/block/sda/queue/nr_requests   
128
wfg ~% cat /sys/block/sda/queue/max_sectors_kb
512
 
More exactly, I was writing a huge file. It produces
balance_dirty_pages, background_writeout, and at last wb_kupdate. The
trace messages are collected after the copy completes, when
wb_kupdate() starts to sync the remaining (< background_thresh) data.

> So the trace shows we slept on neither congestion or more_io
> and it points towards congestion being the thing that will typically
> block us on large file I/O. Before drawing any conclusions on
> whether wbc.more_io is needed or not, do you have any way of
> producing skipped pages when more_io is set?

No(and not that easy). (pages_skipped && more_io) events are rare indeed.

> > However, wb_kupdate() is syncing the data a bit slow(4*1000/15=266MB/s),
> > could it be because of a lot of cond_resched()?
> 
> You are using ext3? That would be my guess based simply on the write
> rate - ext3 has long been stuck at about that speed for buffered
> writes even on much faster block devices.  If I'm right, try using
> XFS and see how much differently it behaves. I bet you hit
> congestion much sooner than you expect. ;)

Yes, I was running ext3.  It seems that XFS is about the same speed:

[ 1427.278454] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 37974 global 16727 0 0 wc _M tw -4 sk 0
[ 1427.293653] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 36946 global 15704 0 0 wc _M tw -3 sk 0
[ 1427.308891] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 35919 global 14650 0 0 wc _M tw -13 sk 0
[ 1427.322462] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 34882 global 13937 0 0 wc _M tw 300 sk 0
[ 1427.338194] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 34158 global 12914 0 0 wc _M tw -9 sk 0
[ 1427.353473] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 33125 global 11860 0 0 wc _M tw -12 sk 0
[ 1427.362984] mm/page-writeback.c 668 wb_kupdate: pdflush(5606) 32089 global 11860 0 0 wc _M tw 1018 sk 0

That's 14ms per 4MB.  Maybe it's a VFS issue.


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2007-10-05 11:55 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20071002084143.110486039@mail.ustc.edu.cn>
2007-10-02  8:41 ` [PATCH 0/5] sluggish writeback fixes Fengguang Wu
2007-10-03 11:04   ` Martin Knoblauch
     [not found] ` <20071002090254.489150786@mail.ustc.edu.cn>
2007-10-02  8:41   ` [PATCH 1/5] revert check_dirty_inode_list.patch Fengguang Wu
     [not found] ` <20071002090254.596842343@mail.ustc.edu.cn>
2007-10-02  8:41   ` [PATCH 2/5] writeback: fix time ordering of the per superblock inode lists 8 Fengguang Wu
     [not found] ` <20071002090254.728493507@mail.ustc.edu.cn>
2007-10-02  8:41   ` [PATCH 3/5] writeback: fix ntfs with sb_has_dirty_inodes() Fengguang Wu
     [not found] ` <20071002090254.873023041@mail.ustc.edu.cn>
2007-10-02  8:41   ` [PATCH 4/5] writeback: remove pages_skipped accounting in __block_write_full_page() Fengguang Wu
2007-10-04 21:26     ` Andrew Morton
2007-10-02 21:55   ` David Chinner
     [not found]     ` <20071003014333.GB6501@mail.ustc.edu.cn>
2007-10-03  1:43       ` Fengguang Wu
2007-10-03  2:22       ` David Chinner
     [not found] ` <20071002090254.987182999@mail.ustc.edu.cn>
2007-10-02  8:41   ` [PATCH 5/5] writeback: introduce writeback_control.more_io to indicate more io Fengguang Wu
2007-10-02 21:47   ` David Chinner
     [not found]     ` <20071003013439.GA6501@mail.ustc.edu.cn>
2007-10-03  1:34       ` Fengguang Wu
2007-10-03  2:41       ` David Chinner
     [not found]         ` <20071004022133.GA6244@mail.ustc.edu.cn>
2007-10-04  2:21           ` Fengguang Wu
2007-10-04  5:03           ` David Chinner
     [not found]             ` <20071005033652.GA6448@mail.ustc.edu.cn>
2007-10-05  3:36               ` Fengguang Wu
2007-10-05  7:41               ` David Chinner
     [not found]                 ` <20071005115508.GA9998@mail.ustc.edu.cn>
2007-10-05 11:55                   ` Fengguang Wu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox