From: Wu Fengguang <fengguang.wu@intel.com>
To: Shaohua Li <shaohua.li@intel.com>
Cc: linux-kernel@vger.kernel.org, richard@rsk.demon.co.uk,
a.p.zijlstra@chello.nl, jens.axboe@oracle.com,
akpm@linux-foundation.org, linux-fsdevel@vger.kernel.org,
Chris Mason <chris.mason@oracle.com>
Subject: Re: regression in page writeback
Date: Tue, 22 Sep 2009 18:49:15 +0800 [thread overview]
Message-ID: <20090922104915.GA1649@localhost> (raw)
In-Reply-To: <20090922054913.GA27260@sli10-desk.sh.intel.com>
[-- Attachment #1: Type: text/plain, Size: 1305 bytes --]
Shaohua,
On Tue, Sep 22, 2009 at 01:49:13PM +0800, Li, Shaohua wrote:
> Hi,
> Commit d7831a0bdf06b9f722b947bb0c205ff7d77cebd8 causes disk io regression
> in my test.
> My system has 12 disks, each disk has two partitions. System runs fio sequential
> write on all partitions, each partition has 8 jobs.
> 2.6.31-rc1, fio gives 460m/s disk io
> 2.6.31-rc2, fio gives about 400m/s disk io. Revert the patch, speed back to
> 460m/s
>
> Under latest git: fio gives 450m/s disk io; If reverting the patch, the speed
> is 484m/s.
>
> With the patch, fio reports fewer io merges and more interrupts. My naive
> analysis is that the patch makes balance_dirty_pages_ratelimited_nr() limit
> the write chunk to 8 pages and then soon go to sleep in balance_dirty_pages(),
> because most of the time bdi_nr_reclaimable < bdi_thresh, and so when writing
> the pages out, the chunk is 8 pages long instead of 4M long. Without the patch,
> a thread can write 8 pages and then move some pages to writeback, and then
> continue writing. The patch seems to break this.
Do you have traces/numbers for the above description?
> Unfortunately I can't figure out a fix for this issue, hopefully
> you have more ideas.
Attached is a very verbose writeback debug patch; I hope it helps and
won't disturb the workload too much :)
Thanks,
Fengguang
[-- Attachment #2: writeback-debug-2.6.31.patch --]
[-- Type: text/x-diff, Size: 5468 bytes --]
--- linux-2.6.orig/fs/fs-writeback.c 2009-08-23 14:44:22.000000000 +0800
+++ linux-2.6/fs/fs-writeback.c 2009-09-22 18:45:06.000000000 +0800
@@ -26,6 +26,9 @@
#include "internal.h"
+int sysctl_dirty_debug __read_mostly;
+
+
/**
* writeback_acquire - attempt to get exclusive writeback access to a device
* @bdi: the device's backing_dev_info structure
@@ -186,6 +189,11 @@ static int write_inode(struct inode *ino
return 0;
}
+#define redirty_tail(inode) \
+ do { \
+ __redirty_tail(inode, __LINE__); \
+ } while (0)
+
/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
@@ -195,10 +203,15 @@ static int write_inode(struct inode *ino
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
-static void redirty_tail(struct inode *inode)
+static void __redirty_tail(struct inode *inode, int line)
{
struct super_block *sb = inode->i_sb;
+ if (sysctl_dirty_debug) {
+ printk(KERN_DEBUG "redirty_tail +%d: inode %lu\n",
+ line, inode->i_ino);
+ }
+
if (!list_empty(&sb->s_dirty)) {
struct inode *tail_inode;
@@ -210,12 +223,22 @@ static void redirty_tail(struct inode *i
list_move(&inode->i_list, &sb->s_dirty);
}
+#define requeue_io(inode) \
+ do { \
+ __requeue_io(inode, __LINE__); \
+ } while (0)
+
/*
* requeue inode for re-scanning after sb->s_io list is exhausted.
*/
-static void requeue_io(struct inode *inode)
+static void __requeue_io(struct inode *inode, int line)
{
list_move(&inode->i_list, &inode->i_sb->s_more_io);
+
+ if (sysctl_dirty_debug) {
+ printk(KERN_DEBUG "requeue_io +%d: inode %lu\n",
+ line, inode->i_ino);
+ }
}
static void inode_sync_complete(struct inode *inode)
--- linux-2.6.orig/include/linux/writeback.h 2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/include/linux/writeback.h 2009-09-22 18:29:05.000000000 +0800
@@ -168,5 +168,6 @@ void writeback_set_ratelimit(void);
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
read-only. */
+extern int sysctl_dirty_debug;
#endif /* WRITEBACK_H */
--- linux-2.6.orig/kernel/sysctl.c 2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/kernel/sysctl.c 2009-09-22 18:29:05.000000000 +0800
@@ -1516,6 +1516,14 @@ static struct ctl_table fs_table[] = {
.extra1 = &zero,
.extra2 = &two,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "dirty_debug",
+ .data = &sysctl_dirty_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
.ctl_name = CTL_UNNUMBERED,
--- linux-2.6.orig/mm/page-writeback.c 2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/mm/page-writeback.c 2009-09-22 18:45:50.000000000 +0800
@@ -116,6 +116,35 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
+#define writeback_debug_report(n, wbc) do { \
+ if(sysctl_dirty_debug) \
+ __writeback_debug_report(n, wbc, \
+ __FILE__, __LINE__, __FUNCTION__); \
+} while (0)
+
+void print_writeback_control(struct writeback_control *wbc)
+{
+ printk(KERN_DEBUG
+ "global dirty=%lu writeback=%lu nfs=%lu "
+ "flags=%c%c towrite=%ld skipped=%ld\n",
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_WRITEBACK),
+ global_page_state(NR_UNSTABLE_NFS),
+ wbc->encountered_congestion ? 'C':'_',
+ wbc->more_io ? 'M':'_',
+ wbc->nr_to_write,
+ wbc->pages_skipped);
+}
+
+void __writeback_debug_report(long n, struct writeback_control *wbc,
+ const char *file, int line, const char *func)
+{
+ printk(KERN_DEBUG "%s %d %s: comm=%s pid=%d n=%ld\n",
+ file, line, func,
+ current->comm, current->pid,
+ n);
+ print_writeback_control(wbc);
+}
static void background_writeout(unsigned long _min_pages);
@@ -550,7 +579,12 @@ static void balance_dirty_pages(struct a
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
+ writeback_debug_report(pages_written, &wbc);
}
+ printk("bdi_nr_reclaimable=%lu, bdi_thresh=%lu, "
+ "background_thresh=%lu, dirty_thresh=%lu\n",
+ bdi_nr_reclaimable, bdi_thresh,
+ background_thresh, dirty_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -670,6 +704,11 @@ void throttle_vm_writeout(gfp_t gfp_mask
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
congestion_wait(BLK_RW_ASYNC, HZ/10);
+ printk(KERN_DEBUG "throttle_vm_writeout: "
+ "congestion_wait on %lu+%lu > %lu\n",
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_WRITEBACK),
+ dirty_thresh);
/*
* The caller might hold locks which can prevent IO completion
@@ -719,7 +758,9 @@ static void background_writeout(unsigned
else
break;
}
+ writeback_debug_report(min_pages, &wbc);
}
+ writeback_debug_report(min_pages, &wbc);
}
/*
@@ -792,7 +833,9 @@ static void wb_kupdate(unsigned long arg
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ writeback_debug_report(nr_to_write, &wbc);
}
+ writeback_debug_report(nr_to_write, &wbc);
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
if (dirty_writeback_interval)
next prev parent reply other threads:[~2009-09-22 10:49 UTC|newest]
Thread overview: 79+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-22 5:49 regression in page writeback Shaohua Li
2009-09-22 6:40 ` Peter Zijlstra
2009-09-22 8:05 ` Wu Fengguang
2009-09-22 8:09 ` Peter Zijlstra
2009-09-22 8:24 ` Wu Fengguang
2009-09-22 8:32 ` Peter Zijlstra
2009-09-22 8:51 ` Wu Fengguang
2009-09-22 8:52 ` Richard Kennedy
2009-09-22 9:05 ` Wu Fengguang
2009-09-22 11:41 ` Shaohua Li
2009-09-22 15:52 ` Chris Mason
2009-09-23 0:22 ` Wu Fengguang
2009-09-23 0:54 ` Andrew Morton
2009-09-23 1:17 ` Wu Fengguang
2009-09-23 1:27 ` Wu Fengguang
2009-09-23 1:28 ` Andrew Morton
2009-09-23 1:32 ` Wu Fengguang
2009-09-23 1:47 ` Andrew Morton
2009-09-23 2:01 ` Wu Fengguang
2009-09-23 2:09 ` Andrew Morton
2009-09-23 3:07 ` Wu Fengguang
2009-09-23 1:45 ` Wu Fengguang
2009-09-23 1:59 ` Andrew Morton
2009-09-23 2:26 ` Wu Fengguang
2009-09-23 2:36 ` Andrew Morton
2009-09-23 2:49 ` Wu Fengguang
2009-09-23 2:56 ` Andrew Morton
2009-09-23 3:11 ` Wu Fengguang
2009-09-23 3:10 ` Shaohua Li
2009-09-23 3:14 ` Wu Fengguang
2009-09-23 3:25 ` Wu Fengguang
2009-09-23 14:00 ` Chris Mason
2009-09-24 3:15 ` Wu Fengguang
2009-09-24 12:10 ` Chris Mason
2009-09-25 3:26 ` Wu Fengguang
2009-09-25 0:11 ` Dave Chinner
2009-09-25 0:38 ` Chris Mason
2009-09-25 5:04 ` Dave Chinner
2009-09-25 6:45 ` Wu Fengguang
2009-09-28 1:07 ` Dave Chinner
2009-09-28 7:15 ` Wu Fengguang
2009-09-28 13:08 ` Christoph Hellwig
2009-09-28 14:07 ` Theodore Tso
2009-09-30 5:26 ` Wu Fengguang
2009-09-30 5:32 ` Wu Fengguang
2009-10-01 22:17 ` Jan Kara
2009-10-02 3:27 ` Wu Fengguang
2009-10-06 12:55 ` Jan Kara
2009-10-06 13:18 ` Wu Fengguang
2009-09-30 14:11 ` Theodore Tso
2009-10-01 15:14 ` Wu Fengguang
2009-10-01 21:54 ` Theodore Tso
2009-10-02 2:55 ` Wu Fengguang
2009-10-02 8:19 ` Wu Fengguang
2009-10-02 17:26 ` Theodore Tso
2009-10-03 6:10 ` Wu Fengguang
2009-09-29 2:32 ` Wu Fengguang
2009-09-29 14:00 ` Chris Mason
2009-09-29 14:21 ` Christoph Hellwig
2009-09-29 0:15 ` Wu Fengguang
2009-09-28 14:25 ` Chris Mason
2009-09-29 23:39 ` Dave Chinner
2009-09-30 1:30 ` Wu Fengguang
2009-09-25 12:06 ` Chris Mason
2009-09-25 3:19 ` Wu Fengguang
2009-09-26 1:47 ` Dave Chinner
2009-09-26 3:02 ` Wu Fengguang
2009-09-26 3:02 ` Wu Fengguang
2009-09-23 9:19 ` Richard Kennedy
2009-09-23 9:23 ` Peter Zijlstra
2009-09-23 9:37 ` Wu Fengguang
2009-09-23 10:30 ` Wu Fengguang
2009-09-23 6:41 ` Shaohua Li
2009-09-22 10:49 ` Wu Fengguang [this message]
2009-09-22 11:50 ` Shaohua Li
2009-09-22 13:39 ` Wu Fengguang
2009-09-23 1:52 ` Shaohua Li
2009-09-23 4:00 ` Wu Fengguang
2009-09-25 6:14 ` Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090922104915.GA1649@localhost \
--to=fengguang.wu@intel.com \
--cc=a.p.zijlstra@chello.nl \
--cc=akpm@linux-foundation.org \
--cc=chris.mason@oracle.com \
--cc=jens.axboe@oracle.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=richard@rsk.demon.co.uk \
--cc=shaohua.li@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.