linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/6] writeback: Add tracing to balance_dirty_pages
  2010-05-25 10:54 [PATCH 0/6] writeback: tracing and fixes Dave Chinner
@ 2010-05-25 10:54 ` Dave Chinner
  2010-05-25 11:13   ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Dave Chinner @ 2010-05-25 10:54 UTC (permalink / raw)
  To: linux-kernel; +Cc: xfs, linux-fsdevel, linux-ext4, tytso, jens.axboe

From: Dave Chinner <dchinner@redhat.com>

Tracing high level background writeback events is good, but it doesn't
give the entire picture. Add IO dispatched by foreground throttling to the
writeback events.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |    5 ++
 include/trace/events/writeback.h |   80 ++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |    4 ++
 3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b44aba6..efa3d18 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -814,7 +814,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc);
 		writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc);
+
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -842,6 +846,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index df76457..72c1a12 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -7,6 +7,9 @@
 #include <linux/backing-dev.h>
 #include <linux/writeback.h>
 
+struct wb_writeback_args;
+struct bdi_work;
+
 TRACE_EVENT(writeback_queue,
 
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
@@ -165,6 +168,83 @@ TRACE_EVENT(writeback_bdi_register,
 	TP_printk("%s: %s", __entry->name,
 			__entry->start ? "registered" : "unregistered")
 );
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(wbc_class,
+	TP_PROTO(struct writeback_control *wbc),
+	TP_ARGS(wbc),
+	TP_STRUCT__entry(
+		__field(unsigned int, wbc)
+		__array(char, name, 16)
+		__field(long, nr_to_write)
+		__field(long, pages_skipped)
+		__field(int, sb)
+		__field(int, sync_mode)
+		__field(int, nonblocking)
+		__field(int, encountered_congestion)
+		__field(int, for_kupdate)
+		__field(int, for_background)
+		__field(int, for_reclaim)
+		__field(int, range_cyclic)
+		__field(int, more_io)
+		__field(unsigned long, older_than_this)
+		__field(long, range_start)
+		__field(long, range_end)
+	),
+
+	TP_fast_assign(
+		char *__name = "(none)";
+
+		__entry->wbc		= (unsigned long)wbc & 0xffff;
+		if (wbc->bdi)
+			strncpy(__entry->name, dev_name(wbc->bdi->dev), 16);
+		else
+			strncpy(__entry->name, __name, 16);
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->sb		= !!wbc->sb;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->for_kupdate	= wbc->for_kupdate;
+		__entry->for_background	= wbc->for_background;
+		__entry->for_reclaim	= wbc->for_reclaim;
+		__entry->range_cyclic	= wbc->range_cyclic;
+		__entry->more_io	= wbc->more_io;
+		__entry->older_than_this = wbc->older_than_this ?
+						*wbc->older_than_this : 0;
+		__entry->range_start	= (long)wbc->range_start;
+		__entry->range_end	= (long)wbc->range_end;
+	),
+
+	TP_printk("dev %s wbc=%x towrt=%ld skip=%ld sb=%d mode=%d kupd=%d "
+		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"start=0x%lx end=0x%lx",
+		__entry->name,
+		__entry->wbc,
+		__entry->nr_to_write,
+		__entry->pages_skipped,
+		__entry->sb,
+		__entry->sync_mode,
+		__entry->for_kupdate,
+		__entry->for_background,
+		__entry->for_reclaim,
+		__entry->range_cyclic,
+		__entry->more_io,
+		__entry->older_than_this,
+		__entry->range_start,
+		__entry->range_end)
+)
+
+#define DEFINE_WBC_EVENT(name) \
+DEFINE_EVENT(wbc_class, name, \
+	TP_PROTO(struct writeback_control *wbc), \
+	TP_ARGS(wbc))
+DEFINE_WBC_EVENT(wbc_writeback_start);
+DEFINE_WBC_EVENT(wbc_writeback_written);
+DEFINE_WBC_EVENT(wbc_writeback_wait);
+DEFINE_WBC_EVENT(wbc_balance_dirty_start);
+DEFINE_WBC_EVENT(wbc_balance_dirty_written);
+DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b289310..68eb727 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/events/writeback.h>
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
+		trace_wbc_balance_dirty_start(&wbc);
 		if (bdi_nr_reclaimable > bdi_thresh) {
 			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			trace_wbc_balance_dirty_written(&wbc);
 		}
 
 		/*
@@ -566,6 +569,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
+		trace_wbc_balance_dirty_wait(&wbc);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
 
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/6] writeback: Add tracing to balance_dirty_pages
  2010-05-25 10:54 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
@ 2010-05-25 11:13   ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2010-05-25 11:13 UTC (permalink / raw)
  To: Dave Chinner
  Cc: linux-kernel, xfs, linux-fsdevel, linux-ext4, tytso, jens.axboe

On Tue, May 25, 2010 at 08:54:08PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Tracing high level background writeback events is good, but it doesn't
> give the entire picture. Add IO dispatched by foreground throttling to the
> writeback events.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,


Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 2/6] writeback: Add tracing to balance_dirty_pages
  2010-06-03 23:55 [PATCH 0/6] writeback: tracing and fixes V3 Dave Chinner
@ 2010-06-03 23:55 ` Dave Chinner
  0 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-03 23:55 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

Tracing high level background writeback events is good, but it doesn't
give the entire picture. Add IO dispatched by foreground throttling to the
writeback events.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/fs-writeback.c                |    5 ++
 include/trace/events/writeback.h |   80 ++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |    4 ++
 3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ebfaed8..7cd4585 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -819,7 +819,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc);
 		writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc);
+
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -847,6 +851,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index df76457..72c1a12 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -7,6 +7,9 @@
 #include <linux/backing-dev.h>
 #include <linux/writeback.h>
 
+struct wb_writeback_args;
+struct bdi_work;
+
 TRACE_EVENT(writeback_queue,
 
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
@@ -165,6 +168,83 @@ TRACE_EVENT(writeback_bdi_register,
 	TP_printk("%s: %s", __entry->name,
 			__entry->start ? "registered" : "unregistered")
 );
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(wbc_class,
+	TP_PROTO(struct writeback_control *wbc),
+	TP_ARGS(wbc),
+	TP_STRUCT__entry(
+		__field(unsigned int, wbc)
+		__array(char, name, 16)
+		__field(long, nr_to_write)
+		__field(long, pages_skipped)
+		__field(int, sb)
+		__field(int, sync_mode)
+		__field(int, nonblocking)
+		__field(int, encountered_congestion)
+		__field(int, for_kupdate)
+		__field(int, for_background)
+		__field(int, for_reclaim)
+		__field(int, range_cyclic)
+		__field(int, more_io)
+		__field(unsigned long, older_than_this)
+		__field(long, range_start)
+		__field(long, range_end)
+	),
+
+	TP_fast_assign(
+		char *__name = "(none)";
+
+		__entry->wbc		= (unsigned long)wbc & 0xffff;
+		if (wbc->bdi)
+			strncpy(__entry->name, dev_name(wbc->bdi->dev), 16);
+		else
+			strncpy(__entry->name, __name, 16);
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->sb		= !!wbc->sb;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->for_kupdate	= wbc->for_kupdate;
+		__entry->for_background	= wbc->for_background;
+		__entry->for_reclaim	= wbc->for_reclaim;
+		__entry->range_cyclic	= wbc->range_cyclic;
+		__entry->more_io	= wbc->more_io;
+		__entry->older_than_this = wbc->older_than_this ?
+						*wbc->older_than_this : 0;
+		__entry->range_start	= (long)wbc->range_start;
+		__entry->range_end	= (long)wbc->range_end;
+	),
+
+	TP_printk("dev %s wbc=%x towrt=%ld skip=%ld sb=%d mode=%d kupd=%d "
+		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"start=0x%lx end=0x%lx",
+		__entry->name,
+		__entry->wbc,
+		__entry->nr_to_write,
+		__entry->pages_skipped,
+		__entry->sb,
+		__entry->sync_mode,
+		__entry->for_kupdate,
+		__entry->for_background,
+		__entry->for_reclaim,
+		__entry->range_cyclic,
+		__entry->more_io,
+		__entry->older_than_this,
+		__entry->range_start,
+		__entry->range_end)
+)
+
+#define DEFINE_WBC_EVENT(name) \
+DEFINE_EVENT(wbc_class, name, \
+	TP_PROTO(struct writeback_control *wbc), \
+	TP_ARGS(wbc))
+DEFINE_WBC_EVENT(wbc_writeback_start);
+DEFINE_WBC_EVENT(wbc_writeback_written);
+DEFINE_WBC_EVENT(wbc_writeback_wait);
+DEFINE_WBC_EVENT(wbc_balance_dirty_start);
+DEFINE_WBC_EVENT(wbc_balance_dirty_written);
+DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b289310..68eb727 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/events/writeback.h>
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
+		trace_wbc_balance_dirty_start(&wbc);
 		if (bdi_nr_reclaimable > bdi_thresh) {
 			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			trace_wbc_balance_dirty_written(&wbc);
 		}
 
 		/*
@@ -566,6 +569,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
+		trace_wbc_balance_dirty_wait(&wbc);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 0/6] writeback: tracing and fixes V4
@ 2010-06-08  0:38 Dave Chinner
  2010-06-08  0:38 ` [PATCH 1/6] writeback: initial tracing support Dave Chinner
                   ` (7 more replies)
  0 siblings, 8 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

Hi Linus,

Can you please consider this patch series for 2.6.35? If you don't
want the tracing at this stage of the release process, I can redo
the series with just the bug fixes. However, given the regularity
with which we break the writeback code in subtle ways, the tracing
is probably just as important as the bug fixes.

---

This series contains the initial writeback tracing patches from
Jens, as well as the extensions I added to provide visibility into
writeback control structures as the are used by the writeback code.
The visibility given is sufficient to understand what is happening
in the writeback path - what path is writing data, what path is
blocking on congestion, etc, and to determine the differences in
behaviour for different sync modes and calling contexts. This
tracing really needs to be integrated into mainline so that anyone
can improve the tracing as they use it to track down problems in our
convoluted writeback paths.

The remaining patches are fixes to problems that the new tracing
highlighted.

Version 4:
- converted strncpy to strlcpy
- remove stale information from commit message for nr_to_write fixup.

Version 3:
- added comment to tracepoint creation to explain the unusual
  placement of the tracepoint header file include.
- separated out ->writepage tracepoint addition into it's own patch.
- dropped ext4 write_cache_pages separation as it is now in mainline.
- removed ext4 tracing references to wbc->no_nrwrite_index_update as they
  weren't removed in mainline.
- fixed commit message for write_cache_pages patch
- added more information to commit message for sync hold-off fixup.

Version 2:
- included ext4 write_cache_pages separation patch from Ted Ts'o.
- moved CREATE_TRACE_POINTS into fs-writeback.c as suggested by
  Christoph Hellwig.
- moved include of trace/events/writeback.h until after structure
  definitions in fs-writeback.c
- manually revert changes made to write_cache_pages() in
  17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 that caused the
  regression.  This restores the convention that if the fs writes
  back more than a single page, it subtracts (nr_written - 1) from
  wbc->nr_to_write, as suggested by Andrew Morton.
- added patch to prevent sync from looping in write_cache_pages
  chasing a moving tail when an appending write workload is running
  concurrently with sync.

---
 fs/fs-writeback.c                |   50 ++++++--
 fs/xfs/linux-2.6/xfs_aops.c      |    8 --
 include/linux/writeback.h        |    9 --
 include/trace/events/ext4.h      |    5 +-
 include/trace/events/writeback.h |  252 ++++++++++++++++++++++++++++++++++++++
 mm/backing-dev.c                 |    3 +
 mm/page-writeback.c              |   35 ++++--
 7 files changed, 321 insertions(+), 41 deletions(-)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/6] writeback: initial tracing support
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  0:38 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: From: Jens Axboe <jens.axboe@oracle.com>

Trace queue/sched/exec parts of the writeback loop. This provides
insight into when and why flusher threads are scheduled to run. e.g
a sync invocation leaves a trace like:

sync-2798  [006] 611323.335713: writeback_queue: 253:16: pages=87879, sb=0, kupdate=0, range_cyclic=0 for_background=0
sync-2798  [006] 611323.335718: writeback_sched: work=37c0, task=task
sync-2798  [006] 611323.335817: writeback_queue: 8:0: pages=92680, sb=1, kupdate=0, range_cyclic=-1 for_background=0
sync-2798  [006] 611323.335819: writeback_sched: work=35c0, task=task
sync-2798  [006] 611323.335855: writeback_queue: 253:16: pages=92680, sb=1, kupdate=0, range_cyclic=-1 for_background=0
sync-2798  [006] 611323.335857: writeback_sched: work=36c0, task=task
sync-2798  [006] 611323.335890: writeback_queue: 8:0: pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
sync-2798  [006] 611323.335891: writeback_sched: work=fe58, task=task
sync-2798  [006] 611323.377341: writeback_queue: 253:16: pages=9223372036854775807, sb=1, kupdate=0, range_cyclic=0 for_background=0
sync-2798  [006] 611323.377346: writeback_sched: work=fe58, task=task

This also lays the foundation for adding more writeback tracing to
provide deeper insight into the whole writeback path.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/fs-writeback.c                |   45 ++++++++--
 include/trace/events/writeback.h |  171 ++++++++++++++++++++++++++++++++++++++
 mm/backing-dev.c                 |    3 +
 3 files changed, 209 insertions(+), 10 deletions(-)
 create mode 100644 include/trace/events/writeback.h

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1d1088f..0949dfd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,15 +26,9 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/ftrace.h>
 #include "internal.h"
 
-#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
-
-/*
- * We don't actually have pdflush, but this one is exported though /proc...
- */
-int nr_pdflush_threads;
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -62,6 +56,21 @@ struct bdi_work {
 	unsigned long state;		/* flag bits, see WS_* */
 };
 
+/*
+ * Include the creation of the trace points after defining the bdi_work and
+ * wb_writeback_args structures so that the definitions remain local to this
+ * file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
+
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
 enum {
 	WS_USED_B = 0,
 	WS_ONSTACK_B,
@@ -136,6 +145,8 @@ static void wb_work_complete(struct bdi_work *work)
 
 static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 {
+	trace_writeback_clear(work);
+
 	/*
 	 * The caller has retrieved the work arguments from this work,
 	 * drop our reference. If this is the last ref, delete and free it
@@ -171,12 +182,16 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	 * If the default thread isn't there, make sure we add it. When
 	 * it gets created and wakes up, we'll run this work.
 	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	if (unlikely(list_empty_careful(&bdi->wb_list))) {
+		trace_writeback_sched(bdi, work, "default");
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
+	} else {
 		struct bdi_writeback *wb = &bdi->wb;
+		struct task_struct *task = wb->task;
 
-		if (wb->task)
+		trace_writeback_sched(bdi, work, task ? "task" : "notask");
+
+		if (task)
 			wake_up_process(wb->task);
 	}
 }
@@ -203,6 +218,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
 		bdi_work_init(work, args);
+		trace_writeback_queue(bdi, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -236,6 +252,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 	bdi_work_init(&work, &args);
 	work.state |= WS_ONSTACK;
 
+	trace_writeback_queue(bdi, &args);
 	bdi_queue_work(bdi, &work);
 	bdi_wait_on_work_clear(&work);
 }
@@ -907,6 +924,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+		trace_writeback_exec(work);
+
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
@@ -942,9 +961,13 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 	unsigned long wait_jiffies = -1UL;
 	long pages_written;
 
+	trace_writeback_thread_start(1);
+
 	while (!kthread_should_stop()) {
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
 			last_active = jiffies;
 		else if (wait_jiffies != -1UL) {
@@ -974,6 +997,8 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		try_to_freeze();
 	}
 
+	trace_writeback_thread_start(0);
+
 	return 0;
 }
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
new file mode 100644
index 0000000..6f510fa
--- /dev/null
+++ b/include/trace/events/writeback.h
@@ -0,0 +1,171 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM writeback
+
+#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WRITEBACK_H
+
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+
+TRACE_EVENT(writeback_queue,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
+
+	TP_ARGS(bdi, args),
+
+	TP_STRUCT__entry(
+		__array(char,		name,		16)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		strlcpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->nr_pages	= args->nr_pages;
+		__entry->sb		= !!args->sb;
+		__entry->for_kupdate	= args->for_kupdate;
+		__entry->range_cyclic	= args->range_cyclic;
+		__entry->for_background	= args->for_background;
+	),
+
+	TP_printk("%s: pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d "
+		  "for_background=%d", __entry->name, __entry->nr_pages,
+			__entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_sched,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct bdi_work *work,
+		 const char *msg),
+
+	TP_ARGS(bdi, work, msg),
+
+	TP_STRUCT__entry(
+		__array(char,			name,		16)
+		__field(unsigned int,		work)
+		__array(char,			task,		8)
+	),
+
+	TP_fast_assign(
+		strlcpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->work = (unsigned long) work & 0xffff;
+		snprintf(__entry->task, 8, "%s", msg);
+	),
+
+	TP_printk("work=%x, task=%s", __entry->work, __entry->task)
+);
+
+TRACE_EVENT(writeback_exec,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(unsigned int,	work)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		__entry->work		= (unsigned long) work & 0xffff;
+		__entry->nr_pages	= work->args.nr_pages;
+		__entry->sb		= !!work->args.sb;
+		__entry->for_kupdate	= work->args.for_kupdate;
+		__entry->range_cyclic	= work->args.range_cyclic;
+		__entry->for_background	= work->args.for_background;
+
+	),
+
+	TP_printk("work=%x pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d"
+		  " for_background=%d", __entry->work,
+			__entry->nr_pages, __entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_clear,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(struct bdi_work *,	work)
+		__field(int,			refs)
+	),
+
+	TP_fast_assign(
+		__entry->work		= work;
+		__entry->refs		= atomic_read(&work->pending);
+	),
+
+	TP_printk("work=%p, refs=%d", __entry->work, __entry->refs)
+);
+
+TRACE_EVENT(writeback_pages_written,
+
+	TP_PROTO(long pages_written),
+
+	TP_ARGS(pages_written),
+
+	TP_STRUCT__entry(
+		__field(long,		pages)
+	),
+
+	TP_fast_assign(
+		__entry->pages		= pages_written;
+	),
+
+	TP_printk("%ld", __entry->pages)
+);
+
+
+TRACE_EVENT(writeback_thread_start,
+
+	TP_PROTO(int start),
+
+	TP_ARGS(start),
+
+	TP_STRUCT__entry(
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+	),
+
+	TP_printk("%s", __entry->start ? "started" : "exited")
+);
+
+TRACE_EVENT(writeback_bdi_register,
+
+	TP_PROTO(const char *name, int start),
+
+	TP_ARGS(name, start),
+
+	TP_STRUCT__entry(
+		__array(char,	name,		16)
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		strlcpy(__entry->name, name, 16);
+		__entry->start = start;
+	),
+
+	TP_printk("%s: %s", __entry->name,
+			__entry->start ? "registered" : "unregistered")
+);
+#endif /* _TRACE_WRITEBACK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 660a87a..1f7723b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/writeback.h>
 #include <linux/device.h>
+#include <trace/events/writeback.h>
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
@@ -585,6 +586,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
+	trace_writeback_bdi_register(dev_name(dev), 1);
 exit:
 	return ret;
 }
@@ -647,6 +649,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		trace_writeback_bdi_register(dev_name(bdi->dev), 0);
 		bdi_prune_sb(bdi);
 
 		if (!bdi_cap_flush_forker(bdi))
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/6] writeback: Add tracing to balance_dirty_pages
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
  2010-06-08  0:38 ` [PATCH 1/6] writeback: initial tracing support Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  0:38 ` [PATCH 3/6] writeback: Add tracing to write_cache_pages Dave Chinner
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

Tracing high level background writeback events is good, but it doesn't
give the entire picture. Add IO dispatched by foreground throttling to the
writeback events.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/fs-writeback.c                |    5 ++
 include/trace/events/writeback.h |   80 ++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |    4 ++
 3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0949dfd..6f777d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -807,7 +807,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc);
 		writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc);
+
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -835,6 +839,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 6f510fa..7867c16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -7,6 +7,9 @@
 #include <linux/backing-dev.h>
 #include <linux/writeback.h>
 
+struct wb_writeback_args;
+struct bdi_work;
+
 TRACE_EVENT(writeback_queue,
 
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
@@ -165,6 +168,83 @@ TRACE_EVENT(writeback_bdi_register,
 	TP_printk("%s: %s", __entry->name,
 			__entry->start ? "registered" : "unregistered")
 );
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(wbc_class,
+	TP_PROTO(struct writeback_control *wbc),
+	TP_ARGS(wbc),
+	TP_STRUCT__entry(
+		__field(unsigned int, wbc)
+		__array(char, name, 16)
+		__field(long, nr_to_write)
+		__field(long, pages_skipped)
+		__field(int, sb)
+		__field(int, sync_mode)
+		__field(int, nonblocking)
+		__field(int, encountered_congestion)
+		__field(int, for_kupdate)
+		__field(int, for_background)
+		__field(int, for_reclaim)
+		__field(int, range_cyclic)
+		__field(int, more_io)
+		__field(unsigned long, older_than_this)
+		__field(long, range_start)
+		__field(long, range_end)
+	),
+
+	TP_fast_assign(
+		char *__name = "(none)";
+
+		__entry->wbc		= (unsigned long)wbc & 0xffff;
+		if (wbc->bdi)
+			strncpy(__entry->name, dev_name(wbc->bdi->dev), 16);
+		else
+			strncpy(__entry->name, __name, 16);
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->sb		= !!wbc->sb;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->for_kupdate	= wbc->for_kupdate;
+		__entry->for_background	= wbc->for_background;
+		__entry->for_reclaim	= wbc->for_reclaim;
+		__entry->range_cyclic	= wbc->range_cyclic;
+		__entry->more_io	= wbc->more_io;
+		__entry->older_than_this = wbc->older_than_this ?
+						*wbc->older_than_this : 0;
+		__entry->range_start	= (long)wbc->range_start;
+		__entry->range_end	= (long)wbc->range_end;
+	),
+
+	TP_printk("dev %s wbc=%x towrt=%ld skip=%ld sb=%d mode=%d kupd=%d "
+		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"start=0x%lx end=0x%lx",
+		__entry->name,
+		__entry->wbc,
+		__entry->nr_to_write,
+		__entry->pages_skipped,
+		__entry->sb,
+		__entry->sync_mode,
+		__entry->for_kupdate,
+		__entry->for_background,
+		__entry->for_reclaim,
+		__entry->range_cyclic,
+		__entry->more_io,
+		__entry->older_than_this,
+		__entry->range_start,
+		__entry->range_end)
+)
+
+#define DEFINE_WBC_EVENT(name) \
+DEFINE_EVENT(wbc_class, name, \
+	TP_PROTO(struct writeback_control *wbc), \
+	TP_ARGS(wbc))
+DEFINE_WBC_EVENT(wbc_writeback_start);
+DEFINE_WBC_EVENT(wbc_writeback_written);
+DEFINE_WBC_EVENT(wbc_writeback_wait);
+DEFINE_WBC_EVENT(wbc_balance_dirty_start);
+DEFINE_WBC_EVENT(wbc_balance_dirty_written);
+DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5fa63bd..345a730 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/events/writeback.h>
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
+		trace_wbc_balance_dirty_start(&wbc);
 		if (bdi_nr_reclaimable > bdi_thresh) {
 			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			trace_wbc_balance_dirty_written(&wbc);
 		}
 
 		/*
@@ -566,6 +569,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
+		trace_wbc_balance_dirty_wait(&wbc);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/6] writeback: Add tracing to write_cache_pages
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
  2010-06-08  0:38 ` [PATCH 1/6] writeback: initial tracing support Dave Chinner
  2010-06-08  0:38 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  0:38 ` [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages Dave Chinner
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

Add a trace event to the ->writepage loop in write_cache_pages to give
visibility into how the ->writepage call is changing variables within the
writeback control structure. Of most interest is how wbc->nr_to_write changes
from call to call, especially with filesystems that write multiple pages
in ->writepage.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/trace/events/writeback.h |    1 +
 mm/page-writeback.c              |    1 +
 2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 7867c16..6797c46 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -244,6 +244,7 @@ DEFINE_WBC_EVENT(wbc_writeback_wait);
 DEFINE_WBC_EVENT(wbc_balance_dirty_start);
 DEFINE_WBC_EVENT(wbc_balance_dirty_written);
 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+DEFINE_WBC_EVENT(wbc_writepage);
 
 #endif /* _TRACE_WRITEBACK_H */
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 345a730..7ac39c5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -921,6 +921,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
+			trace_wbc_writepage(wbc);
 			ret = (*writepage)(page, wbc, data);
 			if (unlikely(ret)) {
 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
                   ` (2 preceding siblings ...)
  2010-06-08  0:38 ` [PATCH 3/6] writeback: Add tracing to write_cache_pages Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  5:43   ` Nick Piggin
  2010-06-08  0:38 ` [PATCH 5/6] xfs: remove nr_to_write writeback windup Dave Chinner
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

If a filesystem writes more than one page in ->writepage, write_cache_pages
fails to notice this and continues to attempt writeback when wbc->nr_to_write
has gone negative - this trace was captured from XFS:


    wbc_writeback_start: towrt=1024
    wbc_writepage: towrt=1024
    wbc_writepage: towrt=0
    wbc_writepage: towrt=-1
    wbc_writepage: towrt=-5
    wbc_writepage: towrt=-21
    wbc_writepage: towrt=-85

This has adverse effects on filesystem writeback behaviour. write_cache_pages()
needs to terminate after a certain number of pages are written, not after a
certain number of calls to ->writepage are made.  This is a regression
introduced by 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 ("vfs: Add
no_nrwrite_index_update writeback control flag"), but cannot be reverted
directly due to subsequent bug fixes that have gone in on top of it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/writeback.h   |    9 ---------
 include/trace/events/ext4.h |    5 +----
 mm/page-writeback.c         |   15 +++++----------
 3 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f641346..d63ef8f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -56,15 +56,6 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
-	/*
-	 * write_cache_pages() won't update wbc->nr_to_write and
-	 * mapping->writeback_index if no_nrwrite_index_update
-	 * is set.  write_cache_pages() may write more than we
-	 * requested and we want to make sure nr_to_write and
-	 * writeback_index are updated in a consistent manner
-	 * so we use a single control to update them
-	 */
-	unsigned no_nrwrite_index_update:1;
 };
 
 /*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f5b1ba9..f3865c7 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -306,7 +306,6 @@ TRACE_EVENT(ext4_da_writepages_result,
 		__field(	int,	pages_written		)
 		__field(	long,	pages_skipped		)
 		__field(	char,	more_io			)	
-		__field(	char,	no_nrwrite_index_update )
 		__field(       pgoff_t,	writeback_index		)
 	),
 
@@ -317,16 +316,14 @@ TRACE_EVENT(ext4_da_writepages_result,
 		__entry->pages_written	= pages_written;
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->more_io	= wbc->more_io;
-		__entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 	),
 
-	TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu",
+	TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
 		  jbd2_dev_to_name(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->ret,
 		  __entry->pages_written, __entry->pages_skipped,
 		  __entry->more_io,
-		  __entry->no_nrwrite_index_update,
 		  (unsigned long) __entry->writeback_index)
 );
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7ac39c5..0acadee 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -839,7 +839,6 @@ int write_cache_pages(struct address_space *mapping,
 	pgoff_t done_index;
 	int cycled;
 	int range_whole = 0;
-	long nr_to_write = wbc->nr_to_write;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -940,11 +939,10 @@ continue_unlock:
 					done = 1;
 					break;
 				}
- 			}
+			}
 
-			if (nr_to_write > 0) {
-				nr_to_write--;
-				if (nr_to_write == 0 &&
+			if (wbc->nr_to_write > 0) {
+				if (--wbc->nr_to_write == 0 &&
 				    wbc->sync_mode == WB_SYNC_NONE) {
 					/*
 					 * We stop writing back only if we are
@@ -975,11 +973,8 @@ continue_unlock:
 		end = writeback_index - 1;
 		goto retry;
 	}
-	if (!wbc->no_nrwrite_index_update) {
-		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
-			mapping->writeback_index = done_index;
-		wbc->nr_to_write = nr_to_write;
-	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
 
 	return ret;
 }
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/6] xfs: remove nr_to_write writeback windup.
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
                   ` (3 preceding siblings ...)
  2010-06-08  0:38 ` [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  0:38 ` [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF Dave Chinner
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

Now that the background flush code has been fixed, we shouldn't need to
silently multiply the wbc->nr_to_write to get good writeback. Remove
that code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_aops.c |    8 --------
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a0fa3bf..34640d6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1381,14 +1381,6 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
-
-	/*
-	 *  VM calculation for nr_to_write seems off.  Bump it way
-	 *  up, this gets simple streaming writes zippy again.
-	 *  To be reviewed again after Jens' writeback changes.
-	 */
-	wbc->nr_to_write *= 4;
-
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
                   ` (4 preceding siblings ...)
  2010-06-08  0:38 ` [PATCH 5/6] xfs: remove nr_to_write writeback windup Dave Chinner
@ 2010-06-08  0:38 ` Dave Chinner
  2010-06-08  5:38   ` Nick Piggin
  2010-06-08  5:44 ` [PATCH 0/6] writeback: tracing and fixes V4 Nick Piggin
  2010-06-08 17:41 ` Linus Torvalds
  7 siblings, 1 reply; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  0:38 UTC (permalink / raw)
  To: torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

From: Dave Chinner <dchinner@redhat.com>

sync can currently take a really long time if a concurrent writer is
extending a file. The problem is that the dirty pages on the address
space grow in the same direction as write_cache_pages scans, so if
the writer keeps ahead of writeback, the writeback will not
terminate until the writer stops adding dirty pages.

For a data integrity sync, we only need to write the pages dirty at
the time we start the writeback, so we can stop scanning once we get
to the page that was at the end of the file at the time the scan
started.

This will prevent operations like copying a large file preventing
sync from completing as it will not write back pages that were
dirtied after the sync was started. This does not impact the
existing integrity guarantees, as any dirty page (old or new)
within the EOF range at the start of the scan will still be
captured.

This patch will not prevent sync from blocking on large writes into
holes. That requires more complex intervention while this patch only
addresses the common append-case of this sync holdoff.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/page-writeback.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0acadee..1a361f0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -855,7 +855,22 @@ int write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
+
+		/*
+		 * If this is a data integrity sync, cap the writeback to the
+		 * current end of file. Any extension to the file that occurs
+		 * after this is a new write and we don't need to write those
+		 * pages out to fulfil our data integrity requirements. If we
+		 * try to write them out, we can get stuck in this scan until
+		 * the concurrent writer stops adding dirty pages and extending
+		 * EOF.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL &&
+		    wbc->range_end == LLONG_MAX) {
+			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
+		}
 	}
+
 retry:
 	done_index = index;
 	while (!done && (index <= end)) {
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-06-08  0:38 ` [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF Dave Chinner
@ 2010-06-08  5:38   ` Nick Piggin
  2010-06-08  6:59     ` Dave Chinner
  0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2010-06-08  5:38 UTC (permalink / raw)
  To: Dave Chinner; +Cc: torvalds, linux-fsdevel, linux-kernel, xfs, akpm

On Tue, Jun 08, 2010 at 10:38:07AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> sync can currently take a really long time if a concurrent writer is
> extending a file. The problem is that the dirty pages on the address
> space grow in the same direction as write_cache_pages scans, so if
> the writer keeps ahead of writeback, the writeback will not
> terminate until the writer stops adding dirty pages.
> 
> For a data integrity sync, we only need to write the pages dirty at
> the time we start the writeback, so we can stop scanning once we get
> to the page that was at the end of the file at the time the scan
> started.
> 
> This will prevent operations like copying a large file preventing
> sync from completing as it will not write back pages that were
> dirtied after the sync was started. This does not impact the
> existing integrity guarantees, as any dirty page (old or new)
> within the EOF range at the start of the scan will still be
> captured.
> 
> This patch will not prevent sync from blocking on large writes into
> holes.

The writes don't have to be into holes to cause this starvation
problem, do they?


> That requires more complex intervention while this patch only
> addresses the common append-case of this sync holdoff.

Jan's tagging patch looks pretty good to me and isn't so complex.
I think we should just take that.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-06-08  0:38 ` [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages Dave Chinner
@ 2010-06-08  5:43   ` Nick Piggin
  0 siblings, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2010-06-08  5:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: torvalds, linux-fsdevel, linux-kernel, xfs, akpm

On Tue, Jun 08, 2010 at 10:38:05AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made.  This is a regression
> introduced by 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 ("vfs: Add
> no_nrwrite_index_update writeback control flag"), but cannot be reverted
> directly due to subsequent bug fixes that have gone in on top of it.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Nick Piggin <npiggin@suse.de>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/6] writeback: tracing and fixes V4
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
                   ` (5 preceding siblings ...)
  2010-06-08  0:38 ` [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF Dave Chinner
@ 2010-06-08  5:44 ` Nick Piggin
  2010-06-08 17:41 ` Linus Torvalds
  7 siblings, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2010-06-08  5:44 UTC (permalink / raw)
  To: Dave Chinner; +Cc: torvalds, linux-fsdevel, linux-kernel, xfs, akpm

On Tue, Jun 08, 2010 at 10:38:01AM +1000, Dave Chinner wrote:
> Hi Linus,
> 
> Can you please consider this patch series for 2.6.35? If you don't
> want the tracing at this stage of the release process, I can redo
> the series with just the bug fixes. However, given the regularity
> with which we break the writeback code in subtle ways, the tracing
> is probably just as important as the bug fixes.

Oh, I don't have a problem with your EOF fix being merged now, as Jan's
patch isn't suitable for 2.6.35.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-06-08  5:38   ` Nick Piggin
@ 2010-06-08  6:59     ` Dave Chinner
  0 siblings, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08  6:59 UTC (permalink / raw)
  To: Nick Piggin; +Cc: torvalds, linux-fsdevel, linux-kernel, xfs, akpm

On Tue, Jun 08, 2010 at 03:38:31PM +1000, Nick Piggin wrote:
> On Tue, Jun 08, 2010 at 10:38:07AM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > sync can currently take a really long time if a concurrent writer is
> > extending a file. The problem is that the dirty pages on the address
> > space grow in the same direction as write_cache_pages scans, so if
> > the writer keeps ahead of writeback, the writeback will not
> > terminate until the writer stops adding dirty pages.
> > 
> > For a data integrity sync, we only need to write the pages dirty at
> > the time we start the writeback, so we can stop scanning once we get
> > to the page that was at the end of the file at the time the scan
> > started.
> > 
> > This will prevent operations like copying a large file preventing
> > sync from completing as it will not write back pages that were
> > dirtied after the sync was started. This does not impact the
> > existing integrity guarantees, as any dirty page (old or new)
> > within the EOF range at the start of the scan will still be
> > captured.
> > 
> > This patch will not prevent sync from blocking on large writes into
> > holes.
> 
> The writes don't have to be into holes to cause this starvation
> problem, do they?

No, they don't.

> > That requires more complex intervention while this patch only
> > addresses the common append-case of this sync holdoff.
> 
> Jan's tagging patch looks pretty good to me and isn't so complex.
> I think we should just take that.

I don't care which one we take as long as it is actually tested by
more than the submitter and we get everything in for 2.6.35...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/6] writeback: tracing and fixes V4
  2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
                   ` (6 preceding siblings ...)
  2010-06-08  5:44 ` [PATCH 0/6] writeback: tracing and fixes V4 Nick Piggin
@ 2010-06-08 17:41 ` Linus Torvalds
  2010-06-08 17:50   ` Andrew Morton
  2010-06-08 22:46   ` Dave Chinner
  7 siblings, 2 replies; 17+ messages in thread
From: Linus Torvalds @ 2010-06-08 17:41 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs, akpm



On Tue, 8 Jun 2010, Dave Chinner wrote:
>
> Can you please consider this patch series for 2.6.35?

Yeah, no. See the several lkml discussions and the -rc2 announcement about 
how I do _not_ intend to merge anything even remotely "new code" this time 
around.

So purely regression fixes and serious bugfixes (oopses/security issues).

			Kubys

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/6] writeback: tracing and fixes V4
  2010-06-08 17:41 ` Linus Torvalds
@ 2010-06-08 17:50   ` Andrew Morton
  2010-06-08 22:46   ` Dave Chinner
  1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2010-06-08 17:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Dave Chinner, linux-fsdevel, linux-kernel, xfs

On Tue, 8 Jun 2010 10:41:28 -0700 (PDT) Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 8 Jun 2010, Dave Chinner wrote:
> >
> > Can you please consider this patch series for 2.6.35?
> 
> Yeah, no. See the several lkml discussions and the -rc2 announcement about 
> how I do _not_ intend to merge anything even remotely "new code" this time 
> around.
> 
> So purely regression fixes and serious bugfixes (oopses/security issues).

Linus would have merged it.

> 			Kubys

But his evil twin got ahold of the keyboard.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/6] writeback: tracing and fixes V4
  2010-06-08 17:41 ` Linus Torvalds
  2010-06-08 17:50   ` Andrew Morton
@ 2010-06-08 22:46   ` Dave Chinner
  1 sibling, 0 replies; 17+ messages in thread
From: Dave Chinner @ 2010-06-08 22:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, linux-kernel, xfs, akpm

On Tue, Jun 08, 2010 at 10:41:28AM -0700, Linus Torvalds wrote:
> 
> 
> On Tue, 8 Jun 2010, Dave Chinner wrote:
> >
> > Can you please consider this patch series for 2.6.35?
> 
> Yeah, no. See the several lkml discussions and the -rc2 announcement about 
> how I do _not_ intend to merge anything even remotely "new code" this time 
> around.
> 
> So purely regression fixes and serious bugfixes (oopses/security issues).

No worries,  I'll re-order the series and send you just the bug
fixes later this morning.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-06-08 22:46 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-08  0:38 [PATCH 0/6] writeback: tracing and fixes V4 Dave Chinner
2010-06-08  0:38 ` [PATCH 1/6] writeback: initial tracing support Dave Chinner
2010-06-08  0:38 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
2010-06-08  0:38 ` [PATCH 3/6] writeback: Add tracing to write_cache_pages Dave Chinner
2010-06-08  0:38 ` [PATCH 4/6] writeback: pay attention to wbc->nr_to_write in write_cache_pages Dave Chinner
2010-06-08  5:43   ` Nick Piggin
2010-06-08  0:38 ` [PATCH 5/6] xfs: remove nr_to_write writeback windup Dave Chinner
2010-06-08  0:38 ` [PATCH 6/6] writeback: limit write_cache_pages integrity scanning to current EOF Dave Chinner
2010-06-08  5:38   ` Nick Piggin
2010-06-08  6:59     ` Dave Chinner
2010-06-08  5:44 ` [PATCH 0/6] writeback: tracing and fixes V4 Nick Piggin
2010-06-08 17:41 ` Linus Torvalds
2010-06-08 17:50   ` Andrew Morton
2010-06-08 22:46   ` Dave Chinner
  -- strict thread matches above, loose matches on Subject: below --
2010-06-03 23:55 [PATCH 0/6] writeback: tracing and fixes V3 Dave Chinner
2010-06-03 23:55 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
2010-05-25 10:54 [PATCH 0/6] writeback: tracing and fixes Dave Chinner
2010-05-25 10:54 ` [PATCH 2/6] writeback: Add tracing to balance_dirty_pages Dave Chinner
2010-05-25 11:13   ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).