linux-fsdevel.vger.kernel.org archive mirror
* [PATCH 4/4] writeback: Remove pages_skipped from writeback_control
  2014-03-20 14:30 [PATCH 0/4 RFC] " Jan Kara
@ 2014-03-20 14:30 ` Jan Kara
From: Jan Kara @ 2014-03-20 14:30 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, dchinner, Jan Kara

pages_skipped in struct writeback_control isn't used anymore. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/btrfs/extent_io.c             |  4 +---
 fs/ceph/addr.c                   |  2 --
 fs/f2fs/checkpoint.c             |  1 -
 fs/f2fs/data.c                   |  1 -
 fs/f2fs/node.c                   |  1 -
 fs/fs-writeback.c                |  1 -
 include/linux/writeback.h        |  2 --
 include/trace/events/btrfs.h     |  9 +++------
 include/trace/events/ext4.h      | 14 ++++----------
 include/trace/events/writeback.h |  8 ++------
 mm/page-writeback.c              |  1 -
 11 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..a3d4232f1d39 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3233,9 +3233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 						      page_end);
 		if (ret) {
 			/* Fixup worker will requeue */
-			if (ret == -EBUSY)
-				wbc->pages_skipped++;
-			else
+			if (ret != -EBUSY)
 				redirty_page_for_writepage(wbc, page);
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b53278c9fd97..7943d4197514 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -526,8 +526,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..625b540b69c1 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,7 +98,6 @@ static int f2fs_write_meta_page(struct page *page,
 
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_META);
-	wbc->pages_skipped++;
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2261ccdd0b5f..f7b5943315b2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -848,7 +848,6 @@ out:
 	return 0;
 
 redirty_out:
-	wbc->pages_skipped++;
 	set_page_dirty(page);
 	return err;
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..87cafab66483 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1230,7 +1230,6 @@ static int f2fs_write_node_page(struct page *page,
 
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
-	wbc->pages_skipped++;
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6fb88cdc3c8a..adf71c95f163 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -555,7 +555,6 @@ restart:
 
 		write_chunk = writeback_chunk_size(wb->bdi, work);
 		wbc.nr_to_write = write_chunk;
-		wbc.pages_skipped = 0;
 
 		/*
 		 * We use I_SYNC to pin the inode in memory. While it is set
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 021b8a319b9e..b70fe1fb742c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -66,8 +66,6 @@ enum wb_reason {
 struct writeback_control {
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
-	long pages_skipped;		/* Pages which were not written */
-
 	/*
 	 * For a_ops->writepages(): if start or end are non-zero then this is
 	 * a hint that the filesystem need only write out the pages inside that
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 3176cdc32937..6909a138eb94 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -309,7 +309,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__field(	ino_t,  ino			)
 		__field(	pgoff_t,  index			)
 		__field(	long,   nr_to_write		)
-		__field(	long,   pages_skipped		)
 		__field(	loff_t, range_start		)
 		__field(	loff_t, range_end		)
 		__field(	char,   for_kupdate		)
@@ -323,7 +322,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__entry->ino		= inode->i_ino;
 		__entry->index		= page->index;
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
 		__entry->for_kupdate	= wbc->for_kupdate;
@@ -335,14 +333,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 	),
 
 	TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
-		  "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
+		  "nr_to_write = %ld, range_start = %llu, "
 		  "range_end = %llu, for_kupdate = %d, "
 		  "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
 		  show_root_type(__entry->root_objectid),
 		  (unsigned long)__entry->ino, __entry->index,
-		  __entry->nr_to_write, __entry->pages_skipped,
-		  __entry->range_start, __entry->range_end,
-		  __entry->for_kupdate,
+		  __entry->nr_to_write, __entry->range_start,
+		  __entry->range_end, __entry->for_kupdate,
 		  __entry->for_reclaim, __entry->range_cyclic,
 		  (unsigned long)__entry->writeback_index)
 );
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 197d3125df2a..472517016765 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -340,7 +340,6 @@ TRACE_EVENT(ext4_writepages,
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
 		__field(	long,	nr_to_write		)
-		__field(	long,	pages_skipped		)
 		__field(	loff_t,	range_start		)
 		__field(	loff_t,	range_end		)
 		__field(       pgoff_t,	writeback_index		)
@@ -353,7 +352,6 @@ TRACE_EVENT(ext4_writepages,
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
@@ -362,13 +360,12 @@ TRACE_EVENT(ext4_writepages,
 		__entry->range_cyclic	= wbc->range_cyclic;
 	),
 
-	TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
+	TP_printk("dev %d,%d ino %lu nr_to_write %ld "
 		  "range_start %lld range_end %lld sync_mode %d "
 		  "for_kupdate %d range_cyclic %d writeback_index %lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->nr_to_write,
-		  __entry->pages_skipped, __entry->range_start,
-		  __entry->range_end, __entry->sync_mode,
+		  __entry->range_start, __entry->range_end, __entry->sync_mode,
 		  __entry->for_kupdate, __entry->range_cyclic,
 		  (unsigned long) __entry->writeback_index)
 );
@@ -440,7 +437,6 @@ TRACE_EVENT(ext4_writepages_result,
 		__field(	ino_t,	ino			)
 		__field(	int,	ret			)
 		__field(	int,	pages_written		)
-		__field(	long,	pages_skipped		)
 		__field(       pgoff_t,	writeback_index		)
 		__field(	int,	sync_mode		)
 	),
@@ -450,17 +446,15 @@ TRACE_EVENT(ext4_writepages_result,
 		__entry->ino		= inode->i_ino;
 		__entry->ret		= ret;
 		__entry->pages_written	= pages_written;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
+	TP_printk("dev %d,%d ino %lu ret %d pages_written %d "
 		  "sync_mode %d writeback_index %lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->ret,
-		  __entry->pages_written, __entry->pages_skipped,
-		  __entry->sync_mode,
+		  __entry->pages_written, __entry->sync_mode,
 		  (unsigned long) __entry->writeback_index)
 );
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index ef0683ae456c..dbb8d224bbe8 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -229,7 +229,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 	TP_STRUCT__entry(
 		__array(char, name, 32)
 		__field(long, nr_to_write)
-		__field(long, pages_skipped)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, for_background)
@@ -242,7 +241,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 	TP_fast_assign(
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_background	= wbc->for_background;
@@ -252,12 +250,10 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_end	= (long)wbc->range_end;
 	),
 
-	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
-		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx",
+	TP_printk("bdi %s: towrt=%ld mode=%d kupd=%d bgrd=%d reclm=%d "
+		"cyclic=%d start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
-		__entry->pages_skipped,
 		__entry->sync_mode,
 		__entry->for_kupdate,
 		__entry->for_background,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1aca8e..b9ee38fba369 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2223,7 +2223,6 @@ EXPORT_SYMBOL(account_page_redirty);
  */
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
-	wbc->pages_skipped++;
 	account_page_redirty(page);
 	return __set_page_dirty_nobuffers(page);
 }
-- 
1.8.1.4


* [PATCH 0/4 RESEND] writeback: Dirty list handling changes
@ 2014-05-15 15:41 Jan Kara
  2014-05-15 15:41 ` [PATCH 1/4] writeback: Get rid of superblock pinning Jan Kara
From: Jan Kara @ 2014-05-15 15:41 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, Jan Kara

  Hello,

  so I was recently thinking about how the writeback code shuffles inodes
between lists and also how redirty_tail() clobbers the dirtied_when timestamp
(which broke my sync(2) optimization). This patch series came out of that.
Patch 1 is a clear win and just needs an independent review to check that I
didn't forget about something. Patch 3 changes writeback list handling - IMHO
it makes the logic somewhat more straightforward as we don't have to bother
shuffling inodes between lists and we also don't need to clobber the
dirtied_when timestamp. But opinions may differ...

The patches passed an xfstests run and I did some basic writeback tests
using tiobench and some artificial sync livelock tests to verify nothing
regressed. So I'd be happy if people could have a look.

								Honza

* [PATCH 1/4] writeback: Get rid of superblock pinning
  2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
@ 2014-05-15 15:41 ` Jan Kara
  2014-05-15 15:41 ` [PATCH 2/4] writeback: Move removal from writeback list in evict() Jan Kara
From: Jan Kara @ 2014-05-15 15:41 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, Jan Kara

Currently the flusher thread pins the superblock (via
grab_super_passive()) it is working on. However, this is unnecessary
after commit 169ebd90131b "writeback: Avoid iput() from flusher
thread". Before that commit we had to block umount so that it wouldn't
complain about busy inodes due to the elevated i_count the flusher
thread held. After that commit we can let umount run and it will block
in evict_inodes(), waiting for the flusher thread to be done with the
inode (thus the flusher thread is also safe against the inode going
away from under it). Removing the superblock pinning allows us to
simplify the code quite a bit. Among other things, there's no need to
sort the b_io list in move_expired_inodes() anymore.
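
For illustration, the loop that remains in move_expired_inodes() after
this patch boils down to the following (condensed from the hunk below):

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (work->older_than_this &&
		    inode_dirtied_after(inode, *work->older_than_this))
			break;
		list_move(&inode->i_wb_list, dispatch_queue);
		moved++;
	}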

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c                | 104 ++++++---------------------------------
 include/trace/events/writeback.h |   2 +-
 2 files changed, 17 insertions(+), 89 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be568b7311d6..b54ec30541e6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -255,11 +255,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
 			       struct wb_writeback_work *work)
 {
-	LIST_HEAD(tmp);
-	struct list_head *pos, *node;
-	struct super_block *sb = NULL;
 	struct inode *inode;
-	int do_sb_sort = 0;
 	int moved = 0;
 
 	while (!list_empty(delaying_queue)) {
@@ -267,29 +263,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		if (work->older_than_this &&
 		    inode_dirtied_after(inode, *work->older_than_this))
 			break;
-		list_move(&inode->i_wb_list, &tmp);
+		list_move(&inode->i_wb_list, dispatch_queue);
 		moved++;
-		if (sb_is_blkdev_sb(inode->i_sb))
-			continue;
-		if (sb && sb != inode->i_sb)
-			do_sb_sort = 1;
-		sb = inode->i_sb;
-	}
-
-	/* just one sb in list, splice to dispatch_queue and we're done */
-	if (!do_sb_sort) {
-		list_splice(&tmp, dispatch_queue);
-		goto out;
-	}
-
-	/* Move inodes from one superblock together */
-	while (!list_empty(&tmp)) {
-		sb = wb_inode(tmp.prev)->i_sb;
-		list_for_each_prev_safe(pos, node, &tmp) {
-			inode = wb_inode(pos);
-			if (inode->i_sb == sb)
-				list_move(&inode->i_wb_list, dispatch_queue);
-		}
 	}
 out:
 	return moved;
@@ -500,7 +475,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  *
  * This function is designed to be called for writing back one inode which
  * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
- * and does more profound writeback list handling in writeback_sb_inodes().
+ * and does more profound writeback list handling in writeback_inodes().
  */
 static int
 writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
@@ -570,8 +545,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 	 * The intended call sequence for WB_SYNC_ALL writeback is:
 	 *
 	 *      wb_writeback()
-	 *          writeback_sb_inodes()       <== called only once
-	 *              write_cache_pages()     <== called once for each inode
+	 *          writeback_inodes()       <== called only once
+	 *              write_cache_pages()  <== called once for each inode
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
@@ -589,13 +564,12 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 }
 
 /*
- * Write a portion of b_io inodes which belong to @sb.
+ * Write inodes in b_io list belonging to @work->sb (if set).
  *
  * Return the number of pages and/or inodes written.
  */
-static long writeback_sb_inodes(struct super_block *sb,
-				struct bdi_writeback *wb,
-				struct wb_writeback_work *work)
+static long writeback_inodes(struct bdi_writeback *wb,
+			     struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
 		.sync_mode		= work->sync_mode,
@@ -614,23 +588,14 @@ static long writeback_sb_inodes(struct super_block *sb,
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 
-		if (inode->i_sb != sb) {
-			if (work->sb) {
-				/*
-				 * We only want to write back data for this
-				 * superblock, move all inodes not belonging
-				 * to it back onto the dirty list.
-				 */
-				redirty_tail(inode, wb);
-				continue;
-			}
-
+		if (work->sb && inode->i_sb != work->sb) {
 			/*
-			 * The inode belongs to a different superblock.
-			 * Bounce back to the caller to unpin this and
-			 * pin the next superblock.
+			 * We only want to write back data for this
+			 * superblock, move all inodes not belonging
+			 * to it back onto the dirty list.
 			 */
-			break;
+			redirty_tail(inode, wb);
+			continue;
 		}
 
 		/*
@@ -656,7 +621,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			 */
 			spin_unlock(&inode->i_lock);
 			requeue_io(inode, wb);
-			trace_writeback_sb_inodes_requeue(inode);
+			trace_writeback_inodes_requeue(inode);
 			continue;
 		}
 		spin_unlock(&wb->list_lock);
@@ -710,40 +675,6 @@ static long writeback_sb_inodes(struct super_block *sb,
 	return wrote;
 }
 
-static long __writeback_inodes_wb(struct bdi_writeback *wb,
-				  struct wb_writeback_work *work)
-{
-	unsigned long start_time = jiffies;
-	long wrote = 0;
-
-	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = wb_inode(wb->b_io.prev);
-		struct super_block *sb = inode->i_sb;
-
-		if (!grab_super_passive(sb)) {
-			/*
-			 * grab_super_passive() may fail consistently due to
-			 * s_umount being grabbed by someone else. Don't use
-			 * requeue_io() to avoid busy retrying the inode/sb.
-			 */
-			redirty_tail(inode, wb);
-			continue;
-		}
-		wrote += writeback_sb_inodes(sb, wb, work);
-		drop_super(sb);
-
-		/* refer to the same tests at the end of writeback_sb_inodes */
-		if (wrote) {
-			if (time_is_before_jiffies(start_time + HZ / 10UL))
-				break;
-			if (work->nr_pages <= 0)
-				break;
-		}
-	}
-	/* Leave any unwritten inodes on b_io */
-	return wrote;
-}
-
 static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 				enum wb_reason reason)
 {
@@ -757,7 +688,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	spin_lock(&wb->list_lock);
 	if (list_empty(&wb->b_io))
 		queue_io(wb, &work);
-	__writeback_inodes_wb(wb, &work);
+	writeback_inodes(wb, &work);
 	spin_unlock(&wb->list_lock);
 
 	return nr_pages - work.nr_pages;
@@ -857,10 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
 			queue_io(wb, work);
-		if (work->sb)
-			progress = writeback_sb_inodes(work->sb, wb, work);
-		else
-			progress = __writeback_inodes_wb(wb, work);
+		progress = writeback_inodes(wb, work);
 		trace_writeback_written(wb->bdi, work);
 
 		wb_update_bandwidth(wb, wb_start);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d65ab3f..9bf6f2da32d2 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -477,7 +477,7 @@ TRACE_EVENT(balance_dirty_pages,
 	  )
 );
 
-TRACE_EVENT(writeback_sb_inodes_requeue,
+TRACE_EVENT(writeback_inodes_requeue,
 
 	TP_PROTO(struct inode *inode),
 	TP_ARGS(inode),
-- 
1.8.1.4


* [PATCH 2/4] writeback: Move removal from writeback list in evict()
  2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
  2014-05-15 15:41 ` [PATCH 1/4] writeback: Get rid of superblock pinning Jan Kara
@ 2014-05-15 15:41 ` Jan Kara
  2014-05-15 15:41 ` [PATCH 3/4] writeback: Replace several writeback lists with inode tagging Jan Kara
From: Jan Kara @ 2014-05-15 15:41 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, Jan Kara

Remove the inode from the writeback list only after waiting for the
flusher work to be done with it (I_SYNC is cleared). Currently this
makes no difference, but later we will depend on the fact that while
an inode has I_SYNC set, it is pinned in the dirty writeback list.
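
After this patch the relevant part of evict() orders the operations
like so (reconstructed from the hunk below):

	inode_sb_list_del(inode);
	...
	/* Wait until I_SYNC is clear, then drop the list linkage */
	inode_wait_for_writeback(inode);
	if (!list_empty(&inode->i_wb_list))
		inode_wb_list_del(inode);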

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index f96d2a6f88cc..1be1fb73527c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -533,9 +533,6 @@ static void evict(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(!list_empty(&inode->i_lru));
 
-	if (!list_empty(&inode->i_wb_list))
-		inode_wb_list_del(inode);
-
 	inode_sb_list_del(inode);
 
 	/*
@@ -545,6 +542,8 @@ static void evict(struct inode *inode)
 	 * the inode.  We just have to wait for running writeback to finish.
 	 */
 	inode_wait_for_writeback(inode);
+	if (!list_empty(&inode->i_wb_list))
+		inode_wb_list_del(inode);
 
 	if (op->evict_inode) {
 		op->evict_inode(inode);
-- 
1.8.1.4


* [PATCH 3/4] writeback: Replace several writeback lists with inode tagging
  2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
  2014-05-15 15:41 ` [PATCH 1/4] writeback: Get rid of superblock pinning Jan Kara
  2014-05-15 15:41 ` [PATCH 2/4] writeback: Move removal from writeback list in evict() Jan Kara
@ 2014-05-15 15:41 ` Jan Kara
  2014-05-15 15:41 ` [PATCH 4/4] writeback: Remove pages_skipped from writeback_control Jan Kara
  2014-05-15 23:55 ` [PATCH 0/4 RESEND] writeback: Dirty list handling changes Dave Chinner
From: Jan Kara @ 2014-05-15 15:41 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, Jan Kara

Currently the writeback code maintains three lists of dirty inodes -
the b_dirty list, the b_io list of inodes to write, and the b_more_io
list of inodes which need more writeback. Inodes on the b_dirty list
are sorted by their dirtied_when timestamp so that we can easily
select inodes for kupdate style writeback, and also so that we keep
the natural expectation that files written first should reach the
disk first.

This design however has the downside that when we need to move an
inode from the b_io list back to the b_dirty list (e.g. when writeback
for the inode is stalled because of some locks), we have to set its
dirtied_when timestamp to the current time (finding the proper place
in the b_dirty list so that it remains sorted would be too time
consuming). So we lose the information about when the inode was
dirtied.

In this patch we change the writeback code so that it maintains only a
single list of dirty inodes - the b_dirty list - sorted by the
dirtied_when timestamp. Identification of inodes that still need to be
considered for writeback is done by tagging inodes with the I_TO_WRITE
flag. This somewhat simplifies the logic and also allows us not to
clobber the dirtied_when timestamp.
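
Condensed, one writeback round now looks roughly like this (a
simplified sketch of the code in the diff below; locking, list-cursor
handling and termination checks omitted):

	/* Set I_TO_WRITE on inodes eligible for this round */
	tag_for_io(wb, work);

	list_for_each_entry_safe(inode, next, &wb->b_dirty, i_wb_list) {
		if (!(inode->i_state & I_TO_WRITE))
			continue;
		__writeback_single_inode(inode, &wbc);
		/*
		 * Clears I_TO_WRITE unless the writeback chunk was used
		 * up; removes clean inodes from b_dirty.
		 */
		requeue_inode(inode, &wbc);
	}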

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c                | 273 +++++++++++++--------------------------
 include/linux/backing-dev.h      |   6 +-
 include/linux/fs.h               |   4 +
 include/trace/events/writeback.h |  27 ++--
 mm/backing-dev.c                 |  16 +--
 5 files changed, 116 insertions(+), 210 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b54ec30541e6..6bf9c50ecd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -191,37 +191,6 @@ void inode_wb_list_del(struct inode *inode)
 	spin_unlock(&bdi->wb.list_lock);
 }
 
-/*
- * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
- * furthest end of its superblock's dirty-inode list.
- *
- * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the b_dirty list.  If that is
- * the case then the inode must have been redirtied while it was being written
- * out and we don't reset its dirtied_when.
- */
-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
-{
-	assert_spin_locked(&wb->list_lock);
-	if (!list_empty(&wb->b_dirty)) {
-		struct inode *tail;
-
-		tail = wb_inode(wb->b_dirty.next);
-		if (time_before(inode->dirtied_when, tail->dirtied_when))
-			inode->dirtied_when = jiffies;
-	}
-	list_move(&inode->i_wb_list, &wb->b_dirty);
-}
-
-/*
- * requeue inode for re-scanning after bdi->b_io list is exhausted.
- */
-static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
-{
-	assert_spin_locked(&wb->list_lock);
-	list_move(&inode->i_wb_list, &wb->b_more_io);
-}
-
 static void inode_sync_complete(struct inode *inode)
 {
 	inode->i_state &= ~I_SYNC;
@@ -247,47 +216,25 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	return ret;
 }
 
-/*
- * Move expired (dirtied before work->older_than_this) dirty inodes from
- * @delaying_queue to @dispatch_queue.
- */
-static int move_expired_inodes(struct list_head *delaying_queue,
-			       struct list_head *dispatch_queue,
-			       struct wb_writeback_work *work)
+static void tag_for_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 {
+	int tagged = 0;
 	struct inode *inode;
-	int moved = 0;
 
-	while (!list_empty(delaying_queue)) {
-		inode = wb_inode(delaying_queue->prev);
+	assert_spin_locked(&wb->list_lock);
+	list_for_each_entry(inode, &wb->b_dirty, i_wb_list) {
 		if (work->older_than_this &&
 		    inode_dirtied_after(inode, *work->older_than_this))
 			break;
-		list_move(&inode->i_wb_list, dispatch_queue);
-		moved++;
+		if ((!work->sb || inode->i_sb == work->sb) &&
+		    !(inode->i_state & I_TO_WRITE)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_TO_WRITE;
+			spin_unlock(&inode->i_lock);
+			tagged++;
+		}
 	}
-out:
-	return moved;
-}
-
-/*
- * Queue all expired dirty inodes for io, eldest first.
- * Before
- *         newly dirtied     b_dirty    b_io    b_more_io
- *         =============>    gf         edc     BA
- * After
- *         newly dirtied     b_dirty    b_io    b_more_io
- *         =============>    g          fBAedc
- *                                           |
- *                                           +--> dequeue for IO
- */
-static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
-{
-	int moved;
-	assert_spin_locked(&wb->list_lock);
-	list_splice_init(&wb->b_more_io, &wb->b_io);
-	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
-	trace_writeback_queue_io(wb, work, moved);
+	trace_writeback_tag_for_io(wb, work, tagged);
 }
 
 static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -353,66 +300,37 @@ static void inode_sleep_on_writeback(struct inode *inode)
 }
 
 /*
- * Find proper writeback list for the inode depending on its current state and
- * possibly also change of its state while we were doing writeback.  Here we
- * handle things such as livelock prevention or fairness of writeback among
- * inodes. This function can be called only by flusher thread - noone else
- * processes all inodes in writeback lists and requeueing inodes behind flusher
- * thread's back can have unexpected consequences.
+ * Decide whether we should try further writeback on the inode or exclude it
+ * from this writeback round. Also remove inode from dirty list if it got
+ * clean.
  */
-static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
-			  struct writeback_control *wbc)
+static void requeue_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (inode->i_state & I_FREEING)
-		return;
+		goto exclude;
 
-	/*
-	 * Sync livelock prevention. Each inode is tagged and synced in one
-	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
-	 * the dirty time to prevent enqueue and sync it again.
-	 */
-	if ((inode->i_state & I_DIRTY) &&
-	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
-		inode->dirtied_when = jiffies;
+	/* The inode is clean. Remove from writeback lists. */
+	if (!(inode->i_state & I_DIRTY)) {
+		list_del_init(&inode->i_wb_list);
+		goto exclude;
+	}
 
-	if (wbc->pages_skipped) {
-		/*
-		 * writeback is not making progress due to locked
-		 * buffers. Skip this inode for now.
-		 */
-		redirty_tail(inode, wb);
+	/* We used up our writeback chunk, give inode another try */
+	if (wbc->nr_to_write <= 0)
 		return;
-	}
 
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
-		/*
-		 * We didn't write back all the pages.  nfs_writepages()
-		 * sometimes bales out without doing anything.
-		 */
-		if (wbc->nr_to_write <= 0) {
-			/* Slice used up. Queue for next turn. */
-			requeue_io(inode, wb);
-		} else {
-			/*
-			 * Writeback blocked by something other than
-			 * congestion. Delay the inode for some time to
-			 * avoid spinning on the CPU (100% iowait)
-			 * retrying writeback of the dirty page/inode
-			 * that cannot be performed immediately.
-			 */
-			redirty_tail(inode, wb);
-		}
-	} else if (inode->i_state & I_DIRTY) {
-		/*
-		 * Filesystems can dirty the inode during writeback operations,
-		 * such as delayed allocation during submission or metadata
-		 * updates after data IO completion.
-		 */
-		redirty_tail(inode, wb);
-	} else {
-		/* The inode is clean. Remove from writeback lists. */
-		list_del_init(&inode->i_wb_list);
-	}
+	/*
+	 * In all the other cases we exclude inode from further writeback
+	 * in this writeback round. We know the inode is dirty although
+	 * we didn't use all of our writeback chunk. This means that
+	 * writeback was blocked for some reason or inode was redirtied.
+	 * In the first case we exclude inode from writeback to avoid
+	 * busylooping, in the second case to avoid livelocks. Note that
+	 * for_background and for_kupdate writeback will tag inodes again
+	 * after finishing one pass through the list.
+	 */
+exclude:
+	inode->i_state &= ~I_TO_WRITE;
 }
 
 /*
@@ -523,8 +441,10 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
 	 * touch it. See comment above for explanation.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY)) {
+		inode->i_state &= ~I_TO_WRITE;
 		list_del_init(&inode->i_wb_list);
+	}
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -584,19 +504,24 @@ static long writeback_inodes(struct bdi_writeback *wb,
 	unsigned long start_time = jiffies;
 	long write_chunk;
 	long wrote = 0;  /* count both pages and inodes */
+	struct inode *inode, *next;
 
-	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = wb_inode(wb->b_io.prev);
+restart:
+	/* We use list_safe_reset_next() to make the list iteration safe */
+	list_for_each_entry_safe(inode, next, &wb->b_dirty, i_wb_list) {
+		/* Don't bother scanning too new inodes */
+		if (work->older_than_this &&
+		    inode_dirtied_after(inode, *work->older_than_this))
+			break;
 
-		if (work->sb && inode->i_sb != work->sb) {
-			/*
-			 * We only want to write back data for this
-			 * superblock, move all inodes not belonging
-			 * to it back onto the dirty list.
-			 */
-			redirty_tail(inode, wb);
+		if (!(inode->i_state & I_TO_WRITE))
+			continue;
+		/*
+		 * Skip inodes on different sb. They can be tagged from
+		 * previous writeback rounds.
+		 */
+		if (work->sb && inode->i_sb != work->sb)
 			continue;
-		}
 
 		/*
 		 * Don't bother with new inodes or inodes being freed, first
@@ -605,22 +530,18 @@ static long writeback_inodes(struct bdi_writeback *wb,
 		 */
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			inode->i_state &= ~I_TO_WRITE;
 			spin_unlock(&inode->i_lock);
-			redirty_tail(inode, wb);
 			continue;
 		}
 		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
 			/*
 			 * If this inode is locked for writeback and we are not
-			 * doing writeback-for-data-integrity, move it to
-			 * b_more_io so that writeback can proceed with the
-			 * other inodes on s_io.
-			 *
-			 * We'll have another go at writing back this inode
-			 * when we completed a full scan of b_io.
+			 * doing writeback-for-data-integrity, exclude inode
+			 * from writeback.
 			 */
+			inode->i_state &= ~I_TO_WRITE;
 			spin_unlock(&inode->i_lock);
-			requeue_io(inode, wb);
 			trace_writeback_inodes_requeue(inode);
 			continue;
 		}
@@ -632,11 +553,17 @@ static long writeback_inodes(struct bdi_writeback *wb,
 		 * WB_SYNC_ALL case.
 		 */
 		if (inode->i_state & I_SYNC) {
+			trace_writeback_inodes_restart(inode);
 			/* Wait for I_SYNC. This function drops i_lock... */
 			inode_sleep_on_writeback(inode);
-			/* Inode may be gone, start again */
 			spin_lock(&wb->list_lock);
-			continue;
+			/*
+			 * Start again since inode may be gone so we have no
+			 * fixed point in list to start from. Luckily this case
+			 * is rare (it requires someone else to call
+			 * sync_inode() or a similar function).
+			 */
+			goto restart;
 		}
 		inode->i_state |= I_SYNC;
 		spin_unlock(&inode->i_lock);
@@ -657,10 +584,15 @@ static long writeback_inodes(struct bdi_writeback *wb,
 		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY))
 			wrote++;
-		requeue_inode(inode, wb, &wbc);
+		/*
+		 * Update next after retaking list_lock but before removing
+		 * inode from the list in requeue_inode(). We are guaranteed
+		 * current inode stays in the list because it has I_SYNC set.
+		 */
+		list_safe_reset_next(inode, next, i_wb_list);
+		requeue_inode(inode, &wbc);
 		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
-		cond_resched_lock(&wb->list_lock);
 		/*
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
@@ -686,8 +618,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	};
 
 	spin_lock(&wb->list_lock);
-	if (list_empty(&wb->b_io))
-		queue_io(wb, &work);
+	tag_for_io(wb, &work);
 	writeback_inodes(wb, &work);
 	spin_unlock(&wb->list_lock);
 
@@ -742,14 +673,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
-	struct inode *inode;
 	long progress;
 
+	/*
+	 * We protect against writeback livelocks when creating new files by
+	 * writing only files created before writeback started. Background and
+	 * kupdate style writeback are handled in a special way below.
+	 */
 	oldest_jif = jiffies;
 	work->older_than_this = &oldest_jif;
 
 	spin_lock(&wb->list_lock);
-	for (;;) {
+	if (!work->for_background && !work->for_kupdate)
+		tag_for_io(wb, work);
+
+	do {
 		/*
 		 * Stop writeback when nr_pages has been consumed
 		 */
@@ -782,47 +720,18 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->for_kupdate) {
 			oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
-		} else if (work->for_background)
+			tag_for_io(wb, work);
+		} else if (work->for_background) {
 			oldest_jif = jiffies;
+			tag_for_io(wb, work);
+		}
 
 		trace_writeback_start(wb->bdi, work);
-		if (list_empty(&wb->b_io))
-			queue_io(wb, work);
 		progress = writeback_inodes(wb, work);
 		trace_writeback_written(wb->bdi, work);
 
 		wb_update_bandwidth(wb, wb_start);
-
-		/*
-		 * Did we write something? Try for more
-		 *
-		 * Dirty inodes are moved to b_io for writeback in batches.
-		 * The completion of the current batch does not necessarily
-		 * mean the overall work is done. So we keep looping as long
-		 * as made some progress on cleaning pages or inodes.
-		 */
-		if (progress)
-			continue;
-		/*
-		 * No more inodes for IO, bail
-		 */
-		if (list_empty(&wb->b_more_io))
-			break;
-		/*
-		 * Nothing written. Wait for some inode to
-		 * become available for writeback. Otherwise
-		 * we'll just busyloop.
-		 */
-		if (!list_empty(&wb->b_more_io))  {
-			trace_writeback_wait(wb->bdi, work);
-			inode = wb_inode(wb->b_more_io.prev);
-			spin_lock(&inode->i_lock);
-			spin_unlock(&wb->list_lock);
-			/* This function drops i_lock... */
-			inode_sleep_on_writeback(inode);
-			spin_lock(&wb->list_lock);
-		}
-	}
+	} while (progress);
 	spin_unlock(&wb->list_lock);
 
 	return nr_pages - work->nr_pages;
@@ -1113,8 +1022,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			goto out_unlock_inode;
 
 		/*
-		 * If the inode was already on b_dirty/b_io/b_more_io, don't
-		 * reposition it (that would break b_dirty time-ordering).
+		 * If the inode was already on b_dirty, don't reposition it
+		 * (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
 			bool wakeup_bdi = false;
@@ -1137,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			list_move_tail(&inode->i_wb_list, &bdi->wb.b_dirty);
 			spin_unlock(&bdi->wb.list_lock);
 
 			if (wakeup_bdi)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e488e9459a93..897f6ff8c982 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -56,8 +56,6 @@ struct bdi_writeback {
 
 	struct delayed_work dwork;	/* work item used for writeback */
 	struct list_head b_dirty;	/* dirty inodes */
-	struct list_head b_io;		/* parked for writeback */
-	struct list_head b_more_io;	/* parked for more writeback */
 	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
@@ -133,9 +131,7 @@ extern struct workqueue_struct *bdi_wq;
 
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&wb->b_dirty) ||
-	       !list_empty(&wb->b_io) ||
-	       !list_empty(&wb->b_more_io);
+	return !list_empty(&wb->b_dirty);
 }
 
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 878031227c57..99fcc1221c24 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1661,6 +1661,9 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_TO_WRITE		Used by flusher thread to track which inodes need
+ *			writing in this writeback round.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1677,6 +1680,7 @@ struct super_operations {
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
+#define I_TO_WRITE		(1 << 11)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 9bf6f2da32d2..071eb2eea350 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -188,7 +188,6 @@ DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
-DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
 
 TRACE_EVENT(writeback_pages_written,
 	TP_PROTO(long pages_written),
@@ -275,16 +274,16 @@ DEFINE_EVENT(wbc_class, name, \
 	TP_ARGS(wbc, bdi))
 DEFINE_WBC_EVENT(wbc_writepage);
 
-TRACE_EVENT(writeback_queue_io,
+TRACE_EVENT(writeback_tag_for_io,
 	TP_PROTO(struct bdi_writeback *wb,
 		 struct wb_writeback_work *work,
-		 int moved),
-	TP_ARGS(wb, work, moved),
+		 int tagged),
+	TP_ARGS(wb, work, tagged),
 	TP_STRUCT__entry(
 		__array(char,		name, 32)
 		__field(unsigned long,	older)
 		__field(long,		age)
-		__field(int,		moved)
+		__field(int,		tagged)
 		__field(int,		reason)
 	),
 	TP_fast_assign(
@@ -293,14 +292,14 @@ TRACE_EVENT(writeback_queue_io,
 		__entry->older	= older_than_this ?  *older_than_this : 0;
 		__entry->age	= older_than_this ?
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
-		__entry->moved	= moved;
+		__entry->tagged	= tagged;
 		__entry->reason	= work->reason;
 	),
-	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+	TP_printk("bdi %s: older=%lu age=%ld tagged=%d reason=%s",
 		__entry->name,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
-		__entry->moved,
+		__entry->tagged,
 		__print_symbolic(__entry->reason, WB_WORK_REASON)
 	)
 );
@@ -477,7 +476,7 @@ TRACE_EVENT(balance_dirty_pages,
 	  )
 );
 
-TRACE_EVENT(writeback_inodes_requeue,
+DECLARE_EVENT_CLASS(writeback_inodes_template,
 
 	TP_PROTO(struct inode *inode),
 	TP_ARGS(inode),
@@ -506,6 +505,16 @@ TRACE_EVENT(writeback_inodes_requeue,
 	)
 );
 
+DEFINE_EVENT(writeback_inodes_template, writeback_inodes_requeue,
+	TP_PROTO(struct inode *inode),
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_inodes_template, writeback_inodes_restart,
+	TP_PROTO(struct inode *inode),
+	TP_ARGS(inode)
+);
+
 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
 
 	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 09d9591b7708..322d704c6cfe 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,17 +69,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io;
+	unsigned long nr_dirty;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = 0;
+	nr_dirty = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
-	list_for_each_entry(inode, &wb->b_io, i_wb_list)
-		nr_io++;
-	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
-		nr_more_io++;
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -96,8 +92,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiWritten:         %10lu kB\n"
 		   "BdiWriteBandwidth:  %10lu kBps\n"
 		   "b_dirty:            %10lu\n"
-		   "b_io:               %10lu\n"
-		   "b_more_io:          %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -109,8 +103,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
 		   (unsigned long) K(bdi->write_bandwidth),
 		   nr_dirty,
-		   nr_io,
-		   nr_more_io,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -428,8 +420,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	wb->bdi = bdi;
 	wb->last_old_flush = jiffies;
 	INIT_LIST_HEAD(&wb->b_dirty);
-	INIT_LIST_HEAD(&wb->b_io);
-	INIT_LIST_HEAD(&wb->b_more_io);
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
@@ -495,8 +485,6 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 		bdi_lock_two(&bdi->wb, dst);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
-		list_splice(&bdi->wb.b_io, &dst->b_io);
-		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
 		spin_unlock(&bdi->wb.list_lock);
 		spin_unlock(&dst->list_lock);
 	}
-- 
1.8.1.4


* [PATCH 4/4] writeback: Remove pages_skipped from writeback_control
  2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
  2014-05-15 15:41 ` [PATCH 3/4] writeback: Replace several writeback lists with inode tagging Jan Kara
@ 2014-05-15 15:41 ` Jan Kara
  2014-05-15 23:05   ` Dave Chinner
  2014-05-15 23:55 ` [PATCH 0/4 RESEND] writeback: Dirty list handling changes Dave Chinner
From: Jan Kara @ 2014-05-15 15:41 UTC
  To: linux-fsdevel; +Cc: Wu Fengguang, Andrew Morton, Jan Kara

pages_skipped in struct writeback_control isn't used anymore. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/btrfs/extent_io.c             |  4 +---
 fs/ceph/addr.c                   |  2 --
 fs/f2fs/checkpoint.c             |  1 -
 fs/f2fs/data.c                   |  1 -
 fs/f2fs/node.c                   |  1 -
 fs/fs-writeback.c                |  1 -
 include/linux/writeback.h        |  2 --
 include/trace/events/btrfs.h     |  9 +++------
 include/trace/events/ext4.h      | 14 ++++----------
 include/trace/events/writeback.h |  8 ++------
 mm/page-writeback.c              |  1 -
 11 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3955e475ceec..b727cc413aec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3240,9 +3240,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 						      page_end);
 		if (ret) {
 			/* Fixup worker will requeue */
-			if (ret == -EBUSY)
-				wbc->pages_skipped++;
-			else
+			if (ret != -EBUSY)
 				redirty_page_for_writepage(wbc, page);
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b53278c9fd97..7943d4197514 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -526,8 +526,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 4aa521aa9bc3..9beb9e218f19 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -175,7 +175,6 @@ no_write:
 
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_META);
-	wbc->pages_skipped++;
 	account_page_redirty(page);
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 45abd60e2bff..fd5825f0c1db 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -838,7 +838,6 @@ out:
 	return 0;
 
 redirty_out:
-	wbc->pages_skipped++;
 	account_page_redirty(page);
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a161e955c4c8..b294debb8200 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1228,7 +1228,6 @@ static int f2fs_write_node_page(struct page *page,
 
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
-	wbc->pages_skipped++;
 	account_page_redirty(page);
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6bf9c50ecd53..426ff81d9863 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -570,7 +570,6 @@ restart:
 
 		write_chunk = writeback_chunk_size(wb->bdi, work);
 		wbc.nr_to_write = write_chunk;
-		wbc.pages_skipped = 0;
 
 		/*
 		 * We use I_SYNC to pin the inode in memory. While it is set
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5777c13849ba..5d15d6299347 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -66,8 +66,6 @@ enum wb_reason {
 struct writeback_control {
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
-	long pages_skipped;		/* Pages which were not written */
-
 	/*
 	 * For a_ops->writepages(): if start or end are non-zero then this is
 	 * a hint that the filesystem need only write out the pages inside that
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4ee4e30d26d9..95ec983b55e4 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -311,7 +311,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__field(	ino_t,  ino			)
 		__field(	pgoff_t,  index			)
 		__field(	long,   nr_to_write		)
-		__field(	long,   pages_skipped		)
 		__field(	loff_t, range_start		)
 		__field(	loff_t, range_end		)
 		__field(	char,   for_kupdate		)
@@ -325,7 +324,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__entry->ino		= inode->i_ino;
 		__entry->index		= page->index;
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
 		__entry->for_kupdate	= wbc->for_kupdate;
@@ -337,14 +335,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 	),
 
 	TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
-		  "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
+		  "nr_to_write = %ld, range_start = %llu, "
 		  "range_end = %llu, for_kupdate = %d, "
 		  "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
 		  show_root_type(__entry->root_objectid),
 		  (unsigned long)__entry->ino, __entry->index,
-		  __entry->nr_to_write, __entry->pages_skipped,
-		  __entry->range_start, __entry->range_end,
-		  __entry->for_kupdate,
+		  __entry->nr_to_write, __entry->range_start,
+		  __entry->range_end, __entry->for_kupdate,
 		  __entry->for_reclaim, __entry->range_cyclic,
 		  (unsigned long)__entry->writeback_index)
 );
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6a1a0245474f..20a4b1b0511a 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -347,7 +347,6 @@ TRACE_EVENT(ext4_writepages,
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
 		__field(	long,	nr_to_write		)
-		__field(	long,	pages_skipped		)
 		__field(	loff_t,	range_start		)
 		__field(	loff_t,	range_end		)
 		__field(       pgoff_t,	writeback_index		)
@@ -360,7 +359,6 @@ TRACE_EVENT(ext4_writepages,
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
@@ -369,13 +367,12 @@ TRACE_EVENT(ext4_writepages,
 		__entry->range_cyclic	= wbc->range_cyclic;
 	),
 
-	TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
+	TP_printk("dev %d,%d ino %lu nr_to_write %ld "
 		  "range_start %lld range_end %lld sync_mode %d "
 		  "for_kupdate %d range_cyclic %d writeback_index %lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->nr_to_write,
-		  __entry->pages_skipped, __entry->range_start,
-		  __entry->range_end, __entry->sync_mode,
+		  __entry->range_start, __entry->range_end, __entry->sync_mode,
 		  __entry->for_kupdate, __entry->range_cyclic,
 		  (unsigned long) __entry->writeback_index)
 );
@@ -447,7 +444,6 @@ TRACE_EVENT(ext4_writepages_result,
 		__field(	ino_t,	ino			)
 		__field(	int,	ret			)
 		__field(	int,	pages_written		)
-		__field(	long,	pages_skipped		)
 		__field(       pgoff_t,	writeback_index		)
 		__field(	int,	sync_mode		)
 	),
@@ -457,17 +453,15 @@ TRACE_EVENT(ext4_writepages_result,
 		__entry->ino		= inode->i_ino;
 		__entry->ret		= ret;
 		__entry->pages_written	= pages_written;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
+	TP_printk("dev %d,%d ino %lu ret %d pages_written %d "
 		  "sync_mode %d writeback_index %lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->ret,
-		  __entry->pages_written, __entry->pages_skipped,
-		  __entry->sync_mode,
+		  __entry->pages_written, __entry->sync_mode,
 		  (unsigned long) __entry->writeback_index)
 );
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 071eb2eea350..a76d4e8e7ed5 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -230,7 +230,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 	TP_STRUCT__entry(
 		__array(char, name, 32)
 		__field(long, nr_to_write)
-		__field(long, pages_skipped)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, for_background)
@@ -243,7 +242,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 	TP_fast_assign(
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
-		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_background	= wbc->for_background;
@@ -253,12 +251,10 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_end	= (long)wbc->range_end;
 	),
 
-	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
-		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx",
+	TP_printk("bdi %s: towrt=%ld mode=%d kupd=%d bgrd=%d reclm=%d "
+		"cyclic=%d start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
-		__entry->pages_skipped,
 		__entry->sync_mode,
 		__entry->for_kupdate,
 		__entry->for_background,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a4317da60532..ebccf20d098d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2223,7 +2223,6 @@ EXPORT_SYMBOL(account_page_redirty);
  */
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
-	wbc->pages_skipped++;
 	account_page_redirty(page);
 	return __set_page_dirty_nobuffers(page);
 }
-- 
1.8.1.4


* Re: [PATCH 4/4] writeback: Remove pages_skipped from writeback_control
  2014-05-15 15:41 ` [PATCH 4/4] writeback: Remove pages_skipped from writeback_control Jan Kara
@ 2014-05-15 23:05   ` Dave Chinner
From: Dave Chinner @ 2014-05-15 23:05 UTC
  To: Jan Kara; +Cc: linux-fsdevel, Wu Fengguang, Andrew Morton

On Thu, May 15, 2014 at 05:41:57PM +0200, Jan Kara wrote:
> pages_skipped in struct writeback_control isn't used anymore. Remove it.

f2fs still uses pages_skipped in 3.15-rc5, so this doesn't compile.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [PATCH 0/4 RESEND] writeback: Dirty list handling changes
  2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
  2014-05-15 15:41 ` [PATCH 4/4] writeback: Remove pages_skipped from writeback_control Jan Kara
@ 2014-05-15 23:55 ` Dave Chinner
  2014-05-16  0:47   ` Dave Chinner
From: Dave Chinner @ 2014-05-15 23:55 UTC
  To: Jan Kara; +Cc: linux-fsdevel, Wu Fengguang, Andrew Morton

On Thu, May 15, 2014 at 05:41:53PM +0200, Jan Kara wrote:
>   Hello,
> 
>   so I was recently thinking about how the writeback code shuffles inodes
> between lists and also how redirty_tail() clobbers the dirtied_when
> timestamp (which broke my sync(2) optimization). This patch series came out
> of that. Patch 1 is a clear win and just needs an independent review to
> check that I didn't forget about something. Patch 3 changes writeback list
> handling - IMHO it makes the logic somewhat more straightforward as we
> don't have to bother shuffling inodes between lists and we also don't need
> to clobber the dirtied_when timestamp. But opinions may differ...
> 
> The patches passed an xfstests run and I did some basic writeback tests
> using tiobench and some artificial sync livelock tests to verify nothing
> regressed. So I'd be happy if people could have a look.

Performance regresses significantly.

Test is on a 16p/16GB VM with a sparse 100TB XFS filesystem backed
by a pair of SSDs in RAID0:

./fs_mark  -D  10000  -S0  -n  10000  -s  4096  -L  120  -d
/mnt/scratch/0  -d  /mnt/scratch/1  -d  /mnt/scratch/2  -d
/mnt/scratch/3  -d  /mnt/scratch/4  -d  /mnt/scratch/5  -d
/mnt/scratch/6  -d  /mnt/scratch/7

That creates 10 million 4k files with 16 threads and 10000 files per
directory. No sync/fsync is done, so it's a pure background
writeback workload.

For 0-400k files the workload runs in memory; at 400-800k files
background writeback is occurring; at >800k files foreground
throttling is occurring.

The file create rates and write IOPS/bw are:

                  vanilla		    patched
load point     files  iops  bw		files iops  bw
< bg thres	120k   0     0		 110k   0    0
< fg thres	120k  37k  210MB/s	  60k  20k 130MB/s
sustained	 36k  37k  210MB/s	  25k  28k 150MB/s


The average create rate is 40k (vanilla) vs 28k (patched). Wall
times:

         vanilla	  patched
real    4m27.475s	 6m29.364s
user    1m7.072s	 1m3.590s
sys     10m0.836s	 22m34.362s

The new code burns more than twice the system CPU whilst going
significantly slower.

I haven't done any further investigation to determine which patch
causes the regression, but it's large enough that you should be able
to reproduce it.

BTW, while touching this code, we should also add plugging at the
upper inode writeback level - it provides a 20% performance boost to
this workload. The numbers in the patch description below are old,
but I just verified 3.15-rc5 gives the same scale of improvement.
e.g. it almost completely negates the throughput and wall time
regressions that this patchset introduces:

                  vanilla	    patched		patched+plug
load point   files  iops  bw	 files iops  bw	   files iops  bw
< bg thres    120k   0     0	  110k   0    0     120k  0     0
< fg thres    120k  37k  210MB/s  60k  20k 130MB/s  80k   1k 180MB/s
sustained      36k  37k  210MB/s  25k  28k 150MB/s  33k 1.5k 200MB/s
real             4m27.475s	     6m29.364s		4m40.524s
user             1m7.072s	     1m3.590s		0m55.819s
sys              10m0.836s	     22m34.362s		18m8.130s

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

writeback: plug writeback at a high level

From: Dave Chinner <dchinner@redhat.com>

Doing writeback on lots of little files causes terrible IOPS storms
because of the per-mapping writeback plugging we do. This
essentially causes immediate dispatch of IO for each mapping,
regardless of the context in which writeback is occurring.

IOWs, running a concurrent write-lots-of-small-4k-files workload using
fsmark on XFS results in a huge number of IOPS being issued for data
writes.  Metadata writes are sorted and plugged at a high level by
XFS, so they aggregate nicely into large IOs. However, data writeback IOs
are dispatched in individual 4k IOs, even when the blocks of two
consecutively written files are adjacent.
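
For reference, the mechanism used here is the generic block plugging
API; a minimal sketch of the pattern follows (the function name is
invented for illustration - the actual one-plug-per-writeback-pass
change is the patch below):

#include <linux/blkdev.h>

/*
 * Sketch: all I/O submitted by this task between blk_start_plug()
 * and blk_finish_plug() is queued up and merged before dispatch, so
 * adjacent 4k writes from consecutively written files can be issued
 * as one large request instead of individual IOs.
 */
static void plugged_writeback_sketch(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... submit writeback for many inodes here ... */
	blk_finish_plug(&plug);
}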

Test VM: 8p, 8GB RAM, 4xSSD in RAID0, 100TB sparse XFS filesystem,
metadata CRCs enabled.

Kernel: 3.10-rc5 + xfsdev + my 3.11 xfs queue (~70 patches)

Test:

$ ./fs_mark  -D  10000  -S0  -n  10000  -s  4096  -L  120  -d
/mnt/scratch/0  -d  /mnt/scratch/1  -d  /mnt/scratch/2  -d
/mnt/scratch/3  -d  /mnt/scratch/4  -d  /mnt/scratch/5  -d
/mnt/scratch/6  -d  /mnt/scratch/7

Result:

		wall	sys	create rate	Physical write IO
		time	CPU	(avg files/s)	 IOPS	Bandwidth
		-----	-----	------------	------	---------
unpatched	6m56s	15m47s	24,000+/-500	26,000	130MB/s
patched		5m06s	13m28s	32,800+/-600	 1,500	180MB/s
improvement	-26.44%	-14.68%	  +36.67%	-94.23%	+38.46%

If I use zero length files, this workload runs at about 500 IOPS, so
plugging drops the data IOs from roughly 25,500/s to 1,000/s.
3 lines of code, 35% better throughput for 15% less CPU.

The benefits of plugging at this layer are likely to be higher for
spinning media, as the IO patterns for this workload are going to make
a much bigger difference on high IO latency devices.....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 426ff81..7cd2b3a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -505,6 +505,9 @@ static long writeback_inodes(struct bdi_writeback *wb,
 	long write_chunk;
 	long wrote = 0;  /* count both pages and inodes */
 	struct inode *inode, *next;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
 
 restart:
 	/* We use list_safe_reset_next() to make the list iteration safe */
@@ -603,6 +606,7 @@ restart:
 				break;
 		}
 	}
+	blk_finish_plug(&plug);
 	return wrote;
 }
 

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4 RESEND] writeback: Dirty list handling changes
  2014-05-15 23:55 ` [PATCH 0/4 RESEND] writeback: Dirty list handling changes Dave Chinner
@ 2014-05-16  0:47   ` Dave Chinner
  2014-05-16  9:27     ` Christoph Hellwig
  2014-05-19 16:47     ` Jan Kara
  0 siblings, 2 replies; 12+ messages in thread
From: Dave Chinner @ 2014-05-16  0:47 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, Wu Fengguang, Andrew Morton

On Fri, May 16, 2014 at 09:55:14AM +1000, Dave Chinner wrote:
> On Thu, May 15, 2014 at 05:41:53PM +0200, Jan Kara wrote:
> >   Hello,
> > 
> >   so I was recently thinking about how writeback code shuffles inodes between
> > lists and also how redirty_tail() clobbers dirtied_when timestamp (which broke
> > my sync(2) optimization). This patch series came out of that. Patch 1 is a
> > clear win and just needs an independent review that I didn't forget about
> > something. Patch 3 changes writeback list handling - IMHO it makes the logic
> > somewhat more straightforward as we don't have to bother shuffling inodes
> > between lists and we also don't need to clobber dirtied_when timestamp.
> > But opinions may differ...
> > 
> > Patches passed an xfstests run and I did some basic writeback tests using tiobench
> > and some artificial sync livelock tests to verify nothing regressed. So I'd
> > be happy if people could have a look.
> 
> Performance regresses significantly.
> 
> Test is on a 16p/16GB VM with a sparse 100TB XFS filesystem backed
> by a pair of SSDs in RAID0:
> 
> ./fs_mark  -D  10000  -S0  -n  10000  -s  4096  -L  120  -d
> /mnt/scratch/0  -d  /mnt/scratch/1  -d  /mnt/scratch/2  -d
> /mnt/scratch/3  -d  /mnt/scratch/4  -d  /mnt/scratch/5  -d
> /mnt/scratch/6  -d  /mnt/scratch/7
> 
> That creates 10 million 4k files with 16 threads and 10000 files per
> directory. No sync/fsync is done, so it's a pure background
> writeback workload.
> 
> For 0-400,000 files, it runs in memory; at 400-800k files, background
> writeback is occurring; at > 800k files, foreground throttling is
> occurring.
> 
> The file create rates and write IOPS/bw are:
> 
>                   vanilla                  patched
> load point     files  iops  bw          files  iops  bw
> < bg thres      120k     0  0            110k     0  0
> < fg thres      120k   37k  210MB/s       60k   20k  130MB/s
> sustained        36k   37k  210MB/s       25k   28k  150MB/s
> 
> 
> The average create rate is 40k (vanilla) vs 28k (patched). Wall
> times:
> 
>          vanilla          patched
> real     4m27.475s        6m29.364s
> user     1m7.072s         1m3.590s
> sys      10m0.836s        22m34.362s
> 
> The new code burns more than twice the system CPU whilst going
> significantly slower.

OK, so it looks like we've got a case of lock contention:

  48.37%  [kernel]  [k] _raw_spin_lock
   - _raw_spin_lock
      - 96.32% __mark_inode_dirty
         - 99.99% __set_page_dirty
              mark_buffer_dirty
              __block_commit_write.isra.25
              block_write_end
              generic_write_end
              xfs_vm_write_end
              generic_perform_write
              xfs_file_buffered_aio_write
              xfs_file_aio_write
              do_sync_write
              vfs_write
              sys_write
              tracesys
              __GI___libc_write
      + 1.80% xfs_log_commit_cil

Which is almost certainly the bdi->wb.list_lock.

-   6.70%  [kernel]  [k] tag_for_io
     tag_for_io
     wb_writeback
     bdi_writeback_workfn
     process_one_work
     worker_thread
     kthread
     ret_from_fork

But that's still a lot of CPU in tag_for_io().

Ah, that's the cause. tag_for_io() holds the bdi->wb.list_lock
for the entire list walk tagging inodes. That explains why
performance doesn't start to drop until writeback begins. Ouch:

writeback_tag_for_io: bdi 253:32: older=4295532768 age=56 tagged=14092 reason=background
writeback_tag_for_io: bdi 253:32: older=4295532808 age=56 tagged=13556 reason=background
writeback_tag_for_io: bdi 253:32: older=4295532848 age=56 tagged=12387 reason=background
writeback_tag_for_io: bdi 253:32: older=4295532888 age=60 tagged=11414 reason=background
writeback_tag_for_io: bdi 253:32: older=4295532929 age=60 tagged=11128 reason=background
writeback_tag_for_io: bdi 253:32: older=4295532970 age=60 tagged=10457 reason=background

We're holding the lock while we walk 10,000+ inodes at a time and
the other 15 CPUs are spinning on that.
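
A common way to bound hold times on a walk like this - sketched below
purely as an illustration, since tag_for_io() is from the patch set
under review and its code isn't quoted here - is to tag in batches,
dropping and retaking the lock between them:

#include <linux/backing-dev.h>
#include <linux/fs.h>

#define TAG_BATCH	128

/*
 * Hypothetical sketch, not the actual tag_for_io(): drop and retake
 * wb->list_lock every TAG_BATCH inodes so the 15 dirtying CPUs aren't
 * starved for the length of a 10,000+ inode walk. A real version
 * needs a cursor that stays valid across the unlock (e.g. restart
 * from the last tagged inode), because the list can change while the
 * lock is dropped.
 */
static void tag_for_io_batched(struct bdi_writeback *wb)
{
	struct inode *inode;
	int batch = 0;

	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list) {
		/* ... tag this inode for IO ... */
		if (++batch >= TAG_BATCH) {
			batch = 0;
			spin_unlock(&wb->list_lock);
			cond_resched();
			spin_lock(&wb->list_lock);
			/* cursor revalidation needed here, see above */
		}
	}
	spin_unlock(&wb->list_lock);
}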

List walking and tagging like this is never going to scale. I think
we do need multiple lists, but not like we currently have. Right now
the problem is that inodes of different dirtied_when sit on the same
ordered list, and we need to find the first inode with a specific
dirtied_when value within the list efficiently to be able to
determine what inodes need to be written back.

Seems to me that this problem has been solved elsewhere in the
kernel, like for tracking of timers to expire on a given jiffie
(tvec, tvec_base, timer_lists). Perhaps we should be looking to move
to a set of time based lists for efficiently tracking what inodes
should be written back at a given background writeback interval
rather than trying to keep everything on the one list.
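
Roughly, such time-bucketed tracking might look like the sketch below -
the bucket count, granularity and every name here are invented for
illustration, in the spirit of the timer wheel, not proposed code:

#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define WB_BUCKETS		64
#define WB_BUCKET_JIFFIES	(HZ / 2)	/* 500ms per bucket */

/*
 * Illustrative sketch: hash inodes into coarse time buckets keyed by
 * dirtied_when, so that expiring "everything older than the cutoff"
 * splices whole buckets - O(buckets) - instead of walking and tagging
 * one long ordered list of inodes.
 */
struct wb_time_buckets {
	spinlock_t		lock;
	struct list_head	bucket[WB_BUCKETS];	/* INIT_LIST_HEAD at setup */
};

static struct list_head *wb_bucket(struct wb_time_buckets *b,
				   unsigned long dirtied_when)
{
	return &b->bucket[(dirtied_when / WB_BUCKET_JIFFIES) % WB_BUCKETS];
}

/* On dirtying, an inode is filed by its timestamp:
 *	list_add_tail(&inode->i_wb_list, wb_bucket(b, inode->dirtied_when));
 */

static void wb_expire(struct wb_time_buckets *b, unsigned long cutoff,
		      struct list_head *io_list)
{
	int i;

	spin_lock(&b->lock);
	/*
	 * Splice every bucket strictly older than the cutoff interval.
	 * As with the timer wheel, entries older than one full rotation
	 * alias into current buckets; a real implementation needs
	 * cascading or an overflow list.
	 */
	for (i = 1; i < WB_BUCKETS; i++)
		list_splice_tail_init(wb_bucket(b, cutoff - i * WB_BUCKET_JIFFIES),
				      io_list);
	spin_unlock(&b->lock);
}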

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4 RESEND] writeback: Dirty list handling changes
  2014-05-16  0:47   ` Dave Chinner
@ 2014-05-16  9:27     ` Christoph Hellwig
  2014-05-16 22:42       ` Dave Chinner
  2014-05-19 16:47     ` Jan Kara
  1 sibling, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2014-05-16  9:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, Wu Fengguang, Andrew Morton

> Seems to me that this problem has been solved elsewhere in the
> kernel, like for tracking of timers to expire on a given jiffie
> (tvec, tvec_base, timer_lists). Perhaps we should be looking to move
> to a set of time based lists for efficiently tracking what inodes
> should be written back at a given background writeback interval
> rather than trying to keep everything on the one list.

It's also pretty similar to the mru cache we use for filestreams in XFS,
except that we'd start writeback instead of deleting items.  The whole
bucket scheme there should also help with I/O batching.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4 RESEND] writeback: Dirty list handling changes
  2014-05-16  9:27     ` Christoph Hellwig
@ 2014-05-16 22:42       ` Dave Chinner
  0 siblings, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2014-05-16 22:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jan Kara, linux-fsdevel, Wu Fengguang, Andrew Morton

On Fri, May 16, 2014 at 02:27:20AM -0700, Christoph Hellwig wrote:
> > Seems to me that this problem has been solved elsewhere in the
> > kernel, like for tracking of timers to expire on a given jiffie
> > (tvec, tvec_base, timer_lists). Perhaps we should be looking to move
> > to a set of time based lists for efficiently tracking what inodes
> > should be written back at a given background writeback interval
> > rather than trying to keep everything on the one list.
> 
> It's also pretty similar to the mru cache we use for filestreams in XFS,
> except that we'd start writeback instead of deleting items.  The whole
> bucket scheme there should also help with I/O batching.

*nod*

I thought about that, but then thought that an example from a core
subsystem would be better ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4 RESEND] writeback: Dirty list handling changes
  2014-05-16  0:47   ` Dave Chinner
  2014-05-16  9:27     ` Christoph Hellwig
@ 2014-05-19 16:47     ` Jan Kara
  1 sibling, 0 replies; 12+ messages in thread
From: Jan Kara @ 2014-05-19 16:47 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jan Kara, linux-fsdevel, Wu Fengguang, Andrew Morton

On Fri 16-05-14 10:47:54, Dave Chinner wrote:
> On Fri, May 16, 2014 at 09:55:14AM +1000, Dave Chinner wrote:
> > On Thu, May 15, 2014 at 05:41:53PM +0200, Jan Kara wrote:
> > >   Hello,
> > > 
> > >   so I was recently thinking about how writeback code shuffles inodes between
> > > lists and also how redirty_tail() clobbers dirtied_when timestamp (which broke
> > > my sync(2) optimization). This patch series came out of that. Patch 1 is a
> > > clear win and just needs an independent review that I didn't forget about
> > > something. Patch 3 changes writeback list handling - IMHO it makes the logic
> > > somewhat more straightforward as we don't have to bother shuffling inodes
> > > between lists and we also don't need to clobber dirtied_when timestamp.
> > > But opinions may differ...
> > > 
> > > Patches passed an xfstests run and I did some basic writeback tests using tiobench
> > > and some artificial sync livelock tests to verify nothing regressed. So I'd
> > > be happy if people could have a look.
> > 
> > Performance regresses significantly.
> > 
> > Test is on a 16p/16GB VM with a sparse 100TB XFS filesystem backed
> > by a pair of SSDs in RAID0:
> > 
> > ./fs_mark  -D  10000  -S0  -n  10000  -s  4096  -L  120  -d
> > /mnt/scratch/0  -d  /mnt/scratch/1  -d  /mnt/scratch/2  -d
> > /mnt/scratch/3  -d  /mnt/scratch/4  -d  /mnt/scratch/5  -d
> > /mnt/scratch/6  -d  /mnt/scratch/7
> > 
> > That creates 10 million 4k files with 16 threads and 10000 files per
> > directory. No sync/fsync is done, so it's a pure background
> > writeback workload.
> > 
> > For 0-400,000 files, it runs in memory; at 400-800k files, background
> > writeback is occurring; at > 800k files, foreground throttling is
> > occurring.
> > 
> > The file create rates and write IOPS/bw are:
> > 
> >                   vanilla                  patched
> > load point     files  iops  bw          files  iops  bw
> > < bg thres      120k     0  0            110k     0  0
> > < fg thres      120k   37k  210MB/s       60k   20k  130MB/s
> > sustained        36k   37k  210MB/s       25k   28k  150MB/s
> > 
> > 
> > The average create rate is 40k (vanilla) vs 28k (patched). Wall
> > times:
> > 
> >          vanilla          patched
> > real     4m27.475s        6m29.364s
> > user     1m7.072s         1m3.590s
> > sys      10m0.836s        22m34.362s
> > 
> > The new code burns more than twice the system CPU whilst going
> > significantly slower.
> 
> OK, so it looks like we've got a case of lock contention:
> 
>   48.37%  [kernel]  [k] _raw_spin_lock
>    - _raw_spin_lock
>       - 96.32% __mark_inode_dirty
>          - 99.99% __set_page_dirty
>               mark_buffer_dirty
>               __block_commit_write.isra.25
>               block_write_end
>               generic_write_end
>               xfs_vm_write_end
>               generic_perform_write
>               xfs_file_buffered_aio_write
>               xfs_file_aio_write
>               do_sync_write
>               vfs_write
>               sys_write
>               tracesys
>               __GI___libc_write
>       + 1.80% xfs_log_commit_cil
> 
> Which is almost certainly the bdi->wb.list_lock.
> 
> -   6.70%  [kernel]  [k] tag_for_io
>      tag_for_io
>      wb_writeback
>      bdi_writeback_workfn
>      process_one_work
>      worker_thread
>      kthread
>      ret_from_fork
> 
> But that's still a lot of CPU in tag_for_io().
> 
> Ah, that's the cause. tag_for_io() holds the bdi->wb.list_lock
> for the entire list walk tagging inodes. That explains why
> performance doesn't start to drop until writeback begins. Ouch:
> 
> writeback_tag_for_io: bdi 253:32: older=4295532768 age=56 tagged=14092 reason=background
> writeback_tag_for_io: bdi 253:32: older=4295532808 age=56 tagged=13556 reason=background
> writeback_tag_for_io: bdi 253:32: older=4295532848 age=56 tagged=12387 reason=background
> writeback_tag_for_io: bdi 253:32: older=4295532888 age=60 tagged=11414 reason=background
> writeback_tag_for_io: bdi 253:32: older=4295532929 age=60 tagged=11128 reason=background
> writeback_tag_for_io: bdi 253:32: older=4295532970 age=60 tagged=10457 reason=background
> 
> We're holding the lock while we walk 10,000+ inodes at a time and
> the other 15 CPUs are spinning on that.
> 
> List walking and tagging like this is never going to scale. I think
> we do need multiple lists, but not like we currently have. Right now
> the problem is that inodes of different dirtied_when sit on the same
> ordered list, and we need to find the first inode with a specific
> dirtied_when value within the list efficiently to be able to
> determine what inodes need to be written back.
  Thanks for the data and the analysis. I'm aware of this problem, but
previously we walked the list in move_expired_inodes(), so I wouldn't have
expected my patches to make this any worse... After some thought, I guess
the inode tagging with its i_lock acquisition ends up being more expensive
than the plain list_move() we did previously.
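
  For illustration, the two shapes being compared look roughly like
this (I_TO_WRITE is an invented flag name - the actual tagging code
from patch 3 isn't quoted in this thread):

	/* before: expiry was one list operation under list_lock */
	spin_lock(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_io);
	spin_unlock(&wb->list_lock);

	/*
	 * after: tagging also takes each inode's i_lock inside the
	 * walk - an extra lock round-trip and cacheline per inode
	 */
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list) {
		spin_lock(&inode->i_lock);
		inode->i_state |= I_TO_WRITE;	/* invented flag */
		spin_unlock(&inode->i_lock);
	}
	spin_unlock(&wb->list_lock);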

> Seems to me that this problem has been solved elsewhere in the
> kernel, like for tracking of timers to expire on a given jiffie
> (tvec, tvec_base, timer_lists). Perhaps we should be looking to move
> to a set of time based lists for efficiently tracking what inodes
> should be written back at a given background writeback interval
> rather than trying to keep everything on the one list.
  Thanks for the idea. I'll think about this - the trouble with writeback
lists is that we need to select inodes either based on time or based on
superblock, depending on the type of writeback. But we can surely do
better than we do now :)

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-05-19 16:47 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-15 15:41 [PATCH 0/4 RESEND] writeback: Dirty list handling changes Jan Kara
2014-05-15 15:41 ` [PATCH 1/4] writeback: Get rid of superblock pinning Jan Kara
2014-05-15 15:41 ` [PATCH 2/4] writeback: Move removal from writeback list in evict() Jan Kara
2014-05-15 15:41 ` [PATCH 3/4] writeback: Replace several writeback lists with inode tagging Jan Kara
2014-05-15 15:41 ` [PATCH 4/4] writeback: Remove pages_skipped from writeback_control Jan Kara
2014-05-15 23:05   ` Dave Chinner
2014-05-15 23:55 ` [PATCH 0/4 RESEND] writeback: Dirty list handling changes Dave Chinner
2014-05-16  0:47   ` Dave Chinner
2014-05-16  9:27     ` Christoph Hellwig
2014-05-16 22:42       ` Dave Chinner
2014-05-19 16:47     ` Jan Kara
  -- strict thread matches above, loose matches on Subject: below --
2014-03-20 14:30 [PATCH 0/4 RFC] " Jan Kara
2014-03-20 14:30 ` [PATCH 4/4] writeback: Remove pages_skipped from writeback_control Jan Kara
