From: Wu Fengguang <fengguang.wu@intel.com>
To: Jan Kara <jack@suse.cz>
Cc: "linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
Christoph Hellwig <hch@infradead.org>,
Dave Chinner <david@fromorbit.com>
Subject: Re: [PATCH 1/2] writeback: Improve busyloop prevention
Date: Thu, 20 Oct 2011 20:09:09 +0800 [thread overview]
Message-ID: <20111020120909.GA8193@localhost> (raw)
In-Reply-To: <20111019115630.GA22266@quack.suse.cz>
Jan,
I tried the below combined patch on top of the ioless one, and found some
minor regressions. I studied the thresh=1G/ext3-1dd case in particular
and found that nr_writeback and the iostat avgrq-sz drop from time to time.
I'll try to bisect the changeset.
3.1.0-rc9-ioless-full-next-20111014+ 3.1.0-rc9-ioless-full-more_io_wait-next-20111014+
------------------------ ------------------------
56.47 -0.6% 56.13 thresh=100M/btrfs-10dd-4k-8p-4096M-100M:10-X
56.28 -0.4% 56.07 thresh=100M/btrfs-1dd-4k-8p-4096M-100M:10-X
56.11 -0.1% 56.05 thresh=100M/btrfs-2dd-4k-8p-4096M-100M:10-X
37.86 +1.8% 38.54 thresh=100M/ext3-10dd-4k-8p-4096M-100M:10-X
45.91 +0.7% 46.22 thresh=100M/ext3-1dd-4k-8p-4096M-100M:10-X
41.87 +0.8% 42.19 thresh=100M/ext3-2dd-4k-8p-4096M-100M:10-X
45.68 -0.4% 45.50 thresh=100M/ext4-10dd-4k-8p-4096M-100M:10-X
55.74 -2.2% 54.51 thresh=100M/ext4-1dd-4k-8p-4096M-100M:10-X
46.20 -4.8% 43.98 thresh=100M/xfs-10dd-4k-8p-4096M-100M:10-X
55.72 +0.1% 55.76 thresh=100M/xfs-1dd-4k-8p-4096M-100M:10-X
54.01 -2.0% 52.94 thresh=100M/xfs-2dd-4k-8p-4096M-100M:10-X
55.08 -1.0% 54.52 thresh=1G/btrfs-100dd-4k-8p-4096M-1024M:10-X
55.49 -1.0% 54.94 thresh=1G/btrfs-10dd-4k-8p-4096M-1024M:10-X
55.38 -2.7% 53.91 thresh=1G/btrfs-1dd-4k-8p-4096M-1024M:10-X
36.70 -1.5% 36.15 thresh=1G/ext3-100dd-4k-8p-4096M-1024M:10-X
40.64 -5.9% 38.25 thresh=1G/ext3-10dd-4k-8p-4096M-1024M:10-X
48.65 -6.9% 45.30 thresh=1G/ext3-1dd-4k-8p-4096M-1024M:10-X
49.84 -3.2% 48.23 thresh=1G/ext4-100dd-4k-8p-4096M-1024M:10-X
56.03 -3.3% 54.21 thresh=1G/ext4-10dd-4k-8p-4096M-1024M:10-X
57.42 -2.3% 56.07 thresh=1G/ext4-1dd-4k-8p-4096M-1024M:10-X
45.74 -1.4% 45.12 thresh=1G/xfs-100dd-4k-8p-4096M-1024M:10-X
54.19 -0.5% 53.94 thresh=1G/xfs-10dd-4k-8p-4096M-1024M:10-X
55.93 -0.5% 55.66 thresh=1G/xfs-1dd-4k-8p-4096M-1024M:10-X
2.77 +27.8% 3.54 thresh=1M/btrfs-10dd-4k-8p-4096M-1M:10-X
2.20 -15.5% 1.86 thresh=1M/btrfs-1dd-4k-8p-4096M-1M:10-X
2.42 -1.3% 2.39 thresh=1M/btrfs-2dd-4k-8p-4096M-1M:10-X
28.91 +1.9% 29.47 thresh=1M/ext3-10dd-4k-8p-4096M-1M:10-X
45.02 +1.1% 45.50 thresh=1M/ext3-1dd-4k-8p-4096M-1M:10-X
40.91 +0.4% 41.09 thresh=1M/ext3-2dd-4k-8p-4096M-1M:10-X
31.82 +2.3% 32.56 thresh=1M/ext4-10dd-4k-8p-4096M-1M:10-X
52.33 -0.9% 51.85 thresh=1M/ext4-1dd-4k-8p-4096M-1M:10-X
28.43 +1.2% 28.77 thresh=1M/xfs-10dd-4k-8p-4096M-1M:10-X
52.93 -3.8% 50.90 thresh=1M/xfs-1dd-4k-8p-4096M-1M:10-X
46.87 -0.0% 46.85 thresh=1M/xfs-2dd-4k-8p-4096M-1M:10-X
54.54 -1.3% 53.82 thresh=8M/btrfs-10dd-4k-8p-4096M-8M:10-X
56.60 -1.4% 55.80 thresh=8M/btrfs-1dd-4k-8p-4096M-8M:10-X
56.21 -0.4% 55.96 thresh=8M/btrfs-2dd-4k-8p-4096M-8M:10-X
32.54 +0.2% 32.62 thresh=8M/ext3-10dd-4k-8p-4096M-8M:10-X
46.01 -1.0% 45.55 thresh=8M/ext3-1dd-4k-8p-4096M-8M:10-X
44.13 -0.6% 43.87 thresh=8M/ext3-2dd-4k-8p-4096M-8M:10-X
35.78 -0.4% 35.63 thresh=8M/ext4-10dd-4k-8p-4096M-8M:10-X
55.29 +0.2% 55.38 thresh=8M/ext4-1dd-4k-8p-4096M-8M:10-X
31.21 -0.8% 30.96 thresh=8M/xfs-10dd-4k-8p-4096M-8M:10-X
54.10 -0.3% 53.95 thresh=8M/xfs-1dd-4k-8p-4096M-8M:10-X
46.97 +0.5% 47.20 thresh=8M/xfs-2dd-4k-8p-4096M-8M:10-X
2010.92 -1.1% 1989.70 TOTAL write_bw
--- linux-next.orig/fs/fs-writeback.c 2011-10-20 19:26:37.000000000 +0800
+++ linux-next/fs/fs-writeback.c 2011-10-20 20:00:22.000000000 +0800
@@ -234,6 +234,15 @@ static void requeue_io(struct inode *ino
list_move(&inode->i_wb_list, &wb->b_more_io);
}
+/*
+ * The inode should be retried in an opportunistic way.
+ */
+static void requeue_io_wait(struct inode *inode, struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ list_move(&inode->i_wb_list, &wb->b_more_io_wait);
+}
+
static void inode_sync_complete(struct inode *inode)
{
/*
@@ -321,6 +330,7 @@ static void queue_io(struct bdi_writebac
int moved;
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
+ list_splice_init(&wb->b_more_io_wait, &wb->b_io);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
trace_writeback_queue_io(wb, work, moved);
}
@@ -470,7 +480,7 @@ writeback_single_inode(struct inode *ino
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode, wb);
+ requeue_io_wait(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
@@ -478,8 +488,18 @@ writeback_single_inode(struct inode *ino
* operations, such as delayed allocation during
* submission or metadata updates after data IO
* completion.
+ *
+ * For the latter case it is very important to give
+ * the inode another turn on b_more_io instead of
+ * redirtying it. Constantly moving dirtied_when
+ * forward will prevent us from ever writing out
+ * the metadata dirtied in the I/O completion handler.
+ *
+ * For files on XFS that constantly get appended to
+ * calling redirty_tail means they will never get
+ * their updated i_size written out.
*/
- redirty_tail(inode, wb);
+ requeue_io_wait(inode, wb);
} else {
/*
* The inode is clean. At this point we either have
@@ -600,7 +620,7 @@ static long writeback_sb_inodes(struct s
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode, wb);
+ requeue_io_wait(inode, wb);
}
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
@@ -637,7 +657,7 @@ static long __writeback_inodes_wb(struct
* s_umount being grabbed by someone else. Don't use
* requeue_io() to avoid busy retrying the inode/sb.
*/
- redirty_tail(inode, wb);
+ requeue_io_wait(inode, wb);
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
@@ -720,10 +740,10 @@ static long wb_writeback(struct bdi_writ
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
- long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;
+ long total_progress = 0;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
@@ -753,11 +773,17 @@ static long wb_writeback(struct bdi_writ
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
+ /*
+ * Kupdate and background works are special and we want to
+ * include all inodes that need writing. Livelock avoidance is
+ * handled by these works yielding to any other work so we are
+ * safe.
+ */
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
- work->older_than_this = &oldest_jif;
- }
+ } else if (work->for_background)
+ oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
@@ -767,6 +793,7 @@ static long wb_writeback(struct bdi_writ
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);
+ total_progress += progress;
wb_update_bandwidth(wb, wb_start);
@@ -800,7 +827,7 @@ static long wb_writeback(struct bdi_writ
}
spin_unlock(&wb->list_lock);
- return nr_pages - work->nr_pages;
+ return total_progress;
}
/*
@@ -863,7 +890,7 @@ static long wb_check_old_data_flush(stru
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
- if (time_before(jiffies, expired))
+ if (time_before(jiffies, expired) && list_empty(&wb->b_more_io_wait))
return 0;
wb->last_old_flush = jiffies;
@@ -934,7 +961,11 @@ int bdi_writeback_thread(void *data)
{
struct bdi_writeback *wb = data;
struct backing_dev_info *bdi = wb->bdi;
- long pages_written;
+ long progress;
+ unsigned int pause = 1;
+ unsigned int max_pause = dirty_writeback_interval ?
+ msecs_to_jiffies(dirty_writeback_interval * 10) :
+ HZ;
current->flags |= PF_SWAPWRITE;
set_freezable();
@@ -954,12 +985,12 @@ int bdi_writeback_thread(void *data)
*/
del_timer(&wb->wakeup_timer);
- pages_written = wb_do_writeback(wb, 0);
+ progress = wb_do_writeback(wb, 0);
- trace_writeback_pages_written(pages_written);
-
- if (pages_written)
+ if (progress) {
wb->last_active = jiffies;
+ pause = 1;
+ }
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
@@ -967,8 +998,11 @@ int bdi_writeback_thread(void *data)
continue;
}
- if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+ if (!list_empty(&wb->b_more_io_wait) && pause < max_pause) {
+ schedule_timeout(pause);
+ pause <<= 1;
+ } else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+ schedule_timeout(max_pause);
else {
/*
* We have nothing to do, so can go sleep without any
--- linux-next.orig/include/linux/backing-dev.h 2011-10-20 19:26:37.000000000 +0800
+++ linux-next/include/linux/backing-dev.h 2011-10-20 19:29:39.000000000 +0800
@@ -59,6 +59,7 @@ struct bdi_writeback {
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
+ struct list_head b_more_io_wait;/* opportunistic retry io */
spinlock_t list_lock; /* protects the b_* lists */
};
@@ -133,9 +134,10 @@ extern struct list_head bdi_pending_list
static inline int wb_has_dirty_io(struct bdi_writeback *wb)
{
- return !list_empty(&wb->b_dirty) ||
- !list_empty(&wb->b_io) ||
- !list_empty(&wb->b_more_io);
+ return !list_empty(&wb->b_dirty) ||
+ !list_empty(&wb->b_io) ||
+ !list_empty(&wb->b_more_io) ||
+ !list_empty(&wb->b_more_io_wait);
}
static inline void __add_bdi_stat(struct backing_dev_info *bdi,
--- linux-next.orig/mm/backing-dev.c 2011-10-20 19:26:37.000000000 +0800
+++ linux-next/mm/backing-dev.c 2011-10-20 19:29:39.000000000 +0800
@@ -74,10 +74,10 @@ static int bdi_debug_stats_show(struct s
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_more_io_wait;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = nr_more_io_wait = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
@@ -85,6 +85,8 @@ static int bdi_debug_stats_show(struct s
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
+ list_for_each_entry(inode, &wb->b_more_io_wait, i_wb_list)
+ nr_more_io_wait++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -103,6 +105,7 @@ static int bdi_debug_stats_show(struct s
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
+ "b_more_io_wait: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -116,6 +119,7 @@ static int bdi_debug_stats_show(struct s
nr_dirty,
nr_io,
nr_more_io,
+ nr_more_io_wait,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -651,6 +655,7 @@ static void bdi_wb_init(struct bdi_write
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_more_io_wait);
spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
@@ -718,6 +723,7 @@ void bdi_destroy(struct backing_dev_info
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+ list_splice(&bdi->wb.b_more_io_wait, &dst->b_more_io_wait);
spin_unlock(&bdi->wb.list_lock);
spin_unlock(&dst->list_lock);
}
next prev parent reply other threads:[~2011-10-20 12:09 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-10-12 20:57 [PATCH 0/2 v4] writeback: Improve busyloop prevention and inode requeueing Jan Kara
2011-10-12 20:57 ` [PATCH 1/2] writeback: Improve busyloop prevention Jan Kara
2011-10-13 14:26 ` Wu Fengguang
2011-10-13 20:13 ` Jan Kara
2011-10-14 7:18 ` Christoph Hellwig
2011-10-14 19:31 ` Chris Mason
[not found] ` <20111013143939.GA9691@localhost>
2011-10-13 20:18 ` Jan Kara
2011-10-14 16:00 ` Wu Fengguang
2011-10-14 16:28 ` Wu Fengguang
2011-10-18 0:51 ` Jan Kara
2011-10-18 14:35 ` Wu Fengguang
2011-10-19 11:56 ` Jan Kara
2011-10-19 13:25 ` Wu Fengguang
2011-10-19 13:30 ` Wu Fengguang
2011-10-19 13:35 ` Wu Fengguang
2011-10-20 12:09 ` Wu Fengguang [this message]
2011-10-20 12:33 ` Wu Fengguang
2011-10-20 13:39 ` Wu Fengguang
2011-10-20 22:26 ` Jan Kara
2011-10-22 4:20 ` Wu Fengguang
2011-10-24 15:45 ` Jan Kara
[not found] ` <20111027063133.GA10146@localhost>
2011-10-27 20:31 ` Jan Kara
[not found] ` <20111101134231.GA31718@localhost>
2011-11-01 21:53 ` Jan Kara
2011-11-02 17:25 ` Wu Fengguang
[not found] ` <20111102185603.GA4034@localhost>
2011-11-03 1:51 ` Jan Kara
2011-11-03 14:52 ` Wu Fengguang
[not found] ` <20111104152054.GA11577@localhost>
2011-11-08 23:52 ` Jan Kara
2011-11-09 13:51 ` Wu Fengguang
2011-11-10 14:50 ` Jan Kara
2011-12-05 8:02 ` Wu Fengguang
2011-12-07 10:13 ` Jan Kara
2011-12-07 11:45 ` Wu Fengguang
[not found] ` <20111027064745.GA14017@localhost>
2011-10-27 20:50 ` Jan Kara
2011-10-20 9:46 ` Christoph Hellwig
2011-10-20 15:32 ` Jan Kara
2011-10-15 12:41 ` Wu Fengguang
2011-10-12 20:57 ` [PATCH 2/2] writeback: Replace some redirty_tail() calls with requeue_io() Jan Kara
2011-10-13 14:30 ` Wu Fengguang
2011-10-13 14:15 ` [PATCH 0/2 v4] writeback: Improve busyloop prevention and inode requeueing Wu Fengguang
-- strict thread matches above, loose matches on Subject: below --
2011-09-08 0:44 [PATCH 1/2] writeback: Improve busyloop prevention Jan Kara
2011-09-08 0:57 ` Wu Fengguang
2011-09-08 13:49 ` Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111020120909.GA8193@localhost \
--to=fengguang.wu@intel.com \
--cc=david@fromorbit.com \
--cc=hch@infradead.org \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).