* [PATCH 1/4] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE
2009-09-11 11:50 [PATCH 0/4] Post merge per-bdi writeback patches Jens Axboe
@ 2009-09-11 11:50 ` Jens Axboe
2009-09-11 17:59 ` Christoph Hellwig
2009-09-11 11:50 ` [PATCH 2/4] Assign bdi in super_block Jens Axboe
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 11:50 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: chris.mason, hch, tytso, akpm, jack, trond.myklebust, Jens Axboe
From: Christoph Hellwig <hch@infradead.org>
Since it's an opportunistic writeback and not a data integrity action,
don't punt to blocking writeback. Just wakeup the thread and it will
flush old data.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
fs/fs-writeback.c | 46 ++++++++++++++--------------------------------
1 files changed, 14 insertions(+), 32 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da86ef5..1873fd0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,13 +75,6 @@ static inline void bdi_work_init(struct bdi_work *work,
work->state = WS_USED;
}
-static inline void bdi_work_init_on_stack(struct bdi_work *work,
- struct writeback_control *wbc)
-{
- bdi_work_init(work, wbc);
- work->state |= WS_ONSTACK;
-}
-
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
@@ -207,34 +200,23 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
void bdi_start_writeback(struct writeback_control *wbc)
{
- const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
- struct bdi_work work_stack, *work = NULL;
-
- if (!must_wait)
- work = bdi_alloc_work(wbc);
+ /*
+ * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
+ * bdi_queue_work() will wake up the thread and flush old data. This
+ * should ensure some amount of progress in freeing memory.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL) {
+ struct bdi_work *w = bdi_alloc_work(wbc);
- if (!work) {
- work = &work_stack;
- bdi_work_init_on_stack(work, wbc);
- }
+ bdi_queue_work(wbc->bdi, w);
+ } else {
+ struct bdi_work work;
- bdi_queue_work(wbc->bdi, work);
+ bdi_work_init(&work, wbc);
+ work.state |= WS_ONSTACK;
- /*
- * If the sync mode is WB_SYNC_ALL, block waiting for the work to
- * complete. If not, we only need to wait for the work to be started,
- * if we allocated it on-stack. We use the same mechanism, if the
- * wait bit is set in the bdi_work struct, then threads will not
- * clear pending until after they are done.
- *
- * Note that work == &work_stack if must_wait is true, so we don't
- * need to do call_rcu() here ever, since the completion path will
- * have done that for us.
- */
- if (must_wait || work == &work_stack) {
- bdi_wait_on_work_clear(work);
- if (work != &work_stack)
- call_rcu(&work->rcu_head, bdi_work_free);
+ bdi_queue_work(wbc->bdi, &work);
+ bdi_wait_on_work_clear(&work);
}
}
--
1.6.4.1.207.g68ea
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 1/4] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE
2009-09-11 11:50 ` [PATCH 1/4] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE Jens Axboe
@ 2009-09-11 17:59 ` Christoph Hellwig
0 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2009-09-11 17:59 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-kernel, linux-fsdevel, chris.mason, hch, tytso, akpm, jack,
trond.myklebust
> + /*
> + * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
> + * bdi_queue_work() will wake up the thread and flush old data. This
> + * should ensure some amount of progress in freeing memory.
> + */
> + if (wbc->sync_mode != WB_SYNC_ALL) {
> + struct bdi_work *w = bdi_alloc_work(wbc);
>
> + bdi_queue_work(wbc->bdi, w);
> + } else {
> + struct bdi_work work;
>
> + bdi_work_init(&work, wbc);
> + work.state |= WS_ONSTACK;
>
> + bdi_queue_work(wbc->bdi, &work);
> + bdi_wait_on_work_clear(&work);
That's even nice than my version, great.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 2/4] Assign bdi in super_block
2009-09-11 11:50 [PATCH 0/4] Post merge per-bdi writeback patches Jens Axboe
2009-09-11 11:50 ` [PATCH 1/4] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE Jens Axboe
@ 2009-09-11 11:50 ` Jens Axboe
2009-09-11 18:01 ` Christoph Hellwig
2009-09-11 11:50 ` [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout Jens Axboe
2009-09-11 11:50 ` [PATCH 4/4] writeback: use RCU to protect bdi_list Jens Axboe
3 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 11:50 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: chris.mason, hch, tytso, akpm, jack, trond.myklebust, Jens Axboe
We do this automatically in get_sb_bdev() from the set_bdev_super()
callback. Filesystems that have their own private backing_dev_info
must assign that in ->fill_super().
Note that ->s_bdi assignment is required for proper writeback!
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
fs/btrfs/disk-io.c | 1 +
fs/fuse/inode.c | 2 ++
fs/super.c | 6 ++++++
fs/sync.c | 9 ++++++++-
fs/ubifs/super.c | 1 +
include/linux/fs.h | 1 +
6 files changed, 19 insertions(+), 1 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5..8b81927 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
/*
* we set the i_size on the btree inode to the max possible int.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6..e5dbecd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_put_conn;
+ sb->s_bdi = &fc->bdi;
+
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
diff --git a/fs/super.c b/fs/super.c
index 9cda337..b03fea8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
diff --git a/fs/sync.c b/fs/sync.c
index 103cc7f..8582c34 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
*/
static int __sync_filesystem(struct super_block *sb, int wait)
{
+ /*
+ * This should be safe, as we require bdi backing to actually
+ * write out data in the first place
+ */
+ if (!sb->s_bdi)
+ return 0;
+
/* Avoid doing twice syncing and cache pruning for quota sync */
if (!wait) {
writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+ if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
__sync_filesystem(sb, wait);
up_read(&sb->s_umount);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa..c4af069 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_bdi;
+ sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 26da98f..940f2a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1343,6 +1343,7 @@ struct super_block {
int s_nr_dentry_unused; /* # of dentry on lru */
struct block_device *s_bdev;
+ struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct list_head s_instances;
struct quota_info s_dquot; /* Diskquota specific options */
--
1.6.4.1.207.g68ea
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 2/4] Assign bdi in super_block
2009-09-11 11:50 ` [PATCH 2/4] Assign bdi in super_block Jens Axboe
@ 2009-09-11 18:01 ` Christoph Hellwig
2009-09-11 18:12 ` Jens Axboe
0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2009-09-11 18:01 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-kernel, linux-fsdevel, chris.mason, hch, tytso, akpm, jack,
trond.myklebust
On Fri, Sep 11, 2009 at 01:50:30PM +0200, Jens Axboe wrote:
> We do this automatically in get_sb_bdev() from the set_bdev_super()
> callback. Filesystems that have their own private backing_dev_info
> must assign that in ->fill_super().
>
> Note that ->s_bdi assignment is required for proper writeback!
Looks good to me. Can we get rid of BDI_CAP_NO_WRITEBACK with this by
simply not assigning a bdi?
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/4] Assign bdi in super_block
2009-09-11 18:01 ` Christoph Hellwig
@ 2009-09-11 18:12 ` Jens Axboe
2009-09-11 18:16 ` Christoph Hellwig
0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 18:12 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-kernel, linux-fsdevel, chris.mason, tytso, akpm, jack,
trond.myklebust
On Fri, Sep 11 2009, Christoph Hellwig wrote:
> On Fri, Sep 11, 2009 at 01:50:30PM +0200, Jens Axboe wrote:
> > We do this automatically in get_sb_bdev() from the set_bdev_super()
> > callback. Filesystems that have their own private backing_dev_info
> > must assign that in ->fill_super().
> >
> > Note that ->s_bdi assignment is required for proper writeback!
>
> Looks good to me. Can we get rid of BDI_CAP_NO_WRITEBACK with this by
> simply not assigning a bdi?
Good question, I was indeed looking for an sb equivalent of that bdi
flag. And now ->s_bdi being set or not is indeed that. Some of our
generated writeback originates at the bdi level though, so we may or may
not have the sb. From a quick look at fs-writeback.c, it looks feasible
though. I'll try.
Can I take that as an acked-by?
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/4] Assign bdi in super_block
2009-09-11 18:12 ` Jens Axboe
@ 2009-09-11 18:16 ` Christoph Hellwig
0 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2009-09-11 18:16 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, chris.mason,
tytso, akpm, jack, trond.myklebust
On Fri, Sep 11, 2009 at 08:12:27PM +0200, Jens Axboe wrote:
> Good question, I was indeed looking for an sb equivalent of that bdi
> flag. And now ->s_bdi being set or not is indeed that. Some of our
> generated writeback originates at the bdi level though, so we may or may
> not have the sb. From a quick look at fs-writeback.c, it looks feasible
> though. I'll try.
>
> Can I take that as an acked-by?
Yes.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout
2009-09-11 11:50 [PATCH 0/4] Post merge per-bdi writeback patches Jens Axboe
2009-09-11 11:50 ` [PATCH 1/4] writeback: merely wakeup flusher thread if work allocation fails for WB_SYNC_NONE Jens Axboe
2009-09-11 11:50 ` [PATCH 2/4] Assign bdi in super_block Jens Axboe
@ 2009-09-11 11:50 ` Jens Axboe
2009-09-11 18:09 ` Christoph Hellwig
2009-09-11 11:50 ` [PATCH 4/4] writeback: use RCU to protect bdi_list Jens Axboe
3 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 11:50 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: chris.mason, hch, tytso, akpm, jack, trond.myklebust, Jens Axboe
Data integrity writeback must use bdi_start_writeback() and ensure
that wbc->sb and wbc->bdi are set.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
fs/fs-writeback.c | 69 ++++++++++------------------------------------------
1 files changed, 14 insertions(+), 55 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1873fd0..5d4bd1c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -187,7 +187,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
TASK_UNINTERRUPTIBLE);
}
-static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+ struct writeback_control *wbc)
{
struct bdi_work *work;
@@ -195,7 +196,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
if (work)
bdi_work_init(work, wbc);
- return work;
+ bdi_queue_work(bdi, work);
}
void bdi_start_writeback(struct writeback_control *wbc)
@@ -205,11 +206,9 @@ void bdi_start_writeback(struct writeback_control *wbc)
* bdi_queue_work() will wake up the thread and flush old data. This
* should ensure some amount of progress in freeing memory.
*/
- if (wbc->sync_mode != WB_SYNC_ALL) {
- struct bdi_work *w = bdi_alloc_work(wbc);
-
- bdi_queue_work(wbc->bdi, w);
- } else {
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ bdi_alloc_queue_work(wbc->bdi, wbc);
+ else {
struct bdi_work work;
bdi_work_init(&work, wbc);
@@ -840,67 +839,26 @@ int bdi_writeback_task(struct bdi_writeback *wb)
}
/*
- * Schedule writeback for all backing devices. Expensive! If this is a data
- * integrity operation, writeback will be complete when this returns. If
- * we are simply called for WB_SYNC_NONE, then writeback will merely be
- * scheduled to run.
+ * Schedule writeback for all backing devices. Can only be used for
+ * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
+ * and pass in the superblock.
*/
static void bdi_writeback_all(struct writeback_control *wbc)
{
- const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
struct backing_dev_info *bdi;
- struct bdi_work *work;
- LIST_HEAD(list);
-restart:
+ WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
spin_lock(&bdi_lock);
list_for_each_entry(bdi, &bdi_list, bdi_list) {
- struct bdi_work *work;
-
if (!bdi_has_dirty_io(bdi))
continue;
- /*
- * If work allocation fails, do the writes inline. We drop
- * the lock and restart the list writeout. This should be OK,
- * since this happens rarely and because the writeout should
- * eventually make more free memory available.
- */
- work = bdi_alloc_work(wbc);
- if (!work) {
- struct writeback_control __wbc;
-
- /*
- * Not a data integrity writeout, just continue
- */
- if (!must_wait)
- continue;
-
- spin_unlock(&bdi_lock);
- __wbc = *wbc;
- __wbc.bdi = bdi;
- writeback_inodes_wbc(&__wbc);
- goto restart;
- }
- if (must_wait)
- list_add_tail(&work->wait_list, &list);
-
- bdi_queue_work(bdi, work);
+ bdi_alloc_queue_work(bdi, wbc);
}
spin_unlock(&bdi_lock);
-
- /*
- * If this is for WB_SYNC_ALL, wait for pending work to complete
- * before returning.
- */
- while (!list_empty(&list)) {
- work = list_entry(list.next, struct bdi_work, wait_list);
- list_del(&work->wait_list);
- bdi_wait_on_work_clear(work);
- call_rcu(&work->rcu_head, bdi_work_free);
- }
}
/*
@@ -1157,6 +1115,7 @@ long sync_inodes_sb(struct super_block *sb)
{
struct writeback_control wbc = {
.sb = sb,
+ .bdi = sb->s_bdi,
.sync_mode = WB_SYNC_ALL,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -1164,7 +1123,7 @@ long sync_inodes_sb(struct super_block *sb)
long nr_to_write = LONG_MAX; /* doesn't actually matter */
wbc.nr_to_write = nr_to_write;
- bdi_writeback_all(&wbc);
+ bdi_start_writeback(&wbc);
wait_sb_inodes(&wbc);
return nr_to_write - wbc.nr_to_write;
}
--
1.6.4.1.207.g68ea
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout
2009-09-11 11:50 ` [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout Jens Axboe
@ 2009-09-11 18:09 ` Christoph Hellwig
2009-09-11 18:14 ` Jens Axboe
2009-09-13 18:20 ` Jens Axboe
0 siblings, 2 replies; 12+ messages in thread
From: Christoph Hellwig @ 2009-09-11 18:09 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-kernel, linux-fsdevel, chris.mason, hch, tytso, akpm, jack,
trond.myklebust
> -static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> +static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
> + struct writeback_control *wbc)
> {
> struct bdi_work *work;
>
> @@ -195,7 +196,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> if (work)
> bdi_work_init(work, wbc);
>
> - return work;
> + bdi_queue_work(bdi, work);
This is now the only caller of bdi_queue_work that has a NULL work
argument. I would recommend removing the !work half of bdi_queue_work
and just inline it into this function (or make it a separate helper).
> /*
> + * Schedule writeback for all backing devices. Can only be used for
> + * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
> + * and pass in the superblock.
> */
> static void bdi_writeback_all(struct writeback_control *wbc)
> {
> struct backing_dev_info *bdi;
>
> + WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
> +
> spin_lock(&bdi_lock);
>
> list_for_each_entry(bdi, &bdi_list, bdi_list) {
> if (!bdi_has_dirty_io(bdi))
> continue;
>
> + bdi_alloc_queue_work(bdi, wbc);
> }
Much nicer than before. Could be even further simplified to:
if (bdi_has_dirty_io(bdi))
bdi_alloc_queue_work(bdi, wbc);
> @@ -1157,6 +1115,7 @@ long sync_inodes_sb(struct super_block *sb)
> {
> struct writeback_control wbc = {
> .sb = sb,
> + .bdi = sb->s_bdi,
> .sync_mode = WB_SYNC_ALL,
> .range_start = 0,
> .range_end = LLONG_MAX,
> @@ -1164,7 +1123,7 @@ long sync_inodes_sb(struct super_block *sb)
> long nr_to_write = LONG_MAX; /* doesn't actually matter */
>
> wbc.nr_to_write = nr_to_write;
> - bdi_writeback_all(&wbc);
> + bdi_start_writeback(&wbc);
So here we have a WB_SYNC_ALL caller of bdi_writeback_all and the
only other caller is WB_SYNC_NONE. Given that after patch two those
are entirely different codepathes in bdi_start_writeback I would just
split bdi_start_writeback into two separate functions.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout
2009-09-11 18:09 ` Christoph Hellwig
@ 2009-09-11 18:14 ` Jens Axboe
2009-09-13 18:20 ` Jens Axboe
1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 18:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-kernel, linux-fsdevel, chris.mason, tytso, akpm, jack,
trond.myklebust
On Fri, Sep 11 2009, Christoph Hellwig wrote:
> > -static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> > +static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
> > + struct writeback_control *wbc)
> > {
> > struct bdi_work *work;
> >
> > @@ -195,7 +196,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> > if (work)
> > bdi_work_init(work, wbc);
> >
> > - return work;
> > + bdi_queue_work(bdi, work);
>
> This is now the only caller of bdi_queue_work that has a NULL work
> argument. I would recommend removing the !work half of bdi_queue_work
> and just inline it into this function (or make it a separate helper).
Yep agreed.
> > /*
> > + * Schedule writeback for all backing devices. Can only be used for
> > + * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
> > + * and pass in the superblock.
> > */
> > static void bdi_writeback_all(struct writeback_control *wbc)
> > {
> > struct backing_dev_info *bdi;
> >
> > + WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
> > +
> > spin_lock(&bdi_lock);
> >
> > list_for_each_entry(bdi, &bdi_list, bdi_list) {
> > if (!bdi_has_dirty_io(bdi))
> > continue;
> >
> > + bdi_alloc_queue_work(bdi, wbc);
> > }
>
>
> Much nicer than before. Could be even further simplified to:
>
> if (bdi_has_dirty_io(bdi))
> bdi_alloc_queue_work(bdi, wbc);
Sure, matter of style.
> > @@ -1157,6 +1115,7 @@ long sync_inodes_sb(struct super_block *sb)
> > {
> > struct writeback_control wbc = {
> > .sb = sb,
> > + .bdi = sb->s_bdi,
> > .sync_mode = WB_SYNC_ALL,
> > .range_start = 0,
> > .range_end = LLONG_MAX,
> > @@ -1164,7 +1123,7 @@ long sync_inodes_sb(struct super_block *sb)
> > long nr_to_write = LONG_MAX; /* doesn't actually matter */
> >
> > wbc.nr_to_write = nr_to_write;
> > - bdi_writeback_all(&wbc);
> > + bdi_start_writeback(&wbc);
>
> So here we have a WB_SYNC_ALL caller of bdi_writeback_all and the
> only other caller is WB_SYNC_NONE. Given that after patch two those
> are entirely different codepathes in bdi_start_writeback I would just
> split bdi_start_writeback into two separate functions.
Sure, it'll draw a sharper line between WB_SYNC_NONE and WB_SYNC_ALL.
I'll work on this and the bdi cap dirty flag removal as a follow up.
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout
2009-09-11 18:09 ` Christoph Hellwig
2009-09-11 18:14 ` Jens Axboe
@ 2009-09-13 18:20 ` Jens Axboe
1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2009-09-13 18:20 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-kernel, linux-fsdevel, chris.mason, tytso, akpm, jack,
trond.myklebust
On Fri, Sep 11 2009, Christoph Hellwig wrote:
> > -static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> > +static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
> > + struct writeback_control *wbc)
> > {
> > struct bdi_work *work;
> >
> > @@ -195,7 +196,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
> > if (work)
> > bdi_work_init(work, wbc);
> >
> > - return work;
> > + bdi_queue_work(bdi, work);
>
> This is now the only caller of bdi_queue_work that has a NULL work
> argument. I would recommend removing the !work half of bdi_queue_work
> and just inline it into this function (or make it a separate helper).
http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=cd932613c5b86a5bb2eefee726749acb0ed1a5b8
> > @@ -1157,6 +1115,7 @@ long sync_inodes_sb(struct super_block *sb)
> > {
> > struct writeback_control wbc = {
> > .sb = sb,
> > + .bdi = sb->s_bdi,
> > .sync_mode = WB_SYNC_ALL,
> > .range_start = 0,
> > .range_end = LLONG_MAX,
> > @@ -1164,7 +1123,7 @@ long sync_inodes_sb(struct super_block *sb)
> > long nr_to_write = LONG_MAX; /* doesn't actually matter */
> >
> > wbc.nr_to_write = nr_to_write;
> > - bdi_writeback_all(&wbc);
> > + bdi_start_writeback(&wbc);
>
> So here we have a WB_SYNC_ALL caller of bdi_writeback_all and the
> only other caller is WB_SYNC_NONE. Given that after patch two those
> are entirely different codepathes in bdi_start_writeback I would just
> split bdi_start_writeback into two separate functions.
http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=479364efda68ae0d954f325fa52a3bbe94916783
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 4/4] writeback: use RCU to protect bdi_list
2009-09-11 11:50 [PATCH 0/4] Post merge per-bdi writeback patches Jens Axboe
` (2 preceding siblings ...)
2009-09-11 11:50 ` [PATCH 3/4] writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout Jens Axboe
@ 2009-09-11 11:50 ` Jens Axboe
3 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2009-09-11 11:50 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: chris.mason, hch, tytso, akpm, jack, trond.myklebust, Jens Axboe
Now that bdi_writeback_all() no longer handles integrity writeback,
it doesn't have to block anymore. This means that we can switch
bdi_list reader side protection to RCU.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
fs/fs-writeback.c | 4 +-
include/linux/backing-dev.h | 1 +
mm/backing-dev.c | 76 +++++++++++++++++++++++++++++++------------
mm/page-writeback.c | 8 ++--
4 files changed, 62 insertions(+), 27 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5d4bd1c..d7592c7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -849,7 +849,7 @@ static void bdi_writeback_all(struct writeback_control *wbc)
WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
- spin_lock(&bdi_lock);
+ rcu_read_lock();
list_for_each_entry(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
@@ -858,7 +858,7 @@ static void bdi_writeback_all(struct writeback_control *wbc)
bdi_alloc_queue_work(bdi, wbc);
}
- spin_unlock(&bdi_lock);
+ rcu_read_unlock();
}
/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f169bcb..859e797 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -59,6 +59,7 @@ struct bdi_writeback {
struct backing_dev_info {
struct list_head bdi_list;
+ struct rcu_head rcu_head;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca0da..fd93566 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * locking.
+ */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);
@@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr)
/*
* Add us to the active bdi_list
*/
- spin_lock(&bdi_lock);
- list_add(&bdi->bdi_list, &bdi_list);
- spin_unlock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
+ list_add_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
bdi_task_init(bdi, wb);
@@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr)
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
wb_do_writeback(me, 0);
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
/*
* Check if any existing bdi's have dirty data without
@@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr)
if (list_empty(&bdi_pending_list)) {
unsigned long wait;
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
wait = msecs_to_jiffies(dirty_writeback_interval * 10);
schedule_timeout(wait);
try_to_freeze();
@@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr)
bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
bdi_list);
list_del_init(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
wb = &bdi->wb;
wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
@@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr)
* a chance to flush other bdi's to free
* memory.
*/
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
bdi_flush_io(bdi);
}
@@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr)
return 0;
}
+static void bdi_add_to_pending(struct rcu_head *head)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = container_of(head, struct backing_dev_info, rcu_head);
+ INIT_LIST_HEAD(&bdi->bdi_list);
+
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ /*
+ * We are now on the pending list, wake up bdi_forker_task()
+ * to finish the job and add us back to the active bdi_list
+ */
+ wake_up_process(default_backing_dev_info.wb.task);
+}
+
/*
* Add the default flusher task that gets created for any bdi
* that has dirty data pending writeout
@@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
* waiting for previous additions to finish.
*/
if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+ list_del_rcu(&bdi->bdi_list);
/*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
+ * We must wait for the current RCU period to end before
+ * moving to the pending list. So schedule that operation
+ * from an RCU callback.
*/
- wake_up_process(default_backing_dev_info.wb.task);
+ call_rcu(&bdi->rcu_head, bdi_add_to_pending);
}
}
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu();
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_list);
- spin_unlock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
bdi->dev = dev;
@@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
wb->task = NULL;
ret = -ENOMEM;
- spin_lock(&bdi_lock);
- list_del(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ bdi_remove_from_list(bdi);
goto exit;
}
}
@@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
/*
* Make sure nobody finds us on the bdi_list anymore
*/
- spin_lock(&bdi_lock);
- list_del(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ bdi_remove_from_list(bdi);
/*
* Finally, kill the kernel threads. We don't need to be RCU
@@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
+ INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 25e7770..a5f0f76 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -315,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -327,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
@@ -339,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
if (max_ratio > 100)
return -EINVAL;
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
--
1.6.4.1.207.g68ea
^ permalink raw reply related [flat|nested] 12+ messages in thread