From: Jens Axboe <jens.axboe@oracle.com>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: chris.mason@oracle.com, david@fromorbit.com, npiggin@suse.de,
Jens Axboe <jens.axboe@oracle.com>
Subject: [PATCH 6/7] writeback: add lazy bdi->task creation
Date: Thu, 12 Mar 2009 15:33:47 +0100 [thread overview]
Message-ID: <1236868428-20408-7-git-send-email-jens.axboe@oracle.com> (raw)
In-Reply-To: <1236868428-20408-1-git-send-email-jens.axboe@oracle.com>
Instead of creating the bdi flusher threads when the bdi is registered,
defer that to the point where we have dirty IO pending and someone
attempts to start the flushing.
A bdi is put on the normal bdi_list when it is registered. When someone
attempts to schedule writeback on this bdi, we move it to a pending list
and wake up the default bdi forker thread to take care of setting up a
task and putting the bdi back on the normal bdi_list. If task creation
should fail, the forker thread will write out some data on behalf of the
pending bdi. This ensures that forward progress is always made.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
fs/fs-writeback.c | 42 +++++-----
include/linux/backing-dev.h | 8 ++-
mm/backing-dev.c | 196 +++++++++++++++++++++++++++++++++++++++----
3 files changed, 206 insertions(+), 40 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 37b042f..c25c261 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -74,14 +74,17 @@ static void writeback_release(struct backing_dev_info *bdi)
clear_bit(BDI_pdflush, &bdi->state);
}
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
long nr_pages)
{
/*
- * Should not happen, complain?
+ * This only happens the first time someone kicks this bdi, so put
+ * it out-of-line.
*/
- if (unlikely(!bdi->task))
- return;
+ if (unlikely(!bdi->task)) {
+ bdi_add_flusher_task(bdi);
+ return 1;
+ }
if (writeback_acquire(bdi)) {
bdi->wb_arg.nr_pages = nr_pages;
@@ -92,6 +95,8 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
smp_mb();
wake_up(&bdi->wait);
}
+
+ return 0;
}
/*
@@ -185,24 +190,13 @@ static void bdi_pdflush(struct backing_dev_info *bdi)
* Handle writeback of dirty data for the device backed by this bdi. Also
* wakes up periodically and does kupdated style flushing.
*/
-int bdi_writeback_task(void *ptr)
+int bdi_writeback_task(struct backing_dev_info *bdi)
{
- struct backing_dev_info *bdi = ptr;
- struct task_struct *tsk = current;
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-
while (!kthread_should_stop()) {
- DECLARE_WAITQUEUE(wait, tsk);
+ DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&bdi->wait, &wait);
- set_task_state(tsk, TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(dirty_writeback_interval);
try_to_freeze();
@@ -226,7 +220,7 @@ int bdi_writeback_task(void *ptr)
bdi_pdflush(bdi);
writeback_release(bdi);
- set_task_state(tsk, TASK_RUNNING);
+ set_current_state(TASK_RUNNING);
finish_wait(&bdi->wait, &wait);
}
@@ -239,9 +233,13 @@ void bdi_writeback_all(struct super_block *sb, long nr_pages)
rcu_read_lock();
- list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- if (bdi_has_dirty_io(bdi))
- bdi_start_writeback(bdi, sb, nr_pages);
+restart:
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+ if (bdi_start_writeback(bdi, sb, nr_pages))
+ goto restart;
+ }
rcu_read_unlock();
}
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3c94fbd..b9e2085 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,6 +24,7 @@ struct dentry;
*/
enum bdi_state {
BDI_pdflush, /* A pdflush thread is working this device */
+ BDI_pending, /* On its way to being activated */
BDI_write_congested, /* The write queue is getting full */
BDI_read_congested, /* The read queue is getting full */
BDI_unused, /* Available bits start here */
@@ -46,6 +47,7 @@ struct bdi_writeback_arg {
struct backing_dev_info {
struct list_head bdi_list;
+ struct rcu_head rcu_head;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
@@ -85,10 +87,11 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
void bdi_unregister(struct backing_dev_info *bdi);
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+int bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
long nr_pages);
-int bdi_writeback_task(void *);
+int bdi_writeback_task(struct backing_dev_info *bdi);
void bdi_writeback_all(struct super_block *sb, long nr_pages);
+void bdi_add_flusher_task(struct backing_dev_info *bdi);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
@@ -215,6 +218,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
#define BDI_CAP_EXEC_MAP 0x00000040
#define BDI_CAP_NO_ACCT_WB 0x00000080
#define BDI_CAP_SWAP_BACKED 0x00000100
+#define BDI_CAP_FLUSH_FORKER 0x00000200
#define BDI_CAP_VMFLAGS \
(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0096b96..500d1fc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -2,6 +2,7 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -18,7 +19,7 @@ EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
+ .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_FLUSH_FORKER,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
@@ -26,6 +27,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -197,6 +199,147 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
+static int bdi_start_fn(void *ptr)
+{
+ struct backing_dev_info *bdi = ptr;
+ struct task_struct *tsk = current;
+
+ /*
+ * Add us to the active bdi_list
+ */
+ spin_lock_bh(&bdi_lock);
+ list_add_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(tsk, 0);
+
+ /*
+ * Clear pending bit and wakeup anybody waiting to tear us down
+ */
+ clear_bit(BDI_pending, &bdi->state);
+ wake_up_bit(&bdi->state, BDI_pending);
+
+ return bdi_writeback_task(bdi);
+}
+
+static int bdi_forker_task(void *ptr)
+{
+ struct backing_dev_info *bdi = ptr;
+ struct task_struct *tsk = current;
+
+ for (;;) {
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ /*
+ * Should never trigger on the default bdi
+ */
+ WARN_ON(bdi_has_dirty_io(bdi));
+
+ add_wait_queue(&bdi->wait, &wait);
+ set_task_state(tsk, TASK_INTERRUPTIBLE);
+ smp_mb();
+ if (list_empty(&bdi_pending_list))
+ schedule();
+ else {
+ struct backing_dev_info *bdi = NULL;
+
+ spin_lock_bh(&bdi_lock);
+ if (!list_empty(&bdi_pending_list)) {
+ bdi = list_entry(bdi_pending_list.next,
+ struct backing_dev_info,
+ bdi_list);
+ list_del_init(&bdi->bdi_list);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ /*
+ * If no bdi or bdi already got setup, continue
+ */
+ if (!bdi || bdi->task)
+ continue;
+
+ bdi->task = kthread_run(bdi_start_fn, bdi, "bdi-%s",
+ dev_name(bdi->dev));
+ /*
+ * If task creation fails, then readd the bdi to
+ * the pending list and force writeout of the bdi
+ * from this forker thread. That will free some memory
+ * and we can try again.
+ */
+ if (!bdi->task) {
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .range_cyclic = 1,
+ };
+
+ /*
+ * Add this 'bdi' to the back, so we get
+ * a chance to flush other bdi's to free
+ * memory.
+ */
+ spin_lock_bh(&bdi_lock);
+ list_add_tail(&bdi->bdi_list,
+ &bdi_pending_list);
+ spin_unlock_bh(&bdi_lock);
+
+ wbc.nr_to_write = 1024;
+ generic_sync_bdi_inodes(NULL, &wbc);
+ }
+ }
+
+ set_task_state(tsk, TASK_RUNNING);
+ finish_wait(&bdi->wait, &wait);
+ }
+
+ return 0;
+}
+
+/*
+ * Grace period has now ended, init bdi->bdi_list and add us to the
+ * list of bdi's that are pending for task creation. Wake up
+ * bdi_forker_task() to finish the job and add us back to the
+ * active bdi_list.
+ */
+static void bdi_add_to_pending(struct rcu_head *head)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = container_of(head, struct backing_dev_info, rcu_head);
+ INIT_LIST_HEAD(&bdi->bdi_list);
+
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ wake_up(&default_backing_dev_info.wait);
+}
+
+void bdi_add_flusher_task(struct backing_dev_info *bdi)
+{
+ if (test_and_set_bit(BDI_pending, &bdi->state))
+ return;
+
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ /*
+ * We need to wait for the current grace period to end,
+ * in case others were browsing the bdi_list as well.
+ * So defer the adding and wakeup to after the RCU
+ * grace period has ended.
+ */
+ call_rcu(&bdi->rcu_head, bdi_add_to_pending);
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -215,17 +358,24 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
- bdi->task = kthread_run(bdi_writeback_task, bdi, "bdi-%s",
- dev_name(dev));
- if (!bdi->task) {
- ret = -ENOMEM;
- goto exit;
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi->capabilities & BDI_CAP_FLUSH_FORKER) {
+ bdi->task = kthread_run(bdi_forker_task, bdi, "bdi-%s",
+ dev_name(dev));
+ if (!bdi->task) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ } else {
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
}
- spin_lock(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock(&bdi_lock);
-
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -240,11 +390,22 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);
+static int sched_wait(void *word)
+{
+ schedule();
+ return 0;
+}
+
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
- spin_lock(&bdi_lock);
+ /*
+ * If setup is pending, wait for that to complete first
+ */
+ wait_on_bit(&bdi->state, BDI_pending, sched_wait, TASK_UNINTERRUPTIBLE);
+
+ spin_lock_bh(&bdi_lock);
list_del_rcu(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
/*
* In case the bdi is freed right after unregister, we need to
@@ -256,10 +417,12 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
- bdi_remove_from_list(bdi);
- if (bdi->task) {
- kthread_stop(bdi->task);
- bdi->task = NULL;
+ if (!(bdi->capabilities & BDI_CAP_FLUSH_FORKER)) {
+ bdi_remove_from_list(bdi);
+ if (bdi->task) {
+ kthread_stop(bdi->task);
+ bdi->task = NULL;
+ }
}
bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
@@ -272,6 +435,7 @@ int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
+ INIT_RCU_HEAD(&bdi->rcu_head);
bdi->dev = NULL;
bdi->min_ratio = 0;
--
1.6.2
next prev parent reply other threads:[~2009-03-12 14:33 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-03-12 14:33 [PATCH 0/7] Per-bdi writeback flusher threads Jens Axboe
2009-03-12 14:33 ` [PATCH 1/7] writeback: move dirty inodes from super_block to backing_dev_info Jens Axboe
2009-03-24 16:17 ` Jan Kara
2009-03-24 18:45 ` Jens Axboe
2009-03-12 14:33 ` [PATCH 2/7] writeback: switch to per-bdi threads for flushing data Jens Axboe
2009-03-13 5:33 ` Andrew Morton
2009-03-13 10:54 ` Jens Axboe
2009-03-15 22:52 ` Dave Chinner
2009-03-16 7:33 ` Jens Axboe
2009-03-16 10:17 ` Christoph Hellwig
2009-03-16 10:21 ` Jens Axboe
2009-03-16 23:38 ` Dave Chinner
2009-03-17 9:37 ` Jens Axboe
2009-03-17 13:21 ` Chris Mason
2009-03-16 10:22 ` Christoph Hellwig
2009-03-16 13:30 ` Chris Mason
2009-03-16 13:39 ` Christoph Hellwig
2009-03-12 14:33 ` [PATCH 3/7] writeback: get rid of pdflush_operation() in emergency sync and remount Jens Axboe
2009-03-16 10:13 ` Christoph Hellwig
2009-03-12 14:33 ` [PATCH 4/7] writeback: get rid of task/current_is_pdflush() Jens Axboe
2009-03-16 10:14 ` Christoph Hellwig
2009-03-16 10:22 ` Jens Axboe
2009-03-16 13:26 ` Chris Mason
2009-03-12 14:33 ` [PATCH 5/7] writeback: move the default backing_dev_info out of readahead Jens Axboe
2009-03-16 10:19 ` Christoph Hellwig
2009-03-16 10:23 ` Jens Axboe
2009-03-12 14:33 ` Jens Axboe [this message]
2009-03-12 14:33 ` [PATCH 7/7] writeback: add some debug inode list counters to bdi stats Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1236868428-20408-7-git-send-email-jens.axboe@oracle.com \
--to=jens.axboe@oracle.com \
--cc=chris.mason@oracle.com \
--cc=david@fromorbit.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=npiggin@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).