From: Jens Axboe <jens.axboe@oracle.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Theodore Ts'o <tytso@mit.edu>,
Linux Kernel Developers List <linux-kernel@vger.kernel.org>,
Ext4 Developers List <linux-ext4@vger.kernel.org>,
jack@suse.cz
Subject: Re: [PATCH 1/3] block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks
Date: Tue, 7 Apr 2009 10:16:05 +0200 [thread overview]
Message-ID: <20090407081605.GP5178@kernel.dk> (raw)
In-Reply-To: <20090407071729.GN5178@kernel.dk>
On Tue, Apr 07 2009, Jens Axboe wrote:
> BTW, with the increased number of sync IO and unplugging, it makes sense
> to soon look into some finer granularity of plugging. If we didn't have
> so many single page submission paths it would not be as big a problem,
> but we do. And since they still persist so many years after we added
> functionality to pass bigger IOs, it likely won't be much better in the
> future either.
>
> So we can either look into doing per io context plugging, or doing
> something similar to:
>
> plugctx = blk_get_plug_context();
> ...
> submit_bio_plug(rw, bio, plugctx);
> ...
> submit_bio_plug(rw, bio, plugctx);
> ...
> blk_submit_plug_context(plugctx);
>
> and pass that down through wbc, perhaps. Dunno, just a thought.
> Basically a work-around for not having a dedicated writepages() that
> does the right thing (ext3 anyone?).
Here's a quick mockup. It compiles, but that's about all the usage it
has seen so far :-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 43fdedc..5cf416c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1567,6 +1567,17 @@ void submit_bio(int rw, struct bio *bio)
}
EXPORT_SYMBOL(submit_bio);
+void submit_bio_plug(int rw, struct bio *bio, struct blk_plug_ctx *ctx)
+{
+ if (ctx) {
+ bio->bi_rw |= rw;
+ bio->bi_next = ctx->bio_list;
+ ctx->bio_list = bio;
+ } else
+ submit_bio(rw, bio);
+}
+EXPORT_SYMBOL(submit_bio_plug);
+
/**
* blk_rq_check_limits - Helper function to check a request for the queue limit
* @q: the queue
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..e4313e3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -101,6 +101,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ret->cic_list);
ret->ioc_data = NULL;
+ ret->plug_ctx.bio_list = NULL;
+ ret->plug_ctx.state = 0;
}
return ret;
@@ -171,6 +173,55 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc)
}
EXPORT_SYMBOL(copy_io_context);
+struct blk_plug_ctx *blk_get_plug_context(void)
+{
+ struct io_context *ioc;
+
+ ioc = current_io_context(GFP_ATOMIC, -1);
+ if (!ioc)
+ return NULL;
+
+ if (!test_and_set_bit_lock(0, &ioc->plug_ctx.state))
+ return &ioc->plug_ctx;
+
+ return NULL;
+}
+
+static void __blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{
+ struct block_device *bdev = NULL;
+ struct bio *bio;
+
+ while ((bio = ctx->bio_list) != NULL) {
+ ctx->bio_list = bio->bi_next;
+ bio->bi_next = NULL;
+
+ if (bdev && bdev != bio->bi_bdev)
+ blk_unplug(bdev_get_queue(bdev));
+
+ if (bio_unplug(bio))
+ bdev = bio->bi_bdev;
+
+ bio->bi_flags &= ~(1 << BIO_RW_UNPLUG);
+
+ submit_bio(bio->bi_rw, bio);
+ }
+}
+
+void blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{
+ if (ctx) {
+ __blk_submit_plug_context(ctx);
+ clear_bit_unlock(0, &ctx->state);
+ }
+}
+
+void blk_flush_plug_context(struct blk_plug_ctx *ctx)
+{
+ if (ctx)
+ __blk_submit_plug_context(ctx);
+}
+
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/fs/buffer.c b/fs/buffer.c
index 6e35762..2ed21b8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1698,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(write_op, bh);
+ submit_bh_plug(write_op, bh, wbc->plug);
nr_underway++;
}
bh = next;
@@ -2884,8 +2884,10 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
bio_put(bio);
}
-int submit_bh(int rw, struct buffer_head * bh)
+static int __submit_bh(int rw, struct buffer_head * bh,
+ struct blk_plug_ctx *ctx)
{
+ gfp_t gfp = ctx ? GFP_ATOMIC : GFP_NOIO;
struct bio *bio;
int ret = 0;
@@ -2910,7 +2912,12 @@ int submit_bh(int rw, struct buffer_head * bh)
* from here on down, it's all bio -- do the initial mapping,
* submit_bio -> generic_make_request may further map this bio around
*/
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(gfp, 1);
+ if (!bio) {
+ blk_flush_plug_context(ctx);
+ bio_alloc(GFP_NOIO, 1);
+ ctx = NULL;
+ }
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
@@ -2926,7 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh)
bio->bi_private = bh;
bio_get(bio);
- submit_bio(rw, bio);
+
+ submit_bio_plug(rw, bio, ctx);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
@@ -2935,6 +2943,16 @@ int submit_bh(int rw, struct buffer_head * bh)
return ret;
}
+int submit_bh(int rw, struct buffer_head *bh)
+{
+ return __submit_bh(rw, bh, NULL);
+}
+
+int submit_bh_plug(int rw, struct buffer_head *bh, struct blk_plug_ctx *ctx)
+{
+ return __submit_bh(rw, bh, ctx);
+}
+
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7b73bb8..a8eec18 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -183,6 +183,7 @@ void __lock_buffer(struct buffer_head *bh);
void ll_rw_block(int, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int submit_bh(int, struct buffer_head *);
+int submit_bh_plug(int, struct buffer_head *, struct blk_plug_ctx *);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bce40a2..8a0c4b5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2117,7 +2117,9 @@ extern void file_move(struct file *f, struct list_head *list);
extern void file_kill(struct file *f);
#ifdef CONFIG_BLOCK
struct bio;
+struct blk_plug_ctx;
extern void submit_bio(int, struct bio *);
+extern void submit_bio_plug(int, struct bio *, struct blk_plug_ctx *);
extern int bdev_read_only(struct block_device *);
#endif
extern int set_blocksize(struct block_device *, int);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..38c8a2c 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -3,6 +3,7 @@
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
+#include <linux/list.h>
/*
* This is the per-process anticipatory I/O scheduler state.
@@ -59,6 +60,11 @@ struct cfq_io_context {
struct rcu_head rcu_head;
};
+struct blk_plug_ctx {
+ struct bio *bio_list;
+ unsigned long state;
+};
+
/*
* I/O subsystem state of the associated processes. It is refcounted
* and kmalloc'ed. These could be shared between processes.
@@ -83,6 +89,8 @@ struct io_context {
struct radix_tree_root radix_root;
struct hlist_head cic_list;
void *ioc_data;
+
+ struct blk_plug_ctx plug_ctx;
};
static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -105,7 +113,17 @@ void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
+struct blk_plug_ctx *blk_get_plug_context(void);
+void blk_submit_plug_context(struct blk_plug_ctx *);
+void blk_flush_plug_context(struct blk_plug_ctx *);
#else
+static inline void blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{
+}
+static inline struct blk_plug_ctx *blk_get_plug_context(void)
+{
+ return NULL;
+}
static inline void exit_io_context(void)
{
}
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9344547..8b5c14a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
+#include <linux/iocontext.h>
struct backing_dev_info;
@@ -40,6 +41,7 @@ enum writeback_sync_modes {
struct writeback_control {
struct backing_dev_info *bdi; /* If !NULL, only write back this
queue */
+ struct blk_plug_ctx *plug;
enum writeback_sync_modes sync_mode;
unsigned long *older_than_this; /* If !NULL, only write back inodes
older than this */
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e2d38e..d521830 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -218,7 +218,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
if (!mapping_cap_writeback_dirty(mapping))
return 0;
+ wbc.plug = blk_get_plug_context();
ret = do_writepages(mapping, &wbc);
+ blk_submit_plug_context(wbc.plug);
return ret;
}
--
Jens Axboe
next prev parent reply other threads:[~2009-04-07 8:16 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-03-27 20:24 [PATCH 0/3] Ext3 latency improvement patches Theodore Ts'o
2009-03-27 20:24 ` [PATCH 1/3] block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks Theodore Ts'o
2009-03-27 20:24 ` [PATCH 2/3] ext3: Use WRITE_SYNC for commits which are caused by fsync() Theodore Ts'o
2009-03-27 20:24 ` [PATCH 3/3] ext3: Avoid starting a transaction in writepage when not necessary Theodore Ts'o
2009-03-27 22:23 ` Jan Kara
2009-03-27 23:03 ` Theodore Tso
2009-03-30 13:22 ` Jan Kara
2009-03-27 22:20 ` [PATCH 2/3] ext3: Use WRITE_SYNC for commits which are caused by fsync() Jan Kara
2009-03-27 20:55 ` [PATCH 1/3] block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks Jan Kara
2009-04-07 6:21 ` Andrew Morton
2009-04-07 6:50 ` Andrew Morton
2009-04-07 7:08 ` Jens Axboe
2009-04-07 7:17 ` Jens Axboe
2009-04-07 8:16 ` Jens Axboe [this message]
2009-04-07 7:23 ` Andrew Morton
2009-04-07 7:57 ` Jens Axboe
2009-04-07 19:09 ` Theodore Tso
2009-04-07 19:32 ` Jens Axboe
2009-04-07 21:44 ` Theodore Tso
2009-04-07 22:19 ` [PATCH] block_write_full_page: switch synchronous writes to use WRITE_SYNC_PLUG Theodore Tso
2009-04-07 23:09 ` Andrew Morton
2009-04-07 23:46 ` Theodore Tso
2009-04-08 8:08 ` Jens Axboe
2009-04-08 22:34 ` Andrew Morton
2009-04-09 17:59 ` Jens Axboe
2009-04-08 6:00 ` Jens Axboe
2009-04-08 15:26 ` Theodore Tso
2009-04-08 5:58 ` [PATCH 1/3] block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks Jens Axboe
2009-04-08 15:25 ` Theodore Tso
2009-04-07 14:19 ` Theodore Tso
2009-03-27 20:50 ` [PATCH 0/3] Ext3 latency improvement patches Chris Mason
2009-03-27 21:03 ` Chris Mason
2009-03-27 21:19 ` Jan Kara
2009-03-27 21:30 ` Theodore Tso
2009-03-27 21:54 ` Jan Kara
2009-03-27 23:09 ` Theodore Tso
2009-03-28 0:14 ` Jeff Garzik
2009-03-28 0:24 ` David Rees
2009-03-30 14:16 ` Ric Wheeler
2009-03-30 11:23 ` Aneesh Kumar K.V
[not found] ` <20090330112330.GA11357@skywalker>
2009-03-30 11:44 ` Chris Mason
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090407081605.GP5178@kernel.dk \
--to=jens.axboe@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as the URLs for NNTP newsgroup(s).