* [PATCH v3 01/16] block: Generalized bio pool freeing
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: Kent Overstreet, tj, axboe, agk, neilb, drbd-dev, bharrosh,
vgoyal, mpatocka, sage, yehuda
With the old code, when you allocate a bio from a bio pool you have to
implement your own destructor that knows how to find the bio pool the
bio was originally allocated from.
This adds a new field to struct bio (bi_pool): bio_alloc_bioset() records
the pool a bio was allocated from, and bio_put() frees the bio back to
that pool. This makes the various per-driver bio destructors unnecessary,
so they're deleted.
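As a rough illustration (hypothetical driver-side code, not part of the
patch - my_bs and the function names are made up), the allocate/free cycle
no longer involves a destructor:

    /* Hypothetical driver code, for illustration only. */
    static struct bio_set *my_bs;   /* assume created with bioset_create() */

    static struct bio *my_alloc_bio(unsigned nr_vecs)
    {
            /* bio_alloc_bioset() now records the pool in bio->bi_pool... */
            return bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bs);
    }

    static void my_end_io(struct bio *bio, int error)
    {
            /* ...so the final bio_put() frees it back to my_bs, with no
             * bi_destructor to set up. */
            bio_put(bio);
    }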
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I5eb66c1d6910757f4af8755b8857dcbe4619cf8d
---
drivers/md/dm-crypt.c | 9 ---------
drivers/md/dm-io.c | 11 -----------
drivers/md/dm.c | 20 --------------------
drivers/md/md.c | 28 ++++------------------------
drivers/target/target_core_iblock.c | 9 ---------
fs/bio.c | 26 ++++++++------------------
include/linux/blk_types.h | 3 +++
7 files changed, 15 insertions(+), 91 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3f06df5..40716d8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -808,14 +808,6 @@ static int crypt_convert(struct crypt_config *cc,
return 0;
}
-static void dm_crypt_bio_destructor(struct bio *bio)
-{
- struct dm_crypt_io *io = bio->bi_private;
- struct crypt_config *cc = io->target->private;
-
- bio_free(bio, cc->bs);
-}
-
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
@@ -985,7 +977,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
- clone->bi_destructor = dm_crypt_bio_destructor;
}
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea5dd28..1c46f97 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -249,16 +249,6 @@ static void vm_dp_init(struct dpages *dp, void *data)
dp->context_ptr = data;
}
-static void dm_bio_destructor(struct bio *bio)
-{
- unsigned region;
- struct io *io;
-
- retrieve_io_and_region_from_bio(bio, &io, &region);
-
- bio_free(bio, io->client->bios);
-}
-
/*
* Functions for getting the pages from kernel memory.
*/
@@ -317,7 +307,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
- bio->bi_destructor = dm_bio_destructor;
store_io_and_region_in_bio(bio, io, region);
if (rw & REQ_DISCARD) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e24143c..40b7735 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -681,11 +681,6 @@ static void clone_endio(struct bio *bio, int error)
}
}
- /*
- * Store md for cleanup instead of tio which is about to get freed.
- */
- bio->bi_private = md->bs;
-
free_tio(md, tio);
bio_put(bio);
dec_pending(io, error);
@@ -1013,11 +1008,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
/* error the io and bail out, or requeue it if needed */
md = tio->io->md;
dec_pending(tio->io, r);
- /*
- * Store bio_set for cleanup.
- */
- clone->bi_end_io = NULL;
- clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
} else if (r) {
@@ -1036,13 +1026,6 @@ struct clone_info {
unsigned short idx;
};
-static void dm_bio_destructor(struct bio *bio)
-{
- struct bio_set *bs = bio->bi_private;
-
- bio_free(bio, bs);
-}
-
/*
* Creates a little bio that just does part of a bvec.
*/
@@ -1054,7 +1037,6 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
struct bio_vec *bv = bio->bi_io_vec + idx;
clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
- clone->bi_destructor = dm_bio_destructor;
*clone->bi_io_vec = *bv;
clone->bi_sector = sector;
@@ -1086,7 +1068,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
- clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_vcnt = idx + bv_count;
@@ -1131,7 +1112,6 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
*/
clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
__bio_clone(clone, ci->bio);
- clone->bi_destructor = dm_bio_destructor;
if (len) {
clone->bi_sector = ci->sector;
clone->bi_size = to_bytes(len);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01233d8..a5a524e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -155,32 +155,17 @@ static int start_readonly;
* like bio_clone, but with a local bio set
*/
-static void mddev_bio_destructor(struct bio *bio)
-{
- struct mddev *mddev, **mddevp;
-
- mddevp = (void*)bio;
- mddev = mddevp[-1];
-
- bio_free(bio, mddev->bio_set);
-}
-
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
struct bio *b;
- struct mddev **mddevp;
if (!mddev || !mddev->bio_set)
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs,
- mddev->bio_set);
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
if (!b)
return NULL;
- mddevp = (void*)b;
- mddevp[-1] = mddev;
- b->bi_destructor = mddev_bio_destructor;
return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
@@ -189,18 +174,14 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
struct mddev *mddev)
{
struct bio *b;
- struct mddev **mddevp;
if (!mddev || !mddev->bio_set)
return bio_clone(bio, gfp_mask);
- b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
- mddev->bio_set);
+ b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, mddev->bio_set);
if (!b)
return NULL;
- mddevp = (void*)b;
- mddevp[-1] = mddev;
- b->bi_destructor = mddev_bio_destructor;
+
__bio_clone(b, bio);
if (bio_integrity(bio)) {
int ret;
@@ -4831,8 +4812,7 @@ int md_run(struct mddev *mddev)
}
if (mddev->bio_set == NULL)
- mddev->bio_set = bioset_create(BIO_POOL_SIZE,
- sizeof(struct mddev *));
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 2ec299e..85b20c4 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -448,14 +448,6 @@ static ssize_t iblock_show_configfs_dev_params(
return bl;
}
-static void iblock_bio_destructor(struct bio *bio)
-{
- struct se_task *task = bio->bi_private;
- struct iblock_dev *ib_dev = task->task_se_cmd->se_dev->dev_ptr;
-
- bio_free(bio, ib_dev->ibd_bio_set);
-}
-
static struct bio *
iblock_get_bio(struct se_task *task, sector_t lba, u32 sg_num)
{
@@ -482,7 +474,6 @@ iblock_get_bio(struct se_task *task, sector_t lba, u32 sg_num)
bio->bi_bdev = ib_dev->ibd_bd;
bio->bi_private = task;
- bio->bi_destructor = iblock_bio_destructor;
bio->bi_end_io = &iblock_bio_done;
bio->bi_sector = lba;
atomic_inc(&ib_req->pending);
diff --git a/fs/bio.c b/fs/bio.c
index e453924..3667cef 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -269,10 +269,6 @@ EXPORT_SYMBOL(bio_init);
* bio_alloc_bioset will try its own mempool to satisfy the allocation.
* If %__GFP_WAIT is set then we will block on the internal pool waiting
* for a &struct bio to become free.
- *
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
**/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
@@ -287,6 +283,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
bio = p + bs->front_pad;
bio_init(bio);
+ bio->bi_pool = bs;
if (unlikely(!nr_iovecs))
goto out_set;
@@ -313,11 +310,6 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
-static void bio_fs_destructor(struct bio *bio)
-{
- bio_free(bio, fs_bio_set);
-}
-
/**
* bio_alloc - allocate a new bio, memory pool backed
* @gfp_mask: allocation mask to use
@@ -339,12 +331,7 @@ static void bio_fs_destructor(struct bio *bio)
*/
struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
- struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
- if (bio)
- bio->bi_destructor = bio_fs_destructor;
-
- return bio;
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
}
EXPORT_SYMBOL(bio_alloc);
@@ -419,7 +406,11 @@ void bio_put(struct bio *bio)
*/
if (atomic_dec_and_test(&bio->bi_cnt)) {
bio->bi_next = NULL;
- bio->bi_destructor(bio);
+
+ if (bio->bi_pool)
+ bio_free(bio, bio->bi_pool);
+ else
+ bio->bi_destructor(bio);
}
}
EXPORT_SYMBOL(bio_put);
@@ -470,12 +461,11 @@ EXPORT_SYMBOL(__bio_clone);
*/
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+ struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs);
if (!b)
return NULL;
- b->bi_destructor = bio_fs_destructor;
__bio_clone(b, bio);
if (bio_integrity(bio)) {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd..dc0e399 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -70,6 +70,9 @@ struct bio {
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
+ /* If bi_pool is non NULL, bi_destructor is not called */
+ struct bio_set *bi_pool;
+
bio_destructor_t *bi_destructor; /* destructor */
/*
--
1.7.9.3.327.g2980b
* [PATCH v3 02/16] dm: Use bioset's front_pad for dm_rq_clone_bio_info
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
Previously, dm_rq_clone_bio_info was allocated separately from the clone
bio and had to be freed by the bio's destructor to avoid a memory leak in
the blk_rq_prep_clone() error path. Embedding it in the bioset's front_pad
(with the clone bio as its last member) gets rid of that separate
allocation and means we can kill dm_rq_bio_destructor.
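Roughly, the layout after this change (a sketch of the hunks below, not
new code):

    /* One allocation from md->bs now looks like:
     *
     *   [ orig | tio | struct bio clone ... ]
     *    `-front_pad-'
     *
     * bio_alloc_bioset() returns a pointer to 'clone'; the constructor
     * recovers the surrounding dm_rq_clone_bio_info with container_of(),
     * and freeing the clone bio frees the info along with it:
     */
    struct dm_rq_clone_bio_info *info =
            container_of(bio, struct dm_rq_clone_bio_info, clone);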
Change-Id: I4ed2b9dbd1dffd44ad15bc3de89d3c3b6bc7714a
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/md/dm.c | 31 +++++--------------------------
1 file changed, 5 insertions(+), 26 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 40b7735..4014696 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -92,6 +92,7 @@ struct dm_rq_target_io {
struct dm_rq_clone_bio_info {
struct bio *orig;
struct dm_rq_target_io *tio;
+ struct bio clone;
};
union map_info *dm_get_mapinfo(struct bio *bio)
@@ -467,16 +468,6 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
mempool_free(tio, tio->md->tio_pool);
}
-static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
-{
- return mempool_alloc(md->io_pool, GFP_ATOMIC);
-}
-
-static void free_bio_info(struct dm_rq_clone_bio_info *info)
-{
- mempool_free(info, info->tio->md->io_pool);
-}
-
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
@@ -1438,30 +1429,17 @@ void dm_dispatch_request(struct request *rq)
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);
-static void dm_rq_bio_destructor(struct bio *bio)
-{
- struct dm_rq_clone_bio_info *info = bio->bi_private;
- struct mapped_device *md = info->tio->md;
-
- free_bio_info(info);
- bio_free(bio, md->bs);
-}
-
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
{
struct dm_rq_target_io *tio = data;
- struct mapped_device *md = tio->md;
- struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
-
- if (!info)
- return -ENOMEM;
+ struct dm_rq_clone_bio_info *info =
+ container_of(bio, struct dm_rq_clone_bio_info, clone);
info->orig = bio_orig;
info->tio = tio;
bio->bi_end_io = end_clone_bio;
bio->bi_private = info;
- bio->bi_destructor = dm_rq_bio_destructor;
return 0;
}
@@ -2696,7 +2674,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
if (!pools->tio_pool)
goto free_io_pool_and_out;
- pools->bs = bioset_create(pool_size, 0);
+ pools->bs = bioset_create(pool_size,
+ offsetof(struct dm_rq_clone_bio_info, clone));
if (!pools->bs)
goto free_tio_pool_and_out;
--
1.7.9.3.327.g2980b
* [PATCH v3 03/16] block: Add bio_reset()
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
linux-bcache-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA
Cc: Kent Overstreet, tj-DgEjT+Ai2ygdnm+yROfE0A,
axboe-tSWWG44O7X1aa/9Udqfwiw, agk-H+wXaHxf7aLQT0dZR+AlfA,
neilb-l3A5Bk7waGM, drbd-dev-cunTk1MwBs8qoQakbn7OcQ,
bharrosh-C4P08NqkoRlBDgjK7y7TUQ, vgoyal-H+wXaHxf7aLQT0dZR+AlfA,
mpatocka-H+wXaHxf7aLQT0dZR+AlfA, sage-BnTBU8nroG7k1uMJSBkQmQ,
yehuda-L5o5AL9CYN0tUFlbccrkMA
Reusing bios is something that's been highly frowned upon in the past,
but driver code keeps doing it anyway. If it's going to happen anyway,
we should provide a generic method.
This will help with getting rid of bi_destructor - drivers/block/pktcdvd.c
was open coding it, doing a bio_init() and then resetting bi_destructor.
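A sketch of the intended reuse pattern (hypothetical caller; only
bi_max_vecs and everything after it - bi_io_vec, bi_end_io, bi_private,
the refcount - survive the reset):

    /* Hypothetical reuse of a long-lived bio: */
    bio_reset(bio);                         /* zeroes the front of the bio */
    bio->bi_sector = sector;                /* re-fill what was cleared */
    bio->bi_bdev = bdev;
    bio_add_page(bio, page, PAGE_SIZE, 0);
    submit_bio(READ, bio);                  /* end_io/private set at alloc
                                             * time still apply */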
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Change-Id: Ib0a43dfcb3f6c22a54da513d4a86be544b5ffd95
---
fs/bio.c | 8 ++++++++
include/linux/bio.h | 1 +
include/linux/blk_types.h | 6 ++++++
3 files changed, 15 insertions(+)
diff --git a/fs/bio.c b/fs/bio.c
index 3667cef..240da21 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -259,6 +259,14 @@ void bio_init(struct bio *bio)
}
EXPORT_SYMBOL(bio_init);
+void bio_reset(struct bio *bio)
+{
+ memset(bio, 0, BIO_RESET_BYTES);
+ bio->bi_flags = 1 << BIO_UPTODATE;
+
+}
+EXPORT_SYMBOL(bio_reset);
+
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4d94eb8..e5c2504 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -226,6 +226,7 @@ extern void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone(struct bio *, gfp_t);
extern void bio_init(struct bio *);
+extern void bio_reset(struct bio *);
extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dc0e399..6b7daf3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -57,6 +57,10 @@ struct bio {
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
+ /*
+ * Everything starting with bi_max_vecs will be preserved by bio_reset()
+ */
+
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t bi_cnt; /* pin count */
@@ -83,6 +87,8 @@ struct bio {
struct bio_vec bi_inline_vecs[0];
};
+#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
+
/*
* bio flags
*/
--
1.7.9.3.327.g2980b
* [PATCH v3 04/16] pktcdvd: Switch to bio_kmalloc()
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
This is prep work for killing bi_destructor - previously, pktcdvd had
its own pkt_bio_alloc() which basically duplicated bio_kmalloc(),
necessitating its own bi_destructor implementation.
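Condensed from the hunks below, the new pattern is simply (nothing here
beyond what the patch itself does):

    /* allocation, once, in pkt_alloc_packet_data(): */
    bio = bio_kmalloc(GFP_KERNEL, 1);
    bio->bi_end_io = pkt_end_io_read;
    bio->bi_private = pkt;

    /* re-arming it for another read, in pkt_gather_data(): */
    bio_reset(bio);
    bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
    bio->bi_bdev = pd->bdev;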
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I5604293e07f695c8f0106ae819e306f1def89a67
---
drivers/block/pktcdvd.c | 115 ++++++++++++++++-------------------------------
1 file changed, 39 insertions(+), 76 deletions(-)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ba66e44..6fe693a 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -522,36 +522,38 @@ static void pkt_bio_finished(struct pktcdvd_device *pd)
}
}
-static void pkt_bio_destructor(struct bio *bio)
+static void pkt_end_io_read(struct bio *bio, int err)
{
- kfree(bio->bi_io_vec);
- kfree(bio);
-}
+ struct packet_data *pkt = bio->bi_private;
+ struct pktcdvd_device *pd = pkt->pd;
+ BUG_ON(!pd);
-static struct bio *pkt_bio_alloc(int nr_iovecs)
-{
- struct bio_vec *bvl = NULL;
- struct bio *bio;
+ VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio,
+ (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err);
- bio = kmalloc(sizeof(struct bio), GFP_KERNEL);
- if (!bio)
- goto no_bio;
- bio_init(bio);
+ if (err)
+ atomic_inc(&pkt->io_errors);
+ if (atomic_dec_and_test(&pkt->io_wait)) {
+ atomic_inc(&pkt->run_sm);
+ wake_up(&pd->wqueue);
+ }
+ pkt_bio_finished(pd);
+}
- bvl = kcalloc(nr_iovecs, sizeof(struct bio_vec), GFP_KERNEL);
- if (!bvl)
- goto no_bvl;
+static void pkt_end_io_packet_write(struct bio *bio, int err)
+{
+ struct packet_data *pkt = bio->bi_private;
+ struct pktcdvd_device *pd = pkt->pd;
+ BUG_ON(!pd);
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
- bio->bi_destructor = pkt_bio_destructor;
+ VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err);
- return bio;
+ pd->stats.pkt_ended++;
- no_bvl:
- kfree(bio);
- no_bio:
- return NULL;
+ pkt_bio_finished(pd);
+ atomic_dec(&pkt->io_wait);
+ atomic_inc(&pkt->run_sm);
+ wake_up(&pd->wqueue);
}
/*
@@ -567,10 +569,13 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
goto no_pkt;
pkt->frames = frames;
- pkt->w_bio = pkt_bio_alloc(frames);
+ pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames);
if (!pkt->w_bio)
goto no_bio;
+ pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+ pkt->w_bio->bi_private = pkt;
+
for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!pkt->pages[i])
@@ -581,9 +586,12 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
bio_list_init(&pkt->orig_bios);
for (i = 0; i < frames; i++) {
- struct bio *bio = pkt_bio_alloc(1);
+ struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);
if (!bio)
goto no_rd_bio;
+
+ bio->bi_end_io = pkt_end_io_read;
+ bio->bi_private = pkt;
pkt->r_bios[i] = bio;
}
@@ -1036,40 +1044,6 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
}
}
-static void pkt_end_io_read(struct bio *bio, int err)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio,
- (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err);
-
- if (err)
- atomic_inc(&pkt->io_errors);
- if (atomic_dec_and_test(&pkt->io_wait)) {
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
- }
- pkt_bio_finished(pd);
-}
-
-static void pkt_end_io_packet_write(struct bio *bio, int err)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err);
-
- pd->stats.pkt_ended++;
-
- pkt_bio_finished(pd);
- atomic_dec(&pkt->io_wait);
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
-}
-
/*
* Schedule reads for the holes in a packet
*/
@@ -1111,21 +1085,15 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
* Schedule reads for missing parts of the packet.
*/
for (f = 0; f < pkt->frames; f++) {
- struct bio_vec *vec;
-
int p, offset;
+
if (written[f])
continue;
+
bio = pkt->r_bios[f];
- vec = bio->bi_io_vec;
- bio_init(bio);
- bio->bi_max_vecs = 1;
- bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
- bio->bi_bdev = pd->bdev;
- bio->bi_end_io = pkt_end_io_read;
- bio->bi_private = pkt;
- bio->bi_io_vec = vec;
- bio->bi_destructor = pkt_bio_destructor;
+ bio_reset(bio);
+ bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
+ bio->bi_bdev = pd->bdev;
p = (f * CD_FRAMESIZE) / PAGE_SIZE;
offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
@@ -1418,14 +1386,9 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
}
/* Start the write request */
- bio_init(pkt->w_bio);
- pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE;
+ bio_reset(pkt->w_bio);
pkt->w_bio->bi_sector = pkt->sector;
pkt->w_bio->bi_bdev = pd->bdev;
- pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
- pkt->w_bio->bi_private = pkt;
- pkt->w_bio->bi_io_vec = bvec;
- pkt->w_bio->bi_destructor = pkt_bio_destructor;
for (f = 0; f < pkt->frames; f++)
if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
BUG();
--
1.7.9.3.327.g2980b
* [PATCH v3 05/16] block: Kill bi_destructor
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
Now that we've got generic code for freeing bios allocated from bio
pools, this isn't needed anymore.
This also changes the semantics of bio_free() a bit - it now also frees
bios allocated by bio_kmalloc(). It's also no longer exported, as
without bi_destructor there should be no need for it to be called
anywhere else.
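The freeing path is now, condensed from the hunks below:

    /* bio_put() is the only entry point: */
    if (atomic_dec_and_test(&bio->bi_cnt))
            bio_free(bio);

    /* ...and (inside bio_free()) bi_pool distinguishes the two cases: */
    if (!bio->bi_pool) {
            /* bio_kmalloc() allocation (integrity payload freed first
             * in the real code) */
            kfree(bio);
    } else {
            /* bioset allocation: the bvec and the bio (minus front_pad)
             * go back to bio->bi_pool's mempools */
    }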
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: Iade939e76ff329cb03eef9125e4377ba1e4265c5
---
Documentation/block/biodoc.txt | 5 -----
fs/bio.c | 33 +++++++++++++++------------------
include/linux/bio.h | 1 -
include/linux/blk_types.h | 3 ---
4 files changed, 15 insertions(+), 27 deletions(-)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index e418dc0..8df5e8e 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -465,7 +465,6 @@ struct bio {
bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
atomic_t bi_cnt; /* pin count: free when it hits zero */
void *bi_private;
- bio_destructor_t *bi_destructor; /* bi_destructor (bio) */
};
With this multipage bio design:
@@ -647,10 +646,6 @@ for a non-clone bio. There are the 6 pools setup for different size biovecs,
so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
given size from these slabs.
-The bi_destructor() routine takes into account the possibility of the bio
-having originated from a different source (see later discussions on
-n/w to block transfers and kvec_cb)
-
The bio_get() routine may be used to hold an extra reference on a bio prior
to i/o submission, if the bio fields are likely to be accessed after the
i/o is issued (since the bio may otherwise get freed in case i/o completion
diff --git a/fs/bio.c b/fs/bio.c
index 240da21..4075171 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -230,10 +230,19 @@ fallback:
return bvl;
}
-void bio_free(struct bio *bio, struct bio_set *bs)
+static void bio_free(struct bio *bio)
{
+ struct bio_set *bs = bio->bi_pool;
void *p;
+ if (!bs) {
+ /* Bio was allocated by bio_kmalloc() */
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, fs_bio_set);
+ kfree(bio);
+ return;
+ }
+
if (bio_has_allocated_vec(bio))
bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
@@ -249,7 +258,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
mempool_free(p, bs->bio_pool);
}
-EXPORT_SYMBOL(bio_free);
void bio_init(struct bio *bio)
{
@@ -343,13 +351,6 @@ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
}
EXPORT_SYMBOL(bio_alloc);
-static void bio_kmalloc_destructor(struct bio *bio)
-{
- if (bio_integrity(bio))
- bio_integrity_free(bio, fs_bio_set);
- kfree(bio);
-}
-
/**
* bio_kmalloc - allocate a bio for I/O using kmalloc()
* @gfp_mask: the GFP_ mask given to the slab allocator
@@ -376,7 +377,6 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bio->bi_inline_vecs;
- bio->bi_destructor = bio_kmalloc_destructor;
return bio;
}
@@ -411,15 +411,12 @@ void bio_put(struct bio *bio)
/*
* last put frees it
+ *
+ * bio->bi_pool will be NULL if the bio was allocated by bio_kmalloc() -
+ * bio_free() handles that case too.
*/
- if (atomic_dec_and_test(&bio->bi_cnt)) {
- bio->bi_next = NULL;
-
- if (bio->bi_pool)
- bio_free(bio, bio->bi_pool);
- else
- bio->bi_destructor(bio);
- }
+ if (atomic_dec_and_test(&bio->bi_cnt))
+ bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e5c2504..242447c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -216,7 +216,6 @@ extern struct bio *bio_alloc(gfp_t, unsigned int);
extern struct bio *bio_kmalloc(gfp_t, unsigned int);
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *);
-extern void bio_free(struct bio *, struct bio_set *);
extern void bio_endio(struct bio *, int);
struct request_queue;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6b7daf3..b6ddbf1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -74,11 +74,8 @@ struct bio {
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
- /* If bi_pool is non NULL, bi_destructor is not called */
struct bio_set *bi_pool;
- bio_destructor_t *bi_destructor; /* destructor */
-
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
--
1.7.9.3.327.g2980b
* [PATCH v3 06/16] block: Add an explicit bio flag for bios that own their bvec
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
This is for the new bio splitting code. When we split a bio, if the
split occurred on a bvec boundary we reuse the bvec for the new bio. But
that means bio_free() can't free it, hence the explicit flag.
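For context (a sketch; the sharing it guards against is introduced by the
split rework later in this series, and 'parent'/'split' are illustrative
names):

    /* A split bio may simply borrow its parent's vecs... */
    split->bi_io_vec = bio_iovec(parent);

    /* ...so bio_free() only frees the vec when this bio owns it: */
    if (bio_flagged(bio, BIO_OWNS_VEC))
            bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));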
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I040f6b501088e882a9f013d6b6e730ff04e9c1da
---
fs/bio.c | 3 ++-
include/linux/bio.h | 5 -----
include/linux/blk_types.h | 1 +
3 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/fs/bio.c b/fs/bio.c
index 4075171..692911e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -243,7 +243,7 @@ static void bio_free(struct bio *bio)
return;
}
- if (bio_has_allocated_vec(bio))
+ if (bio_flagged(bio, BIO_OWNS_VEC))
bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
if (bio_integrity(bio))
@@ -313,6 +313,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
goto err_free;
nr_iovecs = bvec_nr_vecs(idx);
+ bio->bi_flags |= 1 << BIO_OWNS_VEC;
}
out_set:
bio->bi_flags |= idx << BIO_POOL_OFFSET;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 242447c..9e89f5e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -84,11 +84,6 @@ static inline void *bio_data(struct bio *bio)
return NULL;
}
-static inline int bio_has_allocated_vec(struct bio *bio)
-{
- return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs;
-}
-
/*
* will die
*/
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b6ddbf1..5ad45bf 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -101,6 +101,7 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_OWNS_VEC 12 /* bio_free() should free bvec */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
--
1.7.9.3.327.g2980b
* [PATCH v3 07/16] block: Rename bio_split() -> bio_pair_split()
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
This is prep work for introducing a more general bio_split()
Change-Id: Ib9c4ff691af889d26bc9ec9fb42a2f3068f34ad9
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/drbd/drbd_req.c | 2 +-
drivers/block/pktcdvd.c | 2 +-
drivers/block/rbd.c | 3 ++-
drivers/md/linear.c | 2 +-
drivers/md/raid0.c | 6 +++---
drivers/md/raid10.c | 4 ++--
fs/bio.c | 4 ++--
include/linux/bio.h | 2 +-
8 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a0f314..1c7e3c4 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1128,7 +1128,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
const int sps = 1 << HT_SHIFT; /* sectors per slot */
const int mask = sps - 1;
const sector_t first_sectors = sps - (sect & mask);
- bp = bio_split(bio, first_sectors);
+ bp = bio_pair_split(bio, first_sectors);
/* we need to get a "reference count" (ap_bio_cnt)
* to avoid races with the disconnect/reconnect/suspend code.
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 6fe693a..12a14c0 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2467,7 +2467,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
if (last_zone != zone) {
BUG_ON(last_zone != zone + pd->settings.size);
first_sectors = last_zone - bio->bi_sector;
- bp = bio_split(bio, first_sectors);
+ bp = bio_pair_split(bio, first_sectors);
BUG_ON(!bp);
pkt_make_request(q, &bp->bio1);
pkt_make_request(q, &bp->bio2);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 013c7a5..a0b76c6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -747,7 +747,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
/* split the bio. We'll release it either in the next
call, or it will have to be released outside */
- bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
+ bp = bio_pair_split(old_chain,
+ (len - total) / SECTOR_SIZE);
if (!bp)
goto err_out;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fa211d8..e860cb9 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -314,7 +314,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
rcu_read_unlock();
- bp = bio_split(bio, end_sector - bio->bi_sector);
+ bp = bio_pair_split(bio, end_sector - bio->bi_sector);
linear_make_request(mddev, &bp->bio1);
linear_make_request(mddev, &bp->bio2);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index de63a1f..c89c8aa 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -516,11 +516,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
* refuse to split for us, so we need to split it.
*/
if (likely(is_power_of_2(chunk_sects)))
- bp = bio_split(bio, chunk_sects - (sector &
+ bp = bio_pair_split(bio, chunk_sects - (sector &
(chunk_sects-1)));
else
- bp = bio_split(bio, chunk_sects -
- sector_div(sector, chunk_sects));
+ bp = bio_pair_split(bio, chunk_sects -
+ sector_div(sector, chunk_sects));
raid0_make_request(mddev, &bp->bio1);
raid0_make_request(mddev, &bp->bio2);
bio_pair_release(bp);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3e7b154..f8b6f14 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1008,8 +1008,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
/* This is a one page bio that upper layers
* refuse to split for us, so we need to split it.
*/
- bp = bio_split(bio,
- chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
+ bp = bio_pair_split(bio,
+ chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
/* Each of these 'make_request' calls will call 'wait_barrier'.
* If the first succeeds but the second blocks due to the resync
diff --git a/fs/bio.c b/fs/bio.c
index 692911e..fb7607b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1478,7 +1478,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
/*
* split a bio - only worry about a bio with a single page in its iovec
*/
-struct bio_pair *bio_split(struct bio *bi, int first_sectors)
+struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors)
{
struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
@@ -1521,7 +1521,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
return bp;
}
-EXPORT_SYMBOL(bio_split);
+EXPORT_SYMBOL(bio_pair_split);
/**
* bio_sector_offset - Find hardware sector offset in bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9e89f5e..18ab020 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -201,7 +201,7 @@ struct bio_pair {
atomic_t cnt;
int error;
};
-extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
+extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors);
extern void bio_pair_release(struct bio_pair *dbio);
extern struct bio_set *bioset_create(unsigned int, unsigned int);
--
1.7.9.3.327.g2980b
* [PATCH v3 08/16] block: Rework bio splitting
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
Break the splitting code out into a new bio_split() and a bio_pair_split()
wrapper, and get rid of the old bio_split()'s limitation of only handling
single page bios.
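A sketch of how a caller is expected to use the new interface (hypothetical
names - my_bs, my_wq, my_retry_work; the failure handling only matters when
running under generic_make_request(), as the comment added below explains):

    struct bio *split;

    split = bio_split(bio, sectors, GFP_NOIO, my_bs);
    if (!split) {
            /* Under generic_make_request() __GFP_WAIT is masked out, so
             * the allocation can fail; retry from a safe context instead. */
            queue_work(my_wq, &my_retry_work);
            return;
    }

    if (split == bio) {
            /* bio was already <= 'sectors' long; nothing was split off. */
            generic_make_request(bio);
            return;
    }

    generic_make_request(split);    /* first 'sectors' sectors */
    generic_make_request(bio);      /* the remainder */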
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: Ic4733be7af6a4957974b42b09233b2209a779cbf
---
drivers/block/drbd/drbd_req.c | 16 +---
drivers/block/pktcdvd.c | 4 +-
drivers/block/rbd.c | 7 +-
drivers/md/linear.c | 4 +-
drivers/md/raid0.c | 6 +-
drivers/md/raid10.c | 21 +-----
fs/bio-integrity.c | 44 -----------
fs/bio.c | 167 +++++++++++++++++++++++++++++------------
include/linux/bio.h | 25 +++---
9 files changed, 144 insertions(+), 150 deletions(-)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 1c7e3c4..68fa494 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1101,18 +1101,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
if (likely(s_enr == e_enr)) {
inc_ap_bio(mdev, 1);
drbd_make_request_common(mdev, bio, start_time);
- return;
- }
-
- /* can this bio be split generically?
- * Maybe add our own split-arbitrary-bios function. */
- if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
- /* rather error out here than BUG in bio_split */
- dev_err(DEV, "bio would need to, but cannot, be split: "
- "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
- bio->bi_vcnt, bio->bi_idx, bio->bi_size,
- (unsigned long long)bio->bi_sector);
- bio_endio(bio, -EINVAL);
} else {
/* This bio crosses some boundary, so we have to split it. */
struct bio_pair *bp;
@@ -1139,10 +1127,10 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
D_ASSERT(e_enr == s_enr + 1);
- while (drbd_make_request_common(mdev, &bp->bio1, start_time))
+ while (drbd_make_request_common(mdev, &bp->split, start_time))
inc_ap_bio(mdev, 1);
- while (drbd_make_request_common(mdev, &bp->bio2, start_time))
+ while (drbd_make_request_common(mdev, bio, start_time))
inc_ap_bio(mdev, 1);
dec_ap_bio(mdev);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 12a14c0..1465155 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2469,8 +2469,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
first_sectors = last_zone - bio->bi_sector;
bp = bio_pair_split(bio, first_sectors);
BUG_ON(!bp);
- pkt_make_request(q, &bp->bio1);
- pkt_make_request(q, &bp->bio2);
+ pkt_make_request(q, &bp->split);
+ pkt_make_request(q, bio);
bio_pair_release(bp);
return;
}
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a0b76c6..11777b4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -747,14 +747,13 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
/* split the bio. We'll release it either in the next
call, or it will have to be released outside */
- bp = bio_pair_split(old_chain,
- (len - total) / SECTOR_SIZE);
+ bp = bio_pair_split(old_chain, (len - total) / SECTOR_SIZE);
if (!bp)
goto err_out;
- __bio_clone(tmp, &bp->bio1);
+ __bio_clone(tmp, &bp->split);
- *next = &bp->bio2;
+ *next = bp->orig;
} else {
__bio_clone(tmp, old_chain);
*next = old_chain->bi_next;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index e860cb9..7c6cafd 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -316,8 +316,8 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
bp = bio_pair_split(bio, end_sector - bio->bi_sector);
- linear_make_request(mddev, &bp->bio1);
- linear_make_request(mddev, &bp->bio2);
+ linear_make_request(mddev, &bp->split);
+ linear_make_request(mddev, bio);
bio_pair_release(bp);
return;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c89c8aa..3469adf 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -520,9 +520,9 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
(chunk_sects-1)));
else
bp = bio_pair_split(bio, chunk_sects -
- sector_div(sector, chunk_sects));
- raid0_make_request(mddev, &bp->bio1);
- raid0_make_request(mddev, &bp->bio2);
+ sector_div(sector, chunk_sects));
+ raid0_make_request(mddev, &bp->split);
+ raid0_make_request(mddev, bio);
bio_pair_release(bp);
return;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f8b6f14..0062326 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1001,15 +1001,9 @@ static void make_request(struct mddev *mddev, struct bio * bio)
> chunk_sects &&
conf->near_copies < conf->raid_disks)) {
struct bio_pair *bp;
- /* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
- bio->bi_idx != 0)
- goto bad_map;
- /* This is a one page bio that upper layers
- * refuse to split for us, so we need to split it.
- */
+
bp = bio_pair_split(bio,
- chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
+ chunk_sects - (bio->bi_sector & (chunk_sects - 1)));
/* Each of these 'make_request' calls will call 'wait_barrier'.
* If the first succeeds but the second blocks due to the resync
@@ -1023,8 +1017,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
conf->nr_waiting++;
spin_unlock_irq(&conf->resync_lock);
- make_request(mddev, &bp->bio1);
- make_request(mddev, &bp->bio2);
+ make_request(mddev, &bp->split);
+ make_request(mddev, bio);
spin_lock_irq(&conf->resync_lock);
conf->nr_waiting--;
@@ -1033,13 +1027,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
bio_pair_release(bp);
return;
- bad_map:
- printk("md/raid10:%s: make_request bug: can't convert block across chunks"
- " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
- (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
-
- bio_io_error(bio);
- return;
}
md_write_start(mddev, bio);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b..9ed2c44 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -682,50 +682,6 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset,
EXPORT_SYMBOL(bio_integrity_trim);
/**
- * bio_integrity_split - Split integrity metadata
- * @bio: Protected bio
- * @bp: Resulting bio_pair
- * @sectors: Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
- struct blk_integrity *bi;
- struct bio_integrity_payload *bip = bio->bi_integrity;
- unsigned int nr_sectors;
-
- if (bio_integrity(bio) == 0)
- return;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bip->bip_vcnt != 1);
-
- nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
- bp->bio1.bi_integrity = &bp->bip1;
- bp->bio2.bi_integrity = &bp->bip2;
-
- bp->iv1 = bip->bip_vec[0];
- bp->iv2 = bip->bip_vec[0];
-
- bp->bip1.bip_vec[0] = bp->iv1;
- bp->bip2.bip_vec[0] = bp->iv2;
-
- bp->iv1.bv_len = sectors * bi->tuple_size;
- bp->iv2.bv_offset += sectors * bi->tuple_size;
- bp->iv2.bv_len -= sectors * bi->tuple_size;
-
- bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
- bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
-
- bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
- bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
* bio_integrity_clone - Callback for cloning bios with integrity metadata
* @bio: New bio
* @bio_src: Original bio
diff --git a/fs/bio.c b/fs/bio.c
index fb7607b..62c4af81 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -35,7 +35,7 @@
*/
#define BIO_INLINE_VECS 4
-static mempool_t *bio_split_pool __read_mostly;
+static struct bio_set *bio_split_pool __read_mostly;
/*
* if you change this list, also change bvec_alloc or things will
@@ -1447,81 +1447,151 @@ EXPORT_SYMBOL(bio_endio);
void bio_pair_release(struct bio_pair *bp)
{
if (atomic_dec_and_test(&bp->cnt)) {
- struct bio *master = bp->bio1.bi_private;
+ bp->orig->bi_end_io = bp->bi_end_io;
+ bp->orig->bi_private = bp->bi_private;
- bio_endio(master, bp->error);
- mempool_free(bp, bp->bio2.bi_private);
+ bio_endio(bp->orig, 0);
+ bio_put(&bp->split);
}
}
EXPORT_SYMBOL(bio_pair_release);
-static void bio_pair_end_1(struct bio *bi, int err)
+static void bio_pair_end(struct bio *bio, int error)
{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
+ struct bio_pair *bp = bio->bi_private;
- if (err)
- bp->error = err;
+ if (error)
+ clear_bit(BIO_UPTODATE, &bp->orig->bi_flags);
bio_pair_release(bp);
}
-static void bio_pair_end_2(struct bio *bi, int err)
+struct bio_pair *bio_pair_split(struct bio *bio, int first_sectors)
{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
+ struct bio_pair *bp;
+ struct bio *split = bio_split(bio, first_sectors,
+ GFP_NOIO, bio_split_pool);
- if (err)
- bp->error = err;
+ if (!split)
+ return NULL;
- bio_pair_release(bp);
+ BUG_ON(split == bio);
+
+ bp = container_of(split, struct bio_pair, split);
+
+ atomic_set(&bp->cnt, 3);
+
+ bp->bi_end_io = bio->bi_end_io;
+ bp->bi_private = bio->bi_private;
+
+ bio->bi_private = bp;
+ bio->bi_end_io = bio_pair_end;
+
+ split->bi_private = bp;
+ split->bi_end_io = bio_pair_end;
+
+ return bp;
}
+EXPORT_SYMBOL(bio_pair_split);
-/*
- * split a bio - only worry about a bio with a single page in its iovec
+/**
+ * bio_split - split a bio
+ * @bio: bio to split
+ * @sectors: number of sectors to split from the front of @bio
+ * @gfp: gfp mask
+ * @bs: bio set to allocate from
+ *
+ * Allocates and returns a new bio which represents @sectors from the start of
+ * @bio, and updates @bio to represent the remaining sectors.
+ *
+ * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
+ * unchanged.
+ *
+ * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
+ * bvec boundry; it is the caller's responsibility to ensure that @bio is not
+ * freed before the split.
+ *
+ * If bio_split() is running under generic_make_request(), it's not safe to
+ * allocate more than one bio from the same bio set. Therefore, if it is running
+ * under generic_make_request() it masks out __GFP_WAIT when doing the
+ * allocation. The caller must check for failure if there's any possibility of
+ * it being called from under generic_make_request(); it is then the caller's
+ * responsibility to retry from a safe context (by e.g. punting to workqueue).
*/
-struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors)
+struct bio *bio_split(struct bio *bio, int sectors,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
+ unsigned idx, vcnt = 0, nbytes = sectors << 9;
+ struct bio_vec *bv;
+ struct bio *ret = NULL;
- if (!bp)
- return bp;
+ BUG_ON(sectors <= 0);
- trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
- bi->bi_sector + first_sectors);
+ /*
+ * If we're being called from underneath generic_make_request() and we
+ * already allocated any bios from this bio set, we risk deadlock if we
+ * use the mempool. So instead, we possibly fail and let the caller punt
+ * to workqueue or somesuch and retry in a safe context.
+ */
+ if (current->bio_list)
+ gfp &= ~__GFP_WAIT;
- BUG_ON(bi->bi_vcnt != 1);
- BUG_ON(bi->bi_idx != 0);
- atomic_set(&bp->cnt, 3);
- bp->error = 0;
- bp->bio1 = *bi;
- bp->bio2 = *bi;
- bp->bio2.bi_sector += first_sectors;
- bp->bio2.bi_size -= first_sectors << 9;
- bp->bio1.bi_size = first_sectors << 9;
+ if (nbytes >= bio->bi_size)
+ return bio;
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
+ trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
+ bio->bi_sector + sectors);
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
+ bio_for_each_segment(bv, bio, idx) {
+ vcnt = idx - bio->bi_idx;
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
+ if (!nbytes) {
+ ret = bio_alloc_bioset(gfp, 0, bs);
+ if (!ret)
+ return NULL;
- bp->bio1.bi_end_io = bio_pair_end_1;
- bp->bio2.bi_end_io = bio_pair_end_2;
+ ret->bi_io_vec = bio_iovec(bio);
+ ret->bi_flags |= 1 << BIO_CLONED;
+ break;
+ } else if (nbytes < bv->bv_len) {
+ ret = bio_alloc_bioset(gfp, ++vcnt, bs);
+ if (!ret)
+ return NULL;
- bp->bio1.bi_private = bi;
- bp->bio2.bi_private = bio_split_pool;
+ memcpy(ret->bi_io_vec, bio_iovec(bio),
+ sizeof(struct bio_vec) * vcnt);
- if (bio_integrity(bi))
- bio_integrity_split(bi, bp, first_sectors);
+ ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
+ bv->bv_offset += nbytes;
+ bv->bv_len -= nbytes;
+ break;
+ }
- return bp;
+ nbytes -= bv->bv_len;
+ }
+
+ ret->bi_bdev = bio->bi_bdev;
+ ret->bi_sector = bio->bi_sector;
+ ret->bi_size = sectors << 9;
+ ret->bi_rw = bio->bi_rw;
+ ret->bi_vcnt = vcnt;
+ ret->bi_max_vecs = vcnt;
+ ret->bi_end_io = bio->bi_end_io;
+ ret->bi_private = bio->bi_private;
+
+ bio->bi_sector += sectors;
+ bio->bi_size -= sectors << 9;
+ bio->bi_idx = idx;
+
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(ret, bio, gfp, bs);
+ bio_integrity_trim(ret, 0, bio_sectors(ret));
+ bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
+ }
+
+ return ret;
}
-EXPORT_SYMBOL(bio_pair_split);
+EXPORT_SYMBOL_GPL(bio_split);
/**
* bio_sector_offset - Find hardware sector offset in bio
@@ -1674,8 +1744,7 @@ static int __init init_bio(void)
if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
panic("bio: can't create integrity pool\n");
- bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
- sizeof(struct bio_pair));
+ bio_split_pool = bioset_create(BIO_POOL_SIZE, offsetof(struct bio_pair, split));
if (!bio_split_pool)
panic("bio: can't create split pool\n");
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 18ab020..486233c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -192,15 +192,17 @@ struct bio_integrity_payload {
* in bio2.bi_private
*/
struct bio_pair {
- struct bio bio1, bio2;
- struct bio_vec bv1, bv2;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
- struct bio_integrity_payload bip1, bip2;
- struct bio_vec iv1, iv2;
-#endif
- atomic_t cnt;
- int error;
+ atomic_t cnt;
+
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+
+ struct bio *orig;
+ struct bio split;
};
+
+extern struct bio *bio_split(struct bio *bio, int sectors,
+ gfp_t gfp, struct bio_set *bs);
extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors);
extern void bio_pair_release(struct bio_pair *dbio);
@@ -503,7 +505,6 @@ extern int bio_integrity_prep(struct bio *);
extern void bio_integrity_endio(struct bio *, int);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
-extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *);
extern int bioset_integrity_create(struct bio_set *, int);
extern void bioset_integrity_free(struct bio_set *);
@@ -547,12 +548,6 @@ static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-static inline void bio_integrity_split(struct bio *bio, struct bio_pair *bp,
- int sectors)
-{
- return;
-}
-
static inline void bio_integrity_advance(struct bio *bio,
unsigned int bytes_done)
{
--
1.7.9.3.327.g2980b
* [PATCH v3 09/16] block: Add bio_clone_kmalloc()
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I4aad3ae23640e847056f9de54d6cb75b0ddaa85b
---
drivers/block/osdblk.c | 3 +--
fs/bio.c | 13 +++++++++++++
fs/exofs/ore.c | 5 ++---
include/linux/bio.h | 1 +
4 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 87311eb..1bbc681 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -266,11 +266,10 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
struct bio *tmp, *new_chain = NULL, *tail = NULL;
while (old_chain) {
- tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ tmp = bio_clone_kmalloc(old_chain, gfpmask);
if (!tmp)
goto err_out;
- __bio_clone(tmp, old_chain);
tmp->bi_bdev = NULL;
gfpmask &= ~__GFP_WAIT;
tmp->bi_next = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index 62c4af81..d212ee2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -489,6 +489,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
}
EXPORT_SYMBOL(bio_clone);
+struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
+{
+ struct bio *b = bio_kmalloc(gfp_mask, bio->bi_max_vecs);
+
+ if (!b)
+ return NULL;
+
+ __bio_clone(b, bio);
+
+ return b;
+}
+EXPORT_SYMBOL(bio_clone_kmalloc);
+
/**
* bio_get_nr_vecs - return approx number of vecs
* @bdev: I/O target
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 49cf230..95c7264 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -820,8 +820,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct bio *bio;
if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_KERNEL,
- master_dev->bio->bi_max_vecs);
+ bio = bio_clone_kmalloc(master_dev->bio,
+ GFP_KERNEL);
if (unlikely(!bio)) {
ORE_DBGMSG(
"Failed to allocate BIO size=%u\n",
@@ -830,7 +830,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
- __bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
per_dev->offset = master_dev->offset;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 486233c..8f280f7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -220,6 +220,7 @@ extern int bio_phys_segments(struct request_queue *, struct bio *);
extern void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone(struct bio *, gfp_t);
+struct bio *bio_clone_kmalloc(struct bio *, gfp_t);
extern void bio_init(struct bio *);
extern void bio_reset(struct bio *);
--
1.7.9.3.327.g2980b
* [PATCH v3 10/16] block: Add bio_clone_bioset()
From: Kent Overstreet @ 2012-05-25 20:25 UTC
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
This consolidates some code, and will help in a later patch changing how
bio cloning works.
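After this patch bio_clone() is just a thin wrapper (taken from the hunks
below), and callers with a private bioset clone directly from it:

    struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
    {
            return bio_clone_bioset(bio, gfp_mask, fs_bio_set);
    }

    /* e.g. dm, cloning out of the mapped device's bioset: */
    clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs);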
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: If6cfd840db9cf90a798c6a5862e3314465c83000
---
block/blk-core.c | 8 +-------
drivers/md/dm.c | 4 ++--
drivers/md/md.c | 20 +-------------------
fs/bio.c | 16 ++++++++++++----
include/linux/bio.h | 1 +
5 files changed, 17 insertions(+), 32 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 1f61b74..91617eb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2660,16 +2660,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
blk_rq_init(NULL, rq);
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+ bio = bio_clone_bioset(bio_src, gfp_mask, bs);
if (!bio)
goto free_and_out;
- __bio_clone(bio, bio_src);
-
- if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, gfp_mask, bs))
- goto free_and_out;
-
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4014696..3f3c26e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1101,8 +1101,8 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
- clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
- __bio_clone(clone, ci->bio);
+ clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs);
+
if (len) {
clone->bi_sector = ci->sector;
clone->bi_size = to_bytes(len);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a5a524e..508ae81 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -173,28 +173,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
struct mddev *mddev)
{
- struct bio *b;
-
if (!mddev || !mddev->bio_set)
return bio_clone(bio, gfp_mask);
- b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, mddev->bio_set);
- if (!b)
- return NULL;
-
- __bio_clone(b, bio);
- if (bio_integrity(bio)) {
- int ret;
-
- ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
-
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
-
- return b;
+ return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);
diff --git a/fs/bio.c b/fs/bio.c
index d212ee2..a62b5b6 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -459,15 +459,17 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
EXPORT_SYMBOL(__bio_clone);
/**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
* @bio: bio to clone
* @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
*
* Like __bio_clone, only also allocates the returned bio
*/
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+ struct bio_set *bs)
{
- struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs);
+ struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
if (!b)
return NULL;
@@ -477,7 +479,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
if (bio_integrity(bio)) {
int ret;
- ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+ ret = bio_integrity_clone(b, bio, gfp_mask, bs);
if (ret < 0) {
bio_put(b);
@@ -487,6 +489,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
+EXPORT_SYMBOL(bio_clone_bioset);
+
+struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, fs_bio_set);
+}
EXPORT_SYMBOL(bio_clone);
struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8f280f7..0880a71 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -219,6 +219,7 @@ struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
extern void __bio_clone(struct bio *, struct bio *);
+extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
extern struct bio *bio_clone(struct bio *, gfp_t);
struct bio *bio_clone_kmalloc(struct bio *, gfp_t);
--
1.7.9.3.327.g2980b
^ permalink raw reply related [flat|nested] 52+ messages in thread
* [PATCH v3 11/16] block: Only clone bio vecs that are in use
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (9 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 10/16] block: Add bio_clone_bioset() Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
2012-05-25 20:25 ` [PATCH v3 12/16] Closures Kent Overstreet
` (4 subsequent siblings)
15 siblings, 0 replies; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
bcache creates large bios internally, and then splits them according to
the device requirements before it sends them down. If a lower level
device tries to clone the bio, and the original bio had more than
BIO_MAX_PAGES, the clone will fail unnecessarily.
We can fix this by only cloning the bio vecs that are actually in use.
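Roughly, the effect on the clone path is the following (sketch only; gfp_mask and bs stand for whatever the caller already passes, and the memcpy mirrors what __bio_clone() does after this patch):

	/* before: sized by the source bio's capacity, which bcache may have
	 * grown past BIO_MAX_PAGES, so the allocation can fail */
	clone = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);

	/* after: sized by the bvecs actually in use, starting at bi_idx */
	clone = bio_alloc_bioset(gfp_mask, bio_segments(bio), bs);
	memcpy(clone->bi_io_vec, bio_iovec(bio),
	       bio_segments(bio) * sizeof(struct bio_vec));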
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I71971827a623ce90626fa4a09b07f95f875ad890
---
drivers/block/rbd.c | 2 +-
drivers/md/dm.c | 5 ++---
fs/bio.c | 13 +++++++------
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 11777b4..c1f8a8fe 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -729,7 +729,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
}
while (old_chain && (total < len)) {
- tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ tmp = bio_kmalloc(gfpmask, bio_segments(old_chain));
if (!tmp)
goto err_out;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3f3c26e..193fb19 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1057,11 +1057,10 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
{
struct bio *clone;
- clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
+ clone = bio_alloc_bioset(GFP_NOIO, bv_count, bs);
__bio_clone(clone, bio);
clone->bi_sector = sector;
- clone->bi_idx = idx;
- clone->bi_vcnt = idx + bv_count;
+ clone->bi_vcnt = bv_count;
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
diff --git a/fs/bio.c b/fs/bio.c
index a62b5b6..a85239f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -441,8 +441,9 @@ EXPORT_SYMBOL(bio_phys_segments);
*/
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
- memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
- bio_src->bi_max_vecs * sizeof(struct bio_vec));
+ memcpy(bio->bi_io_vec,
+ bio_iovec(bio_src),
+ bio_segments(bio_src) * sizeof(struct bio_vec));
/*
* most users will be overriding ->bi_bdev with a new target,
@@ -451,10 +452,10 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
bio->bi_sector = bio_src->bi_sector;
bio->bi_bdev = bio_src->bi_bdev;
bio->bi_flags |= 1 << BIO_CLONED;
+ bio->bi_flags &= ~(1 << BIO_SEG_VALID);
bio->bi_rw = bio_src->bi_rw;
- bio->bi_vcnt = bio_src->bi_vcnt;
+ bio->bi_vcnt = bio_segments(bio_src);
bio->bi_size = bio_src->bi_size;
- bio->bi_idx = bio_src->bi_idx;
}
EXPORT_SYMBOL(__bio_clone);
@@ -469,7 +470,7 @@ EXPORT_SYMBOL(__bio_clone);
struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
struct bio_set *bs)
{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
+ struct bio *b = bio_alloc_bioset(gfp_mask, bio_segments(bio), bs);
if (!b)
return NULL;
@@ -499,7 +500,7 @@ EXPORT_SYMBOL(bio_clone);
struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
{
- struct bio *b = bio_kmalloc(gfp_mask, bio->bi_max_vecs);
+ struct bio *b = bio_kmalloc(gfp_mask, bio_segments(bio));
if (!b)
return NULL;
--
1.7.9.3.327.g2980b
^ permalink raw reply related [flat|nested] 52+ messages in thread
* [PATCH v3 12/16] Closures
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (10 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 11/16] block: Only clone bio vecs that are in use Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
[not found] ` <1337977539-16977-13-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-25 20:25 ` [PATCH v3 13/16] Make generic_make_request handle arbitrarily large bios Kent Overstreet
` (3 subsequent siblings)
15 siblings, 1 reply; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
Asynchronous refcounty thingies; they embed a refcount and a work
struct. Extensive documentation follows in include/linux/closure.h
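For a flavour of the API ahead of the full documentation, a minimal hypothetical user might look like the sketch below; struct my_op, my_op_done() and my_endio() are made-up names, and my_endio() is assumed to call closure_put() on the embedded closure:

	struct my_op {
		struct closure	cl;	/* embeds the refcount + work_struct */
		/* ... per-operation state ... */
	};

	static void my_op_done(struct closure *cl)
	{
		/* runs once every bio that took a ref has completed; use
		 * container_of(cl, struct my_op, cl) to get at the state */
		closure_return(cl);	/* i.e. continue_at(cl, NULL, NULL) */
	}

	static void my_op_start(struct my_op *op, struct bio *bio)
	{
		closure_init(&op->cl, NULL);	/* refcount starts at 1 */

		closure_get(&op->cl);		/* one ref per bio in flight */
		bio->bi_end_io = my_endio;	/* drops that ref on completion */
		generic_make_request(bio);

		continue_at(&op->cl, my_op_done, system_wq);
	}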
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
include/linux/closure.h | 658 +++++++++++++++++++++++++++++++++++++++++++++++
lib/Kconfig.debug | 8 +
lib/Makefile | 2 +-
lib/closure.c | 344 +++++++++++++++++++++++++
4 files changed, 1011 insertions(+), 1 deletion(-)
create mode 100644 include/linux/closure.h
create mode 100644 lib/closure.c
diff --git a/include/linux/closure.h b/include/linux/closure.h
new file mode 100644
index 0000000..7cae8cf
--- /dev/null
+++ b/include/linux/closure.h
@@ -0,0 +1,658 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns from the calling function.
+ * There's good reason for this.
+ *
+ * To use closures asynchronously safely, they must always have a refcount while
+ * they are running, owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_end_io = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_end_io = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If the closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e., in a bio->bi_end_io function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * For a closure to wait on an arbitrary event, we need to introduce waitlists:
+ *
+ * struct closure_waitlist list;
+ * closure_wait_event(list, cl, condition);
+ * closure_wake_up(wait_list);
+ *
+ * These work analogously to wait_event() and wake_up() - except that instead of
+ * operating on the current thread (for wait_event()) and lists of threads, they
+ * operate on an explicit closure and lists of closures.
+ *
+ * Because it's a closure we can now wait either synchronously or
+ * asynchronously. closure_wait_event() returns the current value of the
+ * condition, and if it returned false continue_at() or closure_sync() can be
+ * used to wait for it to become true.
+ *
+ * It's useful for waiting on things when you can't sleep in the context in
+ * which you must check the condition (perhaps a spinlock held, or you might be
+ * beneath generic_make_request() - in which case you can't sleep on IO).
+ *
+ * closure_wait_event() will wait either synchronously or asynchronously,
+ * depending on whether the closure is in blocking mode or not. You can pick a
+ * mode explicitly with closure_wait_event_sync() and
+ * closure_wait_event_async(), which do just what you might expect.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ *
+ * Locking:
+ *
+ * Closures are based on work items but they can be thought of as more like
+ * threads - in that like threads and unlike work items they have a well
+ * defined lifetime; they are created (with closure_init()) and eventually
+ * complete after a continue_at(cl, NULL, NULL).
+ *
+ * Suppose you've got some larger structure with a closure embedded in it that's
+ * used for periodically doing garbage collection. You only want one garbage
+ * collection happening at a time, so the natural thing to do is protect it with
+ * a lock. However, it's difficult to use a lock protecting a closure correctly
+ * because the unlock should come after the last continue_at() (additionally, if
+ * you're using the closure asynchronously a mutex won't work since a mutex has
+ * to be unlocked by the same process that locked it).
+ *
+ * So to make it less error prone and more efficient, we also have the ability
+ * to use closures as locks:
+ *
+ * closure_init_unlocked();
+ * closure_trylock();
+ *
+ * That's all we need for trylock() - the last closure_put() implicitly unlocks
+ * it for you. But for closure_lock(), we also need a wait list:
+ *
+ * struct closure_with_waitlist frobnicator_cl;
+ *
+ * closure_init_unlocked(&frobnicator_cl);
+ * closure_lock(&frobnicator_cl);
+ *
+ * A closure_with_waitlist embeds a closure and a wait list - much like struct
+ * delayed_work embeds a work item and a timer_list. The important thing is, use
+ * it exactly like you would a regular closure and closure_put() will magically
+ * handle everything for you.
+ *
+ * We've got closures that embed timers, too. They're called, appropriately
+ * enough:
+ * struct closure_with_timer;
+ *
+ * This gives you access to closure_sleep(). It takes a refcount for a specified
+ * number of jiffies - you could then call closure_sync() (for a slightly
+ * convoluted version of msleep()) or continue_at() - which gives you the same
+ * effect as using a delayed work item, except you can reuse the work_struct
+ * already embedded in struct closure.
+ *
+ * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
+ * probably expect, if you happen to need the features of both. (You don't
+ * really want to know how all this is implemented, but if I've done my job
+ * right you shouldn't have to care).
+ */
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+ struct llist_head list;
+};
+
+enum closure_type {
+ TYPE_closure = 0,
+ TYPE_closure_with_waitlist = 1,
+ TYPE_closure_with_timer = 2,
+ TYPE_closure_with_waitlist_and_timer = 3,
+ MAX_CLOSURE_TYPE = 3,
+};
+
+enum closure_state_bits {
+ CLOSURE_REMAINING_END = 20,
+ CLOSURE_BLOCKING_GUARD = 20,
+ CLOSURE_BLOCKING_BIT = 21,
+ CLOSURE_WAITING_GUARD = 22,
+ CLOSURE_WAITING_BIT = 23,
+ CLOSURE_SLEEPING_GUARD = 24,
+ CLOSURE_SLEEPING_BIT = 25,
+ CLOSURE_TIMER_GUARD = 26,
+ CLOSURE_TIMER_BIT = 27,
+ CLOSURE_RUNNING_GUARD = 28,
+ CLOSURE_RUNNING_BIT = 29,
+ CLOSURE_STACK_GUARD = 30,
+ CLOSURE_STACK_BIT = 31,
+};
+
+enum closure_state {
+ /*
+ * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
+ * waiting asynchronously
+ *
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+ * - indicates that cl->task is valid and closure_put() may wake it up.
+ * Only set or cleared by the thread that owns the closure.
+ *
+ * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
+ * has an outstanding timer. Must be set by the thread that owns the
+ * closure, and cleared by the timer function when the timer goes off.
+ *
+ * The rest are for debugging and don't affect behaviour:
+ *
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs the next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ *
+ * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
+ * closure with this flag set
+ */
+
+ CLOSURE_BLOCKING = (1 << CLOSURE_BLOCKING_BIT),
+ CLOSURE_WAITING = (1 << CLOSURE_WAITING_BIT),
+ CLOSURE_SLEEPING = (1 << CLOSURE_SLEEPING_BIT),
+ CLOSURE_TIMER = (1 << CLOSURE_TIMER_BIT),
+ CLOSURE_RUNNING = (1 << CLOSURE_RUNNING_BIT),
+ CLOSURE_STACK = (1 << CLOSURE_STACK_BIT),
+};
+
+#define CLOSURE_GUARD_MASK \
+ ((1 << CLOSURE_BLOCKING_GUARD)| \
+ (1 << CLOSURE_WAITING_GUARD)| \
+ (1 << CLOSURE_SLEEPING_GUARD)| \
+ (1 << CLOSURE_TIMER_GUARD)| \
+ (1 << CLOSURE_RUNNING_GUARD)| \
+ (1 << CLOSURE_STACK_GUARD))
+
+#define CLOSURE_REMAINING_MASK ((1 << CLOSURE_REMAINING_END) - 1)
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct task_struct *task;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+ atomic_t remaining;
+
+ enum closure_type type;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD 0xc054dead
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+
+ unsigned magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+struct closure_with_waitlist {
+ struct closure cl;
+ struct closure_waitlist wait;
+};
+
+struct closure_with_timer {
+ struct closure cl;
+ struct timer_list timer;
+};
+
+struct closure_with_waitlist_and_timer {
+ struct closure cl;
+ struct closure_waitlist wait;
+ struct timer_list timer;
+};
+
+extern unsigned invalid_closure_type(void);
+
+#define __CLOSURE_TYPE(cl, _t) \
+ __builtin_types_compatible_p(typeof(cl), struct _t) \
+ ? TYPE_ ## _t : \
+
+#define __closure_type(cl) \
+( \
+ __CLOSURE_TYPE(cl, closure) \
+ __CLOSURE_TYPE(cl, closure_with_waitlist) \
+ __CLOSURE_TYPE(cl, closure_with_timer) \
+ __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \
+ invalid_closure_type() \
+)
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void closure_queue(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void closure_sync(struct closure *cl);
+
+bool closure_trylock(struct closure *cl, struct closure *parent);
+void __closure_lock(struct closure *cl, struct closure *parent,
+ struct closure_waitlist *wait_list);
+
+void do_closure_timer_init(struct closure *cl);
+bool __closure_sleep(struct closure *cl, unsigned long delay,
+ struct timer_list *timer);
+void __closure_flush(struct closure *cl, struct timer_list *timer);
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline bool closure_is_stopped(struct closure *cl)
+{
+ return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
+}
+
+static inline bool closure_is_unlocked(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) == -1;
+}
+
+static inline void do_closure_init(struct closure *cl, struct closure *parent,
+ bool running)
+{
+ switch (cl->type) {
+ case TYPE_closure_with_timer:
+ case TYPE_closure_with_waitlist_and_timer:
+ do_closure_timer_init(cl);
+ default:
+ break;
+ }
+
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ if (running) {
+ closure_debug_create(cl);
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+ } else
+ atomic_set(&cl->remaining, -1);
+
+ closure_set_ip(cl);
+}
+
+/*
+ * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
+ * the result of __closure_type() is thrown away, it's used merely for type
+ * checking.
+ */
+#define __to_internal_closure(cl) \
+({ \
+ BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \
+ (struct closure *) cl; \
+})
+
+#define closure_init_type(cl, parent, running) \
+do { \
+ struct closure *_cl = __to_internal_closure(cl); \
+ _cl->type = __closure_type(*(cl)); \
+ do_closure_init(_cl, parent, running); \
+} while (0)
+
+/**
+ * __closure_init() - Initialize a closure, skipping the memset()
+ *
+ * May be used instead of closure_init() when memory has already been zeroed.
+ */
+#define __closure_init(cl, parent) \
+ closure_init_type(cl, parent, true)
+
+/**
+ * closure_init() - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+#define closure_init(cl, parent) \
+do { \
+ memset((cl), 0, sizeof(*(cl))); \
+ __closure_init(cl, parent); \
+} while (0)
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
+ CLOSURE_BLOCKING|CLOSURE_STACK);
+}
+
+/**
+ * closure_init_unlocked() - Initialize a closure but leave it unlocked.
+ * @cl: closure to initialize
+ *
+ * For when the closure will be used as a lock. The closure may not be used
+ * until after a closure_lock() or closure_trylock().
+ */
+#define closure_init_unlocked(cl) \
+do { \
+ memset((cl), 0, sizeof(*(cl))); \
+ closure_init_type(cl, NULL, false); \
+} while (0)
+
+/**
+ * closure_lock() - lock and initialize a closure.
+ * @cl: the closure to lock
+ * @parent: the new parent for this closure
+ *
+ * The closure must be of one of the types that has a waitlist (otherwise we
+ * wouldn't be able to sleep on contention).
+ *
+ * @parent has exactly the same meaning as in closure_init(); if non null, the
+ * closure will take a reference on @parent which will be released when it is
+ * unlocked.
+ */
+#define closure_lock(cl, parent) \
+ __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
+
+/**
+ * closure_sleep() - asynchronous sleep
+ * @cl: the closure that will sleep
+ * @delay: the delay in jiffies
+ *
+ * Takes a refcount on @cl which will be released after @delay jiffies; this may
+ * be used to have a function run after a delay with continue_at(), or
+ * closure_sync() may be used for a convoluted version of msleep().
+ */
+#define closure_sleep(cl, delay) \
+ __closure_sleep(__to_internal_closure(cl), delay, &(cl)->timer)
+
+#define closure_flush(cl) \
+ __closure_flush(__to_internal_closure(cl), &(cl)->timer)
+
+#define closure_flush_sync(cl) \
+ __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
+
+static inline void __closure_end_sleep(struct closure *cl)
+{
+ __set_current_state(TASK_RUNNING);
+
+ if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+ atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void __closure_start_sleep(struct closure *cl)
+{
+ closure_set_ip(cl);
+ cl->task = current;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+/**
+ * closure_blocking() - returns true if the closure is in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ */
+static inline bool closure_blocking(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
+}
+
+/**
+ * set_closure_blocking() - put a closure in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ *
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void set_closure_blocking(struct closure *cl)
+{
+ if (!closure_blocking(cl))
+ atomic_add(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/*
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void clear_closure_blocking(struct closure *cl)
+{
+ if (closure_blocking(cl))
+ atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/**
+ * closure_wake_up() - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+ smp_mb();
+ __closure_wake_up(list);
+}
+
+/*
+ * Wait on an event, synchronously or asynchronously - analogous to wait_event()
+ * but for closures.
+ *
+ * The loop is oddly structured so as to avoid a race; we must check the
+ * condition again after we've added ourself to the waitlist. We know if we were
+ * already on the waitlist because closure_wait() returns false; thus, we only
+ * schedule or break if closure_wait() returns false. If it returns true, we
+ * just loop again - rechecking the condition.
+ *
+ * The __closure_wake_up() is necessary because we may race with the event
+ * becoming true; i.e. we see event false -> wait -> recheck condition, but the
+ * thread that made the event true may have called closure_wake_up() before we
+ * added ourself to the wait list.
+ *
+ * We have to call closure_sync() at the end instead of just
+ * __closure_end_sleep() because a different thread might've called
+ * closure_wake_up() before us and gotten preempted before they dropped the
+ * refcount on our closure. If this was a stack allocated closure, that would be
+ * bad.
+ */
+#define __closure_wait_event(list, cl, condition, _block) \
+({ \
+ bool block = _block; \
+ typeof(condition) ret; \
+ \
+ while (1) { \
+ ret = (condition); \
+ if (ret) { \
+ __closure_wake_up(list); \
+ if (block) \
+ closure_sync(cl); \
+ \
+ break; \
+ } \
+ \
+ if (block) \
+ __closure_start_sleep(cl); \
+ \
+ if (!closure_wait(list, cl)) { \
+ if (!block) \
+ break; \
+ \
+ schedule(); \
+ } \
+ } \
+ \
+ ret; \
+})
+
+/**
+ * closure_wait_event() - wait on a condition, synchronously or asynchronously.
+ * @list: the wait list to wait on
+ * @cl: the closure that is doing the waiting
+ * @condition: a C expression for the event to wait for
+ *
+ * If the closure is in blocking mode, sleeps until the @condition evaluates to
+ * true - exactly like wait_event().
+ *
+ * If the closure is not in blocking mode, waits asynchronously; if the
+ * condition is currently false the @cl is put onto @list and returns. @list
+ * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
+ * wait for another thread to wake up @list, which drops the refcount on @cl.
+ *
+ * Returns the value of @condition; @cl will be on @list iff @condition was
+ * false.
+ *
+ * closure_wake_up(@list) must be called after changing any variable that could
+ * cause @condition to become true.
+ */
+#define closure_wait_event(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, closure_blocking(cl))
+
+#define closure_wait_event_async(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, false)
+
+#define closure_wait_event_sync(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, true)
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ cl->fn = fn;
+ cl->wq = wq;
+ /* between atomic_dec() in closure_put() */
+ smp_mb__before_atomic_dec();
+}
+
+#define continue_at(_cl, _fn, _wq, ...) \
+do { \
+ BUG_ON(!(_cl) || object_is_on_stack(_cl)); \
+ closure_set_ip(_cl); \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
+ return __VA_ARGS__; \
+} while (0)
+
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6777153..2a486e0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -378,6 +378,14 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
help
Debug objects boot parameter default value
+config DEBUG_CLOSURES
+ bool "Debug closures"
+ select DEBUG_FS
+ ---help---
+ Keeps all active closures in a linked list and provides a debugfs
+ interface to list them, which makes it possible to see asynchronous
+ operations that get stuck.
+
config DEBUG_SLAB
bool "Debug slab memory allocations"
depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..35ad204 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@ lib-y += kobject.o klist.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \
- bsearch.o find_last_bit.o find_next_bit.o llist.o
+ bsearch.o find_last_bit.o find_next_bit.o llist.o closure.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
diff --git a/lib/closure.c b/lib/closure.c
new file mode 100644
index 0000000..f776b6f
--- /dev/null
+++ b/lib/closure.c
@@ -0,0 +1,344 @@
+
+#include <linux/closure.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+/*
+ * Closure like things
+ * See include/linux/closure.h for full documentation
+ */
+
+void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+ if (wq) {
+ INIT_WORK(&cl->work, cl->work.func);
+ BUG_ON(!queue_work(wq, &cl->work));
+ } else
+ cl->fn(cl);
+}
+EXPORT_SYMBOL_GPL(closure_queue);
+
+#define CL_FIELD(type, field) \
+ case TYPE_ ## type: \
+ return &container_of(cl, struct type, cl)->field
+
+static struct closure_waitlist *closure_waitlist(struct closure *cl)
+{
+ switch (cl->type) {
+ CL_FIELD(closure_with_waitlist, wait);
+ CL_FIELD(closure_with_waitlist_and_timer, wait);
+ default:
+ return NULL;
+ }
+}
+
+static struct timer_list *closure_timer(struct closure *cl)
+{
+ switch (cl->type) {
+ CL_FIELD(closure_with_timer, timer);
+ CL_FIELD(closure_with_waitlist_and_timer, timer);
+ default:
+ return NULL;
+ }
+}
+
+static void closure_put_after_sub(struct closure *cl, int r)
+{
+ int rem = r & CLOSURE_REMAINING_MASK;
+
+ BUG_ON(r & CLOSURE_GUARD_MASK);
+ /* CLOSURE_BLOCKING is the only flag that's allowed when r hits 0 */
+ BUG_ON(!rem && (r & ~CLOSURE_BLOCKING));
+
+ /* Must deliver precisely one wakeup */
+ if (rem == 1 && (r & CLOSURE_SLEEPING))
+ wake_up_process(cl->task);
+
+ if (!rem) {
+ if (cl->fn) {
+ /* CLOSURE_BLOCKING might be set - clear it */
+ atomic_set(&cl->remaining,
+ CLOSURE_REMAINING_INITIALIZER);
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ struct closure_waitlist *wait = closure_waitlist(cl);
+
+ closure_debug_destroy(cl);
+
+ atomic_set(&cl->remaining, -1);
+
+ if (wait)
+ closure_wake_up(wait);
+
+ if (parent)
+ closure_put(parent);
+ }
+ }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_sub);
+
+void closure_put(struct closure *cl)
+{
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_put);
+
+static void set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->waiting_on = f;
+#endif
+}
+
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+ struct llist_node *list;
+ struct closure *cl;
+ struct llist_node *reverse = NULL;
+
+ list = llist_del_all(&wait_list->list);
+
+ /* We first reverse the list to preserve FIFO ordering and fairness */
+
+ while (list) {
+ struct llist_node *t = list;
+ list = llist_next(list);
+
+ t->next = reverse;
+ reverse = t;
+ }
+
+ /* Then do the wakeups */
+
+ while (reverse) {
+ cl = container_of(reverse, struct closure, list);
+ reverse = llist_next(reverse);
+
+ set_waiting(cl, 0);
+ closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+EXPORT_SYMBOL_GPL(__closure_wake_up);
+
+bool closure_wait(struct closure_waitlist *list, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ set_waiting(cl, _RET_IP_);
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ llist_add(&cl->list, &list->list);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(closure_wait);
+
+/**
+ * closure_sync() - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+ while (1) {
+ __closure_start_sleep(cl);
+ closure_set_ret_ip(cl);
+
+ if ((atomic_read(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) == 1)
+ break;
+
+ schedule();
+ }
+
+ __closure_end_sleep(cl);
+}
+EXPORT_SYMBOL_GPL(closure_sync);
+
+/**
+ * closure_trylock() - try to acquire the closure, without waiting
+ * @cl: closure to lock
+ *
+ * Returns true if the closure was successfully locked.
+ */
+bool closure_trylock(struct closure *cl, struct closure *parent)
+{
+ if (atomic_cmpxchg(&cl->remaining, -1,
+ CLOSURE_REMAINING_INITIALIZER) != -1)
+ return false;
+
+ closure_set_ret_ip(cl);
+
+ smp_mb();
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ closure_debug_create(cl);
+ return true;
+}
+EXPORT_SYMBOL_GPL(closure_trylock);
+
+void __closure_lock(struct closure *cl, struct closure *parent,
+ struct closure_waitlist *wait_list)
+{
+ struct closure wait;
+ closure_init_stack(&wait);
+
+ while (1) {
+ if (closure_trylock(cl, parent))
+ return;
+
+ closure_wait_event_sync(wait_list, &wait,
+ atomic_read(&cl->remaining) == -1);
+ }
+}
+EXPORT_SYMBOL_GPL(__closure_lock);
+
+static void closure_sleep_timer_fn(unsigned long data)
+{
+ struct closure *cl = (struct closure *) data;
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+
+void do_closure_timer_init(struct closure *cl)
+{
+ struct timer_list *timer = closure_timer(cl);
+
+ init_timer(timer);
+ timer->data = (unsigned long) cl;
+ timer->function = closure_sleep_timer_fn;
+}
+EXPORT_SYMBOL_GPL(do_closure_timer_init);
+
+bool __closure_sleep(struct closure *cl, unsigned long delay,
+ struct timer_list *timer)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
+ return false;
+
+ BUG_ON(timer_pending(timer));
+
+ timer->expires = jiffies + delay;
+
+ atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
+ add_timer(timer);
+ return true;
+}
+EXPORT_SYMBOL_GPL(__closure_sleep);
+
+void __closure_flush(struct closure *cl, struct timer_list *timer)
+{
+ if (del_timer(timer))
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush);
+
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush_sync);
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_ALIVE;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_add(&cl->all, &closure_list);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_DEAD;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_del(&cl->all);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ struct closure *cl;
+ spin_lock_irq(&closure_list_lock);
+
+ list_for_each_entry(cl, &closure_list, all) {
+ int r = atomic_read(&cl->remaining);
+
+ seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ cl, (void *) cl->ip, cl->fn, cl->parent,
+ r & CLOSURE_REMAINING_MASK);
+
+ seq_printf(f, "%s%s%s%s%s%s\n",
+ test_bit(WORK_STRUCT_PENDING,
+ work_data_bits(&cl->work)) ? "Q" : "",
+ r & CLOSURE_RUNNING ? "R" : "",
+ r & CLOSURE_BLOCKING ? "B" : "",
+ r & CLOSURE_STACK ? "S" : "",
+ r & CLOSURE_SLEEPING ? "Sl" : "",
+ r & CLOSURE_TIMER ? "T" : "");
+
+ if (r & CLOSURE_WAITING)
+ seq_printf(f, " W %pF\n",
+ (void *) cl->waiting_on);
+
+ seq_printf(f, "\n");
+ }
+
+ spin_unlock_irq(&closure_list_lock);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+int __init closure_debug_init(void)
+{
+ debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+ return 0;
+}
+
+module_init(closure_debug_init);
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
+MODULE_LICENSE("GPL");
--
1.7.9.3.327.g2980b
^ permalink raw reply related [flat|nested] 52+ messages in thread
* [PATCH v3 13/16] Make generic_make_request handle arbitrarily large bios
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (11 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 12/16] Closures Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
[not found] ` <1337977539-16977-14-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-25 20:25 ` [PATCH v3 14/16] Gut bio_add_page() Kent Overstreet
` (2 subsequent siblings)
15 siblings, 1 reply; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: axboe, yehuda, Kent Overstreet, mpatocka, vgoyal, bharrosh, tj,
sage, agk, drbd-dev
The way the block layer is currently written, it goes to great lengths
to avoid having to split bios; upper layer code (such as bio_add_page())
checks what the underlying device can handle and tries to always create
bios that don't need to be split.
But this approach becomes unwieldy and eventually breaks down with
stacked devices and devices with dynamic limits, and it adds a lot of
complexity. If the block layer could split bios as needed, we could
eliminate a lot of complexity elsewhere - particularly in stacked
drivers. Code that creates bios can then create whatever size bios are
convenient, and more importantly stacked drivers don't have to deal with
both their own bio size limitations and the limitations of the
(potentially multiple) devices underneath them.
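As an illustration of the intended end state (not code from this patch), a submitter can build a bio of whatever size is convenient and hand it straight to the block layer; bdev, sector, pages, nr_pages and my_endio are hypothetical, and error handling is elided:

	struct bio *bio = bio_kmalloc(GFP_NOIO, nr_pages);
	int i;

	bio->bi_bdev   = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = my_endio;

	for (i = 0; i < nr_pages; i++)
		bio_add_page(bio, pages[i], PAGE_SIZE, 0);

	/* no queue_max_sectors()/merge_bvec_fn() checks needed here any more;
	 * generic_make_request() splits the bio if the device requires it */
	generic_make_request(bio);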
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Change-Id: I53ed182e8c0a5fe192b040b1fdedba205fe953d5
---
block/blk-core.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/bio.c | 41 +++++++++++++++++
include/linux/bio.h | 7 +++
include/linux/blkdev.h | 3 ++
4 files changed, 169 insertions(+)
diff --git a/block/blk-core.c b/block/blk-core.c
index 91617eb..19145ab 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -29,6 +29,7 @@
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
+#include <linux/closure.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -52,6 +53,12 @@ static struct kmem_cache *request_cachep;
struct kmem_cache *blk_requestq_cachep;
/*
+ * For bio_split_hook
+ */
+static struct kmem_cache *bio_split_cache;
+static struct workqueue_struct *bio_split_wq;
+
+/*
* Controlling structure to kblockd
*/
static struct workqueue_struct *kblockd_workqueue;
@@ -487,6 +494,14 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (q->id < 0)
goto fail_q;
+ q->bio_split_hook = mempool_create_slab_pool(4, bio_split_cache);
+ if (!q->bio_split_hook)
+ goto fail_split_hook;
+
+ q->bio_split = bioset_create(4, 0);
+ if (!q->bio_split)
+ goto fail_split;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
@@ -526,6 +541,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_split:
+ bioset_free(q->bio_split);
+fail_split_hook:
+ mempool_destroy(q->bio_split_hook);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
@@ -1493,6 +1512,90 @@ static inline bool should_fail_request(struct hd_struct *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
+struct bio_split_hook {
+ struct closure cl;
+ struct request_queue *q;
+ struct bio *bio;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static void bio_submit_split_done(struct closure *cl)
+{
+ struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+ s->bio->bi_end_io = s->bi_end_io;
+ s->bio->bi_private = s->bi_private;
+ bio_endio(s->bio, 0);
+
+ closure_debug_destroy(&s->cl);
+ mempool_free(s, s->q->bio_split_hook);
+}
+
+static void bio_submit_split_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+ if (error)
+ clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static void __bio_submit_split(struct closure *cl)
+{
+ struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+ struct bio *bio = s->bio, *n;
+
+ do {
+ /*
+ * If we're running underneath generic_make_request(), we risk
+ * deadlock if we allocate multiple bios from the mempool.
+ *
+ * To avoid this, bio_split() masks out __GFP_WAIT when
+ * current->bio_list != NULL; if it fails, the continue_at()
+ * just punts us to a workqueue, where we can safely retry the
+ * allocation using the mempool.
+ */
+ n = bio_split(bio, bio_max_sectors(bio),
+ GFP_NOIO, s->q->bio_split);
+ if (!n)
+ continue_at(cl, __bio_submit_split, bio_split_wq);
+
+ closure_get(cl);
+ generic_make_request(n);
+ } while (n != bio);
+
+ continue_at(cl, bio_submit_split_done, NULL);
+}
+
+static bool bio_submit_split(struct bio *bio)
+{
+ struct bio_split_hook *s;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ if (!bio_has_data(bio) || !q || !q->bio_split_hook ||
+ bio_sectors(bio) <= bio_max_sectors(bio))
+ return false;
+
+ s = mempool_alloc(q->bio_split_hook, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+ s->bio = bio;
+ s->q = q;
+ s->bi_end_io = bio->bi_end_io;
+ s->bi_private = bio->bi_private;
+
+ bio_get(bio);
+ bio->bi_end_io = bio_submit_split_endio;
+ bio->bi_private = &s->cl;
+
+ __bio_submit_split(&s->cl);
+ return true;
+}
+
/*
* Check whether this bio extends beyond the end of the device.
*/
@@ -1646,6 +1749,14 @@ void generic_make_request(struct bio *bio)
* it is non-NULL, then a make_request is active, and new requests
* should be added at the tail
*/
+
+ /*
+ * If the device can't accept arbitrary sized bios, check if we
+ * need to split:
+ */
+ if (bio_submit_split(bio))
+ return;
+
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
return;
@@ -2892,11 +3003,18 @@ int __init blk_dev_init(void)
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
+ bio_split_wq = alloc_workqueue("bio_split", WQ_MEM_RECLAIM, 0);
+ if (!bio_split_wq)
+ panic("Failed to create bio_split wq\n");
+
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0, SLAB_PANIC, NULL);
blk_requestq_cachep = kmem_cache_create("blkdev_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ bio_split_cache = kmem_cache_create("bio_split_hook",
+ sizeof(struct bio_split_hook), 0, SLAB_PANIC, NULL);
+
return 0;
}
diff --git a/fs/bio.c b/fs/bio.c
index a85239f..e4d54b2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -430,6 +430,47 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
}
EXPORT_SYMBOL(bio_phys_segments);
+unsigned __bio_max_sectors(struct bio *bio, struct block_device *bdev,
+ sector_t sector)
+{
+ unsigned ret = bio_sectors(bio);
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct bio_vec *bv, *end = bio_iovec(bio) +
+ min_t(int, bio_segments(bio), queue_max_segments(q));
+
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bdev,
+ .bi_sector = sector,
+ .bi_size = 0,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (bio_segments(bio) > queue_max_segments(q) ||
+ q->merge_bvec_fn) {
+ ret = 0;
+
+ for (bv = bio_iovec(bio); bv < end; bv++) {
+ if (q->merge_bvec_fn &&
+ q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
+ break;
+
+ ret += bv->bv_len >> 9;
+ bvm.bi_size += bv->bv_len;
+ }
+
+ if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9)
+ return (BIO_MAX_PAGES * PAGE_SIZE) >> 9;
+ }
+
+ ret = min(ret, queue_max_sectors(q));
+
+ WARN_ON(!ret);
+ ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__bio_max_sectors);
+
/**
* __bio_clone - clone a bio
* @bio: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0880a71..758cd56 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -218,6 +218,13 @@ extern void bio_endio(struct bio *, int);
struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
+unsigned __bio_max_sectors(struct bio *, struct block_device *, sector_t);
+
+static inline unsigned bio_max_sectors(struct bio *bio)
+{
+ return __bio_max_sectors(bio, bio->bi_bdev, bio->bi_sector);
+}
+
extern void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
extern struct bio *bio_clone(struct bio *, gfp_t);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2aa2466..464adb7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -399,6 +399,9 @@ struct request_queue {
/* Throttle data */
struct throtl_data *td;
#endif
+
+ mempool_t *bio_split_hook;
+ struct bio_set *bio_split;
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
--
1.7.9.3.327.g2980b
^ permalink raw reply related [flat|nested] 52+ messages in thread
* [PATCH v3 14/16] Gut bio_add_page()
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (12 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 13/16] Make generic_make_request handle arbitrarily large bios Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
[not found] ` <1337977539-16977-15-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-25 20:25 ` [PATCH v3 15/16] md: Kill merge_bvec_fn()s Kent Overstreet
2012-05-25 20:25 ` [PATCH v3 16/16] dm: Kill merge_bvec_fn() Kent Overstreet
15 siblings, 1 reply; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: Kent Overstreet, tj, axboe, agk, neilb, drbd-dev, bharrosh,
vgoyal, mpatocka, sage, yehuda
Since generic_make_request() can now handle arbitrary size bios, all we
have to do is make sure the bvec array doesn't overflow.
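In practice that means bio_add_page() can now only fail when the bvec array is full, so a caller's fill loop reduces to roughly the following sketch, where submit_and_realloc() is a hypothetical helper that submits the current bio and allocates a fresh one:

	while (size) {
		unsigned len = min_t(unsigned, size, PAGE_SIZE);

		if (!bio_add_page(bio, page, len, 0)) {
			/* bi_vcnt == bi_max_vecs: send this bio on its way */
			bio = submit_and_realloc(bio);
			continue;
		}

		size -= len;
		page++;
	}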
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
fs/bio.c | 133 ++++++++++++--------------------------------------------------
1 file changed, 26 insertions(+), 107 deletions(-)
diff --git a/fs/bio.c b/fs/bio.c
index e4d54b2..b0c2944 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -570,12 +570,22 @@ int bio_get_nr_vecs(struct block_device *bdev)
}
EXPORT_SYMBOL(bio_get_nr_vecs);
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned short max_sectors)
+/**
+ * bio_add_page - attempt to add page to bio
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ */
+int bio_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
{
- int retried_segments = 0;
- struct bio_vec *bvec;
+ struct bio_vec *bv;
/*
* cloned bio must not modify vec list
@@ -583,40 +593,17 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_size + len) >> 9) > max_sectors)
- return 0;
-
/*
* For filesystems with a blocksize smaller than the pagesize
* we will often be called with the same page as last time and
* a consecutive offset. Optimize this special case.
*/
if (bio->bi_vcnt > 0) {
- struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == prev->bv_page &&
- offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
- prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size - prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
+ bv = bio_iovec_idx(bio, bio->bi_vcnt - 1);
+ if (page == bv->bv_page &&
+ offset == bv->bv_offset + bv->bv_len) {
+ bv->bv_len += len;
goto done;
}
}
@@ -624,64 +611,17 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (bio->bi_vcnt >= bio->bi_max_vecs)
return 0;
- /*
- * we might lose a segment or two here, but rather that than
- * make this too complex.
- */
-
- while (bio->bi_phys_segments >= queue_max_segments(q)) {
-
- if (retried_segments)
- return 0;
-
- retried_segments = 1;
- blk_recount_segments(q, bio);
- }
-
- /*
- * setup the new entry, we might clear it again later if we
- * cannot add the page
- */
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
-
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
- bvec->bv_page = NULL;
- bvec->bv_len = 0;
- bvec->bv_offset = 0;
- return 0;
- }
- }
-
- /* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+ bv = bio_iovec_idx(bio, bio->bi_vcnt);
+ bv->bv_page = page;
+ bv->bv_len = len;
+ bv->bv_offset = offset;
bio->bi_vcnt++;
- bio->bi_phys_segments++;
- done:
+done:
bio->bi_size += len;
return len;
}
+EXPORT_SYMBOL(bio_add_page);
/**
* bio_add_pc_page - attempt to add page to bio
@@ -701,31 +641,10 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
+ return bio_add_page(bio, page, len, offset);
}
EXPORT_SYMBOL(bio_add_pc_page);
-/**
- * bio_add_page - attempt to add page to bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- */
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_page);
-
struct bio_map_data {
struct bio_vec *iovecs;
struct sg_iovec *sgvecs;
--
1.7.9.3.327.g2980b
^ permalink raw reply related [flat|nested] 52+ messages in thread
* [PATCH v3 15/16] md: Kill merge_bvec_fn()s
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (13 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 14/16] Gut bio_add_page() Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
2012-05-25 20:25 ` [PATCH v3 16/16] dm: Kill merge_bvec_fn() Kent Overstreet
15 siblings, 0 replies; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: Kent Overstreet, tj, axboe, agk, neilb, drbd-dev, bharrosh,
vgoyal, mpatocka, sage, yehuda
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/md/linear.c | 46 ------------------------------
drivers/md/raid0.c | 62 +----------------------------------------
drivers/md/raid1.c | 34 -----------------------
drivers/md/raid10.c | 77 +--------------------------------------------------
4 files changed, 2 insertions(+), 217 deletions(-)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7c6cafd..86e5405 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,51 +52,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
-/**
- * linear_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can take at this offset
- */
-static int linear_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- struct dev_info *dev0;
- unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int maxbytes = biovec->bv_len;
- struct request_queue *subq;
-
- rcu_read_lock();
- dev0 = which_dev(mddev, sector);
- maxsectors = dev0->end_sector - sector;
- subq = bdev_get_queue(dev0->rdev->bdev);
- if (subq->merge_bvec_fn) {
- bvm->bi_bdev = dev0->rdev->bdev;
- bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
- maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
- biovec));
- }
- rcu_read_unlock();
-
- if (maxsectors < bio_sectors)
- maxsectors = 0;
- else
- maxsectors -= bio_sectors;
-
- if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
- return maxbytes;
-
- if (maxsectors > (maxbytes >> 9))
- return maxbytes;
- else
- return maxsectors << 9;
-}
-
static int linear_congested(void *data, int bits)
{
struct mddev *mddev = data;
@@ -209,7 +164,6 @@ static int linear_run (struct mddev *mddev)
mddev->private = conf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
- blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3469adf..5d463ef 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -340,59 +340,6 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_div(sector, zone->nb_dev)];
}
-/**
- * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- */
-static int raid0_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- struct r0conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- sector_t sector_offset = sector;
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
- struct strip_zone *zone;
- struct md_rdev *rdev;
- struct request_queue *subq;
-
- if (is_power_of_2(chunk_sectors))
- max = (chunk_sectors - ((sector & (chunk_sectors-1))
- + bio_sectors)) << 9;
- else
- max = (chunk_sectors - (sector_div(sector, chunk_sectors)
- + bio_sectors)) << 9;
- if (max < 0)
- max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- if (max < biovec->bv_len)
- /* too small already, no need to check further */
- return max;
- if (!conf->has_merge_bvec)
- return max;
-
- /* May need to check subordinate device */
- sector = sector_offset;
- zone = find_zone(mddev->private, &sector_offset);
- rdev = map_sector(mddev, zone, sector, &sector_offset);
- subq = bdev_get_queue(rdev->bdev);
- if (subq->merge_bvec_fn) {
- bvm->bi_bdev = rdev->bdev;
- bvm->bi_sector = sector_offset + zone->dev_start +
- rdev->data_offset;
- return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
- } else
- return max;
-}
-
static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
sector_t array_sectors = 0;
@@ -454,7 +401,6 @@ static int raid0_run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
ret = md_integrity_register(mddev);
@@ -508,13 +454,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
sector_t sector = bio->bi_sector;
struct bio_pair *bp;
- /* Sanity check -- queue functions should prevent this happening */
- if (bio->bi_vcnt != 1 ||
- bio->bi_idx != 0)
- goto bad_map;
- /* This is a one page bio that upper layers
- * refuse to split for us, so we need to split it.
- */
+
if (likely(is_power_of_2(chunk_sects)))
bp = bio_pair_split(bio, chunk_sects - (sector &
(chunk_sects-1)));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 15dd59b..78f9dcd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -615,39 +615,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- struct r1conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- int disk;
- rcu_read_lock();
- for (disk = 0; disk < conf->raid_disks * 2; disk++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = sector +
- rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-
-}
-
int md_raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@@ -2705,7 +2672,6 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
- blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
}
ret = md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0062326..a6f14e7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -579,77 +579,6 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
return (vchunk << conf->chunk_shift) + offset;
}
-/**
- * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @q: request queue
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- * This requires checking for end-of-chunk if near_copies != raid_disks,
- * and for subordinate merge_bvec_fns if merge_check_needed.
- */
-static int raid10_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mddev *mddev = q->queuedata;
- struct r10conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
-
- if (conf->near_copies < conf->raid_disks) {
- max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << 9;
- if (max < 0)
- /* bio_add cannot handle a negative return */
- max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- } else
- max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- struct r10bio r10_bio;
- int s;
- r10_bio.sector = sector;
- raid10_find_phys(conf, &r10_bio);
- rcu_read_lock();
- for (s = 0; s < conf->copies; s++) {
- int disk = r10_bio.devs[s].devnum;
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio.devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- rdev = rcu_dereference(conf->mirrors[disk].replacement);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio.devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-}
-
/*
* This routine returns the disk from which the requested read should
* be done. There is a per-array 'next expected sequential IO' sector
@@ -994,9 +923,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
return;
}
- /* If this request crosses a chunk boundary, we need to
- * split it. This will only happen for 1 PAGE (or less) requests.
- */
+ /* If this request crosses a chunk boundary, we need to split it. */
if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
> chunk_sects &&
conf->near_copies < conf->raid_disks)) {
@@ -3380,8 +3307,6 @@ static int run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-
if (md_integrity_register(mddev))
goto out_free_conf;
--
1.7.9.3.327.g2980b
* [PATCH v3 16/16] dm: Kill merge_bvec_fn()
2012-05-25 20:25 [PATCH v3 00/16] Block cleanups Kent Overstreet
` (14 preceding siblings ...)
2012-05-25 20:25 ` [PATCH v3 15/16] md: Kill merge_bvec_fn()s Kent Overstreet
@ 2012-05-25 20:25 ` Kent Overstreet
15 siblings, 0 replies; 52+ messages in thread
From: Kent Overstreet @ 2012-05-25 20:25 UTC (permalink / raw)
To: linux-kernel, linux-bcache, dm-devel, linux-fsdevel
Cc: Kent Overstreet, tj, axboe, agk, neilb, drbd-dev, bharrosh,
vgoyal, mpatocka, sage, yehuda
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
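dm_merge_bvec(), deleted below, capped each page fragment added to a bio at the target's remaining I/O length (min of max_io_len() and BIO_MAX_SECTORS), clamped the result at zero, and finally always allowed an entire first page into an otherwise empty bio. A standalone userspace model of that clamping is sketched here; the helper name dm_merge_limit and the test values are assumptions made for the example, not part of the dm interface.

#include <assert.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

/*
 * max_sectors: largest I/O the target accepts at this offset (the real code
 *              has already taken the min with BIO_MAX_SECTORS).
 * bio_bytes:   bytes already in the bio (bvm->bi_size).
 * bvec_len:    length of the page fragment being added (biovec->bv_len).
 */
static int dm_merge_limit(unsigned long long max_sectors,
			  unsigned int bio_bytes, unsigned int bvec_len)
{
	long long max_size = ((long long)max_sectors << SECTOR_SHIFT) - bio_bytes;

	if (max_size < 0)
		max_size = 0;

	/* Always allow an entire first page into an otherwise empty bio. */
	if (max_size <= bvec_len && !(bio_bytes >> SECTOR_SHIFT))
		max_size = bvec_len;

	return (int)max_size;
}

int main(void)
{
	/* Target accepts 8 sectors (4KiB); bio is empty; adding a 4KiB page. */
	assert(dm_merge_limit(8, 0, 4096) == 4096);

	/* Bio already holds 4KiB of an 8-sector limit: no room left. */
	assert(dm_merge_limit(8, 4096, 4096) == 0);

	/* An empty bio still gets its first page even past the target limit. */
	assert(dm_merge_limit(1, 0, 4096) == 4096);

	printf("dm_merge_bvec clamping: ok\n");
	return 0;
}

As with the md callbacks, once bios can be split after the fact this per-page negotiation becomes unnecessary, so the hook and its registration are removed.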
drivers/md/dm.c | 58 -------------------------------------------------------
1 file changed, 58 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 193fb19..a4c5256 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1309,63 +1309,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
* CRUD END
*---------------------------------------------------------------*/
-static int dm_merge_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_live_table(md);
- struct dm_target *ti;
- sector_t max_sectors;
- int max_size = 0;
-
- if (unlikely(!map))
- goto out;
-
- ti = dm_table_find_target(map, bvm->bi_sector);
- if (!dm_target_is_valid(ti))
- goto out_table;
-
- /*
- * Find maximum amount of I/O that won't need splitting
- */
- max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) BIO_MAX_SECTORS);
- max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (max_size < 0)
- max_size = 0;
-
- /*
- * merge_bvec_fn() returns number of bytes
- * it can accept at this offset
- * max is precomputed maximal io size
- */
- if (max_size && ti->type->merge)
- max_size = ti->type->merge(ti, bvm, biovec, max_size);
- /*
- * If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking at
- * queue_max_hw_sectors), then we can't allow bios with multiple vector
- * entries. So always set max_size to 0, and the code below allows
- * just one page.
- */
- else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
-
- max_size = 0;
-
-out_table:
- dm_table_put(map);
-
-out:
- /*
- * Always allow an entire first page
- */
- if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
- max_size = biovec->bv_len;
-
- return max_size;
-}
-
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
@@ -1766,7 +1709,6 @@ static void dm_init_md_queue(struct mapped_device *md)
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
- blk_queue_merge_bvec(md->queue, dm_merge_bvec);
}
/*
--
1.7.9.3.327.g2980b