* [PATCH v7 0/9] Block cleanups, deadlock fix
@ 2012-08-28 17:37 Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 1/9] block: Generalized bio pool freeing Kent Overstreet
` (8 more replies)
0 siblings, 9 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel
Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh
Since v6:
* Rebased onto Linus master
* Split off the bio splitting patches
* Various review feedback
Kent Overstreet (9):
block: Generalized bio pool freeing
dm: Use bioset's front_pad for dm_rq_clone_bio_info
block: Add bio_reset()
pktcdvd: Switch to bio_kmalloc()
block: Kill bi_destructor
block: Consolidate bio_alloc_bioset(), bio_kmalloc()
block: Add bio_clone_bioset(), bio_clone_kmalloc()
block: Reorder struct bio_set
block: Avoid deadlocks with bio allocation by stacking drivers
 Documentation/block/biodoc.txt      |   5 -
 block/blk-core.c                    |  10 +-
 drivers/block/drbd/drbd_main.c      |  13 +-
 drivers/block/osdblk.c              |   3 +-
 drivers/block/pktcdvd.c             |  52 ++------
 drivers/md/dm-crypt.c               |   9 --
 drivers/md/dm-io.c                  |  11 --
 drivers/md/dm.c                     |  68 +++-------
 drivers/md/md.c                     |  44 +------
 drivers/target/target_core_iblock.c |   9 --
 fs/bio.c                            | 243 ++++++++++++++++++++----------------
 fs/exofs/ore.c                      |   5 +-
 include/linux/bio.h                 | 111 ++++++++++------
 include/linux/blk_types.h           |  23 +++-
 14 files changed, 260 insertions(+), 346 deletions(-)
--
1.7.12
^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH v7 1/9] block: Generalized bio pool freeing
  2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet
@ 2012-08-28 17:37 ` Kent Overstreet
  2012-08-28 17:37 ` [PATCH v7 2/9] dm: Use bioset's front_pad for dm_rq_clone_bio_info Kent Overstreet
  ` (7 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw)
  To: linux-bcache, linux-kernel, dm-devel
  Cc: Jens Axboe, Kent Overstreet, mpatocka, Alasdair Kergon, bharrosh,
	tj, Lars Ellenberg, vgoyal

With the old code, when you allocate a bio from a bio pool you have to
implement your own destructor that knows how to find the bio pool the
bio was originally allocated from.

This adds a new field to struct bio (bi_pool) and changes
bio_alloc_bioset() to use it. This makes various bio destructors
unnecessary, so they're then deleted.

v6: Explain the temporary if statement in bio_put

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Jens Axboe <axboe@kernel.dk>
CC: NeilBrown <neilb@suse.de>
CC: Alasdair Kergon <agk@redhat.com>
CC: Nicholas Bellinger <nab@linux-iscsi.org>
CC: Lars Ellenberg <lars.ellenberg@linbit.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/block/drbd/drbd_main.c      | 13 +------------
 drivers/md/dm-crypt.c               |  9 ---------
 drivers/md/dm-io.c                  | 11 -----------
 drivers/md/dm.c                     | 20 --------------------
 drivers/md/md.c                     | 28 ++++------------------------
 drivers/target/target_core_iblock.c |  9 ---------
 fs/bio.c                            | 31 +++++++++++++------------------
 include/linux/blk_types.h           |  3 +++
 8 files changed, 21 insertions(+), 103 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f93a032..f55683a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -162,23 +162,12 @@ static const struct block_device_operations drbd_ops = {
 	.release = drbd_release,
 };
 
-static void bio_destructor_drbd(struct bio *bio)
-{
-	bio_free(bio, drbd_md_io_bio_set);
-}
-
 struct bio *bio_alloc_drbd(gfp_t gfp_mask)
 {
-	struct bio *bio;
-
 	if (!drbd_md_io_bio_set)
 		return bio_alloc(gfp_mask, 1);
 
-	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
-	if (!bio)
-		return NULL;
-	bio->bi_destructor = bio_destructor_drbd;
-	return bio;
+	return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
 }
 
 #ifdef __CHECKER__
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 664743d..3c0acba 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -798,14 +798,6 @@ static int crypt_convert(struct crypt_config *cc,
 	return 0;
 }
 
-static void dm_crypt_bio_destructor(struct bio *bio)
-{
-	struct dm_crypt_io *io = bio->bi_private;
-	struct crypt_config *cc = io->cc;
-
-	bio_free(bio, cc->bs);
-}
-
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
@@ -974,7 +966,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_end_io = crypt_endio;
 	clone->bi_bdev = cc->dev->bdev;
 	clone->bi_rw = io->base_bio->bi_rw;
-	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea5dd28..1c46f97 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -249,16 +249,6 @@ static void vm_dp_init(struct dpages *dp, void *data)
 	dp->context_ptr = data;
 }
 
-static void dm_bio_destructor(struct bio *bio)
-{
-	unsigned region;
-	struct io *io;
-
-	retrieve_io_and_region_from_bio(bio, &io, &region);
-
-	bio_free(bio, io->client->bios);
-}
-
 /*
  * Functions for getting the pages from kernel memory.
  */
@@ -317,7 +307,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	bio->bi_sector = where->sector + (where->count - remaining);
 	bio->bi_bdev = where->bdev;
 	bio->bi_end_io = endio;
-	bio->bi_destructor = dm_bio_destructor;
 	store_io_and_region_in_bio(bio, io, region);
 
 	if (rw & REQ_DISCARD) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4e09b6f..0c3d6dd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -681,11 +681,6 @@ static void clone_endio(struct bio *bio, int error)
 		}
 	}
 
-	/*
-	 * Store md for cleanup instead of tio which is about to get freed.
-	 */
-	bio->bi_private = md->bs;
-
 	free_tio(md, tio);
 	bio_put(bio);
 	dec_pending(io, error);
@@ -1032,11 +1027,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* error the io and bail out, or requeue it if needed */
 		md = tio->io->md;
 		dec_pending(tio->io, r);
-		/*
-		 * Store bio_set for cleanup.
-		 */
-		clone->bi_end_io = NULL;
-		clone->bi_private = md->bs;
 		bio_put(clone);
 		free_tio(md, tio);
 	} else if (r) {
@@ -1055,13 +1045,6 @@ struct clone_info {
 	unsigned short idx;
 };
 
-static void dm_bio_destructor(struct bio *bio)
-{
-	struct bio_set *bs = bio->bi_private;
-
-	bio_free(bio, bs);
-}
-
 /*
  * Creates a little bio that just does part of a bvec.
  */
@@ -1073,7 +1056,6 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 	struct bio_vec *bv = bio->bi_io_vec + idx;
 
 	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
-	clone->bi_destructor = dm_bio_destructor;
 	*clone->bi_io_vec = *bv;
 
 	clone->bi_sector = sector;
@@ -1105,7 +1087,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
 	clone->bi_vcnt = idx + bv_count;
@@ -1150,7 +1131,6 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
 	 */
 	clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
 	__bio_clone(clone, ci->bio);
-	clone->bi_destructor = dm_bio_destructor;
 	if (len) {
 		clone->bi_sector = ci->sector;
 		clone->bi_size = to_bytes(len);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3f6203a..b8eebe3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -155,32 +155,17 @@ static int start_readonly;
  * like bio_clone, but with a local bio set
  */
 
-static void mddev_bio_destructor(struct bio *bio)
-{
-	struct mddev *mddev, **mddevp;
-
-	mddevp = (void*)bio;
-	mddev = mddevp[-1];
-
-	bio_free(bio, mddev->bio_set);
-}
-
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 			    struct mddev *mddev)
 {
 	struct bio *b;
-	struct mddev **mddevp;
 
 	if (!mddev || !mddev->bio_set)
 		return bio_alloc(gfp_mask, nr_iovecs);
 
-	b = bio_alloc_bioset(gfp_mask, nr_iovecs,
-			     mddev->bio_set);
+	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
 	if (!b)
 		return NULL;
-	mddevp = (void*)b;
-	mddevp[-1] = mddev;
-	b->bi_destructor = mddev_bio_destructor;
 	return b;
 }
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
@@ -189,18 +174,14 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 			    struct mddev *mddev)
 {
 	struct bio *b;
-	struct mddev **mddevp;
 
 	if (!mddev || !mddev->bio_set)
 		return bio_clone(bio, gfp_mask);
 
-	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
-			     mddev->bio_set);
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, mddev->bio_set);
 	if (!b)
 		return NULL;
-	mddevp = (void*)b;
-	mddevp[-1] = mddev;
-	b->bi_destructor = mddev_bio_destructor;
+
 	__bio_clone(b, bio);
 
 	if (bio_integrity(bio)) {
 		int ret;
@@ -5006,8 +4987,7 @@ int md_run(struct mddev *mddev)
 	}
 
 	if (mddev->bio_set == NULL)
-		mddev->bio_set = bioset_create(BIO_POOL_SIZE,
-					       sizeof(struct mddev *));
+		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 76db75e..e58cd7d 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -543,14 +543,6 @@ static void iblock_complete_cmd(struct se_cmd *cmd)
 	kfree(ibr);
 }
 
-static void iblock_bio_destructor(struct bio *bio)
-{
-	struct se_cmd *cmd = bio->bi_private;
-	struct iblock_dev *ib_dev = cmd->se_dev->dev_ptr;
-
-	bio_free(bio, ib_dev->ibd_bio_set);
-}
-
 static struct bio *
 iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
 {
@@ -572,7 +564,6 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
 
 	bio->bi_bdev = ib_dev->ibd_bd;
 	bio->bi_private = cmd;
-	bio->bi_destructor = iblock_bio_destructor;
 	bio->bi_end_io = &iblock_bio_done;
 	bio->bi_sector = lba;
 	return bio;
diff --git a/fs/bio.c b/fs/bio.c
index 71072ab..e017f7a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,10 +272,6 @@ EXPORT_SYMBOL(bio_init);
  *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
  *   for a &struct bio to become free.
- *
- *   Note that the caller must set ->bi_destructor on successful return
- *   of a bio, to do the appropriate freeing of the bio once the reference
- *   count drops to zero.
 **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
@@ -290,6 +286,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		bio = p + bs->front_pad;
 
 		bio_init(bio);
+		bio->bi_pool = bs;
 
 		if (unlikely(!nr_iovecs))
 			goto out_set;
@@ -316,11 +313,6 @@ err_free:
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
 /**
  *	bio_alloc - allocate a new bio, memory pool backed
  *	@gfp_mask: allocation mask to use
@@ -342,12 +334,7 @@ static void bio_fs_destructor(struct bio *bio)
  */
 struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
-	if (bio)
-		bio->bi_destructor = bio_fs_destructor;
-
-	return bio;
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 }
 EXPORT_SYMBOL(bio_alloc);
 
@@ -423,7 +410,16 @@ void bio_put(struct bio *bio)
 	if (atomic_dec_and_test(&bio->bi_cnt)) {
 		bio_disassociate_task(bio);
 		bio->bi_next = NULL;
-		bio->bi_destructor(bio);
+
+		/*
+		 * This if statement is temporary - bi_pool is replacing
+		 * bi_destructor, but bi_destructor will be taken out in another
+		 * patch.
+		 */
+		if (bio->bi_pool)
+			bio_free(bio, bio->bi_pool);
+		else
+			bio->bi_destructor(bio);
 	}
 }
 EXPORT_SYMBOL(bio_put);
@@ -474,12 +470,11 @@ EXPORT_SYMBOL(__bio_clone);
  */
 struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
-	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+	struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs);
 
 	if (!b)
 		return NULL;
 
-	b->bi_destructor = bio_fs_destructor;
 	__bio_clone(b, bio);
 
 	if (bio_integrity(bio)) {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 7b7ac9c..af9dd9d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -80,6 +80,9 @@ struct bio {
 	struct bio_integrity_payload *bi_integrity;  /* data integrity */
 #endif
 
+	/* If bi_pool is non NULL, bi_destructor is not called */
+	struct bio_set		*bi_pool;
+
 	bio_destructor_t	*bi_destructor;	/* destructor */
 
 	/*
-- 
1.7.12
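[Editor's note: the idea of this patch — each bio records the pool it came from, so one generic free path replaces every per-driver destructor — can be sketched in plain userspace C. The structs below are simplified stand-ins, not the kernel's real types, and the "pool" is just a counter rather than a real mempool.]

```c
#include <assert.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct bio_set; just tracks outstanding bios. */
struct bio_set {
	int outstanding;
};

/* Hypothetical stand-in for struct bio. */
struct bio {
	struct bio_set *bi_pool; /* NULL means plain kmalloc'd, as in the patch */
	/* ... payload fields elided ... */
};

/* Like the patched bio_alloc_bioset(): record the pool in the bio itself. */
static struct bio *bio_alloc_bioset(struct bio_set *bs)
{
	struct bio *bio = malloc(sizeof(*bio));

	if (!bio)
		return NULL;
	bio->bi_pool = bs;
	if (bs)
		bs->outstanding++;
	return bio;
}

/*
 * One generic free path replaces every per-driver destructor: the bio knows
 * where it came from. (The real kernel returns pool bios to a mempool; here
 * we only adjust the counter and free.)
 */
static void bio_put(struct bio *bio)
{
	if (bio->bi_pool)
		bio->bi_pool->outstanding--; /* "return" it to its pool */
	free(bio);
}
```

The point of the design is that callers like drbd or dm no longer need to know anything about freeing: whoever allocated the bio, `bio_put()` can dispose of it correctly.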
* [PATCH v7 2/9] dm: Use bioset's front_pad for dm_rq_clone_bio_info
  2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet
  2012-08-28 17:37 ` [PATCH v7 1/9] block: Generalized bio pool freeing Kent Overstreet
@ 2012-08-28 17:37 ` Kent Overstreet
  2012-08-28 17:37 ` [PATCH v7 3/9] block: Add bio_reset() Kent Overstreet
  ` (6 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw)
  To: linux-bcache, linux-kernel, dm-devel
  Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Alasdair Kergon

Previously, dm_rq_clone_bio_info needed to be freed by the bio's
destructor to avoid a memory leak in the blk_rq_prep_clone() error path.
This gets rid of a memory allocation and means we can kill
dm_rq_bio_destructor.

The _rq_bio_info_cache kmem cache is unused now and needs to be deleted,
but due to the way io_pool is used and overloaded this looks not quite
trivial so I'm leaving it for a later patch.

v6: Fix comment on struct dm_rq_clone_bio_info, per Tejun

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Alasdair Kergon <agk@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 drivers/md/dm.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0c3d6dd..9206781 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,12 +86,17 @@ struct dm_rq_target_io {
 };
 
 /*
- * For request-based dm.
- * One of these is allocated per bio.
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
 */
 struct dm_rq_clone_bio_info {
 	struct bio *orig;
 	struct dm_rq_target_io *tio;
+	struct bio clone;
 };
 
 union map_info *dm_get_mapinfo(struct bio *bio)
@@ -211,6 +216,11 @@ struct dm_md_mempools {
 
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_tio_cache;
 static struct kmem_cache *_rq_tio_cache;
+
+/*
+ * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
+ * still used for _io_cache, I'm leaving this for a later cleanup
+ */
 static struct kmem_cache *_rq_bio_info_cache;
 
 static int __init local_init(void)
@@ -467,16 +477,6 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
 	mempool_free(tio, tio->md->tio_pool);
 }
 
-static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
-{
-	return mempool_alloc(md->io_pool, GFP_ATOMIC);
-}
-
-static void free_bio_info(struct dm_rq_clone_bio_info *info)
-{
-	mempool_free(info, info->tio->md->io_pool);
-}
-
 static int md_in_flight(struct mapped_device *md)
 {
 	return atomic_read(&md->pending[READ]) +
@@ -1460,30 +1460,17 @@ void dm_dispatch_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(dm_dispatch_request);
 
-static void dm_rq_bio_destructor(struct bio *bio)
-{
-	struct dm_rq_clone_bio_info *info = bio->bi_private;
-	struct mapped_device *md = info->tio->md;
-
-	free_bio_info(info);
-	bio_free(bio, md->bs);
-}
-
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 				 void *data)
 {
 	struct dm_rq_target_io *tio = data;
-	struct mapped_device *md = tio->md;
-	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
-
-	if (!info)
-		return -ENOMEM;
+	struct dm_rq_clone_bio_info *info =
+		container_of(bio, struct dm_rq_clone_bio_info, clone);
 
 	info->orig = bio_orig;
 	info->tio = tio;
 	bio->bi_end_io = end_clone_bio;
 	bio->bi_private = info;
-	bio->bi_destructor = dm_rq_bio_destructor;
 
 	return 0;
 }
@@ -2718,7 +2705,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 	if (!pools->tio_pool)
 		goto free_io_pool_and_out;
 
-	pools->bs = bioset_create(pool_size, 0);
+	pools->bs = bioset_create(pool_size,
+				  offsetof(struct dm_rq_clone_bio_info, clone));
 	if (!pools->bs)
 		goto free_tio_pool_and_out;
 
-- 
1.7.12
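[Editor's note: the front_pad trick this patch relies on — allocating the private per-clone struct and the bio in one block, then recovering the outer struct with container_of() — can be illustrated outside the kernel. The struct fields and sizes below are simplified stand-ins for the real ones in drivers/md/dm.c.]

```c
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Userspace stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified stand-ins; real definitions live in drivers/md/dm.c. */
struct bio { char payload[64]; };

struct dm_rq_clone_bio_info {
	void *orig;
	void *tio;
	struct bio clone;	/* must be the last member, as the patch notes */
};

/*
 * Mimics what bioset_create(pool_size, offsetof(..., clone)) arranges:
 * one allocation carries front_pad bytes of private data followed by the
 * bio, and the caller is handed a pointer to the bio part only.
 */
static struct bio *alloc_with_front_pad(size_t front_pad)
{
	char *p = calloc(1, front_pad + sizeof(struct bio));

	return p ? (struct bio *)(p + front_pad) : NULL;
}
```

Given only the bio pointer, `container_of(bio, struct dm_rq_clone_bio_info, clone)` walks back `front_pad` bytes to the enclosing info struct — which is why the separate `alloc_bio_info()`/`free_bio_info()` allocation (and the destructor that freed it) can go away.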
* [PATCH v7 3/9] block: Add bio_reset()
  2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet
  2012-08-28 17:37 ` [PATCH v7 1/9] block: Generalized bio pool freeing Kent Overstreet
  2012-08-28 17:37 ` [PATCH v7 2/9] dm: Use bioset's front_pad for dm_rq_clone_bio_info Kent Overstreet
@ 2012-08-28 17:37 ` Kent Overstreet
  [not found]   ` <1346175456-1572-4-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2012-08-28 17:37 ` [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() Kent Overstreet
  ` (5 subsequent siblings)
  8 siblings, 1 reply; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw)
  To: linux-bcache, linux-kernel, dm-devel
  Cc: Jens Axboe, Kent Overstreet, mpatocka, bharrosh, tj, vgoyal

Reusing bios is something that's been highly frowned upon in the past,
but driver code keeps doing it anyways. If it's going to happen anyways,
we should provide a generic method.

This'll help with getting rid of bi_destructor - drivers/block/pktcdvd.c
was open coding it, by doing a bio_init() and resetting bi_destructor.

v5: Add a define BIO_RESET_BITS, to be very explicit about what parts of
bio->bi_flags are saved.

v6: Further commenting verbosity, per Tejun

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Jens Axboe <axboe@kernel.dk>
Acked-by: Tejun Heo <tj@kernel.org>
---
 fs/bio.c                  | 14 ++++++++++++++
 include/linux/bio.h       |  1 +
 include/linux/blk_types.h | 21 +++++++++++++++++----
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index e017f7a..52da084 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -262,6 +262,20 @@ void bio_init(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_init);
 
+void bio_reset(struct bio *bio)
+{
+	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+	if (bio_integrity(bio))
+		bio_integrity_free(bio, bio->bi_pool);
+
+	bio_disassociate_task(bio);
+
+	memset(bio, 0, BIO_RESET_BYTES);
+	bio->bi_flags = flags|(1 << BIO_UPTODATE);
+}
+EXPORT_SYMBOL(bio_reset);
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2643589..ba60319 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -226,6 +226,7 @@ extern void __bio_clone(struct bio *, struct bio *);
 extern struct bio *bio_clone(struct bio *, gfp_t);
 
 extern void bio_init(struct bio *);
+extern void bio_reset(struct bio *);
 
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index af9dd9d..691edd1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -59,15 +59,19 @@ struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;
 
+	bio_end_io_t		*bi_end_io;
+
+	void			*bi_private;
+
+	/*
+	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
+	 */
+
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
 	atomic_t		bi_cnt;		/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
-
-	bio_end_io_t		*bi_end_io;
-
-	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
 	 * Optional ioc and css associated with this bio.  Put on bio
@@ -93,6 +97,8 @@ struct bio {
 	struct bio_vec		bi_inline_vecs[0];
 };
 
+#define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
+
 /*
  * bio flags
  */
@@ -108,6 +114,13 @@ struct bio {
 #define BIO_FS_INTEGRITY 9	/* fs owns integrity data, not block layer */
 #define BIO_QUIET	10	/* Make BIO Quiet */
 #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * BIO_POOL_IDX()
+ */
+#define BIO_RESET_BITS	12
+
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
-- 
1.7.12
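[Editor's note: the save/wipe/restore dance in bio_reset() — keep the high flag bits, zero everything up to bi_max_vecs, restore the flags plus BIO_UPTODATE — can be demonstrated with a toy struct. The layout below only mimics the reordered struct bio; the field names and sizes are illustrative, not the kernel's.]

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/*
 * Toy layout mirroring the reordered struct bio: everything before
 * bi_max_vecs is wiped by reset, everything from bi_max_vecs on survives.
 */
struct toy_bio {
	unsigned long	bi_flags;
	void		*bi_next;
	unsigned int	bi_size;
	/* --- fields from here on are preserved by reset --- */
	unsigned int	bi_max_vecs;
	void		*bi_io_vec;
};

#define BIO_UPTODATE	0
#define BIO_RESET_BITS	12	/* flag bits below this get cleared */
#define BIO_RESET_BYTES	offsetof(struct toy_bio, bi_max_vecs)

static void toy_bio_reset(struct toy_bio *bio)
{
	/*
	 * Keep only the high flag bits (which include the bvec pool index
	 * in the real kernel), exactly as bio_reset() does with
	 * (~0UL << BIO_RESET_BITS).
	 */
	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

	memset(bio, 0, BIO_RESET_BYTES);	/* wipe only the front */
	bio->bi_flags = flags | (1UL << BIO_UPTODATE);
}
```

Because `BIO_RESET_BYTES` is an `offsetof()`, the split point is self-maintaining: moving a field above or below `bi_max_vecs` changes whether it survives a reset, which is exactly why the patch reorders `bi_end_io` and `bi_private` in the struct.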
* Re: [PATCH v7 3/9] block: Add bio_reset()
  [not found] ` <1346175456-1572-4-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2012-08-28 20:31   ` Tejun Heo
  2012-08-28 22:17     ` Kent Overstreet
  0 siblings, 1 reply; 75+ messages in thread
From: Tejun Heo @ 2012-08-28 20:31 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA,
	mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ,
	Jens Axboe

Hello, Kent.

On Tue, Aug 28, 2012 at 10:37:30AM -0700, Kent Overstreet wrote:
> Reusing bios is something that's been highly frowned upon in the past,
> but driver code keeps doing it anyways. If it's going to happen anyways,
> we should provide a generic method.
> 
> This'll help with getting rid of bi_destructor - drivers/block/pktcdvd.c
> was open coding it, by doing a bio_init() and resetting bi_destructor.

Better to explain why some bio fields are re-ordered and why that
shouldn't make things worse cacheline-wise?

> +void bio_reset(struct bio *bio)
> +{

Function comment explaining what it does and why it does what it does
with integrity / bi_css / whatnot?

> +	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
> +
> +	if (bio_integrity(bio))
> +		bio_integrity_free(bio, bio->bi_pool);
> +
> +	bio_disassociate_task(bio);

Is this desirable?  Why?

Thanks.

-- 
tejun
* Re: [PATCH v7 3/9] block: Add bio_reset()
  2012-08-28 20:31 ` Tejun Heo
@ 2012-08-28 22:17   ` Kent Overstreet
  [not found]     ` <20120828221715.GD1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
  0 siblings, 1 reply; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 22:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh,
	Jens Axboe

On Tue, Aug 28, 2012 at 01:31:48PM -0700, Tejun Heo wrote:
> Hello, Kent.
> 
> On Tue, Aug 28, 2012 at 10:37:30AM -0700, Kent Overstreet wrote:
> > Reusing bios is something that's been highly frowned upon in the past,
> > but driver code keeps doing it anyways. If it's going to happen anyways,
> > we should provide a generic method.
> > 
> > This'll help with getting rid of bi_destructor - drivers/block/pktcdvd.c
> > was open coding it, by doing a bio_init() and resetting bi_destructor.
> 
> Better to explain why some bio fields are re-ordered and why that
> shouldn't make things worse cacheline-wise?

Well it may (struct bio is what, 3 or 4 cachelines now?) but even on
ridiculous million iop devices struct bio just isn't hot enough for this
kind of stuff to show up in the profiles... and I've done enough
profiling to be pretty confident of that.

So um, if there was anything to say performance wise I would, but to me
it seems more that there isn't.

(pruning struct bio would have more of an effect performance wise, which
you know is something I'd like to do.)

> > +void bio_reset(struct bio *bio)
> > +{
> 
> Function comment explaining what it does and why it does what it does
> with integrity / bi_css / whatnot?

Yeah, good idea.

> > +	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
> > +
> > +	if (bio_integrity(bio))
> > +		bio_integrity_free(bio, bio->bi_pool);
> > +
> > +	bio_disassociate_task(bio);
> 
> Is this desirable?  Why?

*twitch* I should've thought more when I made that change.

It occurs to me now that we specifically talked about that and decided
to do it the other way - when I changed that I was just looking at
bio_free() and decided they needed to be symmetrical.

I still think they should be symmetrical, but if that's true bi_ioc and
bi_css need to be moved, and also bio_disassociate_task() should be
getting called from bio_free(), not bio_put().

Were you the one that added that call? I know you've been working on
that area of the code recently. Sticking it in bio_put() instead of
bio_free() seems odd to me, and they're completely equivalent now that
bio_free() is only called from bio_put() (save one instance I should
probably fix).
* Re: [PATCH v7 3/9] block: Add bio_reset()
  [not found] ` <20120828221715.GD1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
@ 2012-08-28 22:53   ` Kent Overstreet
  2012-09-01  2:23   ` Tejun Heo
  1 sibling, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-08-28 22:53 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA,
	mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ,
	Jens Axboe

On Tue, Aug 28, 2012 at 03:17:15PM -0700, Kent Overstreet wrote:
> On Tue, Aug 28, 2012 at 01:31:48PM -0700, Tejun Heo wrote:
> > > +	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
> > > +
> > > +	if (bio_integrity(bio))
> > > +		bio_integrity_free(bio, bio->bi_pool);
> > > +
> > > +	bio_disassociate_task(bio);
> > 
> > Is this desirable?  Why?
> 
> *twitch* I should've thought more when I made that change.
> 
> It occurs to me now that we specifically talked about that and decided
> to do it the other way - when I changed that I was just looking at
> bio_free() and decided they needed to be symmetrical.
> 
> I still think they should be symmetrical, but if that's true bi_ioc and
> bi_css need to be moved, and also bio_disassociate_task() should be
> getting called from bio_free(), not bio_put().

Though - looking at it again I didn't actually screw anything up when I
made that change, it's just bad style.

Just screwing around a bit with the patch below, but - couple thoughts:

1) I hate duplicated code, and for the stuff in
bio_init()/bio_free()/bio_reset() it's perhaps not worth it, it is the
kind of stuff that gets out of sync.

2) It'd be better to have bio_reset() reset as much as possible, i.e. be
as close to bio_init() as possible. Fewer differences to confuse people.

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 7c8fe1d..a38bc7d 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -148,6 +148,9 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 
 	BUG_ON(bip == NULL);
 
+	if (!bs)
+		bs = fs_bio_set;
+
 	/* A cloned bio doesn't own the integrity metadata */
 	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) &&
 	    bip->bip_buf != NULL)
diff --git a/fs/bio.c b/fs/bio.c
index c938f42..56d8fa2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -234,39 +234,47 @@ fallback:
 	return bvl;
 }
 
+static void bio_free_stuff(struct bio *bio)
+{
+	bio_disassociate_task(bio);
+
+	if (bio_integrity(bio))
+		bio_integrity_free(bio, bio->bi_pool);
+}
+
 void bio_free(struct bio *bio)
 {
 	struct bio_set *bs = bio->bi_pool;
 	void *p;
 
+	bio_free_stuff(bio);
+
 	if (!bs) {
-		/* Bio was allocated by bio_kmalloc() */
-		if (bio_integrity(bio))
-			bio_integrity_free(bio, fs_bio_set);
 		kfree(bio);
-		return;
-	}
-
-	if (bio_has_allocated_vec(bio))
-		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+	} else {
+		if (bio_has_allocated_vec(bio))
+			bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, bs);
+		/*
+		 * If we have front padding, adjust the bio pointer before freeing
+		 */
+		p = bio;
+		if (bs->front_pad)
+			p -= bs->front_pad;
 
-	/*
-	 * If we have front padding, adjust the bio pointer before freeing
-	 */
-	p = bio;
-	if (bs->front_pad)
-		p -= bs->front_pad;
+		mempool_free(p, bs->bio_pool);
+	}
+}
 
-	mempool_free(p, bs->bio_pool);
+static void __bio_init(struct bio *bio)
+{
+	memset(bio, 0, BIO_RESET_BYTES);
+	bio->bi_flags = 1 << BIO_UPTODATE;
 }
 
 void bio_init(struct bio *bio)
 {
-	memset(bio, 0, sizeof(*bio));
-	bio->bi_flags = 1 << BIO_UPTODATE;
+	__bio_init(bio);
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -275,13 +283,10 @@ void bio_reset(struct bio *bio)
 {
 	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
 
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, bio->bi_pool);
+	bio_free_stuff(bio);
+	__bio_init(bio);
 
-	bio_disassociate_task(bio);
-
-	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
+	bio->bi_flags |= flags;
 }
 EXPORT_SYMBOL(bio_reset);
 
@@ -440,10 +445,8 @@ void bio_put(struct bio *bio)
 	/*
 	 * last put frees it
 	 */
-	if (atomic_dec_and_test(&bio->bi_cnt)) {
-		bio_disassociate_task(bio);
+	if (atomic_dec_and_test(&bio->bi_cnt))
 		bio_free(bio);
-	}
 }
 EXPORT_SYMBOL(bio_put);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca63847..28bddc0 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -68,15 +68,6 @@ struct bio {
 
 	struct bvec_iter	bi_iter;
 
-	/*
-	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
-	 */
-
-	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
-
-	atomic_t		bi_cnt;		/* pin count */
-
-	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 #ifdef CONFIG_BLK_CGROUP
 	/*
 	 * Optional ioc and css associated with this bio.  Put on bio
@@ -89,6 +80,16 @@ struct bio {
 	struct bio_integrity_payload *bi_integrity;  /* data integrity */
 #endif
 
+	/*
+	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
+	 */
+
+	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
+
+	atomic_t		bi_cnt;		/* pin count */
+
+	struct bio_vec		*bi_io_vec;	/* the actual vec list */
+
 	struct bio_set		*bi_pool;
 
 	/*
* Re: [PATCH v7 3/9] block: Add bio_reset() [not found] ` <20120828221715.GD1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> 2012-08-28 22:53 ` Kent Overstreet @ 2012-09-01 2:23 ` Tejun Heo 2012-09-05 20:13 ` Kent Overstreet 1 sibling, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-09-01 2:23 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, On Tue, Aug 28, 2012 at 03:17:15PM -0700, Kent Overstreet wrote: > > Better to explain why some bio fields are re-ordered and why that > > shouldn't make things worse cacheline-wise? > > Well it may (struct bio is what, 3 or 4 cachelines now?) but even on > ridiculous million iop devices struct bio just isn't hot enough for this > kind of stuff to show up in the profiles... and I've done enough > profiling to be pretty confident of that. > > So um, if there was anything to say performance wise I would, but to me > it seems more that there isn't. > > (pruning struct bio would have more of an effect performance wise, which > you know is something I'd like to do.) Yeah, please put something like the above in the patch description. > > > + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); > > > + > > > + if (bio_integrity(bio)) > > > + bio_integrity_free(bio, bio->bi_pool); > > > + > > > + bio_disassociate_task(bio); > > > > Is this desirable? Why? > > *twitch* I should've thought more when I made that change. > > It occurs to me now that we specifically talked about that and decided > to do it the other way - when I changed that I was just looking at > bio_free() and decided they needed to be symmetrical. > > I still think they should be symmetrical, but if that's true bi_ioc and > bi_css need to be moved, and also bio_disassociate_task() should be > getting called from bio_free(), not bio_put(). 
> Were you the one that added that call? I know you've been working on
> that area of the code recently. Sticking it in bio_put() instead of
> bio_free() seems odd to be, and they're completely equivalent now that
> bio_free() is only called from bio_put() (save one instance I should
> probably fix).

Maybe I botched symmetry, but anyway I *suspect* it would probably be
better to keep css association across bio_reset(), given the current
usages of both mechanisms. css association indicates the ownership of
the bio, which isn't likely to change while recycling the bio.

Thanks.

--
tejun

^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 3/9] block: Add bio_reset() 2012-09-01 2:23 ` Tejun Heo @ 2012-09-05 20:13 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-09-05 20:13 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Jens Axboe On Fri, Aug 31, 2012 at 07:23:05PM -0700, Tejun Heo wrote: > On Tue, Aug 28, 2012 at 03:17:15PM -0700, Kent Overstreet wrote: > > I still think they should be symmetrical, but if that's true bi_ioc and > > bi_css need to be moved, and also bio_disassociate_task() should be > > getting called from bio_free(), not bio_put(). > > > > Were you the one that added that call? I know you've been working on > > that area of the code recently. Sticking it in bio_put() instead of > > bio_free() seems odd to be, and they're completely equivalent now that > > bio_free() is only called from bio_put() (save one instance I should > > probably fix). > > Maybe I botched symmetry but anyways I *suspect* it probably would be > better to keep css association across bio_reset() give the current > usages of both mechanisms. css association indicates the ownership of > the bio which isn't likely to change while recycling the bio. Thought about it more and while you're right that css association isn't likely to change, it'd just be a needless difference. bio_reset() should be as close to a bio_free()/bio_alloc() as possible, IMO. Fixed my patches to do it right, though. ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() 2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet ` (2 preceding siblings ...) 2012-08-28 17:37 ` [PATCH v7 3/9] block: Add bio_reset() Kent Overstreet @ 2012-08-28 17:37 ` Kent Overstreet [not found] ` <1346175456-1572-5-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-08-28 17:37 ` [PATCH v7 5/9] block: Kill bi_destructor Kent Overstreet ` (4 subsequent siblings) 8 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache, linux-kernel, dm-devel Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Peter Osterlund This is prep work for killing bi_destructor - previously, pktcdvd had its own pkt_bio_alloc which was basically duplication bio_kmalloc(), necessitating its own bi_destructor implementation. v5: Un-reorder some functions, to make the patch easier to review Signed-off-by: Kent Overstreet <koverstreet@google.com> CC: Peter Osterlund <petero2@telia.com> --- drivers/block/pktcdvd.c | 52 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ba66e44..2e7de7a 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -522,38 +522,6 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) } } -static void pkt_bio_destructor(struct bio *bio) -{ - kfree(bio->bi_io_vec); - kfree(bio); -} - -static struct bio *pkt_bio_alloc(int nr_iovecs) -{ - struct bio_vec *bvl = NULL; - struct bio *bio; - - bio = kmalloc(sizeof(struct bio), GFP_KERNEL); - if (!bio) - goto no_bio; - bio_init(bio); - - bvl = kcalloc(nr_iovecs, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvl) - goto no_bvl; - - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; - bio->bi_destructor = pkt_bio_destructor; - - return bio; - - no_bvl: - kfree(bio); - no_bio: - return NULL; -} - /* * Allocate a packet_data struct 
*/ @@ -567,7 +535,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames) goto no_pkt; pkt->frames = frames; - pkt->w_bio = pkt_bio_alloc(frames); + pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames); if (!pkt->w_bio) goto no_bio; @@ -581,9 +549,10 @@ static struct packet_data *pkt_alloc_packet_data(int frames) bio_list_init(&pkt->orig_bios); for (i = 0; i < frames; i++) { - struct bio *bio = pkt_bio_alloc(1); + struct bio *bio = bio_kmalloc(GFP_KERNEL, 1); if (!bio) goto no_rd_bio; + pkt->r_bios[i] = bio; } @@ -1111,21 +1080,17 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) * Schedule reads for missing parts of the packet. */ for (f = 0; f < pkt->frames; f++) { - struct bio_vec *vec; - int p, offset; + if (written[f]) continue; + bio = pkt->r_bios[f]; - vec = bio->bi_io_vec; - bio_init(bio); - bio->bi_max_vecs = 1; + bio_reset(bio); bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); bio->bi_bdev = pd->bdev; bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; - bio->bi_io_vec = vec; - bio->bi_destructor = pkt_bio_destructor; p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; @@ -1418,14 +1383,11 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) } /* Start the write request */ - bio_init(pkt->w_bio); - pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE; + bio_reset(pkt->w_bio); pkt->w_bio->bi_sector = pkt->sector; pkt->w_bio->bi_bdev = pd->bdev; pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; - pkt->w_bio->bi_io_vec = bvec; - pkt->w_bio->bi_destructor = pkt_bio_destructor; for (f = 0; f < pkt->frames; f++) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
* Re: [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() [not found] ` <1346175456-1572-5-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 20:32 ` Tejun Heo 2012-08-28 22:24 ` Kent Overstreet [not found] ` <20120828203247.GC24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> 0 siblings, 2 replies; 75+ messages in thread From: Tejun Heo @ 2012-08-28 20:32 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Peter Osterlund On Tue, Aug 28, 2012 at 10:37:31AM -0700, Kent Overstreet wrote: > This is prep work for killing bi_destructor - previously, pktcdvd had > its own pkt_bio_alloc which was basically duplication bio_kmalloc(), > necessitating its own bi_destructor implementation. How was this tested? -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() 2012-08-28 20:32 ` Tejun Heo @ 2012-08-28 22:24 ` Kent Overstreet [not found] ` <20120828203247.GC24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> 1 sibling, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:24 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Peter Osterlund On Tue, Aug 28, 2012 at 01:32:47PM -0700, Tejun Heo wrote: > On Tue, Aug 28, 2012 at 10:37:31AM -0700, Kent Overstreet wrote: > > This is prep work for killing bi_destructor - previously, pktcdvd had > > its own pkt_bio_alloc which was basically duplication bio_kmalloc(), > > necessitating its own bi_destructor implementation. > > How was this tested? Hasn't been yet, but I'm getting a new test machine in the next day or so (and also the patch is a lot smaller since the bio_reset() changes you suggested). ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() [not found] ` <20120828203247.GC24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> @ 2012-09-04 9:05 ` Jiri Kosina [not found] ` <alpine.LRH.2.00.1209041104020.26301-1ReQVI26iDCaZKY3DrU6dA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Jiri Kosina @ 2012-09-04 9:05 UTC (permalink / raw) To: Tejun Heo, Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Peter Osterlund On Tue, 28 Aug 2012, Tejun Heo wrote: > > This is prep work for killing bi_destructor - previously, pktcdvd had > > its own pkt_bio_alloc which was basically duplication bio_kmalloc(), > > necessitating its own bi_destructor implementation. > > How was this tested? I have now tested it and it works properly. You can add Signed-off-by: Jiri Kosina <jkosina-AlSwsSmVLrQ@public.gmane.org> once you will be pushing the patchset through (I will be pushing MAINTAINERS update through my tree to Jens separately). -- Jiri Kosina SUSE Labs ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() [not found] ` <alpine.LRH.2.00.1209041104020.26301-1ReQVI26iDCaZKY3DrU6dA@public.gmane.org> @ 2012-09-05 19:44 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-09-05 19:44 UTC (permalink / raw) To: Jiri Kosina Cc: Tejun Heo, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Peter Osterlund On Tue, Sep 04, 2012 at 11:05:06AM +0200, Jiri Kosina wrote: > On Tue, 28 Aug 2012, Tejun Heo wrote: > > > > This is prep work for killing bi_destructor - previously, pktcdvd had > > > its own pkt_bio_alloc which was basically duplication bio_kmalloc(), > > > necessitating its own bi_destructor implementation. > > > > How was this tested? > > I have now tested it and it works properly. You can add > > Signed-off-by: Jiri Kosina <jkosina-AlSwsSmVLrQ@public.gmane.org> > > once you will be pushing the patchset through (I will be pushing > MAINTAINERS update through my tree to Jens separately). Thanks! I'll make sure and add you to the cc next time - I'll have another patch or two for pktcdvd coming soon as I clean up the next patch series. ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH v7 5/9] block: Kill bi_destructor 2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet ` (3 preceding siblings ...) 2012-08-28 17:37 ` [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() Kent Overstreet @ 2012-08-28 17:37 ` Kent Overstreet [not found] ` <1346175456-1572-6-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> [not found] ` <1346175456-1572-1-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> ` (3 subsequent siblings) 8 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache, linux-kernel, dm-devel Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Jens Axboe Now that we've got generic code for freeing bios allocated from bio pools, this isn't needed anymore. This also changes the semantics of bio_free() a bit - it now also frees bios allocated by bio_kmalloc(). It's also no longer exported, as without bi_destructor there should be no need for it to be called anywhere else. v5: Switch to BIO_KMALLOC_POOL ((void *)~0), per Boaz v6: BIO_KMALLOC_POOL now NULL, drop bio_free's EXPORT_SYMBOL v7: No #define BIO_KMALLOC_POOL anymore Signed-off-by: Kent Overstreet <koverstreet@google.com> CC: Jens Axboe <axboe@kernel.dk> --- Documentation/block/biodoc.txt | 5 ----- block/blk-core.c | 2 +- fs/bio.c | 33 ++++++++++++--------------------- include/linux/bio.h | 2 +- include/linux/blk_types.h | 3 --- 5 files changed, 14 insertions(+), 31 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index e418dc0..8df5e8e 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -465,7 +465,6 @@ struct bio { bio_end_io_t *bi_end_io; /* bi_end_io (bio) */ atomic_t bi_cnt; /* pin count: free when it hits zero */ void *bi_private; - bio_destructor_t *bi_destructor; /* bi_destructor (bio) */ }; With this multipage bio design: @@ -647,10 +646,6 @@ for a non-clone bio. 
There are the 6 pools setup for different size biovecs, so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the given size from these slabs. -The bi_destructor() routine takes into account the possibility of the bio -having originated from a different source (see later discussions on -n/w to block transfers and kvec_cb) - The bio_get() routine may be used to hold an extra reference on a bio prior to i/o submission, if the bio fields are likely to be accessed after the i/o is issued (since the bio may otherwise get freed in case i/o completion diff --git a/block/blk-core.c b/block/blk-core.c index 4b4dbdf..f3780f5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2807,7 +2807,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, free_and_out: if (bio) - bio_free(bio, bs); + bio_free(bio); blk_rq_unprep_clone(rq); return -ENOMEM; diff --git a/fs/bio.c b/fs/bio.c index 52da084..ce829c8 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -233,10 +233,19 @@ fallback: return bvl; } -void bio_free(struct bio *bio, struct bio_set *bs) +void bio_free(struct bio *bio) { + struct bio_set *bs = bio->bi_pool; void *p; + if (!bs) { + /* Bio was allocated by bio_kmalloc() */ + if (bio_integrity(bio)) + bio_integrity_free(bio, fs_bio_set); + kfree(bio); + return; + } + if (bio_has_allocated_vec(bio)) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); @@ -252,7 +261,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -352,13 +360,6 @@ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } EXPORT_SYMBOL(bio_alloc); -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_integrity(bio)) - bio_integrity_free(bio, fs_bio_set); - kfree(bio); -} - /** * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator @@ -385,7 +386,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) 
bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_destructor = bio_kmalloc_destructor; + bio->bi_pool = NULL; return bio; } @@ -423,17 +424,7 @@ void bio_put(struct bio *bio) */ if (atomic_dec_and_test(&bio->bi_cnt)) { bio_disassociate_task(bio); - bio->bi_next = NULL; - - /* - * This if statement is temporary - bi_pool is replacing - * bi_destructor, but bi_destructor will be taken out in another - * patch. - */ - if (bio->bi_pool) - bio_free(bio, bio->bi_pool); - else - bio->bi_destructor(bio); + bio_free(bio); } } EXPORT_SYMBOL(bio_put); diff --git a/include/linux/bio.h b/include/linux/bio.h index ba60319..393c87e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -216,7 +216,7 @@ extern struct bio *bio_alloc(gfp_t, unsigned int); extern struct bio *bio_kmalloc(gfp_t, unsigned int); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); -extern void bio_free(struct bio *, struct bio_set *); +extern void bio_free(struct bio *); extern void bio_endio(struct bio *, int); struct request_queue; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 691edd1..7a9c047 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -84,11 +84,8 @@ struct bio { struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif - /* If bi_pool is non NULL, bi_destructor is not called */ struct bio_set *bi_pool; - bio_destructor_t *bi_destructor; /* destructor */ - /* * We can inline a number of vecs at the end of the bio, to avoid * double allocations for a small number of bio_vecs. This member -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
* Re: [PATCH v7 5/9] block: Kill bi_destructor [not found] ` <1346175456-1572-6-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 20:36 ` Tejun Heo 2012-08-28 22:07 ` Kent Overstreet 0 siblings, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-08-28 20:36 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 10:37:32AM -0700, Kent Overstreet wrote: > @@ -385,7 +386,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) > bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; > bio->bi_max_vecs = nr_iovecs; > bio->bi_io_vec = bio->bi_inline_vecs; > - bio->bi_destructor = bio_kmalloc_destructor; > + bio->bi_pool = NULL; Doesn't bio_init() already memset the whole thing? Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 5/9] block: Kill bi_destructor 2012-08-28 20:36 ` Tejun Heo @ 2012-08-28 22:07 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:07 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Jens Axboe On Tue, Aug 28, 2012 at 01:36:13PM -0700, Tejun Heo wrote: > On Tue, Aug 28, 2012 at 10:37:32AM -0700, Kent Overstreet wrote: > > @@ -385,7 +386,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) > > bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; > > bio->bi_max_vecs = nr_iovecs; > > bio->bi_io_vec = bio->bi_inline_vecs; > > - bio->bi_destructor = bio_kmalloc_destructor; > > + bio->bi_pool = NULL; > > Doesn't bio_init() already memset the whole thing? Yes it does, holdover from BIO_KMALLOC_POOL. Dropping it. ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() [not found] ` <1346175456-1572-1-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 17:37 ` Kent Overstreet [not found] ` <1346175456-1572-7-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA Cc: Kent Overstreet, tj-DgEjT+Ai2ygdnm+yROfE0A, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Previously, bio_kmalloc() and bio_alloc_bioset() behaved slightly different because there was some almost-duplicated code - this fixes some of that. The important change is that previously bio_kmalloc() always set bi_io_vec = bi_inline_vecs, even if nr_iovecs == 0 - unlike bio_alloc_bioset(). This would cause bio_has_data() to return true; I don't know if this resulted in any actual bugs but it was certainly wrong. bio_kmalloc() and bio_alloc_bioset() also have different arbitrary limits on nr_iovecs - 1024 (UIO_MAXIOV) for bio_kmalloc(), 256 (BIO_MAX_PAGES) for bio_alloc_bioset(). This patch doesn't fix that, but at least they're enforced closer together and hopefully they will be fixed in a later patch. This'll also help with some future cleanups - there are a fair number of functions that allocate bios (e.g. bio_clone()), and now they don't have to be duplicated for bio_alloc(), bio_alloc_bioset(), and bio_kmalloc(). 
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> CC: Jens Axboe <axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org> v7: Re-add dropped comments, improv patch description --- fs/bio.c | 111 ++++++++++++++++++---------------------------------- include/linux/bio.h | 16 ++++++-- 2 files changed, 49 insertions(+), 78 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index ce829c8..5e947d3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { * IO code that does not need private memory pools. */ struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management @@ -291,39 +292,58 @@ EXPORT_SYMBOL(bio_reset); * @bs: the bio_set to allocate from. * * Description: - * bio_alloc_bioset will try its own mempool to satisfy the allocation. - * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. - **/ + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + * able to allocate a bio. This is due to the mempool guarantees. To make this + * work, callers must never allocate more than 1 bio at a time from this pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. 
+ */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned front_pad; + unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; struct bio *bio; void *p; - p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + p = mempool_alloc(bs->bio_pool, gfp_mask); + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + if (unlikely(!p)) return NULL; - bio = p + bs->front_pad; + bio = p + front_pad; bio_init(bio); - bio->bi_pool = bs; - - if (unlikely(!nr_iovecs)) - goto out_set; - if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; - } else { + if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) goto err_free; - - nr_iovecs = bvec_nr_vecs(idx); + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; } -out_set: + + bio->bi_pool = bs; bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; @@ -335,63 +355,6 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -/** - * bio_alloc - allocate a new bio, memory pool backed - * @gfp_mask: allocation mask to use - * @nr_iovecs: number of iovecs - * - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. 
Failure to - * do so can cause livelocks under memory pressure. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); -} -EXPORT_SYMBOL(bio_alloc); - -/** - * bio_kmalloc - allocate a bio for I/O using kmalloc() - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * - * Description: - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains - * %__GFP_WAIT, the allocation is guaranteed to succeed. - * - **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - struct bio *bio; - - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - if (unlikely(!bio)) - return NULL; - - bio_init(bio); - bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_pool = NULL; - - return bio; -} -EXPORT_SYMBOL(bio_kmalloc); - void zero_fill_bio(struct bio *bio) { unsigned long flags; diff --git a/include/linux/bio.h b/include/linux/bio.h index 393c87e..8bfd572 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -212,12 +212,22 @@ extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern struct bio *bio_alloc(gfp_t, unsigned int); -extern struct bio *bio_kmalloc(gfp_t, unsigned int); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); extern void bio_free(struct bio *); +extern struct bio_set *fs_bio_set; + +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); +} + +static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return 
bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); +} + extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -305,8 +315,6 @@ struct biovec_slab { struct kmem_cache *slab; }; -extern struct bio_set *fs_bio_set; - /* * a small number of entries is fine, not going to be performance critical. * basically we just need to survive -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
* Re: [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() [not found] ` <1346175456-1572-7-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 20:41 ` Tejun Heo [not found] ` <20120828204148.GE24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-08-28 20:41 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, Kent. On Tue, Aug 28, 2012 at 10:37:33AM -0700, Kent Overstreet wrote: > v7: Re-add dropped comments, improv patch description I don't think you did the former part. Other than that looks good to me. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() [not found] ` <20120828204148.GE24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> @ 2012-08-28 22:03 ` Kent Overstreet 2012-09-01 2:17 ` Tejun Heo 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:03 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 01:41:48PM -0700, Tejun Heo wrote: > Hello, Kent. > > On Tue, Aug 28, 2012 at 10:37:33AM -0700, Kent Overstreet wrote: > > v7: Re-add dropped comments, improv patch description > > I don't think you did the former part. Other than that looks good to > me. I folded them into the bio_alloc_bioset() comments - so all the information that was there previously should still be there, just centralized. That good enough for your acked-by? ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() 2012-08-28 22:03 ` Kent Overstreet @ 2012-09-01 2:17 ` Tejun Heo 0 siblings, 0 replies; 75+ messages in thread From: Tejun Heo @ 2012-09-01 2:17 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Jens Axboe On Tue, Aug 28, 2012 at 03:03:32PM -0700, Kent Overstreet wrote: > On Tue, Aug 28, 2012 at 01:41:48PM -0700, Tejun Heo wrote: > > Hello, Kent. > > > > On Tue, Aug 28, 2012 at 10:37:33AM -0700, Kent Overstreet wrote: > > > v7: Re-add dropped comments, improv patch description > > > > I don't think you did the former part. Other than that looks good to > > me. > > I folded them into the bio_alloc_bioset() comments - so all the > information that was there previously should still be there, just > centralized. That good enough for your acked-by? I think it would still be better to at least refer to bio_alloc_bioset() from bio_alloc*()'s comments. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() 2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet ` (5 preceding siblings ...) [not found] ` <1346175456-1572-1-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 17:37 ` Kent Overstreet [not found] ` <1346175456-1572-8-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-08-28 17:37 ` [PATCH v7 8/9] block: Reorder struct bio_set Kent Overstreet 2012-08-28 17:37 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Kent Overstreet 8 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache, linux-kernel, dm-devel Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Jens Axboe, NeilBrown, Alasdair Kergon, Jeff Garzik Previously, there was bio_clone() but it only allocated from the fs bio set; as a result various users were open coding it and using __bio_clone(). This changes bio_clone() to become bio_clone_bioset(), and then we add bio_clone() and bio_clone_kmalloc() as wrappers around it, making use of the functionality the last patch adedd. This will also help in a later patch changing how bio cloning works. 
Signed-off-by: Kent Overstreet <koverstreet@google.com> CC: Jens Axboe <axboe@kernel.dk> CC: NeilBrown <neilb@suse.de> CC: Alasdair Kergon <agk@redhat.com> CC: Boaz Harrosh <bharrosh@panasas.com> CC: Jeff Garzik <jeff@garzik.org> Acked-by: Jeff Garzik <jgarzik@redhat.com> --- block/blk-core.c | 8 +------- drivers/block/osdblk.c | 3 +-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 20 +------------------- fs/bio.c | 13 ++++++++----- fs/exofs/ore.c | 5 ++--- include/linux/bio.h | 17 ++++++++++++++--- 7 files changed, 29 insertions(+), 41 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f3780f5..66f0bb4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2781,16 +2781,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, blk_rq_init(NULL, rq); __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); + bio = bio_clone_bioset(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; - __bio_clone(bio, bio_src); - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask, bs)) - goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 87311eb..1bbc681 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -266,11 +266,10 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) struct bio *tmp, *new_chain = NULL, *tail = NULL; while (old_chain) { - tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); + tmp = bio_clone_kmalloc(old_chain, gfpmask); if (!tmp) goto err_out; - __bio_clone(tmp, old_chain); tmp->bi_bdev = NULL; gfpmask &= ~__GFP_WAIT; tmp->bi_next = NULL; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9206781..4b67966 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1129,8 +1129,8 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and 
discard, so no need for concern about wasted bvec allocations. */ - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); - __bio_clone(clone, ci->bio); + clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs); + if (len) { clone->bi_sector = ci->sector; clone->bi_size = to_bytes(len); diff --git a/drivers/md/md.c b/drivers/md/md.c index b8eebe3..7a2b079 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -173,28 +173,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev) { - struct bio *b; - if (!mddev || !mddev->bio_set) return bio_clone(bio, gfp_mask); - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, mddev->bio_set); - if (!b) - return NULL; - - __bio_clone(b, bio); - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); } EXPORT_SYMBOL_GPL(bio_clone_mddev); diff --git a/fs/bio.c b/fs/bio.c index 5e947d3..31e637a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -430,16 +430,19 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** - * bio_clone - clone a bio + * bio_clone_bioset - clone a bio * @bio: bio to clone * @gfp_mask: allocation priority + * @bs: bio_set to allocate from * * Like __bio_clone, only also allocates the returned bio */ -struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, + struct bio_set *bs) { - struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs); + struct bio *b; + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); if (!b) return NULL; @@ -448,7 +451,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, bs); if (ret < 0) { bio_put(b); @@ 
-458,7 +461,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } -EXPORT_SYMBOL(bio_clone); +EXPORT_SYMBOL(bio_clone_bioset); /** * bio_get_nr_vecs - return approx number of vecs diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 1585db1..f936cb5 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct bio *bio; if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); if (unlikely(!bio)) { ORE_DBGMSG( "Failed to allocate BIO size=%u\n", @@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfd572..f9b61b4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -216,6 +216,9 @@ extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); extern void bio_free(struct bio *); +extern void __bio_clone(struct bio *, struct bio *); +extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); + extern struct bio_set *fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) @@ -223,18 +226,26 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } +static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, fs_bio_set); +} + static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } +static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, NULL); + +} + extern void bio_endio(struct bio *, int); 
struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); -extern void __bio_clone(struct bio *, struct bio *); -extern struct bio *bio_clone(struct bio *, gfp_t); - extern void bio_init(struct bio *); extern void bio_reset(struct bio *); -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
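The shape of the API change above — one generalized function taking an explicit pool, with the old entry points kept as thin inline wrappers that supply common defaults — can be sketched in plain userspace C. This is only an illustration of the pattern; `struct pool`, `struct bio`, and `bio_clone_pool` here are stand-ins, not the kernel's types or functions.

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-ins: "pool" plays the role of struct bio_set, and a NULL pool
 * means "just kmalloc it", mirroring bio_alloc_bioset()'s convention. */
struct pool { const char *name; };
struct bio  { int sector; int size; const char *source_pool; };

static struct pool fs_pool = { "fs_bio_set" };

/* The generalized function: all of the cloning logic lives here, once. */
static struct bio *bio_clone_pool(const struct bio *src, struct pool *p)
{
    struct bio *b = malloc(sizeof(*b));
    if (!b)
        return NULL;
    memcpy(b, src, sizeof(*b));
    b->source_pool = p ? p->name : "kmalloc";
    return b;
}

/* Thin wrappers preserve the old, convenient entry points. */
static struct bio *bio_clone(const struct bio *src)
{
    return bio_clone_pool(src, &fs_pool);    /* default shared pool */
}

static struct bio *bio_clone_kmalloc(const struct bio *src)
{
    return bio_clone_pool(src, NULL);        /* NULL pool == plain malloc */
}
```

The design point Tejun questions below is visible here too: the wrappers cost almost nothing, but they hide which pool a caller depends on.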
* Re: [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() [not found] ` <1346175456-1572-8-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 20:44 ` Tejun Heo 2012-08-28 22:05 ` Kent Overstreet 0 siblings, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-08-28 20:44 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe, NeilBrown, Alasdair Kergon, Jeff Garzik On Tue, Aug 28, 2012 at 10:37:34AM -0700, Kent Overstreet wrote: > +static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) > +{ > + return bio_clone_bioset(bio, gfp_mask, fs_bio_set); > +} > + ... > +static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) > +{ > + return bio_clone_bioset(bio, gfp_mask, NULL); > + > +} Do we really need these wrappers? I'd prefer requiring users to explicit choose @bioset when cloning. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() 2012-08-28 20:44 ` Tejun Heo @ 2012-08-28 22:05 ` Kent Overstreet [not found] ` <20120828220532.GB1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:05 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Jens Axboe, NeilBrown, Alasdair Kergon, Jeff Garzik On Tue, Aug 28, 2012 at 01:44:01PM -0700, Tejun Heo wrote: > On Tue, Aug 28, 2012 at 10:37:34AM -0700, Kent Overstreet wrote: > > +static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) > > +{ > > + return bio_clone_bioset(bio, gfp_mask, fs_bio_set); > > +} > > + > ... > > +static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) > > +{ > > + return bio_clone_bioset(bio, gfp_mask, NULL); > > + > > +} > > Do we really need these wrappers? I'd prefer requiring users to > explicit choose @bioset when cloning. bio_clone() is an existing api, I agree but I'd prefer to convert existing users in a separate patch and when I do that I want to spend some time actually looking at the existing code instead of doing the conversion blindly (at least some of the existing users are incorrect and I'll have to add bio_sets for them). ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() [not found] ` <20120828220532.GB1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> @ 2012-09-01 2:19 ` Tejun Heo 0 siblings, 0 replies; 75+ messages in thread From: Tejun Heo @ 2012-09-01 2:19 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe, NeilBrown, Alasdair Kergon, Jeff Garzik Hello, On Tue, Aug 28, 2012 at 03:05:32PM -0700, Kent Overstreet wrote: > On Tue, Aug 28, 2012 at 01:44:01PM -0700, Tejun Heo wrote: > > On Tue, Aug 28, 2012 at 10:37:34AM -0700, Kent Overstreet wrote: > > > +static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) > > > +{ > > > + return bio_clone_bioset(bio, gfp_mask, fs_bio_set); > > > +} > > > + > > ... > > > +static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) > > > +{ > > > + return bio_clone_bioset(bio, gfp_mask, NULL); > > > + > > > +} > > > > Do we really need these wrappers? I'd prefer requiring users to > > explicit choose @bioset when cloning. > > bio_clone() is an existing api, I agree but I'd prefer to convert > existing users in a separate patch and when I do that I want to spend > some time actually looking at the existing code instead of doing the > conversion blindly (at least some of the existing users are incorrect > and I'll have to add bio_sets for them). Aren't there like three users in kernel? If you wanna clean it up later, that's fine too but I don't think it would make much difference either way. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH v7 8/9] block: Reorder struct bio_set 2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet ` (6 preceding siblings ...) 2012-08-28 17:37 ` [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() Kent Overstreet @ 2012-08-28 17:37 ` Kent Overstreet 2012-08-28 17:37 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Kent Overstreet 8 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache, linux-kernel, dm-devel Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Jens Axboe This is prep work for the next patch, which embeds a struct bio_list in struct bio_set. Signed-off-by: Kent Overstreet <koverstreet@google.com> CC: Jens Axboe <axboe@kernel.dk> --- include/linux/bio.h | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index f9b61b4..3a8345e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -299,39 +299,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -#define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) - -struct bio_set { - struct kmem_cache *bio_slab; - unsigned int front_pad; - - mempool_t *bio_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; -#endif - mempool_t *bvec_pool; -}; - -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - -/* - * a small number of entries is fine, not going to be performance critical. 
- * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 - #ifdef CONFIG_HIGHMEM /* * remember never ever reenable interrupts between a bvec_kmap_irq and @@ -506,6 +473,39 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) + +struct bio_set { + struct kmem_cache *bio_slab; + unsigned int front_pad; + + mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif + mempool_t *bvec_pool; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet ` (7 preceding siblings ...) 2012-08-28 17:37 ` [PATCH v7 8/9] block: Reorder struct bio_set Kent Overstreet @ 2012-08-28 17:37 ` Kent Overstreet [not found] ` <1346175456-1572-10-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-08-29 16:24 ` Mikulas Patocka 8 siblings, 2 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 17:37 UTC (permalink / raw) To: linux-bcache, linux-kernel, dm-devel Cc: Kent Overstreet, tj, vgoyal, mpatocka, bharrosh, Jens Axboe Previously, if we ever try to allocate more than once from the same bio set while running under generic_make_request() (i.e. a stacking block driver), we risk deadlock. This is because of the code in generic_make_request() that converts recursion to iteration; any bios we submit won't actually be submitted (so they can complete and eventually be freed) until after we return - this means if we allocate a second bio, we're blocking the first one from ever being freed. Thus if enough threads call into a stacking block driver at the same time with bios that need multiple splits, and the bio_set's reserve gets used up, we deadlock. This can be worked around in the driver code - we could check if we're running under generic_make_request(), then mask out __GFP_WAIT when we go to allocate a bio, and if the allocation fails punt to workqueue and retry the allocation. But this is tricky and not a generic solution. This patch solves it for all users by inverting the previously described technique. We allocate a rescuer workqueue for each bio_set, and then in the allocation code if there are bios on current->bio_list we would be blocking, we punt them to the rescuer workqueue to be submitted. 
Tested it by forcing the rescue codepath to be taken (by disabling the first GFP_NOWAIT attempt), and then ran it with bcache (which does a lot of arbitrary bio splitting) and verified that the rescuer was being invoked. Signed-off-by: Kent Overstreet <koverstreet@google.com> CC: Jens Axboe <axboe@kernel.dk> --- fs/bio.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/bio.h | 9 +++++++ 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 31e637a..5d46318 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -285,6 +285,23 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -307,6 +324,7 @@ EXPORT_SYMBOL(bio_reset); */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -324,13 +342,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve.
+ * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; +retry: p = mempool_alloc(bs->bio_pool, gfp_mask); front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } if (unlikely(!p)) - return NULL; + goto err; bio = p + front_pad; bio_init(bio); @@ -351,6 +393,19 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) err_free: mempool_free(p, bs->bio_pool); +err: + if (gfp_mask != saved_gfp) { + gfp_mask = saved_gfp; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, current->bio_list); + bio_list_init(current->bio_list); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); + goto retry; + } + return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); @@ -1562,6 +1617,9 @@ static void biovec_free_pools(struct bio_set *bs) void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); @@ -1597,6 +1655,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1607,9 +1669,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + if (biovec_create_pools(bs, pool_size)) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", 
WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + return bs; bad: bioset_free(bs); return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 3a8345e..84fdaac 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -492,6 +492,15 @@ struct bio_set { mempool_t *bio_integrity_pool; #endif mempool_t *bvec_pool; + + /* + * Deadlock avoidance for stacking block drivers: see comments in + * bio_alloc_bioset() for details + */ + spinlock_t rescue_lock; + struct bio_list rescue_list; + struct work_struct rescue_work; + struct workqueue_struct *rescue_workqueue; }; struct biovec_slab { -- 1.7.12 ^ permalink raw reply related [flat|nested] 75+ messages in thread
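The forward-progress argument in the patch above can be modelled without threads at all. In this single-threaded toy (all names hypothetical), a fixed-size reserve stands in for the mempool, a counter of "held" bios stands in for entries pinned on current->bio_list, and the punt step stands in for handing those bios to the rescuer workqueue — which in the kernel runs asynchronously, but here completes them inline, which is enough to show why the retry can succeed once the caller's own bios stop pinning the reserve:

```c
#include <assert.h>

#define POOL_SIZE 2    /* like BIO_POOL_SIZE: a small fixed reserve */

struct toy_pool { int free; };

/* Non-blocking allocation attempt (the ~__GFP_WAIT path). */
static int pool_alloc_nowait(struct toy_pool *p)
{
    if (p->free == 0)
        return 0;
    p->free--;
    return 1;
}

/* "Rescuer": submitting a punted bio completes it and returns its
 * reserve entry to the pool. In the kernel this happens on a separate
 * WQ_MEM_RECLAIM workqueue; doing it inline keeps the model simple. */
static void rescue_submit(struct toy_pool *p, int *held)
{
    while (*held > 0) {
        (*held)--;
        p->free++;
    }
}

/* Allocate; instead of blocking forever on an exhausted reserve, punt the
 * bios we are holding and retry, mirroring the retry: path in the patch. */
static int bio_alloc_toy(struct toy_pool *p, int *held_on_bio_list)
{
    if (pool_alloc_nowait(p))
        return 1;
    rescue_submit(p, held_on_bio_list);  /* unpin our own bios first... */
    return pool_alloc_nowait(p);         /* ...then the retry can succeed */
}
```

Without the punt, the third allocation in the test below would sleep forever waiting on a reserve that only its own caller can refill — exactly the deadlock the commit message describes.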
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <1346175456-1572-10-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 20:49 ` Tejun Heo 2012-08-28 22:28 ` Kent Overstreet 2012-08-28 22:06 ` Vivek Goyal 1 sibling, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-08-28 20:49 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, On Tue, Aug 28, 2012 at 10:37:36AM -0700, Kent Overstreet wrote: > @@ -324,13 +342,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > front_pad = 0; > inline_vecs = nr_iovecs; > } else { > + /* > + * generic_make_request() converts recursion to iteration; this > + * means if we're running beneath it, any bios we allocate and > + * submit will not be submitted (and thus freed) until after we > + * return. > + * > + * This exposes us to a potential deadlock if we allocate > + * multiple bios from the same bio_set() while running > + * underneath generic_make_request(). If we were to allocate > + * multiple bios (say a stacking block driver that was splitting > + * bios), we would deadlock if we exhausted the mempool's > + * reserve. > + * > + * We solve this, and guarantee forward progress, with a rescuer > + * workqueue per bio_set. If we go to allocate and there are > + * bios on current->bio_list, we first try the allocation > + * without __GFP_WAIT; if that fails, we punt those bios we > + * would be blocking to the rescuer workqueue before we retry > + * with the original gfp_flags. 
> + */ > + > + if (current->bio_list && !bio_list_empty(current->bio_list)) > + gfp_mask &= ~__GFP_WAIT; > +retry: > p = mempool_alloc(bs->bio_pool, gfp_mask); > front_pad = bs->front_pad; > inline_vecs = BIO_INLINE_VECS; > } > > if (unlikely(!p)) > - return NULL; > + goto err; > > bio = p + front_pad; > bio_init(bio); > @@ -351,6 +393,19 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > > err_free: > mempool_free(p, bs->bio_pool); > +err: > + if (gfp_mask != saved_gfp) { > + gfp_mask = saved_gfp; > + > + spin_lock(&bs->rescue_lock); > + bio_list_merge(&bs->rescue_list, current->bio_list); > + bio_list_init(current->bio_list); > + spin_unlock(&bs->rescue_lock); > + > + queue_work(bs->rescue_workqueue, &bs->rescue_work); > + goto retry; > + } I wonder whether it would be easier to follow if this part is in-line where retry: is. All that needs to be duplicated is single mempool_alloc() call, right? Overall, I *think* this is correct but need to think more about it to be sure. Thanks! -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-28 20:49 ` Tejun Heo @ 2012-08-28 22:28 ` Kent Overstreet [not found] ` <20120828222800.GG1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:28 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache, linux-kernel, dm-devel, vgoyal, mpatocka, bharrosh, Jens Axboe On Tue, Aug 28, 2012 at 01:49:10PM -0700, Tejun Heo wrote: > Hello, > > On Tue, Aug 28, 2012 at 10:37:36AM -0700, Kent Overstreet wrote: > > @@ -324,13 +342,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > > front_pad = 0; > > inline_vecs = nr_iovecs; > > } else { > > + /* > > + * generic_make_request() converts recursion to iteration; this > > + * means if we're running beneath it, any bios we allocate and > > + * submit will not be submitted (and thus freed) until after we > > + * return. > > + * > > + * This exposes us to a potential deadlock if we allocate > > + * multiple bios from the same bio_set() while running > > + * underneath generic_make_request(). If we were to allocate > > + * multiple bios (say a stacking block driver that was splitting > > + * bios), we would deadlock if we exhausted the mempool's > > + * reserve. > > + * > > + * We solve this, and guarantee forward progress, with a rescuer > > + * workqueue per bio_set. If we go to allocate and there are > > + * bios on current->bio_list, we first try the allocation > > + * without __GFP_WAIT; if that fails, we punt those bios we > > + * would be blocking to the rescuer workqueue before we retry > > + * with the original gfp_flags. 
> > + */ > > + > > + if (current->bio_list && !bio_list_empty(current->bio_list)) > > + gfp_mask &= ~__GFP_WAIT; > > +retry: > > p = mempool_alloc(bs->bio_pool, gfp_mask); > > front_pad = bs->front_pad; > > inline_vecs = BIO_INLINE_VECS; > > } > > > > if (unlikely(!p)) > > - return NULL; > > + goto err; > > > > bio = p + front_pad; > > bio_init(bio); > > @@ -351,6 +393,19 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > > > > err_free: > > mempool_free(p, bs->bio_pool); > > +err: > > + if (gfp_mask != saved_gfp) { > > + gfp_mask = saved_gfp; > > + > > + spin_lock(&bs->rescue_lock); > > + bio_list_merge(&bs->rescue_list, current->bio_list); > > + bio_list_init(current->bio_list); > > + spin_unlock(&bs->rescue_lock); > > + > > + queue_work(bs->rescue_workqueue, &bs->rescue_work); > > + goto retry; > > + } > > I wonder whether it would be easier to follow if this part is in-line > where retry: is. All that needs to be duplicated is single > mempool_alloc() call, right? Actually, what might be better than both of those approaches is shoving that code into another function. Then it's just: if (gfp_mask != saved_gfp) { gfp_mask = saved_gfp; shovel_bios_to_rescuer(); goto retry; } > Overall, I *think* this is correct but need to think more about it to > be sure. Please do. As much time as I've spent staring at this kind of stuff, I'm pretty sure I've got it correct but it still makes my head hurt to work out all the various possible deadlocks. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120828222800.GG1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> @ 2012-08-28 23:01 ` Kent Overstreet [not found] ` <20120828230108.GI1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 23:01 UTC (permalink / raw) To: Tejun Heo Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, vgoyal-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 03:28:00PM -0700, Kent Overstreet wrote: > On Tue, Aug 28, 2012 at 01:49:10PM -0700, Tejun Heo wrote: > > Overall, I *think* this is correct but need to think more about it to > > be sure. > > Please do. As much time as I've spent staring at this kind of stuff, > I'm pretty sure I've got it correct but it still makes my head hurt to > work out all the various possible deadlocks. Hilarious thought: We're punting bios to a rescuer thread that's specific to a certain bio_set, right? What if we happen to punt bios from a different bio_set? And then the rescuer goes to resubmit those bios, and in the process they happen to have dependencies on the original bio_set... I think it's actually necessary to filter out only bios from the current bio_set to punt to the rescuer. God I love the block layer sometimes. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120828230108.GI1048-jC9Py7bek1znysI04z7BkA@public.gmane.org> @ 2012-08-29 1:31 ` Vivek Goyal 2012-08-29 3:25 ` Kent Overstreet 0 siblings, 1 reply; 75+ messages in thread From: Vivek Goyal @ 2012-08-29 1:31 UTC (permalink / raw) To: Kent Overstreet Cc: Tejun Heo, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 04:01:08PM -0700, Kent Overstreet wrote: > On Tue, Aug 28, 2012 at 03:28:00PM -0700, Kent Overstreet wrote: > > On Tue, Aug 28, 2012 at 01:49:10PM -0700, Tejun Heo wrote: > > > Overall, I *think* this is correct but need to think more about it to > > > be sure. > > > > Please do. As much time as I've spent staring at this kind of stuff, > > I'm pretty sure I've got it correct but it still makes my head hurt to > > work out all the various possible deadlocks. > > Hilarious thought: We're punting bios to a rescuer thread that's > specific to a certain bio_set, right? What if we happen to punt bios > from a different bio_set? And then the rescuer goes to resubmit those > bios, and in the process they happen to have dependencies on the > original bio_set... Are they not fully allocated bios and when you submit these to underlying device, ideally we should not be sharing memory pool at different layers of stack otherwise we will deadlock any way as stack depth increases. So there should not be a dependency on original bio_set? Or, am I missing something. May be an example will help. Thanks Vivek ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-29 1:31 ` Vivek Goyal @ 2012-08-29 3:25 ` Kent Overstreet [not found] ` <20120829032558.GA22214-jC9Py7bek1znysI04z7BkA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 3:25 UTC (permalink / raw) To: Vivek Goyal Cc: Tejun Heo, linux-bcache, linux-kernel, dm-devel, mpatocka, bharrosh, Jens Axboe On Tue, Aug 28, 2012 at 09:31:50PM -0400, Vivek Goyal wrote: > On Tue, Aug 28, 2012 at 04:01:08PM -0700, Kent Overstreet wrote: > > On Tue, Aug 28, 2012 at 03:28:00PM -0700, Kent Overstreet wrote: > > > On Tue, Aug 28, 2012 at 01:49:10PM -0700, Tejun Heo wrote: > > > > Overall, I *think* this is correct but need to think more about it to > > > > be sure. > > > > > > Please do. As much time as I've spent staring at this kind of stuff, > > > I'm pretty sure I've got it correct but it still makes my head hurt to > > > work out all the various possible deadlocks. > > > > Hilarious thought: We're punting bios to a rescuer thread that's > > specific to a certain bio_set, right? What if we happen to punt bios > > from a different bio_set? And then the rescuer goes to resubmit those > > bios, and in the process they happen to have dependencies on the > > original bio_set... > > Are they not fully allocated bios and when you submit these to underlying > device, ideally we should not be sharing memory pool at different layers > of stack otherwise we will deadlock any way as stack depth increases. So > there should not be a dependency on original bio_set? > > Or, am I missing something. May be an example will help. Uh, it's more complicated than that. 
My brain is too fried right now to walk through it in detail, but the problem (if it is a problem; I can't convince myself one way or the other) is roughly: one virt block device stacked on top of another - they both do arbitrary splitting. So once they've submitted a bio, that bio needs to make forward progress even if the thread goes to allocate another bio and blocks before it returns from its make_request fn. That much my patch solves, with the rescuer thread; if the thread goes to block, it punts those blocked bios off to a rescuer thread - and we create one rescuer per bio set. So going back to the stacked block devices, if you've got say dm on top of md (or something else since md doesn't really do much splitting) - each block device will have its own rescuer and everything should be hunky dory. Except that when thread a goes to punt those blocked bios to its rescuer, it punts _all_ the bios on current->bio_list. Even those generated by/belonging to other bio_sets. So thread 1 in device b punts bios to its rescuer, thread 2. But thread 2 ends up with bios for both device a and b - because they're stacked. Thread 2 starts on bios for device a before it gets to those for device b. But a is stacked on top of b, so in the process it generates more bios for b. So now it's uhm... yeah, I'm gonna sleep on this. I'm pretty sure that, to be rigorously correct, filtering out the right bios when we punt them to the rescuer is needed, though. ^ permalink raw reply [flat|nested] 75+ messages in thread
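The fix being described here — punting only the bios that belong to the bio_set whose allocation is blocking, and leaving the rest on the caller's list — amounts to a stable partition of a singly linked list. A userspace sketch of such a filter (hypothetical names throughout: `struct tbio` and the integer `owner` field stand in for struct bio and bio->bi_pool; this is not the eventual kernel code):

```c
#include <assert.h>
#include <stddef.h>

struct tbio {
    int owner;              /* which bio_set this bio was allocated from */
    struct tbio *next;
};

/* Move bios owned by `owner` from *src to the tail of *dst, preserving
 * relative order; bios from other bio_sets stay on *src untouched. */
static void punt_matching(struct tbio **src, struct tbio **dst, int owner)
{
    struct tbio *keep = NULL, **keep_tail = &keep, **dst_tail = dst;

    while (*dst_tail)                     /* find tail of rescue list */
        dst_tail = &(*dst_tail)->next;

    while (*src) {
        struct tbio *b = *src;
        *src = b->next;
        b->next = NULL;
        if (b->owner == owner) {          /* ours: punt to the rescuer */
            *dst_tail = b;
            dst_tail = &b->next;
        } else {                          /* someone else's: keep it */
            *keep_tail = b;
            keep_tail = &b->next;
        }
    }
    *src = keep;
}
```

With a filter like this, a bio_set's rescuer only ever resubmits bios allocated from that bio_set, so it cannot be wedged by bios whose forward progress depends on some other bio_set's reserve.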
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829032558.GA22214-jC9Py7bek1znysI04z7BkA@public.gmane.org> @ 2012-08-29 12:57 ` Vivek Goyal [not found] ` <20120829125759.GB12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Vivek Goyal @ 2012-08-29 12:57 UTC (permalink / raw) To: Kent Overstreet Cc: Tejun Heo, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 08:25:58PM -0700, Kent Overstreet wrote: [..] > Except that when thread a goes to punt those blocked bios to its > rescuer, it punts _all_ the bios on current->bio_list. Even those > generated by/belonging to other bio_sets. > > So thread 1 in device b punts bios to its rescuer, thread 2 > > But thread 2 ends up with bios for both device a and b - because they're > stacked. Ok, just to add more details to the above example. Say we have device A stacked on top of B and B stacked on top of C. Say a bio a is submitted to device A and is split in two bios b1 and b2. Now b1 is submitted to device B and is split in c1 and c2. Now the current bio list has three bios: b2, c1 and c2. If the submitter is now about to block on any bio set, then all three bios b2, c1, c2 will be punted to the rescue thread, and submission of b2 will again block, resulting in blocking the rescue thread itself. I would say keep all the bio splitting patches and any fixes w.r.t deadlocks in a separate series, as this is a little complicated and a lot of it is just theoretical corner cases. If you limit this series to just bio_set related cleanups, it becomes more acceptable for inclusion. Thanks Vivek ^ permalink raw reply [flat|nested] 75+ messages in thread
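Vivek's scenario follows directly from how generic_make_request() turns recursion into iteration: nested submissions are queued on a per-task list rather than dispatched recursively. A small userspace model (hypothetical names; strings stand in for bios, and the split decisions are hard-coded to match his example) reproduces the state he describes — right after `b1` is dispatched, the pending list holds exactly `b2`, `c1`, `c2`:

```c
#include <assert.h>
#include <string.h>

#define MAXQ 16

static const char *queue[MAXQ];     /* stand-in for current->bio_list */
static int qhead, qtail;
static int in_make_request;

static const char *order[MAXQ];     /* records the dispatch order */
static int norder;

static void dispatch(const char *bio);

/* Recursion-to-iteration: nested calls queue, the outermost call drains. */
static void make_request(const char *bio)
{
    if (in_make_request) {
        queue[qtail++] = bio;       /* nested: just queue it */
        return;
    }
    in_make_request = 1;
    dispatch(bio);
    while (qhead < qtail)           /* iterate instead of recurse */
        dispatch(queue[qhead++]);
    in_make_request = 0;
}

/* The "drivers": A splits a into b1+b2, B splits b1 into c1+c2. */
static void dispatch(const char *bio)
{
    order[norder++] = bio;
    if (strcmp(bio, "a") == 0) {
        make_request("b1");
        make_request("b2");
    } else if (strcmp(bio, "b1") == 0) {
        make_request("c1");
        make_request("c2");
    }
}
```

Because every queued bio waits for everything ahead of it, a blocking allocation anywhere in this chain pins all three pending bios at once — which is why the punting (and, per the discussion above, per-bio_set filtering) matters.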
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829125759.GB12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2012-08-29 14:39 ` Alasdair G Kergon [not found] ` <20120829143913.GA5500-FDJ95KluN3Z0klwcnFlA1dvLeJWuRmrY@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Alasdair G Kergon @ 2012-08-29 14:39 UTC (permalink / raw) To: Kent Overstreet Cc: Vivek Goyal, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Tejun Heo On Wed, Aug 29, 2012 at 08:57:59AM -0400, Vivek Goyal wrote: > I would say keep all the bio splitting patches and any fixes w.r.t > deadlocks in a seprate series. As this is little complicated and a lot > of is just theoritical corner cases. If you limit this series to just > bio_set related cleanups, it becomes more acceptable for inclusion. Yes, please keep the splitting patches separate. The current code gets away with what it does through statistics making the deadlocks very unlikely. It's also instructive to remember why the code is the way it is: it used to process bios for underlying devices immediately, but this sometimes meant too much recursive stack growth. If a per-device rescuer thread is to be made available (as well as the mempool), the option of reinstating recursion is there too - only punting to workqueue when the stack actually becomes "too big". (Also bear in mind that some dm targets may have dependencies on their own mempools - submission can block there too.) I find it helpful only to consider splitting into two pieces - it must always be possible to process the first piece (i.e. process it at the next layer down in the stack) and complete it independently of what happens to the second piece (which might require further splitting and block until the first piece has completed). 
Alasdair ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829143913.GA5500-FDJ95KluN3Z0klwcnFlA1dvLeJWuRmrY@public.gmane.org> @ 2012-08-29 16:26 ` Kent Overstreet 2012-08-29 21:01 ` John Stoffel 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 16:26 UTC (permalink / raw) To: Vivek Goyal, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Tejun Heo On Wed, Aug 29, 2012 at 03:39:14PM +0100, Alasdair G Kergon wrote: > It's also instructive to remember why the code is the way it is: it used > to process bios for underlying devices immediately, but this sometimes > meant too much recursive stack growth. If a per-device rescuer thread > is to be made available (as well as the mempool), the option of > reinstating recursion is there too - only punting to workqueue when the > stack actually becomes "too big". (Also bear in mind that some dm > targets may have dependencies on their own mempools - submission can > block there too.) I find it helpful only to consider splitting into two > pieces - it must always be possible to process the first piece (i.e. > process it at the next layer down in the stack) and complete it > independently of what happens to the second piece (which might require > further splitting and block until the first piece has completed). I'm sure it could be made to work (and it may well be simpler), but it seems problematic from a performance pov. With stacked devices you'd then have to switch stacks on _every_ bio. That could be made fast enough I'm sure, but it wouldn't be free and I don't know of any existing code in the kernel that implements what we'd need (though if you know how you'd go about doing that, I'd love to know! Would be useful for other things). 
The real problem is that because we'd need these extra stacks for handling all bios we couldn't get by with just one per bio_set. We'd only need one to make forward progress so the rest could be allocated on demand (i.e. what the workqueue code does) but that sounds like it's starting to get expensive. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-29 16:26 ` Kent Overstreet @ 2012-08-29 21:01 ` John Stoffel [not found] ` <20542.33557.833272.561494-QCwP04CIH+34g5BaSWARrrNAH6kLmebB@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: John Stoffel @ 2012-08-29 21:01 UTC (permalink / raw) To: Kent Overstreet Cc: Vivek Goyal, Jens Axboe, dm-devel, linux-kernel, linux-bcache, mpatocka, bharrosh, Tejun Heo >>>>> "Kent" == Kent Overstreet <koverstreet@google.com> writes: Kent> On Wed, Aug 29, 2012 at 03:39:14PM +0100, Alasdair G Kergon wrote: >> It's also instructive to remember why the code is the way it is: it used >> to process bios for underlying devices immediately, but this sometimes >> meant too much recursive stack growth. If a per-device rescuer thread >> is to be made available (as well as the mempool), the option of >> reinstating recursion is there too - only punting to workqueue when the >> stack actually becomes "too big". (Also bear in mind that some dm >> targets may have dependencies on their own mempools - submission can >> block there too.) I find it helpful only to consider splitting into two >> pieces - it must always be possible to process the first piece (i.e. >> process it at the next layer down in the stack) and complete it >> independently of what happens to the second piece (which might require >> further splitting and block until the first piece has completed). Kent> I'm sure it could be made to work (and it may well simpler), but it Kent> seems problematic from a performance pov. Kent> With stacked devices you'd then have to switch stacks on _every_ bio. Kent> That could be made fast enough I'm sure, but it wouldn't be free and I Kent> don't know of any existing code in the kernel that implements what we'd Kent> need (though if you know how you'd go about doing that, I'd love to Kent> know! Would be useful for other things). 
Kent> The real problem is that because we'd need these extra stacks for Kent> handling all bios we couldn't get by with just one per bio_set. We'd Kent> only need one to make forward progress so the rest could be allocated Kent> on demand (i.e. what the workqueue code does) but that sounds like it's Kent> starting to get expensive. Maybe we need to limit the size of BIOs to that of the bottom-most underlying device instead? Or maybe limit BIOs to some small multiple? As you stack up DM targets one on top of each other, they should respect the limits of the underlying devices and pass those limits up the chain. Or maybe I'm speaking gibberish... John ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20542.33557.833272.561494-QCwP04CIH+34g5BaSWARrrNAH6kLmebB@public.gmane.org> @ 2012-08-29 21:08 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 21:08 UTC (permalink / raw) To: John Stoffel Cc: Vivek Goyal, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Tejun Heo On Wed, Aug 29, 2012 at 05:01:09PM -0400, John Stoffel wrote: > >>>>> "Kent" == Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> writes: > > Kent> On Wed, Aug 29, 2012 at 03:39:14PM +0100, Alasdair G Kergon wrote: > >> It's also instructive to remember why the code is the way it is: it used > >> to process bios for underlying devices immediately, but this sometimes > >> meant too much recursive stack growth. If a per-device rescuer thread > >> is to be made available (as well as the mempool), the option of > >> reinstating recursion is there too - only punting to workqueue when the > >> stack actually becomes "too big". (Also bear in mind that some dm > >> targets may have dependencies on their own mempools - submission can > >> block there too.) I find it helpful only to consider splitting into two > >> pieces - it must always be possible to process the first piece (i.e. > >> process it at the next layer down in the stack) and complete it > >> independently of what happens to the second piece (which might require > >> further splitting and block until the first piece has completed). > > Kent> I'm sure it could be made to work (and it may well simpler), but it > Kent> seems problematic from a performance pov. > > Kent> With stacked devices you'd then have to switch stacks on _every_ bio. 
> Kent> That could be made fast enough I'm sure, but it wouldn't be free and I > Kent> don't know of any existing code in the kernel that implements what we'd > Kent> need (though if you know how you'd go about doing that, I'd love to > Kent> know! Would be useful for other things). > > Kent> The real problem is that because we'd need these extra stacks for > Kent> handling all bios we couldn't get by with just one per bio_set. We'd > Kent> only need one to make forward progress so the rest could be allocated > Kent> on demand (i.e. what the workqueue code does) but that sounds like it's > Kent> starting to get expensive. > > Maybe we need to limit the size of BIOs to that of the bottom-most > underlying device instead? Or maybe limit BIOs to some small > multiple? As you stack up DM targets one on top of each other, they > should respect the limits of the underlying devices and pass those > limits up the chain. That's the approach the block layer currently tries to take. It's brittle, tricky and inefficient, and it completely breaks down when the limits are dynamic - so things like dm and bcache are always going to have to split anyways. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <1346175456-1572-10-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-08-28 20:49 ` Tejun Heo @ 2012-08-28 22:06 ` Vivek Goyal [not found] ` <20120828220610.GH31674-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Vivek Goyal @ 2012-08-28 22:06 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 10:37:36AM -0700, Kent Overstreet wrote: > Previously, if we ever try to allocate more than once from the same bio > set while running under generic_make_request() (i.e. a stacking block > driver), we risk deadlock. > > This is because of the code in generic_make_request() that converts > recursion to iteration; any bios we submit won't actually be submitted > (so they can complete and eventually be freed) until after we return - > this means if we allocate a second bio, we're blocking the first one > from ever being freed. > > Thus if enough threads call into a stacking block driver at the same > time with bios that need multiple splits, and the bio_set's reserve gets > used up, we deadlock. Hi Kent, So the above deadlock possibility arises only if multiple threads are doing multiple splits from the same pool and all the threads get blocked on the mempool without returning from the ->make_request function. Is there any existing driver in the tree which can cause this deadlock, or does it become a possibility only once the splitting and bcache code shows up? Thanks Vivek ^ permalink raw reply [flat|nested] 75+ messages in thread
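The mechanism in the patch header quoted above reduces to a toy model: bios already allocated under generic_make_request() sit parked on current->bio_list until the submitter returns, so they can never complete and free back into the mempool. If one submission needs more allocations than the pool's reserve holds, the later allocation blocks with no way to make progress. A hypothetical userspace sketch (none of these names are real kernel API):

```c
#include <assert.h>

struct mempool { int free; };

/* Non-blocking allocation attempt: 1 on success, 0 if the pool is empty. */
static int mempool_tryalloc(struct mempool *p)
{
	if (p->free > 0) {
		p->free--;
		return 1;
	}
	return 0;
}

/* Returns 1 if a submitter needing `splits` allocations from a pool
 * with `reserve` entries would block forever.  In the kernel the
 * failing case would be mempool_alloc() sleeping for a free that can
 * only come from bios parked on current->bio_list -- which cannot
 * complete until we return. */
static int would_deadlock(int reserve, int splits)
{
	struct mempool pool = { reserve };

	while (splits--) {
		if (!mempool_tryalloc(&pool))
			return 1;	/* blocks with no forward progress */
	}
	return 0;
}
```

With a reserve of one and a bio that needs two splits, the second allocation deadlocks; a reserve at least as large as the number of splits does not.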
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120828220610.GH31674-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2012-08-28 22:23 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-28 22:23 UTC (permalink / raw) To: Vivek Goyal Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, mpatocka-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Aug 28, 2012 at 06:06:10PM -0400, Vivek Goyal wrote: > On Tue, Aug 28, 2012 at 10:37:36AM -0700, Kent Overstreet wrote: > > Previously, if we ever try to allocate more than once from the same bio > > set while running under generic_make_request() (i.e. a stacking block > > driver), we risk deadlock. > > > > This is because of the code in generic_make_request() that converts > > recursion to iteration; any bios we submit won't actually be submitted > > (so they can complete and eventually be freed) until after we return - > > this means if we allocate a second bio, we're blocking the first one > > from ever being freed. > > > > Thus if enough threads call into a stacking block driver at the same > > time with bios that need multiple splits, and the bio_set's reserve gets > > used up, we deadlock. > > Hi Kent, > > So above deadlock possibility arises only if multiple threads are doing > multiple splits from same pool and all the threads get blocked on mempool > and don't return from ->make_request function. > > Is there any existing driver in the tree which can cause this deadlock or > it becomes a possibility only when splitting and bcache code shows up? It is emphatically possible with dm, though you would probably need a pathological configuration of your targets for it to be possible in practice - at least with the targets currently commonly in use. 
With most of the newer dm targets coming down the pipe I expect it should be possible under real world configurations, with the caveat that under that kind of memory pressure in practice many other things will fall over first. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-28 17:37 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Kent Overstreet [not found] ` <1346175456-1572-10-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-29 16:24 ` Mikulas Patocka 2012-08-29 16:50 ` Kent Overstreet 1 sibling, 1 reply; 75+ messages in thread From: Mikulas Patocka @ 2012-08-29 16:24 UTC (permalink / raw) To: Kent Overstreet Cc: linux-bcache, linux-kernel, dm-devel, tj, vgoyal, bharrosh, Jens Axboe Hi This fixes the bio allocation problems, but doesn't fix a similar deadlock in device mapper when allocating from md->io_pool or other mempools in the target driver. The problem is that majority of device mapper code assumes that if we submit a bio, that bio will be finished in a finite time. The commit d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22 broke this assumption. I suggest - instead of writing workarounds for this current->bio_list misbehavior, why not remove current->bio_list at all? We could revert d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1, allocate a per-device workqueue, test stack usage in generic_make_request, and if it is too high (more than half of the stack used, or so), put the bio to the target device's blockqueue. That could be simpler than allocating per-bioset workqueue and it also solves more problems (possible deadlocks in dm). Mikulas On Tue, 28 Aug 2012, Kent Overstreet wrote: > Previously, if we ever try to allocate more than once from the same bio > set while running under generic_make_request() (i.e. a stacking block > driver), we risk deadlock. > > This is because of the code in generic_make_request() that converts > recursion to iteration; any bios we submit won't actually be submitted > (so they can complete and eventually be freed) until after we return - > this means if we allocate a second bio, we're blocking the first one > from ever being freed. 
> > Thus if enough threads call into a stacking block driver at the same > time with bios that need multiple splits, and the bio_set's reserve gets > used up, we deadlock. > > This can be worked around in the driver code - we could check if we're > running under generic_make_request(), then mask out __GFP_WAIT when we > go to allocate a bio, and if the allocation fails punt to workqueue and > retry the allocation. > > But this is tricky and not a generic solution. This patch solves it for > all users by inverting the previously described technique. We allocate a > rescuer workqueue for each bio_set, and then in the allocation code if > there are bios on current->bio_list we would be blocking, we punt them > to the rescuer workqueue to be submitted. > > Tested it by forcing the rescue codepath to be taken (by disabling the > first GFP_NOWAIT) attempt, and then ran it with bcache (which does a lot > of arbitrary bio splitting) and verified that the rescuer was being > invoked. > > Signed-off-by: Kent Overstreet <koverstreet@google.com> > CC: Jens Axboe <axboe@kernel.dk> > --- > fs/bio.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++--- > include/linux/bio.h | 9 +++++++ > 2 files changed, 79 insertions(+), 3 deletions(-) > > diff --git a/fs/bio.c b/fs/bio.c > index 31e637a..5d46318 100644 > --- a/fs/bio.c > +++ b/fs/bio.c > @@ -285,6 +285,23 @@ void bio_reset(struct bio *bio) > } > EXPORT_SYMBOL(bio_reset); > > +static void bio_alloc_rescue(struct work_struct *work) > +{ > + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); > + struct bio *bio; > + > + while (1) { > + spin_lock(&bs->rescue_lock); > + bio = bio_list_pop(&bs->rescue_list); > + spin_unlock(&bs->rescue_lock); > + > + if (!bio) > + break; > + > + generic_make_request(bio); > + } > +} > + > /** > * bio_alloc_bioset - allocate a bio for I/O > * @gfp_mask: the GFP_ mask given to the slab allocator > @@ -307,6 +324,7 @@ EXPORT_SYMBOL(bio_reset); > */ > struct bio 
*bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > { > + gfp_t saved_gfp = gfp_mask; > unsigned front_pad; > unsigned inline_vecs; > unsigned long idx = BIO_POOL_NONE; > @@ -324,13 +342,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > front_pad = 0; > inline_vecs = nr_iovecs; > } else { > + /* > + * generic_make_request() converts recursion to iteration; this > + * means if we're running beneath it, any bios we allocate and > + * submit will not be submitted (and thus freed) until after we > + * return. > + * > + * This exposes us to a potential deadlock if we allocate > + * multiple bios from the same bio_set() while running > + * underneath generic_make_request(). If we were to allocate > + * multiple bios (say a stacking block driver that was splitting > + * bios), we would deadlock if we exhausted the mempool's > + * reserve. > + * > + * We solve this, and guarantee forward progress, with a rescuer > + * workqueue per bio_set. If we go to allocate and there are > + * bios on current->bio_list, we first try the allocation > + * without __GFP_WAIT; if that fails, we punt those bios we > + * would be blocking to the rescuer workqueue before we retry > + * with the original gfp_flags. 
> + */ > + > + if (current->bio_list && !bio_list_empty(current->bio_list)) > + gfp_mask &= ~__GFP_WAIT; > +retry: > p = mempool_alloc(bs->bio_pool, gfp_mask); > front_pad = bs->front_pad; > inline_vecs = BIO_INLINE_VECS; > } > > if (unlikely(!p)) > - return NULL; > + goto err; > > bio = p + front_pad; > bio_init(bio); > @@ -351,6 +393,19 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) > > err_free: > mempool_free(p, bs->bio_pool); > +err: > + if (gfp_mask != saved_gfp) { > + gfp_mask = saved_gfp; > + > + spin_lock(&bs->rescue_lock); > + bio_list_merge(&bs->rescue_list, current->bio_list); > + bio_list_init(current->bio_list); > + spin_unlock(&bs->rescue_lock); > + > + queue_work(bs->rescue_workqueue, &bs->rescue_work); > + goto retry; > + } > + > return NULL; > } > EXPORT_SYMBOL(bio_alloc_bioset); > @@ -1562,6 +1617,9 @@ static void biovec_free_pools(struct bio_set *bs) > > void bioset_free(struct bio_set *bs) > { > + if (bs->rescue_workqueue) > + destroy_workqueue(bs->rescue_workqueue); > + > if (bs->bio_pool) > mempool_destroy(bs->bio_pool); > > @@ -1597,6 +1655,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) > > bs->front_pad = front_pad; > > + spin_lock_init(&bs->rescue_lock); > + bio_list_init(&bs->rescue_list); > + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); > + > bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); > if (!bs->bio_slab) { > kfree(bs); > @@ -1607,9 +1669,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) > if (!bs->bio_pool) > goto bad; > > - if (!biovec_create_pools(bs, pool_size)) > - return bs; > + if (biovec_create_pools(bs, pool_size)) > + goto bad; > + > + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); > + if (!bs->rescue_workqueue) > + goto bad; > > + return bs; > bad: > bioset_free(bs); > return NULL; > diff --git a/include/linux/bio.h b/include/linux/bio.h > index 3a8345e..84fdaac 100644 > --- 
a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -492,6 +492,15 @@ struct bio_set { > mempool_t *bio_integrity_pool; > #endif > mempool_t *bvec_pool; > + > + /* > + * Deadlock avoidance for stacking block drivers: see comments in > + * bio_alloc_bioset() for details > + */ > + spinlock_t rescue_lock; > + struct bio_list rescue_list; > + struct work_struct rescue_work; > + struct workqueue_struct *rescue_workqueue; > }; > > struct biovec_slab { > -- > 1.7.12 > ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-29 16:24 ` Mikulas Patocka @ 2012-08-29 16:50 ` Kent Overstreet [not found] ` <20120829165006.GB20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 16:50 UTC (permalink / raw) To: Mikulas Patocka Cc: linux-bcache, linux-kernel, dm-devel, tj, vgoyal, bharrosh, Jens Axboe On Wed, Aug 29, 2012 at 12:24:43PM -0400, Mikulas Patocka wrote: > Hi > > This fixes the bio allocation problems, but doesn't fix a similar deadlock > in device mapper when allocating from md->io_pool or other mempools in > the target driver. Ick. That is a problem. There are a bunch of mempools in drivers/md too :( Though honestly bio_sets have the front_pad thing for a reason, for per bio state it makes sense to use that anyways - simpler code because you're doing fewer explicit allocations and more efficient too. So I wonder if we fixed that if it'd still be a problem. The other thing we could do is make the rescuer thread per block device (which arguably makes more sense than per bio_set anyways), then associate bio_sets with specific block devices/rescuers. Then the rescuer code can be abstracted out from the bio allocation code, and we just create a thin wrapper around mempool_alloc() that does the same dance bio_alloc_bioset() does now. I think that'd be pretty easy and work out really nicely. > The problem is that majority of device mapper code assumes that if we > submit a bio, that bio will be finished in a finite time. The commit > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22 broke this assumption. > > I suggest - instead of writing workarounds for this current->bio_list > misbehavior, why not remove current->bio_list at all? 
We could revert > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1, allocate a per-device workqueue, > test stack usage in generic_make_request, and if it is too high (more than > half of the stack used, or so), put the bio to the target device's > blockqueue. > > That could be simpler than allocating per-bioset workqueue and it also > solves more problems (possible deadlocks in dm). It certainly would be simpler, but honestly the potential for performance regressions scares me (and bcache at least is used on fast enough devices where it's going to matter). Also it's not so much the performance overhead - we can just measure that - it's that if we're just using the workqueue code the scheduler's getting involved and we can't just measure what the effects of that are going to be in production. ^ permalink raw reply [flat|nested] 75+ messages in thread
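Kent's "thin wrapper around mempool_alloc() that does the same dance bio_alloc_bioset() does now" could look roughly like the following userspace sketch. The callback stands in for mempool_alloc(), and the `punted` flag stands in for queueing the parked bios to the rescuer workqueue; all names are made up for illustration, not proposed API.

```c
#include <assert.h>
#include <stddef.h>

typedef void *(*alloc_fn)(int can_block, void *pool);

/* Try the pool without blocking while bios are parked on
 * current->bio_list; on failure, punt the parked bios to the rescuer
 * (so they can complete and refill the pool) and retry with blocking
 * allowed -- the same dance bio_alloc_bioset() does in the patch. */
static void *rescuing_alloc(alloc_fn alloc, void *pool,
			    int bios_parked, int *punted)
{
	void *p;

	/* Blocking is only dangerous while bios are parked. */
	p = alloc(bios_parked ? 0 : 1, pool);
	if (p)
		return p;

	/* Hand the parked bios to the rescuer, then block safely. */
	*punted = 1;
	return alloc(1, pool);
}

/* Fake pool for demonstration: models an exhausted reserve that only
 * yields an entry once blocking (i.e. waiting for a free) is allowed. */
static void *exhausted_pool_alloc(int can_block, void *pool)
{
	(void)pool;
	return can_block ? (void *)1 : NULL;
}
```

With bios parked, the wrapper fails the non-blocking attempt, punts, and then succeeds; with nothing parked, it blocks directly and never touches the rescuer.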
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829165006.GB20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-29 16:57 ` Alasdair G Kergon 2012-08-29 17:07 ` Vivek Goyal 1 sibling, 0 replies; 75+ messages in thread From: Alasdair G Kergon @ 2012-08-29 16:57 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, tj-DgEjT+Ai2ygdnm+yROfE0A, vgoyal-H+wXaHxf7aLQT0dZR+AlfA On Wed, Aug 29, 2012 at 09:50:06AM -0700, Kent Overstreet wrote: > The other thing we could do is make the rescuer thread per block device > (which arguably makes more sense than per bio_set anyways), then > associate bio_sets with specific block devices/rescuers. For dm, it's equivalent - already one bioset required per live device: dm_alloc_md_mempools(). Alasdair ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829165006.GB20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-08-29 16:57 ` [dm-devel] " Alasdair G Kergon @ 2012-08-29 17:07 ` Vivek Goyal [not found] ` <20120829170711.GC12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Vivek Goyal @ 2012-08-29 17:07 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Wed, Aug 29, 2012 at 09:50:06AM -0700, Kent Overstreet wrote: [..] > > The problem is that majority of device mapper code assumes that if we > > submit a bio, that bio will be finished in a finite time. The commit > > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22 broke this assumption. > > > > I suggest - instead of writing workarounds for this current->bio_list > > misbehavior, why not remove current->bio_list at all? We could revert > > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1, allocate a per-device workqueue, > > test stack usage in generic_make_request, and if it is too high (more than > > half of the stack used, or so), put the bio to the target device's > > blockqueue. > > > > That could be simpler than allocating per-bioset workqueue and it also > > solves more problems (possible deadlocks in dm). > > It certainly would be simpler, but honestly the potential for > performance regressions scares me (and bcache at least is used on fast > enough devices where it's going to matter). Also it's not so much the > performance overhead - we can just measure that - it's that if we're > just using the workqueue code the scheduler's getting involved and we > can't just measure what the effects of that are going to be in > production. Are workqueues not getting involved already in your solution of punting to rescuer thread. 
In the proposal above, workers get involved only if the stack is too deep, so for normal stack usage performance should not be impacted. Performance aside, punting submission to a per-device worker in case of deep stack usage sounds like a cleaner solution to me. Thanks Vivek ^ permalink raw reply [flat|nested] 75+ messages in thread
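The threshold this proposal needs is simple arithmetic: defer once less than some fraction (20% in Vivek's later proof-of-concept) of the stack remains. A userspace sketch of just that check; in the kernel the inputs would come from the current stack pointer and the task's stack base (THREAD_SIZE), and STACK_SIZE here is a made-up stand-in:

```c
#include <assert.h>

#define STACK_SIZE 8192u	/* stand-in for the kernel's THREAD_SIZE */

/* Submit inline while plenty of stack remains; punt the bio to the
 * per-device worker once less than 20% of the stack is left.
 * `used_bytes` models (stack base + THREAD_SIZE) - current sp. */
static int should_defer_bio(unsigned int used_bytes)
{
	unsigned int remaining = STACK_SIZE - used_bytes;

	return remaining < STACK_SIZE / 5;
}
```

So a thread that has burned 7000 of 8192 bytes would defer, while one at 2000 bytes submits inline.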
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829170711.GC12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2012-08-29 17:13 ` Kent Overstreet [not found] ` <20120829171345.GC20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 17:13 UTC (permalink / raw) To: Vivek Goyal Cc: Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Wed, Aug 29, 2012 at 01:07:11PM -0400, Vivek Goyal wrote: > On Wed, Aug 29, 2012 at 09:50:06AM -0700, Kent Overstreet wrote: > > [..] > > > The problem is that majority of device mapper code assumes that if we > > > submit a bio, that bio will be finished in a finite time. The commit > > > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22 broke this assumption. > > > > > > I suggest - instead of writing workarounds for this current->bio_list > > > misbehavior, why not remove current->bio_list at all? We could revert > > > d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1, allocate a per-device workqueue, > > > test stack usage in generic_make_request, and if it is too high (more than > > > half of the stack used, or so), put the bio to the target device's > > > blockqueue. > > > > > > That could be simpler than allocating per-bioset workqueue and it also > > > solves more problems (possible deadlocks in dm). > > > > It certainly would be simpler, but honestly the potential for > > performance regressions scares me (and bcache at least is used on fast > > enough devices where it's going to matter). Also it's not so much the > > performance overhead - we can just measure that - it's that if we're > > just using the workqueue code the scheduler's getting involved and we > > can't just measure what the effects of that are going to be in > > production. 
> > Are workqueues not getting involved already in your solution of punting > to rescuer thread. Only on allocation failure. > In the proposal above also, workers get involved > only if stack depth is too deep. So for normal stack usage performance > should not be impacted. > > Performance aside, punting submission to per device worker in case of deep > stack usage sounds cleaner solution to me. Agreed, but performance tends to matter in the real world. And either way the tricky bits are going to be confined to a few functions, so I don't think it matters that much. If someone wants to code up the workqueue version and test it, they're more than welcome... ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120829171345.GC20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-08-29 17:23 ` Alasdair G Kergon 2012-08-29 17:32 ` Kent Overstreet 2012-08-30 22:07 ` Vivek Goyal 1 sibling, 1 reply; 75+ messages in thread From: Alasdair G Kergon @ 2012-08-29 17:23 UTC (permalink / raw) To: Kent Overstreet Cc: Vivek Goyal, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, Mikulas Patocka, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, tj-DgEjT+Ai2ygdnm+yROfE0A On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote: > Only on allocation failure. Which as you already said assumes wrapping together the other mempools in the submission path first... Alasdair ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-29 17:23 ` [dm-devel] " Alasdair G Kergon @ 2012-08-29 17:32 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-29 17:32 UTC (permalink / raw) To: Vivek Goyal, Jens Axboe, dm-devel, linux-kernel, linux-bcache, Mikulas Patocka, bharrosh, tj On Wed, Aug 29, 2012 at 06:23:36PM +0100, Alasdair G Kergon wrote: > On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote: > > Only on allocation failure. > > Which as you already said assumes wrapping together the other mempools in the > submission path first... Yes? I'm not sure what you're getting at. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120829171345.GC20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2012-08-29 17:23   ` [dm-devel] " Alasdair G Kergon
@ 2012-08-30 22:07   ` Vivek Goyal
  [not found]     ` <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 75+ messages in thread
From: Vivek Goyal @ 2012-08-30 22:07 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:

[..]
> > Performance aside, punting submission to per device worker in case of deep
> > stack usage sounds cleaner solution to me.
>
> Agreed, but performance tends to matter in the real world. And either
> way the tricky bits are going to be confined to a few functions, so I
> don't think it matters that much.
>
> If someone wants to code up the workqueue version and test it, they're
> more than welcome...

Here is one quick and dirty proof of concept patch. It checks for stack
depth and if remaining space is less than 20% of stack size, then it
defers the bio submission to a per queue worker.

Thanks
Vivek

---
 block/blk-core.c          |  171 ++++++++++++++++++++++++++++++++++------------
 block/blk-sysfs.c         |    1
 include/linux/blk_types.h |    1
 include/linux/blkdev.h    |    8 ++
 4 files changed, 138 insertions(+), 43 deletions(-)

Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h	2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/include/linux/blkdev.h	2012-09-01 18:09:58.805577658 -0400
@@ -430,6 +430,14 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+
+	/*
+	 * Bio submission to queue can be deferred to a workqueue if stack
+	 * usage of submitter is high.
+	 */
+	struct bio_list		deferred_bios;
+	struct work_struct	deferred_bio_work;
+	struct workqueue_struct	*deferred_bio_workqueue;
 };

 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */

Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c	2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/block/blk-core.c	2012-09-02 00:34:55.204091269 -0400
@@ -211,6 +211,23 @@ static void blk_delay_work(struct work_s
 	spin_unlock_irq(q->queue_lock);
 }

+static void blk_deferred_bio_work(struct work_struct *work)
+{
+	struct request_queue *q;
+	struct bio *bio = NULL;
+
+	q = container_of(work, struct request_queue, deferred_bio_work);
+
+	do {
+		spin_lock_irq(q->queue_lock);
+		bio = bio_list_pop(&q->deferred_bios);
+		spin_unlock_irq(q->queue_lock);
+		if (!bio)
+			break;
+		generic_make_request(bio);
+	} while (1);
+}
+
 /**
  * blk_delay_queue - restart queueing after defined interval
  * @q:		The &struct request_queue in question
@@ -289,6 +306,7 @@ void blk_sync_queue(struct request_queue
 {
 	del_timer_sync(&q->timeout);
 	cancel_delayed_work_sync(&q->delay_work);
+	cancel_work_sync(&q->deferred_bio_work);
 }
 EXPORT_SYMBOL(blk_sync_queue);

@@ -351,6 +369,29 @@ void blk_put_queue(struct request_queue
 EXPORT_SYMBOL(blk_put_queue);

 /**
+ * blk_drain_deferred_bios - drain deferred bios
+ * @q: request_queue to drain deferred bios for
+ *
+ * Dispatch all currently deferred bios on @q through ->make_request_fn().
+ */
+static void blk_drain_deferred_bios(struct request_queue *q)
+{
+	struct bio_list bl;
+	struct bio *bio;
+	unsigned long flags;
+
+	bio_list_init(&bl);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	bio_list_merge(&bl, &q->deferred_bios);
+	bio_list_init(&q->deferred_bios);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	while ((bio = bio_list_pop(&bl)))
+		generic_make_request(bio);
+}
+
+/**
  * blk_drain_queue - drain requests from request_queue
  * @q: queue to drain
  * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
@@ -358,6 +399,10 @@ EXPORT_SYMBOL(blk_put_queue);
 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 * If not, only ELVPRIV requests are drained.  The caller is responsible
 * for ensuring that no new requests which need to be drained are queued.
+ *
+ * Note: It does not drain bios on q->deferred_bios list.
+ * Call blk_drain_deferred_bios() if need be.
+ *
 */
 void blk_drain_queue(struct request_queue *q, bool drain_all)
 {
@@ -505,6 +550,9 @@ void blk_cleanup_queue(struct request_qu
 	spin_unlock_irq(lock);
 	mutex_unlock(&q->sysfs_lock);

+	/* First drain all deferred bios. */
+	blk_drain_deferred_bios(q);
+
 	/* drain all requests queued before DEAD marking */
 	blk_drain_queue(q, true);
@@ -614,11 +662,19 @@ struct request_queue *blk_alloc_queue_no
 	q->bypass_depth = 1;
 	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

-	if (blkcg_init_queue(q))
+	bio_list_init(&q->deferred_bios);
+	INIT_WORK(&q->deferred_bio_work, blk_deferred_bio_work);
+	q->deferred_bio_workqueue = alloc_workqueue("kdeferbiod", WQ_MEM_RECLAIM, 0);
+	if (!q->deferred_bio_workqueue)
 		goto fail_id;

+	if (blkcg_init_queue(q))
+		goto fail_deferred_bio_wq;
+
 	return q;

+fail_deferred_bio_wq:
+	destroy_workqueue(q->deferred_bio_workqueue);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1635,8 +1691,10 @@ static inline int bio_check_eod(struct b
 	return 0;
 }

+
+
 static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+generic_make_request_checks_early(struct bio *bio)
 {
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
@@ -1715,9 +1773,6 @@ generic_make_request_checks(struct bio *
 	 */
 	create_io_context(GFP_ATOMIC, q->node);

-	if (blk_throtl_bio(q, bio))
-		return false;	/* throttled, will be resubmitted later */
-
 	trace_block_bio_queue(q, bio);

 	return true;
@@ -1726,6 +1781,56 @@ end_io:
 	return false;
 }

+static noinline_for_stack bool
+generic_make_request_checks_late(struct bio *bio)
+{
+	struct request_queue *q;
+
+	q = bdev_get_queue(bio->bi_bdev);
+
+	/*
+	 * Various block parts want %current->io_context and lazy ioc
+	 * allocation ends up trading a lot of pain for a small amount of
+	 * memory. Just allocate it upfront. This may fail and block
+	 * layer knows how to live with it.
+	 */
+	create_io_context(GFP_ATOMIC, q->node);
+
+	if (blk_throtl_bio(q, bio))
+		return false;	/* throttled, will be resubmitted later */
+
+	return true;
+}
+
+static void __generic_make_request(struct bio *bio)
+{
+	struct request_queue *q;
+
+	if (!generic_make_request_checks_late(bio))
+		return;
+	q = bdev_get_queue(bio->bi_bdev);
+	q->make_request_fn(q, bio);
+}
+
+static void generic_make_request_defer_bio(struct bio *bio)
+{
+	struct request_queue *q;
+	unsigned long flags;
+
+	q = bdev_get_queue(bio->bi_bdev);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (unlikely(blk_queue_dead(q))) {
+		spin_unlock_irqrestore(q->queue_lock, flags);
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+	set_bit(BIO_DEFERRED, &bio->bi_flags);
+	bio_list_add(&q->deferred_bios, bio);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	queue_work(q->deferred_bio_workqueue, &q->deferred_bio_work);
+}
+
 /**
  * generic_make_request - hand a buffer to its device driver for I/O
  * @bio:  The bio describing the location in memory and on the device.
@@ -1752,51 +1857,31 @@ end_io:
  */
 void generic_make_request(struct bio *bio)
 {
-	struct bio_list bio_list_on_stack;
+	unsigned long sp = 0;
+	unsigned int threshold = (THREAD_SIZE * 2)/10;

-	if (!generic_make_request_checks(bio))
-		return;
+	BUG_ON(bio->bi_next);

-	/*
-	 * We only want one ->make_request_fn to be active at a time, else
-	 * stack usage with stacked devices could be a problem.  So use
-	 * current->bio_list to keep a list of requests submited by a
-	 * make_request_fn function.  current->bio_list is also used as a
-	 * flag to say if generic_make_request is currently active in this
-	 * task or not.  If it is NULL, then no make_request is active.  If
-	 * it is non-NULL, then a make_request is active, and new requests
-	 * should be added at the tail
-	 */
-	if (current->bio_list) {
-		bio_list_add(current->bio_list, bio);
+	/* Submitting deferred bio from worker context. */
+	if (bio_flagged(bio, BIO_DEFERRED)) {
+		clear_bit(BIO_DEFERRED, &bio->bi_flags);
+		__generic_make_request(bio);
 		return;
 	}

-	/* following loop may be a bit non-obvious, and so deserves some
-	 * explanation.
-	 * Before entering the loop, bio->bi_next is NULL (as all callers
-	 * ensure that) so we have a list with a single bio.
-	 * We pretend that we have just taken it off a longer list, so
-	 * we assign bio_list to a pointer to the bio_list_on_stack,
-	 * thus initialising the bio_list of new bios to be
-	 * added.  ->make_request() may indeed add some more bios
-	 * through a recursive call to generic_make_request.  If it
-	 * did, we find a non-NULL value in bio_list and re-enter the loop
-	 * from the top.  In this case we really did just take the bio
-	 * of the top of the list (no pretending) and so remove it from
-	 * bio_list, and call into ->make_request() again.
-	 */
-	BUG_ON(bio->bi_next);
-	bio_list_init(&bio_list_on_stack);
-	current->bio_list = &bio_list_on_stack;
-	do {
-		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	if (!generic_make_request_checks_early(bio))
+		return;

-		q->make_request_fn(q, bio);
+	/*
+	 * FIXME. Provide an arch dependent function to return left stack
+	 * space for current task. This is a hack for x86_64.
+	 */
+	asm volatile("movq %%rsp,%0" : "=m"(sp));

-		bio = bio_list_pop(current->bio_list);
-	} while (bio);
-	current->bio_list = NULL; /* deactivate */
+	if ((sp - (unsigned long)end_of_stack(current)) < threshold)
+		generic_make_request_defer_bio(bio);
+	else
+		__generic_make_request(bio);
 }
 EXPORT_SYMBOL(generic_make_request);

Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c	2012-09-01 17:44:51.686485550 -0400
+++ linux-2.6/block/blk-sysfs.c	2012-09-01 18:09:58.808577661 -0400
@@ -505,6 +505,7 @@ static void blk_release_queue(struct kob
 	ida_simple_remove(&blk_queue_ida, q->id);
 	kmem_cache_free(blk_requestq_cachep, q);
+	destroy_workqueue(q->deferred_bio_workqueue);
 }

 static const struct sysfs_ops queue_sysfs_ops = {

Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h	2012-09-02 00:34:17.607086696 -0400
+++ linux-2.6/include/linux/blk_types.h	2012-09-02 00:34:21.997087104 -0400
@@ -105,6 +105,7 @@ struct bio {
 #define BIO_FS_INTEGRITY 9	/* fs owns integrity data, not block layer */
 #define BIO_QUIET	10	/* Make BIO Quiet */
 #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_DEFERRED	12	/* Bio was deferred for submission by worker */

 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))

 /*

^ permalink raw reply	[flat|nested] 75+ messages in thread
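Both blk_deferred_bio_work() and blk_drain_deferred_bios() in the patch above lean on struct bio_list being a simple FIFO: bio_list_add() appends at the tail and bio_list_pop() takes from the head, so deferred bios are resubmitted in the order they arrived. A minimal userspace model of that list (the real helpers live in the kernel's bio headers; struct bio is reduced here to an id plus its bi_next link, purely for illustration):

```c
#include <assert.h>
#include <stddef.h>

/* Toy model of the kernel's struct bio_list, as used for
 * q->deferred_bios above: a singly linked FIFO with head and tail. */
struct bio { int id; struct bio *bi_next; };
struct bio_list { struct bio *head, *tail; };

static void bio_list_init(struct bio_list *bl)
{
	bl->head = bl->tail = NULL;
}

/* append at the tail -- this is why deferred bios keep their order */
static void bio_list_add(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = NULL;
	if (bl->tail)
		bl->tail->bi_next = bio;
	else
		bl->head = bio;
	bl->tail = bio;
}

/* remove from the head, as blk_deferred_bio_work() does one at a time */
static struct bio *bio_list_pop(struct bio_list *bl)
{
	struct bio *bio = bl->head;

	if (bio) {
		bl->head = bio->bi_next;
		if (!bl->head)
			bl->tail = NULL;
		bio->bi_next = NULL;
	}
	return bio;
}

/* defer three bios, then "run the worker": they pop in FIFO order */
static int run_demo(void)
{
	static struct bio a = { 1 }, b = { 2 }, c = { 3 };
	struct bio_list bl;
	struct bio *bio;
	int order = 0;

	bio_list_init(&bl);
	bio_list_add(&bl, &a);
	bio_list_add(&bl, &b);
	bio_list_add(&bl, &c);

	while ((bio = bio_list_pop(&bl)))
		order = order * 10 + bio->id;
	return order;	/* 123: submission order preserved */
}
```

The same head/tail pair is also what makes blk_drain_deferred_bios() cheap: bio_list_merge() splices one list onto another by pointer fixup, without walking the bios.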
[parent not found: <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2012-08-31  1:43   ` Kent Overstreet
  2012-08-31  1:55     ` Kent Overstreet
    ` (2 more replies)
  2012-09-01  2:13   ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Tejun Heo
  2012-09-03  0:49   ` Dave Chinner
  2 siblings, 3 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-08-31 1:43 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
>
> [..]
> > > Performance aside, punting submission to per device worker in case of deep
> > > stack usage sounds cleaner solution to me.
> >
> > Agreed, but performance tends to matter in the real world. And either
> > way the tricky bits are going to be confined to a few functions, so I
> > don't think it matters that much.
> >
> > If someone wants to code up the workqueue version and test it, they're
> > more than welcome...
>
> Here is one quick and dirty proof of concept patch. It checks for stack
> depth and if remaining space is less than 20% of stack size, then it
> defers the bio submission to a per queue worker.

I can't think of any correctness issues. I see some stuff that could be
simplified (blk_drain_deferred_bios() is redundant, just make it a
wrapper around blk_deferred_bio_work()).

Still skeptical about the performance impact, though - frankly, on some
of the hardware I've been running bcache on this would be a visible
performance regression - probably double digit percentages but I'd have
to benchmark it. That kind of hardware/usage is not normal today,
but I've put a lot of work into performance and I don't want to make
things worse without good reason.

Have you tested/benchmarked it?

There's scheduling behaviour, too. We really want the workqueue thread's
cpu time to be charged to the process that submitted the bio. (We could
use a mechanism like that in other places, too... not like this is a new
issue).

This is going to be a real issue for users that need strong isolation -
for any driver that uses non-negligible cpu (e.g. dm-crypt), we're
breaking that (not that it wasn't broken already, but this makes it
worse).

I could be convinced, but right now I prefer my solution.

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-08-31 1:43 ` Kent Overstreet @ 2012-08-31 1:55 ` Kent Overstreet 2012-08-31 15:01 ` Vivek Goyal [not found] ` <20120831014359.GB15218-jC9Py7bek1znysI04z7BkA@public.gmane.org> 2 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-08-31 1:55 UTC (permalink / raw) To: Vivek Goyal Cc: Jens Axboe, dm-devel, linux-kernel, linux-bcache, Mikulas Patocka, bharrosh, tj On Thu, Aug 30, 2012 at 6:43 PM, Kent Overstreet <koverstreet@google.com> wrote: > On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote: >> On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote: >> >> [..] >> > > Performance aside, punting submission to per device worker in case of deep >> > > stack usage sounds cleaner solution to me. >> > >> > Agreed, but performance tends to matter in the real world. And either >> > way the tricky bits are going to be confined to a few functions, so I >> > don't think it matters that much. >> > >> > If someone wants to code up the workqueue version and test it, they're >> > more than welcome... >> >> Here is one quick and dirty proof of concept patch. It checks for stack >> depth and if remaining space is less than 20% of stack size, then it >> defers the bio submission to per queue worker. > > I can't think of any correctness issues. I see some stuff that could be > simplified (blk_drain_deferred_bios() is redundant, just make it a > wrapper around blk_deffered_bio_work()). > > Still skeptical about the performance impact, though - frankly, on some > of the hardware I've been running bcache on this would be a visible > performance regression - probably double digit percentages but I'd have > to benchmark it. That kind of of hardware/usage is not normal today, > but I've put a lot of work into performance and I don't want to make > things worse without good reason. 
Here's another crazy idea - we don't really need another thread, just more stack space. We could check if we're running out of stack space, then if we are just allocate another two pages and memcpy the struct thread_info over. I think the main obstacle is that we'd need some per arch code for mucking with the stack pointer. And it'd break backtraces, but that's fixable. ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  2012-08-31  1:43 ` Kent Overstreet
  2012-08-31  1:55   ` Kent Overstreet
@ 2012-08-31 15:01   ` Vivek Goyal
  [not found]     ` <20120831150159.GB13483-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  [not found]   ` <20120831014359.GB15218-jC9Py7bek1znysI04z7BkA@public.gmane.org>
  2 siblings, 1 reply; 75+ messages in thread
From: Vivek Goyal @ 2012-08-31 15:01 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Mikulas Patocka, linux-bcache, linux-kernel, dm-devel, tj,
	bharrosh, Jens Axboe

On Thu, Aug 30, 2012 at 06:43:59PM -0700, Kent Overstreet wrote:
> On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
> >
> > [..]
> > > > Performance aside, punting submission to per device worker in case of deep
> > > > stack usage sounds cleaner solution to me.
> > >
> > > Agreed, but performance tends to matter in the real world. And either
> > > way the tricky bits are going to be confined to a few functions, so I
> > > don't think it matters that much.
> > >
> > > If someone wants to code up the workqueue version and test it, they're
> > > more than welcome...
> >
> > Here is one quick and dirty proof of concept patch. It checks for stack
> > depth and if remaining space is less than 20% of stack size, then it
> > defers the bio submission to a per queue worker.
>
> I can't think of any correctness issues. I see some stuff that could be
> simplified (blk_drain_deferred_bios() is redundant, just make it a
> wrapper around blk_deferred_bio_work()).
>
> Still skeptical about the performance impact, though - frankly, on some
> of the hardware I've been running bcache on this would be a visible
> performance regression - probably double digit percentages but I'd have
> to benchmark it. That kind of hardware/usage is not normal today,
> but I've put a lot of work into performance and I don't want to make
> things worse without good reason.

Would you like to give this patch a quick try and see with bcache on your
hardware how much performance impact you see.

Given the fact that submission through the worker happens only when
stack usage is high, that should reduce the impact of the patch and
common use cases should remain unaffected.

> Have you tested/benchmarked it?

No, I have not. I will run some simple workloads on SSD.

> There's scheduling behaviour, too. We really want the workqueue thread's
> cpu time to be charged to the process that submitted the bio. (We could
> use a mechanism like that in other places, too... not like this is a new
> issue).
>
> This is going to be a real issue for users that need strong isolation -
> for any driver that uses non-negligible cpu (e.g. dm-crypt), we're
> breaking that (not that it wasn't broken already, but this makes it
> worse).

There are so many places in the kernel where worker threads do work on
behalf of other processes. I think this is really a minor concern and I
would not be too worried about it.

What is really concerning, though, is the greater stack usage due to the
recursive nature of make_request() and the performance impact of deferral
to a worker thread.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 75+ messages in thread
[parent not found: <20120831150159.GB13483-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120831150159.GB13483-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2012-09-03  1:26   ` Kent Overstreet
  0 siblings, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-09-03 1:26 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Fri, Aug 31, 2012 at 11:01:59AM -0400, Vivek Goyal wrote:
> On Thu, Aug 30, 2012 at 06:43:59PM -0700, Kent Overstreet wrote:
> > On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > > On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
> > >
> > > [..]
> > > > > Performance aside, punting submission to per device worker in case of deep
> > > > > stack usage sounds cleaner solution to me.
> > > >
> > > > Agreed, but performance tends to matter in the real world. And either
> > > > way the tricky bits are going to be confined to a few functions, so I
> > > > don't think it matters that much.
> > > >
> > > > If someone wants to code up the workqueue version and test it, they're
> > > > more than welcome...
> > >
> > > Here is one quick and dirty proof of concept patch. It checks for stack
> > > depth and if remaining space is less than 20% of stack size, then it
> > > defers the bio submission to a per queue worker.
> >
> > I can't think of any correctness issues. I see some stuff that could be
> > simplified (blk_drain_deferred_bios() is redundant, just make it a
> > wrapper around blk_deferred_bio_work()).
> >
> > Still skeptical about the performance impact, though - frankly, on some
> > of the hardware I've been running bcache on this would be a visible
> > performance regression - probably double digit percentages but I'd have
> > to benchmark it. That kind of hardware/usage is not normal today,
> > but I've put a lot of work into performance and I don't want to make
> > things worse without good reason.
>
> Would you like to give this patch a quick try and see with bcache on your
> hardware how much performance impact you see.

If I can get a test system set up with a modern kernel that I can
publish numbers on, I will. Will take a bit though.

> Given the fact that submission through the worker happens only when
> stack usage is high, that should reduce the impact of the patch and
> common use cases should remain unaffected.

Except depending on how users have their systems configured, it'll
either never happen or it'll happen for most every bio. That makes the
performance overhead unpredictable, too.

> > Have you tested/benchmarked it?
>
> No, I have not. I will run some simple workloads on SSD.

Normal SATA SSDs are not going to show the overhead - AHCI is a pig
and it'll be lost in the noise.

> There are so many places in the kernel where worker threads do work on
> behalf of other processes. I think this is really a minor concern and I
> would not be too worried about it.

Yeah, but this is somewhat unprecedented in the amount of cpu time
you're potentially moving to worker threads. It is a concern.

> What is really concerning, though, is the greater stack usage due to the
> recursive nature of make_request() and the performance impact of deferral
> to a worker thread.

Your patch shouldn't increase stack usage (at least if your threshold
is safe - it's too high as is).

^ permalink raw reply	[flat|nested] 75+ messages in thread
[parent not found: <20120831014359.GB15218-jC9Py7bek1znysI04z7BkA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120831014359.GB15218-jC9Py7bek1znysI04z7BkA@public.gmane.org>
@ 2012-09-03 20:41   ` Mikulas Patocka
  [not found]     ` <Pine.LNX.4.64.1209031638110.15620-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
  0 siblings, 1 reply; 75+ messages in thread
From: Mikulas Patocka @ 2012-09-03 20:41 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Thu, 30 Aug 2012, Kent Overstreet wrote:
> On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
> >
> > [..]
> > > > Performance aside, punting submission to per device worker in case of deep
> > > > stack usage sounds cleaner solution to me.
> > >
> > > Agreed, but performance tends to matter in the real world. And either
> > > way the tricky bits are going to be confined to a few functions, so I
> > > don't think it matters that much.
> > >
> > > If someone wants to code up the workqueue version and test it, they're
> > > more than welcome...
> >
> > Here is one quick and dirty proof of concept patch. It checks for stack
> > depth and if remaining space is less than 20% of stack size, then it
> > defers the bio submission to a per queue worker.
>
> I can't think of any correctness issues. I see some stuff that could be
> simplified (blk_drain_deferred_bios() is redundant, just make it a
> wrapper around blk_deferred_bio_work()).
>
> Still skeptical about the performance impact, though - frankly, on some
> of the hardware I've been running bcache on this would be a visible
> performance regression - probably double digit percentages but I'd have
> to benchmark it. That kind of hardware/usage is not normal today,
> but I've put a lot of work into performance and I don't want to make
> things worse without good reason.
>
> Have you tested/benchmarked it?
>
> There's scheduling behaviour, too. We really want the workqueue thread's
> cpu time to be charged to the process that submitted the bio. (We could
> use a mechanism like that in other places, too... not like this is a new
> issue).
>
> This is going to be a real issue for users that need strong isolation -
> for any driver that uses non-negligible cpu (e.g. dm-crypt), we're
> breaking that (not that it wasn't broken already, but this makes it
> worse).

... or another possibility - start a timer when something is put to
current->bio_list and use that timer to pop entries off current->bio_list
and submit them to a workqueue. The timer can be cpu-local so only
interrupt masking is required to synchronize against the timer.

This would normally run just like the current kernel and in case of
deadlock, the timer would kick in and resolve the deadlock.

> I could be convinced, but right now I prefer my solution.

It fixes the bio allocation problem, but not other similar mempool
problems in dm and md.

Mikulas

^ permalink raw reply	[flat|nested] 75+ messages in thread
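Mikulas's timer scheme can be modeled very roughly in userspace - all names below are invented, the "timer" is just a function called by hand, and queue_work()/bio completion are reduced to counters. The only point is the control flow: the fast path drains current->bio_list inline exactly as today, and the timer steals the list only when its owner has stalled:

```c
#include <assert.h>

static int pending[16], npending;	/* stands in for current->bio_list */
static int punted[16], npunted;		/* bios moved to a workqueue */
static int completed;

static void bio_list_add(int bio)
{
	pending[npending++] = bio;
}

/* fast path: the owning task drains its own list before returning */
static void drain_inline(void)
{
	while (npending)
		completed++, npending--;
}

/* slow path: the timer fires because the list sat non-empty too long
 * (the owner is blocked, e.g. waiting on an exhausted mempool) */
static void timer_fired(void)
{
	while (npending)
		punted[npunted++] = pending[--npending];
}

static void worker(void)
{
	while (npunted)
		completed++, npunted--;
}

static int run_demo(void)
{
	bio_list_add(1);
	bio_list_add(2);
	drain_inline();		/* normal case: timer never gets involved */

	bio_list_add(3);	/* owner now blocks with the list non-empty... */
	timer_fired();		/* ...so the timer breaks the stall */
	worker();
	return completed;	/* all three bios completed: 3 */
}
```

What the model cannot show is exactly what Kent and Tejun argue about next: whether a latency-triggered rescue path of this kind is easier or harder to reason about than one conditioned on allocation failure.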
[parent not found: <Pine.LNX.4.64.1209031638110.15620-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <Pine.LNX.4.64.1209031638110.15620-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> @ 2012-09-04 3:41 ` Kent Overstreet [not found] ` <20120904034100.GA21602-jC9Py7bek1znysI04z7BkA@public.gmane.org> 0 siblings, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-09-04 3:41 UTC (permalink / raw) To: Mikulas Patocka Cc: Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > ... or another possibility - start a timer when something is put to > current->bio_list and use that timer to pop entries off current->bio_list > and submit them to a workqueue. The timer can be cpu-local so only > interrupt masking is required to synchronize against the timer. > > This would normally run just like the current kernel and in case of > deadlock, the timer would kick in and resolve the deadlock. Ugh. That's a _terrible_ idea. Remember the old plugging code? You ever have to debug performance issues caused by it? > > > I could be convinced, but right now I prefer my solution. > > It fixes bio allocation problem, but not other similar mempool problems in > dm and md. I looked a bit more, and actually I think the rest of the problem is pretty limited in scope - most of those mempool allocations are per request, not per split. I'm willing to put some time into converting dm/md over to bioset's front_pad. I'm having to learn the code for the immutable biovec work, anyways. ^ permalink raw reply [flat|nested] 75+ messages in thread
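The front_pad conversion Kent refers to (patch 2/9 of this series does it for dm_rq_clone_bio_info) hangs driver-private state off the same allocation as the bio itself: the bioset reserves front_pad bytes in front of each struct bio, and the driver recovers its state by pointer arithmetic, so no separate per-bio mempool is needed. A schematic userspace model - malloc stands in for the bioset, and all the names here are invented:

```c
#include <stdlib.h>

/* stand-ins for the kernel structures */
struct bio { int bi_sector; };
struct per_bio_info { struct bio *orig; };	/* driver-private state */

#define FRONT_PAD sizeof(struct per_bio_info)

/* one allocation carries the pad and the bio; the bio pointer is
 * returned, analogous to allocating from a bioset with front_pad set */
static struct bio *bio_alloc_padded(void)
{
	char *p = malloc(FRONT_PAD + sizeof(struct bio));

	return p ? (struct bio *)(p + FRONT_PAD) : NULL;
}

/* the driver walks backwards from the bio to find its state */
static struct per_bio_info *to_info(struct bio *bio)
{
	return (struct per_bio_info *)((char *)bio - FRONT_PAD);
}

static void bio_free_padded(struct bio *bio)
{
	free((char *)bio - FRONT_PAD);
}

static int run_demo(void)
{
	struct bio *bio = bio_alloc_padded();
	int ok;

	if (!bio)
		return 0;
	bio->bi_sector = 42;
	to_info(bio)->orig = bio;	/* stash per-bio state in the pad */

	ok = to_info(bio)->orig == bio && bio->bi_sector == 42;
	bio_free_padded(bio);
	return ok;
}
```

Because the pad travels with the bio allocation, the bio allocator's rescuer then covers the driver state too - which is why converting dm/md to front_pad shrinks the scope of the remaining mempool deadlock problem Mikulas raised.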
[parent not found: <20120904034100.GA21602-jC9Py7bek1znysI04z7BkA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120904034100.GA21602-jC9Py7bek1znysI04z7BkA@public.gmane.org> @ 2012-09-04 18:55 ` Tejun Heo 2012-09-04 19:01 ` Tejun Heo 2012-09-04 19:42 ` Kent Overstreet 2012-09-04 19:26 ` Mikulas Patocka 1 sibling, 2 replies; 75+ messages in thread From: Tejun Heo @ 2012-09-04 18:55 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, Mikulas, Kent. On Mon, Sep 03, 2012 at 08:41:00PM -0700, Kent Overstreet wrote: > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > > ... or another possibility - start a timer when something is put to > > current->bio_list and use that timer to pop entries off current->bio_list > > and submit them to a workqueue. The timer can be cpu-local so only > > interrupt masking is required to synchronize against the timer. > > > > This would normally run just like the current kernel and in case of > > deadlock, the timer would kick in and resolve the deadlock. > > Ugh. That's a _terrible_ idea. That's exactly how workqueue rescuers work - rescuers kick in if new worker creation doesn't succeed in given amount of time. The suggested mechanism already makes use of workqueue, so it's already doing it. If you can think of a better way to detect the generic stall condition, please be my guest. > Remember the old plugging code? You ever have to debug performance > issues caused by it? That is not equivalent. Plugging was kicking in all the time and it wasn't entirely well-defined what the plugging / unplugging conditions were. This type of rescuing for forward-progress guarantee only kicks in under severe memory pressure and people expect finite latency and throughput hits under such conditions. 
The usual bio / request / scsi_cmd allocations could be failing under
these circumstances and things could be progressing only thanks to the
finite preallocated pools. I don't think involving a rescue timer would
be noticeably detrimental.

Actually, if the timer approach can reduce the frequency of rescuer
involvement, I think it could actually be better.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-09-04 18:55 ` Tejun Heo @ 2012-09-04 19:01 ` Tejun Heo [not found] ` <20120904190119.GD3638-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> 2012-09-04 19:42 ` Kent Overstreet 1 sibling, 1 reply; 75+ messages in thread From: Tejun Heo @ 2012-09-04 19:01 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, Vivek Goyal, linux-bcache, linux-kernel, dm-devel, bharrosh, Jens Axboe On Tue, Sep 04, 2012 at 11:55:40AM -0700, Tejun Heo wrote: > Actually, if the timer approach can reduce the frequency of rescuer > involvement, I think it could actually be better. Ooh, it wouldn't. It's kicking in only after alloc failure. I don't know. I think conditioning it on alloc failure is cleaner and converting all per-bio allocations to front-pad makes sense. Using a timer wouldn't make the mechanism any simpler, right? Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
[parent not found: <20120904190119.GD3638-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120904190119.GD3638-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org> @ 2012-09-04 19:43 ` Kent Overstreet 0 siblings, 0 replies; 75+ messages in thread From: Kent Overstreet @ 2012-09-04 19:43 UTC (permalink / raw) To: Tejun Heo Cc: Mikulas Patocka, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Sep 04, 2012 at 12:01:19PM -0700, Tejun Heo wrote: > On Tue, Sep 04, 2012 at 11:55:40AM -0700, Tejun Heo wrote: > > Actually, if the timer approach can reduce the frequency of rescuer > > involvement, I think it could actually be better. > > Ooh, it wouldn't. It's kicking in only after alloc failure. I don't > know. I think conditioning it on alloc failure is cleaner and > converting all per-bio allocations to front-pad makes sense. Using a > timer wouldn't make the mechanism any simpler, right? Exactly ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers 2012-09-04 18:55 ` Tejun Heo 2012-09-04 19:01 ` Tejun Heo @ 2012-09-04 19:42 ` Kent Overstreet [not found] ` <20120904194237.GC25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-09-04 19:42 UTC (permalink / raw) To: Tejun Heo Cc: Mikulas Patocka, Vivek Goyal, linux-bcache, linux-kernel, dm-devel, bharrosh, Jens Axboe On Tue, Sep 04, 2012 at 11:55:40AM -0700, Tejun Heo wrote: > Hello, Mikulas, Kent. > > On Mon, Sep 03, 2012 at 08:41:00PM -0700, Kent Overstreet wrote: > > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > > > ... or another possibility - start a timer when something is put to > > > current->bio_list and use that timer to pop entries off current->bio_list > > > and submit them to a workqueue. The timer can be cpu-local so only > > > interrupt masking is required to synchronize against the timer. > > > > > > This would normally run just like the current kernel and in case of > > > deadlock, the timer would kick in and resolve the deadlock. > > > > Ugh. That's a _terrible_ idea. > > That's exactly how workqueue rescuers work - rescuers kick in if new > worker creation doesn't succeed in given amount of time. The > suggested mechanism already makes use of workqueue, so it's already > doing it. If you can think of a better way to detect the generic > stall condition, please be my guest. > > > Remember the old plugging code? You ever have to debug performance > > issues caused by it? > > That is not equivalent. Plugging was kicking in all the time and it > wasn't entirely well-defined what the plugging / unplugging conditions > were. This type of rescuing for forward-progress guarantee only kicks > in under severe memory pressure and people expect finite latency and > throughput hits under such conditions. 
The usual bio / request / > scsi_cmd allocations could be failing under these circumstances and > things could be progressing only thanks to the finite preallocated > pools. I don't think involving a rescue timer would be noticeably > detrimental. > > Actually, if the timer approach can reduce the frequency of rescuer > involvement, I think it could actually be better. Ok, that was an overly harsh, emotional response. But I still hate the idea. You want to point me at the relevant workqueue code? I'd really like to see what you did there; it's entirely possible you're aware of some issue I'm not, but if not I'd like to take a stab at it. ^ permalink raw reply [flat|nested] 75+ messages in thread
[parent not found: <20120904194237.GC25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120904194237.GC25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-09-04 21:03 ` Tejun Heo 0 siblings, 0 replies; 75+ messages in thread From: Tejun Heo @ 2012-09-04 21:03 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, On Tue, Sep 04, 2012 at 12:42:37PM -0700, Kent Overstreet wrote: > You want to point me at the relevant workqueue code? I'd really like to > see what you did there, it's entirely possible you're aware of some > issue I'm not but if not I'd like to take a stab at it. I was mistaken. The issue was that adding @gfp_flags to kthread_create() wasn't trivial involving updates to arch callbacks, so the timer was added to side-step the issue. So, yeah, if it can be made to work w/o timer, I think that would be better. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <20120904034100.GA21602-jC9Py7bek1znysI04z7BkA@public.gmane.org> 2012-09-04 18:55 ` Tejun Heo @ 2012-09-04 19:26 ` Mikulas Patocka [not found] ` <Pine.LNX.4.64.1209041509160.22689-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Mikulas Patocka @ 2012-09-04 19:26 UTC (permalink / raw) To: Kent Overstreet Cc: Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Mon, 3 Sep 2012, Kent Overstreet wrote: > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > > ... or another possibility - start a timer when something is put to > > current->bio_list and use that timer to pop entries off current->bio_list > > and submit them to a workqueue. The timer can be cpu-local so only > > interrupt masking is required to synchronize against the timer. > > > > This would normally run just like the current kernel and in case of > > deadlock, the timer would kick in and resolve the deadlock. > > Ugh. That's a _terrible_ idea. > > Remember the old plugging code? You ever have to debug performance > issues caused by it? Yes, I do remember it (and I fixed one bug that resulted in a missed unplug and degraded performance). But currently, deadlocks due to exhausted mempools and bios being stalled in current->bio_list don't happen (or happen so rarely that they aren't reported). If we add a timer, it will turn a deadlock into an i/o delay, but it can't make things any worse. BTW. can these new-style timerless plugs introduce deadlocks too? What happens when some bios are indefinitely delayed because their requests are held in a plug and a mempool runs out? > > > I could be convinced, but right now I prefer my solution. 
> > > > It fixes bio allocation problem, but not other similar mempool problems in > > dm and md. > > I looked a bit more, and actually I think the rest of the problem is > pretty limited in scope - most of those mempool allocations are per > request, not per split. > > I'm willing to put some time into converting dm/md over to bioset's > front_pad. I'm having to learn the code for the immutable biovec work, > anyways. Currently, dm targets allocate request-specific data from target-specific mempool. mempools are in dm-crypt, dm-delay, dm-mirror, dm-snapshot, dm-thin, dm-verity. You can change it to allocate request-specific data with the bio. Mikulas ^ permalink raw reply [flat|nested] 75+ messages in thread
[parent not found: <Pine.LNX.4.64.1209041509160.22689-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers [not found] ` <Pine.LNX.4.64.1209041509160.22689-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> @ 2012-09-04 19:39 ` Vivek Goyal 2012-09-04 19:51 ` [PATCH] dm: Use bioset's front_pad for dm_target_io Kent Overstreet 1 sibling, 0 replies; 75+ messages in thread From: Vivek Goyal @ 2012-09-04 19:39 UTC (permalink / raw) To: Mikulas Patocka Cc: Kent Overstreet, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Sep 04, 2012 at 03:26:19PM -0400, Mikulas Patocka wrote: [..] > BTW. can these new-style timerless plugs introduce deadlocks too? What > happens when some bios are indefinitely delayed because their requests are > held in a plug and a mempool runs out? I think they will not deadlock because these on-stack bios/requests are flushed/dispatched when the process schedules out. So if a submitter blocks on a mempool, it will be scheduled out and the requests on the plug will be dispatched. Thanks Vivek ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH] dm: Use bioset's front_pad for dm_target_io [not found] ` <Pine.LNX.4.64.1209041509160.22689-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> 2012-09-04 19:39 ` Vivek Goyal @ 2012-09-04 19:51 ` Kent Overstreet [not found] ` <20120904195156.GE25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Kent Overstreet @ 2012-09-04 19:51 UTC (permalink / raw) To: Mikulas Patocka Cc: Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, Sep 04, 2012 at 03:26:19PM -0400, Mikulas Patocka wrote: > > > On Mon, 3 Sep 2012, Kent Overstreet wrote: > > > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > > > ... or another possibility - start a timer when something is put to > > > current->bio_list and use that timer to pop entries off current->bio_list > > > and submit them to a workqueue. The timer can be cpu-local so only > > > interrupt masking is required to synchronize against the timer. > > > > > > This would normally run just like the current kernel and in case of > > > deadlock, the timer would kick in and resolve the deadlock. > > > > Ugh. That's a _terrible_ idea. > > > > Remember the old plugging code? You ever have to debug performance > > issues caused by it? > > Yes, I do remember it (and I fixed one bug that resulted in missed unplug > and degraded performance). > > But currently, deadlocks due to exhausted mempools and bios being stalled > in current->bio_list don't happen (or do happen below so rarely that they > aren't reported). > > If we add a timer, it will turn a deadlock into an i/o delay, but it can't > make things any worse. This is all true. I'm not arguing your solution wouldn't _work_... I'd try and give some real reasoning for my objections but it'll take me awhile to figure out how to coherently explain it and I'm very sleep deprived. 
> Currently, dm targets allocate request-specific data from target-specific > mempool. mempools are in dm-crypt, dm-delay, dm-mirror, dm-snapshot, > dm-thin, dm-verity. You can change it to allocate request-specific data > with the bio. I wrote a patch for dm_target_io last night. I think I know an easy way to go about converting the rest but it'll probably have to wait until I'm further along with my immutable bvec stuff. Completely untested patch below: commit 8754349145edfc791450d3ad54c19f0f3715c86c Author: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> Date: Tue Sep 4 06:17:56 2012 -0700 dm: Use bioset's front_pad for dm_target_io diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2eb730..3cf39b0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,6 +71,7 @@ struct dm_target_io { struct dm_io *io; struct dm_target *ti; union map_info info; + struct bio clone; }; /* @@ -174,7 +175,7 @@ struct mapped_device { * io objects are allocated from here. */ mempool_t *io_pool; - mempool_t *tio_pool; + mempool_t *rq_tio_pool; struct bio_set *bs; @@ -214,15 +215,8 @@ struct dm_md_mempools { #define MIN_IOS 256 static struct kmem_cache *_io_cache; -static struct kmem_cache *_tio_cache; static struct kmem_cache *_rq_tio_cache; -/* - * Unused now, and needs to be deleted. 
But since io_pool is overloaded and it's - * still used for _io_cache, I'm leaving this for a later cleanup - */ -static struct kmem_cache *_rq_bio_info_cache; - static int __init local_init(void) { int r = -ENOMEM; @@ -232,22 +226,13 @@ static int __init local_init(void) if (!_io_cache) return r; - /* allocate a slab for the target ios */ - _tio_cache = KMEM_CACHE(dm_target_io, 0); - if (!_tio_cache) - goto out_free_io_cache; - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); if (!_rq_tio_cache) - goto out_free_tio_cache; - - _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); - if (!_rq_bio_info_cache) - goto out_free_rq_tio_cache; + goto out_free_io_cache; r = dm_uevent_init(); if (r) - goto out_free_rq_bio_info_cache; + goto out_free_rq_tio_cache; _major = major; r = register_blkdev(_major, _name); @@ -261,12 +246,8 @@ static int __init local_init(void) out_uevent_exit: dm_uevent_exit(); -out_free_rq_bio_info_cache: - kmem_cache_destroy(_rq_bio_info_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); -out_free_tio_cache: - kmem_cache_destroy(_tio_cache); out_free_io_cache: kmem_cache_destroy(_io_cache); @@ -275,9 +256,7 @@ out_free_io_cache: static void local_exit(void) { - kmem_cache_destroy(_rq_bio_info_cache); kmem_cache_destroy(_rq_tio_cache); - kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -461,20 +440,15 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) -{ - mempool_free(tio, md->tio_pool); -} - static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, gfp_t gfp_mask) { - return mempool_alloc(md->tio_pool, gfp_mask); + return mempool_alloc(md->rq_tio_pool, gfp_mask); } static void free_rq_tio(struct dm_rq_target_io *tio) { - mempool_free(tio, tio->md->tio_pool); + mempool_free(tio, tio->md->rq_tio_pool); } static int 
md_in_flight(struct mapped_device *md) @@ -658,7 +632,6 @@ static void clone_endio(struct bio *bio, int error) int r = 0; struct dm_target_io *tio = bio->bi_private; struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; if (!bio_flagged(bio, BIO_UPTODATE) && !error) @@ -681,7 +654,6 @@ static void clone_endio(struct bio *bio, int error) } } - free_tio(md, tio); bio_put(bio); dec_pending(io, error); } @@ -998,13 +970,16 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static void __map_bio(struct dm_target *ti, struct bio *clone, - struct dm_target_io *tio) +static void __map_bio(struct dm_io *io, struct dm_target *ti, struct bio *clone) { + struct dm_target_io *tio = container_of(clone, struct dm_target_io, clone); int r; sector_t sector; struct mapped_device *md; + tio->io = io; + tio->ti = ti; + clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -1028,7 +1003,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, md = tio->io->md; dec_pending(tio->io, r); bio_put(clone); - free_tio(md, tio); } else if (r) { DMWARN("unimplemented target map return value: %d", r); BUG(); @@ -1104,26 +1078,18 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, return clone; } -static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti) +static void init_tio(struct bio *bio) { - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); - - tio->io = ci->io; - tio->ti = ti; + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); memset(&tio->info, 0, sizeof(tio->info)); - - return tio; } static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, unsigned request_nr, sector_t len) { - struct dm_target_io *tio = alloc_tio(ci, ti); + struct dm_target_io *tio; struct bio *clone; - tio->info.target_request_nr = request_nr; - /* * Discard requests require 
the bio's inline iovecs be initialized. * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush @@ -1136,7 +1102,10 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, clone->bi_size = to_bytes(len); } - __map_bio(ti, clone, tio); + tio = container_of(clone, struct dm_target_io, clone); + tio->info.target_request_nr = request_nr; + + __map_bio(ci->io, ti, clone); } static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, @@ -1166,13 +1135,13 @@ static int __clone_and_map_empty_flush(struct clone_info *ci) static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) { struct bio *clone, *bio = ci->bio; - struct dm_target_io *tio; - tio = alloc_tio(ci, ti); clone = clone_bio(bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, ci->sector_count, ci->md->bs); - __map_bio(ti, clone, tio); + + init_tio(clone); + __map_bio(ci->io, ti, clone); ci->sector_count = 0; } @@ -1213,7 +1182,6 @@ static int __clone_and_map(struct clone_info *ci) struct bio *clone, *bio = ci->bio; struct dm_target *ti; sector_t len = 0, max; - struct dm_target_io *tio; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __clone_and_map_discard(ci); @@ -1250,10 +1218,11 @@ static int __clone_and_map(struct clone_info *ci) len += bv_len; } - tio = alloc_tio(ci, ti); clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, ci->md->bs); - __map_bio(ti, clone, tio); + + init_tio(clone); + __map_bio(ci->io, ti, clone); ci->sector += len; ci->sector_count -= len; @@ -1278,12 +1247,12 @@ static int __clone_and_map(struct clone_info *ci) len = min(remaining, max); - tio = alloc_tio(ci, ti); clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset + offset, len, ci->md->bs); - __map_bio(ti, clone, tio); + init_tio(clone); + __map_bio(ci->io, ti, clone); ci->sector += len; ci->sector_count -= len; @@ -1911,8 +1880,8 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); bdput(md->bdev); destroy_workqueue(md->wq); - if 
(md->tio_pool) - mempool_destroy(md->tio_pool); + if (md->rq_tio_pool) + mempool_destroy(md->rq_tio_pool); if (md->io_pool) mempool_destroy(md->io_pool); if (md->bs) @@ -1935,16 +1904,16 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) { struct dm_md_mempools *p; - if (md->io_pool && md->tio_pool && md->bs) + if ((md->io_pool || md->rq_tio_pool) && md->bs) /* the md already has necessary mempools */ goto out; p = dm_table_get_md_mempools(t); - BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); + BUG_ON(!p || md->io_pool || md->rq_tio_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; - md->tio_pool = p->tio_pool; + md->rq_tio_pool = p->tio_pool; p->tio_pool = NULL; md->bs = p->bs; p->bs = NULL; @@ -2693,40 +2662,29 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) if (!pools) return NULL; - pools->io_pool = (type == DM_TYPE_BIO_BASED) ? - mempool_create_slab_pool(MIN_IOS, _io_cache) : - mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); + if (type == DM_TYPE_BIO_BASED) + pools->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!pools->io_pool) - goto free_pools_and_out; + goto err; - pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 
- mempool_create_slab_pool(MIN_IOS, _tio_cache) : - mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); + if (type == DM_TYPE_REQUEST_BASED) + pools->tio_pool = + mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); if (!pools->tio_pool) - goto free_io_pool_and_out; + goto err; pools->bs = bioset_create(pool_size, - offsetof(struct dm_rq_clone_bio_info, clone)); + max(offsetof(struct dm_target_io, clone), + offsetof(struct dm_rq_clone_bio_info, clone))); if (!pools->bs) - goto free_tio_pool_and_out; + goto err; if (integrity && bioset_integrity_create(pools->bs, pool_size)) - goto free_bioset_and_out; + goto err; return pools; - -free_bioset_and_out: - bioset_free(pools->bs); - -free_tio_pool_and_out: - mempool_destroy(pools->tio_pool); - -free_io_pool_and_out: - mempool_destroy(pools->io_pool); - -free_pools_and_out: - kfree(pools); - +err: + dm_free_md_mempools(pools); return NULL; } ^ permalink raw reply related [flat|nested] 75+ messages in thread
[parent not found: <20120904195156.GE25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH] dm: Use bioset's front_pad for dm_target_io [not found] ` <20120904195156.GE25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> @ 2012-09-04 21:20 ` Tejun Heo 2012-09-11 19:28 ` [PATCH 2] " Mikulas Patocka 1 sibling, 0 replies; 75+ messages in thread From: Tejun Heo @ 2012-09-04 21:20 UTC (permalink / raw) To: Kent Overstreet Cc: Mikulas Patocka, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe Hello, Kent. On Tue, Sep 04, 2012 at 12:51:56PM -0700, Kent Overstreet wrote: > I wrote a patch for dm_target_io last night. I think I know an easy way > to go about converting the rest but it'll probably have to wait until > I'm further along with my immutable bvec stuff. > > Completely untested patch below: Yeap, this looks great to me. In the end, I think it's better to require stacking drivers to not use separate mempools other than bioset. Timer or not, using multiple alloc pools is brittle. Any path which ends up allocating in different orders for whatever reason can lead to subtle deadlock scenarios which can be very difficult to track down and, at least currently, there's no way to automatically detect them. Besides, w/ front-pad, it really shouldn't be necessary. Thanks. -- tejun ^ permalink raw reply [flat|nested] 75+ messages in thread
* [PATCH 2] dm: Use bioset's front_pad for dm_target_io [not found] ` <20120904195156.GE25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> 2012-09-04 21:20 ` Tejun Heo @ 2012-09-11 19:28 ` Mikulas Patocka [not found] ` <Pine.LNX.4.64.1209111349210.23947-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> 1 sibling, 1 reply; 75+ messages in thread From: Mikulas Patocka @ 2012-09-11 19:28 UTC (permalink / raw) To: Kent Overstreet, Alasdair G. Kergon Cc: Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe On Tue, 4 Sep 2012, Kent Overstreet wrote: > On Tue, Sep 04, 2012 at 03:26:19PM -0400, Mikulas Patocka wrote: > > > > > > On Mon, 3 Sep 2012, Kent Overstreet wrote: > > > > > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote: > > > > ... or another possibility - start a timer when something is put to > > > > current->bio_list and use that timer to pop entries off current->bio_list > > > > and submit them to a workqueue. The timer can be cpu-local so only > > > > interrupt masking is required to synchronize against the timer. > > > > > > > > This would normally run just like the current kernel and in case of > > > > deadlock, the timer would kick in and resolve the deadlock. > > > > > > Ugh. That's a _terrible_ idea. > > > > > > Remember the old plugging code? You ever have to debug performance > > > issues caused by it? > > > > Yes, I do remember it (and I fixed one bug that resulted in missed unplug > > and degraded performance). > > > > But currently, deadlocks due to exhausted mempools and bios being stalled > > in current->bio_list don't happen (or do happen below so rarely that they > > aren't reported). > > > > If we add a timer, it will turn a deadlock into an i/o delay, but it can't > > make things any worse. > > This is all true. I'm not arguing your solution wouldn't _work_... 
I'd > try and give some real reasoning for my objections but it'll take me > awhile to figure out how to coherently explain it and I'm very sleep > deprived. > > > Currently, dm targets allocate request-specific data from target-specific > > mempool. mempools are in dm-crypt, dm-delay, dm-mirror, dm-snapshot, > > dm-thin, dm-verity. You can change it to allocate request-specific data > > with the bio. > > I wrote a patch for dm_target_io last night. I think I know an easy way > to go about converting the rest but it'll probably have to wait until > I'm further along with my immutable bvec stuff. > > Completely untested patch below: The patch didn't work (it has random allocation failures and crashes when the device is closed). The patch also contains some unrelated changes. I created this patch that does the same thing. Mikulas --- Use front pad to allocate dm_target_io dm_target_io was previously allocated from a mempool. For each dm_target_io, there is exactly one bio allocated from a bioset. This patch merges these two allocations into one allocation: we create a bioset with front_pad equal to the size of dm_target_io - so that every bio allocated from the bioset has sizeof(struct dm_target_io) bytes before it. We allocate a bio and use the bytes before the bio as dm_target_io. 
This idea was introduced by Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> Signed-off-by: Mikulas Patocka <mpatocka-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> --- drivers/md/dm.c | 96 +++++++++++++++++++++++--------------------------------- 1 file changed, 41 insertions(+), 55 deletions(-) Index: linux-3.5.3-fast/drivers/md/dm.c =================================================================== --- linux-3.5.3-fast.orig/drivers/md/dm.c 2012-09-10 16:06:30.000000000 +0200 +++ linux-3.5.3-fast/drivers/md/dm.c 2012-09-11 19:32:36.000000000 +0200 @@ -71,6 +71,7 @@ struct dm_target_io { struct dm_io *io; struct dm_target *ti; union map_info info; + struct bio clone; }; /* @@ -464,7 +465,9 @@ static void free_io(struct mapped_device static void free_tio(struct mapped_device *md, struct dm_target_io *tio) { - mempool_free(tio, md->tio_pool); + tio->clone.bi_private = md->bs; + tio->clone.bi_end_io = NULL; + bio_put(&tio->clone); } static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, @@ -710,13 +713,7 @@ static void clone_endio(struct bio *bio, } } - /* - * Store md for cleanup instead of tio which is about to get freed. - */ - bio->bi_private = md->bs; - free_tio(md, tio); - bio_put(bio); dec_pending(io, error); } @@ -1032,12 +1029,12 @@ int dm_set_target_max_io_len(struct dm_t } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static void __map_bio(struct dm_target *ti, struct bio *clone, - struct dm_target_io *tio) +static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) { int r; sector_t sector; struct mapped_device *md; + struct bio *clone = &tio->clone; clone->bi_end_io = clone_endio; clone->bi_private = tio; @@ -1061,12 +1058,6 @@ static void __map_bio(struct dm_target * /* error the io and bail out, or requeue it if needed */ md = tio->io->md; dec_pending(tio->io, r); - /* - * Store bio_set for cleanup. 
- */ - clone->bi_end_io = NULL; - clone->bi_private = md->bs; - bio_put(clone); free_tio(md, tio); } else if (r) { DMWARN("unimplemented target map return value: %d", r); @@ -1094,15 +1085,13 @@ static void dm_bio_destructor(struct bio /* * Creates a little bio that just does part of a bvec. */ -static struct bio *split_bvec(struct bio *bio, sector_t sector, - unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) +static void split_bvec(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, unsigned int offset, + unsigned int len, struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); - clone->bi_destructor = dm_bio_destructor; *clone->bi_io_vec = *bv; clone->bi_sector = sector; @@ -1119,22 +1108,19 @@ static struct bio *split_bvec(struct bio bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); } - - return clone; } /* * Creates a bio that consists of range of complete bvecs. 
*/ -static struct bio *clone_bio(struct bio *bio, sector_t sector, - unsigned short idx, unsigned short bv_count, - unsigned int len, struct bio_set *bs) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, + unsigned short bv_count, unsigned int len, + struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; clone->bi_vcnt = idx + bv_count; @@ -1148,14 +1134,17 @@ static struct bio *clone_bio(struct bio bio_integrity_trim(clone, bio_sector_offset(bio, idx, 0), len); } - - return clone; } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti) + struct dm_target *ti, int nr_iovecs) { - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + tio = container_of(clone, struct dm_target_io, clone); + tio->clone.bi_destructor = dm_bio_destructor; tio->io = ci->io; tio->ti = ti; @@ -1167,8 +1156,8 @@ static struct dm_target_io *alloc_tio(st static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, unsigned request_nr, sector_t len) { - struct dm_target_io *tio = alloc_tio(ci, ti); - struct bio *clone; + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); + struct bio *clone = &tio->clone; tio->info.target_request_nr = request_nr; @@ -1177,15 +1166,13 @@ static void __issue_target_request(struc * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. 
*/ - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); __bio_clone(clone, ci->bio); - clone->bi_destructor = dm_bio_destructor; if (len) { clone->bi_sector = ci->sector; clone->bi_size = to_bytes(len); } - __map_bio(ti, clone, tio); + __map_bio(ti, tio); } static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, @@ -1214,14 +1201,13 @@ static int __clone_and_map_empty_flush(s */ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target_io *tio; - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, + ci->sector_count, ci->md->bs); + __map_bio(ti, tio); ci->sector_count = 0; } @@ -1259,7 +1245,7 @@ static int __clone_and_map_discard(struc static int __clone_and_map(struct clone_info *ci) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target *ti; sector_t len = 0, max; struct dm_target_io *tio; @@ -1299,10 +1285,10 @@ static int __clone_and_map(struct clone_ len += bv_len; } - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); + __map_bio(ti, tio); ci->sector += len; ci->sector_count -= len; @@ -1327,12 +1313,11 @@ static int __clone_and_map(struct clone_ len = min(remaining, max); - tio = alloc_tio(ci, ti); - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, - ci->md->bs); + tio = alloc_tio(ci, ti, 1); + split_bvec(tio, bio, ci->sector, ci->idx, + bv->bv_offset + offset, len, ci->md->bs); - __map_bio(ti, clone, tio); + __map_bio(ti, tio); ci->sector 
+= len; ci->sector_count -= len; @@ -2765,7 +2750,8 @@ struct dm_md_mempools *dm_alloc_md_mempo if (!pools->tio_pool) goto free_io_pool_and_out; - pools->bs = bioset_create(pool_size, 0); + pools->bs = bioset_create(pool_size, (type == DM_TYPE_BIO_BASED) ? + offsetof(struct dm_target_io, clone) : 0); if (!pools->bs) goto free_tio_pool_and_out; ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH 2] dm: Use bioset's front_pad for dm_target_io
[not found] ` <Pine.LNX.4.64.1209111349210.23947-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
@ 2012-09-11 19:50 ` Kent Overstreet
[not found] ` <20120911195029.GJ19739-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 75+ messages in thread
From: Kent Overstreet @ 2012-09-11 19:50 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Alasdair G. Kergon, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Tue, Sep 11, 2012 at 03:28:57PM -0400, Mikulas Patocka wrote:
> 
> 
> On Tue, 4 Sep 2012, Kent Overstreet wrote:
> 
> > On Tue, Sep 04, 2012 at 03:26:19PM -0400, Mikulas Patocka wrote:
> > > 
> > > 
> > > On Mon, 3 Sep 2012, Kent Overstreet wrote:
> > > 
> > > > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote:
> > > > > ... or another possibility - start a timer when something is put to
> > > > > current->bio_list and use that timer to pop entries off current->bio_list
> > > > > and submit them to a workqueue. The timer can be cpu-local so only
> > > > > interrupt masking is required to synchronize against the timer.
> > > > > 
> > > > > This would normally run just like the current kernel and in case of
> > > > > deadlock, the timer would kick in and resolve the deadlock.
> > > > Ugh. That's a _terrible_ idea.
> > > > 
> > > > Remember the old plugging code? You ever have to debug performance
> > > > issues caused by it?
> > > Yes, I do remember it (and I fixed one bug that resulted in missed unplug
> > > and degraded performance).
> > > 
> > > But currently, deadlocks due to exhausted mempools and bios being stalled
> > > in current->bio_list don't happen (or do happen so rarely that they
> > > aren't reported).
> > > > This is all true. I'm not arguing your solution wouldn't _work_... I'd > > try and give some real reasoning for my objections but it'll take me > > awhile to figure out how to coherently explain it and I'm very sleep > > deprived. > > > > > Currently, dm targets allocate request-specific data from target-specific > > > mempool. mempools are in dm-crypt, dm-delay, dm-mirror, dm-snapshot, > > > dm-thin, dm-verity. You can change it to allocate request-specific data > > > with the bio. > > > > I wrote a patch for dm_target_io last night. I think I know an easy way > > to go about converting the rest but it'll probably have to wait until > > I'm further along with my immutable bvec stuff. > > > > Completely untested patch below: > > The patch didn't work (it has random allocation failures and crashes when > the device is closed). The patch also contains some unrelated changes. > > I created this patch that does the same thing. Cool! Thanks for finishing this. I like your approach with rolling the bio allocation into alloc_tio() - it solves a problem I was having with the front_pad not being zeroed - but it does prevent the use of bio_clone_bioset(), which is going to cause minor issues with my immutable bvec work. Perhaps bio_alloc_bioset() should just be changed to zero the front_pad, that'd probably be useful elsewhere anyways. You might want to rebase onto Jens' tree, it has my patches that get rid of bi_destructor and the front_pad conversion for request based dm: git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next > > Mikulas > > --- > Use front pad to allocate dm_target_io > > dm_target_io was previously allocated from a mempool. For each > dm_target_io, there is exactly one bio allocated from a bioset. 
> 
> This patch merges these two allocations into one allocation: we create a
> bioset with front_pad equal to the size of dm_target_io - so that every
> bio allocated from the bioset has sizeof(struct dm_target_io) bytes
> before it. We allocate a bio and use the bytes before the bio as
> dm_target_io.
> 
> This idea was introduced by Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> 
> Signed-off-by: Mikulas Patocka <mpatocka-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> 
> ---
>  drivers/md/dm.c | 96 +++++++++++++++++++++++---------------------------------
>  1 file changed, 41 insertions(+), 55 deletions(-)
> 
> Index: linux-3.5.3-fast/drivers/md/dm.c
> ===================================================================
> --- linux-3.5.3-fast.orig/drivers/md/dm.c	2012-09-10 16:06:30.000000000 +0200
> +++ linux-3.5.3-fast/drivers/md/dm.c	2012-09-11 19:32:36.000000000 +0200
> @@ -71,6 +71,7 @@ struct dm_target_io {
>  	struct dm_io *io;
>  	struct dm_target *ti;
>  	union map_info info;
> +	struct bio clone;
>  };
>  
>  /*
> @@ -464,7 +465,9 @@ static void free_io(struct mapped_device
>  
>  static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
>  {
> -	mempool_free(tio, md->tio_pool);
> +	tio->clone.bi_private = md->bs;
> +	tio->clone.bi_end_io = NULL;
> +	bio_put(&tio->clone);
>  }
>  
>  static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
> @@ -710,13 +713,7 @@ static void clone_endio(struct bio *bio,
>  		}
>  	}
>  
> -	/*
> -	 * Store md for cleanup instead of tio which is about to get freed.
> - */ > - bio->bi_private = md->bs; > - > free_tio(md, tio); > - bio_put(bio); > dec_pending(io, error); > } > > @@ -1032,12 +1029,12 @@ int dm_set_target_max_io_len(struct dm_t > } > EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); > > -static void __map_bio(struct dm_target *ti, struct bio *clone, > - struct dm_target_io *tio) > +static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) > { > int r; > sector_t sector; > struct mapped_device *md; > + struct bio *clone = &tio->clone; > > clone->bi_end_io = clone_endio; > clone->bi_private = tio; > @@ -1061,12 +1058,6 @@ static void __map_bio(struct dm_target * > /* error the io and bail out, or requeue it if needed */ > md = tio->io->md; > dec_pending(tio->io, r); > - /* > - * Store bio_set for cleanup. > - */ > - clone->bi_end_io = NULL; > - clone->bi_private = md->bs; > - bio_put(clone); > free_tio(md, tio); > } else if (r) { > DMWARN("unimplemented target map return value: %d", r); > @@ -1094,15 +1085,13 @@ static void dm_bio_destructor(struct bio > /* > * Creates a little bio that just does part of a bvec. > */ > -static struct bio *split_bvec(struct bio *bio, sector_t sector, > - unsigned short idx, unsigned int offset, > - unsigned int len, struct bio_set *bs) > +static void split_bvec(struct dm_target_io *tio, struct bio *bio, > + sector_t sector, unsigned short idx, unsigned int offset, > + unsigned int len, struct bio_set *bs) > { > - struct bio *clone; > + struct bio *clone = &tio->clone; > struct bio_vec *bv = bio->bi_io_vec + idx; > > - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); > - clone->bi_destructor = dm_bio_destructor; > *clone->bi_io_vec = *bv; > > clone->bi_sector = sector; > @@ -1119,22 +1108,19 @@ static struct bio *split_bvec(struct bio > bio_integrity_trim(clone, > bio_sector_offset(bio, idx, offset), len); > } > - > - return clone; > } > > /* > * Creates a bio that consists of range of complete bvecs. 
> */ > -static struct bio *clone_bio(struct bio *bio, sector_t sector, > - unsigned short idx, unsigned short bv_count, > - unsigned int len, struct bio_set *bs) > +static void clone_bio(struct dm_target_io *tio, struct bio *bio, > + sector_t sector, unsigned short idx, > + unsigned short bv_count, unsigned int len, > + struct bio_set *bs) > { > - struct bio *clone; > + struct bio *clone = &tio->clone; > > - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); > __bio_clone(clone, bio); > - clone->bi_destructor = dm_bio_destructor; > clone->bi_sector = sector; > clone->bi_idx = idx; > clone->bi_vcnt = idx + bv_count; > @@ -1148,14 +1134,17 @@ static struct bio *clone_bio(struct bio > bio_integrity_trim(clone, > bio_sector_offset(bio, idx, 0), len); > } > - > - return clone; > } > > static struct dm_target_io *alloc_tio(struct clone_info *ci, > - struct dm_target *ti) > + struct dm_target *ti, int nr_iovecs) > { > - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); > + struct dm_target_io *tio; > + struct bio *clone; > + > + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); > + tio = container_of(clone, struct dm_target_io, clone); > + tio->clone.bi_destructor = dm_bio_destructor; > > tio->io = ci->io; > tio->ti = ti; > @@ -1167,8 +1156,8 @@ static struct dm_target_io *alloc_tio(st > static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, > unsigned request_nr, sector_t len) > { > - struct dm_target_io *tio = alloc_tio(ci, ti); > - struct bio *clone; > + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); > + struct bio *clone = &tio->clone; > > tio->info.target_request_nr = request_nr; > > @@ -1177,15 +1166,13 @@ static void __issue_target_request(struc > * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush > * and discard, so no need for concern about wasted bvec allocations. 
> */ > - clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); > __bio_clone(clone, ci->bio); > - clone->bi_destructor = dm_bio_destructor; > if (len) { > clone->bi_sector = ci->sector; > clone->bi_size = to_bytes(len); > } > > - __map_bio(ti, clone, tio); > + __map_bio(ti, tio); > } > > static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, > @@ -1214,14 +1201,13 @@ static int __clone_and_map_empty_flush(s > */ > static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) > { > - struct bio *clone, *bio = ci->bio; > + struct bio *bio = ci->bio; > struct dm_target_io *tio; > > - tio = alloc_tio(ci, ti); > - clone = clone_bio(bio, ci->sector, ci->idx, > - bio->bi_vcnt - ci->idx, ci->sector_count, > - ci->md->bs); > - __map_bio(ti, clone, tio); > + tio = alloc_tio(ci, ti, bio->bi_max_vecs); > + clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, > + ci->sector_count, ci->md->bs); > + __map_bio(ti, tio); > ci->sector_count = 0; > } > > @@ -1259,7 +1245,7 @@ static int __clone_and_map_discard(struc > > static int __clone_and_map(struct clone_info *ci) > { > - struct bio *clone, *bio = ci->bio; > + struct bio *bio = ci->bio; > struct dm_target *ti; > sector_t len = 0, max; > struct dm_target_io *tio; > @@ -1299,10 +1285,10 @@ static int __clone_and_map(struct clone_ > len += bv_len; > } > > - tio = alloc_tio(ci, ti); > - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, > - ci->md->bs); > - __map_bio(ti, clone, tio); > + tio = alloc_tio(ci, ti, bio->bi_max_vecs); > + clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, > + ci->md->bs); > + __map_bio(ti, tio); > > ci->sector += len; > ci->sector_count -= len; > @@ -1327,12 +1313,11 @@ static int __clone_and_map(struct clone_ > > len = min(remaining, max); > > - tio = alloc_tio(ci, ti); > - clone = split_bvec(bio, ci->sector, ci->idx, > - bv->bv_offset + offset, len, > - ci->md->bs); > + tio = alloc_tio(ci, ti, 1); > + 
split_bvec(tio, bio, ci->sector, ci->idx, > + bv->bv_offset + offset, len, ci->md->bs); > > - __map_bio(ti, clone, tio); > + __map_bio(ti, tio); > > ci->sector += len; > ci->sector_count -= len; > @@ -2765,7 +2750,8 @@ struct dm_md_mempools *dm_alloc_md_mempo > if (!pools->tio_pool) > goto free_io_pool_and_out; > > - pools->bs = bioset_create(pool_size, 0); > + pools->bs = bioset_create(pool_size, (type == DM_TYPE_BIO_BASED) ? > + offsetof(struct dm_target_io, clone) : 0); > if (!pools->bs) > goto free_tio_pool_and_out; > ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH 2] dm: Use bioset's front_pad for dm_target_io
[not found] ` <20120911195029.GJ19739-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2012-09-12 22:31 ` Mikulas Patocka
[not found] ` <Pine.LNX.4.64.1209121831110.21106-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
0 siblings, 1 reply; 75+ messages in thread
From: Mikulas Patocka @ 2012-09-12 22:31 UTC (permalink / raw)
To: Kent Overstreet
Cc: Alasdair G. Kergon, Vivek Goyal, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Tue, 11 Sep 2012, Kent Overstreet wrote:

> On Tue, Sep 11, 2012 at 03:28:57PM -0400, Mikulas Patocka wrote:
> > 
> > 
> > On Tue, 4 Sep 2012, Kent Overstreet wrote:
> > 
> > > On Tue, Sep 04, 2012 at 03:26:19PM -0400, Mikulas Patocka wrote:
> > > > 
> > > > 
> > > > On Mon, 3 Sep 2012, Kent Overstreet wrote:
> > > > 
> > > > > On Mon, Sep 03, 2012 at 04:41:37PM -0400, Mikulas Patocka wrote:
> > > > > > ... or another possibility - start a timer when something is put to
> > > > > > current->bio_list and use that timer to pop entries off current->bio_list
> > > > > > and submit them to a workqueue. The timer can be cpu-local so only
> > > > > > interrupt masking is required to synchronize against the timer.
> > > > > > 
> > > > > > This would normally run just like the current kernel and in case of
> > > > > > deadlock, the timer would kick in and resolve the deadlock.
> > > > > Ugh. That's a _terrible_ idea.
> > > > > 
> > > > > Remember the old plugging code? You ever have to debug performance
> > > > > issues caused by it?
> > > > Yes, I do remember it (and I fixed one bug that resulted in missed unplug
> > > > and degraded performance).
> > > > 
> > > > But currently, deadlocks due to exhausted mempools and bios being stalled
> > > > in current->bio_list don't happen (or do happen so rarely that they
> > > > aren't reported).
> > > > > > > > If we add a timer, it will turn a deadlock into an i/o delay, but it can't > > > > make things any worse. > > > > > > This is all true. I'm not arguing your solution wouldn't _work_... I'd > > > try and give some real reasoning for my objections but it'll take me > > > awhile to figure out how to coherently explain it and I'm very sleep > > > deprived. > > > > > > > Currently, dm targets allocate request-specific data from target-specific > > > > mempool. mempools are in dm-crypt, dm-delay, dm-mirror, dm-snapshot, > > > > dm-thin, dm-verity. You can change it to allocate request-specific data > > > > with the bio. > > > > > > I wrote a patch for dm_target_io last night. I think I know an easy way > > > to go about converting the rest but it'll probably have to wait until > > > I'm further along with my immutable bvec stuff. > > > > > > Completely untested patch below: > > > > The patch didn't work (it has random allocation failures and crashes when > > the device is closed). The patch also contains some unrelated changes. > > > > I created this patch that does the same thing. > > Cool! Thanks for finishing this. > > I like your approach with rolling the bio allocation into alloc_tio() - > it solves a problem I was having with the front_pad not being zeroed - > but it does prevent the use of bio_clone_bioset(), which is going to > cause minor issues with my immutable bvec work. > > Perhaps bio_alloc_bioset() should just be changed to zero the front_pad, > that'd probably be useful elsewhere anyways. > > You might want to rebase onto Jens' tree, it has my patches that get rid > of bi_destructor and the front_pad conversion for request based dm: > > git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next This is the patch based on this tree. Mikulas --- Use front pad to allocate dm_target_io dm_target_io was previously allocated from a mempool. For each dm_target_io, there is exactly one bio allocated from a bioset. 
This patch merges these two allocations into one allocation: we create a
bioset with front_pad equal to the size of dm_target_io - so that every
bio allocated from the bioset has sizeof(struct dm_target_io) bytes
before it. We allocate a bio and use the bytes before the bio as
dm_target_io.

This idea was introduced by Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Signed-off-by: Mikulas Patocka <mpatocka-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

---
 drivers/md/dm.c | 80 ++++++++++++++++++++++++++------------------------------
 1 file changed, 38 insertions(+), 42 deletions(-)

Index: linux-block-copy/drivers/md/dm.c
===================================================================
--- linux-block-copy.orig/drivers/md/dm.c	2012-09-12 21:04:34.000000000 +0200
+++ linux-block-copy/drivers/md/dm.c	2012-09-12 21:07:09.000000000 +0200
@@ -71,6 +71,7 @@ struct dm_target_io {
 	struct dm_io *io;
 	struct dm_target *ti;
 	union map_info info;
+	struct bio clone;
 };
 
 /*
@@ -474,7 +475,7 @@ static void free_io(struct mapped_device
 
 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 {
-	mempool_free(tio, md->tio_pool);
+	bio_put(&tio->clone);
 }
 
 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
@@ -711,7 +712,6 @@ static void clone_endio(struct bio *bio,
 	}
 
 	free_tio(md, tio);
-	bio_put(bio);
 	dec_pending(io, error);
 }
 
@@ -1027,12 +1027,12 @@ int dm_set_target_max_io_len(struct dm_t
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
-static void __map_bio(struct dm_target *ti, struct bio *clone,
-		      struct dm_target_io *tio)
+static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
 {
 	int r;
 	sector_t sector;
 	struct mapped_device *md;
+	struct bio *clone = &tio->clone;
 
 	clone->bi_end_io = clone_endio;
 	clone->bi_private = tio;
@@ -1056,7 +1056,6 @@ static void __map_bio(struct dm_target *
 		/* error the io and bail out, or requeue it if needed */
 		md = tio->io->md;
 		dec_pending(tio->io, r);
-		bio_put(clone);
 		free_tio(md, tio);
} else if (r) { DMWARN("unimplemented target map return value: %d", r); @@ -1077,14 +1076,13 @@ struct clone_info { /* * Creates a little bio that just does part of a bvec. */ -static struct bio *split_bvec(struct bio *bio, sector_t sector, - unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) +static void split_bvec(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, unsigned int offset, + unsigned int len, struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); *clone->bi_io_vec = *bv; clone->bi_sector = sector; @@ -1101,20 +1099,18 @@ static struct bio *split_bvec(struct bio bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); } - - return clone; } /* * Creates a bio that consists of range of complete bvecs. */ -static struct bio *clone_bio(struct bio *bio, sector_t sector, - unsigned short idx, unsigned short bv_count, - unsigned int len, struct bio_set *bs) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned short idx, + unsigned short bv_count, unsigned int len, + struct bio_set *bs) { - struct bio *clone; + struct bio *clone = &tio->clone; - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); clone->bi_sector = sector; clone->bi_idx = idx; @@ -1129,14 +1125,16 @@ static struct bio *clone_bio(struct bio bio_integrity_trim(clone, bio_sector_offset(bio, idx, 0), len); } - - return clone; } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti) + struct dm_target *ti, int nr_iovecs) { - struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; tio->ti = ti; @@ -1148,8 +1146,8 @@ 
static struct dm_target_io *alloc_tio(st static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, unsigned request_nr, sector_t len) { - struct dm_target_io *tio = alloc_tio(ci, ti); - struct bio *clone; + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); + struct bio *clone = &tio->clone; tio->info.target_request_nr = request_nr; @@ -1158,14 +1156,13 @@ static void __issue_target_request(struc * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. */ - clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs); if (len) { clone->bi_sector = ci->sector; clone->bi_size = to_bytes(len); } - __map_bio(ti, clone, tio); + __map_bio(ti, tio); } static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, @@ -1194,14 +1191,13 @@ static int __clone_and_map_empty_flush(s */ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target_io *tio; - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, + ci->sector_count, ci->md->bs); + __map_bio(ti, tio); ci->sector_count = 0; } @@ -1239,7 +1235,7 @@ static int __clone_and_map_discard(struc static int __clone_and_map(struct clone_info *ci) { - struct bio *clone, *bio = ci->bio; + struct bio *bio = ci->bio; struct dm_target *ti; sector_t len = 0, max; struct dm_target_io *tio; @@ -1279,10 +1275,10 @@ static int __clone_and_map(struct clone_ len += bv_len; } - tio = alloc_tio(ci, ti); - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci, ti, bio->bi_max_vecs); + clone_bio(tio, bio, ci->sector, ci->idx, i - 
ci->idx, len, + ci->md->bs); + __map_bio(ti, tio); ci->sector += len; ci->sector_count -= len; @@ -1307,12 +1303,11 @@ static int __clone_and_map(struct clone_ len = min(remaining, max); - tio = alloc_tio(ci, ti); - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, - ci->md->bs); + tio = alloc_tio(ci, ti, 1); + split_bvec(tio, bio, ci->sector, ci->idx, + bv->bv_offset + offset, len, ci->md->bs); - __map_bio(ti, clone, tio); + __map_bio(ti, tio); ci->sector += len; ci->sector_count -= len; @@ -2733,7 +2728,8 @@ struct dm_md_mempools *dm_alloc_md_mempo goto free_io_pool_and_out; pools->bs = (type == DM_TYPE_BIO_BASED) ? - bioset_create(pool_size, 0) : + bioset_create(pool_size, + offsetof(struct dm_target_io, clone)) : bioset_create(pool_size, offsetof(struct dm_rq_clone_bio_info, clone)); if (!pools->bs) ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [dm-devel] [PATCH 2] dm: Use bioset's front_pad for dm_target_io [not found] ` <Pine.LNX.4.64.1209121831110.21106-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org> @ 2012-09-14 23:09 ` Alasdair G Kergon 0 siblings, 0 replies; 75+ messages in thread From: Alasdair G Kergon @ 2012-09-14 23:09 UTC (permalink / raw) To: Mikulas Patocka Cc: Kent Overstreet, Jens Axboe, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, linux-bcache-u79uwXL29TY76Z2rM5mHXA, Alasdair G. Kergon, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, tj-DgEjT+Ai2ygdnm+yROfE0A, Vivek Goyal On Wed, Sep 12, 2012 at 06:31:53PM -0400, Mikulas Patocka wrote: > This is the patch based on this tree. I've pulled this into the dm tree for now. Alasdair ^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
[not found] ` <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-08-31 1:43 ` Kent Overstreet
@ 2012-09-01 2:13 ` Tejun Heo
[not found] ` <20120901021348.GB19535-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
2012-09-03 0:49 ` Dave Chinner
2 siblings, 1 reply; 75+ messages in thread
From: Tejun Heo @ 2012-09-01 2:13 UTC (permalink / raw)
To: Vivek Goyal
Cc: Kent Overstreet, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

Hello, Vivek.

On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> Here is one quick and dirty proof of concept patch. It checks for stack
> depth and if remaining space is less than 20% of stack size, then it
> defers the bio submission to per queue worker.

So, it removes breadth-first walking of bio construction by ensuring
stack overflow never happens by bouncing to workqueue if stack usage
seems too high.

I do like removal of breadth-first walking. It makes failure
scenarios a lot less mind-bending. That said, Kent is right that this
can incur significant overhead for certain configurations, and looking
at stack usage in block layer is rather nasty both in design and
implementation.

If we're gonna need rescuer anyway and can get it right and the
mechanism can be contained in block proper relatively well, I think it
would be better to make breadth-first walking safe. Both are nasty in
their own ways after all.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 75+ messages in thread
* [PATCH v2] block: Avoid deadlocks with bio allocation by stacking drivers
[not found] ` <20120901021348.GB19535-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
@ 2012-09-03 1:34 ` Kent Overstreet
2012-09-04 15:00 ` [PATCH v7 9/9] " Vivek Goyal
1 sibling, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-09-03 1:34 UTC (permalink / raw)
To: Tejun Heo
Cc: Vivek Goyal, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Fri, Aug 31, 2012 at 07:13:48PM -0700, Tejun Heo wrote:
> Hello, Vivek.
> 
> On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > Here is one quick and dirty proof of concept patch. It checks for stack
> > depth and if remaining space is less than 20% of stack size, then it
> > defers the bio submission to per queue worker.
> 
> So, it removes breadth-first walking of bio construction by ensuring
> stack overflow never happens by bouncing to workqueue if stack usage
> seems too high.
> 
> I do like removal of breadth-first walking. It makes failure
> scenarios a lot less mind-bending. That said, Kent is right that this
> can incur significant overhead for certain configurations, and looking
> at stack usage in block layer is rather nasty both in design and
> implementation.
> 
> If we're gonna need rescuer anyway and can get it right and the
> mechanism can be contained in block proper relatively well, I think it
> would be better to make breadth-first walking safe. Both are nasty in
> their own ways after all.

I added that filtering I was talking about, and I like this version
much better. To me at least, it's much clearer what it's actually
doing; when we go to sleep in an allocation, we first unblock only the
bios that were allocated from this bio_set - i.e. only the bios that
caused the original deadlock.
It's still trickier than Vivek's approach, but the performance impact is
certainly lower, since we're only using the workqueue thread on
allocation failure.

commit c61f9c16dc8c7ae833a73b857936106c71daab3f
Author: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Date:   Fri Aug 31 20:52:41 2012 -0700

    block: Avoid deadlocks with bio allocation by stacking drivers
    
    Previously, if we ever try to allocate more than once from the same bio
    set while running under generic_make_request() (i.e. a stacking block
    driver), we risk deadlock.
    
    This is because of the code in generic_make_request() that converts
    recursion to iteration; any bios we submit won't actually be submitted
    (so they can complete and eventually be freed) until after we return -
    this means if we allocate a second bio, we're blocking the first one
    from ever being freed.
    
    Thus if enough threads call into a stacking block driver at the same
    time with bios that need multiple splits, and the bio_set's reserve
    gets used up, we deadlock.
    
    This can be worked around in the driver code - we could check if we're
    running under generic_make_request(), then mask out __GFP_WAIT when we
    go to allocate a bio, and if the allocation fails punt to workqueue and
    retry the allocation. But this is tricky and not a generic solution.
    
    This patch solves it for all users by inverting the previously
    described technique. We allocate a rescuer workqueue for each bio_set,
    and then in the allocation code if there are bios on current->bio_list
    we would be blocking, we punt them to the rescuer workqueue to be
    submitted.
    
    Tested it by forcing the rescue codepath to be taken (by disabling the
    first GFP_NOWAIT attempt), and then ran it with bcache (which does a
    lot of arbitrary bio splitting) and verified that the rescuer was
    being invoked.
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
CC: Jens Axboe <axboe-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>

diff --git a/fs/bio.c b/fs/bio.c
index 22d654f..076751f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -286,6 +286,43 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_alloc_rescue(struct work_struct *work)
+{
+	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+	struct bio *bio;
+
+	while (1) {
+		spin_lock(&bs->rescue_lock);
+		bio = bio_list_pop(&bs->rescue_list);
+		spin_unlock(&bs->rescue_lock);
+
+		if (!bio)
+			break;
+
+		generic_make_request(bio);
+	}
+}
+
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+	struct bio_list punt, nopunt;
+	struct bio *bio;
+
+	bio_list_init(&punt);
+	bio_list_init(&nopunt);
+
+	while ((bio = bio_list_pop(current->bio_list)))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+
+	*current->bio_list = nopunt;
+
+	spin_lock(&bs->rescue_lock);
+	bio_list_merge(&bs->rescue_list, &punt);
+	spin_unlock(&bs->rescue_lock);
+
+	queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
@@ -308,6 +345,7 @@ EXPORT_SYMBOL(bio_reset);
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	gfp_t saved_gfp = gfp_mask;
 	unsigned front_pad;
 	unsigned inline_vecs;
 	unsigned long idx = BIO_POOL_NONE;
@@ -325,13 +363,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		front_pad = 0;
 		inline_vecs = nr_iovecs;
 	} else {
+		/*
+		 * generic_make_request() converts recursion to iteration; this
+		 * means if we're running beneath it, any bios we allocate and
+		 * submit will not be submitted (and thus freed) until after we
+		 * return.
+		 *
+		 * This exposes us to a potential deadlock if we allocate
+		 * multiple bios from the same bio_set() while running
+		 * underneath generic_make_request(). If we were to allocate
+		 * multiple bios (say a stacking block driver that was splitting
+		 * bios), we would deadlock if we exhausted the mempool's
+		 * reserve.
+		 *
+		 * We solve this, and guarantee forward progress, with a rescuer
+		 * workqueue per bio_set. If we go to allocate and there are
+		 * bios on current->bio_list, we first try the allocation
+		 * without __GFP_WAIT; if that fails, we punt those bios we
+		 * would be blocking to the rescuer workqueue before we retry
+		 * with the original gfp_flags.
+		 */
+
+		if (current->bio_list && !bio_list_empty(current->bio_list))
+			gfp_mask &= ~__GFP_WAIT;
+retry:
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
 		front_pad = bs->front_pad;
 		inline_vecs = BIO_INLINE_VECS;
 	}
 
 	if (unlikely(!p))
-		return NULL;
+		goto err;
 
 	bio = p + front_pad;
 	bio_init(bio);
@@ -352,6 +414,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 err_free:
 	mempool_free(p, bs->bio_pool);
+err:
+	if (gfp_mask != saved_gfp) {
+		punt_bios_to_rescuer(bs);
+		gfp_mask = saved_gfp;
+		goto retry;
+	}
+
 	return NULL;
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
@@ -1561,6 +1630,9 @@ static void biovec_free_pools(struct bio_set *bs)
 
 void bioset_free(struct bio_set *bs)
 {
+	if (bs->rescue_workqueue)
+		destroy_workqueue(bs->rescue_workqueue);
+
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
@@ -1596,6 +1668,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
 	bs->front_pad = front_pad;
 
+	spin_lock_init(&bs->rescue_lock);
+	bio_list_init(&bs->rescue_list);
+	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
 	if (!bs->bio_slab) {
 		kfree(bs);
@@ -1606,9 +1682,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, pool_size))
-		return bs;
+	if (biovec_create_pools(bs, pool_size))
+		goto bad;
+
+	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+	if (!bs->rescue_workqueue)
+		goto bad;
+
+	return bs;
 bad:
 	bioset_free(bs);
 	return NULL;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a7561b9..f329102 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -491,6 +491,15 @@ struct bio_set {
 	mempool_t *bio_integrity_pool;
 #endif
 	mempool_t *bvec_pool;
+
+	/*
+	 * Deadlock avoidance for stacking block drivers: see comments in
+	 * bio_alloc_bioset() for details
+	 */
+	spinlock_t rescue_lock;
+	struct bio_list rescue_list;
+	struct work_struct rescue_work;
+	struct workqueue_struct *rescue_workqueue;
 };
 
 struct biovec_slab {

^ permalink raw reply related	[flat|nested] 75+ messages in thread
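The allocate/punt/retry dance in the patch above is compact but subtle. Below is a stripped-down user-space sketch of the same pattern; all names here (`alloc_nowait`, `punt_pending`, `alloc_safe`, `demo`) are hypothetical, and the rescuer workqueue is collapsed into a direct function call, whereas the real code uses mempools, bio_lists and a `WQ_MEM_RECLAIM` workqueue:

```c
#include <assert.h>
#include <stdbool.h>

#define POOL_SIZE 2

static int pool_free = POOL_SIZE;	/* free reserve entries in the "mempool" */
static int pending;			/* allocated-but-unsubmitted items
					 * (kernel: current->bio_list) */

/* Non-blocking allocation attempt: fail rather than wait (~__GFP_WAIT). */
static bool alloc_nowait(void)
{
	if (pool_free == 0)
		return false;
	pool_free--;
	return true;
}

/*
 * "Rescuer": submit the punted items from another context, which completes
 * them and returns their memory to the pool.  In the kernel this runs from
 * a WQ_MEM_RECLAIM workqueue; here it is just a direct call.
 */
static void punt_pending(void)
{
	pool_free += pending;
	pending = 0;
}

/*
 * Deadlock-free allocation: if we already hold unsubmitted items, first try
 * without blocking; on failure, punt them to the rescuer and retry (mirrors
 * the saved_gfp/retry logic in bio_alloc_bioset() above).
 */
static void alloc_safe(void)
{
	bool nowait = pending > 0;
retry:
	if (alloc_nowait()) {
		pending++;
		return;
	}
	if (nowait) {
		punt_pending();
		nowait = false;
		goto retry;
	}
	/* With an empty pending list a blocking alloc would be safe here;
	 * the demo below never reaches this point. */
	assert(0);
}

/* A splitting driver allocating 4 items from a 2-entry pool: without the
 * punt, the third allocation would block forever on an exhausted reserve. */
static int demo(void)
{
	pool_free = POOL_SIZE;
	pending = 0;
	for (int i = 0; i < 4; i++)
		alloc_safe();
	return pending;		/* items still unsubmitted at the end */
}
```

`demo()` finishes with `pending` no larger than the pool size, because every time the reserve runs dry the in-flight items are handed to the rescuer first.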
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120901021348.GB19535-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
  2012-09-03  1:34 ` [PATCH v2] " Kent Overstreet
@ 2012-09-04 15:00 ` Vivek Goyal
  1 sibling, 0 replies; 75+ messages in thread
From: Vivek Goyal @ 2012-09-04 15:00 UTC (permalink / raw)
To: Tejun Heo
Cc: Kent Overstreet, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Fri, Aug 31, 2012 at 07:13:48PM -0700, Tejun Heo wrote:
> Hello, Vivek.
>
> On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > Here is one quick and dirty proof of concept patch. It checks for stack
> > depth and if remaining space is less than 20% of stack size, then it
> > defers the bio submission to per queue worker.
>
> So, it removes breadth-first walking of bio construction by ensuring
> stack overflow never happens by bouncing to workqueue if stack usage
> seems too high.
>
> I do like removal of breadth-first walking. It makes failure
> scenarios a lot less mind-bending. That said, Kent is right that this
> can incur significant overhead for certain configurations, and looking
> at stack usage in block layer is rather nasty both in design and
> implementation.
>
> If we're gonna need rescuer anyway and can get it right and the
> mechanism can be contained in block proper relatively well, I think it
> would be better to make breadth-first walking safe. Both are nasty in
> their own ways after all.

Hi Tejun,

That's fine. Breadth-first walking does make sense given the fact that
the storage stack can be arbitrarily deep. And Kent's bio set rescuer
patch is not too bad. So I am fine with pursuing that patch.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2012-08-31  1:43 ` Kent Overstreet
  2012-09-01  2:13 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Tejun Heo
@ 2012-09-03  0:49 ` Dave Chinner
  2012-09-03  1:17 ` Kent Overstreet
  2012-09-04 13:54 ` Vivek Goyal
  2 siblings, 2 replies; 75+ messages in thread
From: Dave Chinner @ 2012-09-03  0:49 UTC (permalink / raw)
To: Vivek Goyal
Cc: Kent Overstreet, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
>
> [..]
> > > Performance aside, punting submission to per device worker in case of deep
> > > stack usage sounds cleaner solution to me.
> >
> > Agreed, but performance tends to matter in the real world. And either
> > way the tricky bits are going to be confined to a few functions, so I
> > don't think it matters that much.
> >
> > If someone wants to code up the workqueue version and test it, they're
> > more than welcome...
>
> Here is one quick and dirty proof of concept patch. It checks for stack
> depth and if remaining space is less than 20% of stack size, then it
> defers the bio submission to per queue worker.

Given that we are working around stack depth issues in the
filesystems already in several places, and now it seems like there's
a reason to work around it in the block layers as well, shouldn't we
simply increase the default stack size rather than introduce
complexity and performance regressions to try and work around not
having enough stack?

I mean, we can deal with it like the ia32 4k stack issue was dealt
with (i.e. ignore those stupid XFS people, that's an XFS bug), or
we can face the reality that storage stacks have become so complex
that 8k is no longer a big enough stack for a modern system....

Cheers,
Dave.
--
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  2012-09-03  0:49 ` Dave Chinner
@ 2012-09-03  1:17 ` Kent Overstreet
  2012-09-04 13:54 ` Vivek Goyal
  1 sibling, 0 replies; 75+ messages in thread
From: Kent Overstreet @ 2012-09-03  1:17 UTC (permalink / raw)
To: Dave Chinner
Cc: Vivek Goyal, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Mon, Sep 03, 2012 at 10:49:27AM +1000, Dave Chinner wrote:
> Given that we are working around stack depth issues in the
> filesystems already in several places, and now it seems like there's
> a reason to work around it in the block layers as well, shouldn't we
> simply increase the default stack size rather than introduce
> complexity and performance regressions to try and work around not
> having enough stack?
>
> I mean, we can deal with it like the ia32 4k stack issue was dealt
> with (i.e. ignore those stupid XFS people, that's an XFS bug), or
> we can face the reality that storage stacks have become so complex
> that 8k is no longer a big enough stack for a modern system....

I'm not arguing against increasing the default stack size (I really
don't have an opinion there) - but it's not a solution for the block
layer, as stacking block devices can require an unbounded amount of
stack without the generic_make_request() convert recursion-to-iteration
thing.

^ permalink raw reply	[flat|nested] 75+ messages in thread
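Kent's point about generic_make_request() converting recursion to iteration can be illustrated with a user-space sketch. All names below (`submit_rec`, `submit_iter`, `new_req`, the arena allocator) are hypothetical; the kernel keeps the deferred list in `current->bio_list`. Each stacked "driver" remaps a request one level down and resubmits it; the iterative version queues nested submissions and drains them from the outermost call, so call nesting stays constant no matter how deep the device stack is:

```c
#include <assert.h>
#include <stddef.h>

struct req { int level; struct req *next; };

static int depth, max_depth;		/* instrumented call nesting */
static void note_depth(void) { if (depth > max_depth) max_depth = depth; }

/* Naive version: every stacked driver resubmits recursively, so the C
 * stack grows with each layer of the device stack. */
static void submit_rec(struct req *r)
{
	depth++;
	note_depth();
	if (r->level > 0) {
		struct req child = { .level = r->level - 1, .next = NULL };
		submit_rec(&child);
	}
	depth--;
}

/* generic_make_request()-style version: nested submissions are queued on
 * a per-task list and drained iteratively by the outermost call. */
static struct req arena[64];		/* trivial allocator for the sketch */
static int arena_n;
static struct req *pending;		/* kernel: current->bio_list */
static int active;			/* are we inside the outermost call? */

static struct req *new_req(int level)
{
	struct req *r = &arena[arena_n++];
	r->level = level;
	r->next = NULL;
	return r;
}

static void submit_iter(struct req *r)
{
	if (active) {			/* called from a driver below us */
		r->next = pending;	/* just queue it, don't recurse */
		pending = r;
		return;
	}

	active = 1;
	pending = r;
	while (pending) {
		struct req *cur = pending;
		pending = cur->next;
		depth++;
		note_depth();
		if (cur->level > 0)
			submit_iter(new_req(cur->level - 1));
		depth--;
	}
	active = 0;
}

static int max_nesting_recursive(int levels)
{
	struct req r = { .level = levels, .next = NULL };
	depth = max_depth = 0;
	submit_rec(&r);
	return max_depth;
}

static int max_nesting_iterative(int levels)
{
	depth = max_depth = arena_n = 0;
	pending = NULL;
	active = 0;
	submit_iter(new_req(levels));
	return max_depth;
}
```

With a 20-deep device stack the recursive version nests 21 calls deep, while the iterative version never nests at all - which is exactly why removing the trick would reintroduce unbounded stack usage.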
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  2012-09-03  0:49 ` Dave Chinner
  2012-09-03  1:17 ` Kent Overstreet
@ 2012-09-04 13:54 ` Vivek Goyal
  [not found] ` <20120904135422.GC13768-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 75+ messages in thread
From: Vivek Goyal @ 2012-09-04 13:54 UTC (permalink / raw)
To: Dave Chinner
Cc: Kent Overstreet, Mikulas Patocka, linux-bcache-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	tj-DgEjT+Ai2ygdnm+yROfE0A, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

On Mon, Sep 03, 2012 at 10:49:27AM +1000, Dave Chinner wrote:
> On Thu, Aug 30, 2012 at 06:07:45PM -0400, Vivek Goyal wrote:
> > On Wed, Aug 29, 2012 at 10:13:45AM -0700, Kent Overstreet wrote:
> >
> > [..]
> > > > Performance aside, punting submission to per device worker in case of deep
> > > > stack usage sounds cleaner solution to me.
> > >
> > > Agreed, but performance tends to matter in the real world. And either
> > > way the tricky bits are going to be confined to a few functions, so I
> > > don't think it matters that much.
> > >
> > > If someone wants to code up the workqueue version and test it, they're
> > > more than welcome...
> >
> > Here is one quick and dirty proof of concept patch. It checks for stack
> > depth and if remaining space is less than 20% of stack size, then it
> > defers the bio submission to per queue worker.
>
> Given that we are working around stack depth issues in the
> filesystems already in several places, and now it seems like there's
> a reason to work around it in the block layers as well, shouldn't we
> simply increase the default stack size rather than introduce
> complexity and performance regressions to try and work around not
> having enough stack?

Dave,

In this particular instance, we really don't have any bug reports of
the stack overflowing. We are just discussing what will happen if we
make generic_make_request() recursive again.

> I mean, we can deal with it like the ia32 4k stack issue was dealt
> with (i.e. ignore those stupid XFS people, that's an XFS bug), or
> we can face the reality that storage stacks have become so complex
> that 8k is no longer a big enough stack for a modern system....

So the first question will be: what's the right stack size? If we make
generic_make_request() recursive, then at some storage stack depth we
will overflow the stack anyway (if we have created too deep a stack).
Hence keeping the current logic kind of makes sense, as in theory we can
support arbitrary depth of storage stack.

Yes, if higher layers are consuming more stack, then it does raise the
question of whether to offload work to a worker and take the performance
hit, or to increase the stack depth. I don't know the answer to that
question. I have only gone through the archives, where some people seem
to have pushed for an even smaller stack size (4K).

Thanks
Vivek

^ permalink raw reply	[flat|nested] 75+ messages in thread
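For reference, the proof-of-concept approach being debated - check remaining stack headroom and defer submission to a worker once less than ~20% is left - looks roughly like the user-space sketch below. Everything here is hypothetical (names, the 8k figure, the frame size); a real implementation would use `THREAD_SIZE` and `queue_work()`, and the pointer arithmetic is only a good-enough approximation of stack usage, not strictly portable C:

```c
#include <stddef.h>

#define STACK_SIZE	(8 * 1024)		/* pretend 8k kernel stacks */
#define STACK_RESERVE	(STACK_SIZE / 5)	/* defer below 20% headroom */

static char *stack_base;	/* recorded where the "kernel" was entered */
static int deferred;		/* submissions punted to the worker */

static size_t stack_used(void)
{
	char probe;
	/* Stacks grow down on common ABIs; subtracting addresses of
	 * unrelated locals is a sketch-quality estimate only. */
	return (size_t)(stack_base - &probe);
}

static int submit(int level)
{
	volatile char frame[512];	/* simulate one driver's stack frame;
					 * volatile keeps it on the stack */
	for (size_t i = 0; i < sizeof(frame); i++)
		frame[i] = (char)level;

	if (stack_used() > STACK_SIZE - STACK_RESERVE) {
		deferred++;	/* kernel: queue_work() to a per-queue worker */
		return 0;
	}
	if (level > 0)
		frame[1] = (char)submit(level - 1);	/* next stacked driver */
	return frame[0] + frame[1];	/* use the frame after the call */
}

static int run(int levels)
{
	char base;
	stack_base = &base;
	deferred = 0;
	(void)submit(levels);
	return deferred;
}
```

A shallow stack completes inline, while a deep one trips the threshold exactly once and hands the rest of the chain to the worker - which is also where the performance concern in this subthread comes from: every deferral costs a context switch.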
[parent not found: <20120904135422.GC13768-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  [not found] ` <20120904135422.GC13768-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2012-09-04 18:26 ` Tejun Heo
  2012-09-05  3:57 ` Dave Chinner
  0 siblings, 1 reply; 75+ messages in thread
From: Tejun Heo @ 2012-09-04 18:26 UTC (permalink / raw)
To: Vivek Goyal
Cc: Dave Chinner, Kent Overstreet, Mikulas Patocka,
	linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

Hello,

On Tue, Sep 04, 2012 at 09:54:23AM -0400, Vivek Goyal wrote:
> > Given that we are working around stack depth issues in the
> > filesystems already in several places, and now it seems like there's
> > a reason to work around it in the block layers as well, shouldn't we
> > simply increase the default stack size rather than introduce
> > complexity and performance regressions to try and work around not
> > having enough stack?
>
> Dave,
>
> In this particular instance, we really don't have any bug reports of
> stack overflowing. Just discussing what will happen if we make
> generic_make_request() recursive again.

I think there was one and that's why we added the bio_list thing.

> > I mean, we can deal with it like the ia32 4k stack issue was dealt
> > with (i.e. ignore those stupid XFS people, that's an XFS bug), or
> > we can face the reality that storage stacks have become so complex
> > that 8k is no longer a big enough stack for a modern system....
>
> So first question will be, what's the right stack size? If we make
> generic_make_request() recursive, then at some storage stack depth we will
> overflow stack anyway (if we have created too deep a stack). Hence
> keeping current logic kind of makes sense as in theory we can support
> arbitrary depth of storage stack.

But, yeah, this can't be solved by enlarging the stack size. The
upper limit is unbound.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  2012-09-04 18:26 ` Tejun Heo
@ 2012-09-05  3:57 ` Dave Chinner
  2012-09-05  4:37 ` Tejun Heo
  0 siblings, 1 reply; 75+ messages in thread
From: Dave Chinner @ 2012-09-05  3:57 UTC (permalink / raw)
To: Tejun Heo
Cc: Jens Axboe, Kent Overstreet, linux-bcache, linux-kernel, dm-devel,
	Mikulas Patocka, bharrosh, Vivek Goyal

On Tue, Sep 04, 2012 at 11:26:33AM -0700, Tejun Heo wrote:
> Hello,
>
> On Tue, Sep 04, 2012 at 09:54:23AM -0400, Vivek Goyal wrote:
> > > Given that we are working around stack depth issues in the
> > > filesystems already in several places, and now it seems like there's
> > > a reason to work around it in the block layers as well, shouldn't we
> > > simply increase the default stack size rather than introduce
> > > complexity and performance regressions to try and work around not
> > > having enough stack?
> >
> > Dave,
> >
> > In this particular instance, we really don't have any bug reports of
> > stack overflowing. Just discussing what will happen if we make
> > generic_make_request() recursive again.
>
> I think there was one and that's why we added the bio_list thing.

There was more than one - it was regular enough to be considered a
feature... :/

> > > I mean, we can deal with it like the ia32 4k stack issue was dealt
> > > with (i.e. ignore those stupid XFS people, that's an XFS bug), or
> > > we can face the reality that storage stacks have become so complex
> > > that 8k is no longer a big enough stack for a modern system....
> >
> > So first question will be, what's the right stack size? If we make
> > generic_make_request() recursive, then at some storage stack depth we will
> > overflow stack anyway (if we have created too deep a stack). Hence
> > keeping current logic kind of makes sense as in theory we can support
> > arbitrary depth of storage stack.
>
> But, yeah, this can't be solved by enlarging the stack size. The
> upper limit is unbound.

Sure, but the recursion issue is isolated to the block layer.

If we can still submit IO directly through the block layer without
pushing it off to a work queue, then the overall stack usage problem
still exists. But if the block layer always pushes the IO off into
another workqueue to avoid stack overflows, then the context
switches are going to cause significant performance regressions for
high-IOPS workloads. I don't really like either situation.

So while you are discussing stack issues, think a little about the
bigger picture outside of the immediate issue at hand - a better
solution for everyone might pop up....

Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 75+ messages in thread
* Re: [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers
  2012-09-05  3:57 ` Dave Chinner
@ 2012-09-05  4:37 ` Tejun Heo
  0 siblings, 0 replies; 75+ messages in thread
From: Tejun Heo @ 2012-09-05  4:37 UTC (permalink / raw)
To: Dave Chinner
Cc: Vivek Goyal, Kent Overstreet, Mikulas Patocka,
	linux-bcache-u79uwXL29TY76Z2rM5mHXA, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, bharrosh-C4P08NqkoRlBDgjK7y7TUQ, Jens Axboe

Hello, Dave.

On Wed, Sep 05, 2012 at 01:57:59PM +1000, Dave Chinner wrote:
> > But, yeah, this can't be solved by enlarging the stack size. The
> > upper limit is unbound.
>
> Sure, but recursion issue is isolated to the block layer.
>
> If we can still submit IO directly through the block layer without
> pushing it off to a work queue, then the overall stack usage problem
> still exists. But if the block layer always pushes the IO off into
> another workqueue to avoid stack overflows, then the context
> switches are going to cause significant performance regressions for
> high IOPS workloads. I don't really like either situation.

Kent's proposed solution doesn't do that. The rescuer work item is
used iff mempool allocation fails w/o GFP_WAIT. IOW, we're already
under severe memory pressure and stalls are expected all around the
kernel (somehow this sounds festive...) It doesn't alter the
breadth-first walk of bio decomposition and shouldn't degrade
performance in any noticeable way.

> So while you are discussing stack issues, think a little about the
> bigger picture outside of the immediate issue at hand - a better
> solution for everyone might pop up....

It's probably because I haven't been bitten much from stack overflow
but I'd like to keep thinking that stack overflows are extremely
unusual and the ones causing them are the bad ones. Thank you very
much. :p

--
tejun

^ permalink raw reply	[flat|nested] 75+ messages in thread
end of thread, other threads:[~2012-09-14 23:09 UTC | newest]
Thread overview: 75+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-28 17:37 [PATCH v7 0/9] Block cleanups, deadlock fix Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 1/9] block: Generalized bio pool freeing Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 2/9] dm: Use bioset's front_pad for dm_rq_clone_bio_info Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 3/9] block: Add bio_reset() Kent Overstreet
[not found] ` <1346175456-1572-4-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:31 ` Tejun Heo
2012-08-28 22:17 ` Kent Overstreet
[not found] ` <20120828221715.GD1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-08-28 22:53 ` Kent Overstreet
2012-09-01 2:23 ` Tejun Heo
2012-09-05 20:13 ` Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 4/9] pktcdvd: Switch to bio_kmalloc() Kent Overstreet
[not found] ` <1346175456-1572-5-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:32 ` Tejun Heo
2012-08-28 22:24 ` Kent Overstreet
[not found] ` <20120828203247.GC24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
2012-09-04 9:05 ` Jiri Kosina
[not found] ` <alpine.LRH.2.00.1209041104020.26301-1ReQVI26iDCaZKY3DrU6dA@public.gmane.org>
2012-09-05 19:44 ` Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 5/9] block: Kill bi_destructor Kent Overstreet
[not found] ` <1346175456-1572-6-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:36 ` Tejun Heo
2012-08-28 22:07 ` Kent Overstreet
[not found] ` <1346175456-1572-1-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 17:37 ` [PATCH v7 6/9] block: Consolidate bio_alloc_bioset(), bio_kmalloc() Kent Overstreet
[not found] ` <1346175456-1572-7-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:41 ` Tejun Heo
[not found] ` <20120828204148.GE24608-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
2012-08-28 22:03 ` Kent Overstreet
2012-09-01 2:17 ` Tejun Heo
2012-08-28 17:37 ` [PATCH v7 7/9] block: Add bio_clone_bioset(), bio_clone_kmalloc() Kent Overstreet
[not found] ` <1346175456-1572-8-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:44 ` Tejun Heo
2012-08-28 22:05 ` Kent Overstreet
[not found] ` <20120828220532.GB1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-09-01 2:19 ` Tejun Heo
2012-08-28 17:37 ` [PATCH v7 8/9] block: Reorder struct bio_set Kent Overstreet
2012-08-28 17:37 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Kent Overstreet
[not found] ` <1346175456-1572-10-git-send-email-koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-28 20:49 ` Tejun Heo
2012-08-28 22:28 ` Kent Overstreet
[not found] ` <20120828222800.GG1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-08-28 23:01 ` Kent Overstreet
[not found] ` <20120828230108.GI1048-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-08-29 1:31 ` Vivek Goyal
2012-08-29 3:25 ` Kent Overstreet
[not found] ` <20120829032558.GA22214-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-08-29 12:57 ` Vivek Goyal
[not found] ` <20120829125759.GB12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-08-29 14:39 ` [dm-devel] " Alasdair G Kergon
[not found] ` <20120829143913.GA5500-FDJ95KluN3Z0klwcnFlA1dvLeJWuRmrY@public.gmane.org>
2012-08-29 16:26 ` Kent Overstreet
2012-08-29 21:01 ` John Stoffel
[not found] ` <20542.33557.833272.561494-QCwP04CIH+34g5BaSWARrrNAH6kLmebB@public.gmane.org>
2012-08-29 21:08 ` Kent Overstreet
2012-08-28 22:06 ` Vivek Goyal
[not found] ` <20120828220610.GH31674-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-08-28 22:23 ` Kent Overstreet
2012-08-29 16:24 ` Mikulas Patocka
2012-08-29 16:50 ` Kent Overstreet
[not found] ` <20120829165006.GB20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-29 16:57 ` [dm-devel] " Alasdair G Kergon
2012-08-29 17:07 ` Vivek Goyal
[not found] ` <20120829170711.GC12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-08-29 17:13 ` Kent Overstreet
[not found] ` <20120829171345.GC20312-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-08-29 17:23 ` [dm-devel] " Alasdair G Kergon
2012-08-29 17:32 ` Kent Overstreet
2012-08-30 22:07 ` Vivek Goyal
[not found] ` <20120830220745.GI27257-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-08-31 1:43 ` Kent Overstreet
2012-08-31 1:55 ` Kent Overstreet
2012-08-31 15:01 ` Vivek Goyal
[not found] ` <20120831150159.GB13483-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-09-03 1:26 ` Kent Overstreet
[not found] ` <20120831014359.GB15218-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-09-03 20:41 ` Mikulas Patocka
[not found] ` <Pine.LNX.4.64.1209031638110.15620-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
2012-09-04 3:41 ` Kent Overstreet
[not found] ` <20120904034100.GA21602-jC9Py7bek1znysI04z7BkA@public.gmane.org>
2012-09-04 18:55 ` Tejun Heo
2012-09-04 19:01 ` Tejun Heo
[not found] ` <20120904190119.GD3638-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
2012-09-04 19:43 ` Kent Overstreet
2012-09-04 19:42 ` Kent Overstreet
[not found] ` <20120904194237.GC25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-09-04 21:03 ` Tejun Heo
2012-09-04 19:26 ` Mikulas Patocka
[not found] ` <Pine.LNX.4.64.1209041509160.22689-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
2012-09-04 19:39 ` Vivek Goyal
2012-09-04 19:51 ` [PATCH] dm: Use bioset's front_pad for dm_target_io Kent Overstreet
[not found] ` <20120904195156.GE25236-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-09-04 21:20 ` Tejun Heo
2012-09-11 19:28 ` [PATCH 2] " Mikulas Patocka
[not found] ` <Pine.LNX.4.64.1209111349210.23947-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
2012-09-11 19:50 ` Kent Overstreet
[not found] ` <20120911195029.GJ19739-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-09-12 22:31 ` Mikulas Patocka
[not found] ` <Pine.LNX.4.64.1209121831110.21106-e+HWlsje6Db1wF9wiOj0lkEOCMrvLtNR@public.gmane.org>
2012-09-14 23:09 ` [dm-devel] " Alasdair G Kergon
2012-09-01 2:13 ` [PATCH v7 9/9] block: Avoid deadlocks with bio allocation by stacking drivers Tejun Heo
[not found] ` <20120901021348.GB19535-RcKxWJ4Cfj1J2suj2OqeGauc2jM2gXBXkQQo+JxHRPFibQn6LdNjmg@public.gmane.org>
2012-09-03 1:34 ` [PATCH v2] " Kent Overstreet
2012-09-04 15:00 ` [PATCH v7 9/9] " Vivek Goyal
2012-09-03 0:49 ` Dave Chinner
2012-09-03 1:17 ` Kent Overstreet
2012-09-04 13:54 ` Vivek Goyal
[not found] ` <20120904135422.GC13768-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2012-09-04 18:26 ` Tejun Heo
2012-09-05 3:57 ` Dave Chinner
2012-09-05 4:37 ` Tejun Heo
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).