Linux block layer
 help / color / mirror / Atom feed
* [PATCH 3/5] blk: make the bioset rescue_workqueue optional.
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet
In-Reply-To: <148912539296.4002.219258660543808741.stgit@noble>

This patch converts bioset_create() and bioset_create_nobvec()
to not create a workqueue so alloctions will never trigger
punt_bios_to_rescuer().
It also introduces bioset_create_rescued() and bioset_create_nobvec_rescued()
which preserve the old behaviour.

*All* callers of bioset_create() and bioset_create_nobvec() are
converted to the _rescued() version, so that no change in behaviour
is experienced.

It is hoped that most, if not all, biosets can end up being the
non-rescued version.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c                         |   30 +++++++++++++++++++++++++-----
 block/blk-core.c                    |    2 +-
 drivers/block/drbd/drbd_main.c      |    2 +-
 drivers/md/bcache/super.c           |    4 ++--
 drivers/md/dm-crypt.c               |    2 +-
 drivers/md/dm-io.c                  |    2 +-
 drivers/md/dm.c                     |    5 +++--
 drivers/md/md.c                     |    2 +-
 drivers/md/raid5-cache.c            |    2 +-
 drivers/target/target_core_iblock.c |    2 +-
 fs/block_dev.c                      |    2 +-
 fs/btrfs/extent_io.c                |    4 ++--
 fs/xfs/xfs_super.c                  |    2 +-
 include/linux/bio.h                 |    2 ++
 14 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..3790c3f376b6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -362,6 +362,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	struct bio_list punt, nopunt;
 	struct bio *bio;
 
+	if (!WARN_ON_ONCE(!bs->rescue_workqueue))
+		return;
 	/*
 	 * In order to guarantee forward progress we must punt only bios that
 	 * were allocated from this bio_set; otherwise, if there was a bio on
@@ -472,7 +474,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		if (current->bio_list &&
 		    (!bio_list_empty(&current->bio_list[0]) ||
-		     !bio_list_empty(&current->bio_list[1])))
+		     !bio_list_empty(&current->bio_list[1])) &&
+		    bs->rescue_workqueue)
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
@@ -1941,7 +1944,8 @@ EXPORT_SYMBOL(bioset_free);
 
 static struct bio_set *__bioset_create(unsigned int pool_size,
 				       unsigned int front_pad,
-				       bool create_bvec_pool)
+				       bool create_bvec_pool,
+				       bool create_rescue_workqueue)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
 	struct bio_set *bs;
@@ -1972,6 +1976,9 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
 			goto bad;
 	}
 
+	if (!create_rescue_workqueue)
+		return bs;
+
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
 		goto bad;
@@ -1997,10 +2004,16 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
  */
 struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 {
-	return __bioset_create(pool_size, front_pad, true);
+	return __bioset_create(pool_size, front_pad, true, false);
 }
 EXPORT_SYMBOL(bioset_create);
 
+struct bio_set *bioset_create_rescued(unsigned int pool_size, unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, true, true);
+}
+EXPORT_SYMBOL(bioset_create_rescued);
+
 /**
  * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
  * @pool_size:	Number of bio to cache in the mempool
@@ -2012,10 +2025,17 @@ EXPORT_SYMBOL(bioset_create);
  */
 struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad)
 {
-	return __bioset_create(pool_size, front_pad, false);
+	return __bioset_create(pool_size, front_pad, false, false);
 }
 EXPORT_SYMBOL(bioset_create_nobvec);
 
+struct bio_set *bioset_create_nobvec_rescued(unsigned int pool_size,
+					     unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, false, true);
+}
+EXPORT_SYMBOL(bioset_create_nobvec_rescued);
+
 #ifdef CONFIG_BLK_CGROUP
 
 /**
@@ -2130,7 +2150,7 @@ static int __init init_bio(void)
 	bio_integrity_init();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+	fs_bio_set = bioset_create_rescued(BIO_POOL_SIZE, 0);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/block/blk-core.c b/block/blk-core.c
index fae7966e1f98..430c82f646eb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -712,7 +712,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
 	if (!q->bio_split)
 		goto fail_id;
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee..2c69c2ab0fff 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2166,7 +2166,7 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
-	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	drbd_md_io_bio_set = bioset_create_rescued(DRBD_MIN_POOL_PAGES, 0);
 	if (drbd_md_io_bio_set == NULL)
 		goto Enomem;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 85e3f21c2514..6cb30792f0ed 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -786,7 +786,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 	minor *= BCACHE_MINORS;
 
-	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	if (!(d->bio_split = bioset_create_rescued(4, offsetof(struct bbio, bio))) ||
 	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
 		ida_simple_remove(&bcache_minor, minor);
 		return -ENOMEM;
@@ -1520,7 +1520,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 				sizeof(struct bbio) + sizeof(struct bio_vec) *
 				bucket_pages(c))) ||
 	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
-	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	    !(c->bio_split = bioset_create_rescued(4, offsetof(struct bbio, bio))) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
 						WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..91a2d637d44f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1936,7 +1936,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create_rescued(MIN_IOS, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6c..fe1241c196b1 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -58,7 +58,7 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(min_ios, 0);
+	client->bios = bioset_create_rescued(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..41b1f033841f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1002,7 +1002,8 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 
 		while ((bio = bio_list_pop(&list))) {
 			struct bio_set *bs = bio->bi_pool;
-			if (unlikely(!bs) || bs == fs_bio_set) {
+			if (unlikely(!bs) || bs == fs_bio_set ||
+			    !bs->rescue_workqueue) {
 				bio_list_add(&current->bio_list[i], bio);
 				continue;
 			}
@@ -2577,7 +2578,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		BUG();
 	}
 
-	pools->bs = bioset_create_nobvec(pool_size, front_pad);
+	pools->bs = bioset_create_nobvec_rescued(pool_size, front_pad);
 	if (!pools->bs)
 		goto out;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d7d2bb51a58d..e5f08a195837 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5220,7 +5220,7 @@ int md_run(struct mddev *mddev)
 	}
 
 	if (mddev->bio_set == NULL) {
-		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		mddev->bio_set = bioset_create_rescued(BIO_POOL_SIZE, 0);
 		if (!mddev->bio_set)
 			return -ENOMEM;
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be01b10..c95c6c046395 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2831,7 +2831,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_pool)
 		goto io_pool;
 
-	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	log->bs = bioset_create_rescued(R5L_POOL_SIZE, 0);
 	if (!log->bs)
 		goto io_bs;
 
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index d316ed537d59..5bf3392195c6 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -93,7 +93,7 @@ static int iblock_configure_device(struct se_device *dev)
 		return -EINVAL;
 	}
 
-	ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0);
+	ib_dev->ibd_bio_set = bioset_create_rescued(IBLOCK_BIO_POOL_SIZE, 0);
 	if (!ib_dev->ibd_bio_set) {
 		pr_err("IBLOCK: Unable to create bioset\n");
 		goto out;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..c0ca5f0d0369 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -436,7 +436,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 static __init int blkdev_init(void)
 {
-	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+	blkdev_dio_pool = bioset_create_rescued(4, offsetof(struct blkdev_dio, bio));
 	if (!blkdev_dio_pool)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 28e81922a21c..34aa8893790a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -173,8 +173,8 @@ int __init extent_io_init(void)
 	if (!extent_buffer_cache)
 		goto free_state_cache;
 
-	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
-				     offsetof(struct btrfs_io_bio, bio));
+	btrfs_bioset = bioset_create_rescued(BIO_POOL_SIZE,
+					     offsetof(struct btrfs_io_bio, bio));
 	if (!btrfs_bioset)
 		goto free_buffer_cache;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 890862f2447c..f4c4d6f41d91 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1756,7 +1756,7 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+	xfs_ioend_bioset = bioset_create_rescued(4 * MAX_BUF_PER_PAGE,
 			offsetof(struct xfs_ioend, io_inline_bio));
 	if (!xfs_ioend_bioset)
 		goto out;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..05730603fcf1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -379,7 +379,9 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
 }
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
+extern struct bio_set *bioset_create_rescued(unsigned int, unsigned int);
 extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+extern struct bio_set *bioset_create_nobvec_rescued(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
 extern mempool_t *biovec_create_pool(int pool_entries);
 

^ permalink raw reply related

* [PATCH 2/5] blk: remove bio_set arg from blk_queue_split()
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet
In-Reply-To: <148912539296.4002.219258660543808741.stgit@noble>

blk_queue_split() is always called with the last arg being q->bio_split,
where 'q' is the first arg.

Also blk_queue_split() sometimes uses the passed-in 'bs' and sometimes uses
q->bio_split.

This is inconsistent and unnecessary.  Remove the last arg and always use
q->bio_split inside blk_queue_split()

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/blk-core.c              |    2 +-
 block/blk-merge.c             |    7 +++----
 block/blk-mq.c                |    4 ++--
 drivers/block/drbd/drbd_req.c |    2 +-
 drivers/block/pktcdvd.c       |    2 +-
 drivers/block/ps3vram.c       |    2 +-
 drivers/block/rsxx/dev.c      |    2 +-
 drivers/block/umem.c          |    2 +-
 drivers/block/zram/zram_drv.c |    2 +-
 drivers/lightnvm/rrpc.c       |    2 +-
 drivers/md/md.c               |    2 +-
 drivers/s390/block/dcssblk.c  |    2 +-
 drivers/s390/block/xpram.c    |    2 +-
 include/linux/blkdev.h        |    3 +--
 14 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d772c221cc17..fae7966e1f98 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1635,7 +1635,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio->bi_error = -EIO;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d1..ce8838aff7f7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -188,8 +188,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	return do_split ? new : NULL;
 }
 
-void blk_queue_split(struct request_queue *q, struct bio **bio,
-		     struct bio_set *bs)
+void blk_queue_split(struct request_queue *q, struct bio **bio)
 {
 	struct bio *split, *res;
 	unsigned nsegs;
@@ -197,14 +196,14 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
+		split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
 		split = NULL;
 		nsegs = (*bio)->bi_phys_segments;
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	default:
 		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a28d66..e582d7f7511e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1502,7 +1502,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
@@ -1624,7 +1624,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (!is_flush_fua && !blk_queue_nomerges(q)) {
 		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 652114ae1a8a..f6ed6f7f5ab2 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1554,7 +1554,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
 	unsigned long start_jif;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	start_jif = jiffies;
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a9..98394d034c29 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2414,7 +2414,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	pd = q->queuedata;
 	if (!pd) {
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 456b4fe21559..48072c0c1010 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -606,7 +606,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index f81d70b39d10..8c8024f616ae 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -151,7 +151,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
 	struct rsxx_bio_meta *bio_meta;
 	int st = -EINVAL;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	might_sleep();
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index c141cc3be22b..c8d8a2f16f8e 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -529,7 +529,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio)
 		 (unsigned long long)bio->bi_iter.bi_sector,
 		 bio->bi_iter.bi_size);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	spin_lock_irq(&card->lock);
 	*card->biotail = bio;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index dceb5edd1e54..49d8bf37c4ef 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -880,7 +880,7 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	blk_queue_split(queue, &bio, queue->bio_split);
+	blk_queue_split(queue, &bio);
 
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e00b1d7b976f..701fa2fafbb2 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -999,7 +999,7 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
 	struct nvm_rq *rqd;
 	int err;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_op(bio) == REQ_OP_DISCARD) {
 		rrpc_discard(rrpc, bio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 548d1b8014f8..d7d2bb51a58d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -253,7 +253,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int sectors;
 	int cpu;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 415d10a67b7a..10ece6f3c7eb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -829,7 +829,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index b9d7e755c8a3..a48f0d40c1d2 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -190,7 +190,7 @@ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long page_addr;
 	unsigned long bytes;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if ((bio->bi_iter.bi_sector & 7) != 0 ||
 	    (bio->bi_iter.bi_size & 4095) != 0)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5a7da607ca04..8f98b4e2b9e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -933,8 +933,7 @@ extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
-extern void blk_queue_split(struct request_queue *, struct bio **,
-			    struct bio_set *);
+extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,

^ permalink raw reply related

* [PATCH 1/5] blk: Ensure users for current->bio_list can see the full list.
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet
In-Reply-To: <148912539296.4002.219258660543808741.stgit@noble>

Commit 79bd99596b73 ("blk: improve order of bio handling in generic_make_request()")
changed current->bio_list so that it did not contain *all* of the
queued bios, but only those submitted by the currently running
make_request_fn.

There are two places which walk the list and requeue selected bios,
and others that check if the list is empty.  These are no longer
correct.

So redefine current->bio_list to point to an array of two lists, which
contain all queued bios, and adjust various code to test or walk both
lists.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c         |   12 +++++++++---
 block/blk-core.c    |   30 ++++++++++++++++++------------
 drivers/md/dm.c     |   29 ++++++++++++++++-------------
 drivers/md/raid10.c |    3 ++-
 4 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..e75878f8b14a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -376,10 +376,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	bio_list_init(&punt);
 	bio_list_init(&nopunt);
 
-	while ((bio = bio_list_pop(current->bio_list)))
+	while ((bio = bio_list_pop(&current->bio_list[0])))
 		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[0] = nopunt;
 
-	*current->bio_list = nopunt;
+	bio_list_init(&nopunt);
+	while ((bio = bio_list_pop(&current->bio_list[1])))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[1] = nopunt;
 
 	spin_lock(&bs->rescue_lock);
 	bio_list_merge(&bs->rescue_list, &punt);
@@ -466,7 +470,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		 * we retry with the original gfp_flags.
 		 */
 
-		if (current->bio_list && !bio_list_empty(current->bio_list))
+		if (current->bio_list &&
+		    (!bio_list_empty(&current->bio_list[0]) ||
+		     !bio_list_empty(&current->bio_list[1])))
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
diff --git a/block/blk-core.c b/block/blk-core.c
index 0eeb99ef654f..d772c221cc17 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1973,7 +1973,14 @@ generic_make_request_checks(struct bio *bio)
  */
 blk_qc_t generic_make_request(struct bio *bio)
 {
-	struct bio_list bio_list_on_stack;
+	/*
+	 * bio_list_on_stack[0] contains bios submitted by the current
+	 * make_request_fn.
+	 * bio_list_on_stack[1] contains bios that were submitted before
+	 * the current make_request_fn, but that haven't been processed
+	 * yet.
+	 */
+	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
 
 	if (!generic_make_request_checks(bio))
@@ -1990,7 +1997,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * should be added at the tail
 	 */
 	if (current->bio_list) {
-		bio_list_add(current->bio_list, bio);
+		bio_list_add(&current->bio_list[0], bio);
 		goto out;
 	}
 
@@ -2009,18 +2016,17 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
-	bio_list_init(&bio_list_on_stack);
-	current->bio_list = &bio_list_on_stack;
+	bio_list_init(&bio_list_on_stack[0]);
+	current->bio_list = bio_list_on_stack;
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
 		if (likely(blk_queue_enter(q, false) == 0)) {
-			struct bio_list hold;
 			struct bio_list lower, same;
 
 			/* Create a fresh bio_list for all subordinate requests */
-			hold = bio_list_on_stack;
-			bio_list_init(&bio_list_on_stack);
+			bio_list_on_stack[1] = bio_list_on_stack[0];
+			bio_list_init(&bio_list_on_stack[0]);
 			ret = q->make_request_fn(q, bio);
 
 			blk_queue_exit(q);
@@ -2030,19 +2036,19 @@ blk_qc_t generic_make_request(struct bio *bio)
 			 */
 			bio_list_init(&lower);
 			bio_list_init(&same);
-			while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL)
+			while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
 				if (q == bdev_get_queue(bio->bi_bdev))
 					bio_list_add(&same, bio);
 				else
 					bio_list_add(&lower, bio);
 			/* now assemble so we handle the lowest level first */
-			bio_list_merge(&bio_list_on_stack, &lower);
-			bio_list_merge(&bio_list_on_stack, &same);
-			bio_list_merge(&bio_list_on_stack, &hold);
+			bio_list_merge(&bio_list_on_stack[0], &lower);
+			bio_list_merge(&bio_list_on_stack[0], &same);
+			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 		} else {
 			bio_io_error(bio);
 		}
-		bio = bio_list_pop(current->bio_list);
+		bio = bio_list_pop(&bio_list_on_stack[0]);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f4ffd1eb8f44..dfb75979e455 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -989,26 +989,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
 	struct bio_list list;
 	struct bio *bio;
+	int i;
 
 	INIT_LIST_HEAD(&o->cb.list);
 
 	if (unlikely(!current->bio_list))
 		return;
 
-	list = *current->bio_list;
-	bio_list_init(current->bio_list);
-
-	while ((bio = bio_list_pop(&list))) {
-		struct bio_set *bs = bio->bi_pool;
-		if (unlikely(!bs) || bs == fs_bio_set) {
-			bio_list_add(current->bio_list, bio);
-			continue;
+	for (i = 0; i < 2; i++) {
+		list = current->bio_list[i];
+		bio_list_init(&current->bio_list[i]);
+
+		while ((bio = bio_list_pop(&list))) {
+			struct bio_set *bs = bio->bi_pool;
+			if (unlikely(!bs) || bs == fs_bio_set) {
+				bio_list_add(&current->bio_list[i], bio);
+				continue;
+			}
+
+			spin_lock(&bs->rescue_lock);
+			bio_list_add(&bs->rescue_list, bio);
+			queue_work(bs->rescue_workqueue, &bs->rescue_work);
+			spin_unlock(&bs->rescue_lock);
 		}
-
-		spin_lock(&bs->rescue_lock);
-		bio_list_add(&bs->rescue_list, bio);
-		queue_work(bs->rescue_workqueue, &bs->rescue_work);
-		spin_unlock(&bs->rescue_lock);
 	}
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 063c43d83b72..0536658c9d40 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf)
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     (!bio_list_empty(&current->bio_list[0]) ||
+				      !bio_list_empty(&current->bio_list[1]))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)

^ permalink raw reply related

* [PATCH 4/5] blk: use non-rescuing bioset for q->bio_split.
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet
In-Reply-To: <148912539296.4002.219258660543808741.stgit@noble>

A rescuing bioset is only useful if there might be bios from
that same bioset on the bio_list_on_stack queue at a time
when bio_alloc_bioset() is called.  This never applies to
q->bio_split.

Allocations from q->bio_split are only ever made from
blk_queue_split() which is only ever called early in each of
various make_request_fn()s.  The original bio (call this A)
is then passed to generic_make_request() and is placed on
the bio_list_on_stack queue, and the bio that was allocated
from q->bio_split (B) is processed.

The processing of this may cause other bios to be passed to
generic_make_request() or may even cause the bio B itself to
be passed, possible after some prefix has been split off
(using some other bioset).

generic_make_request() now guarantees that all of these bios
(B and dependants) will be fully processed before the tail
of the original bio A gets handled.  None of these early bios
can possible trigger an allocation from the original
q->bio_split as they are either too small to require
splitting or (more likely) are destined for a different queue.

The next time that the original q->bio_split might be used
by this thread is when A is processed again, as it might
still be too big to handle directly.  By this time there
cannot be any other bios allocated from q->bio_split in the
generic_make_request() queue.  So no rescuing will ever be
needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/blk-core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 430c82f646eb..fae7966e1f98 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -712,7 +712,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
+	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
 	if (!q->bio_split)
 		goto fail_id;
 

^ permalink raw reply related

* [PATCH 5/5] block_dev: make blkdev_dio_pool a non-rescuing bioset
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet
In-Reply-To: <148912539296.4002.219258660543808741.stgit@noble>

Allocations from blkdev_dio_pool are never made under
generic_make_request, so this bioset does not need a rescuer thread.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/block_dev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c0ca5f0d0369..2eca00ec4370 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -436,7 +436,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 static __init int blkdev_init(void)
 {
-	blkdev_dio_pool = bioset_create_rescued(4, offsetof(struct blkdev_dio, bio));
+	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
 	if (!blkdev_dio_pool)
 		return -ENOMEM;
 	return 0;

^ permalink raw reply related

* [PATCH 0/5] Updates following recent generic_make_request improvement
From: NeilBrown @ 2017-03-10  6:00 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Mike Snitzer, LKML, linux-raid,
	device-mapper development, Mikulas Patocka, Pavel Machek,
	Jack Wang, Lars Ellenberg, Kent Overstreet

This is a rebase of the series I sent earlier, based on the
very latest from Linus, which included my first patch.

The first fixes a problem that patch introduced, and so should go to
Linux promptly.
The others are more general improvements and can go in the normal
course of events.

It is possible that the changes to btrfs and xfs can just be dropped
as a subsequent patch will be needed to revert them anyway.  They are
there only to be able to say that "blk: make the bioset
rescue_workqueue optional." doesn't change any functionality at all.

Thanks,
NeilBrown

---

NeilBrown (5):
      blk: Ensure users for current->bio_list can see the full list.
      blk: remove bio_set arg from blk_queue_split()
      blk: make the bioset rescue_workqueue optional.
      blk: use non-rescuing bioset for q->bio_split.
      block_dev: make blkdev_dio_pool a non-rescuing bioset


 block/bio.c                         |   40 +++++++++++++++++++++++++++++------
 block/blk-core.c                    |   32 +++++++++++++++++-----------
 block/blk-merge.c                   |    7 +++---
 block/blk-mq.c                      |    4 ++--
 drivers/block/drbd/drbd_main.c      |    2 +-
 drivers/block/drbd/drbd_req.c       |    2 +-
 drivers/block/pktcdvd.c             |    2 +-
 drivers/block/ps3vram.c             |    2 +-
 drivers/block/rsxx/dev.c            |    2 +-
 drivers/block/umem.c                |    2 +-
 drivers/block/zram/zram_drv.c       |    2 +-
 drivers/lightnvm/rrpc.c             |    2 +-
 drivers/md/bcache/super.c           |    4 ++--
 drivers/md/dm-crypt.c               |    2 +-
 drivers/md/dm-io.c                  |    2 +-
 drivers/md/dm.c                     |   32 ++++++++++++++++------------
 drivers/md/md.c                     |    4 ++--
 drivers/md/raid10.c                 |    3 ++-
 drivers/md/raid5-cache.c            |    2 +-
 drivers/s390/block/dcssblk.c        |    2 +-
 drivers/s390/block/xpram.c          |    2 +-
 drivers/target/target_core_iblock.c |    2 +-
 fs/btrfs/extent_io.c                |    4 ++--
 fs/xfs/xfs_super.c                  |    2 +-
 include/linux/bio.h                 |    2 ++
 include/linux/blkdev.h              |    3 +--
 26 files changed, 101 insertions(+), 64 deletions(-)

--
Signature

^ permalink raw reply

* Re: [PATCH v2] blk: improve order of bio handling in generic_make_request()
From: NeilBrown @ 2017-03-10  5:19 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <58be6551-4aa7-72ee-1616-a1545606d029@kernel.dk>

[-- Attachment #1: Type: text/plain, Size: 1137 bytes --]

On Thu, Mar 09 2017, Jens Axboe wrote:

> On 03/09/2017 09:32 PM, NeilBrown wrote:
>> 
>> I started looking further at the improvements we can make once
>> generic_make_request is fixed, and realised that I had missed an
>> important detail in this patch.
>> Several places test current->bio_list, and two actually edit the list.
>> With this change, that cannot see the whole lists, so it could cause a
>> regression.
>> 
>> So I've revised the patch to make sure that the entire list of queued
>> bios remains visible, and change the relevant code to look at both
>> the new list and the old list.
>> 
>> Following that there are some patches which make the rescuer thread
>> optional, and then starts removing it from some easy-to-fix places.
>
> Neil, note that the v2 patch is already in Linus tree as of earlier
> today. You need to rebase the series, and if we need fixups on
> top of v2, then that should be done separately and with increased
> urgency.

I had checked linux-next, but not the latest from Linus.
I see it now - thanks!
I'll rebase (and ensure nothing gets mangled)

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH v2] blk: improve order of bio handling in generic_make_request()
From: Jens Axboe @ 2017-03-10  4:40 UTC (permalink / raw)
  To: NeilBrown, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <58be6551-4aa7-72ee-1616-a1545606d029@kernel.dk>

On 03/09/2017 09:38 PM, Jens Axboe wrote:
> On 03/09/2017 09:32 PM, NeilBrown wrote:
>>
>> I started looking further at the improvements we can make once
>> generic_make_request is fixed, and realised that I had missed an
>> important detail in this patch.
>> Several places test current->bio_list, and two actually edit the list.
>> With this change, that cannot see the whole lists, so it could cause a
>> regression.
>>
>> So I've revised the patch to make sure that the entire list of queued
>> bios remains visible, and change the relevant code to look at both
>> the new list and the old list.
>>
>> Following that there are some patches which make the rescuer thread
>> optional, and then starts removing it from some easy-to-fix places.
> 
> Neil, note that the v2 patch is already in Linus tree as of earlier
> today. You need to rebase the series, and if we need fixups on
> top of v2, then that should be done separately and with increased
> urgency.

Additionally, at least the first patch appears to be badly mangled.
The formatting is screwed up.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH v2] blk: improve order of bio handling in generic_make_request()
From: Jens Axboe @ 2017-03-10  4:38 UTC (permalink / raw)
  To: NeilBrown, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

On 03/09/2017 09:32 PM, NeilBrown wrote:
> 
> I started looking further at the improvements we can make once
> generic_make_request is fixed, and realised that I had missed an
> important detail in this patch.
> Several places test current->bio_list, and two actually edit the list.
> With this change, that cannot see the whole lists, so it could cause a
> regression.
> 
> So I've revised the patch to make sure that the entire list of queued
> bios remains visible, and change the relevant code to look at both
> the new list and the old list.
> 
> Following that there are some patches which make the rescuer thread
> optional, and then starts removing it from some easy-to-fix places.

Neil, note that the v2 patch is already in Linus tree as of earlier
today. You need to rebase the series, and if we need fixups on
top of v2, then that should be done separately and with increased
urgency.

-- 
Jens Axboe

^ permalink raw reply

* [PATCH 5/5] block_dev: make blkdev_dio_pool a non-rescuing bioset
From: NeilBrown @ 2017-03-10  4:37 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 727 bytes --]


Allocations from blkdev_dio_pool are never made under
generic_make_request, so this bioset does not need a rescuer thread.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/block_dev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c0ca5f0d0369..2eca00ec4370 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -436,7 +436,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 static __init int blkdev_init(void)
 {
-	blkdev_dio_pool = bioset_create_rescued(4, offsetof(struct blkdev_dio, bio));
+	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
 	if (!blkdev_dio_pool)
 		return -ENOMEM;
 	return 0;



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH 4/5] blk: use non-rescuing bioset for q->bio_split.
From: NeilBrown @ 2017-03-10  4:36 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 1967 bytes --]


A rescuing bioset is only useful if there might be bios from
that same bioset on the bio_list_on_stack queue at a time
when bio_alloc_bioset() is called.  This never applies to
q->bio_split.

Allocations from q->bio_split are only ever made from
blk_queue_split() which is only ever called early in each of
various make_request_fn()s.  The original bio (call this A)
is then passed to generic_make_request() and is placed on
the bio_list_on_stack queue, and the bio that was allocated
from q->bio_split (B) is processed.

The processing of this may cause other bios to be passed to
generic_make_request() or may even cause the bio B itself to
be passed, possible after some prefix has been split off
(using some other bioset).

generic_make_request() now guarantees that all of these bios
(B and dependants) will be fully processed before the tail
of the original bio A gets handled.  None of these early bios
can possible trigger an allocation from the original
q->bio_split as they are either too small to require
splitting or (more likely) are destined for a different queue.

The next time that the original q->bio_split might be used
by this thread is when A is processed again, as it might
still be too big to handle directly.  By this time there
cannot be and other bios allocated fro q->bio_split in the
generic_make_request() queue.  So no rescuing will ever be
needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/blk-core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c3992d17dc2c..375006c94c15 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -714,7 +714,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
+	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
 	if (!q->bio_split)
 		goto fail_id;
 



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH 3/5] blk: make the bioset rescue_workqueue optional.
From: NeilBrown @ 2017-03-10  4:35 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 11485 bytes --]


This patch converts bioset_create() and bioset_create_nobvec()
to not create a workqueue so alloctions will never trigger
punt_bios_to_rescuer().
It also introduces bioset_create_rescued() and bioset_create_nobvec_rescued()
which preserve the old behaviour.

*All* callers of bioset_create() and bioset_create_nobvec() are
converted to the _rescued() version, so that not change in behaviour
is experienced.

It is hoped that most, if not all, bioset can end up being the
non-rescued version.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c                         |   30 +++++++++++++++++++++++++-----
 block/blk-core.c                    |    2 +-
 drivers/block/drbd/drbd_main.c      |    2 +-
 drivers/md/bcache/super.c           |    4 ++--
 drivers/md/dm-crypt.c               |    2 +-
 drivers/md/dm-io.c                  |    2 +-
 drivers/md/dm.c                     |    5 +++--
 drivers/md/md.c                     |    2 +-
 drivers/md/raid5-cache.c            |    2 +-
 drivers/target/target_core_iblock.c |    2 +-
 fs/block_dev.c                      |    2 +-
 fs/btrfs/extent_io.c                |    4 ++--
 fs/xfs/xfs_super.c                  |    2 +-
 include/linux/bio.h                 |    2 ++
 14 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 84ae39f06f81..06587f1119f5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -362,6 +362,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	struct bio_list punt, nopunt;
 	struct bio *bio;
 
+	if (!WARN_ON_ONCE(!bs->rescue_workqueue))
+		return;
 	/*
 	 * In order to guarantee forward progress we must punt only bios that
 	 * were allocated from this bio_set; otherwise, if there was a bio on
@@ -471,7 +473,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		if (current->bio_list &&
 		    (!bio_list_empty(&current->bio_list[0]) ||
-		     !bio_list_empty(&current->bio_list[1])))
+		     !bio_list_empty(&current->bio_list[1])) &&
+		    bs->rescue_workqueue)
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
@@ -1940,7 +1943,8 @@ EXPORT_SYMBOL(bioset_free);
 
 static struct bio_set *__bioset_create(unsigned int pool_size,
 				       unsigned int front_pad,
-				       bool create_bvec_pool)
+				       bool create_bvec_pool,
+				       bool create_rescue_workqueue)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
 	struct bio_set *bs;
@@ -1971,6 +1975,9 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
 			goto bad;
 	}
 
+	if (!create_rescue_workqueue)
+		return bs;
+
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
 		goto bad;
@@ -1996,10 +2003,16 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
  */
 struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 {
-	return __bioset_create(pool_size, front_pad, true);
+	return __bioset_create(pool_size, front_pad, true, false);
 }
 EXPORT_SYMBOL(bioset_create);
 
+struct bio_set *bioset_create_rescued(unsigned int pool_size, unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, true, true);
+}
+EXPORT_SYMBOL(bioset_create_rescued);
+
 /**
  * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
  * @pool_size:	Number of bio to cache in the mempool
@@ -2011,10 +2024,17 @@ EXPORT_SYMBOL(bioset_create);
  */
 struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad)
 {
-	return __bioset_create(pool_size, front_pad, false);
+	return __bioset_create(pool_size, front_pad, false, false);
 }
 EXPORT_SYMBOL(bioset_create_nobvec);
 
+struct bio_set *bioset_create_nobvec_rescued(unsigned int pool_size,
+					     unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, false, true);
+}
+EXPORT_SYMBOL(bioset_create_nobvec_rescued);
+
 #ifdef CONFIG_BLK_CGROUP
 
 /**
@@ -2129,7 +2149,7 @@ static int __init init_bio(void)
 	bio_integrity_init();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+	fs_bio_set = bioset_create_rescued(BIO_POOL_SIZE, 0);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 375006c94c15..c3992d17dc2c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -714,7 +714,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
 	if (!q->bio_split)
 		goto fail_id;
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee..2c69c2ab0fff 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2166,7 +2166,7 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
-	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	drbd_md_io_bio_set = bioset_create_rescued(DRBD_MIN_POOL_PAGES, 0);
 	if (drbd_md_io_bio_set == NULL)
 		goto Enomem;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 85e3f21c2514..6cb30792f0ed 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -786,7 +786,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 	minor *= BCACHE_MINORS;
 
-	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	if (!(d->bio_split = bioset_create_rescued(4, offsetof(struct bbio, bio))) ||
 	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
 		ida_simple_remove(&bcache_minor, minor);
 		return -ENOMEM;
@@ -1520,7 +1520,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 				sizeof(struct bbio) + sizeof(struct bio_vec) *
 				bucket_pages(c))) ||
 	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
-	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+	    !(c->bio_split = bioset_create_rescued(4, offsetof(struct bbio, bio))) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
 						WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..91a2d637d44f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1936,7 +1936,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, 0);
+	cc->bs = bioset_create_rescued(MIN_IOS, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6c..fe1241c196b1 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -58,7 +58,7 @@ struct dm_io_client *dm_io_client_create(void)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(min_ios, 0);
+	client->bios = bioset_create_rescued(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..41b1f033841f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1002,7 +1002,8 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 
 		while ((bio = bio_list_pop(&list))) {
 			struct bio_set *bs = bio->bi_pool;
-			if (unlikely(!bs) || bs == fs_bio_set) {
+			if (unlikely(!bs) || bs == fs_bio_set ||
+			    !bs->rescue_workqueue) {
 				bio_list_add(&current->bio_list[i], bio);
 				continue;
 			}
@@ -2577,7 +2578,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		BUG();
 	}
 
-	pools->bs = bioset_create_nobvec(pool_size, front_pad);
+	pools->bs = bioset_create_nobvec_rescued(pool_size, front_pad);
 	if (!pools->bs)
 		goto out;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d7d2bb51a58d..e5f08a195837 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5220,7 +5220,7 @@ int md_run(struct mddev *mddev)
 	}
 
 	if (mddev->bio_set == NULL) {
-		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		mddev->bio_set = bioset_create_rescued(BIO_POOL_SIZE, 0);
 		if (!mddev->bio_set)
 			return -ENOMEM;
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be01b10..c95c6c046395 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2831,7 +2831,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_pool)
 		goto io_pool;
 
-	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	log->bs = bioset_create_rescued(R5L_POOL_SIZE, 0);
 	if (!log->bs)
 		goto io_bs;
 
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index d316ed537d59..5bf3392195c6 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -93,7 +93,7 @@ static int iblock_configure_device(struct se_device *dev)
 		return -EINVAL;
 	}
 
-	ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0);
+	ib_dev->ibd_bio_set = bioset_create_rescued(IBLOCK_BIO_POOL_SIZE, 0);
 	if (!ib_dev->ibd_bio_set) {
 		pr_err("IBLOCK: Unable to create bioset\n");
 		goto out;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..c0ca5f0d0369 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -436,7 +436,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 static __init int blkdev_init(void)
 {
-	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+	blkdev_dio_pool = bioset_create_rescued(4, offsetof(struct blkdev_dio, bio));
 	if (!blkdev_dio_pool)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 28e81922a21c..34aa8893790a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -173,8 +173,8 @@ int __init extent_io_init(void)
 	if (!extent_buffer_cache)
 		goto free_state_cache;
 
-	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
-				     offsetof(struct btrfs_io_bio, bio));
+	btrfs_bioset = bioset_create_rescued(BIO_POOL_SIZE,
+					     offsetof(struct btrfs_io_bio, bio));
 	if (!btrfs_bioset)
 		goto free_buffer_cache;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 890862f2447c..f4c4d6f41d91 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1756,7 +1756,7 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+	xfs_ioend_bioset = bioset_create_rescued(4 * MAX_BUF_PER_PAGE,
 			offsetof(struct xfs_ioend, io_inline_bio));
 	if (!xfs_ioend_bioset)
 		goto out;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..05730603fcf1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -379,7 +379,9 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
 }
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
+extern struct bio_set *bioset_create_rescued(unsigned int, unsigned int);
 extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+extern struct bio_set *bioset_create_nobvec_rescued(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
 extern mempool_t *biovec_create_pool(int pool_entries);
 



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH 2/5] blk: remove bio_set arg from blk_queue_split()
From: NeilBrown @ 2017-03-10  4:34 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 8745 bytes --]


blk_queue_split() is always called with the last arg being q->bio_split,
where 'q' is the first arg.

Also blk_queue_split() sometimes uses the passed-in 'bs' and sometimes uses
q->bio_split.

This is inconsistent and unnecessary.  Remove the last arg and always use
q->bio_split inside blk_queue_split()

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/blk-core.c              |    2 +-
 block/blk-merge.c             |    7 +++----
 block/blk-mq.c                |    4 ++--
 drivers/block/drbd/drbd_req.c |    2 +-
 drivers/block/pktcdvd.c       |    2 +-
 drivers/block/ps3vram.c       |    2 +-
 drivers/block/rsxx/dev.c      |    2 +-
 drivers/block/umem.c          |    2 +-
 drivers/block/zram/zram_drv.c |    2 +-
 drivers/lightnvm/rrpc.c       |    2 +-
 drivers/md/md.c               |    2 +-
 drivers/s390/block/dcssblk.c  |    2 +-
 drivers/s390/block/xpram.c    |    2 +-
 include/linux/blkdev.h        |    3 +--
 14 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index bd2cb4ba674e..375006c94c15 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1637,7 +1637,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio->bi_error = -EIO;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d1..ce8838aff7f7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -188,8 +188,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	return do_split ? new : NULL;
 }
 
-void blk_queue_split(struct request_queue *q, struct bio **bio,
-		     struct bio_set *bs)
+void blk_queue_split(struct request_queue *q, struct bio **bio)
 {
 	struct bio *split, *res;
 	unsigned nsegs;
@@ -197,14 +196,14 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
+		split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
 		split = NULL;
 		nsegs = (*bio)->bi_phys_segments;
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	default:
 		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b2fd175e84d7..ef7b289e873c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1502,7 +1502,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
@@ -1624,7 +1624,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (!is_flush_fua && !blk_queue_nomerges(q)) {
 		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 652114ae1a8a..f6ed6f7f5ab2 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1554,7 +1554,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
 	unsigned long start_jif;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	start_jif = jiffies;
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a9..98394d034c29 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2414,7 +2414,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	pd = q->queuedata;
 	if (!pd) {
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 456b4fe21559..48072c0c1010 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -606,7 +606,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index f81d70b39d10..8c8024f616ae 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -151,7 +151,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
 	struct rsxx_bio_meta *bio_meta;
 	int st = -EINVAL;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	might_sleep();
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index c141cc3be22b..c8d8a2f16f8e 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -529,7 +529,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio)
 		 (unsigned long long)bio->bi_iter.bi_sector,
 		 bio->bi_iter.bi_size);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	spin_lock_irq(&card->lock);
 	*card->biotail = bio;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e27d89a36c34..973dac2a9a99 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -880,7 +880,7 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	blk_queue_split(queue, &bio, queue->bio_split);
+	blk_queue_split(queue, &bio);
 
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e00b1d7b976f..701fa2fafbb2 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -999,7 +999,7 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
 	struct nvm_rq *rqd;
 	int err;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_op(bio) == REQ_OP_DISCARD) {
 		rrpc_discard(rrpc, bio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 548d1b8014f8..d7d2bb51a58d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -253,7 +253,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	unsigned int sectors;
 	int cpu;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 415d10a67b7a..10ece6f3c7eb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -829,7 +829,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index b9d7e755c8a3..a48f0d40c1d2 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -190,7 +190,7 @@ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long page_addr;
 	unsigned long bytes;
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if ((bio->bi_iter.bi_sector & 7) != 0 ||
 	    (bio->bi_iter.bi_size & 4095) != 0)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 796016e63c1d..63008a246403 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -934,8 +934,7 @@ extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
-extern void blk_queue_split(struct request_queue *, struct bio **,
-			    struct bio_set *);
+extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* [PATCH 1/5 v3] blk: improve order of bio handling in generic_make_request()
From: NeilBrown @ 2017-03-10  4:33 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87d1dphhuy.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 9493 bytes --]


To avoid recursion on the kernel stack when stacked block devices
are in use, generic_make_request() will, when called recursively,
queue new requests for later handling.  They will be handled when the
make_request_fn for the current bio completes.

If any bios are submitted by a make_request_fn, these will ultimately
be handled seqeuntially.  If the handling of one of those generates
further requests, they will be added to the end of the queue.

This strict first-in-first-out behaviour can lead to deadlocks in
various ways, normally because a request might need to wait for a
previous request to the same device to complete.  This can happen when
they share a mempool, and can happen due to interdependencies
particular to the device.  Both md and dm have examples where this happens.

These deadlocks can be erradicated by more selective ordering of bios.
Specifically by handling them in depth-first order.  That is: when the
handling of one bio generates one or more further bios, they are
handled immediately after the parent, before any siblings of the
parent.  That way, when generic_make_request() calls make_request_fn
for some particular device, we can be certain that all previously
submited requests for that device have been completely handled and are
not waiting for anything in the queue of requests maintained in
generic_make_request().

An easy way to achieve this would be to use a last-in-first-out stack
instead of a queue.  However this will change the order of consecutive
bios submitted by a make_request_fn, which could have unexpected consequences.
Instead we take a slightly more complex approach.
A fresh queue is created for each call to a make_request_fn.  After it completes,
any bios for a different device are placed on the front of the main queue, followed
by any bios for the same device, followed by all bios that were already on
the queue before the make_request_fn was called.
This provides the depth-first approach without reordering bios on the same level.

This, by itself, it not enough to remove all deadlocks.  It just makes
it possible for drivers to take the extra step required themselves.

To avoid deadlocks, drivers must never risk waiting for a request
after submitting one to generic_make_request.  This includes never
allocing from a mempool twice in the one call to a make_request_fn.

A common pattern in drivers is to call bio_split() in a loop, handling
the first part and then looping around to possibly split the next part.
Instead, a driver that finds it needs to split a bio should queue
(with generic_make_request) the second part, handle the first part,
and then return.  The new code in generic_make_request will ensure the
requests to underlying bios are processed first, then the second bio
that was split off.  If it splits again, the same process happens.  In
each case one bio will be completely handled before the next one is attempted.

With this is place, it should be possible to disable the
punt_bios_to_recover() recovery thread for many block devices, and
eventually it may be possible to remove it completely.

Note that as some drivers look inside the bio_list, sometimes to punt
some bios to rescuer threads, we need to make both the pendind list and the
new list visible.  So current->bio_list is now an array of 2 lists,
and relevant code examines both of them.

Ref: http://www.spinics.net/lists/raid/msg54680.html
Tested-by: Jinpu Wang <jinpu.wang@profitbricks.com>
Inspired-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c         |   11 ++++++++---
 block/blk-core.c    |   40 ++++++++++++++++++++++++++++++++--------
 drivers/md/dm.c     |   29 ++++++++++++++++-------------
 drivers/md/raid10.c |    3 ++-
 4 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..84ae39f06f81 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -376,10 +376,13 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	bio_list_init(&punt);
 	bio_list_init(&nopunt);
 
-	while ((bio = bio_list_pop(current->bio_list)))
+	while ((bio = bio_list_pop(&current->bio_list[0])))
 		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[0] = nopunt;
 
-	*current->bio_list = nopunt;
+	while ((bio = bio_list_pop(&current->bio_list[1])))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[1] = nopunt;
 
 	spin_lock(&bs->rescue_lock);
 	bio_list_merge(&bs->rescue_list, &punt);
@@ -466,7 +469,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		 * we retry with the original gfp_flags.
 		 */
 
-		if (current->bio_list && !bio_list_empty(current->bio_list))
+		if (current->bio_list &&
+		    (!bio_list_empty(&current->bio_list[0]) ||
+		     !bio_list_empty(&current->bio_list[1])))
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
diff --git a/block/blk-core.c b/block/blk-core.c
index 1086dac8724c..bd2cb4ba674e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1975,7 +1975,14 @@ generic_make_request_checks(struct bio *bio)
  */
 blk_qc_t generic_make_request(struct bio *bio)
 {
-	struct bio_list bio_list_on_stack;
+	/*
+	 * bio_list_on_stack[0] contains bios submitted by the current
+	 * make_request_fn.
+	 * bio_list_on_stack[1] contains bios that were submitted before
+	 * the current make_request_fn, but that haven't been processed
+	 * yet.
+	 */
+	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
 
 	if (!generic_make_request_checks(bio))
@@ -1992,7 +1999,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * should be added at the tail
 	 */
 	if (current->bio_list) {
-		bio_list_add(current->bio_list, bio);
+		bio_list_add(&current->bio_list[0], bio);
 		goto out;
 	}
 
@@ -2011,23 +2018,40 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
-	bio_list_init(&bio_list_on_stack);
-	current->bio_list = &bio_list_on_stack;
+	bio_list_init(&bio_list_on_stack[0]);
+	bio_list_init(&bio_list_on_stack[1]);
+	current->bio_list = bio_list_on_stack;
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
 		if (likely(blk_queue_enter(q, false) == 0)) {
+			struct bio_list lower, same;
+
+			/* Create a fresh bio_list for all subordinate requests */
+			bio_list_on_stack[1] = bio_list_on_stack[0];
+			bio_list_init(&bio_list_on_stack[0]);
 			ret = q->make_request_fn(q, bio);
 
 			blk_queue_exit(q);
 
-			bio = bio_list_pop(current->bio_list);
+			/* sort new bios into those for a lower level
+			 * and those for the same level
+			 */
+			bio_list_init(&lower);
+			bio_list_init(&same);
+			while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+				if (q == bdev_get_queue(bio->bi_bdev))
+					bio_list_add(&same, bio);
+				else
+					bio_list_add(&lower, bio);
+			/* now assemble so we handle the lowest level first */
+			bio_list_merge(&bio_list_on_stack[0], &lower);
+			bio_list_merge(&bio_list_on_stack[0], &same);
+			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 		} else {
-			struct bio *bio_next = bio_list_pop(current->bio_list);
-
 			bio_io_error(bio);
-			bio = bio_next;
 		}
+		bio = bio_list_pop(&bio_list_on_stack[0]);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f4ffd1eb8f44..dfb75979e455 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -989,26 +989,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
 	struct bio_list list;
 	struct bio *bio;
+	int i;
 
 	INIT_LIST_HEAD(&o->cb.list);
 
 	if (unlikely(!current->bio_list))
 		return;
 
-	list = *current->bio_list;
-	bio_list_init(current->bio_list);
-
-	while ((bio = bio_list_pop(&list))) {
-		struct bio_set *bs = bio->bi_pool;
-		if (unlikely(!bs) || bs == fs_bio_set) {
-			bio_list_add(current->bio_list, bio);
-			continue;
+	for (i = 0; i < 2; i++) {
+		list = current->bio_list[i];
+		bio_list_init(&current->bio_list[i]);
+
+		while ((bio = bio_list_pop(&list))) {
+			struct bio_set *bs = bio->bi_pool;
+			if (unlikely(!bs) || bs == fs_bio_set) {
+				bio_list_add(&current->bio_list[i], bio);
+				continue;
+			}
+
+			spin_lock(&bs->rescue_lock);
+			bio_list_add(&bs->rescue_list, bio);
+			queue_work(bs->rescue_workqueue, &bs->rescue_work);
+			spin_unlock(&bs->rescue_lock);
 		}
-
-		spin_lock(&bs->rescue_lock);
-		bio_list_add(&bs->rescue_list, bio);
-		queue_work(bs->rescue_workqueue, &bs->rescue_work);
-		spin_unlock(&bs->rescue_lock);
 	}
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 063c43d83b72..0536658c9d40 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf)
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     (!bio_list_empty(&current->bio_list[0]) ||
+				      !bio_list_empty(&current->bio_list[1]))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* Re: [PATCH v2] blk: improve order of bio handling in generic_make_request()
From: NeilBrown @ 2017-03-10  4:32 UTC (permalink / raw)
  To: Jens Axboe, Jack Wang
  Cc: LKML, Lars Ellenberg, Kent Overstreet, Pavel Machek, Mike Snitzer,
	Mikulas Patocka, linux-raid, device-mapper development,
	linux-block
In-Reply-To: <87r328j00i.fsf@notabene.neil.brown.name>

[-- Attachment #1: Type: text/plain, Size: 2343 bytes --]


I started looking further at the improvements we can make once
generic_make_request is fixed, and realised that I had missed an
important detail in this patch.
Several places test current->bio_list, and two actually edit the list.
With this change, that cannot see the whole lists, so it could cause a
regression.

So I've revised the patch to make sure that the entire list of queued
bios remains visible, and change the relevant code to look at both
the new list and the old list.

Following that there are some patches which make the rescuer thread
optional, and then starts removing it from some easy-to-fix places.

The series summary is below.

NeilBrown


NeilBrown (5):
      blk: improve order of bio handling in generic_make_request()
      blk: remove bio_set arg from blk_queue_split()
      blk: make the bioset rescue_workqueue optional.
      blk: use non-rescuing bioset for q->bio_split.
      block_dev: make blkdev_dio_pool a non-rescuing bioset


 block/bio.c                         |   39 +++++++++++++++++++++++++++------
 block/blk-core.c                    |   42 ++++++++++++++++++++++++++++-------
 block/blk-merge.c                   |    7 +++---
 block/blk-mq.c                      |    4 ++-
 drivers/block/drbd/drbd_main.c      |    2 +-
 drivers/block/drbd/drbd_req.c       |    2 +-
 drivers/block/pktcdvd.c             |    2 +-
 drivers/block/ps3vram.c             |    2 +-
 drivers/block/rsxx/dev.c            |    2 +-
 drivers/block/umem.c                |    2 +-
 drivers/block/zram/zram_drv.c       |    2 +-
 drivers/lightnvm/rrpc.c             |    2 +-
 drivers/md/bcache/super.c           |    4 ++-
 drivers/md/dm-crypt.c               |    2 +-
 drivers/md/dm-io.c                  |    2 +-
 drivers/md/dm.c                     |   32 +++++++++++++++------------
 drivers/md/md.c                     |    4 ++-
 drivers/md/raid10.c                 |    3 ++-
 drivers/md/raid5-cache.c            |    2 +-
 drivers/s390/block/dcssblk.c        |    2 +-
 drivers/s390/block/xpram.c          |    2 +-
 drivers/target/target_core_iblock.c |    2 +-
 fs/btrfs/extent_io.c                |    4 ++-
 fs/xfs/xfs_super.c                  |    2 +-
 include/linux/bio.h                 |    2 ++
 include/linux/blkdev.h              |    3 +--
 26 files changed, 114 insertions(+), 60 deletions(-)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH rfc 04/10] block: Add a non-selective polling interface
From: Damien Le Moal @ 2017-03-10  3:04 UTC (permalink / raw)
  To: Johannes Thumshirn, Sagi Grimberg, linux-block, linux-nvme,
	linux-rdma, target-devel
In-Reply-To: <83713ef7-168d-e1f6-8220-b7d6264ea29a@suse.de>

On 3/9/17 22:44, Johannes Thumshirn wrote:
> On 03/09/2017 02:16 PM, Sagi Grimberg wrote:
>> For a server/target appliance mode where we don't
>> necessarily care about specific IOs but rather want
>> to poll opportunisticly, it is useful to have a
>> non-selective polling interface.
>>
>> Expose a blk_poll_batch for a batched blkdev polling
>> interface so our nvme target (and others) can use.
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>  block/blk-mq.c         | 14 ++++++++++++++
>>  include/linux/blk-mq.h |  2 ++
>>  include/linux/blkdev.h |  1 +
>>  3 files changed, 17 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index b2fd175e84d7..1962785b571a 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2911,6 +2911,20 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
>>  }
>>  EXPORT_SYMBOL_GPL(blk_mq_poll);
>>  
>> +int blk_mq_poll_batch(struct request_queue *q, unsigned int batch)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +
>> +	if (!q->mq_ops || !q->mq_ops->poll_batch)
>> +		return 0;
>> +
>> +	hctx = blk_mq_map_queue(q, smp_processor_id());
>> +	return q->mq_ops->poll_batch(hctx, batch);
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_poll_batch);
>> +
>> +
>> +
> 
> Quite some additional newlines and I'm not really fond of the
> ->poll_batch() name. It's a bit confusing with ->poll() and we also have
> irq_poll(). But the only thing that would come to my mind is
> complete_batch() which "races" with ->complete().

What about ->check_completions()? After all, that is what both ->poll()
and ->poll_batch do but with a different stop condition, no ?
So it would also be easy to merge the two: both tag and batch are
unsigned int which could be called "cookie", and add a flag to tell how
to interpret it (as a tag or a batch limit).
e.g. something like:

+typedef int (check_completions_fn)(struct blk_mq_hw_ctx *,
		enum blk_mq_check_flags, /* flag (TAG or BATCH) */
		unsigned int); /* Target tag or batch limit */

Best regards.

-- 
Damien Le Moal, Ph.D.
Sr. Manager, System Software Research Group,
Western Digital Corporation
Damien.LeMoal@wdc.com
(+81) 0466-98-3593 (ext. 513593)
1 kirihara-cho, Fujisawa,
Kanagawa, 252-0888 Japan
www.wdc.com, www.hgst.com

^ permalink raw reply

* Re: [PATCH 2/2] blk-mq: start to freeze queue just after setting dying
From: Ming Lei @ 2017-03-10  2:16 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, axboe@fb.com, yizhan@redhat.com,
	tj@kernel.org
In-Reply-To: <1489078694.2597.5.camel@sandisk.com>

On Fri, Mar 10, 2017 at 12:58 AM, Bart Van Assche
<Bart.VanAssche@sandisk.com> wrote:
> On Thu, 2017-03-09 at 21:02 +0800, Ming Lei wrote:
>> Before commit 780db2071a(blk-mq: decouble blk-mq freezing
>> from generic bypassing), the dying flag is checked before
>> entering queue, and Tejun converts the checking into .mq_freeze_depth,
>> and assumes the counter is increased just after dying flag
>> is set. Unfortunately we doesn't do that in blk_set_queue_dying().
>>
>> This patch calls blk_mq_freeze_queue_start() for blk-mq in
>> blk_set_queue_dying(), so that we can block new I/O coming
>> once the queue is set as dying.
>>
>> Given blk_set_queue_dying() is always called in remove path
>> of block device, and queue will be cleaned up later, we don't
>> need to worry about undo of the counter.
>>
>> Cc: Tejun Heo <tj@kernel.org>
>> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
>> ---
>>  block/blk-core.c | 7 +++++--
>>  1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 0eeb99ef654f..559487e58296 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -500,9 +500,12 @@ void blk_set_queue_dying(struct request_queue *q)
>>       queue_flag_set(QUEUE_FLAG_DYING, q);
>>       spin_unlock_irq(q->queue_lock);
>>
>> -     if (q->mq_ops)
>> +     if (q->mq_ops) {
>>               blk_mq_wake_waiters(q);
>> -     else {
>> +
>> +             /* block new I/O coming */
>> +             blk_mq_freeze_queue_start(q);
>> +     } else {
>>               struct request_list *rl;
>>
>>               spin_lock_irq(q->queue_lock);
>
> The comment above blk_mq_freeze_queue_start() should explain more clearly
> why that call is needed. Additionally, I think this patch makes the

The comment of "block new I/O coming" has been added, and let me know what
others are needed, :-)

> blk_freeze_queue() call in blk_cleanup_queue() superfluous. How about the
> (entirely untested) patch below?

I don't think we need to wait in blk_set_queue_dying(), and the purpose
of this patch is to block new I/O coming once dying iset as pointed in the
comment, and the change in blk_cleanup_queue() isn't necessary too, since
that is exactly where we should drain the queue.


Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH 06/16] mmc: core: replace waitqueue with worker
From: Linus Walleij @ 2017-03-09 22:49 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: linux-mmc@vger.kernel.org, Ulf Hansson, Paolo Valente,
	Chunyan Zhang, Baolin Wang, linux-block, Jens Axboe,
	Christoph Hellwig, Arnd Bergmann
In-Reply-To: <00989e26-cdb9-48d7-2e46-ae6ef66e59a7@intel.com>

On Wed, Feb 22, 2017 at 2:29 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 09/02/17 17:33, Linus Walleij wrote:
>> The waitqueue in the host context is there to signal back from
>> mmc_request_done() through mmc_wait_data_done() that the hardware
>> is done with a command, and when the wait is over, the core
>> will typically submit the next asynchronous request that is pending
>> just waiting for the hardware to be available.
>>
>> This is in the way for letting the mmc_request_done() trigger the
>> report up to the block layer that a block request is finished.
>>
>> Re-jig this as a first step, remvoving the waitqueue and introducing
>> a work that will run after a completed asynchronous request,
>> finalizing that request, including retransmissions, and eventually
>> reporting back with a completion and a status code to the
>> asynchronous issue method.
>>
>> This had the upside that we can remove the MMC_BLK_NEW_REQUEST
>> status code and the "new_request" state in the request queue
>> that is only there to make the state machine spin out
>> the first time we send a request.
>>
>> Introduce a workqueue in the host for handling just this, and
>> then a work and completion in the asynchronous request to deal
>> with this mechanism.
>>
>> This is a central change that let us do many other changes since
>> we have broken the submit and complete code paths in two, and we
>> can potentially remove the NULL flushing of the asynchronous
>> pipeline and report block requests as finished directly from
>> the worker.
>
> This needs more thought.  The completion should go straight to the mmc block
> driver from the ->done() callback.  And from there straight back to the
> block layer if recovery is not needed.  We want to stop using
> mmc_start_areq() altogether because we never want to wait - we always want
> to issue (if possible) and return.

I don't quite follow this. Isn't what you request exactly what
patch 15/16 "mmc: queue: issue requests in massive parallel"
is doing?

The whole patch series leads up to that.

> The core API to use is __mmc_start_req() but the block driver should
> populate mrq->done with its own handler. i.e. change __mmc_start_req()
>
> -       mrq->done = mmc_wait_done;
> +       if (!mrq->done)
> +               mrq->done = mmc_wait_done;
>
> mrq->done() would complete the request (e.g. via blk_complete_request()) if
> it has no errors (and doesn't need polling), and wake up the queue thread to
> finish up everything else and start the next request.

I think this is what it does at the end of the patch series, patch 15/16.
I have to split it somehow...

> For the blk-mq port, the queue thread should also be retained, partly
> because it solves some synchronization problems, but mostly because, at this
> stage, we anyway don't have solutions for all the different ways the driver
> can block.
> (as listed here https://marc.info/?l=linux-mmc&m=148336571720463&w=2 )

Essentially I take out that thread and replace it with this one worker
introduced in this very patch. I agree the driver can block in many ways
and that is why I need to have it running in process context, and this
is what the worker introduced here provides.

Maybe I'm getting something backwards, sorry then :/

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH RFC 22/39] mmc: block: Prepare CQE data
From: Linus Walleij @ 2017-03-09 22:39 UTC (permalink / raw)
  To: Adrian Hunter, linux-block, Paolo Valente, Jens Axboe
  Cc: Ulf Hansson, linux-mmc, Alex Lemberg, Mateusz Nowak,
	Yuliy Izrailov, Jaehoon Chung, Dong Aisheng, Das Asutosh,
	Zhangfei Gao, Dorfman Konstantin, David Griego, Sahitya Tummala,
	Harjani Ritesh, Venu Byravarasu
In-Reply-To: <e31765ec-3453-9c40-1710-bea881cd9dec@intel.com>

On Fri, Mar 3, 2017 at 1:22 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 15/02/17 15:49, Linus Walleij wrote:
>> On Fri, Feb 10, 2017 at 1:55 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
>>
>>> Enhance mmc_blk_data_prep() to support CQE requests.
>>>
>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>
>> Hey:
>>
>>> +#include <linux/ioprio.h>
>> (...)
>>> +       if (IOPRIO_PRIO_CLASS(req_get_ioprio(req)) == IOPRIO_CLASS_RT)
>>> +               brq->data.flags |= MMC_DATA_PRIO;
>>
>> What is this?
>
> It is the command queue priority.
>
> The command queue supports 2 priorities: "high" (1) and "simple" (0).  The
> eMMC will give "high" priority tasks priority over "simple" priority tasks.
>
> So here we give priority to IOPRIO_CLASS_RT which seems appropriate.

So if I understand correctly, you are obtaining the block layer scheduling
priorities, that can (only?) be set by ionice has from the command line?

We need to discuss this with the block maintainers.

I'm not so sure about the future of this. The IOPRIO is only used with the CFQ
scheduler, only two other sites in the kernel use this and MQ and its schedulers
surely does not have ionice handling as far as I know.

The BFQ does not use it, AFAIK it is using different heuristics to prioritize
block traffic, and that does not include using ionice hints.

Is ionice:ing something we're really going to do going forward?
Should this be repurposed so that the block scheduler use this prio to
communicate to the driver layer to prioritize certain traffic?

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH v5] blkcg: allocate struct blkcg_gq outside request queue spinlock
From: Tejun Heo @ 2017-03-09 18:27 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jens Axboe, linux-block, David Rientjes, linux-kernel
In-Reply-To: <20170309080531.9048-1-tahsin@google.com>

On Thu, Mar 09, 2017 at 12:05:31AM -0800, Tahsin Erdogan wrote:
> blkg_conf_prep() currently calls blkg_lookup_create() while holding
> request queue spinlock. This means allocating memory for struct
> blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
> failures in call paths like below:
> 
>   pcpu_alloc+0x68f/0x710
>   __alloc_percpu_gfp+0xd/0x10
>   __percpu_counter_init+0x55/0xc0
>   cfq_pd_alloc+0x3b2/0x4e0
>   blkg_alloc+0x187/0x230
>   blkg_create+0x489/0x670
>   blkg_lookup_create+0x9a/0x230
>   blkg_conf_prep+0x1fb/0x240
>   __cfqg_set_weight_device.isra.105+0x5c/0x180
>   cfq_set_weight_on_dfl+0x69/0xc0
>   cgroup_file_write+0x39/0x1c0
>   kernfs_fop_write+0x13f/0x1d0
>   __vfs_write+0x23/0x120
>   vfs_write+0xc2/0x1f0
>   SyS_write+0x44/0xb0
>   entry_SYSCALL_64_fastpath+0x18/0xad
> 
> In the code path above, percpu allocator cannot call vmalloc() due to
> queue spinlock.
> 
> A failure in this call path gives grief to tools which are trying to
> configure io weights. We see occasional failures happen shortly after
> reboots even when system is not under any memory pressure. Machines
> with a lot of cpus are more vulnerable to this condition.
> 
> Update blkg_create() function to temporarily drop the rcu and queue
> locks when it is allowed by gfp mask.
> 
> Suggested-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Tahsin Erdogan <tahsin@google.com>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 2/2] blk-mq: start to freeze queue just after setting dying
From: Bart Van Assche @ 2017-03-09 16:58 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, hch@infradead.org,
	linux-block@vger.kernel.org, tom.leiming@gmail.com, axboe@fb.com
  Cc: yizhan@redhat.com, tj@kernel.org
In-Reply-To: <1489064578-17305-4-git-send-email-tom.leiming@gmail.com>

On Thu, 2017-03-09 at 21:02 +0800, Ming Lei wrote:
> Before commit 780db2071a(blk-mq: decouble blk-mq freezing
> from generic bypassing), the dying flag is checked before
> entering queue, and Tejun converts the checking into .mq_freeze_depth,
> and assumes the counter is increased just after dying flag
> is set. Unfortunately we doesn't do that in blk_set_queue_dying().
>=20
> This patch calls blk_mq_freeze_queue_start() for blk-mq in
> blk_set_queue_dying(), so that we can block new I/O coming
> once the queue is set as dying.
>=20
> Given blk_set_queue_dying() is always called in remove path
> of block device, and queue will be cleaned up later, we don't
> need to worry about undo of the counter.
>=20
> Cc: Tejun Heo <tj@kernel.org>
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  block/blk-core.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>=20
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0eeb99ef654f..559487e58296 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -500,9 +500,12 @@ void blk_set_queue_dying(struct request_queue *q)
>  	queue_flag_set(QUEUE_FLAG_DYING, q);
>  	spin_unlock_irq(q->queue_lock);
> =20
> -	if (q->mq_ops)
> +	if (q->mq_ops) {
>  		blk_mq_wake_waiters(q);
> -	else {
> +
> +		/* block new I/O coming */
> +		blk_mq_freeze_queue_start(q);
> +	} else {
>  		struct request_list *rl;
> =20
>  		spin_lock_irq(q->queue_lock);

The comment above blk_mq_freeze_queue_start() should explain more clearly
why that call is needed. Additionally, I think this patch makes the
blk_freeze_queue() call in blk_cleanup_queue() superfluous. How about the
(entirely untested) patch below?

diff --git a/block/blk-core.c b/block/blk-core.c
index 1086dac8724c..3ce48f2d65cf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -500,6 +500,12 @@ void blk_set_queue_dying(struct request_queue *q)
=A0	queue_flag_set(QUEUE_FLAG_DYING, q);
=A0	spin_unlock_irq(q->queue_lock);
=A0
+	/*
+	=A0* Force blk_queue_enter() and blk_mq_queue_enter() to check the
+	=A0* "dying" flag.
+	=A0*/
+	blk_freeze_queue(q);
+
=A0	if (q->mq_ops)
=A0		blk_mq_wake_waiters(q);
=A0	else {
@@ -555,7 +561,7 @@ void blk_cleanup_queue(struct request_queue *q)
=A0	=A0* Drain all requests queued before DYING marking. Set DEAD flag to
=A0	=A0* prevent that q->request_fn() gets invoked after draining finished.
=A0	=A0*/
-	blk_freeze_queue(q);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
=A0	spin_lock_irq(lock);
=A0	if (!q->mq_ops)
=A0		__blk_drain_queue(q, true);

Thanks,

Bart.=

^ permalink raw reply related

* Re: [PATCH 0/4 v2] block: Fixes for bdi handling
From: Jens Axboe @ 2017-03-09 16:41 UTC (permalink / raw)
  To: Jan Kara
  Cc: Tejun Heo, linux-block, Dan Williams, Omar Sandoval, Arthur Marsh,
	linux-scsi
In-Reply-To: <20170309101624.25901-1-jack@suse.cz>

On 03/09/2017 03:16 AM, Jan Kara wrote:
> Hi!
> 
> this is a second revision of the series fixing the most urgent bugs that were
> introduced by commit 165a5e22fafb "block: Move bdi_unregister() to
> del_gendisk()" and by 0dba1314d4f8 "scsi, block: fix duplicate bdi name
> registration crashes".  In fact before these commits we had a different set of
> problems in the code but they were less visible :).

It was rather urgent to get those fixes in, so I already sent them off. Not a
huge deal, but it would be nice to add the atomic init fix as a separate patch
later on.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 1/4] block: Allow bdi re-registration
From: Tejun Heo @ 2017-03-09 16:39 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jens Axboe, linux-block, Dan Williams, Omar Sandoval,
	Arthur Marsh, linux-scsi
In-Reply-To: <20170309101624.25901-2-jack@suse.cz>

On Thu, Mar 09, 2017 at 11:16:21AM +0100, Jan Kara wrote:
> SCSI can call device_add_disk() several times for one request queue when
> a device in unbound and bound, creating new gendisk each time. This will
> lead to bdi being repeatedly registered and unregistered. This was not a
> big problem until commit 165a5e22fafb "block: Move bdi_unregister() to
> del_gendisk()" since bdi was only registered repeatedly (bdi_register()
> handles repeated calls fine, only we ended up leaking reference to
> gendisk due to overwriting bdi->owner) but unregistered only in
> blk_cleanup_queue() which didn't get called repeatedly. After
> 165a5e22fafb we were doing correct bdi_register() - bdi_unregister()
> cycles however bdi_unregister() is not prepared for it. So make sure
> bdi_unregister() cleans up bdi in such a way that it is prepared for
> a possible following bdi_register() call.
> 
> An easy way to provoke this behavior is to enable
> CONFIG_DEBUG_TEST_DRIVER_REMOVE and use scsi_debug driver to create a
> scsi disk which immediately hangs without this fix.
> 
> Fixes: 165a5e22fafb127ecb5914e12e8c32a1f0d3f820
> Tested-by: Omar Sandoval <osandov@fb.com>
> Signed-off-by: Jan Kara <jack@suse.cz>

Acked-by: Tejun Heo <tj@kernel.org>

Thanks!

-- 
tejun

^ permalink raw reply

* Re: [PATCH rfc 06/10] IB/cq: Don't force IB_POLL_DIRECT poll context for ib_process_cq_direct
From: Bart Van Assche @ 2017-03-09 16:30 UTC (permalink / raw)
  To: linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-nvme@lists.infradead.org, target-devel@vger.kernel.org,
	sagi@grimberg.me
In-Reply-To: <1489065402-14757-7-git-send-email-sagi@grimberg.me>

On Thu, 2017-03-09 at 15:16 +0200, Sagi Grimberg wrote:
> polling the completion queue directly does not interfere
> with the existing polling logic, hence drop the requirement.
> 
> This can be used for polling mode ULPs.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  drivers/infiniband/core/cq.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> index 21d1a38af489..7f6ae0ecb0c5 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -64,8 +64,6 @@ static int __ib_process_cq(struct ib_cq *cq, int budget)
>   */
>  int ib_process_cq_direct(struct ib_cq *cq, int budget)
>  {
> -	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
> -
>  	return __ib_process_cq(cq, budget);
>  }
>  EXPORT_SYMBOL(ib_process_cq_direct);

Using ib_process_cq_direct() for queues that have another type than
IB_POLL_DIRECT may trigger concurrent CQ processing. Before this patch
the completions from each CQ were processed sequentially. That's a big
change so I think this should be mentioned clearly in the header above
ib_process_cq_direct().

Bart.
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply

* Re: [PATCH rfc 04/10] block: Add a non-selective polling interface
From: Bart Van Assche @ 2017-03-09 16:25 UTC (permalink / raw)
  To: linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-nvme@lists.infradead.org, target-devel@vger.kernel.org,
	sagi@grimberg.me
In-Reply-To: <1489065402-14757-5-git-send-email-sagi@grimberg.me>

On Thu, 2017-03-09 at 15:16 +0200, Sagi Grimberg wrote:
> +int blk_mq_poll_batch(struct request_queue *q, unsigned int batch)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +
> +	if (!q->mq_ops || !q->mq_ops->poll_batch)
> +		return 0;
> +
> +	hctx =3D blk_mq_map_queue(q, smp_processor_id());
> +	return q->mq_ops->poll_batch(hctx, batch);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_poll_batch);

A new exported function without any documentation? Wow. Please add a header
above this function that documents at least which other completion processi=
ng
code can execute concurrently with this function and from which contexts th=
e
other completion processing code can be called (e.g. blk_mq_poll() and
.complete()).

Why to return if (!q->mq_ops || !q->mq_ops->poll_batch)? Shouldn't that be =
a
WARN_ON_ONCE() instead? I think it is an error to calling blk_mq_poll_batch=
()
against a queue that does not define .poll_batch().

Additionally, I think making the hardware context an argument of this funct=
ion
instead of using blk_mq_map_queue(q, smp_processor_id()) would make this
function much more versatile.

Bart.=

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox