public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Christoph Hellwig <hch@lst.de>, Jens Axboe <axboe@kernel.dk>,
	"Martin K . Petersen" <martin.petersen@oracle.com>
Cc: Sagi Grimberg <sagi@grimberg.me>,
	linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
	linux-scsi@vger.kernel.org, Keith Busch <kbusch@kernel.org>,
	Ming Lei <ming.lei@redhat.com>
Subject: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
Date: Fri, 19 Nov 2021 10:18:45 +0800	[thread overview]
Message-ID: <20211119021849.2259254-2-ming.lei@redhat.com> (raw)
In-Reply-To: <20211119021849.2259254-1-ming.lei@redhat.com>

In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch
critical area. However, this srcu instance stays at the end of hctx, and
it often takes standalone cacheline, often cold.

Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on
the indirect percpu variable which is allocated from heap instead of
being embedded, srcu->srcu_idx is read only in srcu_read_lock(). It
doesn't matter if srcu structure stays in hctx or request queue.

So switch to per-request-queue srcu for protecting dispatch, and this
way simplifies quiesce a lot, not mention quiesce is always done on the
request queue wide.

io_uring randread IO test shows that IOPS is basically not affected by this
change on null_blk(g_blocking, MQ).

Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c       | 23 +++++++++++++++++----
 block/blk-mq-sysfs.c   |  2 --
 block/blk-mq.c         | 46 ++++++++++++------------------------------
 block/blk-sysfs.c      |  3 ++-
 block/blk.h            | 10 ++++++++-
 block/genhd.c          |  2 +-
 include/linux/blk-mq.h |  8 --------
 include/linux/blkdev.h |  8 ++++++++
 8 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ee54b34d5e99..aed14485a932 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,6 +64,7 @@ DEFINE_IDA(blk_queue_ida);
  * For queue allocation
  */
 struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
 
 /*
  * Controlling structure to kblockd
@@ -433,21 +434,25 @@ static void blk_timeout_work(struct work_struct *work)
 {
 }
 
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
 	int ret;
 
-	q = kmem_cache_alloc_node(blk_requestq_cachep,
-				GFP_KERNEL | __GFP_ZERO, node_id);
+	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+			GFP_KERNEL | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	q->alloc_srcu = alloc_srcu;
+	if (alloc_srcu && init_srcu_struct(q->srcu) != 0)
+		goto fail_q;
+
 	q->last_merge = NULL;
 
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
 	if (q->id < 0)
-		goto fail_q;
+		goto fail_srcu;
 
 	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
 	if (ret)
@@ -504,6 +509,9 @@ struct request_queue *blk_alloc_queue(int node_id)
 	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+	if (q->alloc_srcu)
+		cleanup_srcu_struct(q->srcu);
 fail_q:
 	kmem_cache_free(blk_requestq_cachep, q);
 	return NULL;
@@ -1305,6 +1313,9 @@ int __init blk_dev_init(void)
 			sizeof_field(struct request, cmd_flags));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			sizeof_field(struct bio, bi_opf));
+	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+			   __alignof__(struct request_queue)) !=
+		     sizeof(struct request_queue));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1315,6 +1326,10 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
+	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+			sizeof(struct request_queue) +
+			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
 
 	return 0;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 253c857cba47..674786574075 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1feb9ab65f28..9728a571b009 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -260,17 +260,11 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_wait_quiesce_done(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-	if (rcu)
+	WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags &
+				BLK_MQ_F_BLOCKING));
+	if (q->alloc_srcu)
+		synchronize_srcu(q->srcu);
+	else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -1082,16 +1076,16 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
 	__releases(hctx->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
+		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
 }
 
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+static inline void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 	__acquires(hctx->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -1099,7 +1093,7 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = 0;
 		rcu_read_lock();
 	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
+		*srcu_idx = srcu_read_lock(hctx->queue->srcu);
 }
 
 /**
@@ -3515,20 +3509,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3566,7 +3546,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
 
@@ -3608,8 +3588,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3945,7 +3923,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int ret;
 
-	q = blk_alloc_queue(set->numa_node);
+	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
 	if (!q)
 		return ERR_PTR(-ENOMEM);
 	q->queuedata = queuedata;
@@ -4094,6 +4072,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q)
 {
+	WARN_ON_ONCE(q->alloc_srcu != !!(set->flags & BLK_MQ_F_BLOCKING));
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cef1f713370b..2b79845a581d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -734,7 +734,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
 	struct request_queue *q = container_of(rcu_head, struct request_queue,
 					       rcu_head);
-	kmem_cache_free(blk_requestq_cachep, q);
+
+	kmem_cache_free(blk_get_queue_kmem_cache(q->alloc_srcu), q);
 }
 
 /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
diff --git a/block/blk.h b/block/blk.h
index 296e3010f8d6..c14bca80aba9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -32,6 +32,7 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -448,7 +449,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
 
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+	if (srcu)
+		return blk_requestq_srcu_cachep;
+	return blk_requestq_cachep;
+}
+
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
 
 int disk_alloc_events(struct gendisk *disk);
 void disk_add_events(struct gendisk *disk);
diff --git a/block/genhd.c b/block/genhd.c
index c5392cc24d37..e624fe9371f2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1341,7 +1341,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 	struct request_queue *q;
 	struct gendisk *disk;
 
-	q = blk_alloc_queue(node);
+	q = blk_alloc_queue(node, false);
 	if (!q)
 		return NULL;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 308edc2a4925..5cc7fc1ea863 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -376,13 +375,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd4370baccca..5741b46bca6c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -364,6 +365,7 @@ struct request_queue {
 #endif
 
 	bool			mq_sysfs_init_done;
+	bool			alloc_srcu;
 
 #define BLK_MAX_WRITE_HINTS	5
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
@@ -373,6 +375,12 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
+	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+	 */
+	struct srcu_struct	srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
-- 
2.31.1



  reply	other threads:[~2021-11-19  2:19 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
2021-11-19  2:18 ` Ming Lei [this message]
2021-11-19  4:30   ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Bart Van Assche
2021-11-19  8:10     ` Ming Lei
2021-11-22  7:48       ` Sagi Grimberg
2021-11-22 13:08         ` Ming Lei
2021-11-22 13:47           ` Sagi Grimberg
2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
2021-11-22  7:53   ` Sagi Grimberg
2021-11-22 13:20     ` Ming Lei
2021-11-22 13:50       ` Sagi Grimberg
2021-11-23  0:08         ` Ming Lei
2021-11-23  8:54           ` Sagi Grimberg
2021-11-22  9:00   ` Christoph Hellwig
2021-11-19  2:18 ` [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait() Ming Lei
2021-11-22  7:56   ` Sagi Grimberg
2021-11-22  8:00     ` Sagi Grimberg
2021-11-22 13:26     ` Ming Lei
2021-11-22 13:55       ` Sagi Grimberg
2021-11-23  0:17         ` Ming Lei
2021-11-23  9:00           ` Sagi Grimberg
2021-11-30  2:33             ` Ming Lei
2021-12-08 12:49               ` Sagi Grimberg
2021-12-10  2:02                 ` Ming Lei
2021-11-19  2:18 ` [PATCH 4/5] nvme: quiesce namespace queue in parallel Ming Lei
2021-11-22  8:07   ` Sagi Grimberg
2021-11-23  0:13     ` Ming Lei
2021-11-19  2:18 ` [PATCH 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211119021849.2259254-2-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox