linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, Ming Lei <ming.lei@redhat.com>,
	John Garry <john.garry@huawei.com>,
	Bart Van Assche <bvanassche@acm.org>,
	Hannes Reinecke <hare@suse.com>, Christoph Hellwig <hch@lst.de>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH 6/6] blk-mq: stop allocating new requests and drain requests before hctx becomes inactive
Date: Fri, 15 May 2020 09:41:53 +0800	[thread overview]
Message-ID: <20200515014153.2403464-7-ming.lei@redhat.com> (raw)
In-Reply-To: <20200515014153.2403464-1-ming.lei@redhat.com>

Before a CPU goes offline, check whether it is the last online CPU of the
hctx. If so, mark this hctx as inactive and wait for completion of all
allocated IO requests originating from this hctx. Also check whether the
hctx has become inactive while allocating a request; if so, give up the
allocation and retry from a new online CPU.

This guarantees that there isn't any in-flight IO before shutting down
the managed IRQ line when all CPUs of this IRQ line are offline.

Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c |   1 +
 block/blk-mq-tag.c     |  10 ++++
 block/blk-mq.c         | 122 +++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h |   3 +
 4 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6d40eff04524..15df3a36e9fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 0ae79ca6e2da..c8e34c1f547f 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -184,6 +184,16 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	/*
+	 * we are in request allocation, check if the current hctx is inactive.
+	 * If yes, give up this allocation. We will retry on another new online
+	 * CPU.
+	 */
+	if (data->preempt_disabled && unlikely(test_bit(BLK_MQ_S_INACTIVE,
+					&data->hctx->state))) {
+		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+		return BLK_MQ_TAG_FAIL;
+	}
 	return tag + tag_offset;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 18999198a37f..0551aa1740ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,9 +40,9 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data);
+static struct request *__blk_mq_get_request(struct request_queue *q,
+					    struct bio *bio,
+					    struct blk_mq_alloc_data *data);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -338,18 +338,19 @@ struct blk_mq_smp_call_info {
 	struct request_queue *q;
 	struct blk_mq_alloc_data *data;
 	struct request *rq;
+	struct bio *bio;
 };
 
 static void __blk_mq_alloc_request(void *alloc_info)
 {
 	struct blk_mq_smp_call_info *info = alloc_info;
 
-	info->rq = blk_mq_get_request(info->q, NULL, info->data);
+	info->rq = __blk_mq_get_request(info->q, info->bio, info->data);
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_get_request(struct request_queue *q,
+					    struct bio *bio,
+					    struct blk_mq_alloc_data *data)
 {
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
@@ -413,6 +414,34 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	return rq;
 }
 
+static struct request *blk_mq_get_request(struct request_queue *q,
+					  struct bio *bio,
+					  struct blk_mq_alloc_data *data)
+{
+	int ret;
+	struct blk_mq_smp_call_info info;
+	struct request *rq = __blk_mq_get_request(q, bio, data);
+
+	if (rq)
+		return rq;
+
+	if (data->flags & BLK_MQ_REQ_NOWAIT)
+		return NULL;
+
+	/*
+	 * The allocation fails because of inactive hctx, so retry from
+	 * online cpus.
+	 */
+	info.q = q;
+	info.bio = bio;
+	info.data = data;
+
+	ret = smp_call_function_any(cpu_online_mask, __blk_mq_alloc_request, &info, 1);
+	if (ret)
+		return ERR_PTR(ret);
+	return info.rq;
+}
+
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
@@ -2316,13 +2345,81 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct rq_iter_data {
+	bool has_rq;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
 {
-	return 0;
+	struct rq_iter_data *iter_data = data;
+
+	if (rq->mq_hctx == iter_data->hctx) {
+		iter_data->has_rq = true;
+		return false;
+	}
+
+	return true;
+}
+
+static bool blk_mq_tags_has_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->sched_tags ?: hctx->tags;
+	struct rq_iter_data data = {
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+	return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
 }
 
 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
 {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+	struct request_queue *q = hctx->queue;
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/* Prevent new request from being allocated on the current hctx/cpu */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+
+	/*
+	 * Grab one refcount for avoiding scheduler switch, and
+	 * return immediately if queue has been frozen.
+	 */
+	if (!percpu_ref_tryget(&q->q_usage_counter))
+		return 0;
+
+	/* wait until all requests in this hctx are gone */
+	while (blk_mq_tags_has_request(hctx))
+		msleep(5);
+
+	percpu_ref_put(&q->q_usage_counter);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
 	return 0;
 }
 
@@ -2333,12 +2430,15 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_dead);
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 	enum hctx_type type;
 
-	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 682ce79e8973..e3f6608346b5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -407,6 +407,9 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
-- 
2.25.2


  parent reply	other threads:[~2020-05-15  1:43 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-15  1:41 [PATCH 0/6] blk-mq: improve CPU hotplug (simplified version) Ming Lei
2020-05-15  1:41 ` [PATCH 1/6] blk-mq: allocate request on cpu in hctx->cpumask for blk_mq_alloc_request_hctx Ming Lei
2020-05-15 15:32   ` Christoph Hellwig
2020-05-15  1:41 ` [PATCH 2/6] blk-mq: set data->ctx and data->hctx explicitly in blk_mq_get_request Ming Lei
2020-05-15  1:41 ` [PATCH 3/6] blk-mq: add blk_mq_all_tag_iter Ming Lei
2020-05-15  1:41 ` [PATCH 4/6] blk-mq: prepare for draining IO when hctx's all CPUs are offline Ming Lei
2020-05-15  1:41 ` [PATCH 5/6] blk-mq: disable preempt during allocating request tag Ming Lei
2020-05-15 15:38   ` Christoph Hellwig
2020-05-15 16:10     ` Christoph Hellwig
2020-05-15  1:41 ` Ming Lei [this message]
2020-05-15  3:55   ` [PATCH 6/6] blk-mq: stop allocating new requests and drain requests before hctx becomes inactive Bart Van Assche
2020-05-15  5:20     ` Ming Lei
2020-05-15 15:45 ` [PATCH 0/6] blk-mq: improve CPU hotplug (simplified version) Christoph Hellwig
2020-05-16 12:35 ` Christoph Hellwig
2020-05-17  4:08   ` Ming Lei
2020-05-17  7:08     ` Christoph Hellwig
2020-05-18  4:07       ` Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200515014153.2403464-7-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=hare@suse.com \
    --cc=hch@lst.de \
    --cc=john.garry@huawei.com \
    --cc=linux-block@vger.kernel.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).