From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, Ming Lei <ming.lei@redhat.com>,
John Garry <john.garry@huawei.com>,
Bart Van Assche <bvanassche@acm.org>,
Hannes Reinecke <hare@suse.com>, Christoph Hellwig <hch@lst.de>,
Thomas Gleixner <tglx@linutronix.de>
Subject: [PATCH V7 5/9] blk-mq: stop to handle IO and drain IO before hctx becomes inactive
Date: Sat, 18 Apr 2020 11:09:21 +0800 [thread overview]
Message-ID: <20200418030925.31996-6-ming.lei@redhat.com> (raw)
In-Reply-To: <20200418030925.31996-1-ming.lei@redhat.com>
Before a CPU goes offline, check whether it is the last online CPU
of the hctx. If so, mark the hctx as inactive and wait for
completion of all in-flight IOs that originated from this hctx.
This guarantees that no IO is in flight when the managed IRQ line
is shut down.
Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-debugfs.c | 1 +
block/blk-mq.c | 118 ++++++++++++++++++++++++++++++++++++++---
block/blk-mq.h | 3 +-
include/linux/blk-mq.h | 3 ++
4 files changed, 116 insertions(+), 9 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 8e745826eb86..b62390918ca5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+ HCTX_STATE_NAME(INACTIVE),
};
#undef HCTX_STATE_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a28daefd7dd6..0b56d7d78269 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1021,7 +1021,7 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-static bool blk_mq_get_driver_tag(struct request *rq)
+static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
{
struct blk_mq_alloc_data data = {
.q = rq->q,
@@ -1054,6 +1054,23 @@ static bool blk_mq_get_driver_tag(struct request *rq)
data.hctx->tags->rqs[rq->tag] = rq;
}
allocated:
+ /*
+ * A direct-issue submitter may have been migrated to a CPU that
+ * does not belong to this hctx, so a full memory barrier is needed
+ * to order the driver tag assignment against the check of
+ * BLK_MQ_S_INACTIVE. Otherwise barrier() is enough, because when
+ * BLK_MQ_S_INACTIVE is set both code paths run on the same single
+ * CPU.
+ */
+ if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
+ smp_mb();
+ else
+ barrier();
+
+ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data.hctx->state))) {
+ blk_mq_put_driver_tag(rq);
+ return false;
+ }
return rq->tag != -1;
}
@@ -1103,7 +1120,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
- return blk_mq_get_driver_tag(rq);
+ return blk_mq_get_driver_tag(rq, false);
}
wait = &hctx->dispatch_wait;
@@ -1129,7 +1146,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
* allocation failure and adding the hardware queue to the wait
* queue.
*/
- ret = blk_mq_get_driver_tag(rq);
+ ret = blk_mq_get_driver_tag(rq, false);
if (!ret) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
@@ -1226,7 +1243,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
break;
- if (!blk_mq_get_driver_tag(rq)) {
+ if (!blk_mq_get_driver_tag(rq, false)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
@@ -1258,7 +1275,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
- bd.last = !blk_mq_get_driver_tag(nxt);
+ bd.last = !blk_mq_get_driver_tag(nxt, false);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1851,7 +1868,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
- if (!blk_mq_get_driver_tag(rq)) {
+ if (!blk_mq_get_driver_tag(rq, true)) {
blk_mq_put_dispatch_budget(hctx);
goto insert;
}
@@ -2259,13 +2276,95 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
}
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct count_inflight_data {
+ unsigned count; /* running total, bumped by blk_mq_count_inflight_rq() */
+ struct blk_mq_hw_ctx *hctx; /* only rqs whose mq_hctx matches are counted */
+};
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+ bool reserved)
{
- return 0;
+ struct count_inflight_data *count_data = data;
+
+ /*
+ * The request state can't be used here: it only becomes
+ * MQ_RQ_IN_FLIGHT in blk_mq_start_request(), and by then it is too
+ * late to prevent the request from being issued.
+ *
+ * Instead, treat a request as in-flight as soon as it holds a
+ * driver tag (rq->tag >= 0) and belongs to the hctx being counted.
+ */
+ if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
+ count_data->count++;
+
+ return true; /* unconditional; presumably "keep iterating" - confirm */
+}
+
+static bool blk_mq_inflight_rq(struct request *rq, void *data,
+ bool reserved) /* @data and @reserved are not used */
+{
+ return rq->tag >= 0; /* true only for requests holding a driver tag */
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct count_inflight_data count_data = {
+ .count = 0,
+ .hctx = hctx, /* restrict the count to this hctx */
+ };
+
+ blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
+ blk_mq_inflight_rq, &count_data); /* 3rd arg: presumably the rq filter - confirm */
+
+ return count_data.count; /* rqs of @hctx that hold a driver tag */
+}
+
+static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ while (1) { /* poll, with no timeout, until nothing is in flight */
+ if (!blk_mq_tags_inflight_rqs(hctx))
+ break;
+ msleep(5); /* sleep 5 ms before counting again */
+ }
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0; /* @cpu is not mapped to this hctx */
+
+ if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) ||
+ (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask)
+ < nr_cpu_ids))
+ return 0; /* @cpu is not the only online CPU of this hctx */
+
+ /*
+ * The current CPU is the last online CPU of this hctx. The
+ * dispatch path can observe S_INACTIVE without an extra barrier
+ * because both run on this same CPU.
+ */
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ /*
+ * Order setting BLK_MQ_S_INACTIVE before reading rq->tag and rqs[tag];
+ * this pairs with the smp_mb() in blk_mq_get_driver_tag().
+ */
+ smp_mb();
+ blk_mq_hctx_drain_inflight_rqs(hctx); /* returns once no rq of @hctx is counted in flight */
+ return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0; /* @cpu is not mapped to this hctx */
+
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); /* a CPU of this hctx is coming online */
return 0;
}
@@ -2281,6 +2380,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
LIST_HEAD(tmp);
enum hctx_type type;
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0;
+
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d0c72d7d07c8..a1fbd0b8657b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -167,7 +167,8 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
- return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ return test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+ test_bit(BLK_MQ_S_INACTIVE, &hctx->state); /* an inactive hctx also counts as stopped */
}
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 786614753d73..650b208b42ea 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -403,6 +403,9 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2,
+ /* hw queue is inactive after all its CPUs become offline */
+ BLK_MQ_S_INACTIVE = 3,
+
BLK_MQ_MAX_DEPTH = 10240,
BLK_MQ_CPU_WORK_BATCH = 8,
--
2.25.2
next prev parent reply other threads:[~2020-04-18 3:10 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-18 3:09 [PATCH V7 0/9] blk-mq: improvement CPU hotplug Ming Lei
2020-04-18 3:09 ` [PATCH V7 1/9] blk-mq: mark blk_mq_get_driver_tag as static Ming Lei
2020-04-23 7:14 ` Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 2/9] blk-mq: assign rq->tag in blk_mq_get_driver_tag Ming Lei
2020-04-23 7:30 ` Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 3/9] blk-mq: prepare for draining IO when hctx's all CPUs are offline Ming Lei
2020-04-23 7:31 ` Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 4/9] blk-mq: support rq filter callback when iterating rqs Ming Lei
2020-04-20 10:34 ` John Garry
2020-04-23 7:31 ` Christoph Hellwig
2020-04-23 7:32 ` Christoph Hellwig
2020-04-18 3:09 ` Ming Lei [this message]
2020-04-23 7:38 ` [PATCH V7 5/9] blk-mq: stop to handle IO and drain IO before hctx becomes inactive Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 6/9] block: add blk_end_flush_machinery Ming Lei
2020-04-23 7:40 ` Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 7/9] blk-mq: re-submit IO in case that hctx is inactive Ming Lei
2020-04-23 7:50 ` Christoph Hellwig
2020-04-23 8:46 ` Ming Lei
2020-04-18 3:09 ` [PATCH V7 8/9] blk-mq: handle requests dispatched from IO scheduler in case of inactive hctx Ming Lei
2020-04-23 7:51 ` Christoph Hellwig
2020-04-18 3:09 ` [PATCH V7 9/9] block: deactivate hctx when the hctx is actually inactive Ming Lei
2020-04-20 10:29 ` [PATCH V7 0/9] blk-mq: improvement CPU hotplug John Garry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200418030925.31996-6-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=bvanassche@acm.org \
--cc=hare@suse.com \
--cc=hch@lst.de \
--cc=john.garry@huawei.com \
--cc=linux-block@vger.kernel.org \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.