[PATCH V5 7/7] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed

linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@fb.com>,
	linux-block@vger.kernel.org,
	Christoph Hellwig <hch@infradead.org>,
	Mike Snitzer <snitzer@redhat.com>,
	dm-devel@redhat.com
Cc: Bart Van Assche <bart.vanassche@sandisk.com>,
	Laurence Oberman <loberman@redhat.com>,
	Paolo Valente <paolo.valente@linaro.org>,
	Oleksandr Natalenko <oleksandr@natalenko.name>,
	Tom Nguyen <tom81094@gmail.com>,
	linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
	Omar Sandoval <osandov@fb.com>, Ming Lei <ming.lei@redhat.com>
Subject: [PATCH V5 7/7] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed
Date: Sat, 30 Sep 2017 18:27:20 +0800	[thread overview]
Message-ID: <20170930102720.30219-8-ming.lei@redhat.com> (raw)
In-Reply-To: <20170930102720.30219-1-ming.lei@redhat.com>

During dispatch, we move all requests from hctx->dispatch to a
temporary list, then dispatch them one by one from that list.
Unfortunately, during this period a queue run from another context
may think the queue is idle, start to dequeue from the sw/scheduler
queue, and still try to dispatch because ->dispatch is empty. This
hurts sequential I/O performance because requests are dequeued while
the LLD queue is busy.

This patch introduces the BLK_MQ_S_DISPATCH_BUSY state to
make sure that no request is dequeued until ->dispatch has been
flushed.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Tom Nguyen <tom81094@gmail.com>
Tested-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c |  1 +
 block/blk-mq-sched.c   | 53 +++++++++++++++++++++++++++++++++-----------------
 block/blk-mq.c         |  6 ++++++
 include/linux/blk-mq.h |  1 +
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 813ca3bbbefc..f1a62c0d1acc 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -182,6 +182,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(SCHED_RESTART),
 	HCTX_STATE_NAME(TAG_WAITING),
 	HCTX_STATE_NAME(START_ON_RUN),
+	HCTX_STATE_NAME(DISPATCH_BUSY),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 3ba112d9dc15..c5eac1eee442 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -146,7 +146,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-	bool do_sched_dispatch = true;
 	LIST_HEAD(rq_list);
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -177,8 +176,33 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-	} else if (!has_sched_dispatch && !q->queue_depth) {
+		blk_mq_dispatch_rq_list(q, &rq_list);
+
+		/*
+		 * We may clear DISPATCH_BUSY just after it
+		 * is set from another context, the only cost
+		 * is that one request is dequeued a bit early,
+		 * we can survive that. Given the window is
+		 * small enough, no need to worry about performance
+		 * effect.
+		 */
+		if (list_empty_careful(&hctx->dispatch))
+			clear_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
+	}
+
+	/*
+	 * If DISPATCH_BUSY is set, that means hw queue is busy
+	 * and requests in the list of hctx->dispatch need to
+	 * be flushed first, so return early.
+	 *
+	 * Wherever DISPATCH_BUSY is set, blk_mq_run_hw_queue()
+	 * will be run to try to make progress, so it is always
+	 * safe to check the state here.
+	 */
+	if (test_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state))
+		return;
+
+	if (!has_sched_dispatch) {
 		/*
 		 * If there is no per-request_queue depth, we
 		 * flush all requests in this hw queue, otherwise
@@ -187,22 +211,15 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		 * run out of resource, which can be triggered
 		 * easily by per-request_queue queue depth
 		 */
-		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
-	}
-
-	if (!do_sched_dispatch)
-		return;
-
-	/*
-	 * We want to dispatch from the scheduler if there was nothing
-	 * on the dispatch list or we were able to dispatch from the
-	 * dispatch list.
-	 */
-	if (has_sched_dispatch)
+		if (!q->queue_depth) {
+			blk_mq_flush_busy_ctxs(hctx, &rq_list);
+			blk_mq_dispatch_rq_list(q, &rq_list);
+		} else {
+			blk_mq_do_dispatch_ctx(q, hctx);
+		}
+	} else {
 		blk_mq_do_dispatch_sched(q, e, hctx);
-	else
-		blk_mq_do_dispatch_ctx(q, hctx);
+	}
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8b49af1ade7f..7cb3f87334c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1142,6 +1142,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
+		/*
+		 * DISPATCH_BUSY won't be cleared until all requests
+		 * in hctx->dispatch are dispatched successfully
+		 */
+		set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
 		spin_unlock(&hctx->lock);
 
 		/*
@@ -1446,6 +1451,7 @@ static void blk_mq_request_direct_insert(struct blk_mq_hw_ctx *hctx,
 {
 	spin_lock(&hctx->lock);
 	list_add_tail(&rq->queuelist, &hctx->dispatch);
+	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
 	spin_unlock(&hctx->lock);
 
 	blk_mq_run_hw_queue(hctx, false);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fccabe00fb55..aa9853ada8b8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -172,6 +172,7 @@ enum {
 	BLK_MQ_S_SCHED_RESTART	= 2,
 	BLK_MQ_S_TAG_WAITING	= 3,
 	BLK_MQ_S_START_ON_RUN	= 4,
+	BLK_MQ_S_DISPATCH_BUSY	= 5,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
2.9.5

next prev parent reply	other threads:[~2017-09-30 10:27 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-30 10:27 [PATCH V5 00/14] blk-mq-sched: improve sequential I/O performance(part 1) Ming Lei
2017-09-30 10:27 ` [PATCH V5 1/7] blk-mq: issue rq directly in blk_mq_request_bypass_insert() Ming Lei
2017-10-03  8:58   ` Christoph Hellwig
2017-10-03 13:39     ` Ming Lei
2017-09-30 10:27 ` [PATCH V5 2/7] blk-mq-sched: fix scheduler bad performance Ming Lei
2017-10-02 14:19   ` Christoph Hellwig
2017-09-30 10:27 ` [PATCH V5 3/7] sbitmap: introduce __sbitmap_for_each_set() Ming Lei
2017-09-30 10:27 ` [PATCH V5 4/7] blk-mq: introduce blk_mq_dequeue_from_ctx() Ming Lei
2017-10-03  9:01   ` Christoph Hellwig
2017-10-09  4:36     ` Ming Lei
2017-09-30 10:27 ` [PATCH V5 5/7] blk-mq-sched: move actual dispatching into one helper Ming Lei
2017-10-02 14:19   ` Christoph Hellwig
2017-10-09  9:07     ` Ming Lei
2017-09-30 10:27 ` [PATCH V5 6/7] blk-mq-sched: improve dispatching from sw queue Ming Lei
2017-10-03  9:05   ` Christoph Hellwig
2017-10-09 10:15     ` Ming Lei
2017-09-30 10:27 ` Ming Lei [this message]
2017-10-03  9:11   ` [PATCH V5 7/7] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed Christoph Hellwig
2017-10-09 10:40     ` Ming Lei
2017-09-30 10:32 ` [PATCH V5 00/14] blk-mq-sched: improve sequential I/O performance(part 1) Ming Lei
2017-10-09 12:09 ` John Garry
2017-10-09 15:04   ` Ming Lei
2017-10-10  1:46     ` Ming Lei
2017-10-10 12:24       ` John Garry
2017-10-10 12:34         ` Johannes Thumshirn
2017-10-10 12:37           ` Paolo Valente
2017-10-10 13:45         ` Ming Lei
2017-10-10 15:10           ` John Garry

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:813ca3bbbef dfblob:f1a62c0d1ac dfblob:3ba112d9dc1
dfblob:c5eac1eee44 dfblob:8b49af1ade7 dfblob:7cb3f87334c
dfblob:fccabe00fb5 dfblob:aa9853ada8b )
 OR (
bs:"[PATCH V5 7/7] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170930102720.30219-8-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@fb.com \
    --cc=bart.vanassche@sandisk.com \
    --cc=dm-devel@redhat.com \
    --cc=hch@infradead.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=loberman@redhat.com \
    --cc=oleksandr@natalenko.name \
    --cc=osandov@fb.com \
    --cc=paolo.valente@linaro.org \
    --cc=snitzer@redhat.com \
    --cc=tom81094@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).