linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@fb.com>,
	linux-block@vger.kernel.org,
	Christoph Hellwig <hch@infradead.org>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>,
	Laurence Oberman <loberman@redhat.com>,
	Paolo Valente <paolo.valente@linaro.org>,
	Oleksandr Natalenko <oleksandr@natalenko.name>,
	Tom Nguyen <tom81094@gmail.com>,
	linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
	Omar Sandoval <osandov@fb.com>,
	John Garry <john.garry@huawei.com>,
	Ming Lei <ming.lei@redhat.com>
Subject: [PATCH V9 5/7] blk-mq-sched: improve dispatching from sw queue
Date: Sat, 14 Oct 2017 02:05:30 +0800	[thread overview]
Message-ID: <20171013180532.29304-6-ming.lei@redhat.com> (raw)
In-Reply-To: <20171013180532.29304-1-ming.lei@redhat.com>

SCSI devices use host-wide tagset, and the shared driver tag space is
often quite big. Meantime there is also queue depth for each lun(
.cmd_per_lun), which is often small, for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.

So lots of requests may stay in sw queue, and we always flush all
belonging to same hw queue and dispatch them all to driver, unfortunately
it is easy to cause queue busy because of the small .cmd_per_lun.
Once these requests are flushed out, they have to stay in hctx->dispatch,
and no bio merge can participate into these requests, and sequential IO
performance is hurt a lot.

This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
sw queue so that we can dispatch them in scheduler's way, then we can
avoid to dequeue too many requests from sw queue when ->dispatch isn't
flushed completely.

This patch improves dispatching from sw queue by using the callback of
.get_budget and .put_budget

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c   | 68 +++++++++++++++++++++++++++++++++++++++++++++++---
 block/blk-mq.c         | 39 +++++++++++++++++++++++++++++
 block/blk-mq.h         |  2 ++
 include/linux/blk-mq.h |  2 ++
 4 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 6bb92bbc3135..92df88351ad2 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -121,6 +121,55 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+					  struct blk_mq_ctx *ctx)
+{
+	unsigned idx = ctx->index_hw;
+
+	if (++idx == hctx->nr_ctx)
+		idx = 0;
+
+	return hctx->ctxs[idx];
+}
+
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	LIST_HEAD(rq_list);
+	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+	do {
+		struct request *rq;
+		blk_status_t ret;
+
+		if (!sbitmap_any_bit_set(&hctx->ctx_map))
+			break;
+
+		ret = blk_mq_get_dispatch_budget(hctx);
+		if (ret == BLK_STS_RESOURCE)
+			return true;
+
+		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+		if (!rq) {
+			blk_mq_put_dispatch_budget(hctx, true);
+			break;
+		} else if (ret != BLK_STS_OK) {
+			blk_mq_end_request(rq, ret);
+			continue;
+		}
+
+		list_add(&rq->queuelist, &rq_list);
+
+		/* round robin for fair dispatch */
+		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	WRITE_ONCE(hctx->dispatch_from, ctx);
+
+	return false;
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
@@ -161,11 +210,24 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
-				has_sched_dispatch)
-			run_queue = blk_mq_do_dispatch_sched(hctx);
+		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+			if (has_sched_dispatch)
+				run_queue = blk_mq_do_dispatch_sched(hctx);
+			else
+				run_queue = blk_mq_do_dispatch_ctx(hctx);
+		}
 	} else if (has_sched_dispatch) {
 		run_queue = blk_mq_do_dispatch_sched(hctx);
+	} else if (q->mq_ops->get_budget) {
+		/*
+		 * If we need to get budget before queuing request, we
+		 * dequeue request one by one from sw queue for avoiding
+		 * to mess up I/O merge when dispatch runs out of resource.
+		 *
+		 * TODO: get more budgets, and dequeue more requests in
+		 * one time.
+		 */
+		run_queue = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index feb89f1fcc63..3fdc5662231e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+	struct blk_mq_hw_ctx *hctx;
+	struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+		void *data)
+{
+	struct dispatch_rq_data *dispatch_data = data;
+	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+	spin_lock(&ctx->lock);
+	if (unlikely(!list_empty(&ctx->rq_list))) {
+		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+		list_del_init(&dispatch_data->rq->queuelist);
+		if (list_empty(&ctx->rq_list))
+			sbitmap_clear_bit(sb, bitnr);
+	}
+	spin_unlock(&ctx->lock);
+
+	return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+					struct blk_mq_ctx *start)
+{
+	unsigned off = start ? start->index_hw : 0;
+	struct dispatch_rq_data data = {
+		.hctx = hctx,
+		.rq   = NULL,
+	};
+
+	__sbitmap_for_each_set(&hctx->ctx_map, off,
+			       dispatch_rq_from_ctx, &data);
+
+	return data.rq;
+}
+
 static inline unsigned int queued_to_index(unsigned int queued)
 {
 	if (!queued)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index fab566ce0bbf..97e97a59464a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 				bool wait);
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+					struct blk_mq_ctx *start);
 
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 901457df3d64..e5e6becd57d3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
 
 	struct sbitmap		ctx_map;
 
+	struct blk_mq_ctx	*dispatch_from;
+
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
-- 
2.9.5

  parent reply	other threads:[~2017-10-13 18:05 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-10-13 18:05 [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
2017-10-13 18:05 ` [PATCH V9 1/7] blk-mq-sched: dispatch from scheduler only after progress is made on ->dispatch Ming Lei
2017-10-13 18:05 ` [PATCH V9 2/7] blk-mq-sched: move actual dispatching into one helper Ming Lei
2017-10-13 18:05 ` [PATCH V9 3/7] sbitmap: introduce __sbitmap_for_each_set() Ming Lei
2017-10-13 18:05 ` [PATCH V9 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops Ming Lei
2017-10-13 23:43   ` Bart Van Assche
2017-10-14  7:34     ` Ming Lei
2017-10-13 18:05 ` Ming Lei [this message]
2017-10-13 18:05 ` [PATCH V9 6/7] SCSI: allow to pass null rq to scsi_prep_state_check() Ming Lei
2017-10-13 23:16   ` Bart Van Assche
2017-10-14  8:06     ` Ming Lei
2017-10-13 18:05 ` [PATCH V9 7/7] SCSI: implement .get_budget and .put_budget for blk-mq Ming Lei
2017-10-13 19:08 ` [PATCH V9 0/7] blk-mq-sched: improve sequential I/O performance Jens Axboe
2017-10-13 19:21   ` Jens Axboe
2017-10-13 20:23     ` Jens Axboe
2017-10-14  5:23       ` Ming Lei
2017-10-14  7:10       ` Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171013180532.29304-6-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=axboe@fb.com \
    --cc=bart.vanassche@sandisk.com \
    --cc=hch@infradead.org \
    --cc=john.garry@huawei.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=loberman@redhat.com \
    --cc=oleksandr@natalenko.name \
    --cc=osandov@fb.com \
    --cc=paolo.valente@linaro.org \
    --cc=tom81094@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).