* [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:30 ` Jens Axboe
2017-10-13 17:24 ` [PATCH V8 2/7] blk-mq-sched: move actual dispatching into one helper Ming Lei
` (5 subsequent siblings)
6 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
When the hw queue is busy, we shouldn't take any more requests
from the scheduler queue; otherwise it becomes difficult to
merge IO.
This patch fixes the awful IO performance seen on some
SCSI devices (lpfc, qla2xxx, ...) when mq-deadline/kyber
is used, by not taking requests while the hw queue is busy.
Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..eca011fdfa0e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
- bool did_work = false;
+ bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- did_work = blk_mq_dispatch_rq_list(q, &rq_list);
+ do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
} else if (!has_sched_dispatch) {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
/*
- * We want to dispatch from the scheduler if we had no work left
- * on the dispatch list, OR if we did have work but weren't able
- * to make progress.
+ * We want to dispatch from the scheduler if there was nothing
+ * on the dispatch list or we were able to dispatch from the
+ * dispatch list.
*/
- if (!did_work && has_sched_dispatch) {
+ if (do_sched_dispatch && has_sched_dispatch) {
do {
struct request *rq;
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance
2017-10-13 17:24 ` [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance Ming Lei
@ 2017-10-13 17:30 ` Jens Axboe
0 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2017-10-13 17:30 UTC (permalink / raw)
To: Ming Lei, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry
On 10/13/2017 11:24 AM, Ming Lei wrote:
> When hw queue is busy, we shouldn't take requests from
> scheduler queue any more, otherwise it is difficult to do
> IO merge.
>
> This patch fixes the awful IO performance on some
> SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber
> is used by not taking requests if hw queue is busy.
This looks fine to me, but needs a much better patch title.
"fix scheduler bad performance" tells you nothing.
--
Jens Axboe
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH V8 2/7] blk-mq-sched: move actual dispatching into one helper
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
2017-10-13 17:24 ` [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:24 ` [PATCH V8 3/7] sbitmap: introduce __sbitmap_for_each_set() Ming Lei
` (4 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
Move the actual dispatching into one helper, so that it becomes easy to
support dispatching from the sw queue in the following patch.
No functional change.
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Suggested-by: Christoph Hellwig <hch@lst.de> # for simplifying dispatch logic
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 43 ++++++++++++++++++++++++-------------------
1 file changed, 24 insertions(+), 19 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index eca011fdfa0e..be29ba849408 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
return false;
}
+static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct elevator_queue *e = q->elevator;
+ LIST_HEAD(rq_list);
+
+ do {
+ struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+
+ if (!rq)
+ break;
+ list_add(&rq->queuelist, &rq_list);
+ } while (blk_mq_dispatch_rq_list(q, &rq_list));
+}
+
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
- bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
* scheduler, we can no longer merge or sort them. So it's best to
* leave them there for as long as we can. Mark the hw queue as
* needing a restart in that case.
+ *
+ * We want to dispatch from the scheduler if there was nothing
+ * on the dispatch list or we were able to dispatch from the
+ * dispatch list.
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
- } else if (!has_sched_dispatch) {
+ if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
+ blk_mq_do_dispatch_sched(hctx);
+ } else if (has_sched_dispatch) {
+ blk_mq_do_dispatch_sched(hctx);
+ } else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
-
- /*
- * We want to dispatch from the scheduler if there was nothing
- * on the dispatch list or we were able to dispatch from the
- * dispatch list.
- */
- if (do_sched_dispatch && has_sched_dispatch) {
- do {
- struct request *rq;
-
- rq = e->type->ops.mq.dispatch_request(hctx);
- if (!rq)
- break;
- list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list));
- }
}
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V8 3/7] sbitmap: introduce __sbitmap_for_each_set()
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
2017-10-13 17:24 ` [PATCH V8 1/7] blk-mq-sched: fix scheduler bad performance Ming Lei
2017-10-13 17:24 ` [PATCH V8 2/7] blk-mq-sched: move actual dispatching into one helper Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:24 ` [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops Ming Lei
` (3 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
We need to iterate over the ctxs starting from any ctx in a
round-robin way, so introduce this helper.
Reviewed-by: Omar Sandoval <osandov@fb.com>
Cc: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++-------------
1 file changed, 47 insertions(+), 17 deletions(-)
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a1904aadbc45..0dcc60e820de 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb);
*/
bool sbitmap_any_bit_clear(const struct sbitmap *sb);
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
/**
- * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @start: Where to start the iteration.
* @sb: Bitmap to iterate over.
* @fn: Callback. Should return true to continue or false to break early.
* @data: Pointer to pass to callback.
@@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
* This is inline even though it's non-trivial so that the function calls to the
* callback will hopefully get optimized away.
*/
-static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
- void *data)
+static inline void __sbitmap_for_each_set(struct sbitmap *sb,
+ unsigned int start,
+ sb_for_each_fn fn, void *data)
{
- unsigned int i;
+ unsigned int index;
+ unsigned int nr;
+ unsigned int scanned = 0;
- for (i = 0; i < sb->map_nr; i++) {
- struct sbitmap_word *word = &sb->map[i];
- unsigned int off, nr;
+ if (start >= sb->depth)
+ start = 0;
+ index = SB_NR_TO_INDEX(sb, start);
+ nr = SB_NR_TO_BIT(sb, start);
- if (!word->word)
- continue;
+ while (scanned < sb->depth) {
+ struct sbitmap_word *word = &sb->map[index];
+ unsigned int depth = min_t(unsigned int, word->depth - nr,
+ sb->depth - scanned);
- nr = 0;
- off = i << sb->shift;
+ scanned += depth;
+ if (!word->word)
+ goto next;
+
+ /*
+ * On the first iteration of the outer loop, we need to add the
+ * bit offset back to the size of the word for find_next_bit().
+ * On all other iterations, nr is zero, so this is a noop.
+ */
+ depth += nr;
while (1) {
- nr = find_next_bit(&word->word, word->depth, nr);
- if (nr >= word->depth)
+ nr = find_next_bit(&word->word, depth, nr);
+ if (nr >= depth)
break;
-
- if (!fn(sb, off + nr, data))
+ if (!fn(sb, (index << sb->shift) + nr, data))
return;
nr++;
}
+next:
+ nr = 0;
+ if (++index >= sb->map_nr)
+ index = 0;
}
}
-#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
-#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+ void *data)
+{
+ __sbitmap_for_each_set(sb, 0, fn, data);
+}
static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
unsigned int bitnr)
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
` (2 preceding siblings ...)
2017-10-13 17:24 ` [PATCH V8 3/7] sbitmap: introduce __sbitmap_for_each_set() Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:32 ` Jens Axboe
2017-10-13 17:24 ` [PATCH V8 5/7] blk-mq-sched: improve dispatching from sw queue Ming Lei
` (2 subsequent siblings)
6 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
For SCSI devices, there is often a per-request-queue depth, which needs
to be respected before queuing a request.
Currently blk-mq always dequeues a request first, and then calls .queue_rq()
to dispatch the request to the lld. One obvious issue with this approach is
that I/O merging may not be good: when the per-request-queue depth can't be
respected, .queue_rq() has to return BLK_STS_RESOURCE, and then the request
has to stay in the hctx->dispatch list, where it never gets a chance to
participate in I/O merging.
This patch introduces the .get_budget and .put_budget callbacks in blk_mq_ops,
so that we can try to reserve budget before dequeuing a request.
If the budget for queueing I/O can't be obtained, we don't need to
dequeue a request at all, and I/O merging can be improved a lot.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 43 +++++++++++++++++++++++++++++++++++--------
block/blk-mq.c | 33 ++++++++++++++++++++++++++++++---
block/blk-mq.h | 20 +++++++++++++++++++-
include/linux/blk-mq.h | 11 +++++++++++
4 files changed, 95 insertions(+), 12 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba849408..cd1c0caae16a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
return false;
}
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
do {
- struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+ struct request *rq;
+ blk_status_t ret;
- if (!rq)
+ if (e->type->ops.mq.has_work &&
+ !e->type->ops.mq.has_work(hctx))
break;
+
+ ret = blk_mq_get_budget(hctx);
+ if (ret == BLK_STS_RESOURCE)
+ return true;
+
+ rq = e->type->ops.mq.dispatch_request(hctx);
+ if (!rq) {
+ blk_mq_put_budget(hctx, true);
+ break;
+ } else if (ret != BLK_STS_OK) {
+ blk_mq_end_request(rq, ret);
+ continue;
+ }
list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list));
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ return false;
}
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -110,6 +127,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
LIST_HEAD(rq_list);
+ bool run_queue = false;
/* RCU or SRCU read lock is needed before checking quiesced flag */
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -143,13 +161,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
- blk_mq_do_dispatch_sched(hctx);
+ if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+ has_sched_dispatch)
+ run_queue = blk_mq_do_dispatch_sched(hctx);
} else if (has_sched_dispatch) {
- blk_mq_do_dispatch_sched(hctx);
+ run_queue = blk_mq_do_dispatch_sched(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list);
+ blk_mq_dispatch_rq_list(q, &rq_list, false);
+ }
+
+ if (run_queue) {
+ if (!blk_mq_sched_needs_restart(hctx) &&
+ !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+ blk_mq_sched_mark_restart_hctx(hctx);
+ blk_mq_run_hw_queue(hctx, true);
+ }
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40cba1b1978f..24c1b80d4312 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
return true;
}
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+ bool got_budget)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq;
@@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
if (list_empty(list))
return false;
+ WARN_ON(!list_is_singular(list) && got_budget);
+
/*
* Now process all the entries, sending them to the driver.
*/
@@ -1074,16 +1077,28 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed.
*/
- if (!blk_mq_dispatch_wait_add(hctx))
+ if (!blk_mq_dispatch_wait_add(hctx)) {
+ blk_mq_put_budget(hctx, got_budget);
break;
+ }
/*
* It's possible that a tag was freed in the window
* between the allocation failure and adding the
* hardware queue to the wait queue.
*/
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+ blk_mq_put_budget(hctx, got_budget);
+ break;
+ }
+ }
+
+ if (!got_budget) {
+ ret = blk_mq_get_budget(hctx);
+ if (ret == BLK_STS_RESOURCE)
break;
+ if (ret != BLK_STS_OK)
+ goto fail_rq;
}
list_del_init(&rq->queuelist);
@@ -1111,6 +1126,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
break;
}
+ fail_rq:
if (unlikely(ret != BLK_STS_OK)) {
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1582,6 +1598,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (!blk_mq_get_driver_tag(rq, NULL, false))
goto insert;
+ ret = blk_mq_get_budget(hctx);
+ if (ret == BLK_STS_RESOURCE) {
+ blk_mq_put_driver_tag(rq);
+ goto insert;
+ } else if (ret != BLK_STS_OK)
+ goto fail_rq;
+
new_cookie = request_to_qc_t(hctx, rq);
/*
@@ -1598,6 +1621,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
__blk_mq_requeue_request(rq);
goto insert;
default:
+ fail_rq:
*cookie = BLK_QC_T_NONE;
blk_mq_end_request(rq, ret);
return;
@@ -2582,6 +2606,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->ops->queue_rq)
return -EINVAL;
+ if ((!!set->ops->get_budget) != (!!set->ops->put_budget))
+ return -EINVAL;
+
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ef15b3414da5..cc7ee9ede3ae 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
@@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
+static inline void blk_mq_put_budget(struct blk_mq_hw_ctx *hctx,
+ bool got_budget)
+{
+ struct request_queue *q = hctx->queue;
+
+ if (q->mq_ops->put_budget && got_budget)
+ q->mq_ops->put_budget(hctx);
+}
+
+static inline blk_status_t blk_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ if (q->mq_ops->get_budget)
+ return q->mq_ops->get_budget(hctx);
+ return BLK_STS_OK;
+}
+
#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 50c6485cb04f..901457df3d64 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -90,6 +90,8 @@ struct blk_mq_queue_data {
typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
const struct blk_mq_queue_data *);
+typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *);
+typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -112,6 +114,15 @@ struct blk_mq_ops {
queue_rq_fn *queue_rq;
/*
+ * Reserve budget before queue request, once .queue_rq is
+ * run, it is driver's responsibility to release the
+ * reserved budget. Also we have to handle failure case
+ * of .get_budget for avoiding I/O deadlock.
+ */
+ get_budget_fn *get_budget;
+ put_budget_fn *put_budget;
+
+ /*
* Called on request timeout
*/
timeout_fn *timeout;
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops
2017-10-13 17:24 ` [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops Ming Lei
@ 2017-10-13 17:32 ` Jens Axboe
0 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2017-10-13 17:32 UTC (permalink / raw)
To: Ming Lei, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry
On 10/13/2017 11:24 AM, Ming Lei wrote:
> For SCSI devices, there is often per-request-queue depth, which need
> to be respected before queuing one request.
>
> The current blk-mq always dequeues request first, then calls .queue_rq()
> to dispatch the request to lld. One obvious issue of this way is that I/O
> merge may not be good, because when the per-request-queue depth can't be
> respected, .queue_rq() has to return BLK_STS_RESOURCE, then this request
> has to stay in hctx->dispatch list, and never got chance to participate
> into I/O merge.
>
> This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
> then we can try to get reserved budget first before dequeuing request.
> Once the budget for queueing I/O can't be satisfied, we don't need to
> dequeue request at all, then I/O merge can get improved a lot.
Still think this should be blk_mq_get_dispatch_budget(), like in the
incremental I sent out. That way you actually know what it is doing,
get_budget() could be anything.
> @@ -2582,6 +2606,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
> if (!set->ops->queue_rq)
> return -EINVAL;
>
> + if ((!!set->ops->get_budget) != (!!set->ops->put_budget))
> + return -EINVAL;
if (!set->ops->get_budget ^ !set->ops->put_budget)
is cleaner, imho.
--
Jens Axboe
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH V8 5/7] blk-mq-sched: improve dispatching from sw queue
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
` (3 preceding siblings ...)
2017-10-13 17:24 ` [PATCH V8 4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:24 ` [PATCH V8 6/7] SCSI: allow to pass null rq to scsi_prep_state_check() Ming Lei
2017-10-13 17:24 ` [PATCH V8 7/7] SCSI: implement .get_budget and .put_budget for blk-mq Ming Lei
6 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
SCSI devices use a host-wide tagset, and the shared driver tag space is
often quite big. Meanwhile, there is also a queue depth for each lun
(.cmd_per_lun), which is often small; for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.
So lots of requests may stay in the sw queues, and we always flush all of
those belonging to the same hw queue and dispatch them all to the driver.
Unfortunately, this easily makes the queue busy because of the small
.cmd_per_lun.
Once these requests are flushed out, they have to stay in hctx->dispatch,
where no bio can be merged into them, so sequential IO performance is
hurt a lot.
This patch introduces blk_mq_dequeue_from_ctx() for dequeuing a request from
the sw queue so that we can dispatch requests the way the scheduler does; we
can then avoid dequeuing too many requests from the sw queue when ->dispatch
isn't flushed completely.
This patch improves dispatching from the sw queue by using the
.get_budget and .put_budget callbacks.
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++---
block/blk-mq.c | 39 +++++++++++++++++++++++++++++
block/blk-mq.h | 2 ++
include/linux/blk-mq.h | 2 ++
4 files changed, 108 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index cd1c0caae16a..78a862eabdec 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -121,6 +121,55 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
return false;
}
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+ unsigned idx = ctx->index_hw;
+
+ if (++idx == hctx->nr_ctx)
+ idx = 0;
+
+ return hctx->ctxs[idx];
+}
+
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ LIST_HEAD(rq_list);
+ struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+ do {
+ struct request *rq;
+ blk_status_t ret;
+
+ if (!sbitmap_any_bit_set(&hctx->ctx_map))
+ break;
+
+ ret = blk_mq_get_budget(hctx);
+ if (ret == BLK_STS_RESOURCE)
+ return true;
+
+ rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+ if (!rq) {
+ blk_mq_put_budget(hctx, true);
+ break;
+ } else if (ret != BLK_STS_OK) {
+ blk_mq_end_request(rq, ret);
+ continue;
+ }
+
+ list_add(&rq->queuelist, &rq_list);
+
+ /* round robin for fair dispatch */
+ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ WRITE_ONCE(hctx->dispatch_from, ctx);
+
+ return false;
+}
+
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
@@ -161,11 +210,24 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
- has_sched_dispatch)
- run_queue = blk_mq_do_dispatch_sched(hctx);
+ if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+ if (has_sched_dispatch)
+ run_queue = blk_mq_do_dispatch_sched(hctx);
+ else
+ run_queue = blk_mq_do_dispatch_ctx(hctx);
+ }
} else if (has_sched_dispatch) {
run_queue = blk_mq_do_dispatch_sched(hctx);
+ } else if (q->mq_ops->get_budget) {
+ /*
+ * If we need to get budget before queuing request, we
+ * dequeue request one by one from sw queue for avoiding
+ * to mess up I/O merge when dispatch runs out of resource.
+ *
+ * TODO: get more budgets, and dequeue more requests in
+ * one time.
+ */
+ run_queue = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 24c1b80d4312..2416db4dc98b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
+struct dispatch_rq_data {
+ struct blk_mq_hw_ctx *hctx;
+ struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+ void *data)
+{
+ struct dispatch_rq_data *dispatch_data = data;
+ struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+ struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+ spin_lock(&ctx->lock);
+ if (unlikely(!list_empty(&ctx->rq_list))) {
+ dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+ list_del_init(&dispatch_data->rq->queuelist);
+ if (list_empty(&ctx->rq_list))
+ sbitmap_clear_bit(sb, bitnr);
+ }
+ spin_unlock(&ctx->lock);
+
+ return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *start)
+{
+ unsigned off = start ? start->index_hw : 0;
+ struct dispatch_rq_data data = {
+ .hctx = hctx,
+ .rq = NULL,
+ };
+
+ __sbitmap_for_each_set(&hctx->ctx_map, off,
+ dispatch_rq_from_ctx, &data);
+
+ return data.rq;
+}
+
static inline unsigned int queued_to_index(unsigned int queued)
{
if (!queued)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index cc7ee9ede3ae..80639cb69f31 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
bool wait);
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *start);
/*
* Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 901457df3d64..e5e6becd57d3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
struct sbitmap ctx_map;
+ struct blk_mq_ctx *dispatch_from;
+
struct blk_mq_ctx **ctxs;
unsigned int nr_ctx;
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V8 6/7] SCSI: allow to pass null rq to scsi_prep_state_check()
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
` (4 preceding siblings ...)
2017-10-13 17:24 ` [PATCH V8 5/7] blk-mq-sched: improve dispatching from sw queue Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
2017-10-13 17:24 ` [PATCH V8 7/7] SCSI: implement .get_budget and .put_budget for blk-mq Ming Lei
6 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
In the following patch, we will implement scsi_get_budget(),
which needs to call scsi_prep_state_check() when the rq hasn't
been dequeued yet.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/scsi/scsi_lib.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..d159bb085714 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
/*
* If the devices is blocked we defer normal commands.
*/
- if (!(req->rq_flags & RQF_PREEMPT))
+ if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_DEFER;
break;
default:
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
* special commands. In particular any user initiated
* command is not allowed.
*/
- if (!(req->rq_flags & RQF_PREEMPT))
+ if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_KILL;
break;
}
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V8 7/7] SCSI: implement .get_budget and .put_budget for blk-mq
2017-10-13 17:24 [PATCH V8 0/7] blk-mq-sched: improve sequential I/O performance Ming Lei
` (5 preceding siblings ...)
2017-10-13 17:24 ` [PATCH V8 6/7] SCSI: allow to pass null rq to scsi_prep_state_check() Ming Lei
@ 2017-10-13 17:24 ` Ming Lei
6 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2017-10-13 17:24 UTC (permalink / raw)
To: Jens Axboe, linux-block, Christoph Hellwig
Cc: Bart Van Assche, Laurence Oberman, Paolo Valente,
Oleksandr Natalenko, Tom Nguyen, linux-kernel, linux-scsi,
Omar Sandoval, John Garry, Ming Lei
We need to tell blk-mq to reserve resources before queuing
a request, so implement these two callbacks. Then blk-mq
can avoid dequeuing requests too early, and IO merging can
be improved a lot.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/scsi/scsi_lib.c | 75 ++++++++++++++++++++++++++++++++++---------------
1 file changed, 52 insertions(+), 23 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d159bb085714..6f10afaca25b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request);
}
-static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
{
- struct request *req = bd->rq;
- struct request_queue *q = req->q;
+ struct request_queue *q = hctx->queue;
+ struct scsi_device *sdev = q->queuedata;
+ struct Scsi_Host *shost = sdev->host;
+
+ atomic_dec(&shost->host_busy);
+ if (scsi_target(sdev)->can_queue > 0)
+ atomic_dec(&scsi_target(sdev)->target_busy);
+ atomic_dec(&sdev->device_busy);
+ put_device(&sdev->sdev_gendev);
+}
+
+static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- int reason;
- ret = prep_to_mq(scsi_prep_state_check(sdev, req));
- if (ret != BLK_STS_OK)
- goto out;
+ ret = prep_to_mq(scsi_prep_state_check(sdev, NULL));
+ if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK)
+ return ret;
- ret = BLK_STS_RESOURCE;
if (!get_device(&sdev->sdev_gendev))
goto out;
-
if (!scsi_dev_queue_ready(q, sdev))
goto out_put_device;
if (!scsi_target_queue_ready(shost, sdev))
@@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
+ return BLK_STS_OK;
+
+out_dec_target_busy:
+ if (scsi_target(sdev)->can_queue > 0)
+ atomic_dec(&scsi_target(sdev)->target_busy);
+out_dec_device_busy:
+ atomic_dec(&sdev->device_busy);
+out_put_device:
+ put_device(&sdev->sdev_gendev);
+out:
+ return BLK_STS_RESOURCE;
+}
+
+static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *req = bd->rq;
+ struct request_queue *q = req->q;
+ struct scsi_device *sdev = q->queuedata;
+ struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+ blk_status_t ret;
+ int reason;
+
+ ret = prep_to_mq(scsi_prep_state_check(sdev, req));
+ if (ret != BLK_STS_OK)
+ goto out_put_budget;
+
+ ret = BLK_STS_RESOURCE;
if (!(req->rq_flags & RQF_DONTPREP)) {
ret = prep_to_mq(scsi_mq_prep_fn(req));
if (ret != BLK_STS_OK)
- goto out_dec_host_busy;
+ goto out_put_budget;
req->rq_flags |= RQF_DONTPREP;
} else {
blk_mq_start_request(req);
@@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (reason) {
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
- goto out_dec_host_busy;
+ goto out_put_budget;
}
return BLK_STS_OK;
-out_dec_host_busy:
- atomic_dec(&shost->host_busy);
-out_dec_target_busy:
- if (scsi_target(sdev)->can_queue > 0)
- atomic_dec(&scsi_target(sdev)->target_busy);
-out_dec_device_busy:
- atomic_dec(&sdev->device_busy);
-out_put_device:
- put_device(&sdev->sdev_gendev);
-out:
+out_put_budget:
+ scsi_mq_put_budget(hctx);
switch (ret) {
case BLK_STS_OK:
break;
@@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
}
static const struct blk_mq_ops scsi_mq_ops = {
+ .get_budget = scsi_mq_get_budget,
+ .put_budget = scsi_mq_put_budget,
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
.timeout = scsi_timeout,
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread