From: Ming Lei <ming.lei@redhat.com>
To: John Garry <john.garry@huawei.com>
Cc: axboe@kernel.dk, linux-block@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
kashyap.desai@broadcom.com, chenxiang66@hisilicon.com,
yama@redhat.com, dgilbert@interlog.com
Subject: Re: [PATCH v2] blk-mq: Use request queue-wide tags for tagset-wide sbitmap
Date: Wed, 12 May 2021 09:58:01 +0800 [thread overview]
Message-ID: <YJs2KWMCn2kpyryT@T590> (raw)
In-Reply-To: <1620749743-36000-1-git-send-email-john.garry@huawei.com>
On Wed, May 12, 2021 at 12:15:43AM +0800, John Garry wrote:
> The tags used for an IO scheduler are currently per hctx.
>
> As such, when q->nr_hw_queues grows, so does the request queue total IO
> scheduler tag depth.
>
> This may cause problems for SCSI MQ HBAs whose total driver depth is
> fixed.
>
> Ming and Yanhui report higher CPU usage and lower throughput in scenarios
> where the fixed total driver tag depth is appreciably lower than the total
> scheduler tag depth:
> https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b
>
> In that scenario, since the scheduler tag is got first, much contention
> is introduced since a driver tag may not be available after we have got
> the sched tag.
>
> Improve this scenario by introducing request queue-wide tags for when
> a tagset-wide sbitmap is used. The static sched requests are still
> allocated per hctx, as requests are initialised per hctx, as in
> blk_mq_init_request(..., hctx_idx, ...) ->
> set->ops->init_request(.., hctx_idx, ...).
>
> For simplicity of resizing the request queue sbitmap when updating the
> request queue depth, just init at the max possible size, so we don't need
> to deal with the possibly with swapping out a new sbitmap for old if
> we need to grow.
>
> Signed-off-by: John Garry <john.garry@huawei.com>
> ---
>
> Please retest, thanks! For some reason I could not recreate the original
> issue, but I am using qemu...
>
> Changes since v1:
> - Embed sbitmaps in request_queue struct
> - Relocate IO sched functions to blk-mq-sched.c
> - Fix error path code
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 42a365b1b9c0..9a012e0818cb 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -507,11 +507,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
> struct blk_mq_hw_ctx *hctx,
> unsigned int hctx_idx)
> {
> - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
> if (hctx->sched_tags) {
> blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
> - blk_mq_free_rq_map(hctx->sched_tags, flags);
> + blk_mq_free_rq_map(hctx->sched_tags, set->flags);
> hctx->sched_tags = NULL;
> }
> }
> @@ -521,12 +519,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
> unsigned int hctx_idx)
> {
> struct blk_mq_tag_set *set = q->tag_set;
> - /* Clear HCTX_SHARED so tags are init'ed */
> - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> int ret;
>
> hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
> - set->reserved_tags, flags);
> + set->reserved_tags, set->flags);
> if (!hctx->sched_tags)
> return -ENOMEM;
>
> @@ -544,16 +540,40 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
> int i;
>
> queue_for_each_hw_ctx(q, hctx, i) {
> - /* Clear HCTX_SHARED so tags are freed */
> - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
> if (hctx->sched_tags) {
> - blk_mq_free_rq_map(hctx->sched_tags, flags);
> + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
> hctx->sched_tags = NULL;
> }
> }
> }
>
> +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> +{
> + struct blk_mq_tag_set *set = queue->tag_set;
> + int ret;
> +
> + /*
> + * Set initial depth at max so that we don't need to reallocate for
> + * updating nr_requests.
> + */
> + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
> + &queue->sched_breserved_tags,
> + set, MAX_SCHED_RQ, set->reserved_tags);
> + if (ret)
> + return ret;
> +
> + sbitmap_queue_resize(&queue->sched_bitmap_tags,
> + queue->nr_requests - set->reserved_tags);
> +
> + return 0;
> +}
> +
> +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
> +{
> + sbitmap_queue_free(&queue->sched_bitmap_tags);
> + sbitmap_queue_free(&queue->sched_breserved_tags);
> +}
> +
> int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> {
> struct blk_mq_hw_ctx *hctx;
> @@ -578,12 +598,25 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> queue_for_each_hw_ctx(q, hctx, i) {
> ret = blk_mq_sched_alloc_tags(q, hctx, i);
> if (ret)
> - goto err;
> + goto err_free_tags;
> + }
> +
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> + ret = blk_mq_init_sched_shared_sbitmap(q);
> + if (ret)
> + goto err_free_tags;
> +
> + queue_for_each_hw_ctx(q, hctx, i) {
> + hctx->sched_tags->bitmap_tags =
> + &q->sched_bitmap_tags;
> + hctx->sched_tags->breserved_tags =
> + &q->sched_breserved_tags;
> + }
The above assignment can be folded into blk_mq_init_sched_shared_sbitmap().
> }
>
> ret = e->ops.init_sched(q, e);
> if (ret)
> - goto err;
> + goto err_free_sbitmap;
>
> blk_mq_debugfs_register_sched(q);
>
> @@ -603,7 +636,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>
> return 0;
>
> -err:
> +err_free_sbitmap:
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_exit_sched_shared_sbitmap(q);
> +err_free_tags:
> blk_mq_sched_free_requests(q);
> blk_mq_sched_tags_teardown(q);
> q->elevator = NULL;
> @@ -641,5 +677,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
> if (e->type->ops.exit_sched)
> e->type->ops.exit_sched(e);
> blk_mq_sched_tags_teardown(q);
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_exit_sched_shared_sbitmap(q);
The above two lines can be moved to blk_mq_sched_tags_teardown().
> q->elevator = NULL;
> }
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 5b18ab915c65..aff037cfd8e7 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -5,6 +5,8 @@
> #include "blk-mq.h"
> #include "blk-mq-tag.h"
>
> +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
> +
> void blk_mq_sched_assign_ioc(struct request *rq);
>
> bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 2a37731e8244..e3ab8631be22 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -13,6 +13,7 @@
> #include <linux/delay.h>
> #include "blk.h"
> #include "blk-mq.h"
> +#include "blk-mq-sched.h"
> #include "blk-mq-tag.h"
>
> /*
> @@ -466,19 +467,39 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
> return -ENOMEM;
> }
>
> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
> +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
> + struct sbitmap_queue *breserved_tags,
> + struct blk_mq_tag_set *set,
The 'set' parameter can be killed, meantime pass 'node' & 'alloc_policy',
just like blk_mq_init_bitmap_tags()'s type, then blk_mq_init_bitmaps()
can be re-used by blk_mq_init_bitmap_tags() for avoiding to duplicate
bitmap allocation code.
> + unsigned int queue_depth, unsigned int reserved)
> {
> - unsigned int depth = set->queue_depth - set->reserved_tags;
> + unsigned int depth = queue_depth - reserved;
> int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
> bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
> - int i, node = set->numa_node;
>
> - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
> + if (bt_alloc(bitmap_tags, depth, round_robin, set->numa_node))
> return -ENOMEM;
> - if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
> - round_robin, node))
> + if (bt_alloc(breserved_tags, set->reserved_tags,
s/set->reserved_tags/reserved/
Thanks,
Ming
next prev parent reply other threads:[~2021-05-12 1:58 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-11 16:15 [PATCH v2] blk-mq: Use request queue-wide tags for tagset-wide sbitmap John Garry
2021-05-12 1:58 ` Ming Lei [this message]
2021-05-12 14:12 ` John Garry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=YJs2KWMCn2kpyryT@T590 \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=chenxiang66@hisilicon.com \
--cc=dgilbert@interlog.com \
--cc=john.garry@huawei.com \
--cc=kashyap.desai@broadcom.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=yama@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.