From: Ming Lei <ming.lei@redhat.com>
To: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>,
linux-block@vger.kernel.org, Yu Kuai <yukuai3@huawei.com>,
John Garry <john.garry@huawei.com>,
Sathya Prakash Veerichetty <sathya.prakash@broadcom.com>
Subject: Re: [PATCH 5/5] blk-mq: Replace tags->lock with SRCU for tag iterators
Date: Mon, 4 Aug 2025 19:35:24 +0800 [thread overview]
Message-ID: <aJCa_Ef6U00CZbpf@fedora> (raw)
In-Reply-To: <028ba177-da0c-465e-ab34-ec18039395e8@suse.de>
On Mon, Aug 04, 2025 at 09:13:08AM +0200, Hannes Reinecke wrote:
> On 8/1/25 13:44, Ming Lei wrote:
> > Replace the spinlock in blk_mq_find_and_get_req() with an SRCU read lock
> > around the tag iterators.
> >
> > This is done by:
> >
> > - Holding the SRCU read lock in blk_mq_queue_tag_busy_iter(),
> > blk_mq_tagset_busy_iter(), and blk_mq_hctx_has_requests().
> >
> > - Removing the now-redundant tags->lock from blk_mq_find_and_get_req().
> >
> > This change improves performance by replacing a spinlock with a more
> > scalable SRCU lock, and fixes lockup issue in scsi_host_busy() in case of
> > shost->host_blocked.
> >
> > Meantime it becomes possible to use blk_mq_in_driver_rw() for io
> > accounting.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > block/blk-mq-tag.c | 12 ++++++++----
> > block/blk-mq.c | 24 ++++--------------------
> > 2 files changed, 12 insertions(+), 24 deletions(-)
> >
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 6c2f5881e0de..7ae431077a32 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -256,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
> > unsigned int bitnr)
> > {
> > struct request *rq;
> > - unsigned long flags;
> > - spin_lock_irqsave(&tags->lock, flags);
> > rq = tags->rqs[bitnr];
> > if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
> > rq = NULL;
> > - spin_unlock_irqrestore(&tags->lock, flags);
> > return rq;
> > }
> > @@ -440,7 +437,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
> > busy_tag_iter_fn *fn, void *priv)
> > {
> > unsigned int flags = tagset->flags;
> > - int i, nr_tags;
> > + int i, nr_tags, srcu_idx;
> > +
> > + srcu_idx = srcu_read_lock(&tagset->tags_srcu);
> > nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
> > @@ -449,6 +448,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
> > __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
> > BT_TAG_ITER_STARTED);
> > }
> > + srcu_read_unlock(&tagset->tags_srcu, srcu_idx);
> > }
> > EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
> > @@ -499,6 +499,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
> > void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
> > void *priv)
> > {
> > + int srcu_idx;
> > +
> > /*
> > * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
> > * while the queue is frozen. So we can use q_usage_counter to avoid
> > @@ -507,6 +509,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
> > if (!percpu_ref_tryget(&q->q_usage_counter))
> > return;
> > + srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu);
> > if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> > struct blk_mq_tags *tags = q->tag_set->shared_tags;
> > struct sbitmap_queue *bresv = &tags->breserved_tags;
> > @@ -536,6 +539,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
> > bt_for_each(hctx, q, btags, fn, priv, false);
> > }
> > }
> > + srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx);
> > blk_queue_exit(q);
> > }
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 7b4ab8e398b6..43b15e58ffe1 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -3415,7 +3415,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
> > struct blk_mq_tags *tags)
> > {
> > struct page *page;
> > - unsigned long flags;
> > /*
> > * There is no need to clear mapping if driver tags is not initialized
> > @@ -3439,15 +3438,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
> > }
> > }
> > }
> > -
> > - /*
> > - * Wait until all pending iteration is done.
> > - *
> > - * Request reference is cleared and it is guaranteed to be observed
> > - * after the ->lock is released.
> > - */
> > - spin_lock_irqsave(&drv_tags->lock, flags);
> > - spin_unlock_irqrestore(&drv_tags->lock, flags);
> > }
> > void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> > @@ -3670,8 +3660,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
> > struct rq_iter_data data = {
> > .hctx = hctx,
> > };
> > + int srcu_idx;
> > + srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu);
> > blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
> > + srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx);
> > +
> > return data.has_rq;
> > }
> > @@ -3891,7 +3885,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
> > unsigned int queue_depth, struct request *flush_rq)
> > {
> > int i;
> > - unsigned long flags;
> > /* The hw queue may not be mapped yet */
> > if (!tags)
> > @@ -3901,15 +3894,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
> > for (i = 0; i < queue_depth; i++)
> > cmpxchg(&tags->rqs[i], flush_rq, NULL);
> > -
> > - /*
> > - * Wait until all pending iteration is done.
> > - *
> > - * Request reference is cleared and it is guaranteed to be observed
> > - * after the ->lock is released.
> > - */
> > - spin_lock_irqsave(&tags->lock, flags);
> > - spin_unlock_irqrestore(&tags->lock, flags);
> > }
> > static void blk_free_flush_queue_callback(struct rcu_head *head)
>
> While this looks good, I do wonder what happened to the 'fq' srcu.
> Don't we need to insert an srcu_read_lock() when we're trying to
> access it?
That is exactly the srcu read lock added in this patch.
Thanks,
Ming
next prev parent reply other threads:[~2025-08-04 11:35 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-01 11:44 [PATCH 0/5] blk-mq: Replace tags->lock with SRCU for tag iterators Ming Lei
2025-08-01 11:44 ` [PATCH 1/5] blk-mq: Move flush queue allocation into blk_mq_init_hctx() Ming Lei
2025-08-04 6:06 ` Yu Kuai
2025-08-04 7:07 ` Hannes Reinecke
2025-08-01 11:44 ` [PATCH 2/5] blk-mq: Pass tag_set to blk_mq_free_rq_map/tags Ming Lei
2025-08-04 7:08 ` Hannes Reinecke
2025-08-05 7:48 ` Yu Kuai
2025-08-01 11:44 ` [PATCH 3/5] blk-mq: Defer freeing of tags page_list to SRCU callback Ming Lei
2025-08-04 7:09 ` Hannes Reinecke
2025-08-06 9:15 ` Yu Kuai
2025-08-01 11:44 ` [PATCH 4/5] blk-mq: Defer freeing flush queue " Ming Lei
2025-08-04 7:11 ` Hannes Reinecke
2025-08-06 9:17 ` Yu Kuai
2025-08-01 11:44 ` [PATCH 5/5] blk-mq: Replace tags->lock with SRCU for tag iterators Ming Lei
2025-08-04 6:30 ` Yu Kuai
2025-08-04 11:32 ` Ming Lei
2025-08-05 8:33 ` Yu Kuai
2025-08-05 8:38 ` Yu Kuai
2025-08-05 8:48 ` Ming Lei
2025-08-06 1:06 ` Yu Kuai
2025-08-06 8:59 ` Ming Lei
2025-08-06 9:06 ` Yu Kuai
2025-08-04 7:13 ` Hannes Reinecke
2025-08-04 11:35 ` Ming Lei [this message]
2025-08-04 11:45 ` Hannes Reinecke
2025-08-06 9:21 ` Yu Kuai
2025-08-06 13:28 ` Ming Lei
2025-08-07 1:23 ` Yu Kuai
2025-08-07 2:12 ` Ming Lei
2025-08-07 3:44 ` Yu Kuai
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aJCa_Ef6U00CZbpf@fedora \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=hare@suse.de \
--cc=john.garry@huawei.com \
--cc=linux-block@vger.kernel.org \
--cc=sathya.prakash@broadcom.com \
--cc=yukuai3@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.