From: Ming Lei <ming.lei@redhat.com>
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: linux-block@vger.kernel.org, hch@lst.de, axboe@kernel.dk,
sth@linux.ibm.com, lkp@intel.com, gjoyce@ibm.com
Subject: Re: [PATCHv6 3/3] block: fix potential deadlock while running nr_hw_queue update
Date: Tue, 1 Jul 2025 11:24:12 +0800 [thread overview]
Message-ID: <aGNU3PPJ1wU--x-O@fedora> (raw)
In-Reply-To: <20250630054756.54532-4-nilay@linux.ibm.com>
On Mon, Jun 30, 2025 at 10:51:56AM +0530, Nilay Shroff wrote:
> Move scheduler tags (sched_tags) allocation and deallocation outside
> both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
> This change breaks the dependency chain from the percpu allocator lock
> to the elevator lock, helping to prevent potential deadlocks, as
> observed in the reported lockdep splat[1].
>
> This commit introduces batch allocation and deallocation helpers for
> sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
> routine while iterating through the tagset.
>
> With this change, all sched_tags memory management is handled entirely
> outside the ->elevator_lock and the ->freeze_lock context, thereby
> eliminating the lock dependency that could otherwise manifest during
> nr_hw_queues updates.
>
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
>
> Reported-by: Stefan Haberland <sth@linux.ibm.com>
> Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
> block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
> block/blk-mq-sched.h | 4 +++
> block/blk-mq.c | 11 +++++++-
> block/blk.h | 2 +-
> block/elevator.c | 4 +--
> 5 files changed, 80 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 2d6d1ebdd8fb..da802df34a8c 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -427,6 +427,30 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
> kfree(et);
> }
>
> +void blk_mq_free_sched_tags_batch(struct xarray *et_table,
> + struct blk_mq_tag_set *set)
> +{
> + struct request_queue *q;
> + struct elevator_tags *et;
> +
> + lockdep_assert_held_write(&set->update_nr_hwq_lock);
> +
> + list_for_each_entry(q, &set->tag_list, tag_set_list) {
> + /*
> + * Accessing q->elevator without holding q->elevator_lock is
> + * safe because we're holding here set->update_nr_hwq_lock in
> + * the writer context. So, scheduler update/switch code (which
> + * acquires the same lock but in the reader context) can't run
> + * concurrently.
> + */
> + if (q->elevator) {
> + et = xa_load(et_table, q->id);
> + if (et)
> + blk_mq_free_sched_tags(et, set);
> + }
> + }
> +}
> +
> struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> unsigned int nr_hw_queues)
> {
> @@ -477,6 +501,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> return NULL;
> }
>
> +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
> + struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
> +{
> + struct request_queue *q;
> + struct elevator_tags *et;
> + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> + lockdep_assert_held_write(&set->update_nr_hwq_lock);
> +
> + list_for_each_entry(q, &set->tag_list, tag_set_list) {
> + /*
> + * Accessing q->elevator without holding q->elevator_lock is
> + * safe because we're holding here set->update_nr_hwq_lock in
> + * the writer context. So, scheduler update/switch code (which
> + * acquires the same lock but in the reader context) can't run
> + * concurrently.
> + */
> + if (q->elevator) {
> + et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
> + if (!et)
> + goto out_unwind;
> + if (xa_insert(et_table, q->id, et, gfp))
> + goto out_free_tags;
> + }
> + }
> + return 0;
> +out_free_tags:
> + blk_mq_free_sched_tags(et, set);
> +out_unwind:
> + list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
> + if (q->elevator) {
> + et = xa_load(et_table, q->id);
> + if (et)
> + blk_mq_free_sched_tags(et, set);
> + }
> + }
> + return -ENOMEM;
> +}
> +
> /* caller must have a reference to @e, will grab another one if successful */
> int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
> struct elevator_tags *et)
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 0cde00cd1c47..b554e1d55950 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
>
> struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> unsigned int nr_hw_queues);
> +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
> + struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
> void blk_mq_free_sched_tags(struct elevator_tags *et,
> struct blk_mq_tag_set *set);
> +void blk_mq_free_sched_tags_batch(struct xarray *et_table,
> + struct blk_mq_tag_set *set);
>
> static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4806b867e37d..a68b658ce07b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4972,6 +4972,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> struct request_queue *q;
> int prev_nr_hw_queues = set->nr_hw_queues;
> unsigned int memflags;
> + struct xarray et_table;
> int i;
>
> lockdep_assert_held(&set->tag_list_lock);
> @@ -4984,6 +4985,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> return;
>
> memflags = memalloc_noio_save();
> +
> + xa_init(&et_table);
> + if (blk_mq_alloc_sched_tags_batch(&et_table, set, nr_hw_queues) < 0)
> + goto out_memalloc_restore;
> +
> list_for_each_entry(q, &set->tag_list, tag_set_list) {
> blk_mq_debugfs_unregister_hctxs(q);
> blk_mq_sysfs_unregister_hctxs(q);
> @@ -4995,6 +5001,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> blk_mq_unfreeze_queue_nomemrestore(q);
> + blk_mq_free_sched_tags_batch(&et_table, set);
> goto reregister;
> }
>
> @@ -5019,7 +5026,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>
> /* elv_update_nr_hw_queues() unfreeze queue for us */
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> - elv_update_nr_hw_queues(q);
> + elv_update_nr_hw_queues(q, &et_table);
>
> reregister:
> list_for_each_entry(q, &set->tag_list, tag_set_list) {
> @@ -5029,7 +5036,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> blk_mq_remove_hw_queues_cpuhp(q);
> blk_mq_add_hw_queues_cpuhp(q);
> }
> +out_memalloc_restore:
> memalloc_noio_restore(memflags);
> + xa_destroy(&et_table);
>
> /* Free the excess tags when nr_hw_queues shrink. */
> for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
> diff --git a/block/blk.h b/block/blk.h
> index 37ec459fe656..c6d1d1458388 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -321,7 +321,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
>
> bool blk_insert_flush(struct request *rq);
>
> -void elv_update_nr_hw_queues(struct request_queue *q);
> +void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table);
> void elevator_set_default(struct request_queue *q);
> void elevator_set_none(struct request_queue *q);
>
> diff --git a/block/elevator.c b/block/elevator.c
> index 50f4b78efe66..8ba8b869d5a4 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -705,7 +705,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> * The I/O scheduler depends on the number of hardware queues, this forces a
> * reattachment when nr_hw_queues changes.
> */
> -void elv_update_nr_hw_queues(struct request_queue *q)
> +void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table)
et_table doesn't need to be exposed to elv_update_nr_hw_queues(); passing the
xarray is less readable than passing 'struct elevator_tags *' directly. That
can be addressed as a follow-up improvement, though.
Anyway:
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Thanks,
Ming
next prev parent reply other threads:[~2025-07-01 3:24 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-30 5:21 [PATCHv6 0/3] block: move sched_tags allocation/de-allocation outside of locking context Nilay Shroff
2025-06-30 5:21 ` [PATCHv6 1/3] block: move elevator queue allocation logic into blk_mq_init_sched Nilay Shroff
2025-06-30 6:13 ` Christoph Hellwig
2025-06-30 6:17 ` Hannes Reinecke
2025-06-30 5:21 ` [PATCHv6 2/3] block: fix lockdep warning caused by lock dependency in elv_iosched_store Nilay Shroff
2025-06-30 6:15 ` Christoph Hellwig
2025-06-30 6:20 ` Hannes Reinecke
2025-06-30 6:48 ` Nilay Shroff
2025-07-01 3:19 ` Ming Lei
2025-06-30 5:21 ` [PATCHv6 3/3] block: fix potential deadlock while running nr_hw_queue update Nilay Shroff
2025-06-30 6:16 ` Christoph Hellwig
2025-06-30 6:24 ` Hannes Reinecke
2025-06-30 6:57 ` Nilay Shroff
2025-07-01 3:24 ` Ming Lei [this message]
2025-07-01 5:20 ` Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aGNU3PPJ1wU--x-O@fedora \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=gjoyce@ibm.com \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=lkp@intel.com \
--cc=nilay@linux.ibm.com \
--cc=sth@linux.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox