From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-block@vger.kernel.org
Cc: hch@lst.de, ming.lei@redhat.com, axboe@kernel.dk,
sth@linux.ibm.com, gjoyce@ibm.com
Subject: [PATCHv4 3/3] block: fix potential deadlock while running nr_hw_queue update
Date: Tue, 24 Jun 2025 18:47:05 +0530 [thread overview]
Message-ID: <20250624131716.630465-4-nilay@linux.ibm.com> (raw)
In-Reply-To: <20250624131716.630465-1-nilay@linux.ibm.com>
Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, which prevents the potential deadlock flagged by
the reported lockdep splat[1].
This commit introduces batch allocation and deallocation helpers for
sched_tags. They iterate over every request queue in the tag set and are
now called from the __blk_mq_update_nr_hw_queues() routine.
With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.
[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
block/blk-mq-sched.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
block/blk-mq-sched.h | 5 +++++
block/blk-mq.c | 12 ++++++++++-
block/blk.h | 3 ++-
block/elevator.c | 23 ++------------------
5 files changed, 71 insertions(+), 23 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 5d3132ac7777..acdc03718ebd 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -508,6 +508,57 @@ int blk_mq_alloc_sched_tags(struct elevator_tags *et,
return -ENOMEM;
}
+int blk_mq_alloc_sched_tags_batch(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ if (blk_mq_alloc_sched_tags(et, set, q->id))
+ goto out_unwind;
+ }
+ }
+ return 0;
+
+out_unwind:
+ list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+ if (q->elevator)
+ blk_mq_free_sched_tags(et, set, q->id);
+ }
+
+ return -ENOMEM;
+}
+
+void blk_mq_free_sched_tags_batch(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator)
+ blk_mq_free_sched_tags(et, set, q->id);
+ }
+}
+
/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *et)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 92aa50b8376a..4b3bf8946ae2 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -29,6 +29,11 @@ struct blk_mq_tags **__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
struct blk_mq_tags **tags, unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct elevator_tags *et,
+ struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct elevator_tags *et,
+ struct blk_mq_tag_set *set);
+
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..a06f184f1d9a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4970,6 +4970,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
+ struct elevator_tags et;
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
@@ -4984,6 +4985,12 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
+
+ et.nr_hw_queues = nr_hw_queues;
+ xa_init(&et.tags_table);
+ if (blk_mq_alloc_sched_tags_batch(&et, set) < 0)
+ goto memalloc_restore;
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
@@ -4995,6 +5002,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue_nomemrestore(q);
+ blk_mq_free_sched_tags_batch(&et, set);
goto reregister;
}
@@ -5019,7 +5027,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
/* elv_update_nr_hw_queues() unfreeze queue for us */
list_for_each_entry(q, &set->tag_list, tag_set_list)
- elv_update_nr_hw_queues(q);
+ elv_update_nr_hw_queues(q, &et);
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5029,7 +5037,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
+memalloc_restore:
memalloc_noio_restore(memflags);
+ xa_destroy(&et.tags_table);
/* Free the excess tags when nr_hw_queues shrink. */
for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..a312518fb8f3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,7 @@
#include "blk-crypto-internal.h"
struct elevator_type;
+struct elevator_tags;
#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
#define BLK_MIN_SEGMENT_SIZE 4096
@@ -321,7 +322,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_tags *et);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index 1408894c0396..4272f9bc7e11 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -717,31 +717,14 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes.
*/
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_tags *et)
{
struct blk_mq_tag_set *set = q->tag_set;
- struct elevator_tags et;
- struct elv_change_ctx ctx = {};
+ struct elv_change_ctx ctx = {.et = et};
int ret = -ENODEV;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
- et.nr_hw_queues = set->nr_hw_queues;
- xa_init(&et.tags_table);
- ctx.et = &et;
- /*
- * Accessing q->elevator without holding q->elevator_lock is safe here
- * because nr_hw_queue update is protected by set->update_nr_hwq_lock
- * in the writer context. So, scheduler update/switch code (which
- * acquires same lock in the reader context) can't run concurrently.
- */
- if (q->elevator) {
- if (blk_mq_alloc_sched_tags(ctx.et, set, q->id)) {
- WARN_ON_ONCE(1);
- goto out;
- }
- }
-
mutex_lock(&q->elevator_lock);
if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
ctx.name = q->elevator->type->elevator_name;
@@ -758,8 +741,6 @@ void elv_update_nr_hw_queues(struct request_queue *q)
*/
if (!xa_empty(&ctx.et->tags_table) && !ctx.new)
blk_mq_free_sched_tags(ctx.et, set, q->id);
-out:
- xa_destroy(&ctx.et->tags_table);
}
/*
--
2.49.0
prev parent reply other threads:[~2025-06-24 13:17 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-24 13:17 [PATCHv4 0/3] block: move sched_tags allocation/de-allocation outside of locking context Nilay Shroff
2025-06-24 13:17 ` [PATCHv4 1/3] block: move elevator queue allocation logic into blk_mq_init_sched Nilay Shroff
2025-06-24 13:17 ` [PATCHv4 2/3] block: fix lockdep warning caused by lock dependency in elv_iosched_store Nilay Shroff
2025-06-26 14:43 ` Ming Lei
2025-06-27 4:13 ` Nilay Shroff
2025-06-27 7:58 ` Ming Lei
2025-06-27 9:50 ` Nilay Shroff
2025-06-24 13:17 ` Nilay Shroff [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250624131716.630465-4-nilay@linux.ibm.com \
--to=nilay@linux.ibm.com \
--cc=axboe@kernel.dk \
--cc=gjoyce@ibm.com \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=ming.lei@redhat.com \
--cc=sth@linux.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox