* [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock
@ 2026-02-25 12:44 Nilay Shroff
2026-02-25 15:50 ` Yu Kuai
0 siblings, 1 reply; 4+ messages in thread
From: Nilay Shroff @ 2026-02-25 12:44 UTC (permalink / raw)
To: linux-block
Cc: axboe, hch, ming.lei, yukuai, yi.zhang, shinichiro.kawasaki,
gjoyce, Nilay Shroff
While nr_hw_update allocates tagset tags it acquires ->pcpu_alloc_mutex
after ->freeze_lock is acquired or queue is frozen. This potentially
creates a circular dependency involving ->fs_reclaim if reclaim is
triggered simultaneously in a code path which first acquires ->pcpu_
alloc_mutex. As the queue is already frozen while nr_hw_queue update
allocates tagsets, the reclaim can't make forward progress and thus it could
cause a potential deadlock as reported in lockdep splat[1].
Fix this by pre-allocating tagset tags before we freeze queue during
nr_hw_queue update. Later the allocated tagset tags could be safely
installed and used after queue is frozen.
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Closes: https://lore.kernel.org/all/CAHj4cs8F=OV9s3La2kEQ34YndgfZP-B5PHS4Z8_b9euKG6J4mw@mail.gmail.com/ [1]
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
block/blk-mq.c | 53 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 37 insertions(+), 16 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d5602ff62c7c..687fd47786a6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4793,10 +4793,10 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
}
-static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
- int new_nr_hw_queues)
+static int blk_mq_prealloc_tag_set_tags(struct blk_mq_tag_set *set,
+ int new_nr_hw_queues, struct blk_mq_tags ***tags)
{
- struct blk_mq_tags **new_tags;
+ struct blk_mq_tags **new_tags = NULL;
int i;
if (set->nr_hw_queues >= new_nr_hw_queues)
@@ -4807,24 +4807,42 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
if (!new_tags)
return -ENOMEM;
- if (set->tags)
- memcpy(new_tags, set->tags, set->nr_hw_queues *
- sizeof(*set->tags));
- kfree(set->tags);
- set->tags = new_tags;
-
for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
- if (!__blk_mq_alloc_map_and_rqs(set, i)) {
- while (--i >= set->nr_hw_queues)
- __blk_mq_free_map_and_rqs(set, i);
- return -ENOMEM;
+ if (blk_mq_is_shared_tags(set->flags))
+ new_tags[i] = set->shared_tags;
+ else {
+ new_tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ set->queue_depth);
+ if (!new_tags[i])
+ goto out_unwind;
}
cond_resched();
}
done:
- set->nr_hw_queues = new_nr_hw_queues;
+ *tags = new_tags;
return 0;
+out_unwind:
+ while (--i >= set->nr_hw_queues) {
+ if (!blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, new_tags[i], i);
+ }
+ return -ENOMEM;
+}
+
+static void blk_mq_init_tag_set_tags(struct blk_mq_tag_set *set,
+ int new_nr_hw_queues, struct blk_mq_tags **new_tags)
+{
+ if (set->nr_hw_queues >= new_nr_hw_queues)
+ goto done;
+
+ if (set->tags)
+ memcpy(new_tags, set->tags, set->nr_hw_queues *
+ sizeof(*set->tags));
+ kfree(set->tags);
+ set->tags = new_tags;
+done:
+ set->nr_hw_queues = new_nr_hw_queues;
}
/*
@@ -5113,6 +5131,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
unsigned int memflags;
int i;
struct xarray elv_tbl;
+ struct blk_mq_tags **new_tags;
bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -5147,11 +5166,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (blk_mq_elv_switch_none(q, &elv_tbl))
goto switch_back;
+ if (blk_mq_prealloc_tag_set_tags(set, nr_hw_queues, &new_tags) < 0)
+ goto switch_back;
+
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
queues_frozen = true;
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
- goto switch_back;
+ blk_mq_init_tag_set_tags(set, nr_hw_queues, new_tags);
fallback:
blk_mq_update_queue_map(set);
--
2.52.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock
2026-02-25 12:44 [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock Nilay Shroff
@ 2026-02-25 15:50 ` Yu Kuai
2026-02-26 15:37 ` Nilay Shroff
0 siblings, 1 reply; 4+ messages in thread
From: Yu Kuai @ 2026-02-25 15:50 UTC (permalink / raw)
To: Nilay Shroff, linux-block
Cc: axboe, hch, ming.lei, yi.zhang, shinichiro.kawasaki, gjoyce,
yukuai
Hi,
在 2026/2/25 20:44, Nilay Shroff 写道:
> While nr_hw_update allocates tagset tags it acquires ->pcpu_alloc_mutex
> after ->freeze_lock is acquired or queue is frozen. This potentially
> creates a circular dependency involving ->fs_reclaim if reclaim is
> triggered simultaneously in a code path which first acquires ->pcpu_
> alloc_mutex. As the queue is already frozen while nr_hw_queue update
> allocates tagsets, the reclaim can't forward progress and thus it could
> cause a potential deadlock as reported in lockdep splat[1].
>
> Fix this by pre-allocating tagset tags before we freeze queue during
> nr_hw_queue update. Later the allocated tagset tags could be safely
> installed and used after queue is frozen.
>
> Reported-by: Yi Zhang <yi.zhang@redhat.com>
> Closes: https://lore.kernel.org/all/CAHj4cs8F=OV9s3La2kEQ34YndgfZP-B5PHS4Z8_b9euKG6J4mw@mail.gmail.com/ [1]
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
> block/blk-mq.c | 53 +++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d5602ff62c7c..687fd47786a6 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4793,10 +4793,10 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
> }
> }
>
> -static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
> - int new_nr_hw_queues)
> +static int blk_mq_prealloc_tag_set_tags(struct blk_mq_tag_set *set,
> + int new_nr_hw_queues, struct blk_mq_tags ***tags)
Personally I don't like the last parameter, since the return value is not
used at all, I feel it's better to return the new_tags instead.
> {
> - struct blk_mq_tags **new_tags;
> + struct blk_mq_tags **new_tags = NULL;
> int i;
>
> if (set->nr_hw_queues >= new_nr_hw_queues)
> @@ -4807,24 +4807,42 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
> if (!new_tags)
> return -ENOMEM;
>
> - if (set->tags)
> - memcpy(new_tags, set->tags, set->nr_hw_queues *
> - sizeof(*set->tags));
> - kfree(set->tags);
> - set->tags = new_tags;
> -
> for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
> - if (!__blk_mq_alloc_map_and_rqs(set, i)) {
> - while (--i >= set->nr_hw_queues)
> - __blk_mq_free_map_and_rqs(set, i);
> - return -ENOMEM;
> + if (blk_mq_is_shared_tags(set->flags))
> + new_tags[i] = set->shared_tags;
> + else {
> + new_tags[i] = blk_mq_alloc_map_and_rqs(set, i,
> + set->queue_depth);
> + if (!new_tags[i])
> + goto out_unwind;
> }
> cond_resched();
> }
>
> done:
> - set->nr_hw_queues = new_nr_hw_queues;
> + *tags = new_tags;
> return 0;
> +out_unwind:
> + while (--i >= set->nr_hw_queues) {
> + if (!blk_mq_is_shared_tags(set->flags))
> + blk_mq_free_map_and_rqs(set, new_tags[i], i);
> + }
> + return -ENOMEM;
> +}
> +
> +static void blk_mq_init_tag_set_tags(struct blk_mq_tag_set *set,
> + int new_nr_hw_queues, struct blk_mq_tags **new_tags)
> +{
> + if (set->nr_hw_queues >= new_nr_hw_queues)
> + goto done;
> +
> + if (set->tags)
> + memcpy(new_tags, set->tags, set->nr_hw_queues *
> + sizeof(*set->tags));
> + kfree(set->tags);
> + set->tags = new_tags;
> +done:
> + set->nr_hw_queues = new_nr_hw_queues;
> }
>
> /*
> @@ -5113,6 +5131,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> unsigned int memflags;
> int i;
> struct xarray elv_tbl;
> + struct blk_mq_tags **new_tags;
> bool queues_frozen = false;
>
> lockdep_assert_held(&set->tag_list_lock);
> @@ -5147,11 +5166,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> if (blk_mq_elv_switch_none(q, &elv_tbl))
> goto switch_back;
>
> + if (blk_mq_prealloc_tag_set_tags(set, nr_hw_queues, &new_tags) < 0)
> + goto switch_back;
> +
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> blk_mq_freeze_queue_nomemsave(q);
> queues_frozen = true;
> - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
> - goto switch_back;
> + blk_mq_init_tag_set_tags(set, nr_hw_queues, new_tags);
It looks ok just leave set->nr_hw_queues = nr_hw_queues, and the patch should
be much simpler.
>
> fallback:
> blk_mq_update_queue_map(set);
--
Thanks,
Kuai
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock
2026-02-25 15:50 ` Yu Kuai
@ 2026-02-26 15:37 ` Nilay Shroff
[not found] ` <CO1PR15MB496953C277EC8E15ED965D7FCE72A@CO1PR15MB4969.namprd15.prod.outlook.com>
0 siblings, 1 reply; 4+ messages in thread
From: Nilay Shroff @ 2026-02-26 15:37 UTC (permalink / raw)
To: yukuai, linux-block
Cc: axboe, hch, ming.lei, yi.zhang, shinichiro.kawasaki, gjoyce
On 2/25/26 9:20 PM, Yu Kuai wrote:
> Hi,
>
> 在 2026/2/25 20:44, Nilay Shroff 写道:
>> While nr_hw_update allocates tagset tags it acquires ->pcpu_alloc_mutex
>> after ->freeze_lock is acquired or queue is frozen. This potentially
>> creates a circular dependency involving ->fs_reclaim if reclaim is
>> triggered simultaneously in a code path which first acquires ->pcpu_
>> alloc_mutex. As the queue is already frozen while nr_hw_queue update
>> allocates tagsets, the reclaim can't forward progress and thus it could
>> cause a potential deadlock as reported in lockdep splat[1].
>>
>> Fix this by pre-allocating tagset tags before we freeze queue during
>> nr_hw_queue update. Later the allocated tagset tags could be safely
>> installed and used after queue is frozen.
>>
>> Reported-by: Yi Zhang <yi.zhang@redhat.com>
>> Closes: https://lore.kernel.org/all/CAHj4cs8F=OV9s3La2kEQ34YndgfZP-B5PHS4Z8_b9euKG6J4mw@mail.gmail.com/ [1]
>> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
>> ---
>> block/blk-mq.c | 53 +++++++++++++++++++++++++++++++++++---------------
>> 1 file changed, 37 insertions(+), 16 deletions(-)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index d5602ff62c7c..687fd47786a6 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -4793,10 +4793,10 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
>> }
>> }
>>
>> -static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
>> - int new_nr_hw_queues)
>> +static int blk_mq_prealloc_tag_set_tags(struct blk_mq_tag_set *set,
>> + int new_nr_hw_queues, struct blk_mq_tags ***tags)
>
> Personally I don't like the last parameter, since the return value is not
> used at all, I feel it's better to return the new_tags instead.
>
Hmm ok I could return new_tags and then use ERR_PTR when it fails to
alloc tags. Then at call site we could use IS_ERR to distinguish
between success vs fail case.
>> {
>> - struct blk_mq_tags **new_tags;
>> + struct blk_mq_tags **new_tags = NULL;
>> int i;
>>
>> if (set->nr_hw_queues >= new_nr_hw_queues)
>> @@ -4807,24 +4807,42 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
>> if (!new_tags)
>> return -ENOMEM;
>>
>> - if (set->tags)
>> - memcpy(new_tags, set->tags, set->nr_hw_queues *
>> - sizeof(*set->tags));
>> - kfree(set->tags);
>> - set->tags = new_tags;
>> -
>> for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
>> - if (!__blk_mq_alloc_map_and_rqs(set, i)) {
>> - while (--i >= set->nr_hw_queues)
>> - __blk_mq_free_map_and_rqs(set, i);
>> - return -ENOMEM;
>> + if (blk_mq_is_shared_tags(set->flags))
>> + new_tags[i] = set->shared_tags;
>> + else {
>> + new_tags[i] = blk_mq_alloc_map_and_rqs(set, i,
>> + set->queue_depth);
>> + if (!new_tags[i])
>> + goto out_unwind;
>> }
>> cond_resched();
>> }
>>
>> done:
>> - set->nr_hw_queues = new_nr_hw_queues;
>> + *tags = new_tags;
>> return 0;
>> +out_unwind:
>> + while (--i >= set->nr_hw_queues) {
>> + if (!blk_mq_is_shared_tags(set->flags))
>> + blk_mq_free_map_and_rqs(set, new_tags[i], i);
>> + }
>> + return -ENOMEM;
>> +}
>> +
>> +static void blk_mq_init_tag_set_tags(struct blk_mq_tag_set *set,
>> + int new_nr_hw_queues, struct blk_mq_tags **new_tags)
>> +{
>> + if (set->nr_hw_queues >= new_nr_hw_queues)
>> + goto done;
>> +
>> + if (set->tags)
>> + memcpy(new_tags, set->tags, set->nr_hw_queues *
>> + sizeof(*set->tags));
>> + kfree(set->tags);
>> + set->tags = new_tags;
>> +done:
>> + set->nr_hw_queues = new_nr_hw_queues;
>> }
>>
>> /*
>> @@ -5113,6 +5131,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>> unsigned int memflags;
>> int i;
>> struct xarray elv_tbl;
>> + struct blk_mq_tags **new_tags;
>> bool queues_frozen = false;
>>
>> lockdep_assert_held(&set->tag_list_lock);
>> @@ -5147,11 +5166,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>> if (blk_mq_elv_switch_none(q, &elv_tbl))
>> goto switch_back;
>>
>> + if (blk_mq_prealloc_tag_set_tags(set, nr_hw_queues, &new_tags) < 0)
>> + goto switch_back;
>> +
>> list_for_each_entry(q, &set->tag_list, tag_set_list)
>> blk_mq_freeze_queue_nomemsave(q);
>> queues_frozen = true;
>> - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
>> - goto switch_back;
>> + blk_mq_init_tag_set_tags(set, nr_hw_queues, new_tags);
>
> It looks ok just leave set->nr_hw_queues = nr_hw_queues, and the patch should
> be much simpler.
>
I think you’re suggesting that blk_mq_init_tag_set_tags() can be removed and
the tag installation along with set->nr_hw_queues = nr_hw_queues can be done
inline at the call site. Makes sense. I will implement it in v2.
Thanks,
--Nilay
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock
[not found] ` <CO1PR15MB496953C277EC8E15ED965D7FCE72A@CO1PR15MB4969.namprd15.prod.outlook.com>
@ 2026-02-27 8:09 ` Nilay Shroff
0 siblings, 0 replies; 4+ messages in thread
From: Nilay Shroff @ 2026-02-27 8:09 UTC (permalink / raw)
To: Greg Joyce, yukuai@fnnas.com, linux-block@vger.kernel.org
Cc: axboe@kernel.dk, hch@lst.de, ming.lei@redhat.com,
yi.zhang@redhat.com, shinichiro.kawasaki@wdc.com
On 2/27/26 1:45 AM, Greg Joyce wrote:
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index d5602ff62c7c..687fd47786a6 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -4793,10 +4793,10 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
>>> }
>>> }
>>>
>>> -static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
>>> - int new_nr_hw_queues)
>>> +static int blk_mq_prealloc_tag_set_tags(struct blk_mq_tag_set *set,
>>> + int new_nr_hw_queues, struct blk_mq_tags ***tags)
>>>
>>> Personally I don't like the last parameter, since the return value is not
>>> used at all, I feel it's better to return the new_tags instead.
>>>
>> Hmm ok I could return new_tags and then use ERR_PTR when it fails to
>> alloc tags. Then at call site we could use IS_ERR to distinguish
>> between success vs fail case.
> I think this is a personal preference, but I think it's better not to use ERR_PTR since it's
> more error-prone.
IMO, ERR_PTR()/IS_ERR() is widely used throughout the kernel and is fairly
idiomatic when a function needs to combine a pointer return value with an
error status. There are many thousands of instances of this pattern in the
kernel tree, so it’s a well-established convention.
As long as callers consistently use IS_ERR() (and not just a NULL check) to
evaluate the return value, this approach should be safe and clear.
Thanks,
--Nilay
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-02-27 8:09 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-25 12:44 [PATCH] block: break pcpu_alloc_mutex dependency on freeze_lock Nilay Shroff
2026-02-25 15:50 ` Yu Kuai
2026-02-26 15:37 ` Nilay Shroff
[not found] ` <CO1PR15MB496953C277EC8E15ED965D7FCE72A@CO1PR15MB4969.namprd15.prod.outlook.com>
2026-02-27 8:09 ` Nilay Shroff
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox