* [PATCH v2 1/7] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
@ 2026-02-03 8:05 ` Yu Kuai
2026-02-03 8:05 ` [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() " Yu Kuai
` (5 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:05 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
blkg_destroy_all() iterates q->blkg_list without holding blkcg_mutex,
which can race with blkg_free_workfn() that removes blkgs from the list
while holding blkcg_mutex.
Add blkcg_mutex protection around the q->blkg_list iteration to prevent
potential list corruption or use-after-free issues.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3cffb68ba5d8..0bc7b19399b6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -574,6 +574,7 @@ static void blkg_destroy_all(struct gendisk *disk)
int i;
restart:
+ mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
struct blkcg *blkcg = blkg->blkcg;
@@ -592,6 +593,7 @@ static void blkg_destroy_all(struct gendisk *disk)
if (!(--count)) {
count = BLKG_DESTROY_BATCH_SIZE;
spin_unlock_irq(&q->queue_lock);
+ mutex_unlock(&q->blkcg_mutex);
cond_resched();
goto restart;
}
@@ -611,6 +613,7 @@ static void blkg_destroy_all(struct gendisk *disk)
q->root_blkg = NULL;
spin_unlock_irq(&q->queue_lock);
+ mutex_unlock(&q->blkcg_mutex);
}
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
2026-02-03 8:05 ` [PATCH v2 1/7] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
@ 2026-02-03 8:05 ` Yu Kuai
2026-02-03 12:54 ` kernel test robot
2026-02-03 13:36 ` kernel test robot
2026-02-03 8:05 ` [PATCH v2 3/7] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
` (4 subsequent siblings)
6 siblings, 2 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:05 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
bfq_end_wr_async() iterates q->blkg_list while only holding bfqd->lock,
but not blkcg_mutex. This can race with blkg_free_workfn() that removes
blkgs from the list while holding blkcg_mutex.
Add blkcg_mutex protection in bfq_end_wr() before taking bfqd->lock to
ensure proper synchronization when iterating q->blkg_list.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/bfq-cgroup.c | 3 ++-
block/bfq-iosched.c | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 6a75fe1c7a5c..839d266a6aa6 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -940,7 +940,8 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- bfq_end_wr_async_queues(bfqd, bfqg);
+ if (bfqg)
+ bfq_end_wr_async_queues(bfqd, bfqg);
}
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3ebdec40e758..617633be8abc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2645,6 +2645,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
struct bfq_queue *bfqq;
int i;
+ mutex_lock(&bfqd->queue->blkcg_mutex);
spin_lock_irq(&bfqd->lock);
for (i = 0; i < bfqd->num_actuators; i++) {
@@ -2656,6 +2657,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
bfq_end_wr_async(bfqd);
spin_unlock_irq(&bfqd->lock);
+ mutex_unlock(&bfqd->queue->blkcg_mutex);
}
static sector_t bfq_io_struct_pos(void *io_struct, bool request)
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
2026-02-03 8:05 ` [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() " Yu Kuai
@ 2026-02-03 12:54 ` kernel test robot
2026-02-03 13:07 ` Yu Kuai
2026-02-03 13:36 ` kernel test robot
1 sibling, 1 reply; 13+ messages in thread
From: kernel test robot @ 2026-02-03 12:54 UTC (permalink / raw)
To: Yu Kuai, tj, josef, axboe
Cc: oe-kbuild-all, cgroups, linux-block, linux-kernel, yukuai,
zhengqixing, mkoutny, hch, ming.lei, nilay
Hi Yu,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on linus/master v6.19-rc8 next-20260202]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/blk-cgroup-protect-q-blkg_list-iteration-in-blkg_destroy_all-with-blkcg_mutex/20260203-161356
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20260203080602.726505-3-yukuai%40fnnas.com
patch subject: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
config: i386-buildonly-randconfig-002-20260203 (https://download.01.org/0day-ci/archive/20260203/202602032018.BRG7c7LT-lkp@intel.com/config)
compiler: gcc-13 (Debian 13.3.0-16) 13.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260203/202602032018.BRG7c7LT-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602032018.BRG7c7LT-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/linux/seqlock.h:19,
from include/linux/mmzone.h:17,
from include/linux/gfp.h:7,
from include/linux/umh.h:4,
from include/linux/kmod.h:9,
from include/linux/module.h:18,
from block/bfq-iosched.c:116:
block/bfq-iosched.c: In function 'bfq_end_wr':
>> block/bfq-iosched.c:2648:32: error: 'struct request_queue' has no member named 'blkcg_mutex'
2648 | mutex_lock(&bfqd->queue->blkcg_mutex);
| ^~
include/linux/mutex.h:193:44: note: in definition of macro 'mutex_lock'
193 | #define mutex_lock(lock) mutex_lock_nested(lock, 0)
| ^~~~
block/bfq-iosched.c:2660:34: error: 'struct request_queue' has no member named 'blkcg_mutex'
2660 | mutex_unlock(&bfqd->queue->blkcg_mutex);
| ^~
vim +2648 block/bfq-iosched.c
2642
2643 static void bfq_end_wr(struct bfq_data *bfqd)
2644 {
2645 struct bfq_queue *bfqq;
2646 int i;
2647
> 2648 mutex_lock(&bfqd->queue->blkcg_mutex);
2649 spin_lock_irq(&bfqd->lock);
2650
2651 for (i = 0; i < bfqd->num_actuators; i++) {
2652 list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
2653 bfq_bfqq_end_wr(bfqq);
2654 }
2655 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2656 bfq_bfqq_end_wr(bfqq);
2657 bfq_end_wr_async(bfqd);
2658
2659 spin_unlock_irq(&bfqd->lock);
2660 mutex_unlock(&bfqd->queue->blkcg_mutex);
2661 }
2662
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
2026-02-03 12:54 ` kernel test robot
@ 2026-02-03 13:07 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 13:07 UTC (permalink / raw)
To: kernel test robot, tj, josef, axboe
Cc: oe-kbuild-all, cgroups, linux-block, linux-kernel, zhengqixing,
mkoutny, hch, ming.lei, nilay, yukuai
Hi,
在 2026/2/3 20:54, kernel test robot 写道:
> Hi Yu,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on axboe/for-next]
> [also build test ERROR on linus/master v6.19-rc8 next-20260202]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/blk-cgroup-protect-q-blkg_list-iteration-in-blkg_destroy_all-with-blkcg_mutex/20260203-161356
> base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
> patch link: https://lore.kernel.org/r/20260203080602.726505-3-yukuai%40fnnas.com
> patch subject: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
> config: i386-buildonly-randconfig-002-20260203 (https://download.01.org/0day-ci/archive/20260203/202602032018.BRG7c7LT-lkp@intel.com/config)
> compiler: gcc-13 (Debian 13.3.0-16) 13.3.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260203/202602032018.BRG7c7LT-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202602032018.BRG7c7LT-lkp@intel.com/
Looks like this is because CONFIG_BLK_CGROUP is disabled; will fix this in the next
version.
>
> All errors (new ones prefixed by >>):
>
> In file included from include/linux/seqlock.h:19,
> from include/linux/mmzone.h:17,
> from include/linux/gfp.h:7,
> from include/linux/umh.h:4,
> from include/linux/kmod.h:9,
> from include/linux/module.h:18,
> from block/bfq-iosched.c:116:
> block/bfq-iosched.c: In function 'bfq_end_wr':
>>> block/bfq-iosched.c:2648:32: error: 'struct request_queue' has no member named 'blkcg_mutex'
> 2648 | mutex_lock(&bfqd->queue->blkcg_mutex);
> | ^~
> include/linux/mutex.h:193:44: note: in definition of macro 'mutex_lock'
> 193 | #define mutex_lock(lock) mutex_lock_nested(lock, 0)
> | ^~~~
> block/bfq-iosched.c:2660:34: error: 'struct request_queue' has no member named 'blkcg_mutex'
> 2660 | mutex_unlock(&bfqd->queue->blkcg_mutex);
> | ^~
>
>
> vim +2648 block/bfq-iosched.c
>
> 2642
> 2643 static void bfq_end_wr(struct bfq_data *bfqd)
> 2644 {
> 2645 struct bfq_queue *bfqq;
> 2646 int i;
> 2647
>> 2648 mutex_lock(&bfqd->queue->blkcg_mutex);
> 2649 spin_lock_irq(&bfqd->lock);
> 2650
> 2651 for (i = 0; i < bfqd->num_actuators; i++) {
> 2652 list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
> 2653 bfq_bfqq_end_wr(bfqq);
> 2654 }
> 2655 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
> 2656 bfq_bfqq_end_wr(bfqq);
> 2657 bfq_end_wr_async(bfqd);
> 2658
> 2659 spin_unlock_irq(&bfqd->lock);
> 2660 mutex_unlock(&bfqd->queue->blkcg_mutex);
> 2661 }
> 2662
>
--
Thanks,
Kuai
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
2026-02-03 8:05 ` [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() " Yu Kuai
2026-02-03 12:54 ` kernel test robot
@ 2026-02-03 13:36 ` kernel test robot
1 sibling, 0 replies; 13+ messages in thread
From: kernel test robot @ 2026-02-03 13:36 UTC (permalink / raw)
To: Yu Kuai, tj, josef, axboe
Cc: llvm, oe-kbuild-all, cgroups, linux-block, linux-kernel, yukuai,
zhengqixing, mkoutny, hch, ming.lei, nilay
Hi Yu,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on linus/master v6.19-rc8 next-20260202]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/blk-cgroup-protect-q-blkg_list-iteration-in-blkg_destroy_all-with-blkcg_mutex/20260203-161356
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20260203080602.726505-3-yukuai%40fnnas.com
patch subject: [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex
config: s390-randconfig-002-20260203 (https://download.01.org/0day-ci/archive/20260203/202602032109.oYgANNeZ-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260203/202602032109.oYgANNeZ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602032109.oYgANNeZ-lkp@intel.com/
All errors (new ones prefixed by >>):
>> block/bfq-iosched.c:2648:27: error: no member named 'blkcg_mutex' in 'struct request_queue'
2648 | mutex_lock(&bfqd->queue->blkcg_mutex);
| ~~~~~~~~~~~ ^
include/linux/mutex.h:193:44: note: expanded from macro 'mutex_lock'
193 | #define mutex_lock(lock) mutex_lock_nested(lock, 0)
| ^~~~
block/bfq-iosched.c:2660:29: error: no member named 'blkcg_mutex' in 'struct request_queue'
2660 | mutex_unlock(&bfqd->queue->blkcg_mutex);
| ~~~~~~~~~~~ ^
2 errors generated.
vim +2648 block/bfq-iosched.c
2642
2643 static void bfq_end_wr(struct bfq_data *bfqd)
2644 {
2645 struct bfq_queue *bfqq;
2646 int i;
2647
> 2648 mutex_lock(&bfqd->queue->blkcg_mutex);
2649 spin_lock_irq(&bfqd->lock);
2650
2651 for (i = 0; i < bfqd->num_actuators; i++) {
2652 list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
2653 bfq_bfqq_end_wr(bfqq);
2654 }
2655 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2656 bfq_bfqq_end_wr(bfqq);
2657 bfq_end_wr_async(bfqd);
2658
2659 spin_unlock_irq(&bfqd->lock);
2660 mutex_unlock(&bfqd->queue->blkcg_mutex);
2661 }
2662
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v2 3/7] blk-cgroup: fix race between policy activation and blkg destruction
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
2026-02-03 8:05 ` [PATCH v2 1/7] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
2026-02-03 8:05 ` [PATCH v2 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() " Yu Kuai
@ 2026-02-03 8:05 ` Yu Kuai
2026-02-03 8:05 ` [PATCH v2 4/7] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:05 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
From: Zheng Qixing <zhengqixing@huawei.com>
When switching an IO scheduler on a block device, blkcg_activate_policy()
allocates blkg_policy_data (pd) for all blkgs attached to the queue.
However, blkcg_activate_policy() may race with concurrent blkcg deletion,
leading to use-after-free and memory leak issues.
The use-after-free occurs in the following race:
T1 (blkcg_activate_policy):
- Successfully allocates pd for blkg1 (loop0->queue, blkcgA)
- Fails to allocate pd for blkg2 (loop0->queue, blkcgB)
- Enters the enomem rollback path to release blkg1 resources
T2 (blkcg deletion):
- blkcgA is deleted concurrently
- blkg1 is freed via blkg_free_workfn()
- blkg1->pd is freed
T1 (continued):
- Rollback path accesses blkg1->pd->online after pd is freed
- Triggers use-after-free
In addition, blkg_free_workfn() frees pd before removing the blkg from
q->blkg_list. This allows blkcg_activate_policy() to allocate a new pd
for a blkg that is being destroyed, leaving the newly allocated pd
unreachable when the blkg is finally freed.
Fix these races by extending blkcg_mutex coverage to serialize
blkcg_activate_policy() rollback and blkg destruction, ensuring pd
lifecycle is synchronized with blkg list visibility.
Link: https://lore.kernel.org/all/20260108014416.3656493-3-zhengqixing@huaweicloud.com/
Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0bc7b19399b6..a6ac6ba9430d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1599,6 +1599,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (queue_is_mq(q))
memflags = blk_mq_freeze_queue(q);
+
+ mutex_lock(&q->blkcg_mutex);
retry:
spin_lock_irq(&q->queue_lock);
@@ -1661,6 +1663,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
spin_unlock_irq(&q->queue_lock);
out:
+ mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
if (pinned_blkg)
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH v2 4/7] blk-cgroup: skip dying blkg in blkcg_activate_policy()
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
` (2 preceding siblings ...)
2026-02-03 8:05 ` [PATCH v2 3/7] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
@ 2026-02-03 8:05 ` Yu Kuai
2026-02-03 8:06 ` [PATCH v2 5/7] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
` (2 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:05 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
From: Zheng Qixing <zhengqixing@huawei.com>
When switching IO schedulers on a block device, blkcg_activate_policy()
can race with concurrent blkcg deletion, leading to a use-after-free in
rcu_accelerate_cbs.
T1: T2:
blkg_destroy
kill(&blkg->refcnt) // blkg->refcnt=1->0
blkg_release // call_rcu(__blkg_release)
...
blkg_free_workfn
->pd_free_fn(pd)
elv_iosched_store
elevator_switch
...
iterate blkg list
blkg_get(blkg) // blkg->refcnt=0->1
list_del_init(&blkg->q_node)
blkg_put(pinned_blkg) // blkg->refcnt=1->0
blkg_release // call_rcu again
rcu_accelerate_cbs // uaf
Fix this by checking hlist_unhashed(&blkg->blkcg_node) before getting
a reference to the blkg. This is the same check used in blkg_destroy()
to detect if a blkg has already been destroyed. If the blkg is already
unhashed, skip processing it since it's being destroyed.
Link: https://lore.kernel.org/all/20260108014416.3656493-4-zhengqixing@huaweicloud.com/
Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a6ac6ba9430d..f5b14a1d6973 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1610,6 +1610,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (blkg->pd[pol->plid])
continue;
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
if (blkg == pinned_blkg) {
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH v2 5/7] blk-cgroup: factor policy pd teardown loop into helper
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
` (3 preceding siblings ...)
2026-02-03 8:05 ` [PATCH v2 4/7] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
@ 2026-02-03 8:06 ` Yu Kuai
2026-02-03 8:06 ` [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy() Yu Kuai
2026-02-03 8:06 ` [PATCH v2 7/7] blk-rq-qos: move rq_qos_mutex acquisition inside rq_qos_add/del Yu Kuai
6 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:06 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
From: Zheng Qixing <zhengqixing@huawei.com>
Move the teardown sequence which offlines and frees per-policy
blkg_policy_data (pd) into a helper for readability.
No functional change intended.
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 58 +++++++++++++++++++++-------------------------
1 file changed, 27 insertions(+), 31 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f5b14a1d6973..0206050f81ea 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1562,6 +1562,31 @@ struct cgroup_subsys io_cgrp_subsys = {
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
+/*
+ * Tear down per-blkg policy data for @pol on @q.
+ */
+static void blkcg_policy_teardown_pds(struct request_queue *q,
+ const struct blkcg_policy *pol)
+{
+ struct blkcg_gq *blkg;
+
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ struct blkcg *blkcg = blkg->blkcg;
+ struct blkg_policy_data *pd;
+
+ spin_lock(&blkcg->lock);
+ pd = blkg->pd[pol->plid];
+ if (pd) {
+ if (pd->online && pol->pd_offline_fn)
+ pol->pd_offline_fn(pd);
+ pd->online = false;
+ pol->pd_free_fn(pd);
+ blkg->pd[pol->plid] = NULL;
+ }
+ spin_unlock(&blkcg->lock);
+ }
+}
+
/**
* blkcg_activate_policy - activate a blkcg policy on a gendisk
* @disk: gendisk of interest
@@ -1677,21 +1702,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
enomem:
/* alloc failed, take down everything */
spin_lock_irq(&q->queue_lock);
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
- struct blkcg *blkcg = blkg->blkcg;
- struct blkg_policy_data *pd;
-
- spin_lock(&blkcg->lock);
- pd = blkg->pd[pol->plid];
- if (pd) {
- if (pd->online && pol->pd_offline_fn)
- pol->pd_offline_fn(pd);
- pd->online = false;
- pol->pd_free_fn(pd);
- blkg->pd[pol->plid] = NULL;
- }
- spin_unlock(&blkcg->lock);
- }
+ blkcg_policy_teardown_pds(q, pol);
spin_unlock_irq(&q->queue_lock);
ret = -ENOMEM;
goto out;
@@ -1710,7 +1721,6 @@ void blkcg_deactivate_policy(struct gendisk *disk,
const struct blkcg_policy *pol)
{
struct request_queue *q = disk->queue;
- struct blkcg_gq *blkg;
unsigned int memflags;
if (!blkcg_policy_enabled(q, pol))
@@ -1721,22 +1731,8 @@ void blkcg_deactivate_policy(struct gendisk *disk,
mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
-
__clear_bit(pol->plid, q->blkcg_pols);
-
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
- struct blkcg *blkcg = blkg->blkcg;
-
- spin_lock(&blkcg->lock);
- if (blkg->pd[pol->plid]) {
- if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
- pol->pd_offline_fn(blkg->pd[pol->plid]);
- pol->pd_free_fn(blkg->pd[pol->plid]);
- blkg->pd[pol->plid] = NULL;
- }
- spin_unlock(&blkcg->lock);
- }
-
+ blkcg_policy_teardown_pds(q, pol);
spin_unlock_irq(&q->queue_lock);
mutex_unlock(&q->blkcg_mutex);
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy()
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
` (4 preceding siblings ...)
2026-02-03 8:06 ` [PATCH v2 5/7] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
@ 2026-02-03 8:06 ` Yu Kuai
2026-02-03 9:06 ` Michal Koutný
2026-02-03 8:06 ` [PATCH v2 7/7] blk-rq-qos: move rq_qos_mutex acquisition inside rq_qos_add/del Yu Kuai
6 siblings, 1 reply; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:06 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
Some policies like iocost and iolatency perform percpu allocation in
pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
because percpu memory reclaim may issue IO.
Now that q->blkg_list is protected by blkcg_mutex, restructure
blkcg_activate_policy() to allocate all pds before freezing the queue:
1. Allocate all pds with GFP_KERNEL before freezing the queue
2. Freeze the queue
3. Initialize and online all pds
Note: Future work is to remove all queue freezing before
blkcg_activate_policy() to fix the deadlocks thoroughly.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
1 file changed, 32 insertions(+), 58 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0206050f81ea..7fcb216917d0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
struct request_queue *q = disk->queue;
- struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ struct blkcg_gq *blkg;
unsigned int memflags;
int ret;
@@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
return -EINVAL;
- if (queue_is_mq(q))
- memflags = blk_mq_freeze_queue(q);
-
+ /*
+ * Allocate all pds before freezing queue. Some policies like iocost
+ * and iolatency do percpu allocation in pd_alloc_fn(), which can
+ * deadlock with queue frozen because percpu memory reclaim may issue
+ * IO. blkcg_mutex protects q->blkg_list iteration.
+ */
mutex_lock(&q->blkcg_mutex);
-retry:
- spin_lock_irq(&q->queue_lock);
-
- /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
- if (blkg->pd[pol->plid])
- continue;
+ /* Skip dying blkg */
if (hlist_unhashed(&blkg->blkcg_node))
continue;
- /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
- if (blkg == pinned_blkg) {
- pd = pd_prealloc;
- pd_prealloc = NULL;
- } else {
- pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT);
- }
-
+ pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL);
if (!pd) {
- /*
- * GFP_NOWAIT failed. Free the existing one and
- * prealloc for @blkg w/ GFP_KERNEL.
- */
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- blkg_get(blkg);
- pinned_blkg = blkg;
-
- spin_unlock_irq(&q->queue_lock);
-
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_KERNEL);
- if (pd_prealloc)
- goto retry;
- else
- goto enomem;
+ ret = -ENOMEM;
+ goto err_teardown;
}
- spin_lock(&blkg->blkcg->lock);
-
pd->blkg = blkg;
pd->plid = pol->plid;
+ pd->online = false;
blkg->pd[pol->plid] = pd;
+ }
+ /* Now freeze queue and initialize/online all pds */
+ if (queue_is_mq(q))
+ memflags = blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
+ struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+ /* Skip dying blkg */
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
+
+ spin_lock(&blkg->blkcg->lock);
if (pol->pd_init_fn)
pol->pd_init_fn(pd);
-
if (pol->pd_online_fn)
pol->pd_online_fn(pd);
pd->online = true;
-
spin_unlock(&blkg->blkcg->lock);
}
__set_bit(pol->plid, q->blkcg_pols);
- ret = 0;
-
spin_unlock_irq(&q->queue_lock);
-out:
- mutex_unlock(&q->blkcg_mutex);
+
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- return ret;
+ mutex_unlock(&q->blkcg_mutex);
+ return 0;
-enomem:
- /* alloc failed, take down everything */
- spin_lock_irq(&q->queue_lock);
+err_teardown:
blkcg_policy_teardown_pds(q, pol);
- spin_unlock_irq(&q->queue_lock);
- ret = -ENOMEM;
- goto out;
+ mutex_unlock(&q->blkcg_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy()
2026-02-03 8:06 ` [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy() Yu Kuai
@ 2026-02-03 9:06 ` Michal Koutný
2026-02-04 6:44 ` Yu Kuai
0 siblings, 1 reply; 13+ messages in thread
From: Michal Koutný @ 2026-02-03 9:06 UTC (permalink / raw)
To: Yu Kuai
Cc: tj, josef, axboe, cgroups, linux-block, linux-kernel, zhengqixing,
hch, ming.lei, nilay
[-- Attachment #1: Type: text/plain, Size: 2620 bytes --]
On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
> Some policies like iocost and iolatency perform percpu allocation in
> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
> because percpu memory reclaim may issue IO.
>
> Now that q->blkg_list is protected by blkcg_mutex,
With this ^^^
...
> restructure
> blkcg_activate_policy() to allocate all pds before freezing the queue:
> 1. Allocate all pds with GFP_KERNEL before freezing the queue
> 2. Freeze the queue
> 3. Initialize and online all pds
>
> Note: Future work is to remove all queue freezing before
> blkcg_activate_policy() to fix the deadlocks thoroughly.
>
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
> ---
> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
> 1 file changed, 32 insertions(+), 58 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 0206050f81ea..7fcb216917d0 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> {
> struct request_queue *q = disk->queue;
> - struct blkg_policy_data *pd_prealloc = NULL;
> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
> + struct blkcg_gq *blkg;
> unsigned int memflags;
> int ret;
>
> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
...
> + /* Now freeze queue and initialize/online all pds */
> + if (queue_is_mq(q))
> + memflags = blk_mq_freeze_queue(q);
> +
> + spin_lock_irq(&q->queue_lock);
> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
> +
> + /* Skip dying blkg */
> + if (hlist_unhashed(&blkg->blkcg_node))
> + continue;
> +
> + spin_lock(&blkg->blkcg->lock);
> if (pol->pd_init_fn)
> pol->pd_init_fn(pd);
> -
> if (pol->pd_online_fn)
> pol->pd_online_fn(pd);
> pd->online = true;
> -
> spin_unlock(&blkg->blkcg->lock);
> }
>
> __set_bit(pol->plid, q->blkcg_pols);
> - ret = 0;
> -
> spin_unlock_irq(&q->queue_lock);
> -out:
> - mutex_unlock(&q->blkcg_mutex);
> +
> if (queue_is_mq(q))
> blk_mq_unfreeze_queue(q, memflags);
> - if (pinned_blkg)
> - blkg_put(pinned_blkg);
> - if (pd_prealloc)
> - pol->pd_free_fn(pd_prealloc);
> - return ret;
> + mutex_unlock(&q->blkcg_mutex);
> + return 0;
Why is q->queue_lock still needed here?
Thanks,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy()
2026-02-03 9:06 ` Michal Koutný
@ 2026-02-04 6:44 ` Yu Kuai
0 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-04 6:44 UTC (permalink / raw)
To: Michal Koutný
Cc: tj, josef, axboe, cgroups, linux-block, linux-kernel, zhengqixing,
hch, ming.lei, nilay, yukuai
Hi,
On 2026/2/3 17:06, Michal Koutný wrote:
> On Tue, Feb 03, 2026 at 04:06:01PM +0800, Yu Kuai <yukuai@fnnas.com> wrote:
>> Some policies like iocost and iolatency perform percpu allocation in
>> pd_alloc_fn(). Percpu allocation with queue frozen can cause deadlock
>> because percpu memory reclaim may issue IO.
>>
>> Now that q->blkg_list is protected by blkcg_mutex,
> With this ^^^
>
> ...
>> restructure
>> blkcg_activate_policy() to allocate all pds before freezing the queue:
>> 1. Allocate all pds with GFP_KERNEL before freezing the queue
>> 2. Freeze the queue
>> 3. Initialize and online all pds
>>
>> Note: Future work is to remove all queue freezing before
>> blkcg_activate_policy() to fix the deadlocks thoroughly.
>>
>> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
>> ---
>> block/blk-cgroup.c | 90 +++++++++++++++++-----------------------------
>> 1 file changed, 32 insertions(+), 58 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 0206050f81ea..7fcb216917d0 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
>> int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
>> {
>> struct request_queue *q = disk->queue;
>> - struct blkg_policy_data *pd_prealloc = NULL;
>> - struct blkcg_gq *blkg, *pinned_blkg = NULL;
>> + struct blkcg_gq *blkg;
>> unsigned int memflags;
>> int ret;
>>
>> @@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
> ...
>
>> + /* Now freeze queue and initialize/online all pds */
>> + if (queue_is_mq(q))
>> + memflags = blk_mq_freeze_queue(q);
>> +
>> + spin_lock_irq(&q->queue_lock);
>> + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>> + struct blkg_policy_data *pd = blkg->pd[pol->plid];
>> +
>> + /* Skip dying blkg */
>> + if (hlist_unhashed(&blkg->blkcg_node))
>> + continue;
>> +
>> + spin_lock(&blkg->blkcg->lock);
>> if (pol->pd_init_fn)
>> pol->pd_init_fn(pd);
>> -
>> if (pol->pd_online_fn)
>> pol->pd_online_fn(pd);
>> pd->online = true;
>> -
>> spin_unlock(&blkg->blkcg->lock);
>> }
>>
>> __set_bit(pol->plid, q->blkcg_pols);
>> - ret = 0;
>> -
>> spin_unlock_irq(&q->queue_lock);
>> -out:
>> - mutex_unlock(&q->blkcg_mutex);
>> +
>> if (queue_is_mq(q))
>> blk_mq_unfreeze_queue(q, memflags);
>> - if (pinned_blkg)
>> - blkg_put(pinned_blkg);
>> - if (pd_prealloc)
>> - pol->pd_free_fn(pd_prealloc);
>> - return ret;
>> + mutex_unlock(&q->blkcg_mutex);
>> + return 0;
> Why is q->queue_lock still needed here?
I do want to remove queue_lock for accessing blkgs. However, this set just protects q->blkg_list
with blkcg_mutex, and I'll remove the queue_lock after everything is converted to blkcg_mutex.
>
> Thanks,
> Michal
--
Thanks,
Kuai
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v2 7/7] blk-rq-qos: move rq_qos_mutex acquisition inside rq_qos_add/del
2026-02-03 8:05 [PATCH v2 0/7] blk-cgroup: fix races and deadlocks Yu Kuai
` (5 preceding siblings ...)
2026-02-03 8:06 ` [PATCH v2 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy() Yu Kuai
@ 2026-02-03 8:06 ` Yu Kuai
6 siblings, 0 replies; 13+ messages in thread
From: Yu Kuai @ 2026-02-03 8:06 UTC (permalink / raw)
To: tj, josef, axboe
Cc: cgroups, linux-block, linux-kernel, yukuai, zhengqixing, mkoutny,
hch, ming.lei, nilay
The current rq_qos_mutex handling has an awkward pattern where callers
must acquire the mutex before calling rq_qos_add()/rq_qos_del(), and
blkg_conf_open_bdev_frozen() had to release and re-acquire the mutex
around queue freezing to maintain proper locking order (freeze queue
before mutex).
On the other hand, with rq_qos_mutex held after blkg_conf_prep(), there
are many possible deadlocks:
- allocating memory with GFP_KERNEL, like blk_throtl_init();
- allocating percpu memory, like pd_alloc_fn() for iocost/iolatency;
This patch refactors the locking by:
1. Moving queue freeze and rq_qos_mutex acquisition inside
rq_qos_add()/rq_qos_del(), with the correct order: freeze first,
then acquire mutex.
2. Removing external mutex handling from wbt_init() since rq_qos_add()
now handles it internally.
3. Removing rq_qos_mutex handling from blkg_conf_open_bdev() entirely,
making it only responsible for parsing MAJ:MIN and opening the bdev.
4. Removing blkg_conf_open_bdev_frozen() and blkg_conf_exit_frozen()
functions which are no longer needed.
5. Updating ioc_qos_write() to use the simpler blkg_conf_open_bdev()
and blkg_conf_exit() functions.
This eliminates the release-and-reacquire pattern and makes
rq_qos_add()/rq_qos_del() self-contained, which is cleaner and reduces
complexity. Each function now properly manages its own locking with
the correct order: queue freeze → mutex acquire → modify → mutex
release → queue unfreeze.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 50 ----------------------------------------------
block/blk-cgroup.h | 2 --
block/blk-iocost.c | 11 ++++------
block/blk-rq-qos.c | 31 ++++++++++++++++------------
block/blk-wbt.c | 2 --
5 files changed, 22 insertions(+), 74 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7fcb216917d0..d17d2b44df43 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -802,10 +802,8 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -ENODEV;
}
- mutex_lock(&bdev->bd_queue->rq_qos_mutex);
if (!disk_live(bdev->bd_disk)) {
blkdev_put_no_open(bdev);
- mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
return -ENODEV;
}
@@ -813,38 +811,6 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
-/*
- * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * ensures the correct locking order between freeze queue and q->rq_qos_mutex.
- *
- * This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to blkg_conf_exit_frozen
- * for restoring the memalloc scope.
- */
-unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
-{
- int ret;
- unsigned long memflags;
-
- if (ctx->bdev)
- return -EINVAL;
-
- ret = blkg_conf_open_bdev(ctx);
- if (ret < 0)
- return ret;
- /*
- * At this point, we haven’t started protecting anything related to QoS,
- * so we release q->rq_qos_mutex here, which was first acquired in blkg_
- * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
- * the queue to maintain the correct locking order.
- */
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
- mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- return memflags;
-}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
@@ -978,7 +944,6 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
*/
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_queue->queue_lock)
- __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
if (ctx->blkg) {
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
@@ -986,7 +951,6 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
if (ctx->bdev) {
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
blkdev_put_no_open(ctx->bdev);
ctx->body = NULL;
ctx->bdev = NULL;
@@ -994,20 +958,6 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
-/*
- * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used
- * when blkg_conf_open_bdev_frozen is used to open the bdev.
- */
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
-{
- if (ctx->bdev) {
- struct request_queue *q = ctx->bdev->bd_queue;
-
- blkg_conf_exit(ctx);
- blk_mq_unfreeze_queue(q, memflags);
- }
-}
-
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
int i;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cce3294634d..d4e7f78ba545 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -219,11 +219,9 @@ struct blkg_conf_ctx {
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
-unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index ef543d163d46..104a9a9f563f 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3220,16 +3220,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
- unsigned long memflags;
int ret;
blkg_conf_init(&ctx, input);
- memflags = blkg_conf_open_bdev_frozen(&ctx);
- if (IS_ERR_VALUE(memflags)) {
- ret = memflags;
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
goto err;
- }
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3346,14 +3343,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blk_mq_unquiesce_queue(disk->queue);
- blkg_conf_exit_frozen(&ctx, memflags);
+ blkg_conf_exit(&ctx);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
ret = -EINVAL;
err:
- blkg_conf_exit_frozen(&ctx, memflags);
+ blkg_conf_exit(&ctx);
return ret;
}
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 85cf74402a09..fe96183bcc75 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -327,8 +327,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
{
struct request_queue *q = disk->queue;
unsigned int memflags;
-
- lockdep_assert_held(&q->rq_qos_mutex);
+ int ret = 0;
rqos->disk = disk;
rqos->id = id;
@@ -337,20 +336,24 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
+ *
+ * Acquire rq_qos_mutex after freezing the queue to ensure proper
+ * locking order.
*/
memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->rq_qos_mutex);
- if (rq_qos_id(q, rqos->id))
- goto ebusy;
- rqos->next = q->rq_qos;
- q->rq_qos = rqos;
- blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
+ if (rq_qos_id(q, rqos->id)) {
+ ret = -EBUSY;
+ } else {
+ rqos->next = q->rq_qos;
+ q->rq_qos = rqos;
+ blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
+ }
+ mutex_unlock(&q->rq_qos_mutex);
blk_mq_unfreeze_queue(q, memflags);
- return 0;
-ebusy:
- blk_mq_unfreeze_queue(q, memflags);
- return -EBUSY;
+ return ret;
}
void rq_qos_del(struct rq_qos *rqos)
@@ -359,9 +362,9 @@ void rq_qos_del(struct rq_qos *rqos)
struct rq_qos **cur;
unsigned int memflags;
- lockdep_assert_held(&q->rq_qos_mutex);
-
memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->rq_qos_mutex);
+
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
@@ -370,5 +373,7 @@ void rq_qos_del(struct rq_qos *rqos)
}
if (!q->rq_qos)
blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
+
+ mutex_unlock(&q->rq_qos_mutex);
blk_mq_unfreeze_queue(q, memflags);
}
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1415f2bf8611..a636dea27270 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -960,9 +960,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
/*
* Assign rwb and add the stats callback.
*/
- mutex_lock(&q->rq_qos_mutex);
ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
- mutex_unlock(&q->rq_qos_mutex);
if (ret)
return ret;
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread