Linux block layer
 help / color / mirror / Atom feed
* [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
@ 2026-06-15 11:55 Zizhi Wo
  2026-06-15 16:16 ` Yu Kuai
  0 siblings, 1 reply; 4+ messages in thread
From: Zizhi Wo @ 2026-06-15 11:55 UTC (permalink / raw)
  To: axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, yukuai, houtao1, wozizhi

From: Zizhi Wo <wozizhi@huawei.com>

[BUG]
Our fuzz testing triggered a blkcg use-after-free issue:

  BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
  Call Trace:
  ...
  blkcg_deactivate_policy+0x244/0x4d0
  ioc_rqos_exit+0x44/0xe0
  rq_qos_exit+0xba/0x120
  __del_gendisk+0x50b/0x800
  del_gendisk+0xff/0x190
  ...

[CAUSE]
process1						process2
cgroup_rmdir
...
  css_killed_work_fn
    offline_css
    ...
      blkcg_destroy_blkgs
      ...
        __blkg_release
	  css_put(&blkg->blkcg->css)
          blkg_free
	    INIT_WORK(xxx, blkg_free_workfn)
	    schedule_work
    css_put
    ...
      blkcg_css_free
        kfree(blkcg)--------blkcg has been freed!!!
====================================schedule_work
              blkg_free_workfn
							__del_gendisk
							  rq_qos_exit
							    ioc_rqos_exit
							      blkcg_deactivate_policy
							        mutex_lock(&q->blkcg_mutex)
								spin_lock_irq(&q->queue_lock)
							        list_for_each_entry(blkg, xxx)
								  blkcg = blkg->blkcg
								  spin_lock(&blkcg->lock)-------UAF!!!
	        mutex_lock(&q->blkcg_mutex)
	        spin_lock_irq(&q->queue_lock)
	        /* Only then is the blkg removed from the list */
	        list_del_init(&blkg->q_node)

As a result, a blkg can still be reachable through q->blkg_list while
its ->blkcg has already been freed.

[Fix]
Fix this by deferring the blkcg css_put() until after the blkg has been
unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
blkcg outlives every blkg still reachable through q->blkg_list, so any
iterator holding q->queue_lock is guaranteed to observe a valid
blkg->blkcg.

While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
so that the css reference is owned by the alloc/free pair rather than
straddling layers:
blkg_alloc()  <-> blkg_free()
blkg_create() <-> blkg_destroy()

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Suggested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
v2:
 - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
   css reference follows the blkg's own lifetime, making the put in
   blkg_free_workfn() symmetric with the get in blkg_alloc().

v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/

 block/blk-cgroup.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc63bd220865..27414c291e49 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work)
 	if (blkg->parent)
 		blkg_put(blkg->parent);
 	spin_lock_irq(&q->queue_lock);
 	list_del_init(&blkg->q_node);
 	spin_unlock_irq(&q->queue_lock);
+	/*
+	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
+	 * so concurrent iterators won't see a blkg with a freed blkcg.
+	 */
+	css_put(&blkg->blkcg->css);
 	mutex_unlock(&q->blkcg_mutex);
 
 	blk_put_queue(q);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
@@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu)
 	 * blkg_stat_lock is for serializing blkg stat update
 	 */
 	for_each_possible_cpu(cpu)
 		__blkcg_rstat_flush(blkcg, cpu);
 
-	/* release the blkcg and parent blkg refs this blkg has been holding */
-	css_put(&blkg->blkcg->css);
 	blkg_free(blkg);
 }
 
 /*
  * A group is RCU protected, but having an rcu lock does not mean that one
@@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
 	if (!blkg->iostat_cpu)
 		goto out_exit_refcnt;
 	if (!blk_get_queue(disk->queue))
 		goto out_free_iostat;
+	/* blkg holds a reference to blkcg */
+	if (!css_tryget_online(&blkcg->css))
+		goto out_put_queue;
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
 	blkg->iostat.blkg = blkg;
@@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 
 out_free_pds:
 	while (--i >= 0)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+	css_put(&blkcg->css);
+out_put_queue:
 	blk_put_queue(disk->queue);
 out_free_iostat:
 	free_percpu(blkg->iostat_cpu);
 out_exit_refcnt:
 	percpu_ref_exit(&blkg->refcnt);
@@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 	if (blk_queue_dying(disk->queue)) {
 		ret = -ENODEV;
 		goto err_free_blkg;
 	}
 
-	/* blkg holds a reference to blkcg */
-	if (!css_tryget_online(&blkcg->css)) {
-		ret = -ENODEV;
-		goto err_free_blkg;
-	}
-
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 	}
 	blkg = new_blkg;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -ENODEV;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 		blkg_get(blkg->parent);
 	}
 
 	/* invoke per-policy init */
@@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 
 	/* @blkg failed fully initialized, use the usual release path */
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
-err_put_css:
-	css_put(&blkcg->css);
 err_free_blkg:
 	if (new_blkg)
 		blkg_free(new_blkg);
 	return ERR_PTR(ret);
 }
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
  2026-06-15 11:55 [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue Zizhi Wo
@ 2026-06-15 16:16 ` Yu Kuai
  2026-06-16  1:23   ` Hou Tao
  0 siblings, 1 reply; 4+ messages in thread
From: Yu Kuai @ 2026-06-15 16:16 UTC (permalink / raw)
  To: Zizhi Wo, axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, houtao1, yukuai

Hi,

在 2026/6/15 19:55, Zizhi Wo 写道:
> From: Zizhi Wo <wozizhi@huawei.com>
>
> [BUG]
> Our fuzz testing triggered a blkcg use-after-free issue:
>
>    BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
>    Call Trace:
>    ...
>    blkcg_deactivate_policy+0x244/0x4d0
>    ioc_rqos_exit+0x44/0xe0
>    rq_qos_exit+0xba/0x120
>    __del_gendisk+0x50b/0x800
>    del_gendisk+0xff/0x190
>    ...
>
> [CAUSE]
> process1						process2
> cgroup_rmdir
> ...
>    css_killed_work_fn
>      offline_css
>      ...
>        blkcg_destroy_blkgs
>        ...
>          __blkg_release
> 	  css_put(&blkg->blkcg->css)
>            blkg_free
> 	    INIT_WORK(xxx, blkg_free_workfn)
> 	    schedule_work
>      css_put
>      ...
>        blkcg_css_free
>          kfree(blkcg)--------blkcg has been freed!!!
> ====================================schedule_work
>                blkg_free_workfn
> 							__del_gendisk
> 							  rq_qos_exit
> 							    ioc_rqos_exit
> 							      blkcg_deactivate_policy
> 							        mutex_lock(&q->blkcg_mutex)
> 								spin_lock_irq(&q->queue_lock)
> 							        list_for_each_entry(blkg, xxx)
> 								  blkcg = blkg->blkcg
> 								  spin_lock(&blkcg->lock)-------UAF!!!
> 	        mutex_lock(&q->blkcg_mutex)
> 	        spin_lock_irq(&q->queue_lock)
> 	        /* Only then is the blkg removed from the list */
> 	        list_del_init(&blkg->q_node)
>
> As a result, a blkg can still be reachable through q->blkg_list while
> its ->blkcg has already been freed.
>
> [Fix]
> Fix this by deferring the blkcg css_put() until after the blkg has been
> unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
> blkcg outlives every blkg still reachable through q->blkg_list, so any
> iterator holding q->queue_lock is guaranteed to observe a valid
> blkg->blkcg.
>
> While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
> so that the css reference is owned by the alloc/free pair rather than
> straddling layers:
> blkg_alloc()  <-> blkg_free()
> blkg_create() <-> blkg_destroy()
>
> Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
> Suggested-by: Hou Tao <houtao1@huawei.com>
> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
> ---
> v2:
>   - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
>     css reference follows the blkg's own lifetime, making the put in
>     blkg_free_workfn() symmetric with the get in blkg_alloc().
>
> v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/
>
>   block/blk-cgroup.c | 24 ++++++++++++------------
>   1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index bc63bd220865..27414c291e49 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work)
>   	if (blkg->parent)
>   		blkg_put(blkg->parent);
>   	spin_lock_irq(&q->queue_lock);
>   	list_del_init(&blkg->q_node);
>   	spin_unlock_irq(&q->queue_lock);
> +	/*
> +	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
> +	 * so concurrent iterators won't see a blkg with a freed blkcg.
> +	 */
> +	css_put(&blkg->blkcg->css);
>   	mutex_unlock(&q->blkcg_mutex);

Please move css_put after mutex_unlock, unless there is a strong reason.

With above change, feel free to add:

Reviewed-by: Yu Kuai <yukuai@fygo.io>

>   
>   	blk_put_queue(q);
>   	free_percpu(blkg->iostat_cpu);
>   	percpu_ref_exit(&blkg->refcnt);
> @@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu)
>   	 * blkg_stat_lock is for serializing blkg stat update
>   	 */
>   	for_each_possible_cpu(cpu)
>   		__blkcg_rstat_flush(blkcg, cpu);
>   
> -	/* release the blkcg and parent blkg refs this blkg has been holding */
> -	css_put(&blkg->blkcg->css);
>   	blkg_free(blkg);
>   }
>   
>   /*
>    * A group is RCU protected, but having an rcu lock does not mean that one
> @@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>   	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
>   	if (!blkg->iostat_cpu)
>   		goto out_exit_refcnt;
>   	if (!blk_get_queue(disk->queue))
>   		goto out_free_iostat;
> +	/* blkg holds a reference to blkcg */
> +	if (!css_tryget_online(&blkcg->css))
> +		goto out_put_queue;
>   
>   	blkg->q = disk->queue;
>   	INIT_LIST_HEAD(&blkg->q_node);
>   	blkg->blkcg = blkcg;
>   	blkg->iostat.blkg = blkg;
> @@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>   
>   out_free_pds:
>   	while (--i >= 0)
>   		if (blkg->pd[i])
>   			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
> +	css_put(&blkcg->css);
> +out_put_queue:
>   	blk_put_queue(disk->queue);
>   out_free_iostat:
>   	free_percpu(blkg->iostat_cpu);
>   out_exit_refcnt:
>   	percpu_ref_exit(&blkg->refcnt);
> @@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>   	if (blk_queue_dying(disk->queue)) {
>   		ret = -ENODEV;
>   		goto err_free_blkg;
>   	}
>   
> -	/* blkg holds a reference to blkcg */
> -	if (!css_tryget_online(&blkcg->css)) {
> -		ret = -ENODEV;
> -		goto err_free_blkg;
> -	}
> -
>   	/* allocate */
>   	if (!new_blkg) {
>   		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
>   		if (unlikely(!new_blkg)) {
>   			ret = -ENOMEM;
> -			goto err_put_css;
> +			goto err_free_blkg;
>   		}
>   	}
>   	blkg = new_blkg;
>   
>   	/* link parent */
>   	if (blkcg_parent(blkcg)) {
>   		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
>   		if (WARN_ON_ONCE(!blkg->parent)) {
>   			ret = -ENODEV;
> -			goto err_put_css;
> +			goto err_free_blkg;
>   		}
>   		blkg_get(blkg->parent);
>   	}
>   
>   	/* invoke per-policy init */
> @@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>   
>   	/* @blkg failed fully initialized, use the usual release path */
>   	blkg_put(blkg);
>   	return ERR_PTR(ret);
>   
> -err_put_css:
> -	css_put(&blkcg->css);
>   err_free_blkg:
>   	if (new_blkg)
>   		blkg_free(new_blkg);
>   	return ERR_PTR(ret);
>   }

-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
  2026-06-15 16:16 ` Yu Kuai
@ 2026-06-16  1:23   ` Hou Tao
  2026-06-16 16:44     ` Tang Yizhou
  0 siblings, 1 reply; 4+ messages in thread
From: Hou Tao @ 2026-06-16  1:23 UTC (permalink / raw)
  To: yukuai, Zizhi Wo, axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1

Hi,

On 6/16/2026 12:16 AM, Yu Kuai wrote:
> Hi,
>
> 在 2026/6/15 19:55, Zizhi Wo 写道:
>> From: Zizhi Wo <wozizhi@huawei.com>
>>
>> [BUG]
>> Our fuzz testing triggered a blkcg use-after-free issue:
>>
>>    BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
>>    Call Trace:
>>    ...
>>    blkcg_deactivate_policy+0x244/0x4d0
>>    ioc_rqos_exit+0x44/0xe0
>>    rq_qos_exit+0xba/0x120
>>    __del_gendisk+0x50b/0x800
>>    del_gendisk+0xff/0x190
>>    ...
>>
>> [CAUSE]
>> process1						process2
>> cgroup_rmdir
>> ...
>>    css_killed_work_fn
>>      offline_css
>>      ...
>>        blkcg_destroy_blkgs
>>        ...
>>          __blkg_release
>> 	  css_put(&blkg->blkcg->css)
>>            blkg_free
>> 	    INIT_WORK(xxx, blkg_free_workfn)
>> 	    schedule_work
>>      css_put
>>      ...
>>        blkcg_css_free
>>          kfree(blkcg)--------blkcg has been freed!!!
>> ====================================schedule_work
>>                blkg_free_workfn
>> 							__del_gendisk
>> 							  rq_qos_exit
>> 							    ioc_rqos_exit
>> 							      blkcg_deactivate_policy
>> 							        mutex_lock(&q->blkcg_mutex)
>> 								spin_lock_irq(&q->queue_lock)
>> 							        list_for_each_entry(blkg, xxx)
>> 								  blkcg = blkg->blkcg
>> 								  spin_lock(&blkcg->lock)-------UAF!!!
>> 	        mutex_lock(&q->blkcg_mutex)
>> 	        spin_lock_irq(&q->queue_lock)
>> 	        /* Only then is the blkg removed from the list */
>> 	        list_del_init(&blkg->q_node)
>>
>> As a result, a blkg can still be reachable through q->blkg_list while
>> its ->blkcg has already been freed.
>>
>> [Fix]
>> Fix this by deferring the blkcg css_put() until after the blkg has been
>> unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
>> blkcg outlives every blkg still reachable through q->blkg_list, so any
>> iterator holding q->queue_lock is guaranteed to observe a valid
>> blkg->blkcg.
>>
>> While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
>> so that the css reference is owned by the alloc/free pair rather than
>> straddling layers:
>> blkg_alloc()  <-> blkg_free()
>> blkg_create() <-> blkg_destroy()
>>
>> Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
>> Suggested-by: Hou Tao <houtao1@huawei.com>
>> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
>> ---
>> v2:
>>   - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
>>     css reference follows the blkg's own lifetime, making the put in
>>     blkg_free_workfn() symmetric with the get in blkg_alloc().
>>
>> v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/
>>
>>   block/blk-cgroup.c | 24 ++++++++++++------------
>>   1 file changed, 12 insertions(+), 12 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index bc63bd220865..27414c291e49 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work)
>>   	if (blkg->parent)
>>   		blkg_put(blkg->parent);
>>   	spin_lock_irq(&q->queue_lock);
>>   	list_del_init(&blkg->q_node);
>>   	spin_unlock_irq(&q->queue_lock);
>> +	/*
>> +	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
>> +	 * so concurrent iterators won't see a blkg with a freed blkcg.
>> +	 */
>> +	css_put(&blkg->blkcg->css);
>>   	mutex_unlock(&q->blkcg_mutex);
> Please move css_put after mutex_unlock, unless there is a strong reason.

I think blkcg_mutex is used here to serialize access to blkg->q_node
and blkg->blkcg. We could move the css_put() after the mutex_unlock(),
but it would still depend implicitly on the mutex_lock()/mutex_unlock()
pair on blkcg_mutex. Rather than rely on that implicit dependency, we
keep the css_put() inside the lock to make it explicit.
>
> With above change, feel free to add:
>
> Reviewed-by: Yu Kuai <yukuai@fygo.io>
>
>>   
>>   	blk_put_queue(q);
>>   	free_percpu(blkg->iostat_cpu);
>>   	percpu_ref_exit(&blkg->refcnt);
>> @@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu)
>>   	 * blkg_stat_lock is for serializing blkg stat update
>>   	 */
>>   	for_each_possible_cpu(cpu)
>>   		__blkcg_rstat_flush(blkcg, cpu);
>>   
>> -	/* release the blkcg and parent blkg refs this blkg has been holding */
>> -	css_put(&blkg->blkcg->css);
>>   	blkg_free(blkg);
>>   }
>>   
>>   /*
>>    * A group is RCU protected, but having an rcu lock does not mean that one
>> @@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>>   	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
>>   	if (!blkg->iostat_cpu)
>>   		goto out_exit_refcnt;
>>   	if (!blk_get_queue(disk->queue))
>>   		goto out_free_iostat;
>> +	/* blkg holds a reference to blkcg */
>> +	if (!css_tryget_online(&blkcg->css))
>> +		goto out_put_queue;
>>   
>>   	blkg->q = disk->queue;
>>   	INIT_LIST_HEAD(&blkg->q_node);
>>   	blkg->blkcg = blkcg;
>>   	blkg->iostat.blkg = blkg;
>> @@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>>   
>>   out_free_pds:
>>   	while (--i >= 0)
>>   		if (blkg->pd[i])
>>   			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
>> +	css_put(&blkcg->css);
>> +out_put_queue:
>>   	blk_put_queue(disk->queue);
>>   out_free_iostat:
>>   	free_percpu(blkg->iostat_cpu);
>>   out_exit_refcnt:
>>   	percpu_ref_exit(&blkg->refcnt);
>> @@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>>   	if (blk_queue_dying(disk->queue)) {
>>   		ret = -ENODEV;
>>   		goto err_free_blkg;
>>   	}
>>   
>> -	/* blkg holds a reference to blkcg */
>> -	if (!css_tryget_online(&blkcg->css)) {
>> -		ret = -ENODEV;
>> -		goto err_free_blkg;
>> -	}
>> -
>>   	/* allocate */
>>   	if (!new_blkg) {
>>   		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
>>   		if (unlikely(!new_blkg)) {
>>   			ret = -ENOMEM;
>> -			goto err_put_css;
>> +			goto err_free_blkg;
>>   		}
>>   	}
>>   	blkg = new_blkg;
>>   
>>   	/* link parent */
>>   	if (blkcg_parent(blkcg)) {
>>   		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
>>   		if (WARN_ON_ONCE(!blkg->parent)) {
>>   			ret = -ENODEV;
>> -			goto err_put_css;
>> +			goto err_free_blkg;
>>   		}
>>   		blkg_get(blkg->parent);
>>   	}
>>   
>>   	/* invoke per-policy init */
>> @@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>>   
>>   	/* @blkg failed fully initialized, use the usual release path */
>>   	blkg_put(blkg);
>>   	return ERR_PTR(ret);
>>   
>> -err_put_css:
>> -	css_put(&blkcg->css);
>>   err_free_blkg:
>>   	if (new_blkg)
>>   		blkg_free(new_blkg);
>>   	return ERR_PTR(ret);
>>   }


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
  2026-06-16  1:23   ` Hou Tao
@ 2026-06-16 16:44     ` Tang Yizhou
  0 siblings, 0 replies; 4+ messages in thread
From: Tang Yizhou @ 2026-06-16 16:44 UTC (permalink / raw)
  To: Hou Tao, yukuai, Zizhi Wo, axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1

On 16/6/26 9:23 am, Hou Tao wrote:
> Hi,
> 
> On 6/16/2026 12:16 AM, Yu Kuai wrote:
>> Hi,
>>
>> 在 2026/6/15 19:55, Zizhi Wo 写道:
>>> From: Zizhi Wo <wozizhi@huawei.com>
>>>
>>> [BUG]
>>> Our fuzz testing triggered a blkcg use-after-free issue:
>>>
>>>    BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
>>>    Call Trace:
>>>    ...
>>>    blkcg_deactivate_policy+0x244/0x4d0
>>>    ioc_rqos_exit+0x44/0xe0
>>>    rq_qos_exit+0xba/0x120
>>>    __del_gendisk+0x50b/0x800
>>>    del_gendisk+0xff/0x190
>>>    ...
>>>
>>> [CAUSE]
>>> process1						process2
>>> cgroup_rmdir
>>> ...
>>>    css_killed_work_fn
>>>      offline_css
>>>      ...
>>>        blkcg_destroy_blkgs
>>>        ...
>>>          __blkg_release
>>> 	  css_put(&blkg->blkcg->css)
>>>            blkg_free
>>> 	    INIT_WORK(xxx, blkg_free_workfn)
>>> 	    schedule_work
>>>      css_put
>>>      ...
>>>        blkcg_css_free
>>>          kfree(blkcg)--------blkcg has been freed!!!
>>> ====================================schedule_work
>>>                blkg_free_workfn
>>> 							__del_gendisk
>>> 							  rq_qos_exit
>>> 							    ioc_rqos_exit
>>> 							      blkcg_deactivate_policy
>>> 							        mutex_lock(&q->blkcg_mutex)
>>> 								spin_lock_irq(&q->queue_lock)
>>> 							        list_for_each_entry(blkg, xxx)
>>> 								  blkcg = blkg->blkcg
>>> 								  spin_lock(&blkcg->lock)-------UAF!!!
>>> 	        mutex_lock(&q->blkcg_mutex)
>>> 	        spin_lock_irq(&q->queue_lock)
>>> 	        /* Only then is the blkg removed from the list */
>>> 	        list_del_init(&blkg->q_node)
>>>
>>> As a result, a blkg can still be reachable through q->blkg_list while
>>> its ->blkcg has already been freed.
>>>
>>> [Fix]
>>> Fix this by deferring the blkcg css_put() until after the blkg has been
>>> unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
>>> blkcg outlives every blkg still reachable through q->blkg_list, so any
>>> iterator holding q->queue_lock is guaranteed to observe a valid
>>> blkg->blkcg.
>>>
>>> While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
>>> so that the css reference is owned by the alloc/free pair rather than
>>> straddling layers:
>>> blkg_alloc()  <-> blkg_free()
>>> blkg_create() <-> blkg_destroy()
>>>
>>> Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
>>> Suggested-by: Hou Tao <houtao1@huawei.com>
>>> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
>>> ---
>>> v2:
>>>   - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
>>>     css reference follows the blkg's own lifetime, making the put in
>>>     blkg_free_workfn() symmetric with the get in blkg_alloc().
>>>
>>> v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/
>>>
>>>   block/blk-cgroup.c | 24 ++++++++++++------------
>>>   1 file changed, 12 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index bc63bd220865..27414c291e49 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work)
>>>   	if (blkg->parent)
>>>   		blkg_put(blkg->parent);
>>>   	spin_lock_irq(&q->queue_lock);
>>>   	list_del_init(&blkg->q_node);
>>>   	spin_unlock_irq(&q->queue_lock);
>>> +	/*
>>> +	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
>>> +	 * so concurrent iterators won't see a blkg with a freed blkcg.
>>> +	 */
>>> +	css_put(&blkg->blkcg->css);
>>>   	mutex_unlock(&q->blkcg_mutex);
>> Please move css_put after mutex_unlock, unless there is a strong reason.
> 
> I think blkcg_mutex is used here to serialize access to blkg->q_node
> and blkg->blkcg. We could move the css_put() after the mutex_unlock(),
> but it would still depend implicitly on the mutex_lock()/mutex_unlock()
> pair on blkcg_mutex. Rather than rely on that implicit dependency, we
> keep the css_put() inside the lock to make it explicit.

Hi, I think I understand your point. Keeping css_put() inside blkcg_mutex makes the dependency explicit, since the same mutex serializes both the removal of blkg->q_node and the access to blkg->blkcg.

Placing css_put() after mutex_unlock(&q->blkcg_mutex) is still functionally correct. The blkg has already been removed from q->blkg_list under the mutex, so once we drop the mutex no iterator can reach this blkg anymore.

The benefit of moving it out is a smaller critical section.

-- 
Best Regards,
Yi

>>
>> With above change, feel free to add:
>>
>> Reviewed-by: Yu Kuai <yukuai@fygo.io>
>>
>>>   
>>>   	blk_put_queue(q);
>>>   	free_percpu(blkg->iostat_cpu);
>>>   	percpu_ref_exit(&blkg->refcnt);
>>> @@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu)
>>>   	 * blkg_stat_lock is for serializing blkg stat update
>>>   	 */
>>>   	for_each_possible_cpu(cpu)
>>>   		__blkcg_rstat_flush(blkcg, cpu);
>>>   
>>> -	/* release the blkcg and parent blkg refs this blkg has been holding */
>>> -	css_put(&blkg->blkcg->css);
>>>   	blkg_free(blkg);
>>>   }
>>>   
>>>   /*
>>>    * A group is RCU protected, but having an rcu lock does not mean that one
>>> @@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>>>   	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
>>>   	if (!blkg->iostat_cpu)
>>>   		goto out_exit_refcnt;
>>>   	if (!blk_get_queue(disk->queue))
>>>   		goto out_free_iostat;
>>> +	/* blkg holds a reference to blkcg */
>>> +	if (!css_tryget_online(&blkcg->css))
>>> +		goto out_put_queue;
>>>   
>>>   	blkg->q = disk->queue;
>>>   	INIT_LIST_HEAD(&blkg->q_node);
>>>   	blkg->blkcg = blkcg;
>>>   	blkg->iostat.blkg = blkg;
>>> @@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>>>   
>>>   out_free_pds:
>>>   	while (--i >= 0)
>>>   		if (blkg->pd[i])
>>>   			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
>>> +	css_put(&blkcg->css);
>>> +out_put_queue:
>>>   	blk_put_queue(disk->queue);
>>>   out_free_iostat:
>>>   	free_percpu(blkg->iostat_cpu);
>>>   out_exit_refcnt:
>>>   	percpu_ref_exit(&blkg->refcnt);
>>> @@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>>>   	if (blk_queue_dying(disk->queue)) {
>>>   		ret = -ENODEV;
>>>   		goto err_free_blkg;
>>>   	}
>>>   
>>> -	/* blkg holds a reference to blkcg */
>>> -	if (!css_tryget_online(&blkcg->css)) {
>>> -		ret = -ENODEV;
>>> -		goto err_free_blkg;
>>> -	}
>>> -
>>>   	/* allocate */
>>>   	if (!new_blkg) {
>>>   		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
>>>   		if (unlikely(!new_blkg)) {
>>>   			ret = -ENOMEM;
>>> -			goto err_put_css;
>>> +			goto err_free_blkg;
>>>   		}
>>>   	}
>>>   	blkg = new_blkg;
>>>   
>>>   	/* link parent */
>>>   	if (blkcg_parent(blkcg)) {
>>>   		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
>>>   		if (WARN_ON_ONCE(!blkg->parent)) {
>>>   			ret = -ENODEV;
>>> -			goto err_put_css;
>>> +			goto err_free_blkg;
>>>   		}
>>>   		blkg_get(blkg->parent);
>>>   	}
>>>   
>>>   	/* invoke per-policy init */
>>> @@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>>>   
>>>   	/* @blkg failed fully initialized, use the usual release path */
>>>   	blkg_put(blkg);
>>>   	return ERR_PTR(ret);
>>>   
>>> -err_put_css:
>>> -	css_put(&blkcg->css);
>>>   err_free_blkg:
>>>   	if (new_blkg)
>>>   		blkg_free(new_blkg);
>>>   	return ERR_PTR(ret);
>>>   }
> 
> 



^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-06-16 16:44 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-15 11:55 [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue Zizhi Wo
2026-06-15 16:16 ` Yu Kuai
2026-06-16  1:23   ` Hou Tao
2026-06-16 16:44     ` Tang Yizhou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox