Linux cgroups development
 help / color / mirror / Atom feed
* [PATCH V3] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
@ 2026-06-16  1:17 Zizhi Wo
  2026-06-16 16:50 ` Tang Yizhou
  0 siblings, 1 reply; 2+ messages in thread
From: Zizhi Wo @ 2026-06-16  1:17 UTC (permalink / raw)
  To: axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, houtao1, yukuai, wozizhi

From: Zizhi Wo <wozizhi@huawei.com>

[BUG]
Our fuzz testing triggered a blkcg use-after-free issue:

  BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
  Call Trace:
  ...
  blkcg_deactivate_policy+0x244/0x4d0
  ioc_rqos_exit+0x44/0xe0
  rq_qos_exit+0xba/0x120
  __del_gendisk+0x50b/0x800
  del_gendisk+0xff/0x190
  ...

[CAUSE]
process1						process2
cgroup_rmdir
...
  css_killed_work_fn
    offline_css
    ...
      blkcg_destroy_blkgs
      ...
        __blkg_release
	  css_put(&blkg->blkcg->css)
          blkg_free
	    INIT_WORK(xxx, blkg_free_workfn)
	    schedule_work
    css_put
    ...
      blkcg_css_free
        kfree(blkcg)--------blkcg has been freed!!!
====================================schedule_work
              blkg_free_workfn
							__del_gendisk
							  rq_qos_exit
							    ioc_rqos_exit
							      blkcg_deactivate_policy
							        mutex_lock(&q->blkcg_mutex)
								spin_lock_irq(&q->queue_lock)
							        list_for_each_entry(blkg, xxx)
								  blkcg = blkg->blkcg
								  spin_lock(&blkcg->lock)-------UAF!!!
	        mutex_lock(&q->blkcg_mutex)
	        spin_lock_irq(&q->queue_lock)
	        /* Only then is the blkg removed from the list */
	        list_del_init(&blkg->q_node)

As a result, a blkg can still be reachable through q->blkg_list while
its ->blkcg has already been freed.

[Fix]
Fix this by deferring the blkcg css_put() until after the blkg has been
unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
blkcg outlives every blkg still reachable through q->blkg_list, so any
iterator holding q->queue_lock is guaranteed to observe a valid
blkg->blkcg.

While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
so that the css reference is owned by the alloc/free pair rather than
straddling layers:
blkg_alloc()  <-> blkg_free()
blkg_create() <-> blkg_destroy()

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Suggested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
Reviewed-by: Yu Kuai <yukuai@fygo.io>
---
v3:
 - move css_put() after mutex_unlock() in blkg_free_workfn().

v2:
 - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
   css reference follows the blkg's own lifetime, making the put in
   blkg_free_workfn() symmetric with the get in blkg_alloc().

v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/
 block/blk-cgroup.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc63bd220865..3ac41f766caf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -136,6 +136,11 @@ static void blkg_free_workfn(struct work_struct *work)
 	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
+	/*
+	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
+	 * so concurrent iterators won't see a blkg with a freed blkcg.
+	 */
+	css_put(&blkg->blkcg->css);
 	blk_put_queue(q);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
@@ -179,8 +184,6 @@ static void __blkg_release(struct rcu_head *rcu)
 	for_each_possible_cpu(cpu)
 		__blkcg_rstat_flush(blkcg, cpu);
 
-	/* release the blkcg and parent blkg refs this blkg has been holding */
-	css_put(&blkg->blkcg->css);
 	blkg_free(blkg);
 }
 
@@ -313,6 +316,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 		goto out_exit_refcnt;
 	if (!blk_get_queue(disk->queue))
 		goto out_free_iostat;
+	/* blkg holds a reference to blkcg */
+	if (!css_tryget_online(&blkcg->css))
+		goto out_put_queue;
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
@@ -353,6 +359,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	while (--i >= 0)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+	css_put(&blkcg->css);
+out_put_queue:
 	blk_put_queue(disk->queue);
 out_free_iostat:
 	free_percpu(blkg->iostat_cpu);
@@ -381,18 +389,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 		goto err_free_blkg;
 	}
 
-	/* blkg holds a reference to blkcg */
-	if (!css_tryget_online(&blkcg->css)) {
-		ret = -ENODEV;
-		goto err_free_blkg;
-	}
-
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 	}
 	blkg = new_blkg;
@@ -402,7 +404,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -ENODEV;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 		blkg_get(blkg->parent);
 	}
@@ -442,8 +444,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
-err_put_css:
-	css_put(&blkcg->css);
 err_free_blkg:
 	if (new_blkg)
 		blkg_free(new_blkg);
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH V3] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
  2026-06-16  1:17 [PATCH V3] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue Zizhi Wo
@ 2026-06-16 16:50 ` Tang Yizhou
  0 siblings, 0 replies; 2+ messages in thread
From: Tang Yizhou @ 2026-06-16 16:50 UTC (permalink / raw)
  To: Zizhi Wo, axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, houtao1, yukuai

On 16/6/26 9:17 am, Zizhi Wo wrote:
> From: Zizhi Wo <wozizhi@huawei.com>
> 
> [BUG]
> Our fuzz testing triggered a blkcg use-after-free issue:
> 
>   BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
>   Call Trace:
>   ...
>   blkcg_deactivate_policy+0x244/0x4d0
>   ioc_rqos_exit+0x44/0xe0
>   rq_qos_exit+0xba/0x120
>   __del_gendisk+0x50b/0x800
>   del_gendisk+0xff/0x190
>   ...
> 
> [CAUSE]
> process1						process2
> cgroup_rmdir
> ...
>   css_killed_work_fn
>     offline_css
>     ...
>       blkcg_destroy_blkgs
>       ...
>         __blkg_release
> 	  css_put(&blkg->blkcg->css)
>           blkg_free
> 	    INIT_WORK(xxx, blkg_free_workfn)
> 	    schedule_work
>     css_put
>     ...
>       blkcg_css_free
>         kfree(blkcg)--------blkcg has been freed!!!
> ====================================schedule_work
>               blkg_free_workfn
> 							__del_gendisk
> 							  rq_qos_exit
> 							    ioc_rqos_exit
> 							      blkcg_deactivate_policy
> 							        mutex_lock(&q->blkcg_mutex)
> 								spin_lock_irq(&q->queue_lock)
> 							        list_for_each_entry(blkg, xxx)
> 								  blkcg = blkg->blkcg
> 								  spin_lock(&blkcg->lock)-------UAF!!!
> 	        mutex_lock(&q->blkcg_mutex)
> 	        spin_lock_irq(&q->queue_lock)
> 	        /* Only then is the blkg removed from the list */
> 	        list_del_init(&blkg->q_node)
> 
> As a result, a blkg can still be reachable through q->blkg_list while
> its ->blkcg has already been freed.
> 
> [Fix]
> Fix this by deferring the blkcg css_put() until after the blkg has been
> unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
> blkcg outlives every blkg still reachable through q->blkg_list, so any
> iterator holding q->queue_lock is guaranteed to observe a valid
> blkg->blkcg.
> 
> While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
> so that the css reference is owned by the alloc/free pair rather than
> straddling layers:
> blkg_alloc()  <-> blkg_free()
> blkg_create() <-> blkg_destroy()
> 
> Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
> Suggested-by: Hou Tao <houtao1@huawei.com>
> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
> Reviewed-by: Yu Kuai <yukuai@fygo.io>
> ---
> v3:
>  - move css_put() after mutex_unlock() in blkg_free_workfn().
> 
> v2:
>  - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
>    css reference follows the blkg's own lifetime, making the put in
>    blkg_free_workfn() symmetric with the get in blkg_alloc().
> 
> v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/
>  block/blk-cgroup.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index bc63bd220865..3ac41f766caf 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -136,6 +136,11 @@ static void blkg_free_workfn(struct work_struct *work)
>  	spin_unlock_irq(&q->queue_lock);
>  	mutex_unlock(&q->blkcg_mutex);
>  
> +	/*
> +	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
> +	 * so concurrent iterators won't see a blkg with a freed blkcg.
> +	 */
> +	css_put(&blkg->blkcg->css);
>  	blk_put_queue(q);
>  	free_percpu(blkg->iostat_cpu);
>  	percpu_ref_exit(&blkg->refcnt);
> @@ -179,8 +184,6 @@ static void __blkg_release(struct rcu_head *rcu)
>  	for_each_possible_cpu(cpu)
>  		__blkcg_rstat_flush(blkcg, cpu);
>  
> -	/* release the blkcg and parent blkg refs this blkg has been holding */
> -	css_put(&blkg->blkcg->css);
>  	blkg_free(blkg);
>  }
>  
> @@ -313,6 +316,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>  		goto out_exit_refcnt;
>  	if (!blk_get_queue(disk->queue))
>  		goto out_free_iostat;
> +	/* blkg holds a reference to blkcg */
> +	if (!css_tryget_online(&blkcg->css))
> +		goto out_put_queue;
>  
>  	blkg->q = disk->queue;
>  	INIT_LIST_HEAD(&blkg->q_node);
> @@ -353,6 +359,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
>  	while (--i >= 0)
>  		if (blkg->pd[i])
>  			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
> +	css_put(&blkcg->css);
> +out_put_queue:
>  	blk_put_queue(disk->queue);
>  out_free_iostat:
>  	free_percpu(blkg->iostat_cpu);
> @@ -381,18 +389,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>  		goto err_free_blkg;
>  	}
>  
> -	/* blkg holds a reference to blkcg */
> -	if (!css_tryget_online(&blkcg->css)) {
> -		ret = -ENODEV;
> -		goto err_free_blkg;
> -	}
> -
>  	/* allocate */
>  	if (!new_blkg) {
>  		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
>  		if (unlikely(!new_blkg)) {
>  			ret = -ENOMEM;
> -			goto err_put_css;
> +			goto err_free_blkg;
>  		}
>  	}
>  	blkg = new_blkg;
> @@ -402,7 +404,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>  		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
>  		if (WARN_ON_ONCE(!blkg->parent)) {
>  			ret = -ENODEV;
> -			goto err_put_css;
> +			goto err_free_blkg;
>  		}
>  		blkg_get(blkg->parent);
>  	}
> @@ -442,8 +444,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>  	blkg_put(blkg);
>  	return ERR_PTR(ret);
>  
> -err_put_css:
> -	css_put(&blkcg->css);
>  err_free_blkg:
>  	if (new_blkg)
>  		blkg_free(new_blkg);

LGTM.

Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>

-- 
Best Regards,
Yi


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-06-16 16:50 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-16  1:17 [PATCH V3] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue Zizhi Wo
2026-06-16 16:50 ` Tang Yizhou

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.