* [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
@ 2026-03-11 12:08 Sebastian Andrzej Siewior
2026-03-13 17:33 ` Michal Koutný
0 siblings, 1 reply; 5+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-11 12:08 UTC (permalink / raw)
To: cgroups, linux-rt-devel
Cc: Michal Koutný, Ben Segall, Clark Williams, Dietmar Eggemann,
Ingo Molnar, Johannes Weiner, Juri Lelli, Mel Gorman,
Peter Zijlstra, Steven Rostedt, Tejun Heo, Valentin Schneider,
Vincent Guittot
The cgroup clean up (via cgroup_task_dead()) has been moved to
finish_task_switch() so that sched_ext can observe the task attached to
its cgroups one last time before the task is gone.
This clean up has been added to irq_work on PREEMPT_RT because
finish_task_switch() is invoked with disabled preemption and
cgroup_task_dead() needs to acquire sleeping locks.
Invoking cgroup_task_dead() in finish_task_switch() is too late and
creates a small window in which the task is observed in the cgroup list
after it died and this confuses systemd. To close the window, tasks
which are exiting are no longer exposed to the user. Now there is no
reason to invoke cgroup_task_dead() in finish_task_switch() and it can
be delayed further so simplify the code.
Move cgroup_task_dead() to cgroup_task_free() which is invoked during
RCU clean up of the task_struct.
Fixes: 9311e6c29b34 ("cgroup: Fix sleeping from invalid context warning on PREEMPT_RT")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/cgroup.h | 2 --
include/linux/sched.h | 3 ---
kernel/cgroup/cgroup.c | 56 ++----------------------------------------
kernel/sched/core.c | 6 -----
4 files changed, 2 insertions(+), 65 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..4068035176c41 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,7 +138,6 @@ extern void cgroup_cancel_fork(struct task_struct *p,
extern void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
void cgroup_task_exit(struct task_struct *p);
-void cgroup_task_dead(struct task_struct *p);
void cgroup_task_release(struct task_struct *p);
void cgroup_task_free(struct task_struct *p);
@@ -682,7 +681,6 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
static inline void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_task_exit(struct task_struct *p) {}
-static inline void cgroup_task_dead(struct task_struct *p) {}
static inline void cgroup_task_release(struct task_struct *p) {}
static inline void cgroup_task_free(struct task_struct *p) {}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2f0..375d0c958081d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1321,9 +1321,6 @@ struct task_struct {
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
-#ifdef CONFIG_PREEMPT_RT
- struct llist_node cg_dead_lnode;
-#endif /* CONFIG_PREEMPT_RT */
#endif /* CONFIG_CGROUPS */
#ifdef CONFIG_X86_CPU_RESCTRL
u32 closid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7921633ea1058..684b773cd5cb4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -285,7 +285,6 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_rt_init(void);
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS noinline
@@ -6362,7 +6361,6 @@ int __init cgroup_init(void)
BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
- cgroup_rt_init();
cgroup_lock();
@@ -6993,7 +6991,7 @@ void cgroup_task_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-static void do_cgroup_task_dead(struct task_struct *tsk)
+static void cgroup_task_dead(struct task_struct *tsk)
{
struct css_set *cset;
unsigned long flags;
@@ -7019,57 +7017,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
spin_unlock_irqrestore(&css_set_lock, flags);
}
-#ifdef CONFIG_PREEMPT_RT
-/*
- * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
- * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
- * this lead to sleeping in the invalid context warning bug. css_set_lock is too
- * big to become a raw_spinlock. The task_dead path doesn't need to run
- * synchronously but can't be delayed indefinitely either as the dead task pins
- * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
- * irq_work to allow batching while ensuring timely completion.
- */
-static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
-static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
-
-static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
-{
- struct llist_node *lnode;
- struct task_struct *task, *next;
-
- lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
- llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
- do_cgroup_task_dead(task);
- put_task_struct(task);
- }
-}
-
-static void __init cgroup_rt_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
- per_cpu(cgrp_dead_tasks_iwork, cpu) =
- IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
- }
-}
-
-void cgroup_task_dead(struct task_struct *task)
-{
- get_task_struct(task);
- llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
- irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
-}
-#else /* CONFIG_PREEMPT_RT */
-static void __init cgroup_rt_init(void) {}
-
-void cgroup_task_dead(struct task_struct *task)
-{
- do_cgroup_task_dead(task);
-}
-#endif /* CONFIG_PREEMPT_RT */
-
void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
@@ -7084,6 +7031,7 @@ void cgroup_task_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
+ cgroup_task_dead(task);
if (!list_empty(&task->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_skip_task_iters(task_css_set(task), task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e0..0e67f6db3204a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5181,13 +5181,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * sched_ext_dead() must come before cgroup_task_dead() to
- * prevent cgroups from being removed while its member tasks are
- * visible to SCX schedulers.
- */
sched_ext_dead(prev);
- cgroup_task_dead(prev);
/* Task is done with its stack. */
put_task_stack(prev);
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
2026-03-11 12:08 [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up Sebastian Andrzej Siewior
@ 2026-03-13 17:33 ` Michal Koutný
2026-03-14 9:17 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 5+ messages in thread
From: Michal Koutný @ 2026-03-13 17:33 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: cgroups, linux-rt-devel, Ben Segall, Clark Williams,
Dietmar Eggemann, Ingo Molnar, Johannes Weiner, Juri Lelli,
Mel Gorman, Peter Zijlstra, Steven Rostedt, Tejun Heo,
Valentin Schneider, Vincent Guittot
[-- Attachment #1: Type: text/plain, Size: 688 bytes --]
Hello.
On Wed, Mar 11, 2026 at 01:08:29PM +0100, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> @@ -7084,6 +7031,7 @@ void cgroup_task_free(struct task_struct *task)
> {
> struct css_set *cset = task_css_set(task);
>
> + cgroup_task_dead(task);
> if (!list_empty(&task->cg_list)) {
> spin_lock_irq(&css_set_lock);
> css_set_skip_task_iters(task_css_set(task), task);
Erm, isn't this way too late?
I see that cset->dying_tasks is appended in do_cgroup_task_dead() (which
I was accustomed to finding in cgroup_exit()).
(Also, whole cgroup_task_dead() becomes single use thing so it could be
open-coded in the place where it belongs.)
Thanks,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
2026-03-13 17:33 ` Michal Koutný
@ 2026-03-14 9:17 ` Sebastian Andrzej Siewior
2026-03-15 20:15 ` Tejun Heo
0 siblings, 1 reply; 5+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-14 9:17 UTC (permalink / raw)
To: Michal Koutný
Cc: cgroups, linux-rt-devel, Ben Segall, Clark Williams,
Dietmar Eggemann, Ingo Molnar, Johannes Weiner, Juri Lelli,
Mel Gorman, Peter Zijlstra, Steven Rostedt, Tejun Heo,
Valentin Schneider, Vincent Guittot
On 2026-03-13 18:33:14 [+0100], Michal Koutný wrote:
> Hello.
>
> On Wed, Mar 11, 2026 at 01:08:29PM +0100, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > @@ -7084,6 +7031,7 @@ void cgroup_task_free(struct task_struct *task)
> > {
> > struct css_set *cset = task_css_set(task);
> >
> > + cgroup_task_dead(task);
> > if (!list_empty(&task->cg_list)) {
> > spin_lock_irq(&css_set_lock);
> > css_set_skip_task_iters(task_css_set(task), task);
>
> Erm, isn't this way too late?
My understanding is that the point is that the cgroup must not be
exposed to userland once the task is about to die. This has been moved
after the final schedule() which is too late because the parent is
signaled before that and the removal happens after that.
So now we hide the tasks in the iterator once the task is PF_EXITING.
> I see that cset->dying_tasks is appended in do_cgroup_task_dead() (which
> I was used to find in cgroup_exit()).
So this could be probably removed later on if this is the only purpose.
> (Also, whole cgroup_task_dead() becomes single use thing so it could be
> open-coded in the place where it belongs.)
If this what we want, I could inline it there.
> Thanks,
> Michal
Sebastian
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
2026-03-14 9:17 ` Sebastian Andrzej Siewior
@ 2026-03-15 20:15 ` Tejun Heo
2026-03-16 8:06 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 5+ messages in thread
From: Tejun Heo @ 2026-03-15 20:15 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Michal Koutný, cgroups, linux-rt-devel, Ben Segall,
Clark Williams, Dietmar Eggemann, Ingo Molnar, Johannes Weiner,
Juri Lelli, Mel Gorman, Peter Zijlstra, Steven Rostedt,
Valentin Schneider, Vincent Guittot
Hello,
On Sat, Mar 14, 2026 at 10:17:52AM +0100, Sebastian Andrzej Siewior wrote:
> On 2026-03-13 18:33:14 [+0100], Michal Koutný wrote:
> > Hello.
> >
> > On Wed, Mar 11, 2026 at 01:08:29PM +0100, Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > > @@ -7084,6 +7031,7 @@ void cgroup_task_free(struct task_struct *task)
> > > {
> > > struct css_set *cset = task_css_set(task);
> > >
> > > + cgroup_task_dead(task);
> > > if (!list_empty(&task->cg_list)) {
> > > spin_lock_irq(&css_set_lock);
> > > css_set_skip_task_iters(task_css_set(task), task);
> >
> > Erm, isn't this way too late?
>
> My understanding is that the point is that the cgroup must not be
> exposed to userland once the task is about to die. This has been moved
> after the fine schedule() which is too late because the parent is
> signaled before that and the removal happens after that.
> So now we hide the tasks in the iterator once the task is PF_EXITING.
Yeah, I think it's too late. That's where we tell the cgroup is becoming
empty among other things. cgroup_task_free() waits for all task refcnts to
drop which can take any amount of time. You can also imagine situations
where e.g. userspace pins a specific task through pidfd unitl the cgroup it
belongs becomes empty, which would have worked before but now would
deadlock.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
2026-03-15 20:15 ` Tejun Heo
@ 2026-03-16 8:06 ` Sebastian Andrzej Siewior
0 siblings, 0 replies; 5+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-03-16 8:06 UTC (permalink / raw)
To: Tejun Heo
Cc: Michal Koutný, cgroups, linux-rt-devel, Ben Segall,
Clark Williams, Dietmar Eggemann, Ingo Molnar, Johannes Weiner,
Juri Lelli, Mel Gorman, Peter Zijlstra, Steven Rostedt,
Valentin Schneider, Vincent Guittot
On 2026-03-15 10:15:41 [-1000], Tejun Heo wrote:
> Hello,
Hi,
> Yeah, I think it's too late. That's where we tell the cgroup is becoming
> empty among other things. cgroup_task_free() waits for all task refcnts to
> drop which can take any amount of time. You can also imagine situations
> where e.g. userspace pins a specific task through pidfd until the cgroup it
> belongs becomes empty, which would have worked before but now would
> deadlock.
I see. What about moving it to delayed_put_task_struct() such as the
change on top below? It is also RCU-deferred and only acquired by bpf_task_acquire().
Is this also a concern or would that work?
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
extern void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_dead(struct task_struct *p);
void cgroup_task_release(struct task_struct *p);
void cgroup_task_free(struct task_struct *p);
@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
static inline void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_dead(struct task_struct *p) {}
static inline void cgroup_task_release(struct task_struct *p) {}
static inline void cgroup_task_free(struct task_struct *p) {}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 684b773cd5cb4..28cdb8d3210e7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6991,7 +6991,7 @@ void cgroup_task_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-static void cgroup_task_dead(struct task_struct *tsk)
+void cgroup_task_dead(struct task_struct *tsk)
{
struct css_set *cset;
unsigned long flags;
@@ -7031,7 +7031,6 @@ void cgroup_task_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
- cgroup_task_dead(task);
if (!list_empty(&task->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_skip_task_iters(task_css_set(task), task);
diff --git a/kernel/exit.c b/kernel/exit.c
index ede3117fa7d41..7a94425122238 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -228,6 +228,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
+ cgroup_task_dead(tsk);
put_task_struct(tsk);
}
> Thanks.
Sebastian
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-03-16 8:06 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-11 12:08 [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up Sebastian Andrzej Siewior
2026-03-13 17:33 ` Michal Koutný
2026-03-14 9:17 ` Sebastian Andrzej Siewior
2026-03-15 20:15 ` Tejun Heo
2026-03-16 8:06 ` Sebastian Andrzej Siewior
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox