public inbox for cgroups@vger.kernel.org
 help / color / mirror / Atom feed
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
To: cgroups@vger.kernel.org, linux-rt-devel@lists.linux.dev
Cc: "Michal Koutný" <mkoutny@suse.com>,
	"Ben Segall" <bsegall@google.com>,
	"Clark Williams" <clrkwllms@kernel.org>,
	"Dietmar Eggemann" <dietmar.eggemann@arm.com>,
	"Ingo Molnar" <mingo@redhat.com>,
	"Johannes Weiner" <hannes@cmpxchg.org>,
	"Juri Lelli" <juri.lelli@redhat.com>,
	"Mel Gorman" <mgorman@suse.de>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Steven Rostedt" <rostedt@goodmis.org>,
	"Tejun Heo" <tj@kernel.org>,
	"Valentin Schneider" <vschneid@redhat.com>,
	"Vincent Guittot" <vincent.guittot@linaro.org>
Subject: [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up
Date: Wed, 11 Mar 2026 13:08:29 +0100	[thread overview]
Message-ID: <20260311120829.rEHY-xh9@linutronix.de> (raw)

The cgroup clean up (via cgroup_task_dead()) has been moved to
finish_task_switch() so that sched_ext can observe the task attached to
its cgroups one last time before the task is gone.

On PREEMPT_RT this clean up has been deferred to irq_work because
finish_task_switch() is invoked with disabled preemption and
cgroup_task_dead() needs to acquire sleeping locks.

Invoking cgroup_task_dead() in finish_task_switch() is too late: it
creates a small window in which the task is still observed in the cgroup
list after it died, and this confuses systemd. To close the window, tasks
which are exiting are no longer exposed to the user. Now there is no
reason to invoke cgroup_task_dead() in finish_task_switch(); it can be
delayed further, which simplifies the code.

Move cgroup_task_dead() to cgroup_task_free() which is invoked during
RCU clean up of the task_struct.

Fixes: 9311e6c29b34 ("cgroup: Fix sleeping from invalid context warning on PREEMPT_RT")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/cgroup.h |  2 --
 include/linux/sched.h  |  3 ---
 kernel/cgroup/cgroup.c | 56 ++----------------------------------------
 kernel/sched/core.c    |  6 -----
 4 files changed, 2 insertions(+), 65 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..4068035176c41 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,7 +138,6 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
 			     struct kernel_clone_args *kargs);
 void cgroup_task_exit(struct task_struct *p);
-void cgroup_task_dead(struct task_struct *p);
 void cgroup_task_release(struct task_struct *p);
 void cgroup_task_free(struct task_struct *p);
 
@@ -682,7 +681,6 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
 				    struct kernel_clone_args *kargs) {}
 static inline void cgroup_task_exit(struct task_struct *p) {}
-static inline void cgroup_task_dead(struct task_struct *p) {}
 static inline void cgroup_task_release(struct task_struct *p) {}
 static inline void cgroup_task_free(struct task_struct *p) {}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2f0..375d0c958081d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1321,9 +1321,6 @@ struct task_struct {
 	struct css_set __rcu		*cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head		cg_list;
-#ifdef CONFIG_PREEMPT_RT
-	struct llist_node		cg_dead_lnode;
-#endif	/* CONFIG_PREEMPT_RT */
 #endif	/* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7921633ea1058..684b773cd5cb4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -285,7 +285,6 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
-static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -6362,7 +6361,6 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
-	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6993,7 +6991,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-static void do_cgroup_task_dead(struct task_struct *tsk)
+static void cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7019,57 +7017,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
-#ifdef CONFIG_PREEMPT_RT
-/*
- * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
- * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
- * this lead to sleeping in the invalid context warning bug. css_set_lock is too
- * big to become a raw_spinlock. The task_dead path doesn't need to run
- * synchronously but can't be delayed indefinitely either as the dead task pins
- * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
- * irq_work to allow batching while ensuring timely completion.
- */
-static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
-static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
-
-static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
-{
-	struct llist_node *lnode;
-	struct task_struct *task, *next;
-
-	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
-	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
-		do_cgroup_task_dead(task);
-		put_task_struct(task);
-	}
-}
-
-static void __init cgroup_rt_init(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
-		per_cpu(cgrp_dead_tasks_iwork, cpu) =
-			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
-	}
-}
-
-void cgroup_task_dead(struct task_struct *task)
-{
-	get_task_struct(task);
-	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
-	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
-}
-#else	/* CONFIG_PREEMPT_RT */
-static void __init cgroup_rt_init(void) {}
-
-void cgroup_task_dead(struct task_struct *task)
-{
-	do_cgroup_task_dead(task);
-}
-#endif	/* CONFIG_PREEMPT_RT */
-
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
@@ -7084,6 +7031,7 @@ void cgroup_task_free(struct task_struct *task)
 {
 	struct css_set *cset = task_css_set(task);
 
+	cgroup_task_dead(task);
 	if (!list_empty(&task->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		css_set_skip_task_iters(task_css_set(task), task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e0..0e67f6db3204a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5181,13 +5181,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
-		/*
-		 * sched_ext_dead() must come before cgroup_task_dead() to
-		 * prevent cgroups from being removed while its member tasks are
-		 * visible to SCX schedulers.
-		 */
 		sched_ext_dead(prev);
-		cgroup_task_dead(prev);
 
 		/* Task is done with its stack. */
 		put_task_stack(prev);
-- 
2.53.0


             reply	other threads:[~2026-03-11 12:08 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-11 12:08 Sebastian Andrzej Siewior [this message]
2026-03-13 17:33 ` [PATCH] cgroup: Move cgroup_task_dead() to task_struct clean up Michal Koutný
2026-03-14  9:17   ` Sebastian Andrzej Siewior
2026-03-15 20:15     ` Tejun Heo
2026-03-16  8:06       ` Sebastian Andrzej Siewior

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260311120829.rEHY-xh9@linutronix.de \
    --to=bigeasy@linutronix.de \
    --cc=bsegall@google.com \
    --cc=cgroups@vger.kernel.org \
    --cc=clrkwllms@kernel.org \
    --cc=dietmar.eggemann@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-rt-devel@lists.linux.dev \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=mkoutny@suse.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox