All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: "Johannes Weiner" <hannes@cmpxchg.org>,
	"Michal Koutný" <mkoutny@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Petr Malat <oss@malat.biz>, Bert Karwatzki <spasswolf@web.de>,
	kernel test robot <oliver.sang@intel.com>,
	Martin Pitt <martin@piware.de>,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 4/5] cgroup: Add per-subsys-css kill_css_finish deferral
Date: Mon,  4 May 2026 14:51:20 -1000	[thread overview]
Message-ID: <20260505005121.1230198-5-tj@kernel.org> (raw)
In-Reply-To: <20260505005121.1230198-1-tj@kernel.org>

93618edf7538 ("cgroup: Defer css percpu_ref kill on rmdir until cgroup is
depopulated") deferred kill_css_finish() at the cgroup level: rmdir waits
for the entire cgroup's populated count to drop to zero, then fires
kill_css_finish() on every subsystem css at once. Replace that with
per-subsys-css deferral. Each subsystem css now tracks its own hierarchical
populated count and independently defers its kill_css_finish() until its own
subtree drains.

The rmdir-race fix carries through unchanged in shape. The dying css's
->css_offline() still waits until no PF_EXITING task references it, and v2's
cgroup-level machinery goes away.

cgroup_apply_control_disable() has the same race shape (PF_EXITING tasks
pinning a css whose ->css_offline() is about to run) and stays synchronous
here. This patch lays the groundwork for fixing it: per-cgroup waiting
can't gate one subsys css being killed while the rest of the cgroup stays
live, but per-css can.

Subtree-wide invariant preserved: a dying ancestor css stays populated
through nr_populated_children until the tasks of every dying descendant
drain, so the walker fires the ancestor's kill_finish_work only after all
descendants have drained.

Add paired smp_mb()s in kill_css_sync() and css_update_populated() to fence
the StoreLoad on (CSS_DYING, populated counter), guaranteeing that either
the walker queues kill_finish_work or the caller fires synchronously.
cgroup_destroy_locked() was implicitly fenced by an unrelated css_set_lock
pair; cgroup_apply_control_disable() in the next patch is not.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  6 +--
 kernel/cgroup/cgroup.c      | 83 +++++++++++++++++++------------------
 2 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c4929f7bbe5a..de2cd6238c2a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -262,6 +262,9 @@ struct cgroup_subsys_state {
 	int nr_populated_csets;
 	int nr_populated_children;
 
+	/* deferred kill_css_finish() queued by css_update_populated() */
+	struct work_struct kill_finish_work;
+
 	/*
 	 * A singly-linked list of css structures to be rstat flushed.
 	 * This is a scratch field to be used exclusively by
@@ -615,9 +618,6 @@ struct cgroup {
 	/* used to wait for offlining of csses */
 	wait_queue_head_t offline_waitq;
 
-	/* defers killing csses after removal until cgroup is depopulated */
-	struct work_struct finish_destroy_work;
-
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dd4ea9d83100..fa24102535d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -264,7 +264,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
 			       struct task_struct *task);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static void cgroup_finish_destroy(struct cgroup *cgrp);
 static void kill_css_sync(struct cgroup_subsys_state *css);
 static void kill_css_finish(struct cgroup_subsys_state *css);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
@@ -801,13 +800,19 @@ static void css_update_populated(struct cgroup_subsys_state *css, bool populated
 			break;
 
 		/*
-		 * Subtree just emptied below an offlined cgrp. Fire deferred
-		 * destroy. The transition is one-shot.
+		 * Pair with smp_mb() in kill_css_sync(). Either we observe
+		 * CSS_DYING and queue, or the caller observes our decrement
+		 * and fires synchronously.
 		 */
-		if (cgrp && was_populated && !css_is_online(css)) {
-			cgroup_get(cgrp);
-			WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
-						 &cgrp->finish_destroy_work));
+		smp_mb();
+
+		/*
+		 * Subtree just emptied below a dying css. Fire deferred kill.
+		 * The transition is one-shot for a dying css.
+		 */
+		if (was_populated && css_is_dying(css)) {
+			css_get(css);
+			WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work));
 		}
 
 		if (cgrp) {
@@ -2064,16 +2069,6 @@ static int cgroup_reconfigure(struct fs_context *fc)
 	return 0;
 }
 
-static void cgroup_finish_destroy_work_fn(struct work_struct *work)
-{
-	struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
-
-	cgroup_lock();
-	cgroup_finish_destroy(cgrp);
-	cgroup_unlock();
-	cgroup_put(cgrp);
-}
-
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
@@ -2100,7 +2095,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 #endif
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
 	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
@@ -5695,6 +5689,22 @@ static void css_release(struct percpu_ref *ref)
 	queue_work(cgroup_release_wq, &css->destroy_work);
 }
 
+/*
+ * Deferred kill_css_finish() fired from css_update_populated() once a dying
+ * css's hierarchical populated state drops to zero. Pinned by css_get() at the
+ * queue site; matched by css_put() here.
+ */
+static void kill_css_finish_work_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, kill_finish_work);
+
+	cgroup_lock();
+	kill_css_finish(css);
+	cgroup_unlock();
+	css_put(css);
+}
+
 static void init_and_link_css(struct cgroup_subsys_state *css,
 			      struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
@@ -5708,6 +5718,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	css->id = -1;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
+	INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn);
 	css->serial_nr = css_serial_nr_next++;
 	atomic_set(&css->online_cnt, 0);
 
@@ -6083,6 +6094,13 @@ static void kill_css_sync(struct cgroup_subsys_state *css)
 
 	css->flags |= CSS_DYING;
 
+	/*
+	 * Pair with smp_mb() in css_update_populated(). Either our
+	 * caller observes the walker's decrement and fires
+	 * synchronously, or the walker observes CSS_DYING and queues.
+	 */
+	smp_mb();
+
 	/*
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
@@ -6158,9 +6176,9 @@ static void kill_css_finish(struct cgroup_subsys_state *css)
  * - This function: synchronous user-visible state teardown plus kill_css_sync()
  *   on each subsystem css.
  *
- * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
- *   each subsystem css. Fires once @cgrp's subtree is fully drained, either
- *   inline here or from css_update_populated().
+ * - For each subsys css: fire kill_css_finish() synchronously if the subtree is
+ *   already drained, otherwise rely on css_update_populated() to queue
+ *   kill_finish_work when the last populated cset under the css empties.
  *
  * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
  *   ->css_offline() -> release/free.
@@ -6238,29 +6256,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
 
-	if (!cgroup_is_populated(cgrp))
-		cgroup_finish_destroy(cgrp);
+	for_each_css(css, ssid, cgrp) {
+		if (!css_is_populated(css))
+			kill_css_finish(css);
+	}
 
 	return 0;
 };
 
-/**
- * cgroup_finish_destroy - deferred half of @cgrp destruction
- * @cgrp: cgroup whose subtree just became empty
- *
- * See cgroup_destroy_locked() for the rationale.
- */
-static void cgroup_finish_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys_state *css;
-	int ssid;
-
-	lockdep_assert_held(&cgroup_mutex);
-
-	for_each_css(css, ssid, cgrp)
-		kill_css_finish(css);
-}
-
 int cgroup_rmdir(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
-- 
2.54.0


  parent reply	other threads:[~2026-05-05  0:51 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-05  0:51 [PATCHSET cgroup/for-7.2] cgroup: Per-css kill_css_finish deferral Tejun Heo
2026-05-05  0:51 ` [PATCH 1/5] cgroup: Inline cgroup_has_tasks() in cgroup.h Tejun Heo
2026-05-05  0:51 ` [PATCH 2/5] cgroup: Annotate unlocked nr_populated_* accesses with READ_ONCE/WRITE_ONCE Tejun Heo
2026-05-05  0:51 ` [PATCH 3/5] cgroup: Move populated counters to cgroup_subsys_state Tejun Heo
2026-05-05  0:51 ` Tejun Heo [this message]
2026-05-05  0:51 ` [PATCH 5/5] cgroup: Defer kill_css_finish() in cgroup_apply_control_disable() Tejun Heo
2026-05-13 21:01 ` [PATCHSET cgroup/for-7.2] cgroup: Per-css kill_css_finish deferral Tejun Heo
2026-05-15 17:28 ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260505005121.1230198-5-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=bigeasy@linutronix.de \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=martin@piware.de \
    --cc=mkoutny@suse.com \
    --cc=oliver.sang@intel.com \
    --cc=oss@malat.biz \
    --cc=spasswolf@web.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.