From: Tejun Heo <tj@kernel.org>
To: "Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Koutný" <mkoutny@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
Petr Malat <oss@malat.biz>, Bert Karwatzki <spasswolf@web.de>,
kernel test robot <oliver.sang@intel.com>,
Martin Pitt <martin@piware.de>,
cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
Tejun Heo <tj@kernel.org>
Subject: [PATCH 4/5] cgroup: Add per-subsys-css kill_css_finish deferral
Date: Mon, 4 May 2026 14:51:20 -1000 [thread overview]
Message-ID: <20260505005121.1230198-5-tj@kernel.org> (raw)
In-Reply-To: <20260505005121.1230198-1-tj@kernel.org>
93618edf7538 ("cgroup: Defer css percpu_ref kill on rmdir until cgroup is
depopulated") deferred kill_css_finish() at the cgroup level: rmdir waits
for the entire cgroup's populated count to drop to zero, then fires
kill_css_finish() on every subsystem css at once. Replace that with
per-subsys-css deferral. Each subsystem css now tracks its own hierarchical
populated count and independently defers its kill_css_finish() until its own
subtree drains.
The rmdir-race fix carries through unchanged in shape. The dying css's
->css_offline() still waits until no PF_EXITING task references it, and v2's
cgroup-level machinery goes away.
cgroup_apply_control_disable() has the same race shape (PF_EXITING tasks
pinning a css whose ->css_offline() is about to run) and stays synchronous
here. This patch lays the groundwork for fixing it - per-cgroup waiting
can't gate one subsys css being killed while the rest of the cgroup stays
live, but per-css can.
The subtree-wide invariant is preserved: a dying ancestor css stays populated
through nr_populated_children until every dying descendant's tasks drain, so
the walker fires the ancestor's kill_finish_work only after all descendants
have drained.
Add paired smp_mb()s in kill_css_sync() and css_update_populated() to fence
the StoreLoad on (CSS_DYING, populated counter), guaranteeing that either
the walker queues kill_finish_work or the caller fires synchronously.
cgroup_destroy_locked() was implicitly fenced by an unrelated css_set_lock
pair; cgroup_apply_control_disable() in the next patch is not.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/cgroup-defs.h | 6 +--
kernel/cgroup/cgroup.c | 83 +++++++++++++++++++------------------
2 files changed, 46 insertions(+), 43 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c4929f7bbe5a..de2cd6238c2a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -262,6 +262,9 @@ struct cgroup_subsys_state {
int nr_populated_csets;
int nr_populated_children;
+ /* deferred kill_css_finish() queued by css_update_populated() */
+ struct work_struct kill_finish_work;
+
/*
* A singly-linked list of css structures to be rstat flushed.
* This is a scratch field to be used exclusively by
@@ -615,9 +618,6 @@ struct cgroup {
/* used to wait for offlining of csses */
wait_queue_head_t offline_waitq;
- /* defers killing csses after removal until cgroup is depopulated */
- struct work_struct finish_destroy_work;
-
/* used to schedule release agent */
struct work_struct release_agent_work;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dd4ea9d83100..fa24102535d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -264,7 +264,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static void cgroup_finish_destroy(struct cgroup *cgrp);
static void kill_css_sync(struct cgroup_subsys_state *css);
static void kill_css_finish(struct cgroup_subsys_state *css);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
@@ -801,13 +800,19 @@ static void css_update_populated(struct cgroup_subsys_state *css, bool populated
break;
/*
- * Subtree just emptied below an offlined cgrp. Fire deferred
- * destroy. The transition is one-shot.
+ * Pair with smp_mb() in kill_css_sync(). Either we observe
+ * CSS_DYING and queue, or the caller observes our decrement
+ * and fires synchronously.
*/
- if (cgrp && was_populated && !css_is_online(css)) {
- cgroup_get(cgrp);
- WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
- &cgrp->finish_destroy_work));
+ smp_mb();
+
+ /*
+ * Subtree just emptied below a dying css. Fire deferred kill.
+ * The transition is one-shot for a dying css.
+ */
+ if (was_populated && css_is_dying(css)) {
+ css_get(css);
+ WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work));
}
if (cgrp) {
@@ -2064,16 +2069,6 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0;
}
-static void cgroup_finish_destroy_work_fn(struct work_struct *work)
-{
- struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
-
- cgroup_lock();
- cgroup_finish_destroy(cgrp);
- cgroup_unlock();
- cgroup_put(cgrp);
-}
-
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -2100,7 +2095,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -5695,6 +5689,22 @@ static void css_release(struct percpu_ref *ref)
queue_work(cgroup_release_wq, &css->destroy_work);
}
+/*
+ * Deferred kill_css_finish() fired from css_update_populated() once a dying
+ * css's hierarchical populated state drops to zero. Pinned by css_get() at the
+ * queue site; matched by css_put() here.
+ */
+static void kill_css_finish_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, kill_finish_work);
+
+ cgroup_lock();
+ kill_css_finish(css);
+ cgroup_unlock();
+ css_put(css);
+}
+
static void init_and_link_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
@@ -5708,6 +5718,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -6083,6 +6094,13 @@ static void kill_css_sync(struct cgroup_subsys_state *css)
css->flags |= CSS_DYING;
+ /*
+ * Pair with smp_mb() in css_update_populated(). Either our
+ * caller observes the walker's decrement and fires
+ * synchronously, or the walker observes CSS_DYING and queues.
+ */
+ smp_mb();
+
/*
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
@@ -6158,9 +6176,9 @@ static void kill_css_finish(struct cgroup_subsys_state *css)
* - This function: synchronous user-visible state teardown plus kill_css_sync()
* on each subsystem css.
*
- * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
- * each subsystem css. Fires once @cgrp's subtree is fully drained, either
- * inline here or from css_update_populated().
+ * - For each subsys css: fire kill_css_finish() synchronously if the subtree is
+ * already drained, otherwise rely on css_update_populated() to queue
+ * kill_finish_work when the last populated cset under the css empties.
*
* - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
* ->css_offline() -> release/free.
@@ -6238,29 +6256,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
- if (!cgroup_is_populated(cgrp))
- cgroup_finish_destroy(cgrp);
+ for_each_css(css, ssid, cgrp) {
+ if (!css_is_populated(css))
+ kill_css_finish(css);
+ }
return 0;
};
-/**
- * cgroup_finish_destroy - deferred half of @cgrp destruction
- * @cgrp: cgroup whose subtree just became empty
- *
- * See cgroup_destroy_locked() for the rationale.
- */
-static void cgroup_finish_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys_state *css;
- int ssid;
-
- lockdep_assert_held(&cgroup_mutex);
-
- for_each_css(css, ssid, cgrp)
- kill_css_finish(css);
-}
-
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
--
2.54.0
next prev parent reply other threads:[~2026-05-05 0:51 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-05 0:51 [PATCHSET cgroup/for-7.2] cgroup: Per-css kill_css_finish deferral Tejun Heo
2026-05-05 0:51 ` [PATCH 1/5] cgroup: Inline cgroup_has_tasks() in cgroup.h Tejun Heo
2026-05-05 0:51 ` [PATCH 2/5] cgroup: Annotate unlocked nr_populated_* accesses with READ_ONCE/WRITE_ONCE Tejun Heo
2026-05-05 0:51 ` [PATCH 3/5] cgroup: Move populated counters to cgroup_subsys_state Tejun Heo
2026-05-05 0:51 ` Tejun Heo [this message]
2026-05-05 0:51 ` [PATCH 5/5] cgroup: Defer kill_css_finish() in cgroup_apply_control_disable() Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260505005121.1230198-5-tj@kernel.org \
--to=tj@kernel.org \
--cc=bigeasy@linutronix.de \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=martin@piware.de \
--cc=mkoutny@suse.com \
--cc=oliver.sang@intel.com \
--cc=oss@malat.biz \
--cc=spasswolf@web.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox