From: Shakeel Butt <shakeel.butt@linux.dev>
To: Tejun Heo <tj@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Alexei Starovoitov <ast@kernel.org>
Cc: "Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Hocko" <mhocko@kernel.org>,
"Roman Gushchin" <roman.gushchin@linux.dev>,
"Muchun Song" <muchun.song@linux.dev>,
"Yosry Ahmed" <yosry.ahmed@linux.dev>,
"Michal Koutný" <mkoutny@suse.com>,
"Vlastimil Babka" <vbabka@suse.cz>,
"Sebastian Andrzej Siewior" <bigeasy@linutronix.de>,
"JP Kobryn" <inwardvessel@gmail.com>,
bpf@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org,
"Meta kernel team" <kernel-team@meta.com>
Subject: [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe
Date: Mon, 28 Apr 2025 23:12:09 -0700 [thread overview]
Message-ID: <20250429061211.1295443-4-shakeel.butt@linux.dev> (raw)
In-Reply-To: <20250429061211.1295443-1-shakeel.butt@linux.dev>
To make css_rstat_updated() safe to run in nmi context, it cannot
spin on locks and instead has to trylock the per-cpu per-ss raw
spinlock. This patch implements a backlog mechanism to handle
failure to acquire the per-cpu per-ss raw spinlock.
Each subsystem provides a per-cpu lockless list on which the kernel
stores the css given to css_rstat_updated() on trylock failure. These
lockless lists serve as the backlog. On the cgroup stats flushing code
path, the kernel first processes all the per-cpu lockless backlog lists
of the given ss and then proceeds to flush the updated stat trees.
With css_rstat_updated() being nmi safe, the memcg stats can and will be
converted to be nmi safe to enable nmi safe memory charging.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
kernel/cgroup/rstat.c | 99 +++++++++++++++++++++++++++++++++----------
1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d3092b4c85d7..ac533e46afa9 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -42,6 +43,13 @@ static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}
+static struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
/*
* Helper functions for rstat per CPU locks.
*
@@ -86,6 +94,21 @@ unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
return flags;
}
+static __always_inline
+bool _css_rstat_cpu_trylock(struct cgroup_subsys_state *css, int cpu,
+ unsigned long *flags)
+{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
+ bool contended;
+
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
+ contended = !raw_spin_trylock_irqsave(cpu_lock, *flags);
+ if (contended)
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+ return !contended;
+}
+
static __always_inline
void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
unsigned long flags, const bool fast_path)
@@ -102,32 +125,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-/**
- * css_rstat_updated - keep track of updated rstat_cpu
- * @css: target cgroup subsystem state
- * @cpu: cpu on which rstat_cpu was updated
- *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
- */
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+static void css_add_to_backlog(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
-
- /*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
- *
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
- */
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
- return;
+ struct llist_head *lhead = ss_lhead_cpu(css->ss, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ llist_add_iff_not_on_list(&rstatc->lnode, lhead);
+}
+static void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -153,6 +160,51 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_backlog(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_rstat_updated(rstatc->owner, cpu);
+ }
+}
+
+/**
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @css is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ return;
+
+ if (!_css_rstat_cpu_trylock(css, cpu, &flags)) {
+ css_add_to_backlog(css, cpu);
+ return;
+ }
+
+ __css_rstat_updated(css, cpu);
_css_rstat_cpu_unlock(css, cpu, flags, true);
}
@@ -255,6 +307,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_backlog(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
goto unlock_ret;
--
2.47.1
next prev parent reply other threads:[~2025-04-29 6:12 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-29 6:12 [RFC PATCH 0/3] cgroup: nmi safe css_rstat_updated Shakeel Butt
2025-04-29 6:12 ` [RFC PATCH 1/3] llist: add list_add_iff_not_on_list() Shakeel Butt
2025-04-30 12:44 ` [RFC PATCH 1/3] llist: add list_add_iff_not_on_list()g Yosry Ahmed
2025-04-29 6:12 ` [RFC PATCH 2/3] cgroup: support to enable nmi-safe css_rstat_updated Shakeel Butt
2025-04-29 6:12 ` Shakeel Butt [this message]
2025-04-30 13:14 ` [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe Yosry Ahmed
2025-05-01 22:10 ` Shakeel Butt
2025-05-06 9:41 ` Yosry Ahmed
2025-05-06 19:30 ` Shakeel Butt
2025-05-07 6:52 ` Yosry Ahmed
2025-04-29 6:12 ` [OFFLIST PATCH 1/2] cgroup: use separate rstat trees for each subsystem Shakeel Butt
2025-04-29 6:12 ` [OFFLIST PATCH 2/2] cgroup: use subsystem-specific rstat locks to avoid contention Shakeel Butt
2025-04-29 6:15 ` Shakeel Butt
2025-05-21 22:23 ` Klara Modin
2025-05-21 22:29 ` Tejun Heo
2025-05-21 23:23 ` Shakeel Butt
2025-05-21 23:33 ` Shakeel Butt
2025-05-21 23:47 ` JP Kobryn
2025-05-21 23:50 ` Shakeel Butt
2025-05-21 23:52 ` JP Kobryn
2025-05-21 23:47 ` Shakeel Butt
2025-04-29 6:15 ` [OFFLIST PATCH 1/2] cgroup: use separate rstat trees for each subsystem Shakeel Butt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250429061211.1295443-4-shakeel.butt@linux.dev \
--to=shakeel.butt@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=ast@kernel.org \
--cc=bigeasy@linutronix.de \
--cc=bpf@vger.kernel.org \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=inwardvessel@gmail.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=roman.gushchin@linux.dev \
--cc=tj@kernel.org \
--cc=vbabka@suse.cz \
--cc=yosry.ahmed@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.