* [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock
@ 2026-05-19 17:31 Thomas Falcon
2026-05-19 17:49 ` Tejun Heo
2026-05-22 16:08 ` Michal Koutný
0 siblings, 2 replies; 4+ messages in thread
From: Thomas Falcon @ 2026-05-19 17:31 UTC (permalink / raw)
To: Tejun Heo, Johannes Weiner, Michal Koutný; +Cc: cgroups, linux-kernel
Implement rstat_ss_lock and rstat_base_lock as read-write locks
instead of spinlocks. In addition, update tracing to reflect new
locking implementation.
The benchmark below, meant to simulate a workload performing many
concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
using spinlocks.
Signed-off-by: Thomas Falcon <thomas.falcon@intel.com>
---
Initially discussed here: https://lore.kernel.org/cgroups/20250617102644.752201-2-bertrand.wlodarczyk@intel.com/
Benchmark: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
---
include/linux/cgroup-defs.h | 2 +-
include/trace/events/cgroup.h | 22 ++++++++--------
kernel/cgroup/rstat.c | 47 ++++++++++++++++++++++-------------
3 files changed, 43 insertions(+), 28 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 50a784da7a81..8609d6f6b29f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -861,7 +861,7 @@ struct cgroup_subsys {
*/
unsigned int depends_on;
- spinlock_t rstat_ss_lock;
+ rwlock_t rstat_ss_lock;
struct llist_head __percpu *lhead; /* lockless update list head */
};
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index b736da06340a..1bd80a08c116 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,16 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
DECLARE_EVENT_CLASS(cgroup_rstat,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended),
+ TP_ARGS(cgrp, cpu, type, contended),
TP_STRUCT__entry(
__field( int, root )
__field( int, level )
__field( u64, id )
__field( int, cpu )
+ __string( type, type )
__field( bool, contended )
),
@@ -223,12 +224,13 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
__entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level;
__entry->cpu = cpu;
+ __assign_str(type);
__entry->contended = contended;
),
- TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+ TP_printk("root=%d id=%llu level=%d cpu=%d lock-type=%s lock contended:%d",
__entry->root, __entry->id, __entry->level,
- __entry->cpu, __entry->contended)
+ __entry->cpu, __get_str(type), __entry->contended)
);
/*
@@ -238,23 +240,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
*/
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
- TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+ TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
- TP_ARGS(cgrp, cpu, contended)
+ TP_ARGS(cgrp, cpu, type, contended)
);
#endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..e33f31914b7d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,7 +9,7 @@
#include <trace/events/cgroup.h>
-static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_RWLOCK(rstat_base_lock);
static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -37,7 +37,7 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}
-static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+static rwlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
if (ss)
return &ss->rstat_ss_lock;
@@ -356,32 +356,45 @@ __bpf_hook_end();
* number processed last.
*/
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+ int cpu_in_loop, bool write)
__acquires(ss_rstat_lock(css->ss))
{
+ const char *type = write ? "write" : "read";
struct cgroup *cgrp = css->cgroup;
- spinlock_t *lock;
+ rwlock_t *lock;
bool contended;
lock = ss_rstat_lock(css->ss);
- contended = !spin_trylock_irq(lock);
+
+ local_irq_disable();
+ contended = write ? !write_trylock(lock) : !read_trylock(lock);
if (contended) {
- trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
- spin_lock_irq(lock);
+ local_irq_enable();
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, type, contended);
+ if (write)
+ write_lock_irq(lock);
+ else
+ read_lock_irq(lock);
}
- trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, type, contended);
}
static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
- int cpu_in_loop)
+ int cpu_in_loop, bool write)
__releases(ss_rstat_lock(css->ss))
{
+ const char *type = write ? "write" : "read";
struct cgroup *cgrp = css->cgroup;
- spinlock_t *lock;
+ rwlock_t *lock;
lock = ss_rstat_lock(css->ss);
- trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
- spin_unlock_irq(lock);
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, type, false);
+
+ if (write)
+ write_unlock_irq(lock);
+ else
+ read_unlock_irq(lock);
}
/**
@@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
struct cgroup_subsys_state *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
- __css_rstat_lock(css, cpu);
+ __css_rstat_lock(css, cpu, true);
pos = css_rstat_updated_list(css, cpu);
for (; pos; pos = pos->rstat_flush_next) {
if (is_self) {
@@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
} else
pos->ss->css_rstat_flush(pos, cpu);
}
- __css_rstat_unlock(css, cpu);
+ __css_rstat_unlock(css, cpu, true);
if (!cond_resched())
cpu_relax();
}
@@ -525,7 +538,7 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
return -ENOMEM;
}
- spin_lock_init(ss_rstat_lock(ss));
+ rwlock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
init_llist_head(ss_lhead_cpu(ss, cpu));
@@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
if (cgroup_parent(cgrp)) {
css_rstat_flush(&cgrp->self);
- __css_rstat_lock(&cgrp->self, -1);
+ __css_rstat_lock(&cgrp->self, -1, false);
bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&bstat.cputime.utime, &bstat.cputime.stime);
- __css_rstat_unlock(&cgrp->self, -1);
+ __css_rstat_unlock(&cgrp->self, -1, false);
} else {
root_cgroup_cputime(&bstat);
}
--
2.43.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock
2026-05-19 17:31 [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock Thomas Falcon
@ 2026-05-19 17:49 ` Tejun Heo
2026-05-19 18:04 ` Falcon, Thomas
2026-05-22 16:08 ` Michal Koutný
1 sibling, 1 reply; 4+ messages in thread
From: Tejun Heo @ 2026-05-19 17:49 UTC (permalink / raw)
To: Thomas Falcon; +Cc: Johannes Weiner, Michal Koutný, cgroups, linux-kernel
On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote:
> Implement rstat_ss_lock and rstat_base_lock as read-write locks
> instead of spinlocks. In addition, update tracing to reflect new
> locking implementation.
>
> The benchmark below, meant to simulate a workload performing many
> concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
> Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
> using spinlocks.
Can you try using seqlock for readers? That'd be decouple readers a lot
better than rwlocks.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock
2026-05-19 17:49 ` Tejun Heo
@ 2026-05-19 18:04 ` Falcon, Thomas
0 siblings, 0 replies; 4+ messages in thread
From: Falcon, Thomas @ 2026-05-19 18:04 UTC (permalink / raw)
To: tj@kernel.org
Cc: mkoutny@suse.com, hannes@cmpxchg.org,
linux-kernel@vger.kernel.org, cgroups@vger.kernel.org
On Tue, 2026-05-19 at 07:49 -1000, Tejun Heo wrote:
> On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon wrote:
> > Implement rstat_ss_lock and rstat_base_lock as read-write locks
> > instead of spinlocks. In addition, update tracing to reflect new
> > locking implementation.
> >
> > The benchmark below, meant to simulate a workload performing many
> > concurrent cgroup cpu.stat reads, completes in 134 seconds on an
> > Intel
> > Xeon Platinum 8568Y with 144 cpus compared to 241 seconds when
> > using spinlocks.
>
> Can you try using seqlock for readers? That'd be decouple readers a
> lot
> better than rwlocks.
>
> Thanks.
>
Hi, thanks for the quick response. I will try that.
Thanks,
Tom
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock
2026-05-19 17:31 [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock Thomas Falcon
2026-05-19 17:49 ` Tejun Heo
@ 2026-05-22 16:08 ` Michal Koutný
1 sibling, 0 replies; 4+ messages in thread
From: Michal Koutný @ 2026-05-22 16:08 UTC (permalink / raw)
To: Thomas Falcon; +Cc: Tejun Heo, Johannes Weiner, cgroups, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 1875 bytes --]
Hi Thomas.
On Tue, May 19, 2026 at 12:31:34PM -0500, Thomas Falcon <thomas.falcon@intel.com> wrote:
> @@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> struct cgroup_subsys_state *pos;
>
> /* Reacquire for each CPU to avoid disabling IRQs too long */
> - __css_rstat_lock(css, cpu);
> + __css_rstat_lock(css, cpu, true);
> pos = css_rstat_updated_list(css, cpu);
> for (; pos; pos = pos->rstat_flush_next) {
> if (is_self) {
> @@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> } else
> pos->ss->css_rstat_flush(pos, cpu);
> }
> - __css_rstat_unlock(css, cpu);
> + __css_rstat_unlock(css, cpu, true);
> if (!cond_resched())
> cpu_relax();
> }
> @@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>
> if (cgroup_parent(cgrp)) {
> css_rstat_flush(&cgrp->self);
> - __css_rstat_lock(&cgrp->self, -1);
> + __css_rstat_lock(&cgrp->self, -1, false);
> bstat = cgrp->bstat;
> cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
> &bstat.cputime.utime, &bstat.cputime.stime);
> - __css_rstat_unlock(&cgrp->self, -1);
> + __css_rstat_unlock(&cgrp->self, -1, false);
I was wondering where these distinctions of readers vs writers stem from
and here I see that it's mainly the per-subsys vs rstat_base_lock.
Given that cputime_adjust() is here only modifying the local bstat
value, the read-like lock makes sense.
However, there's still the cgroup's flush right above which would take
the per-subsys locks in write-mode anyway.
Can you add some more explanation why this works?
More generally, I'm wondering where are the opportunities for replacing
per-subsys lock with an RW lock (or seqcount).
Thanks for looking into cpu.stat scalability,
Michal
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-05-22 16:08 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-19 17:31 [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock Thomas Falcon
2026-05-19 17:49 ` Tejun Heo
2026-05-19 18:04 ` Falcon, Thomas
2026-05-22 16:08 ` Michal Koutný
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox