Linux cgroups development
 help / color / mirror / Atom feed
* [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock
@ 2026-05-19 17:31 Thomas Falcon
  2026-05-19 17:49 ` Tejun Heo
  2026-05-22 16:08 ` Michal Koutný
  0 siblings, 2 replies; 4+ messages in thread
From: Thomas Falcon @ 2026-05-19 17:31 UTC (permalink / raw)
  To: Tejun Heo, Johannes Weiner, Michal Koutný; +Cc: cgroups, linux-kernel

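Convert rstat_ss_lock and rstat_base_lock from spinlocks to read-write
locks. The flush path in css_rstat_flush() still takes the lock for
write; only the cpu.stat read path in cgroup_base_stat_cputime_show()
takes it for read. In addition, update the cgroup_rstat tracepoints to
report whether the lock was taken for read or write.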

The benchmark below, which simulates a workload performing many
concurrent cgroup cpu.stat reads, completes in 134 seconds on an Intel
Xeon Platinum 8568Y with 144 CPUs, compared to 241 seconds with
spinlocks.

Signed-off-by: Thomas Falcon <thomas.falcon@intel.com>
---
Initially discussed here: https://lore.kernel.org/cgroups/20250617102644.752201-2-bertrand.wlodarczyk@intel.com/
Benchmark: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
Built with: musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
---
include/linux/cgroup-defs.h   |  2 +-
 include/trace/events/cgroup.h | 22 ++++++++--------
 kernel/cgroup/rstat.c         | 47 ++++++++++++++++++++++-------------
 3 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 50a784da7a81..8609d6f6b29f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -861,7 +861,7 @@ struct cgroup_subsys {
 	 */
 	unsigned int depends_on;
 
-	spinlock_t rstat_ss_lock;
+	rwlock_t rstat_ss_lock;
 	struct llist_head __percpu *lhead; /* lockless update list head */
 };
 
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index b736da06340a..1bd80a08c116 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,16 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended),
+	TP_ARGS(cgrp, cpu, type, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
 		__field(	int,		cpu			)
+		__string(	type,		type			)
 		__field(	bool,		contended		)
 	),
 
@@ -223,12 +224,13 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
 		__entry->cpu = cpu;
+		__assign_str(type);
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d cpu=%d lock-type=%s lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu, __entry->contended)
+		  __entry->cpu, __get_str(type), __entry->contended)
 );
 
 /*
@@ -238,23 +240,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
  */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, const char *type, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, cpu, type, contended)
 );
 
 #endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 150e5871e66f..e33f31914b7d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,7 +9,7 @@
 
 #include <trace/events/cgroup.h>
 
-static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_RWLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -37,7 +37,7 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
 	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
 }
 
-static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+static rwlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 {
 	if (ss)
 		return &ss->rstat_ss_lock;
@@ -356,32 +356,45 @@ __bpf_hook_end();
  * number processed last.
  */
 static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
-		int cpu_in_loop)
+				    int cpu_in_loop, bool write)
 	__acquires(ss_rstat_lock(css->ss))
 {
+	const char *type = write ? "write" : "read";
 	struct cgroup *cgrp = css->cgroup;
-	spinlock_t *lock;
+	rwlock_t *lock;
 	bool contended;
 
 	lock = ss_rstat_lock(css->ss);
-	contended = !spin_trylock_irq(lock);
+
+	local_irq_disable();
+	contended = write ? !write_trylock(lock) : !read_trylock(lock);
 	if (contended) {
-		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
-		spin_lock_irq(lock);
+		local_irq_enable();
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, type, contended);
+		if (write)
+			write_lock_irq(lock);
+		else
+			read_lock_irq(lock);
 	}
-	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, type, contended);
 }
 
 static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
-				      int cpu_in_loop)
+				      int cpu_in_loop, bool write)
 	__releases(ss_rstat_lock(css->ss))
 {
+	const char *type = write ? "write" : "read";
 	struct cgroup *cgrp = css->cgroup;
-	spinlock_t *lock;
+	rwlock_t *lock;
 
 	lock = ss_rstat_lock(css->ss);
-	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
-	spin_unlock_irq(lock);
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, type, false);
+
+	if (write)
+		write_unlock_irq(lock);
+	else
+		read_unlock_irq(lock);
 }
 
 /**
@@ -414,7 +427,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 		struct cgroup_subsys_state *pos;
 
 		/* Reacquire for each CPU to avoid disabling IRQs too long */
-		__css_rstat_lock(css, cpu);
+		__css_rstat_lock(css, cpu, true);
 		pos = css_rstat_updated_list(css, cpu);
 		for (; pos; pos = pos->rstat_flush_next) {
 			if (is_self) {
@@ -424,7 +437,7 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 			} else
 				pos->ss->css_rstat_flush(pos, cpu);
 		}
-		__css_rstat_unlock(css, cpu);
+		__css_rstat_unlock(css, cpu, true);
 		if (!cond_resched())
 			cpu_relax();
 	}
@@ -525,7 +538,7 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 			return -ENOMEM;
 	}
 
-	spin_lock_init(ss_rstat_lock(ss));
+	rwlock_init(ss_rstat_lock(ss));
 	for_each_possible_cpu(cpu)
 		init_llist_head(ss_lhead_cpu(ss, cpu));
 
@@ -717,11 +730,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 
 	if (cgroup_parent(cgrp)) {
 		css_rstat_flush(&cgrp->self);
-		__css_rstat_lock(&cgrp->self, -1);
+		__css_rstat_lock(&cgrp->self, -1, false);
 		bstat = cgrp->bstat;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &bstat.cputime.utime, &bstat.cputime.stime);
-		__css_rstat_unlock(&cgrp->self, -1);
+		__css_rstat_unlock(&cgrp->self, -1, false);
 	} else {
 		root_cgroup_cputime(&bstat);
 	}
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-22 16:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-19 17:31 [RFC PATCH] cgroup/rstat: convert rstat lock from spinlock to rwlock Thomas Falcon
2026-05-19 17:49 ` Tejun Heo
2026-05-19 18:04   ` Falcon, Thomas
2026-05-22 16:08 ` Michal Koutný

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.