* [PATCH] cgroup/rstat: move css_rstat_lock outside cpu loop
From: Bertrand Wlodarczyk @ 2025-07-22 12:43 UTC
  To: tj, hannes, mkoutny, cgroups, linux-kernel
  Cc: shakeel.butt, inwardvessel, Bertrand Wlodarczyk

By moving the lock outside the CPU loop, we reduce the frequency
of costly lock acquisition.
This adjustment slightly improves performance in scenarios where
multiple CPUs concurrently attempt to acquire the lock for
the same css.

The cpu argument passed to __css_rstat_lock, which was utilized
by the event system, has been removed because it is no longer in use.
---
Results:
 
QEMU vm
+-------+---------+
| main  | patched |
+-------+---------+
| 9.17s | 2.36s   |
+-------+---------+
ext4 raw image with debian:
qemu-system-x86_64 -enable-kvm -cpu host -smp 102 -m 16G -kernel linux-cgroup/arch/x86/boot/bzImage -drive file=rootfs.ext4,if=virtio,format=raw -append "rootwait root=/dev/vda console=tty1 console=ttyS0 nokaslr cgroup_enable=memory cgroup_memory=1" -net nic,model=virtio -net user -nographic

Benchmark code: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
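
A minimal sketch of the access pattern such a benchmark exercises
(hypothetical; the gist above is the actual benchmark, and the cgroup
path below is an assumption): NUM_THREADS readers repeatedly re-read
the same cgroup's cpu.stat, and each read flushes rstat under the
shared lock.

	#include <fcntl.h>
	#include <pthread.h>
	#include <unistd.h>

	#ifndef NUM_THREADS
	#define NUM_THREADS 10
	#endif
	#ifndef NUM_ITER
	#define NUM_ITER 10000
	#endif

	/* Assumed path; adjust to the cgroup under test. */
	static const char *stat_path = "/sys/fs/cgroup/test/cpu.stat";

	static void *reader(void *arg)
	{
		char buf[4096];
		int i;

		(void)arg;
		for (i = 0; i < NUM_ITER; i++) {
			int fd = open(stat_path, O_RDONLY);

			if (fd < 0)
				break;
			/* Each read of cpu.stat triggers an rstat flush. */
			while (read(fd, buf, sizeof(buf)) > 0)
				;
			close(fd);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NUM_THREADS];
		int i;

		for (i = 0; i < NUM_THREADS; i++)
			pthread_create(&tid[i], NULL, reader, NULL);
		for (i = 0; i < NUM_THREADS; i++)
			pthread_join(tid[i], NULL);
		return 0;
	}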
---
 include/trace/events/cgroup.h | 22 ++++++++++------------
 kernel/cgroup/rstat.c         | 20 +++++++++-----------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index ba9229af9a34..eb674eef8b99 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,14 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended),
+	TP_ARGS(cgrp, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
-		__field(	int,		cpu			)
 		__field(	bool,		contended		)
 	),
 
@@ -222,13 +221,12 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__entry->cpu = cpu;
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu, __entry->contended)
+		  __entry->contended)
 );
 
 /*
@@ -238,23 +236,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
  */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, contended)
 );
 
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+	TP_PROTO(struct cgroup *cgrp, bool contended),
 
-	TP_ARGS(cgrp, cpu, contended)
+	TP_ARGS(cgrp, contended)
 );
 
 #endif /* _TRACE_CGROUP_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c8a48cf83878..dd312fe1896d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -326,8 +326,7 @@ __bpf_hook_end();
  * value -1 is used when obtaining the main lock else this is the CPU
  * number processed last.
  */
-static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
-		int cpu_in_loop)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css)
 	__acquires(ss_rstat_lock(css->ss))
 {
 	struct cgroup *cgrp = css->cgroup;
@@ -337,21 +336,20 @@ static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
 	lock = ss_rstat_lock(css->ss);
 	contended = !spin_trylock_irq(lock);
 	if (contended) {
-		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		trace_cgroup_rstat_lock_contended(cgrp, contended);
 		spin_lock_irq(lock);
 	}
-	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+	trace_cgroup_rstat_locked(cgrp, contended);
 }
 
-static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
-				      int cpu_in_loop)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css)
 	__releases(ss_rstat_lock(css->ss))
 {
 	struct cgroup *cgrp = css->cgroup;
 	spinlock_t *lock;
 
 	lock = ss_rstat_lock(css->ss);
-	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	trace_cgroup_rstat_unlock(cgrp, false);
 	spin_unlock_irq(lock);
 }
 
@@ -381,11 +379,11 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 		return;
 
 	might_sleep();
+	__css_rstat_lock(css);
 	for_each_possible_cpu(cpu) {
 		struct cgroup_subsys_state *pos;
 
 		/* Reacquire for each CPU to avoid disabling IRQs too long */
-		__css_rstat_lock(css, cpu);
 		pos = css_rstat_updated_list(css, cpu);
 		for (; pos; pos = pos->rstat_flush_next) {
 			if (is_self) {
@@ -395,10 +393,10 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
 			} else
 				pos->ss->css_rstat_flush(pos, cpu);
 		}
-		__css_rstat_unlock(css, cpu);
 		if (!cond_resched())
 			cpu_relax();
 	}
+	__css_rstat_unlock(css);
 }
 
 int css_rstat_init(struct cgroup_subsys_state *css)
@@ -685,11 +683,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 
 	if (cgroup_parent(cgrp)) {
 		css_rstat_flush(&cgrp->self);
-		__css_rstat_lock(&cgrp->self, -1);
+		__css_rstat_lock(&cgrp->self);
 		bstat = cgrp->bstat;
 		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 			       &bstat.cputime.utime, &bstat.cputime.stime);
-		__css_rstat_unlock(&cgrp->self, -1);
+		__css_rstat_unlock(&cgrp->self);
 	} else {
 		root_cgroup_cputime(&bstat);
 	}
-- 
2.49.0


* Re: [PATCH] cgroup/rstat: move css_rstat_lock outside cpu loop
From: Shakeel Butt @ 2025-07-22 15:23 UTC
  To: Bertrand Wlodarczyk
  Cc: tj, hannes, mkoutny, edumazet, gthelen, yosry.ahmed, cgroups,
	linux-kernel, inwardvessel

On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk wrote:
> By moving the lock outside the CPU loop, we reduce the frequency
> of costly lock acquisition.
> This adjustment slightly improves performance in scenarios where
> multiple CPUs concurrently attempt to acquire the lock for
> the same css.

Did you see commit 0efc297a3c497 ("cgroup/rstat: avoid disabling irqs
for O(num_cpu)")? It explains the reasoning for why the lock is taken
and dropped per CPU in the first place.
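
A simplified sketch of the structure that commit put in place (not a
literal excerpt): the IRQ-disabled window is bounded to one CPU's worth
of flushing, and cond_resched() runs with the lock dropped.

	for_each_possible_cpu(cpu) {
		__css_rstat_lock(css, cpu);	/* spin_lock_irq(): IRQs off */
		/* flush only this CPU's updated list for the subtree */
		__css_rstat_unlock(css, cpu);	/* IRQs re-enabled between CPUs */
		if (!cond_resched())		/* safe: no spinlock held here */
			cpu_relax();
	}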

> 
> The cpu argument passed to __css_rstat_lock, which was utilized
> by the event system, has been removed because it is no longer in use.
> ---
> Results:
>  
> QEMU vm
> +-------+---------+
> | main  | patched |
> +-------+---------+
> | 9.17s | 2.36s   |
> +-------+---------+
> ext4 raw image with debian:
> qemu-system-x86_64 -enable-kvm -cpu host -smp 102 -m 16G -kernel linux-cgroup/arch/x86/boot/bzImage -drive file=rootfs.ext4,if=virtio,format=raw -append "rootwait root=/dev/vda console=tty1 console=ttyS0 nokaslr cgroup_enable=memory cgroup_memory=1" -net nic,model=virtio -net user -nographic
> 
> Benchmark code: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
> musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
> ---
>  include/trace/events/cgroup.h | 22 ++++++++++------------
>  kernel/cgroup/rstat.c         | 20 +++++++++-----------
>  2 files changed, 19 insertions(+), 23 deletions(-)
> 
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index ba9229af9a34..eb674eef8b99 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,14 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>  
>  DECLARE_EVENT_CLASS(cgroup_rstat,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended),
> +	TP_ARGS(cgrp, contended),
>  
>  	TP_STRUCT__entry(
>  		__field(	int,		root			)
>  		__field(	int,		level			)
>  		__field(	u64,		id			)
> -		__field(	int,		cpu			)
>  		__field(	bool,		contended		)
>  	),
>  
> @@ -222,13 +221,12 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>  		__entry->root = cgrp->root->hierarchy_id;
>  		__entry->id = cgroup_id(cgrp);
>  		__entry->level = cgrp->level;
> -		__entry->cpu = cpu;
>  		__entry->contended = contended;
>  	),
>  
> -	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d lock contended:%d",
>  		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu, __entry->contended)
> +		  __entry->contended)
>  );
>  
>  /*
> @@ -238,23 +236,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   */
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  #endif /* _TRACE_CGROUP_H */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index c8a48cf83878..dd312fe1896d 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -326,8 +326,7 @@ __bpf_hook_end();
>   * value -1 is used when obtaining the main lock else this is the CPU
>   * number processed last.
>   */
> -static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
> -		int cpu_in_loop)
> +static inline void __css_rstat_lock(struct cgroup_subsys_state *css)
>  	__acquires(ss_rstat_lock(css->ss))
>  {
>  	struct cgroup *cgrp = css->cgroup;
> @@ -337,21 +336,20 @@ static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
>  	lock = ss_rstat_lock(css->ss);
>  	contended = !spin_trylock_irq(lock);
>  	if (contended) {
> -		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
> +		trace_cgroup_rstat_lock_contended(cgrp, contended);
>  		spin_lock_irq(lock);
>  	}
> -	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
> +	trace_cgroup_rstat_locked(cgrp, contended);
>  }
>  
> -static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
> -				      int cpu_in_loop)
> +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css)
>  	__releases(ss_rstat_lock(css->ss))
>  {
>  	struct cgroup *cgrp = css->cgroup;
>  	spinlock_t *lock;
>  
>  	lock = ss_rstat_lock(css->ss);
> -	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
> +	trace_cgroup_rstat_unlock(cgrp, false);
>  	spin_unlock_irq(lock);
>  }
>  
> @@ -381,11 +379,11 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  		return;
>  
>  	might_sleep();
> +	__css_rstat_lock(css);
>  	for_each_possible_cpu(cpu) {
>  		struct cgroup_subsys_state *pos;
>  
>  		/* Reacquire for each CPU to avoid disabling IRQs too long */
> -		__css_rstat_lock(css, cpu);
>  		pos = css_rstat_updated_list(css, cpu);
>  		for (; pos; pos = pos->rstat_flush_next) {
>  			if (is_self) {
> @@ -395,10 +393,10 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  			} else
>  				pos->ss->css_rstat_flush(pos, cpu);
>  		}
> -		__css_rstat_unlock(css, cpu);
>  		if (!cond_resched())

cond_resched() with the spin lock held?
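
With the hunks above applied, the loop effectively becomes (simplified
sketch, not a literal excerpt):

	__css_rstat_lock(css);		/* spin_lock_irq(): IRQs off, non-preemptible */
	for_each_possible_cpu(cpu) {
		/* flush this CPU's updated list */
		if (!cond_resched())	/* may schedule with the lock held and IRQs off */
			cpu_relax();
	}
	__css_rstat_unlock(css);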

>  			cpu_relax();
>  	}
> +	__css_rstat_unlock(css);
>  }
>  
>  int css_rstat_init(struct cgroup_subsys_state *css)
> @@ -685,11 +683,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>  
>  	if (cgroup_parent(cgrp)) {
>  		css_rstat_flush(&cgrp->self);
> -		__css_rstat_lock(&cgrp->self, -1);
> +		__css_rstat_lock(&cgrp->self);
>  		bstat = cgrp->bstat;
>  		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
>  			       &bstat.cputime.utime, &bstat.cputime.stime);
> -		__css_rstat_unlock(&cgrp->self, -1);
> +		__css_rstat_unlock(&cgrp->self);
>  	} else {
>  		root_cgroup_cputime(&bstat);
>  	}
> -- 
> 2.49.0
> 

* Re: [PATCH] cgroup/rstat: move css_rstat_lock outside cpu loop
From: Michal Koutný @ 2025-07-31 15:26 UTC
  To: Bertrand Wlodarczyk
  Cc: tj, hannes, cgroups, linux-kernel, shakeel.butt, inwardvessel

On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk <bertrand.wlodarczyk@intel.com> wrote:
> By moving the lock outside the CPU loop, we reduce the frequency
> of costly lock acquisition.

So IIUC, mere acquisition is so costly that it dwarfs the actual
holding (flushing), as there are presumably no updates collected.

As Shakeel wrote, there are reasons for not holding this lock for very
long (besides that being a general rule).

> This adjustment slightly improves performance in scenarios where
> multiple CPUs concurrently attempt to acquire the lock for
> the same css.

It means they can read cpu.stat more frequently, but will they get any
fresher or more precise data? This is not clear to me; I understand why
this benchmark improves, but does it represent some real-world
improvement?

Thanks,
Michal
