* [PATCH bpf-next] selftests/bpf: scale benchmark counting by using per-CPU counters
From: Andrii Nakryiko @ 2024-03-12 22:23 UTC
To: bpf, ast, daniel, martin.lau; +Cc: andrii, kernel-team
When benchmarking with multiple threads (-pN, where N>1), we start
contending on a single atomic counter that both the BPF trigger
benchmarks and the user-space "baseline" tests (trig-base and
trig-uprobe-base benchmarks) use. As such, we start bottlenecking on
something completely irrelevant to the benchmark at hand.

Scale counting up by using per-CPU counters on the BPF side. On the
user-space side we do the next best thing: hash the thread ID to
approximate per-CPU behavior. It seems to work quite well in practice.
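For intuition, here is a minimal standalone sketch of that hashing
step (illustrative only, not part of the patch), showing how nearby
thread IDs spread across the 256 buckets:

#include <stdio.h>

int main(void)
{
	unsigned int tid, slot;

	for (tid = 1000; tid < 1008; tid++) {
		/* same hash as inc_counter() below: multiply by a
		 * golden-ratio-derived constant, keep the top 8 bits
		 * as the bucket index
		 */
		slot = (2654435769U * tid) >> 24;
		printf("tid %u -> slot %u\n", tid, slot);
	}
	return 0;
}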
To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
16, and 32 threads:
- trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
- trig-base (get_pgid() syscall, atomic counter in user-space);
- trig-fentry (syscall to trigger an fentry program, uncontended atomic
per-CPU counter on the BPF side).
Command used:
for b in uprobe-base base fentry; do \
for p in 1 2 4 8 16 32; do \
printf "%-11s %2d: %s\n" $b $p \
"$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
done; \
done
Before these changes, aggregate throughput across all threads doesn't
scale with the number of threads; for uprobe-base it actually falls
sharply due to very high contention:
uprobe-base 1: 138.998 ± 0.650M/s
uprobe-base 2: 70.526 ± 1.147M/s
uprobe-base 4: 63.114 ± 0.302M/s
uprobe-base 8: 54.177 ± 0.138M/s
uprobe-base 16: 45.439 ± 0.057M/s
uprobe-base 32: 37.163 ± 0.242M/s
base 1: 16.940 ± 0.182M/s
base 2: 19.231 ± 0.105M/s
base 4: 21.479 ± 0.038M/s
base 8: 23.030 ± 0.037M/s
base 16: 22.034 ± 0.004M/s
base 32: 18.152 ± 0.013M/s
fentry 1: 14.794 ± 0.054M/s
fentry 2: 17.341 ± 0.055M/s
fentry 4: 23.792 ± 0.024M/s
fentry 8: 21.557 ± 0.047M/s
fentry 16: 21.121 ± 0.004M/s
fentry 32: 17.067 ± 0.023M/s
After these changes, we see almost perfect linear scaling, as expected.
The sub-linear scaling when going from 8 to 16 threads is interesting
and consistent on my test machine, but I haven't investigated what is
causing this peculiar slowdown (it shows up across all benchmarks;
could be due to hyperthreading effects, not sure).
uprobe-base 1: 139.980 ± 0.648M/s
uprobe-base 2: 270.244 ± 0.379M/s
uprobe-base 4: 532.044 ± 1.519M/s
uprobe-base 8: 1004.571 ± 3.174M/s
uprobe-base 16: 1720.098 ± 0.744M/s
uprobe-base 32: 3506.659 ± 8.549M/s
base 1: 16.869 ± 0.071M/s
base 2: 33.007 ± 0.092M/s
base 4: 64.670 ± 0.203M/s
base 8: 121.969 ± 0.210M/s
base 16: 207.832 ± 0.112M/s
base 32: 424.227 ± 1.477M/s
fentry 1: 14.777 ± 0.087M/s
fentry 2: 28.575 ± 0.146M/s
fentry 4: 56.234 ± 0.176M/s
fentry 8: 106.095 ± 0.385M/s
fentry 16: 181.440 ± 0.032M/s
fentry 32: 369.131 ± 0.693M/s
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
.../selftests/bpf/benchs/bench_trigger.c | 40 ++++++++++++++++---
.../selftests/bpf/progs/trigger_bench.c | 39 ++++++++++++------
2 files changed, 62 insertions(+), 17 deletions(-)
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index ace0d1011a8e..f520bb183702 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,15 +1,45 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <unistd.h>
#include "bench.h"
#include "trigger_bench.skel.h"
#include "trace_helpers.h"
+/* adjust slot shift in inc_counter() if changing */
+#define MAX_BUCKETS 256
+
/* BPF triggering benchmarks */
static struct trigger_ctx {
struct trigger_bench *skel;
} ctx;
-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+ static __thread int tid = 0;
+ unsigned slot;
+
+ if (unlikely(tid == 0))
+ tid = gettid();
+
+ /* multiplicative hashing, it's fast */
+ slot = 2654435769U * tid;
+ slot >>= 24; /* use the highest byte as the bucket index */
+
+ atomic_inc(&counters[slot].value);
+}
+
+static __always_inline long sum_counters(struct counter *counters)
+{
+ int i;
+ long sum = 0;
+
+ for (i = 0; i < MAX_BUCKETS; i++)
+ sum += atomic_swap(&counters[i].value, 0);
+ return sum;
+}
static void trigger_validate(void)
{
@@ -23,14 +53,14 @@ static void *trigger_base_producer(void *input)
{
while (true) {
(void)syscall(__NR_getpgid);
- atomic_inc(&base_hits.value);
+ inc_counter(base_hits);
}
return NULL;
}
static void trigger_base_measure(struct bench_res *res)
{
- res->hits = atomic_swap(&base_hits.value, 0);
+ res->hits = sum_counters(base_hits);
}
static void *trigger_producer(void *input)
@@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
static void trigger_measure(struct bench_res *res)
{
- res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+ res->hits = sum_counters(ctx.skel->bss->hits);
}
static void setup_ctx(void)
@@ -164,7 +194,7 @@ static void *uprobe_base_producer(void *input)
{
while (true) {
uprobe_target_nop();
- atomic_inc(&base_hits.value);
+ inc_counter(base_hits);
}
return NULL;
}
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 5fda43901033..42ec202015ed 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -9,12 +9,27 @@
char _license[] SEC("license") = "GPL";
-long hits = 0;
+#define CPU_MASK 255
+#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */
+
+/* matches struct counter in bench.h; aligned to avoid false sharing */
+struct counter {
+ long value;
+} __attribute__((aligned(128)));
+
+struct counter hits[MAX_CPUS];
+
+static __always_inline void inc_counter(void)
+{
+ int cpu = bpf_get_smp_processor_id();
+
+ __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
+}
SEC("tp/syscalls/sys_enter_getpgid")
int bench_trigger_tp(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
@@ -22,69 +37,69 @@ SEC("raw_tp/sys_enter")
int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
{
if (id == __NR_getpgid)
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("kprobe/" SYS_PREFIX "sys_getpgid")
int bench_trigger_kprobe(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
int bench_trigger_kretprobe(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
int bench_trigger_kprobe_multi(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
int bench_trigger_kretprobe_multi(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("fentry/" SYS_PREFIX "sys_getpgid")
int bench_trigger_fentry(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("fexit/" SYS_PREFIX "sys_getpgid")
int bench_trigger_fexit(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
int bench_trigger_fentry_sleep(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
int bench_trigger_fmodret(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return -22;
}
SEC("uprobe")
int bench_trigger_uprobe(void *ctx)
{
- __sync_add_and_fetch(&hits, 1);
+ inc_counter();
return 0;
}
--
2.43.0
* Re: [PATCH bpf-next] selftests/bpf: scale benchmark counting by using per-CPU counters
From: Alexei Starovoitov @ 2024-03-15 2:33 UTC
To: Andrii Nakryiko
Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
Kernel Team
On Tue, Mar 12, 2024 at 3:23 PM Andrii Nakryiko <andrii@kernel.org> wrote:
>
>
> static void trigger_base_measure(struct bench_res *res)
> {
> - res->hits = atomic_swap(&base_hits.value, 0);
> + res->hits = sum_counters(base_hits);
> }
>
> static void *trigger_producer(void *input)
> @@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
>
> static void trigger_measure(struct bench_res *res)
> {
> - res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
> + res->hits = sum_counters(ctx.skel->bss->hits);
> }
It was zeroing counters before.
Do you need to zero them now?
* Re: [PATCH bpf-next] selftests/bpf: scale benchmark counting by using per-CPU counters
From: Andrii Nakryiko @ 2024-03-15 3:55 UTC
To: Alexei Starovoitov
Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
Martin KaFai Lau, Kernel Team
On Thu, Mar 14, 2024 at 7:33 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 12, 2024 at 3:23 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> >
> >
> > static void trigger_base_measure(struct bench_res *res)
> > {
> > - res->hits = atomic_swap(&base_hits.value, 0);
> > + res->hits = sum_counters(base_hits);
> > }
> >
> > static void *trigger_producer(void *input)
> > @@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
> >
> > static void trigger_measure(struct bench_res *res)
> > {
> > - res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
> > + res->hits = sum_counters(ctx.skel->bss->hits);
> > }
>
> It was zeroing counters before.
> Do you need to zero them now?
>
sum_counters() does zero them out; it does the same atomic_swap(0).
Or are you saying we should switch to cumulative counters and never
swap? That should work as well, but it requires a bit more care:
remembering the old cumulative sum and reporting only the difference
in `res->hits`. Probably not worth the hassle.
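For completeness, an (untested) sketch of that cumulative variant,
reusing the names from this patch, could look roughly like:

static long prev_sum;

static void trigger_measure(struct bench_res *res)
{
	long sum = 0;
	int i;

	/* read without resetting; report only the delta since the
	 * previous measurement round
	 */
	for (i = 0; i < MAX_BUCKETS; i++)
		sum += __atomic_load_n(&ctx.skel->bss->hits[i].value,
				       __ATOMIC_RELAXED);

	res->hits = sum - prev_sum;
	prev_sum = sum;
}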
* Re: [PATCH bpf-next] selftests/bpf: scale benchmark counting by using per-CPU counters
From: Alexei Starovoitov @ 2024-03-15 4:47 UTC
To: Andrii Nakryiko
Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
Martin KaFai Lau, Kernel Team
On Thu, Mar 14, 2024 at 8:56 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 7:33 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Mar 12, 2024 at 3:23 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > >
> > >
> > > static void trigger_base_measure(struct bench_res *res)
> > > {
> > > - res->hits = atomic_swap(&base_hits.value, 0);
> > > + res->hits = sum_counters(base_hits);
> > > }
> > >
> > > static void *trigger_producer(void *input)
> > > @@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
> > >
> > > static void trigger_measure(struct bench_res *res)
> > > {
> > > - res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
> > > + res->hits = sum_counters(ctx.skel->bss->hits);
> > > }
> >
> > It was zeroing counters before.
> > Do you need to zero them now?
> >
>
> sum_counters() does zero them out, it does the same atomic_swap(0).
I simply missed the swap in there.
Could you rename it to sum_and_zero_counters() or something, so it's
obvious? A helper named 'sum_counters' isn't expected to have such a
side effect.
* Re: [PATCH bpf-next] selftests/bpf: scale benchmark counting by using per-CPU counters
From: Andrii Nakryiko @ 2024-03-15 4:56 UTC
To: Alexei Starovoitov
Cc: Andrii Nakryiko, bpf, Alexei Starovoitov, Daniel Borkmann,
Martin KaFai Lau, Kernel Team
On Thu, Mar 14, 2024 at 9:47 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 14, 2024 at 8:56 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Thu, Mar 14, 2024 at 7:33 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Mar 12, 2024 at 3:23 PM Andrii Nakryiko <andrii@kernel.org> wrote:
> > > >
> > > >
> > > > static void trigger_base_measure(struct bench_res *res)
> > > > {
> > > > - res->hits = atomic_swap(&base_hits.value, 0);
> > > > + res->hits = sum_counters(base_hits);
> > > > }
> > > >
> > > > static void *trigger_producer(void *input)
> > > > @@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
> > > >
> > > > static void trigger_measure(struct bench_res *res)
> > > > {
> > > > - res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
> > > > + res->hits = sum_counters(ctx.skel->bss->hits);
> > > > }
> > >
> > > It was zeroing counters before.
> > > Do you need to zero them now?
> > >
> >
> > sum_counters() does zero them out, it does the same atomic_swap(0).
>
> I simply missed the swap in there.
> Could you rename it to sum_and_zero_counters() or
> something, so it's obvious?
> A helper that says 'sum_counters' isn't expected to have such side effects.
Yep, sure. I should have called it sum_and_reset_counters(). BTW, I
have since added another set of benchmarks to measure fentry/fexit and
kprobe/kretprobe throughput, this time minimizing the number of
syscalls. I'll bundle both together and send them as v2. The new
benchmark would definitely be bottlenecked on counting, so it depends
on these distributed counters for accurate results.