[PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance

The Linux Kernel Mailing List
 help / color / mirror / Atom feed

* [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance
@ 2026-06-30  2:23 Tiezhu Yang
  2026-06-30  3:11 ` bot+bpf-ci
  2026-06-30 18:10 ` Alexei Starovoitov
  0 siblings, 2 replies; 3+ messages in thread
From: Tiezhu Yang @ 2026-06-30  2:23 UTC (permalink / raw)
  To: Andrii Nakryiko, Eduard Zingerman; +Cc: loongarch, bpf, linux-kernel

Add a dedicated benchmark to measure the runtime performance and overhead
of tail calls. This helps developers detect performance regressions across
different kernel versions and optimization phases.

The benchmark sets up a standard tracepoint to intercept syscalls triggered
by a dedicated producer thread running a dead loop. The execution path is
strictly bounded by the tail call depth limit, safely preventing any core
lockup or infinite recursion risks.

To eliminate cacheline bouncing and global locking variance, Per-CPU array
maps are utilized to track execution hits across multiple cores.

To evaluate the JIT compiler architecture under complex control flows, it
interleaves direct tail calls with bpf2bpf tail calls.

This forces the tracking context at the target program's entry prologue to
toggle dynamically between a scalar count (0 to 33) and a massive kernel
pointer address, providing a robust micro-architectural stress test which
consists of:

1. tailcall_bench_main: The entry program filtering processes by PID, and
   introducing a high-frequency alternating execution path via the syscall
   arguments to switch between a direct tail call and a bpf2bpf tail call.
2. tailcall_bench_target: The final target destination hop which safely
   terminates the mixed execution flow and increments the step counter.

All functions utilize explicit "struct tracepoint_raw_syscalls_sys_enter"
context types to ensure strict type alignment and clear pointer provenance
for the BPF verifier.

Additionally, provide a test script run_bench_tailcall.sh to automate the
execution under strict core affinity and isolation for reliable profiling,
formatting the captured metrics directly into the performance report.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
---
 tools/testing/selftests/bpf/Makefile          |  2 +
 tools/testing/selftests/bpf/bench.c           |  2 +
 .../selftests/bpf/benchs/bench_tailcall.c     | 90 +++++++++++++++++++
 .../bpf/benchs/run_bench_tailcall.sh          | 18 ++++
 .../selftests/bpf/progs/tailcall_bench.c      | 89 ++++++++++++++++++
 5 files changed, 201 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_tailcall.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
 create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bench.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b642ee489ea6..584504bc87a6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -980,6 +980,7 @@ $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_tr
 $(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h
 $(OUTPUT)/bench_xdp_lb.o: $(OUTPUT)/xdp_lb_bench.skel.h bench_bpf_timing.h
 $(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h
+$(OUTPUT)/bench_tailcall.o: $(OUTPUT)/tailcall_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -1005,6 +1006,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_timing.o \
 		 $(OUTPUT)/bench_bpf_nop.o \
 		 $(OUTPUT)/bench_xdp_lb.o \
+		 $(OUTPUT)/bench_tailcall.o \
 		 $(OUTPUT)/usdt_1.o \
 		 $(OUTPUT)/usdt_2.o \
 		 #
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 3d9d2cd7764b..a79b86316d28 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -582,6 +582,7 @@ extern const struct bench bench_lpm_trie_delete;
 extern const struct bench bench_lpm_trie_free;
 extern const struct bench bench_bpf_nop;
 extern const struct bench bench_xdp_lb;
+extern const struct bench bench_tailcall;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -665,6 +666,7 @@ static const struct bench *benchs[] = {
 	&bench_lpm_trie_free,
 	&bench_bpf_nop,
 	&bench_xdp_lb,
+	&bench_tailcall,
 };
 
 static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_tailcall.c b/tools/testing/selftests/bpf/benchs/bench_tailcall.c
new file mode 100644
index 000000000000..a203017f5e28
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_tailcall.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <bpf/bpf.h>
+#include "bench.h"
+#include "tailcall_bench.skel.h"
+
+static struct ctx {
+	struct tailcall_bench *skel;	/* skeleton opened by tailcall_setup() */
+	struct bpf_link *link;		/* attachment of tailcall_bench_main */
+	int map_fd;			/* fd of pcpu_hits_map */
+	int ncpus;			/* libbpf_num_possible_cpus() result */
+	unsigned int percpu_size;	/* value size of pcpu_hits_map */
+} ctx;
+
+/* Publish the hits gained since the last call, summed across all CPUs. */
+static void tailcall_measure(struct bench_res *res)
+{
+	static __u64 prev_total;	/* grand total seen by the previous call */
+	__u8 *values = calloc(ctx.ncpus, ctx.percpu_size);
+	__u64 total = 0;
+	__u32 key = 0;
+
+	if (!values)
+		return;
+	/* On lookup failure keep @res untouched, but still free the buffer. */
+	if (!bpf_map_lookup_elem(ctx.map_fd, &key, values)) {
+		for (int i = 0; i < ctx.ncpus; i++)
+			total += *(__u64 *)(values + i * ctx.percpu_size);
+		res->hits = total - prev_total;
+		prev_total = total;
+	}
+	free(values);
+}
+
+/* Producer thread: issue getpgid() forever, alternating its argument 0/1. */
+static void *tailcall_producer(void *input)
+{
+	unsigned long iter;
+
+	/* The low bit of the argument is what the BPF entry program tests. */
+	for (iter = 0; ; iter++)
+		syscall(__NR_getpgid, iter & 1);
+
+	/* Not reached; present only to satisfy the thread signature. */
+	return NULL;
+}
+
+/* Load the skeleton, wire jmp_table[1] to the target and attach the entry. */
+static void tailcall_setup(void)
+{
+	int target_fd, jmp_map_fd;
+	__u32 key1 = 1;
+
+	ctx.skel = tailcall_bench__open();
+	if (!ctx.skel)
+		exit(1);
+
+	/* Written before load so the BPF side only counts this process. */
+	ctx.skel->data->my_pid = getpid();
+	ctx.ncpus = libbpf_num_possible_cpus();
+	if (ctx.ncpus <= 0 || tailcall_bench__load(ctx.skel))
+		exit(1);
+
+	jmp_map_fd = bpf_map__fd(ctx.skel->maps.jmp_table);
+	ctx.map_fd = bpf_map__fd(ctx.skel->maps.pcpu_hits_map);
+	ctx.percpu_size = bpf_map__value_size(ctx.skel->maps.pcpu_hits_map);
+	target_fd = bpf_program__fd(ctx.skel->progs.tailcall_bench_target);
+
+	if (ctx.map_fd < 0 || jmp_map_fd < 0 || target_fd < 0)
+		exit(1);
+
+	/* An empty slot would make every bpf_tail_call() fall through. */
+	if (bpf_map_update_elem(jmp_map_fd, &key1, &target_fd, BPF_ANY))
+		exit(1);
+
+	ctx.link = bpf_program__attach(ctx.skel->progs.tailcall_bench_main);
+	if (!ctx.link)
+		exit(1);
+}
+
+const struct bench bench_tailcall = {
+	.name = "tailcall",			/* name passed to the bench binary */
+	.setup = tailcall_setup,
+	.producer_thread = tailcall_producer,	/* generates the traced syscalls */
+	.measure = tailcall_measure,		/* reads pcpu_hits_map */
+	.report_progress = ops_report_progress,
+	.report_final = ops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
new file mode 100755
index 000000000000..c687f34455e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# summarize_ops is expected to come from this helper; the path is cwd-relative.
+source ./benchs/run_common.sh
+
+# Abort on errors, unset variables and failed pipeline stages; disable globbing.
+set -eufo pipefail
+
+# Honour $BENCH when the caller exports it, otherwise fall back to ./bench.
+BENCH_BIN=${BENCH:-./bench}
+
+# CPUs 0 and 2 and NUMA node 0 are hard-coded; a negative nice needs privilege.
+RUN_BENCH="numactl --physcpubind=0,2 --membind=0 nice -n -20 $BENCH_BIN -w5 -d20 -a"
+
+# Both summarize_ops arguments (label, raw bench output) are always passed,
+# so the helper never expands an unset positional parameter under "set -u".
+summarize_ops "tailcall" "$($RUN_BENCH tailcall)"
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bench.c b/tools/testing/selftests/bpf/progs/tailcall_bench.c
new file mode 100644
index 000000000000..68a50c7b1d06
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bench.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/*
+ * View of the raw_syscalls:sys_enter context used here; only args[0] is read.
+ * NOTE(review): layout presumably mirrors the tracepoint format - confirm.
+ */
+struct tracepoint_raw_syscalls_sys_enter {
+	unsigned long long unused;
+	long id;
+	unsigned long args[6];
+};
+
+__u32 my_pid SEC(".data") = 0;	/* tgid whose syscalls are counted */
+
+/* One per-CPU __u64 slot (key 0) counting executions of the two programs */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, __u64);
+} pcpu_hits_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 2);		/* this file only ever calls index 1 */
+	__type(key, __u32);
+	__type(value, __u32);
+} jmp_table SEC(".maps");
+
+/* Add one to slot 0 of pcpu_hits_map as seen by the running program. */
+static __always_inline void increment_pcpu_counter(void)
+{
+	__u32 slot = 0;
+	__u64 *hits = bpf_map_lookup_elem(&pcpu_hits_map, &slot);
+
+	/* The helper may return NULL; there is nothing to count then. */
+	if (!hits)
+		return;
+	*hits += 1;
+}
+
+/* Tail-call destination: counts the hit and returns without chaining on */
+SEC("tracepoint/raw_syscalls/sys_enter")
+int tailcall_bench_target(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+	increment_pcpu_counter();
+	return 0;
+}
+
+/* Kept out of line so that this tail call is issued from a bpf2bpf callee */
+static __noinline int bpf2bpf_tailcall(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+	bpf_tail_call(ctx, &jmp_table, 1);
+	return 0;
+}
+
+/* Entry program: count syscalls made by my_pid, then tail-call the target */
+SEC("tracepoint/raw_syscalls/sys_enter")
+int tailcall_bench_main(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+	__u32 current_pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (current_pid != my_pid)
+		return 0;
+
+	increment_pcpu_counter();
+
+	/*
+	 * The low bit of the syscall's first argument selects the path.
+	 * NOTE(review): the patch author states that alternating the two
+	 * paths makes the tail call counter seen by the target's prologue
+	 * switch between a scalar and an inherited pointer; that depends on
+	 * the JIT and is not visible in this file - confirm per arch.
+	 */
+	if (ctx->args[0] & 1) {
+		/* Path A: tail call issued directly from the entry program */
+		bpf_tail_call(ctx, &jmp_table, 1);
+	} else {
+		/* Path B: tail call issued from inside a bpf2bpf subprogram */
+		bpf2bpf_tailcall(ctx);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance
  2026-06-30  2:23 [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance Tiezhu Yang
@ 2026-06-30  3:11 ` bot+bpf-ci
  2026-06-30 18:10 ` Alexei Starovoitov
  1 sibling, 0 replies; 3+ messages in thread
From: bot+bpf-ci @ 2026-06-30  3:11 UTC (permalink / raw)
  To: yangtiezhu, andrii, eddyz87
  Cc: loongarch, bpf, linux-kernel, ast, andrii, daniel, martin.lau,
	eddyz87, yonghong.song, clm, ihor.solodrai

[-- Attachment #1: Type: text/plain, Size: 5154 bytes --]

> diff --git a/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
> new file mode 100755
> index 000000000000..c687f34455e8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
> @@ -0,0 +1,18 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# 1. Load the official common benchmark utilities
> +source ./benchs/run_common.sh
> +
> +# 2. Strict error handling configurations
> +set -eufo pipefail
> +
> +# 3. Use default bench binary path if not exported by the framework
> +BENCH_BIN=${BENCH:-./bench}
> +
> +# 4. Run with strict core affinity and isolation for reliable profiling
> +RUN_BENCH="numactl --physcpubind=0,2 --membind=0 nice -n -20 $BENCH_BIN -w5 -d20 -a"
> +
> +# 5. Capture the output string and pass it straight into summarize_ops
> +# This satisfies the framework's internal parameter bounds without triggering set -u.
> +summarize_ops "tailcall" "$($RUN_BENCH tailcall)"

This isn't a bug, but could the numbered step comments (# 1. .. # 5.) be
trimmed?  Several of them restate the line right below them, for example:

	# 1. Load the official common benchmark utilities
	source ./benchs/run_common.sh

The neighbouring run_bench_*.sh scripts here tend to be a couple of lines
with little commentary, so the running narration stands out.  Keeping only
the comments that explain a non-obvious choice (such as the cpu-affinity
flags) might fit the directory better.

> diff --git a/tools/testing/selftests/bpf/progs/tailcall_bench.c b/tools/testing/selftests/bpf/progs/tailcall_bench.c
> new file mode 100644
> index 000000000000..68a50c7b1d06
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/tailcall_bench.c

[ ... ]

> +/* High-performance Per-CPU Array Map to eliminate global lock variance */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> +	__uint(max_entries, 1);
> +	__type(key, __u32);
> +	__type(value, __u64);
> +} pcpu_hits_map SEC(".maps");

[ ... ]

> +static __always_inline void increment_pcpu_counter(void)
> +{
> +	__u32 key = 0;
> +	__u64 *val;
> +
> +	val = bpf_map_lookup_elem(&pcpu_hits_map, &key);
> +	if (val) {
> +		/* Lockless and stable per-CPU increment without cacheline bounce */
> +		(*val)++;
> +	}
> +}

This isn't a bug, but do the "high-performance / lockless / no cacheline
bounce" comments add anything beyond what the per-CPU map type already
conveys?  The increment comment in particular sits directly above a plain
(*val)++;.  Could these be dropped?

The commit message has a similar question.  Quoting it in full:

> Add a dedicated benchmark to measure the runtime performance and overhead
> of tail calls. This helps developers detect performance regressions across
> different kernel versions and optimization phases.
>
> The benchmark sets up a standard tracepoint to intercept syscalls triggered
> by a dedicated producer thread running a dead loop. The execution path is
> strictly bounded by the tail call depth limit, safely preventing any core
> lockup or infinite recursion risks.
>
> To eliminate cacheline bouncing and global locking variance, Per-CPU array
> maps are utilized to track execution hits across multiple cores.
>
> To evaluate the JIT compiler architecture under complex control flows, it
> interleaves direct tail calls with bpf2bpf tail calls.
>
> This forces the tracking context at the target program's entry prologue to
> toggle dynamically between a scalar count (0 to 33) and a massive kernel
> pointer address, providing a robust micro-architectural stress test which
> consists of:
>
> 1. tailcall_bench_main: The entry program filtering processes by PID, and
>    introducing a high-frequency alternating execution path via the syscall
>    arguments to switch between a direct tail call and a bpf2bpf tail call.
> 2. tailcall_bench_target: The final target destination hop which safely
>    terminates the mixed execution flow and increments the step counter.
>
> All functions utilize explicit "struct tracepoint_raw_syscalls_sys_enter"
> context types to ensure strict type alignment and clear pointer provenance
> for the BPF verifier.
>
> Additionally, provide a test script run_bench_tailcall.sh to automate the
> execution under strict core affinity and isolation for reliable profiling,
> formatting the captured metrics directly into the performance report.
>
> Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>

This isn't a bug, but could the changelog focus more on why a dedicated
tail-call throughput benchmark is needed over the existing infrastructure?

The text mostly walks through what each program does, and uses phrasing like
"robust micro-architectural stress test", "massive kernel pointer address",
and "strict type alignment and clear pointer provenance".  A reviewer comes
away knowing what the two programs are, but less about the load-bearing
design rationale.


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28416687457

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance
  2026-06-30  2:23 [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance Tiezhu Yang
  2026-06-30  3:11 ` bot+bpf-ci
@ 2026-06-30 18:10 ` Alexei Starovoitov
  1 sibling, 0 replies; 3+ messages in thread
From: Alexei Starovoitov @ 2026-06-30 18:10 UTC (permalink / raw)
  To: Tiezhu Yang, Andrii Nakryiko, Eduard Zingerman
  Cc: loongarch, bpf, linux-kernel

On Mon Jun 29, 2026 at 7:23 PM PDT, Tiezhu Yang wrote:
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
> @@ -0,0 +1,18 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +
> +# 1. Load the official common benchmark utilities
> +source ./benchs/run_common.sh
> +
> +# 2. Strict error handling configurations
> +set -eufo pipefail
> +
> +# 3. Use default bench binary path if not exported by the framework
> +BENCH_BIN=${BENCH:-./bench}
> +
> +# 4. Run with strict core affinity and isolation for reliable profiling
> +RUN_BENCH="numactl --physcpubind=0,2 --membind=0 nice -n -20 $BENCH_BIN -w5 -d20 -a"
> +
> +# 5. Capture the output string and pass it straight into summarize_ops
> +# This satisfies the framework's internal parameter bounds without triggering set -u.

Too much LLM smell here.

In general, I don't think we need a bench for tail calls.
tail calls are more or less deprecated and often in the way of implementing
new features (like 6+ arguments). There is no need to work on
improving their performance.

pw-bot: cr

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-30 18:10 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-30  2:23 [PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance Tiezhu Yang
2026-06-30  3:11 ` bot+bpf-ci
2026-06-30 18:10 ` Alexei Starovoitov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox