* [PATCH bpf-next] bpf: Optimize recursion detection for arm64
@ 2025-11-04 16:49 Puranjay Mohan
2025-11-04 23:52 ` Alexei Starovoitov
From: Puranjay Mohan @ 2025-11-04 16:49 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team
BPF programs detect recursion by a per-cpu active flag in struct
bpf_prog. This flag is set/unset in the trampoline using atomic
operations to prevent inter-context recursion.
Some arm64 platforms have slow per-CPU atomic operations, for example,
the Neoverse V2. This commit therefore changes the recursion detection
mechanism to allow four levels of recursion (normal -> softirq -> hardirq
-> NMI). By allowing this limited recursion, we can stop using atomic
operations. This approach is similar to get_recursion_context() in perf.
Change active to a per-cpu array of four u8 values, one for each context,
and use non-atomic increment/decrement on them.
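The context index comes from the existing interrupt_context_level() helper
in include/linux/preempt.h; roughly (a simplified sketch for reference only,
not part of this patch):

static __always_inline unsigned char interrupt_context_level(void)
{
	unsigned long pc = preempt_count();
	unsigned char level = 0;

	/* each test adds 1 for the deepest context we are currently in */
	level += !!(pc & NMI_MASK);
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

	return level;	/* 0 = task, 1 = softirq, 2 = hardirq, 3 = NMI */
}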
This improves the performance on ARM64 (64-CPU Neoverse-N1):
+----------------+-------------------+-------------------+---------+
| Benchmark | Base run | Patched run | Δ (%) |
+----------------+-------------------+-------------------+---------+
| fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
| fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
| fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
| rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
| tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
+----------------+-------------------+-------------------+---------+
Benchmarked using: tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
include/linux/bpf.h | 4 +++-
kernel/bpf/core.c | 3 ++-
kernel/bpf/trampoline.c | 22 ++++++++++++++++++----
kernel/trace/bpf_trace.c | 11 +++++++----
4 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a47d67db3be5..920902e0f384 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1728,6 +1728,8 @@ struct bpf_prog_aux {
struct bpf_stream stream[2];
};
+#define BPF_NR_CONTEXTS 4
+
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
@@ -1754,7 +1756,7 @@ struct bpf_prog {
u8 tag[BPF_TAG_SIZE];
};
struct bpf_prog_stats __percpu *stats;
- int __percpu *active;
+ u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
struct bpf_prog_aux *aux; /* Auxiliary fields */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d595fe512498..6fe2e22385a6 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 8,
+ bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5949095e51c3..e6b9c7e34990 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,11 +899,15 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active;
+
rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -944,10 +948,13 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active = this_cpu_ptr(prog->active);
+
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
rcu_read_unlock_migrate();
}
@@ -977,13 +984,17 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active;
+
rcu_read_lock_trace();
migrate_disable();
might_fault();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -995,10 +1006,13 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active = this_cpu_ptr(prog->active);
+
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
migrate_enable();
rcu_read_unlock_trace();
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a795f7afbf3d..4c0751710cff 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2059,14 +2059,18 @@ static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
struct bpf_prog *prog = link->link.prog;
+ u8 rctx = interrupt_context_level();
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
+ u8 *active;
cant_sleep();
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(active[rctx])) {
bpf_prog_inc_misses_counter(prog);
- goto out;
+ return;
}
+ active[rctx]++;
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
@@ -2076,8 +2080,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
rcu_read_unlock();
bpf_reset_run_ctx(old_run_ctx);
-out:
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
}
#define UNPACK(...) __VA_ARGS__
--
2.47.3
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 16:49 [PATCH bpf-next] bpf: Optimize recursion detection for arm64 Puranjay Mohan
@ 2025-11-04 23:52 ` Alexei Starovoitov
2025-11-05 1:30 ` Alexei Starovoitov
2025-11-05 5:26 ` Puranjay Mohan
From: Alexei Starovoitov @ 2025-11-04 23:52 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> BPF programs detect recursion by a per-cpu active flag in struct
> bpf_prog. This flag is set/unset in the trampoline using atomic
> operations to prevent inter-context recursion.
>
> Some arm64 platforms have slow per-CPU atomic operations, for example,
> the Neoverse V2. This commit therefore changes the recursion detection
> mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> -> NMI). With allowing limited recursion, we can now stop using atomic
> operations. This approach is similar to get_recursion_context() in perf.
>
> Change active to a per-cpu array of four u8 values, one for each context
> and use non-atomic increment/decrement on them.
>
> This improves the performance on ARM64 (64-CPU Neoverse-N1):
>
> +----------------+-------------------+-------------------+---------+
> | Benchmark | Base run | Patched run | Δ (%) |
> +----------------+-------------------+-------------------+---------+
> | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> +----------------+-------------------+-------------------+---------+
The gain is nice, but the absolute numbers look very low.
I see fentry doing 52M/s on a debug kernel with KASAN inside a VM.
The patch itself looks good to me, but I realized that we cannot
use this approach for progs with a private stack,
since they require strictly one user per CPU.
Also, tracing progs might have a conceptually similar restriction.
A prog could use a per-cpu map to store some data.
If the prog is attached to a function that may be called from both
task and irq context, the irq execution will write over the per-cpu
data, and when it returns, the same prog in task context will see garbage.
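To illustrate the hazard with a hypothetical tracing prog (the attach
point, map name, and values are made up for the example):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* one per-cpu scratch slot shared by every invocation on this CPU */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, u64);
} scratch SEC(".maps");

SEC("fentry/some_kernel_func")
int BPF_PROG(track)
{
	u32 key = 0;
	u64 *val = bpf_map_lookup_elem(&scratch, &key);

	if (!val)
		return 0;
	*val = 42;	/* task-context invocation stores its state ... */
	/* ... an irq arrives here, runs this same prog on this CPU and
	 * overwrites *val; the task-context invocation then reads back
	 * a value it never wrote. */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";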
I'm afraid the get_recursion_context() approach won't work. Sorry for
the not-thought-through suggestion.
Looking at the other thread, it looks like this_cpu_inc_return()
is actually fast on arm64, while this_cpu_inc() is horrible.
And we're using the _return() flavor almost everywhere,
so it's probably fine, but this patch shows that there is room
for improvement.
Please check why the absolute numbers are so low, though.
Also, let's benchmark xchg(prog->active, 1) vs this_cpu_inc_return(),
and its variant this_cpu_xchg().
xchg() will probably be slower.
this_cpu_xchg() may be faster?
Please test a few x86 and arm64 setups.
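For concreteness, the enter-path checks to compare would look roughly like
this (an untested sketch; the helper names are made up, and with the xchg
variants the exit path would store 0 instead of decrementing):

/* prog->active here is the original per-cpu int; each helper returns
 * true when recursion is detected on this CPU. */
static __always_inline bool enter_inc_return(struct bpf_prog *prog)
{
	return this_cpu_inc_return(*(prog->active)) != 1;
}

static __always_inline bool enter_xchg(struct bpf_prog *prog)
{
	return xchg(this_cpu_ptr(prog->active), 1) != 0;
}

static __always_inline bool enter_this_cpu_xchg(struct bpf_prog *prog)
{
	return this_cpu_xchg(*(prog->active), 1) != 0;
}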
pw-bot: cr
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 23:52 ` Alexei Starovoitov
@ 2025-11-05 1:30 ` Alexei Starovoitov
2025-11-05 6:30 ` Puranjay Mohan
2025-11-05 5:26 ` Puranjay Mohan
From: Alexei Starovoitov @ 2025-11-05 1:30 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Tue, Nov 4, 2025 at 3:52 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > BPF programs detect recursion by a per-cpu active flag in struct
> > bpf_prog. This flag is set/unset in the trampoline using atomic
> > operations to prevent inter-context recursion.
> >
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore changes the recursion detection
> > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > -> NMI). With allowing limited recursion, we can now stop using atomic
> > operations. This approach is similar to get_recursion_context() in perf.
> >
> > Change active to a per-cpu array of four u8 values, one for each context
> > and use non-atomic increment/decrement on them.
> >
> > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> >
> > +----------------+-------------------+-------------------+---------+
> > | Benchmark | Base run | Patched run | Δ (%) |
> > +----------------+-------------------+-------------------+---------+
> > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > +----------------+-------------------+-------------------+---------+
>
> The gain is nice, but absolute numbers look very low.
> I see fentry doing 52M on the debug kernel with kasan inside VM.
>
> The patch itself looks good to me, but I realized that we cannot
> use this approach for progs with a private stack,
> since they require a strict one user per cpu.
>
> Also tracing progs might have conceptually similar restriction.
> A prog could use per-cpu map to store some data.
> If prog is attached to some function that may be called from
> task and irq context the irq execution will write over per-cpu data
> and when it returns the same prog in task context will see garbage.
> I'm afraid get_recursion_context() approach won't work. Sorry for
> not-thought-through suggestion.
Actually the get_recursion_context() approach can be salvaged.
Instead of:
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
how about
active = this_cpu_ptr(prog->active);
++active[rctx];
if (unlikely(*(u32 *)active != 1 << rctx * 8)) {
that should preserve the single-prog-per-CPU rule,
and hopefully have better performance than this_cpu_inc_return(),
xchg(), and this_cpu_xchg().
Also, I noticed that we use this_cpu_dec(), which is probably just as slow.
So the first experiment to do is:
- this_cpu_dec(*(prog->active));
+ this_cpu_dec_return(*(prog->active));
Also, as a pre-patch, please wrap the inc/dec into two helpers
and use them everywhere.
That will simplify all these experiments.
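A minimal sketch of such wrappers (the names and placement are illustrative,
not a requirement):

static __always_inline bool bpf_prog_enter_active(struct bpf_prog *prog)
{
	/* true when this prog is the only active user on this CPU */
	return this_cpu_inc_return(*(prog->active)) == 1;
}

static __always_inline void bpf_prog_exit_active(struct bpf_prog *prog)
{
	this_cpu_dec(*(prog->active));
}

With those two helpers in place, trying this_cpu_dec_return(), xchg(),
this_cpu_xchg(), or the per-context array only means touching two spots.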
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 23:52 ` Alexei Starovoitov
2025-11-05 1:30 ` Alexei Starovoitov
@ 2025-11-05 5:26 ` Puranjay Mohan
From: Puranjay Mohan @ 2025-11-05 5:26 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Puranjay Mohan, bpf, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Wed, Nov 5, 2025 at 12:52 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > BPF programs detect recursion by a per-cpu active flag in struct
> > bpf_prog. This flag is set/unset in the trampoline using atomic
> > operations to prevent inter-context recursion.
> >
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore changes the recursion detection
> > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > -> NMI). With allowing limited recursion, we can now stop using atomic
> > operations. This approach is similar to get_recursion_context() in perf.
> >
> > Change active to a per-cpu array of four u8 values, one for each context
> > and use non-atomic increment/decrement on them.
> >
> > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> >
> > +----------------+-------------------+-------------------+---------+
> > | Benchmark | Base run | Patched run | Δ (%) |
> > +----------------+-------------------+-------------------+---------+
> > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > +----------------+-------------------+-------------------+---------+
>
> The gain is nice, but absolute numbers look very low.
> I see fentry doing 52M on the debug kernel with kasan inside VM.
>
> The patch itself looks good to me, but I realized that we cannot
> use this approach for progs with a private stack,
> since they require a strict one user per cpu.
I figured that out after sending the patch and was going to suggest
a per-cpu, per-context private stack, but that is overkill.
> Also tracing progs might have conceptually similar restriction.
> A prog could use per-cpu map to store some data.
> If prog is attached to some function that may be called from
> task and irq context the irq execution will write over per-cpu data
> and when it returns the same prog in task context will see garbage.
> I'm afraid get_recursion_context() approach won't work. Sorry for
> not-thought-through suggestion.
>
> Looking at the other thread it looks like this_cpu_inc_return()
> is actually fast on arm64, while this_cpu_inc() is horrible.
> And we're using _return() flavor almost everywhere,
> so it's probably fine, but this patch shows that there is room
> for improvement.
> Please check why absolute numbers are so low though.
I was using KVM with QEMU and gave it 32 CPUs; I will try a bare-metal
host to see if I get better numbers.
>
> Also let's benchmark xchg(prog->active, 1) vs this_cpu_inc_return().
> And its variant this_cpu_xchg().
> xchg() will probably be slower.
> this_cpu_xchg() may be faster?
> pls test a few x86 and arm64 setups.
>
> pw-bot: cr
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-05 1:30 ` Alexei Starovoitov
@ 2025-11-05 6:30 ` Puranjay Mohan
From: Puranjay Mohan @ 2025-11-05 6:30 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Puranjay Mohan, bpf, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Wed, Nov 5, 2025 at 2:30 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 3:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> > >
> > > BPF programs detect recursion by a per-cpu active flag in struct
> > > bpf_prog. This flag is set/unset in the trampoline using atomic
> > > operations to prevent inter-context recursion.
> > >
> > > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > > the Neoverse V2. This commit therefore changes the recursion detection
> > > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > > -> NMI). With allowing limited recursion, we can now stop using atomic
> > > operations. This approach is similar to get_recursion_context() in perf.
> > >
> > > Change active to a per-cpu array of four u8 values, one for each context
> > > and use non-atomic increment/decrement on them.
> > >
> > > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> > >
> > > +----------------+-------------------+-------------------+---------+
> > > | Benchmark | Base run | Patched run | Δ (%) |
> > > +----------------+-------------------+-------------------+---------+
> > > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > > +----------------+-------------------+-------------------+---------+
> >
> > The gain is nice, but absolute numbers look very low.
> > I see fentry doing 52M on the debug kernel with kasan inside VM.
> >
> > The patch itself looks good to me, but I realized that we cannot
> > use this approach for progs with a private stack,
> > since they require a strict one user per cpu.
> >
> > Also tracing progs might have conceptually similar restriction.
> > A prog could use per-cpu map to store some data.
> > If prog is attached to some function that may be called from
> > task and irq context the irq execution will write over per-cpu data
> > and when it returns the same prog in task context will see garbage.
> > I'm afraid get_recursion_context() approach won't work. Sorry for
> > not-thought-through suggestion.
>
> Actually the get_recursion_context() approach can be salvaged.
> Instead of:
> + active = this_cpu_ptr(prog->active);
> + if (unlikely(++active[rctx] != 1)) {
>
> how about
> active = this_cpu_ptr(prog->active);
> ++active[rctx];
> if (unlikely(*(u32 *)active != 1 << rctx * 8)) {
Yes, I think this should work after changing it to be endianness-safe.
This should be the fastest option, as it doesn't use any atomic operations.
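One endianness-safe way to phrase it could be something like this (a rough,
untested sketch; the helper name is made up):

static __always_inline bool bpf_prog_active_nested(u8 *active, u8 rctx)
{
	++active[rctx];
	/* Read the four context counters as one 32-bit load, but interpret
	 * the bytes in a fixed little-endian order, so byte rctx always
	 * lands at bit position rctx * 8 on both LE and BE hosts. */
	return le32_to_cpup((__le32 *)active) != 1U << rctx * 8;
}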
> that should preserve single prog per cpu rule,
> and hopefully have better performance than this_cpu_inc_return,
> xchg, and this_cpu_xchg.
>
> Also noticed that we use this_cpu_dec() which is probably just as slow.
> So the first experiment to do is:
> - this_cpu_dec(*(prog->active));
> + this_cpu_dec_return(*(prog->active));
Okay, I will try this first.
>
> Also as a pre-patch please wrap inc/dec into two helpers
> and use them everywhere.
> Will simplify all these experiments.
So, I will go ahead and test all the different setups on arm64 and see
which is the best.
Thanks,
Puranjay