* [PATCH bpf-next] bpf: Optimize recursion detection for arm64
@ 2025-11-04 16:49 Puranjay Mohan
2025-11-04 23:52 ` Alexei Starovoitov
From: Puranjay Mohan @ 2025-11-04 16:49 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team
BPF programs detect recursion by a per-cpu active flag in struct
bpf_prog. This flag is set/unset in the trampoline using atomic
operations to prevent inter-context recursion.
Some arm64 platforms have slow per-CPU atomic operations, for example,
the Neoverse V2. This commit therefore changes the recursion detection
mechanism to allow four levels of recursion (normal -> softirq -> hardirq
-> NMI). By allowing this limited recursion, we can stop using atomic
operations. This approach is similar to get_recursion_context() in perf.
Change active to a per-cpu array of four u8 values, one for each context,
and use non-atomic increment/decrement on them.
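The context index comes from the existing interrupt_context_level() helper
in include/linux/preempt.h; roughly (a simplified sketch for reference only,
not part of this patch):

static __always_inline unsigned char interrupt_context_level(void)
{
	unsigned long pc = preempt_count();
	unsigned char level = 0;

	/* each test adds 1 for the deepest context we are currently in */
	level += !!(pc & NMI_MASK);
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

	return level;	/* 0 = task, 1 = softirq, 2 = hardirq, 3 = NMI */
}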
This improves the performance on ARM64 (64-CPU Neoverse-N1):
+----------------+-------------------+-------------------+---------+
| Benchmark | Base run | Patched run | Δ (%) |
+----------------+-------------------+-------------------+---------+
| fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
| fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
| fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
| rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
| tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
+----------------+-------------------+-------------------+---------+
Benchmarked using: tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
include/linux/bpf.h | 4 +++-
kernel/bpf/core.c | 3 ++-
kernel/bpf/trampoline.c | 22 ++++++++++++++++++----
kernel/trace/bpf_trace.c | 11 +++++++----
4 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a47d67db3be5..920902e0f384 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1728,6 +1728,8 @@ struct bpf_prog_aux {
struct bpf_stream stream[2];
};
+#define BPF_NR_CONTEXTS 4
+
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
@@ -1754,7 +1756,7 @@ struct bpf_prog {
u8 tag[BPF_TAG_SIZE];
};
struct bpf_prog_stats __percpu *stats;
- int __percpu *active;
+ u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
struct bpf_prog_aux *aux; /* Auxiliary fields */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d595fe512498..6fe2e22385a6 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 8,
+ bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5949095e51c3..e6b9c7e34990 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,11 +899,15 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active;
+
rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -944,10 +948,13 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active = this_cpu_ptr(prog->active);
+
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
rcu_read_unlock_migrate();
}
@@ -977,13 +984,17 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active;
+
rcu_read_lock_trace();
migrate_disable();
might_fault();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -995,10 +1006,13 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
{
+ u8 rctx = interrupt_context_level();
+ u8 *active = this_cpu_ptr(prog->active);
+
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
migrate_enable();
rcu_read_unlock_trace();
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a795f7afbf3d..4c0751710cff 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2059,14 +2059,18 @@ static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
struct bpf_prog *prog = link->link.prog;
+ u8 rctx = interrupt_context_level();
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
+ u8 *active;
cant_sleep();
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(active[rctx])) {
bpf_prog_inc_misses_counter(prog);
- goto out;
+ return;
}
+ active[rctx]++;
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
@@ -2076,8 +2080,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
rcu_read_unlock();
bpf_reset_run_ctx(old_run_ctx);
-out:
- this_cpu_dec(*(prog->active));
+ active[rctx]--;
}
#define UNPACK(...) __VA_ARGS__
--
2.47.3
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 16:49 [PATCH bpf-next] bpf: Optimize recursion detection for arm64 Puranjay Mohan
@ 2025-11-04 23:52 ` Alexei Starovoitov
2025-11-05 1:30 ` Alexei Starovoitov
2025-11-05 5:26 ` Puranjay Mohan
From: Alexei Starovoitov @ 2025-11-04 23:52 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> BPF programs detect recursion by a per-cpu active flag in struct
> bpf_prog. This flag is set/unset in the trampoline using atomic
> operations to prevent inter-context recursion.
>
> Some arm64 platforms have slow per-CPU atomic operations, for example,
> the Neoverse V2. This commit therefore changes the recursion detection
> mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> -> NMI). With allowing limited recursion, we can now stop using atomic
> operations. This approach is similar to get_recursion_context() in perf.
>
> Change active to a per-cpu array of four u8 values, one for each context
> and use non-atomic increment/decrement on them.
>
> This improves the performance on ARM64 (64-CPU Neoverse-N1):
>
> +----------------+-------------------+-------------------+---------+
> | Benchmark | Base run | Patched run | Δ (%) |
> +----------------+-------------------+-------------------+---------+
> | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> +----------------+-------------------+-------------------+---------+
The gain is nice, but the absolute numbers look very low.
I see fentry doing 52M/s on a debug kernel with KASAN inside a VM.
The patch itself looks good to me, but I realized that we cannot
use this approach for progs with a private stack,
since they require strictly one user per CPU.
Also, tracing progs might have a conceptually similar restriction.
A prog could use a per-cpu map to store some data.
If the prog is attached to a function that may be called from both
task and irq context, the irq execution will write over the per-cpu
data, and when it returns, the same prog in task context will see garbage.
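To illustrate the hazard with a hypothetical tracing prog (the attach
point, map name, and values are made up for the example):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* one per-cpu scratch slot shared by every invocation on this CPU */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, u64);
} scratch SEC(".maps");

SEC("fentry/some_kernel_func")
int BPF_PROG(track)
{
	u32 key = 0;
	u64 *val = bpf_map_lookup_elem(&scratch, &key);

	if (!val)
		return 0;
	*val = 42;	/* task-context invocation stores its state ... */
	/* ... an irq arrives here, runs this same prog on this CPU and
	 * overwrites *val; the task-context invocation then reads back
	 * a value it never wrote. */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";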
I'm afraid the get_recursion_context() approach won't work. Sorry for
the not-thought-through suggestion.
Looking at the other thread, it looks like this_cpu_inc_return()
is actually fast on arm64, while this_cpu_inc() is horrible.
And we're using the _return() flavor almost everywhere,
so it's probably fine, but this patch shows that there is room
for improvement.
Please check why the absolute numbers are so low, though.
Also, let's benchmark xchg(prog->active, 1) vs this_cpu_inc_return(),
and its variant this_cpu_xchg().
xchg() will probably be slower.
this_cpu_xchg() may be faster?
Please test a few x86 and arm64 setups.
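For concreteness, the enter-path checks to compare would look roughly like
this (an untested sketch; the helper names are made up, and with the xchg
variants the exit path would store 0 instead of decrementing):

/* prog->active here is the original per-cpu int; each helper returns
 * true when recursion is detected on this CPU. */
static __always_inline bool enter_inc_return(struct bpf_prog *prog)
{
	return this_cpu_inc_return(*(prog->active)) != 1;
}

static __always_inline bool enter_xchg(struct bpf_prog *prog)
{
	return xchg(this_cpu_ptr(prog->active), 1) != 0;
}

static __always_inline bool enter_this_cpu_xchg(struct bpf_prog *prog)
{
	return this_cpu_xchg(*(prog->active), 1) != 0;
}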
pw-bot: cr
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 23:52 ` Alexei Starovoitov
@ 2025-11-05 1:30 ` Alexei Starovoitov
2025-11-05 6:30 ` Puranjay Mohan
2025-11-05 5:26 ` Puranjay Mohan
From: Alexei Starovoitov @ 2025-11-05 1:30 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Tue, Nov 4, 2025 at 3:52 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > BPF programs detect recursion by a per-cpu active flag in struct
> > bpf_prog. This flag is set/unset in the trampoline using atomic
> > operations to prevent inter-context recursion.
> >
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore changes the recursion detection
> > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > -> NMI). With allowing limited recursion, we can now stop using atomic
> > operations. This approach is similar to get_recursion_context() in perf.
> >
> > Change active to a per-cpu array of four u8 values, one for each context
> > and use non-atomic increment/decrement on them.
> >
> > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> >
> > +----------------+-------------------+-------------------+---------+
> > | Benchmark | Base run | Patched run | Δ (%) |
> > +----------------+-------------------+-------------------+---------+
> > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > +----------------+-------------------+-------------------+---------+
>
> The gain is nice, but absolute numbers look very low.
> I see fentry doing 52M on the debug kernel with kasan inside VM.
>
> The patch itself looks good to me, but I realized that we cannot
> use this approach for progs with a private stack,
> since they require a strict one user per cpu.
>
> Also tracing progs might have conceptually similar restriction.
> A prog could use per-cpu map to store some data.
> If prog is attached to some function that may be called from
> task and irq context the irq execution will write over per-cpu data
> and when it returns the same prog in task context will see garbage.
> I'm afraid get_recursion_context() approach won't work. Sorry for
> not-thought-through suggestion.
Actually the get_recursion_context() approach can be salvaged.
Instead of:
+ active = this_cpu_ptr(prog->active);
+ if (unlikely(++active[rctx] != 1)) {
how about
active = this_cpu_ptr(prog->active);
++active[rctx];
if (unlikely(*(u32 *)active != 1 << rctx * 8)) {
that should preserve the single-prog-per-CPU rule,
and hopefully have better performance than this_cpu_inc_return(),
xchg(), and this_cpu_xchg().
Also, I noticed that we use this_cpu_dec(), which is probably just as slow.
So the first experiment to do is:
- this_cpu_dec(*(prog->active));
+ this_cpu_dec_return(*(prog->active));
Also, as a pre-patch, please wrap the inc/dec into two helpers
and use them everywhere.
That will simplify all these experiments.
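A minimal sketch of such wrappers (the names and placement are illustrative,
not a requirement):

static __always_inline bool bpf_prog_enter_active(struct bpf_prog *prog)
{
	/* true when this prog is the only active user on this CPU */
	return this_cpu_inc_return(*(prog->active)) == 1;
}

static __always_inline void bpf_prog_exit_active(struct bpf_prog *prog)
{
	this_cpu_dec(*(prog->active));
}

With those two helpers in place, trying this_cpu_dec_return(), xchg(),
this_cpu_xchg(), or the per-context array only means touching two spots.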
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-04 23:52 ` Alexei Starovoitov
2025-11-05 1:30 ` Alexei Starovoitov
@ 2025-11-05 5:26 ` Puranjay Mohan
From: Puranjay Mohan @ 2025-11-05 5:26 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Puranjay Mohan, bpf, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Wed, Nov 5, 2025 at 12:52 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > BPF programs detect recursion by a per-cpu active flag in struct
> > bpf_prog. This flag is set/unset in the trampoline using atomic
> > operations to prevent inter-context recursion.
> >
> > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > the Neoverse V2. This commit therefore changes the recursion detection
> > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > -> NMI). With allowing limited recursion, we can now stop using atomic
> > operations. This approach is similar to get_recursion_context() in perf.
> >
> > Change active to a per-cpu array of four u8 values, one for each context
> > and use non-atomic increment/decrement on them.
> >
> > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> >
> > +----------------+-------------------+-------------------+---------+
> > | Benchmark | Base run | Patched run | Δ (%) |
> > +----------------+-------------------+-------------------+---------+
> > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > +----------------+-------------------+-------------------+---------+
>
> The gain is nice, but absolute numbers look very low.
> I see fentry doing 52M on the debug kernel with kasan inside VM.
>
> The patch itself looks good to me, but I realized that we cannot
> use this approach for progs with a private stack,
> since they require a strict one user per cpu.
I figured that out after sending the patch and was going to suggest
a per-cpu, per-context private stack, but that is overkill.
> Also tracing progs might have conceptually similar restriction.
> A prog could use per-cpu map to store some data.
> If prog is attached to some function that may be called from
> task and irq context the irq execution will write over per-cpu data
> and when it returns the same prog in task context will see garbage.
> I'm afraid get_recursion_context() approach won't work. Sorry for
> not-thought-through suggestion.
>
> Looking at the other thread it looks like this_cpu_inc_return()
> is actually fast on arm64, while this_cpu_inc() is horrible.
> And we're using _return() flavor almost everywhere,
> so it's probably fine, but this patch shows that there is room
> for improvement.
> Please check why absolute numbers are so low though.
I was using KVM with QEMU and gave it 32 CPUs; I will try a bare-metal
host to see if I get better numbers.
>
> Also let's benchmark xchg(prog->active, 1) vs this_cpu_inc_return().
> And its variant this_cpu_xchg().
> xchg() will probably be slower.
> this_cpu_xchg() may be faster?
> pls test a few x86 and arm64 setups.
>
> pw-bot: cr
* Re: [PATCH bpf-next] bpf: Optimize recursion detection for arm64
2025-11-05 1:30 ` Alexei Starovoitov
@ 2025-11-05 6:30 ` Puranjay Mohan
From: Puranjay Mohan @ 2025-11-05 6:30 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Puranjay Mohan, bpf, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Kernel Team
On Wed, Nov 5, 2025 at 2:30 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 3:52 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Nov 4, 2025 at 8:49 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> > >
> > > BPF programs detect recursion by a per-cpu active flag in struct
> > > bpf_prog. This flag is set/unset in the trampoline using atomic
> > > operations to prevent inter-context recursion.
> > >
> > > Some arm64 platforms have slow per-CPU atomic operations, for example,
> > > the Neoverse V2. This commit therefore changes the recursion detection
> > > mechanism to allow four levels of recursion (normal -> softirq -> hardirq
> > > -> NMI). With allowing limited recursion, we can now stop using atomic
> > > operations. This approach is similar to get_recursion_context() in perf.
> > >
> > > Change active to a per-cpu array of four u8 values, one for each context
> > > and use non-atomic increment/decrement on them.
> > >
> > > This improves the performance on ARM64 (64-CPU Neoverse-N1):
> > >
> > > +----------------+-------------------+-------------------+---------+
> > > | Benchmark | Base run | Patched run | Δ (%) |
> > > +----------------+-------------------+-------------------+---------+
> > > | fentry | 3.694 ± 0.003M/s | 3.828 ± 0.007M/s | +3.63% |
> > > | fexit | 1.389 ± 0.006M/s | 1.406 ± 0.003M/s | +1.22% |
> > > | fmodret | 1.366 ± 0.011M/s | 1.398 ± 0.002M/s | +2.34% |
> > > | rawtp | 3.453 ± 0.026M/s | 3.714 ± 0.003M/s | +7.56% |
> > > | tp | 2.596 ± 0.005M/s | 2.699 ± 0.006M/s | +3.97% |
> > > +----------------+-------------------+-------------------+---------+
> >
> > The gain is nice, but absolute numbers look very low.
> > I see fentry doing 52M on the debug kernel with kasan inside VM.
> >
> > The patch itself looks good to me, but I realized that we cannot
> > use this approach for progs with a private stack,
> > since they require a strict one user per cpu.
> >
> > Also tracing progs might have conceptually similar restriction.
> > A prog could use per-cpu map to store some data.
> > If prog is attached to some function that may be called from
> > task and irq context the irq execution will write over per-cpu data
> > and when it returns the same prog in task context will see garbage.
> > I'm afraid get_recursion_context() approach won't work. Sorry for
> > not-thought-through suggestion.
>
> Actually the get_recursion_context() approach can be salvaged.
> Instead of:
> + active = this_cpu_ptr(prog->active);
> + if (unlikely(++active[rctx] != 1)) {
>
> how about
> active = this_cpu_ptr(prog->active);
> ++active[rctx];
> if (unlikely(*(u32 *)active != 1 << rctx * 8)) {
Yes, I think this should work after changing it to be endianness-safe.
This should be the fastest option, as it doesn't use any atomic operations.
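One endianness-safe way to phrase it could be something like this (a rough,
untested sketch; the helper name is made up):

static __always_inline bool bpf_prog_active_nested(u8 *active, u8 rctx)
{
	++active[rctx];
	/* Read the four context counters as one 32-bit load, but interpret
	 * the bytes in a fixed little-endian order, so byte rctx always
	 * lands at bit position rctx * 8 on both LE and BE hosts. */
	return le32_to_cpup((__le32 *)active) != 1U << rctx * 8;
}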
> that should preserve single prog per cpu rule,
> and hopefully have better performance than this_cpu_inc_return,
> xchg, and this_cpu_xchg.
>
> Also noticed that we use this_cpu_dec() which is probably just as slow.
> So the first experiment to do is:
> - this_cpu_dec(*(prog->active));
> + this_cpu_dec_return(*(prog->active));
Okay, I will try this first.
>
> Also as a pre-patch please wrap inc/dec into two helpers
> and use them everywhere.
> Will simplify all these experiments.
So, I will go ahead and test all the different setups on arm64 and see
which is the best.
Thanks,
Puranjay