* [PATCH bpf-next v2 0/2] bpf, x86: inline bpf_get_current_task() for x86_64 @ 2026-01-04 13:16 Menglong Dong 2026-01-04 13:16 ` [PATCH bpf-next v2 1/2] " Menglong Dong 2026-01-04 13:16 ` [PATCH bpf-next v2 2/2] selftests/bpf: test the jited inline of bpf_get_current_task Menglong Dong 0 siblings, 2 replies; 7+ messages in thread From: Menglong Dong @ 2026-01-04 13:16 UTC (permalink / raw) To: ast, eddyz87 Cc: davem, dsahern, daniel, andrii, martin.lau, song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, tglx, mingo, bp, dave.hansen, x86, hpa, netdev, bpf, linux-kernel Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 to obtain better performance, and add the testcase for it. Changes since v1: - add the testcase - remove the usage of const_current_task Menglong Dong (2): bpf, x86: inline bpf_get_current_task() for x86_64 selftests/bpf: test the jited inline of bpf_get_current_task arch/x86/net/bpf_jit_comp.c | 34 ++++++++++++++++++ .../selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_jit_inline.c | 35 +++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c -- 2.52.0 ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH bpf-next v2 1/2] bpf, x86: inline bpf_get_current_task() for x86_64 2026-01-04 13:16 [PATCH bpf-next v2 0/2] bpf, x86: inline bpf_get_current_task() for x86_64 Menglong Dong @ 2026-01-04 13:16 ` Menglong Dong 2026-01-05 17:45 ` Eduard Zingerman 2026-01-05 18:04 ` Alexei Starovoitov 2026-01-04 13:16 ` [PATCH bpf-next v2 2/2] selftests/bpf: test the jited inline of bpf_get_current_task Menglong Dong 1 sibling, 2 replies; 7+ messages in thread From: Menglong Dong @ 2026-01-04 13:16 UTC (permalink / raw) To: ast, eddyz87 Cc: davem, dsahern, daniel, andrii, martin.lau, song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, tglx, mingo, bp, dave.hansen, x86, hpa, netdev, bpf, linux-kernel Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 to obtain better performance. The instruction we use here is: 65 48 8B 04 25 [offset] // mov rax, gs:[offset] Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> --- v2: - check the variable type in emit_ldx_percpu_r0 with __verify_pcpu_ptr - remove the usage of const_current_task --- arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e3b1c4b1d550..f5ff7c77aad7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1300,6 +1300,25 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); } +static void __emit_ldx_percpu_r0(u8 **pprog, __force unsigned long ptr) +{ + u8 *prog = *pprog; + + /* mov rax, gs:[ptr] */ + EMIT2(0x65, 0x48); + EMIT2(0x8B, 0x04); + EMIT1(0x25); + EMIT((u32)ptr, 4); + + *pprog = prog; +} + +#define emit_ldx_percpu_r0(prog, variable) \ + do { \ + __verify_pcpu_ptr(&(variable)); \ + __emit_ldx_percpu_r0(&prog, (__force unsigned long)&(variable));\ + } while (0) + static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, u32 dst_reg, u32 src_reg, 
s16 off, u8 bpf_size) { @@ -2441,6 +2460,12 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: { u8 *ip = image + addrs[i - 1]; + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task || + insn->imm == BPF_FUNC_get_current_task_btf)) { + emit_ldx_percpu_r0(prog, current_task); + break; + } + func = (u8 *) __bpf_call_base + imm32; if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) { LOAD_TAIL_CALL_CNT_PTR(stack_depth); @@ -4082,3 +4107,14 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_current_task: + case BPF_FUNC_get_current_task_btf: + return true; + default: + return false; + } +} -- 2.52.0 ^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf, x86: inline bpf_get_current_task() for x86_64 2026-01-04 13:16 ` [PATCH bpf-next v2 1/2] " Menglong Dong @ 2026-01-05 17:45 ` Eduard Zingerman 2026-01-06 1:57 ` Menglong Dong 2026-01-05 18:04 ` Alexei Starovoitov 1 sibling, 1 reply; 7+ messages in thread From: Eduard Zingerman @ 2026-01-05 17:45 UTC (permalink / raw) To: Menglong Dong, ast Cc: davem, dsahern, daniel, andrii, martin.lau, song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, tglx, mingo, bp, dave.hansen, x86, hpa, netdev, bpf, linux-kernel On Sun, 2026-01-04 at 21:16 +0800, Menglong Dong wrote: > Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 > to obtain better performance. The instruction we use here is: > > 65 48 8B 04 25 [offset] // mov rax, gs:[offset] > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> > --- > v2: > - check the variable type in emit_ldx_percpu_r0 with __verify_pcpu_ptr > - remove the usage of const_current_task > --- > arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 36 insertions(+) > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > index e3b1c4b1d550..f5ff7c77aad7 100644 > --- a/arch/x86/net/bpf_jit_comp.c > +++ b/arch/x86/net/bpf_jit_comp.c > @@ -1300,6 +1300,25 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) > emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); > } > > +static void __emit_ldx_percpu_r0(u8 **pprog, __force unsigned long ptr) > +{ > + u8 *prog = *pprog; > + > + /* mov rax, gs:[ptr] */ > + EMIT2(0x65, 0x48); > + EMIT2(0x8B, 0x04); > + EMIT1(0x25); > + EMIT((u32)ptr, 4); > + > + *pprog = prog; > +} > + > +#define emit_ldx_percpu_r0(prog, variable) \ > + do { \ > + __verify_pcpu_ptr(&(variable)); \ > + __emit_ldx_percpu_r0(&prog, (__force unsigned long)&(variable));\ > + } while (0) > + > static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, > u32 dst_reg, u32 src_reg, s16 off, u8 
bpf_size) > { > @@ -2441,6 +2460,12 @@ st: if (is_imm8(insn->off)) > case BPF_JMP | BPF_CALL: { > u8 *ip = image + addrs[i - 1]; > > + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task || > + insn->imm == BPF_FUNC_get_current_task_btf)) { I think this should be guarded by IS_ENABLED(CONFIG_SMP). The current.h:get_current() used arch/x86/include/asm/percpu.h:this_cpu_read_stable() that is unrolled to __raw_cpu_read_stable(), which uses __force_percpu_arg(), which uses __force_percpu_prefix, which is defined differently depending on CONFIG_SMP. > + emit_ldx_percpu_r0(prog, current_task); > + break; > + } > + > func = (u8 *) __bpf_call_base + imm32; > if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) { > LOAD_TAIL_CALL_CNT_PTR(stack_depth); > @@ -4082,3 +4107,14 @@ bool bpf_jit_supports_timed_may_goto(void) > { > return true; > } > + > +bool bpf_jit_inlines_helper_call(s32 imm) > +{ > + switch (imm) { > + case BPF_FUNC_get_current_task: > + case BPF_FUNC_get_current_task_btf: > + return true; > + default: > + return false; > + } > +} ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf, x86: inline bpf_get_current_task() for x86_64 2026-01-05 17:45 ` Eduard Zingerman @ 2026-01-06 1:57 ` Menglong Dong 0 siblings, 0 replies; 7+ messages in thread From: Menglong Dong @ 2026-01-06 1:57 UTC (permalink / raw) To: Menglong Dong, ast, Eduard Zingerman Cc: davem, dsahern, daniel, andrii, martin.lau, song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, tglx, mingo, bp, dave.hansen, x86, hpa, netdev, bpf, linux-kernel On 2026/1/6 01:45 Eduard Zingerman <eddyz87@gmail.com> write: > On Sun, 2026-01-04 at 21:16 +0800, Menglong Dong wrote: > > Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 > > to obtain better performance. The instruction we use here is: > > > > 65 48 8B 04 25 [offset] // mov rax, gs:[offset] > > > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> > > --- > > v2: > > - check the variable type in emit_ldx_percpu_r0 with __verify_pcpu_ptr > > - remove the usage of const_current_task > > --- > > arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++++++++++++++++ > > 1 file changed, 36 insertions(+) > > > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > > index e3b1c4b1d550..f5ff7c77aad7 100644 > > --- a/arch/x86/net/bpf_jit_comp.c > > +++ b/arch/x86/net/bpf_jit_comp.c > > @@ -1300,6 +1300,25 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) > > emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); > > } > > > > +static void __emit_ldx_percpu_r0(u8 **pprog, __force unsigned long ptr) > > +{ > > + u8 *prog = *pprog; > > + > > + /* mov rax, gs:[ptr] */ > > + EMIT2(0x65, 0x48); > > + EMIT2(0x8B, 0x04); > > + EMIT1(0x25); > > + EMIT((u32)ptr, 4); > > + > > + *pprog = prog; > > +} > > + > > +#define emit_ldx_percpu_r0(prog, variable) \ > > + do { \ > > + __verify_pcpu_ptr(&(variable)); \ > > + __emit_ldx_percpu_r0(&prog, (__force unsigned long)&(variable));\ > > + } while (0) > > + > > static int 
emit_atomic_rmw(u8 **pprog, u32 atomic_op, > > u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) > > { > > @@ -2441,6 +2460,12 @@ st: if (is_imm8(insn->off)) > > case BPF_JMP | BPF_CALL: { > > u8 *ip = image + addrs[i - 1]; > > > > + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task || > > + insn->imm == BPF_FUNC_get_current_task_btf)) { > > I think this should be guarded by IS_ENABLED(CONFIG_SMP). > The current.h:get_current() used > arch/x86/include/asm/percpu.h:this_cpu_read_stable() that is unrolled > to __raw_cpu_read_stable(), which uses __force_percpu_arg(), which uses > __force_percpu_prefix, which is defined differently depending on CONFIG_SMP. Yeah, I missed this part. I'll use BPF_MOV64_PERCPU_REG() in the next version, which should avoid this problem. Thanks! Menglong Dong > > > + emit_ldx_percpu_r0(prog, current_task); > > + break; > > + } > > + > > func = (u8 *) __bpf_call_base + imm32; > > if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) { > > LOAD_TAIL_CALL_CNT_PTR(stack_depth); > > @@ -4082,3 +4107,14 @@ bool bpf_jit_supports_timed_may_goto(void) > > { > > return true; > > } > > + > > +bool bpf_jit_inlines_helper_call(s32 imm) > > +{ > > + switch (imm) { > > + case BPF_FUNC_get_current_task: > > + case BPF_FUNC_get_current_task_btf: > > + return true; > > + default: > > + return false; > > + } > > +} > ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf, x86: inline bpf_get_current_task() for x86_64 2026-01-04 13:16 ` [PATCH bpf-next v2 1/2] " Menglong Dong 2026-01-05 17:45 ` Eduard Zingerman @ 2026-01-05 18:04 ` Alexei Starovoitov 2026-01-06 1:52 ` Menglong Dong 1 sibling, 1 reply; 7+ messages in thread From: Alexei Starovoitov @ 2026-01-05 18:04 UTC (permalink / raw) To: Menglong Dong Cc: Alexei Starovoitov, Eduard, David S. Miller, David Ahern, Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, X86 ML, H. Peter Anvin, Network Development, bpf, LKML On Sun, Jan 4, 2026 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote: > > Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 > to obtain better performance. The instruction we use here is: > > 65 48 8B 04 25 [offset] // mov rax, gs:[offset] > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> > --- > v2: > - check the variable type in emit_ldx_percpu_r0 with __verify_pcpu_ptr > - remove the usage of const_current_task > --- > arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++++++++++++++++ > 1 file changed, 36 insertions(+) > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > index e3b1c4b1d550..f5ff7c77aad7 100644 > --- a/arch/x86/net/bpf_jit_comp.c > +++ b/arch/x86/net/bpf_jit_comp.c > @@ -1300,6 +1300,25 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) > emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); > } > > +static void __emit_ldx_percpu_r0(u8 **pprog, __force unsigned long ptr) > +{ > + u8 *prog = *pprog; > + > + /* mov rax, gs:[ptr] */ > + EMIT2(0x65, 0x48); > + EMIT2(0x8B, 0x04); > + EMIT1(0x25); > + EMIT((u32)ptr, 4); > + > + *pprog = prog; > +} Why asm? Let's use BPF_MOV64_PERCPU_REG() similar to the way BPF_FUNC_get_smp_processor_id inlining is handled. 
pw-bot: cr ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf, x86: inline bpf_get_current_task() for x86_64 2026-01-05 18:04 ` Alexei Starovoitov @ 2026-01-06 1:52 ` Menglong Dong 0 siblings, 0 replies; 7+ messages in thread From: Menglong Dong @ 2026-01-06 1:52 UTC (permalink / raw) To: Menglong Dong, Alexei Starovoitov Cc: Alexei Starovoitov, Eduard, David S. Miller, David Ahern, Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, X86 ML, H. Peter Anvin, Network Development, bpf, LKML On 2026/1/6 02:04 Alexei Starovoitov <alexei.starovoitov@gmail.com> write: > On Sun, Jan 4, 2026 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote: > > > > Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 > > to obtain better performance. The instruction we use here is: > > > > 65 48 8B 04 25 [offset] // mov rax, gs:[offset] > > > > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> > > --- > > v2: > > - check the variable type in emit_ldx_percpu_r0 with __verify_pcpu_ptr > > - remove the usage of const_current_task > > --- > > arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++++++++++++++++ > > 1 file changed, 36 insertions(+) > > > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c > > index e3b1c4b1d550..f5ff7c77aad7 100644 > > --- a/arch/x86/net/bpf_jit_comp.c > > +++ b/arch/x86/net/bpf_jit_comp.c > > @@ -1300,6 +1300,25 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) > > emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); > > } > > > > +static void __emit_ldx_percpu_r0(u8 **pprog, __force unsigned long ptr) > > +{ > > + u8 *prog = *pprog; > > + > > + /* mov rax, gs:[ptr] */ > > + EMIT2(0x65, 0x48); > > + EMIT2(0x8B, 0x04); > > + EMIT1(0x25); > > + EMIT((u32)ptr, 4); > > + > > + *pprog = prog; > > +} > > Why asm? 
> Let's use BPF_MOV64_PERCPU_REG() similar to the way > BPF_FUNC_get_smp_processor_id inlining is handled. Ah, this is a good point. I didn't know about the existence of BPF_MOV64_PERCPU_REG :/ So we can inline it directly in the verifier instead of the arch. I'll use it in the next version. Thanks! Menglong Dong > > pw-bot: cr > ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH bpf-next v2 2/2] selftests/bpf: test the jited inline of bpf_get_current_task 2026-01-04 13:16 [PATCH bpf-next v2 0/2] bpf, x86: inline bpf_get_current_task() for x86_64 Menglong Dong 2026-01-04 13:16 ` [PATCH bpf-next v2 1/2] " Menglong Dong @ 2026-01-04 13:16 ` Menglong Dong 1 sibling, 0 replies; 7+ messages in thread From: Menglong Dong @ 2026-01-04 13:16 UTC (permalink / raw) To: ast, eddyz87 Cc: davem, dsahern, daniel, andrii, martin.lau, song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, tglx, mingo, bp, dave.hansen, x86, hpa, netdev, bpf, linux-kernel Add the testcase for the jited inline of bpf_get_current_task(). Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn> --- .../selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_jit_inline.c | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5829ffd70f8f..47eb78c808c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -110,6 +110,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "verifier_jit_inline.skel.h" #include "irq.skel.h" #define MAX_ENTRIES 11 @@ -251,6 +252,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } +void test_verifier_jit_inline(void) { RUN(verifier_jit_inline); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c new file mode 100644 index 000000000000..398a6405d00a --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +SEC("fentry/bpf_fentry_test1") +__description("Jit inline, bpf_get_current_task") +__success __retval(0) +__arch_x86_64 +__jited(" movq %gs:{{.*}}, %rax") +__arch_arm64 +__jited(" mrs x7, SP_EL0") +int inline_bpf_get_current_task(void) +{ + bpf_get_current_task(); + + return 0; +} + +#else + +SEC("kprobe") +__description("Jit inline is not supported, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; -- 2.52.0 ^ permalink raw reply related [flat|nested] 7+ messages in thread
end of thread, other threads:[~2026-01-06 1:57 UTC | newest] Thread overview: 7+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-01-04 13:16 [PATCH bpf-next v2 0/2] bpf, x86: inline bpf_get_current_task() for x86_64 Menglong Dong 2026-01-04 13:16 ` [PATCH bpf-next v2 1/2] " Menglong Dong 2026-01-05 17:45 ` Eduard Zingerman 2026-01-06 1:57 ` Menglong Dong 2026-01-05 18:04 ` Alexei Starovoitov 2026-01-06 1:52 ` Menglong Dong 2026-01-04 13:16 ` [PATCH bpf-next v2 2/2] selftests/bpf: test the jited inline of bpf_get_current_task Menglong Dong
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox