From: Leon Hwang <leon.hwang@linux.dev>
To: bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Eduard Zingerman <eddyz87@gmail.com>, Song Liu <song@kernel.org>,
	Yonghong Song <yonghong.song@linux.dev>,
	John Fastabend <john.fastabend@gmail.com>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@fomichev.me>, Hao Luo <haoluo@google.com>,
	Jiri Olsa <jolsa@kernel.org>,
	Puranjay Mohan <puranjay@kernel.org>,
	Xu Kuohai <xukuohai@huaweicloud.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>,
	"David S . Miller" <davem@davemloft.net>,
	David Ahern <dsahern@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	x86@kernel.org, "H . Peter Anvin" <hpa@zytor.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	kernel-patches-bot@fb.com, Leon Hwang <leon.hwang@linux.dev>
Subject: [PATCH bpf-next 2/4] bpf, x64: tailcall: Eliminate max_entries and bpf_func access at runtime
Date: Fri,  2 Jan 2026 23:00:30 +0800
Message-ID: <20260102150032.53106-3-leon.hwang@linux.dev>
In-Reply-To: <20260102150032.53106-1-leon.hwang@linux.dev>

Optimize BPF tail calls on x86_64 by eliminating runtime memory accesses
for max_entries and prog->bpf_func when the prog array map is known at
verification time.

The verifier now encodes three fields in the tail call instruction's imm
(see the packing sketch after the list):
  - bits 0-7:   map index in used_maps[] (max 63)
  - bits 8-15:  dynamic array flag (1 if map pointer is poisoned)
  - bits 16-31: poke table index + 1 for direct tail calls (max 1023)
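
As an illustration only, packing and decoding this layout could look
roughly like the helpers below; the helper names are hypothetical and are
not part of this patch:

  /* Hypothetical helpers mirroring the imm layout above (illustration only). */
  static inline s32 tail_call_imm_pack(u32 map_index, bool poisoned, u32 poke_idx_plus_one)
  {
  	return (map_index & 0xff) | ((u32)poisoned << 8) | (poke_idx_plus_one << 16);
  }

  static inline void tail_call_imm_unpack(s32 imm, u32 *map_index, bool *poisoned, s32 *poke)
  {
  	*map_index = imm & 0xff;	/* bits 0-7 */
  	*poisoned = (imm >> 8) & 0xff;	/* bits 8-15 */
  	*poke = imm >> 16;		/* bits 16-31, 0 means no direct-call poke entry */
  }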

For static tail calls (map known at verification time):
  - max_entries is embedded as an immediate in the comparison instruction
  - The cached target from array->ptrs[max_entries + index] is used
    directly, avoiding the prog->bpf_func dereference

For dynamic tail calls (map pointer poisoned):
  - Fall back to runtime lookup of max_entries and prog->bpf_func
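
In rough C terms, and assuming the cached-target layout described above,
the two fast paths behave like the following sketch (illustrative code,
not literal JIT output):

  static void *tail_call_target(struct bpf_array *array, u32 index, u32 *tcc_ptr,
  				bool dyn_array, u32 static_max_entries)
  {
  	/* Static case: max_entries comes from the imm32 embedded in the cmp. */
  	u32 max_entries = dyn_array ? array->map.max_entries : static_max_entries;
  	void *val;

  	if (index >= max_entries)
  		return NULL;					/* out */
  	if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
  		return NULL;					/* out */

  	val = dyn_array ? array->ptrs[index]			/* prog pointer */
  			: array->ptrs[max_entries + index];	/* cached jump target */
  	if (!val)
  		return NULL;					/* out */

  	if (dyn_array)						/* runtime bpf_func lookup */
  		val = (void *)((unsigned long)((struct bpf_prog *)val)->bpf_func +
  			       X86_TAIL_CALL_OFFSET);

  	return val;						/* jump destination */
  }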

This reduces cache misses and improves tail call performance for the
common case where the prog array is statically known.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++----------
 kernel/bpf/verifier.c       | 30 ++++++++++++++++++++--
 2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e3b1c4b1d550..9fd707612da5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -733,11 +733,13 @@ static void emit_return(u8 **pprog, u8 *ip)
  * out:
  */
 static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+					u32 map_index, bool dyn_array,
 					u8 **pprog, bool *callee_regs_used,
 					u32 stack_depth, u8 *ip,
 					struct jit_context *ctx)
 {
 	int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
+	struct bpf_map *map = bpf_prog->aux->used_maps[map_index];
 	u8 *prog = *pprog, *start = *pprog;
 	int offset;
 
@@ -752,11 +754,14 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 	 *	goto out;
 	 */
 	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
-	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
-	      offsetof(struct bpf_array, map.max_entries));
+	if (dyn_array)
+		EMIT3(0x3B, 0x56,                 /* cmp edx, dword ptr [rsi + 16] */
+		      offsetof(struct bpf_array, map.max_entries));
+	else
+		EMIT2_off32(0x81, 0xFA, map->max_entries); /* cmp edx, imm32 (map->max_entries) */
 
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
-	EMIT2(X86_JBE, offset);                   /* jbe out */
+	EMIT2(X86_JAE, offset);                   /* jae out */
 
 	/*
 	 * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
@@ -768,9 +773,15 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
 	EMIT2(X86_JAE, offset);                   /* jae out */
 
-	/* prog = array->ptrs[index]; */
-	EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6,       /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
-		    offsetof(struct bpf_array, ptrs));
+	/*
+	 * if (dyn_array)
+	 *	prog = array->ptrs[index];
+	 * else
+	 *	tgt = array->ptrs[max_entries + index];
+	 */
+	offset = offsetof(struct bpf_array, ptrs);
+	offset += dyn_array ? 0 : map->max_entries * sizeof(void *);
+	EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, offset); /* mov rcx, [rsi + rdx * 8 + offset] */
 
 	/*
 	 * if (prog == NULL)
@@ -803,11 +814,14 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 		EMIT3_off32(0x48, 0x81, 0xC4,     /* add rsp, sd */
 			    round_up(stack_depth, 8));
 
-	/* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */
-	EMIT4(0x48, 0x8B, 0x49,                   /* mov rcx, qword ptr [rcx + 32] */
-	      offsetof(struct bpf_prog, bpf_func));
-	EMIT4(0x48, 0x83, 0xC1,                   /* add rcx, X86_TAIL_CALL_OFFSET */
-	      X86_TAIL_CALL_OFFSET);
+	if (dyn_array) {
+		/* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */
+		EMIT4(0x48, 0x8B, 0x49,           /* mov rcx, qword ptr [rcx + 32] */
+		      offsetof(struct bpf_prog, bpf_func));
+		EMIT4(0x48, 0x83, 0xC1,           /* add rcx, X86_TAIL_CALL_OFFSET */
+		      X86_TAIL_CALL_OFFSET);
+	}
+
 	/*
 	 * Now we're ready to jump into next BPF program
 	 * rdi == ctx (1st arg)
@@ -2461,15 +2475,21 @@ st:			if (is_imm8(insn->off))
 		}
 
 		case BPF_JMP | BPF_TAIL_CALL:
-			if (imm32)
+			bool dynamic_array = (imm32 >> 8) & 0xFF;
+			u32 map_index = imm32 & 0xFF;
+			s32 imm16 = imm32 >> 16;
+
+			if (imm16)
 				emit_bpf_tail_call_direct(bpf_prog,
-							  &bpf_prog->aux->poke_tab[imm32 - 1],
+							  &bpf_prog->aux->poke_tab[imm16 - 1],
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
 							  stack_depth,
 							  ctx);
 			else
 				emit_bpf_tail_call_indirect(bpf_prog,
+							    map_index,
+							    dynamic_array,
 							    &prog,
 							    callee_regs_used,
 							    stack_depth,
@@ -4047,6 +4067,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 	}
 }
 
+int bpf_arch_tail_call_prologue_offset(void)
+{
+	return X86_TAIL_CALL_OFFSET;
+}
+
 bool bpf_jit_supports_arena(void)
 {
 	return true;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3d44c5d06623..ab9c84e76a62 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22602,6 +22602,18 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat
 	return 0;
 }
 
+static int tail_call_find_map_index(struct bpf_verifier_env *env, struct bpf_map *map)
+{
+	int i;
+
+	for (i = 0; i < env->used_map_cnt; i++) {
+		if (env->used_maps[i] == map)
+			return i;
+	}
+
+	return -ENOENT;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
@@ -22993,10 +23005,24 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			 * call and to prevent accidental JITing by JIT compiler
 			 * that doesn't support bpf_tail_call yet
 			 */
-			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
+			/*
+			 * insn->imm contains 3 fields:
+			 *   map index (8 bits):   6 bits are enough, 63 max
+			 *   poisoned (8 bits):    1 bit is enough
+			 *   poke index (16 bits): 1023 max
+			 */
+
 			aux = &env->insn_aux_data[i + delta];
+			insn->imm = tail_call_find_map_index(env, aux->map_ptr_state.map_ptr);
+			if (insn->imm < 0) {
+				verifier_bug(env, "index not found for prog array map\n");
+				return -EINVAL;
+			}
+
+			insn->imm |= bpf_map_ptr_poisoned(aux) << 8;
+
 			if (env->bpf_capable && !prog->blinding_requested &&
 			    prog->jit_requested &&
 			    !bpf_map_key_poisoned(aux) &&
@@ -23015,7 +23041,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 					return ret;
 				}
 
-				insn->imm = ret + 1;
+				insn->imm |= (ret + 1) << 16;
 				goto next_insn;
 			}
 
-- 
2.52.0

