public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed
From: Yonghong Song <yonghong.song@linux.dev>
To: bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"Jose E . Marchesi" <jose.marchesi@oracle.com>,
	kernel-team@fb.com, Martin KaFai Lau <martin.lau@kernel.org>
Subject: [PATCH bpf-next v4 15/18] bpf,x86: Implement JIT support for stack arguments
Date: Sat, 11 Apr 2026 22:00:26 -0700	[thread overview]
Message-ID: <20260412050033.267815-1-yonghong.song@linux.dev> (raw)
In-Reply-To: <20260412045826.254200-1-yonghong.song@linux.dev>

Add x86_64 JIT support for BPF functions and kfuncs with more than
5 arguments. The extra arguments are passed through a stack area
addressed by register r12 (BPF_REG_STACK_ARG_BASE) in BPF bytecode,
which the JIT translates to native code.

The JIT follows the x86-64 calling convention for both BPF-to-BPF
and kfunc calls:
  - Arg 6 is passed in the R9 register
  - Args 7+ are passed on the stack

Incoming arg 6 (BPF r12+8) is translated to a MOV from R9 rather
than a memory load. Incoming args 7+ (BPF r12+16, r12+24, ...) map
directly to [rbp + 16], [rbp + 24], ..., matching the x86-64 stack
layout after CALL + PUSH RBP, so no offset adjustment is needed.

The verifier guarantees that neither tail_call_reachable nor
priv_stack is set when outgoing stack args exist, so R9 is always
available. When BPF bytecode writes to the arg-6 stack slot
(the most negative outgoing offset), the JIT emits a MOV into R9
instead of a memory store. Outgoing args 7+ are placed at [rsp]
in a pre-allocated area below callee-saved registers, using:
  native_off = outgoing_arg_base + bpf_off

The native x86_64 stack layout:

  high address
  +-------------------------+
  | incoming stack arg N    |  [rbp + 16 + (N-2)*8]  (from caller)
  | ...                     |
  | incoming stack arg 7    |  [rbp + 16]
  +-------------------------+
  | return address          |  [rbp + 8]
  | saved rbp               |  [rbp]
  +-------------------------+
  | BPF program stack       |  (round_up(stack_depth, 8) bytes)
  +-------------------------+
  | callee-saved regs       |  (r12, rbx, r13, r14, r15 as needed)
  +-------------------------+
  | outgoing arg M          |  [rsp + (M-7)*8]
  | ...                     |
  | outgoing arg 7          |  [rsp]
  +-------------------------+  rsp
  low address

  (Arg 6 is in R9, not on the stack)

  [1] https://github.com/llvm/llvm-project/pull/189060

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 172 ++++++++++++++++++++++++++++++++++--
 1 file changed, 164 insertions(+), 8 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 32864dbc2c4e..ec57b9a6b417 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -390,6 +390,34 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+/* add rsp, depth */
+static void emit_add_rsp(u8 **pprog, u16 depth)
+{
+	u8 *prog = *pprog;
+
+	if (!depth)
+		return;
+	if (is_imm8(depth))
+		EMIT4(0x48, 0x83, 0xC4, depth); /* add rsp, imm8 */
+	else
+		EMIT3_off32(0x48, 0x81, 0xC4, depth); /* add rsp, imm32 */
+	*pprog = prog;
+}
+
+/* sub rsp, depth */
+static void emit_sub_rsp(u8 **pprog, u16 depth)
+{
+	u8 *prog = *pprog;
+
+	if (!depth)
+		return;
+	if (is_imm8(depth))
+		EMIT4(0x48, 0x83, 0xEC, depth); /* sub rsp, imm8 */
+	else
+		EMIT3_off32(0x48, 0x81, 0xEC, depth); /* sub rsp, imm32 */
+	*pprog = prog;
+}
+
 static void emit_nops(u8 **pprog, int len)
 {
 	u8 *prog = *pprog;
@@ -725,8 +753,8 @@ static void emit_return(u8 **pprog, u8 *ip)
  */
 static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 					u8 **pprog, bool *callee_regs_used,
-					u32 stack_depth, u8 *ip,
-					struct jit_context *ctx)
+					u32 stack_depth, u16 outgoing_depth,
+					u8 *ip, struct jit_context *ctx)
 {
 	int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
 	u8 *prog = *pprog, *start = *pprog;
@@ -775,6 +803,9 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 	/* Inc tail_call_cnt if the slot is populated. */
 	EMIT4(0x48, 0x83, 0x00, 0x01);            /* add qword ptr [rax], 1 */
 
+	/* Deallocate outgoing stack arg area. */
+	emit_add_rsp(&prog, outgoing_depth);
+
 	if (bpf_prog->aux->exception_boundary) {
 		pop_callee_regs(&prog, all_callee_regs_used);
 		pop_r12(&prog);
@@ -815,6 +846,7 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
 				      struct bpf_jit_poke_descriptor *poke,
 				      u8 **pprog, u8 *ip,
 				      bool *callee_regs_used, u32 stack_depth,
+				      u16 outgoing_depth,
 				      struct jit_context *ctx)
 {
 	int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
@@ -842,6 +874,9 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
 	/* Inc tail_call_cnt if the slot is populated. */
 	EMIT4(0x48, 0x83, 0x00, 0x01);                /* add qword ptr [rax], 1 */
 
+	/* Deallocate outgoing stack arg area. */
+	emit_add_rsp(&prog, outgoing_depth);
+
 	if (bpf_prog->aux->exception_boundary) {
 		pop_callee_regs(&prog, all_callee_regs_used);
 		pop_r12(&prog);
@@ -1664,16 +1699,48 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
+	u16 stack_arg_depth, incoming_stack_arg_depth, outgoing_stack_arg_depth;
+	u16 outgoing_rsp;
 	u32 stack_depth;
+	int callee_saved_size;
+	s32 outgoing_arg_base;
+	bool has_stack_args;
 	int err;
 
 	stack_depth = bpf_prog->aux->stack_depth;
+	stack_arg_depth = bpf_prog->aux->stack_arg_depth;
+	incoming_stack_arg_depth = bpf_prog->aux->incoming_stack_arg_depth;
+	outgoing_stack_arg_depth = stack_arg_depth - incoming_stack_arg_depth;
 	priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
 	if (priv_stack_ptr) {
 		priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
 		stack_depth = 0;
 	}
 
+	/*
+	 * Follow x86-64 calling convention for both BPF-to-BPF and
+	 * kfunc calls:
+	 *   - Arg 6 is passed in R9 register
+	 *   - Args 7+ are passed on the stack at [rsp]
+	 *
+	 * Incoming arg 6 is read from R9 (BPF r12+8 → MOV from R9).
+	 * Incoming args 7+ are read from [rbp + 16], [rbp + 24], ...
+	 * (BPF r12+16, r12+24, ... map directly with no offset change).
+	 *
+	 * The verifier guarantees that neither tail_call_reachable nor
+	 * priv_stack is set when outgoing stack args exist, so R9 is
+	 * always available.
+	 *
+	 * Stack layout (high to low):
+	 *   [rbp + 16 + ...]    incoming stack args 7+ (from caller)
+	 *   [rbp + 8]           return address
+	 *   [rbp]               saved rbp
+	 *   [rbp - prog_stack]  program stack
+	 *   [below]             callee-saved regs
+	 *   [below]             outgoing args 7+ (= rsp)
+	 */
+	has_stack_args = stack_arg_depth > 0;
+
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
 
@@ -1700,6 +1767,41 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			push_r12(&prog);
 		push_callee_regs(&prog, callee_regs_used);
 	}
+
+	/* Compute callee-saved register area size. */
+	callee_saved_size = 0;
+	if (bpf_prog->aux->exception_boundary || arena_vm_start)
+		callee_saved_size += 8; /* r12 */
+	if (bpf_prog->aux->exception_boundary) {
+		callee_saved_size += 4 * 8; /* rbx, r13, r14, r15 */
+	} else {
+		int j;
+
+		for (j = 0; j < 4; j++)
+			if (callee_regs_used[j])
+				callee_saved_size += 8;
+	}
+	/*
+	 * Base offset from rbp for translating BPF outgoing args 7+
+	 * to native offsets:
+	 *   native_off = outgoing_arg_base + bpf_off
+	 *
+	 * BPF outgoing offsets are negative (r12 - N*8 for arg6,
+	 * ..., r12 - 8 for last arg). Arg 6 goes to R9 directly,
+	 * so only args 7+ occupy the outgoing stack area.
+	 *
+	 * Note that tail_call_reachable is guaranteed to be false when
+	 * stack args exist, so tcc pushes need not be accounted for.
+	 */
+	outgoing_arg_base = -(round_up(stack_depth, 8) + callee_saved_size);
+
+	/*
+	 * Allocate outgoing stack arg area for args 7+ only.
+	 * Arg 6 goes into r9 register, not on stack.
+	 */
+	outgoing_rsp = outgoing_stack_arg_depth > 8 ?  outgoing_stack_arg_depth - 8 : 0;
+	emit_sub_rsp(&prog, outgoing_rsp);
+
 	if (arena_vm_start)
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
@@ -1715,13 +1817,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	prog = temp;
 
 	for (i = 1; i <= insn_cnt; i++, insn++) {
+		bool adjust_stack_arg_off = false;
 		const s32 imm32 = insn->imm;
 		u32 dst_reg = insn->dst_reg;
 		u32 src_reg = insn->src_reg;
 		u8 b2 = 0, b3 = 0;
 		u8 *start_of_ldx;
 		s64 jmp_offset;
-		s16 insn_off;
+		s32 insn_off;
 		u8 jmp_cond;
 		u8 *func;
 		int nops;
@@ -1734,6 +1837,21 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 				dst_reg = X86_REG_R9;
 		}
 
+		if (has_stack_args) {
+			u8 class = BPF_CLASS(insn->code);
+
+			if (class == BPF_LDX &&
+			    src_reg == BPF_REG_STACK_ARG_BASE) {
+				src_reg = BPF_REG_FP;
+				adjust_stack_arg_off = true;
+			}
+			if ((class == BPF_STX || class == BPF_ST) &&
+			    dst_reg == BPF_REG_STACK_ARG_BASE) {
+				dst_reg = BPF_REG_FP;
+				adjust_stack_arg_off = true;
+			}
+		}
+
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -2129,12 +2247,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 				EMIT1(0xC7);
 			goto st;
 		case BPF_ST | BPF_MEM | BPF_DW:
+			if (adjust_stack_arg_off && insn->off == -outgoing_stack_arg_depth) {
+				/* Arg 6: store immediate in r9 register */
+				emit_mov_imm64(&prog, X86_REG_R9, imm32 >> 31, (u32)imm32);
+				break;
+			}
 			EMIT2(add_1mod(0x48, dst_reg), 0xC7);
 
-st:			if (is_imm8(insn->off))
-				EMIT2(add_1reg(0x40, dst_reg), insn->off);
+st:			insn_off = insn->off;
+			if (adjust_stack_arg_off)
+				insn_off = outgoing_arg_base + insn_off;
+			if (is_imm8(insn_off))
+				EMIT2(add_1reg(0x40, dst_reg), insn_off);
 			else
-				EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
+				EMIT1_off32(add_1reg(0x80, dst_reg), insn_off);
 
 			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
 			break;
@@ -2144,7 +2270,15 @@ st:			if (is_imm8(insn->off))
 		case BPF_STX | BPF_MEM | BPF_H:
 		case BPF_STX | BPF_MEM | BPF_W:
 		case BPF_STX | BPF_MEM | BPF_DW:
-			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+			if (adjust_stack_arg_off && insn->off == -outgoing_stack_arg_depth) {
+				/* Arg 6: store register value in r9 */
+				EMIT_mov(X86_REG_R9, src_reg);
+				break;
+			}
+			insn_off = insn->off;
+			if (adjust_stack_arg_off)
+				insn_off = outgoing_arg_base + insn_off;
+			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
 			break;
 
 		case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
@@ -2243,6 +2377,18 @@ st:			if (is_imm8(insn->off))
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
 			insn_off = insn->off;
+			if (adjust_stack_arg_off) {
+				if (insn_off == 8) {
+					/* Incoming arg 6: read from r9 */
+					EMIT_mov(dst_reg, X86_REG_R9);
+					break;
+				}
+				/*
+				 * Incoming args 7+: native_off == bpf_off
+				 * (r12+16 → [rbp+16], r12+24 → [rbp+24], ...)
+				 * No offset adjustment needed.
+				 */
+			}
 
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
@@ -2468,12 +2614,14 @@ st:			if (is_imm8(insn->off))
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
 							  stack_depth,
+							  outgoing_rsp,
 							  ctx);
 			else
 				emit_bpf_tail_call_indirect(bpf_prog,
 							    &prog,
 							    callee_regs_used,
 							    stack_depth,
+							    outgoing_rsp,
 							    image + addrs[i - 1],
 							    ctx);
 			break;
@@ -2734,6 +2882,8 @@ st:			if (is_imm8(insn->off))
 				if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
 					return -EINVAL;
 			}
+			/* Deallocate outgoing args 7+ area. */
+			emit_add_rsp(&prog, outgoing_rsp);
 			if (bpf_prog->aux->exception_boundary) {
 				pop_callee_regs(&prog, all_callee_regs_used);
 				pop_r12(&prog);
@@ -3757,7 +3907,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		prog->aux->jit_data = jit_data;
 	}
 	priv_stack_ptr = prog->aux->priv_stack_ptr;
-	if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+	/*
+	 * x86-64 uses R9 for both private stack frame pointer and
+	 * outgoing arg 6, so disable private stack when outgoing
+	 * stack args are present.
+	 */
+	if (!priv_stack_ptr && prog->aux->jits_use_priv_stack &&
+	    prog->aux->stack_arg_depth == prog->aux->incoming_stack_arg_depth) {
 		/* Allocate actual private stack size with verifier-calculated
 		 * stack size plus two memory guards to protect overflow and
 		 * underflow.
-- 
2.52.0


  parent reply	other threads:[~2026-04-12  5:00 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-12  4:58 [PATCH bpf-next v4 00/18] bpf: Support stack arguments for BPF functions and kfuncs Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 01/18] bpf: Remove unused parameter from check_map_kptr_access() Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 02/18] bpf: Change from "arg #%d" to "arg#%d" in verifier log Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 03/18] bpf: Refactor to avoid redundant calculation of bpf_reg_state Yonghong Song
2026-04-12  5:31   ` bot+bpf-ci
2026-04-12  4:58 ` [PATCH bpf-next v4 04/18] bpf: Refactor to handle memory and size together Yonghong Song
2026-04-12  5:31   ` bot+bpf-ci
2026-04-12  4:58 ` [PATCH bpf-next v4 05/18] bpf: Change some regno type from u32 to int type Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 06/18] bpf: Use argument index instead of register index in kfunc verifier logs Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-12  4:59 ` [PATCH bpf-next v4 07/18] bpf: Introduce bpf register BPF_REG_STACK_ARG_BASE Yonghong Song
2026-04-12  4:59 ` [PATCH bpf-next v4 08/18] bpf: Reuse MAX_BPF_FUNC_ARGS for maximum number of arguments Yonghong Song
2026-04-12  4:59 ` [PATCH bpf-next v4 09/18] bpf: Support stack arguments for bpf functions Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-12  5:00 ` [PATCH bpf-next v4 10/18] bpf: Fix interaction between stack argument PTR_TO_STACK and dead slot poisoning Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-12  5:00 ` [PATCH bpf-next v4 11/18] bpf: Reject stack arguments in non-JITed programs Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 12/18] bpf: Reject stack arguments if tail call reachable Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-12  5:00 ` [PATCH bpf-next v4 13/18] bpf: Support stack arguments for kfunc calls Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-12  5:00 ` [PATCH bpf-next v4 14/18] bpf: Enable stack argument support for x86_64 Yonghong Song
2026-04-12  5:00 ` Yonghong Song [this message]
2026-04-12  5:43   ` [PATCH bpf-next v4 15/18] bpf,x86: Implement JIT support for stack arguments bot+bpf-ci
2026-04-12  5:00 ` [PATCH bpf-next v4 16/18] selftests/bpf: Add tests for BPF function " Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 17/18] selftests/bpf: Add negative test for greater-than-8-byte kfunc stack argument Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 18/18] selftests/bpf: Add verifier tests for stack argument validation Yonghong Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260412050033.267815-1-yonghong.song@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=jose.marchesi@oracle.com \
    --cc=kernel-team@fb.com \
    --cc=martin.lau@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox