* [PATCH bpf-next v11 1/7] bpf: Find eligible subprogs for private stack support
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 2/7] bpf: Enable private stack for eligible subprogs Yonghong Song
` (5 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Private stack will be allocated with percpu allocator in jit time.
To avoid complexity at runtime, only one copy of private stack is
available per cpu per prog. So runtime recursion check is necessary
to avoid stack corruption.
Current private stack only supports kprobe/perf_event/tp/raw_tp
which has recursion check in the kernel, and prog types that use
bpf trampoline recursion check. For trampoline related prog types,
currently only tracing progs have recursion checking.
To avoid complexity, all async_cb subprogs use normal kernel stack
including those subprogs used by both main prog subtree and async_cb
subtree. Any prog having tail call also uses kernel stack.
To avoid jit penalty with private stack support, a subprog stack
size threshold is set such that only if the stack size is no less
than the threshold, private stack is supported. The current threshold
is 64 bytes. This avoids jit penalty if the stack usage is small.
A useless 'continue' is also removed from a loop in func
check_max_stack_depth().
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
include/linux/bpf_verifier.h | 7 +++
include/linux/filter.h | 1 +
kernel/bpf/core.c | 5 ++
kernel/bpf/verifier.c | 96 ++++++++++++++++++++++++++++++++----
4 files changed, 99 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4513372c5bc8..456fd2265345 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -649,6 +649,12 @@ struct bpf_subprog_arg_info {
};
};
+enum priv_stack_mode {
+ PRIV_STACK_UNKNOWN,
+ NO_PRIV_STACK,
+ PRIV_STACK_ADAPTIVE,
+};
+
struct bpf_subprog_info {
/* 'start' has to be the first field otherwise find_subprog() won't work */
u32 start; /* insn idx of function entry point */
@@ -669,6 +675,7 @@ struct bpf_subprog_info {
/* true if bpf_fastcall stack region is used by functions that can't be inlined */
bool keep_fastcall_stack: 1;
+ enum priv_stack_mode priv_stack_mode;
u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..3a21947f2fd4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
u64 bpf_arch_uaddress_limit(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..14d9288441f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 132fc172961f..ebaf44329b83 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6034,6 +6036,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
if (env->prog->jit_requested)
@@ -6051,17 +6081,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -6088,11 +6121,31 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
depth);
return -EACCES;
}
- depth += round_up_stack_depth(env, subprog[idx].stack_depth);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
@@ -6149,6 +6202,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
}
i = next_insn;
idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -6182,7 +6237,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
*/
if (frame == 0)
return 0;
- depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -6191,16 +6247,36 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
int ret;
for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i);
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
if (ret < 0)
return ret;
}
- continue;
}
return 0;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH bpf-next v11 2/7] bpf: Enable private stack for eligible subprogs
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 1/7] bpf: Find eligible subprogs for private stack support Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth Yonghong Song
` (4 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
If private stack is used by any subprog, set that subprog
prog->aux->jits_use_priv_stack to be true so later jit can allocate
private stack for that subprog properly.
Also set env->prog->aux->jits_use_priv_stack to be true if
any subprog uses private stack. This is a use case for a
single main prog (no subprogs) to use private stack, and
also a use case for later struct-ops progs where
env->prog->aux->jits_use_priv_stack will enable recursion
check if any subprog uses private stack.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 11 +++++++++++
2 files changed, 12 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..15f20d508174 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1523,6 +1523,7 @@ struct bpf_prog_aux {
bool exception_cb;
bool exception_boundary;
bool is_extended; /* true if extended by freplace program */
+ bool jits_use_priv_stack;
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
struct bpf_arena *arena;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ebaf44329b83..8a7576233814 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6278,6 +6278,14 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
return ret;
}
}
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
+ }
+
return 0;
}
@@ -20221,6 +20229,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
func[i]->jit_requested = 1;
func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH bpf-next v11 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 1/7] bpf: Find eligible subprogs for private stack support Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 2/7] bpf: Enable private stack for eligible subprogs Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit Yonghong Song
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Refactor the code to avoid repeated usage of bpf_prog->aux->stack_depth
in do_jit() func. If the private stack is used, the stack_depth will be
0 for that prog. Refactoring makes it easy to adjust stack_depth.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
arch/x86/net/bpf_jit_comp.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..3ff638c37999 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1425,14 +1425,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
+ u32 stack_depth;
int err;
+ stack_depth = bpf_prog->aux->stack_depth;
+
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
detect_reg_usage(insn, insn_cnt, callee_regs_used);
- emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ emit_prologue(&prog, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
/* Exception callback will clobber callee regs for its own use, and
@@ -2128,7 +2131,7 @@ st: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
- LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
+ LOAD_TAIL_CALL_CNT_PTR(stack_depth);
ip += 7;
}
if (!imm32)
@@ -2145,13 +2148,13 @@ st: if (is_imm8(insn->off))
&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
ctx);
else
emit_bpf_tail_call_indirect(bpf_prog,
&prog,
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
image + addrs[i - 1],
ctx);
break;
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (2 preceding siblings ...)
2024-11-09 2:53 ` [PATCH bpf-next v11 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 20:14 ` Alexei Starovoitov
2024-11-09 2:53 ` [PATCH bpf-next v11 5/7] selftests/bpf: Add tracing prog private stack tests Yonghong Song
` (2 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Private stack is allocated in function bpf_int_jit_compile() with
alignment 16. The x86 register 9 (X86_REG_R9) is used to replace
bpf frame register (BPF_REG_10). The private stack is used per
subprog per cpu. The X86_REG_R9 is saved and restored around every
func call (not including tailcall) to maintain correctness of
X86_REG_R9.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
arch/x86/net/bpf_jit_comp.c | 77 +++++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 1 +
2 files changed, 78 insertions(+)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3ff638c37999..55556a64f776 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@ struct jit_context {
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
+static void push_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x51); /* push r9 */
+ *pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x59); /* pop r9 */
+ *pprog = prog;
+}
+
static void push_r12(u8 **pprog)
{
u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
*pprog = prog;
}
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+ u8 *prog = *pprog;
+
+ /* movabs r9, priv_frame_ptr */
+ emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+ (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+ /* add <r9>, gs:[<off>] */
+ EMIT2(0x65, 0x4c);
+ EMIT3(0x03, 0x0c, 0x25);
+ EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+ *pprog = prog;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
#define __LOAD_TCC_PTR(off) \
@@ -1421,6 +1455,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int insn_cnt = bpf_prog->len;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ void __percpu *priv_frame_ptr = NULL;
u64 arena_vm_start, user_vm_start;
int i, excnt = 0;
int ilen, proglen = 0;
@@ -1429,6 +1464,10 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int err;
stack_depth = bpf_prog->aux->stack_depth;
+ if (bpf_prog->aux->priv_stack_ptr) {
+ priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
+ stack_depth = 0;
+ }
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
@@ -1457,6 +1496,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
+ if (priv_frame_ptr)
+ emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
ilen = prog - temp;
if (rw_image)
memcpy(rw_image + proglen, temp, ilen);
@@ -1476,6 +1518,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 *func;
int nops;
+ if (priv_frame_ptr) {
+ if (src_reg == BPF_REG_FP)
+ src_reg = X86_REG_R9;
+
+ if (dst_reg == BPF_REG_FP)
+ dst_reg = X86_REG_R9;
+ }
+
switch (insn->code) {
/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
@@ -2136,9 +2186,15 @@ st: if (is_imm8(insn->off))
}
if (!imm32)
return -EINVAL;
+ if (priv_frame_ptr) {
+ push_r9(&prog);
+ ip += 2;
+ }
ip += x86_call_depth_emit_accounting(&prog, func, ip);
if (emit_call(&prog, func, ip))
return -EINVAL;
+ if (priv_frame_ptr)
+ pop_r9(&prog);
break;
}
@@ -3323,6 +3379,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
+ void __percpu *priv_stack_ptr = NULL;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
@@ -3359,6 +3416,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+ priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
+ if (!priv_stack_ptr) {
+ prog = orig_prog;
+ goto out_priv_stack;
+ }
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
@@ -3494,6 +3560,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kvfree(addrs);
+ if (!image && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -3547,6 +3618,7 @@ void bpf_jit_free(struct bpf_prog *prog)
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ free_percpu(prog->aux->priv_stack_ptr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
@@ -3562,6 +3634,11 @@ bool bpf_jit_supports_exceptions(void)
return IS_ENABLED(CONFIG_UNWINDER_ORC);
}
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
#if defined(CONFIG_UNWINDER_ORC)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 15f20d508174..9cfb8f55d691 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1507,6 +1507,7 @@ struct bpf_prog_aux {
u32 max_rdwr_access;
struct btf *attach_btf;
const struct bpf_ctx_arg_aux *ctx_arg_info;
+ void __percpu *priv_stack_ptr;
struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
struct bpf_prog *dst_prog;
struct bpf_trampoline *dst_trampoline;
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-09 2:53 ` [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit Yonghong Song
@ 2024-11-09 20:14 ` Alexei Starovoitov
2024-11-10 2:34 ` Yonghong Song
2024-11-11 23:18 ` Yonghong Song
0 siblings, 2 replies; 13+ messages in thread
From: Alexei Starovoitov @ 2024-11-09 20:14 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> stack_depth = bpf_prog->aux->stack_depth;
> + if (bpf_prog->aux->priv_stack_ptr) {
> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> + stack_depth = 0;
> + }
...
> + priv_stack_ptr = prog->aux->priv_stack_ptr;
> + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> + priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
After applying I started to see crashes running test_progs -j like:
[ 173.465191] Oops: general protection fault, probably for
non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
[ 173.466053] KASAN: probably user-memory-access in range
[0x00000000000057c8-0x00000000000057cf]
[ 173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
[ 173.466053] Call Trace:
[ 173.466053] <IRQ>
[ 173.466053] ? die_addr+0x40/0xa0
[ 173.466053] ? exc_general_protection+0x138/0x1f0
[ 173.466053] ? asm_exc_general_protection+0x26/0x30
[ 173.466053] ? dst_dev_put+0x1e/0x220
[ 173.466053] rt_fibinfo_free_cpus.part.0+0x8c/0x130
[ 173.466053] fib_nh_common_release+0xd6/0x2a0
[ 173.466053] free_fib_info_rcu+0x129/0x360
[ 173.466053] ? rcu_core+0xa55/0x1340
[ 173.466053] rcu_core+0xa55/0x1340
[ 173.466053] ? rcutree_report_cpu_dead+0x380/0x380
[ 173.466053] ? hrtimer_interrupt+0x319/0x7c0
[ 173.466053] handle_softirqs+0x14c/0x4d0
[ 35.134115] Oops: general protection fault, probably for
non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
[ 35.135089] KASAN: probably user-memory-access in range
[0x00007fff880fdde0-0x00007fff880fdde7]
[ 35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
[ 35.135089] Call Trace:
[ 35.135089] <TASK>
[ 35.135089] ? die_addr+0x40/0xa0
[ 35.135089] ? exc_general_protection+0x138/0x1f0
[ 35.135089] ? asm_exc_general_protection+0x26/0x30
[ 35.135089] ? destroy_workqueue+0x4b4/0xa70
[ 35.135089] ? destroy_workqueue+0x592/0xa70
[ 35.135089] ? __mutex_unlock_slowpath.isra.0+0x270/0x270
[ 35.135089] ext4_put_super+0xff/0xd70
[ 35.135089] generic_shutdown_super+0x148/0x4c0
[ 35.135089] kill_block_super+0x3b/0x90
[ 35.135089] ext4_kill_sb+0x65/0x90
I think I see the bug... quoted it above...
Please make sure you reproduce it first.
Then let's figure out a way how to test for such things and
what we can do to make kasan detect it sooner,
since above crashes have no indication at all that bpf priv stack
is responsible.
If there is another bug in priv stack and it will cause future
crashes we need to make sure that priv stack corruption is
detected by kasan (or whatever mechanism) earlier.
We cannot land private stack support when there is
a possibility of such silent corruption.
pw-bot: cr
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-09 20:14 ` Alexei Starovoitov
@ 2024-11-10 2:34 ` Yonghong Song
2024-11-11 23:18 ` Yonghong Song
1 sibling, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-10 2:34 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> stack_depth = bpf_prog->aux->stack_depth;
>> + if (bpf_prog->aux->priv_stack_ptr) {
>> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> + stack_depth = 0;
>> + }
> ...
>
>> + priv_stack_ptr = prog->aux->priv_stack_ptr;
>> + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> + priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> After applying I started to see crashes running test_progs -j like:
>
> [ 173.465191] Oops: general protection fault, probably for
> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> [ 173.466053] KASAN: probably user-memory-access in range
> [0x00000000000057c8-0x00000000000057cf]
> [ 173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> [ 173.466053] Call Trace:
> [ 173.466053] <IRQ>
> [ 173.466053] ? die_addr+0x40/0xa0
> [ 173.466053] ? exc_general_protection+0x138/0x1f0
> [ 173.466053] ? asm_exc_general_protection+0x26/0x30
> [ 173.466053] ? dst_dev_put+0x1e/0x220
> [ 173.466053] rt_fibinfo_free_cpus.part.0+0x8c/0x130
> [ 173.466053] fib_nh_common_release+0xd6/0x2a0
> [ 173.466053] free_fib_info_rcu+0x129/0x360
> [ 173.466053] ? rcu_core+0xa55/0x1340
> [ 173.466053] rcu_core+0xa55/0x1340
> [ 173.466053] ? rcutree_report_cpu_dead+0x380/0x380
> [ 173.466053] ? hrtimer_interrupt+0x319/0x7c0
> [ 173.466053] handle_softirqs+0x14c/0x4d0
>
> [ 35.134115] Oops: general protection fault, probably for
> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> [ 35.135089] KASAN: probably user-memory-access in range
> [0x00007fff880fdde0-0x00007fff880fdde7]
> [ 35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> [ 35.135089] Call Trace:
> [ 35.135089] <TASK>
> [ 35.135089] ? die_addr+0x40/0xa0
> [ 35.135089] ? exc_general_protection+0x138/0x1f0
> [ 35.135089] ? asm_exc_general_protection+0x26/0x30
> [ 35.135089] ? destroy_workqueue+0x4b4/0xa70
> [ 35.135089] ? destroy_workqueue+0x592/0xa70
> [ 35.135089] ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> [ 35.135089] ext4_put_super+0xff/0xd70
> [ 35.135089] generic_shutdown_super+0x148/0x4c0
> [ 35.135089] kill_block_super+0x3b/0x90
> [ 35.135089] ext4_kill_sb+0x65/0x90
>
> I think I see the bug... quoted it above...
>
> Please make sure you reproduce it first.
>
> Then let's figure out a way how to test for such things and
> what we can do to make kasan detect it sooner,
> since above crashes have no indication at all that bpf priv stack
> is responsible.
> If there is another bug in priv stack and it will cause future
> crashes we need to make sure that priv stack corruption is
> detected by kasan (or whatever mechanism) earlier.
>
> We cannot land private stack support when there is
> a possibility of such silent corruption.
I can reproduce it now when running multiple times.
I will debug this ASAP.
>
> pw-bot: cr
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-09 20:14 ` Alexei Starovoitov
2024-11-10 2:34 ` Yonghong Song
@ 2024-11-11 23:18 ` Yonghong Song
2024-11-12 1:29 ` Alexei Starovoitov
1 sibling, 1 reply; 13+ messages in thread
From: Yonghong Song @ 2024-11-11 23:18 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> stack_depth = bpf_prog->aux->stack_depth;
>> + if (bpf_prog->aux->priv_stack_ptr) {
>> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> + stack_depth = 0;
>> + }
> ...
>
>> + priv_stack_ptr = prog->aux->priv_stack_ptr;
>> + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> + priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> After applying I started to see crashes running test_progs -j like:
>
> [ 173.465191] Oops: general protection fault, probably for
> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> [ 173.466053] KASAN: probably user-memory-access in range
> [0x00000000000057c8-0x00000000000057cf]
> [ 173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> [ 173.466053] Call Trace:
> [ 173.466053] <IRQ>
> [ 173.466053] ? die_addr+0x40/0xa0
> [ 173.466053] ? exc_general_protection+0x138/0x1f0
> [ 173.466053] ? asm_exc_general_protection+0x26/0x30
> [ 173.466053] ? dst_dev_put+0x1e/0x220
> [ 173.466053] rt_fibinfo_free_cpus.part.0+0x8c/0x130
> [ 173.466053] fib_nh_common_release+0xd6/0x2a0
> [ 173.466053] free_fib_info_rcu+0x129/0x360
> [ 173.466053] ? rcu_core+0xa55/0x1340
> [ 173.466053] rcu_core+0xa55/0x1340
> [ 173.466053] ? rcutree_report_cpu_dead+0x380/0x380
> [ 173.466053] ? hrtimer_interrupt+0x319/0x7c0
> [ 173.466053] handle_softirqs+0x14c/0x4d0
>
> [ 35.134115] Oops: general protection fault, probably for
> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> [ 35.135089] KASAN: probably user-memory-access in range
> [0x00007fff880fdde0-0x00007fff880fdde7]
> [ 35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> [ 35.135089] Call Trace:
> [ 35.135089] <TASK>
> [ 35.135089] ? die_addr+0x40/0xa0
> [ 35.135089] ? exc_general_protection+0x138/0x1f0
> [ 35.135089] ? asm_exc_general_protection+0x26/0x30
> [ 35.135089] ? destroy_workqueue+0x4b4/0xa70
> [ 35.135089] ? destroy_workqueue+0x592/0xa70
> [ 35.135089] ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> [ 35.135089] ext4_put_super+0xff/0xd70
> [ 35.135089] generic_shutdown_super+0x148/0x4c0
> [ 35.135089] kill_block_super+0x3b/0x90
> [ 35.135089] ext4_kill_sb+0x65/0x90
>
> I think I see the bug... quoted it above...
>
> Please make sure you reproduce it first.
Indeed, using the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
fixed the problem.
The following is additional change on top of this patch set to
- fix the memory access bug as suggested by Alexei in the above
- Add guard space for private stack, additional 16 bytes at the
end of stack will be the guard space. The content is prepopulated
and checked at per cpu private stack free site. If the content
is not expected, a kernel message will emit.
- Add kasan support for guard space.
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 55556a64f776..d796d419bb48 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
#define LOAD_TAIL_CALL_CNT_PTR(stack) \
__LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
+#define PRIV_STACK_GUARD_SZ 16
+#define PRIV_STACK_GUARD_VAL 0xEB9F1234eb9f1234ULL
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
@@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 *prog = temp;
u32 stack_depth;
int err;
+ // int stack_size;
stack_depth = bpf_prog->aux->stack_depth;
if (bpf_prog->aux->priv_stack_ptr) {
- priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
+ priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
stack_depth = 0;
}
@@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
- if (priv_frame_ptr)
+ if (priv_frame_ptr) {
+#if 0
+ /* hack to emit and write some data to guard area */
+ emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
+
+ /* See case BPF_ST | BPF_MEM | BPF_W */
+ EMIT2(0x41, 0xC7);
+ EMIT2(add_1reg(0x40, X86_REG_R9), 0);
+ EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
+#endif
emit_priv_frame_ptr(&prog, priv_frame_ptr);
+ }
ilen = prog - temp;
if (rw_image)
@@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
+ int priv_stack_size, cpu;
bool tmp_blinded = false;
bool extra_pass = false;
bool padding = false;
u8 *rw_image = NULL;
u8 *image = NULL;
+ u64 *guard_ptr;
int *addrs;
int pass;
int i;
@@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
priv_stack_ptr = prog->aux->priv_stack_ptr;
if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
- priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
+ priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
+ priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
if (!priv_stack_ptr) {
prog = orig_prog;
goto out_priv_stack;
}
+ for_each_possible_cpu(cpu) {
+ guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
+ kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);
+ }
prog->aux->priv_stack_ptr = priv_stack_ptr;
}
addrs = jit_data->addrs;
@@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
out_addrs:
kvfree(addrs);
if (!image && priv_stack_ptr) {
+ for_each_possible_cpu(cpu) {
+ guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
+ }
free_percpu(priv_stack_ptr);
prog->aux->priv_stack_ptr = NULL;
}
@@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
if (prog->jited) {
struct x64_jit_data *jit_data = prog->aux->jit_data;
struct bpf_binary_header *hdr;
+ void __percpu *priv_stack_ptr;
+ u64 *guard_ptr;
+ int cpu;
/*
* If we fail the final pass of JIT (from jit_subprogs),
@@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
- free_percpu(prog->aux->priv_stack_ptr);
+
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ int stack_size;
+
+ stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
+ for_each_possible_cpu(cpu) {
+ guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+ kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
+ if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
+ pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
+ }
+ free_percpu(priv_stack_ptr);
+ }
+
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
This fixed the issue Alexei discovered.
A 16-byte guard space is allocated since the allocation is 16-byte aligned
with a size that is a multiple of 16. If the bpf program overflows the stack
(change '#if 0' to '#if 1' in the above change), we will see:
[root@arch-fb-vm1 bpf]# ./test_progs -n 336
[ 28.447390] bpf_testmod: loading out-of-tree module taints kernel.
[ 28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
#336/1 struct_ops_private_stack/private_stack:OK
#336/2 struct_ops_private_stack/private_stack_fail:OK
#336/3 struct_ops_private_stack/private_stack_recur:OK
#336 struct_ops_private_stack:OK
Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
[ 28.737710] Private stack Overflow happened for prog Fx
[ 28.739284] Private stack Overflow happened for prog Fx
[ 28.968732] Private stack Overflow happened for prog Fx
Here the func name is 'Fx' (representing the sub prog). We might need
to add more information (e.g. the main prog name) to make the message
more meaningful.
I added some changes related to kasan. If I make a change to the guard space in the kernel (not in the bpf prog),
kasan can print out the error message properly. But unfortunately, in jit, there is no
kasan instrumentation, so the warning (with the "#if 1" change) is not reported. One possibility is,
if the kernel config enables kasan, bpf jit could add kasan instrumentation to the jited binary. I am not sure about the
complexity and whether it is worthwhile or not, since supposedly the verifier should already
prevent overflow and we already have a guard check ("Private stack Overflow happened ...")
in jit.
> Then let's figure out a way how to test for such things and
> what we can do to make kasan detect it sooner,
> since above crashes have no indication at all that bpf priv stack
> is responsible.
> If there is another bug in priv stack and it will cause future
> crashes we need to make sure that priv stack corruption is
> detected by kasan (or whatever mechanism) earlier.
>
> We cannot land private stack support when there is
> a possibility of such silent corruption.
>
> pw-bot: cr
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-11 23:18 ` Yonghong Song
@ 2024-11-12 1:29 ` Alexei Starovoitov
2024-11-12 3:42 ` Yonghong Song
0 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2024-11-12 1:29 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On Mon, Nov 11, 2024 at 3:18 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>
>
> On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> > On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>
> >> stack_depth = bpf_prog->aux->stack_depth;
> >> + if (bpf_prog->aux->priv_stack_ptr) {
> >> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> >> + stack_depth = 0;
> >> + }
> > ...
> >
> >> + priv_stack_ptr = prog->aux->priv_stack_ptr;
> >> + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> >> + priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> > After applying I started to see crashes running test_progs -j like:
> >
> > [ 173.465191] Oops: general protection fault, probably for
> > non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> > [ 173.466053] KASAN: probably user-memory-access in range
> > [0x00000000000057c8-0x00000000000057cf]
> > [ 173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> > [ 173.466053] Call Trace:
> > [ 173.466053] <IRQ>
> > [ 173.466053] ? die_addr+0x40/0xa0
> > [ 173.466053] ? exc_general_protection+0x138/0x1f0
> > [ 173.466053] ? asm_exc_general_protection+0x26/0x30
> > [ 173.466053] ? dst_dev_put+0x1e/0x220
> > [ 173.466053] rt_fibinfo_free_cpus.part.0+0x8c/0x130
> > [ 173.466053] fib_nh_common_release+0xd6/0x2a0
> > [ 173.466053] free_fib_info_rcu+0x129/0x360
> > [ 173.466053] ? rcu_core+0xa55/0x1340
> > [ 173.466053] rcu_core+0xa55/0x1340
> > [ 173.466053] ? rcutree_report_cpu_dead+0x380/0x380
> > [ 173.466053] ? hrtimer_interrupt+0x319/0x7c0
> > [ 173.466053] handle_softirqs+0x14c/0x4d0
> >
> > [ 35.134115] Oops: general protection fault, probably for
> > non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> > [ 35.135089] KASAN: probably user-memory-access in range
> > [0x00007fff880fdde0-0x00007fff880fdde7]
> > [ 35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> > [ 35.135089] Call Trace:
> > [ 35.135089] <TASK>
> > [ 35.135089] ? die_addr+0x40/0xa0
> > [ 35.135089] ? exc_general_protection+0x138/0x1f0
> > [ 35.135089] ? asm_exc_general_protection+0x26/0x30
> > [ 35.135089] ? destroy_workqueue+0x4b4/0xa70
> > [ 35.135089] ? destroy_workqueue+0x592/0xa70
> > [ 35.135089] ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> > [ 35.135089] ext4_put_super+0xff/0xd70
> > [ 35.135089] generic_shutdown_super+0x148/0x4c0
> > [ 35.135089] kill_block_super+0x3b/0x90
> > [ 35.135089] ext4_kill_sb+0x65/0x90
> >
> > I think I see the bug... quoted it above...
> >
> > Please make sure you reproduce it first.
>
> Indeed, to use the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
> indeed fixed the problem.
>
> The following is additional change on top of this patch set to
> - fix the memory access bug as suggested by Alexei in the above
> - Add guard space for private stack, additional 16 bytes at the
> end of stack will be the guard space. The content is prepopulated
> and checked at per cpu private stack free site. If the content
> is not expected, a kernel message will emit.
> - Add kasan support for guard space.
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 55556a64f776..d796d419bb48 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
> #define LOAD_TAIL_CALL_CNT_PTR(stack) \
> __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
>
> +#define PRIV_STACK_GUARD_SZ 16
> +#define PRIV_STACK_GUARD_VAL 0xEB9F1234eb9f1234ULL
> +
> static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> int oldproglen, struct jit_context *ctx, bool jmp_padding)
> {
> @@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> u8 *prog = temp;
> u32 stack_depth;
> int err;
> + // int stack_size;
>
> stack_depth = bpf_prog->aux->stack_depth;
> if (bpf_prog->aux->priv_stack_ptr) {
> - priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
> stack_depth = 0;
Since priv stack is not really a stack there is no need to align it to 16
and no need to round_up() either.
let's drop these parts and it will simplify the code.
Re: GUARD_SZ.
I think it's better to guard top and bottom.
8 byte for each will do.
> }
>
> @@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> emit_mov_imm64(&prog, X86_REG_R12,
> arena_vm_start >> 32, (u32) arena_vm_start);
>
> - if (priv_frame_ptr)
> + if (priv_frame_ptr) {
> +#if 0
> + /* hack to emit and write some data to guard area */
> + emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
> +
> + /* See case BPF_ST | BPF_MEM | BPF_W */
> + EMIT2(0x41, 0xC7);
> + EMIT2(add_1reg(0x40, X86_REG_R9), 0);
> + EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
> +#endif
> emit_priv_frame_ptr(&prog, priv_frame_ptr);
> + }
>
> ilen = prog - temp;
> if (rw_image)
> @@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
> struct x64_jit_data *jit_data;
> int proglen, oldproglen = 0;
> struct jit_context ctx = {};
> + int priv_stack_size, cpu;
> bool tmp_blinded = false;
> bool extra_pass = false;
> bool padding = false;
> u8 *rw_image = NULL;
> u8 *image = NULL;
> + u64 *guard_ptr;
> int *addrs;
> int pass;
> int i;
> @@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
> }
> priv_stack_ptr = prog->aux->priv_stack_ptr;
> if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> - priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> + priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
> + priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
> if (!priv_stack_ptr) {
> prog = orig_prog;
> goto out_priv_stack;
> }
> + for_each_possible_cpu(cpu) {
> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> + guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
> + kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);
with top and bottom guards there will be two calls to poison.
But did you check that percpu allocs come from the vmalloc region?
Does kasan_poison_vmalloc() actually work or silently nop ?
> + }
> prog->aux->priv_stack_ptr = priv_stack_ptr;
> }
> addrs = jit_data->addrs;
> @@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
> out_addrs:
> kvfree(addrs);
> if (!image && priv_stack_ptr) {
> + for_each_possible_cpu(cpu) {
> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> + kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
> + }
> free_percpu(priv_stack_ptr);
> prog->aux->priv_stack_ptr = NULL;
> }
> @@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
> if (prog->jited) {
> struct x64_jit_data *jit_data = prog->aux->jit_data;
> struct bpf_binary_header *hdr;
> + void __percpu *priv_stack_ptr;
> + u64 *guard_ptr;
> + int cpu;
>
> /*
> * If we fail the final pass of JIT (from jit_subprogs),
> @@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
> prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
> hdr = bpf_jit_binary_pack_hdr(prog);
> bpf_jit_binary_pack_free(hdr, NULL);
> - free_percpu(prog->aux->priv_stack_ptr);
> +
> + priv_stack_ptr = prog->aux->priv_stack_ptr;
> + if (priv_stack_ptr) {
> + int stack_size;
> +
> + stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
> + for_each_possible_cpu(cpu) {
> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> + kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
> + if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
> + pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
> + }
Common helper is needed to check guards before free_percpu.
> + free_percpu(priv_stack_ptr);
> + }
> +
> WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
> }
>
> This fixed the issue Alexei discovered.
>
> 16 bytes guard space is allocated since allocation is done with 16byte aligned
> with multiple-16 size. If bpf program overflows the stack (change '#if 0' to '#if 1')
> in the above change, we will see:
>
> [root@arch-fb-vm1 bpf]# ./test_progs -n 336
> [ 28.447390] bpf_testmod: loading out-of-tree module taints kernel.
> [ 28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
> #336/1 struct_ops_private_stack/private_stack:OK
> #336/2 struct_ops_private_stack/private_stack_fail:OK
> #336/3 struct_ops_private_stack/private_stack_recur:OK
> #336 struct_ops_private_stack:OK
> Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
> [ 28.737710] Private stack Overflow happened for prog Fx
> [ 28.739284] Private stack Overflow happened for prog Fx
> [ 28.968732] Private stack Overflow happened for prog Fx
>
> Here the func name is 'Fx' (representing the sub prog). We might need
> to add more meaningful info (e.g. main prog name) to make message more
> meaningful.
Probably worth introducing a helper like:
const char *bpf_get_prog_name(prog)
{
if (fp->aux->ksym.prog)
return prog->aux->ksym.name;
return prog->aux->name;
}
>
> I added some changes related kasan. If I made a change to guard space in kernel (not in bpf prog),
> the kasan can print out the error message properly. But unfortunately, in jit, there is no
> kasan instrumentation so warning (with "#if 1" change) is not reported. One possibility is
> if kernel config enables kasan, bpf jit could add kasan to jited binary. Not sure the
> complexity and whether it is worthwhile or not since supposedly verifier should already
> prevent overflow and we already have a guard check (Private stack overflow happened ...)
> in jit.
As a follow up we should teach JIT to emit calls __asan_loadN/storeN
when processing LDX/STX.
imo it's been long overdue.
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit
2024-11-12 1:29 ` Alexei Starovoitov
@ 2024-11-12 3:42 ` Yonghong Song
0 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-12 3:42 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On 11/11/24 5:29 PM, Alexei Starovoitov wrote:
> On Mon, Nov 11, 2024 at 3:18 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>
>>
>> On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
>>> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> stack_depth = bpf_prog->aux->stack_depth;
>>>> + if (bpf_prog->aux->priv_stack_ptr) {
>>>> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>>>> + stack_depth = 0;
>>>> + }
>>> ...
>>>
>>>> + priv_stack_ptr = prog->aux->priv_stack_ptr;
>>>> + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>>>> + priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
>>> After applying I started to see crashes running test_progs -j like:
>>>
>>> [ 173.465191] Oops: general protection fault, probably for
>>> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
>>> [ 173.466053] KASAN: probably user-memory-access in range
>>> [0x00000000000057c8-0x00000000000057cf]
>>> [ 173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
>>> [ 173.466053] Call Trace:
>>> [ 173.466053] <IRQ>
>>> [ 173.466053] ? die_addr+0x40/0xa0
>>> [ 173.466053] ? exc_general_protection+0x138/0x1f0
>>> [ 173.466053] ? asm_exc_general_protection+0x26/0x30
>>> [ 173.466053] ? dst_dev_put+0x1e/0x220
>>> [ 173.466053] rt_fibinfo_free_cpus.part.0+0x8c/0x130
>>> [ 173.466053] fib_nh_common_release+0xd6/0x2a0
>>> [ 173.466053] free_fib_info_rcu+0x129/0x360
>>> [ 173.466053] ? rcu_core+0xa55/0x1340
>>> [ 173.466053] rcu_core+0xa55/0x1340
>>> [ 173.466053] ? rcutree_report_cpu_dead+0x380/0x380
>>> [ 173.466053] ? hrtimer_interrupt+0x319/0x7c0
>>> [ 173.466053] handle_softirqs+0x14c/0x4d0
>>>
>>> [ 35.134115] Oops: general protection fault, probably for
>>> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
>>> [ 35.135089] KASAN: probably user-memory-access in range
>>> [0x00007fff880fdde0-0x00007fff880fdde7]
>>> [ 35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
>>> [ 35.135089] Call Trace:
>>> [ 35.135089] <TASK>
>>> [ 35.135089] ? die_addr+0x40/0xa0
>>> [ 35.135089] ? exc_general_protection+0x138/0x1f0
>>> [ 35.135089] ? asm_exc_general_protection+0x26/0x30
>>> [ 35.135089] ? destroy_workqueue+0x4b4/0xa70
>>> [ 35.135089] ? destroy_workqueue+0x592/0xa70
>>> [ 35.135089] ? __mutex_unlock_slowpath.isra.0+0x270/0x270
>>> [ 35.135089] ext4_put_super+0xff/0xd70
>>> [ 35.135089] generic_shutdown_super+0x148/0x4c0
>>> [ 35.135089] kill_block_super+0x3b/0x90
>>> [ 35.135089] ext4_kill_sb+0x65/0x90
>>>
>>> I think I see the bug... quoted it above...
>>>
>>> Please make sure you reproduce it first.
>> Indeed, to use the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
>> indeed fixed the problem.
>>
>> The following is additional change on top of this patch set to
>> - fix the memory access bug as suggested by Alexei in the above
>> - Add guard space for private stack, additional 16 bytes at the
>> end of stack will be the guard space. The content is prepopulated
>> and checked at per cpu private stack free site. If the content
>> is not expected, a kernel message will emit.
>> - Add kasan support for guard space.
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 55556a64f776..d796d419bb48 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
>> #define LOAD_TAIL_CALL_CNT_PTR(stack) \
>> __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
>>
>> +#define PRIV_STACK_GUARD_SZ 16
>> +#define PRIV_STACK_GUARD_VAL 0xEB9F1234eb9f1234ULL
>> +
>> static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
>> int oldproglen, struct jit_context *ctx, bool jmp_padding)
>> {
>> @@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>> u8 *prog = temp;
>> u32 stack_depth;
>> int err;
>> + // int stack_size;
>>
>> stack_depth = bpf_prog->aux->stack_depth;
>> if (bpf_prog->aux->priv_stack_ptr) {
>> - priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> + priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>> stack_depth = 0;
> Since priv stack is not really a stack there is no need to align it to 16
> and no need to round_up() either.
> let's drop these parts and it will simplify the code.
>
> Re: GUARD_SZ.
> I think it's better to guard top and bottom.
> 8 byte for each will do.
Makes sense for both. I will align to 8 bytes.
>
>> }
>>
>> @@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>> emit_mov_imm64(&prog, X86_REG_R12,
>> arena_vm_start >> 32, (u32) arena_vm_start);
>>
>> - if (priv_frame_ptr)
>> + if (priv_frame_ptr) {
>> +#if 0
>> + /* hack to emit and write some data to guard area */
>> + emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
>> +
>> + /* See case BPF_ST | BPF_MEM | BPF_W */
>> + EMIT2(0x41, 0xC7);
>> + EMIT2(add_1reg(0x40, X86_REG_R9), 0);
>> + EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
>> +#endif
>> emit_priv_frame_ptr(&prog, priv_frame_ptr);
>> + }
>>
>> ilen = prog - temp;
>> if (rw_image)
>> @@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>> struct x64_jit_data *jit_data;
>> int proglen, oldproglen = 0;
>> struct jit_context ctx = {};
>> + int priv_stack_size, cpu;
>> bool tmp_blinded = false;
>> bool extra_pass = false;
>> bool padding = false;
>> u8 *rw_image = NULL;
>> u8 *image = NULL;
>> + u64 *guard_ptr;
>> int *addrs;
>> int pass;
>> int i;
>> @@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>> }
>> priv_stack_ptr = prog->aux->priv_stack_ptr;
>> if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> - priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
>> + priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>> + priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
>> if (!priv_stack_ptr) {
>> prog = orig_prog;
>> goto out_priv_stack;
>> }
>> + for_each_possible_cpu(cpu) {
>> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> + guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
>> + kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);
> with top and bottom guards there will be two calls to poison.
>
> But did you check that percpu allocs come from the vmalloc region?
> Does kasan_poison_vmalloc() actually work or silently nop ?
I tried again. It is a silent nop. So later, when we add kasan for bpf progs,
we will need to tackle this as well.
>
>> + }
>> prog->aux->priv_stack_ptr = priv_stack_ptr;
>> }
>> addrs = jit_data->addrs;
>> @@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>> out_addrs:
>> kvfree(addrs);
>> if (!image && priv_stack_ptr) {
>> + for_each_possible_cpu(cpu) {
>> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> + kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
>> + }
>> free_percpu(priv_stack_ptr);
>> prog->aux->priv_stack_ptr = NULL;
>> }
>> @@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
>> if (prog->jited) {
>> struct x64_jit_data *jit_data = prog->aux->jit_data;
>> struct bpf_binary_header *hdr;
>> + void __percpu *priv_stack_ptr;
>> + u64 *guard_ptr;
>> + int cpu;
>>
>> /*
>> * If we fail the final pass of JIT (from jit_subprogs),
>> @@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
>> prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
>> hdr = bpf_jit_binary_pack_hdr(prog);
>> bpf_jit_binary_pack_free(hdr, NULL);
>> - free_percpu(prog->aux->priv_stack_ptr);
>> +
>> + priv_stack_ptr = prog->aux->priv_stack_ptr;
>> + if (priv_stack_ptr) {
>> + int stack_size;
>> +
>> + stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>> + for_each_possible_cpu(cpu) {
>> + guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> + kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
>> + if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
>> + pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
>> + }
> Common helper is needed to check guards before free_percpu.
Ack.
>
>> + free_percpu(priv_stack_ptr);
>> + }
>> +
>> WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
>> }
>>
>> This fixed the issue Alexei discovered.
>>
>> 16 bytes guard space is allocated since allocation is done with 16byte aligned
>> with multiple-16 size. If bpf program overflows the stack (change '#if 0' to '#if 1')
>> in the above change, we will see:
>>
>> [root@arch-fb-vm1 bpf]# ./test_progs -n 336
>> [ 28.447390] bpf_testmod: loading out-of-tree module taints kernel.
>> [ 28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
>> #336/1 struct_ops_private_stack/private_stack:OK
>> #336/2 struct_ops_private_stack/private_stack_fail:OK
>> #336/3 struct_ops_private_stack/private_stack_recur:OK
>> #336 struct_ops_private_stack:OK
>> Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
>> [ 28.737710] Private stack Overflow happened for prog Fx
>> [ 28.739284] Private stack Overflow happened for prog Fx
>> [ 28.968732] Private stack Overflow happened for prog Fx
>>
>> Here the func name is 'Fx' (representing the sub prog). We might need
>> to add more meaningful info (e.g. main prog name) to make message more
>> meaningful.
> Probably worth introducing a helper like:
>
> const char *bpf_get_prog_name(prog)
> {
> if (fp->aux->ksym.prog)
> return prog->aux->ksym.name;
> return prog->aux->name;
> }
Looks good. Thanks for suggestion.
>
>
>> I added some changes related kasan. If I made a change to guard space in kernel (not in bpf prog),
>> the kasan can print out the error message properly. But unfortunately, in jit, there is no
>> kasan instrumentation so warning (with "#if 1" change) is not reported. One possibility is
>> if kernel config enables kasan, bpf jit could add kasan to jited binary. Not sure the
>> complexity and whether it is worthwhile or not since supposedly verifier should already
>> prevent overflow and we already have a guard check (Private stack overflow happened ...)
>> in jit.
> As a follow up we should teach JIT to emit calls __asan_loadN/storeN
> when processing LDX/STX.
> imo it's been long overdue.
I will fix the random crash issue and add guard support.
I will do a follow-up for jit kasan support.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH bpf-next v11 5/7] selftests/bpf: Add tracing prog private stack tests
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (3 preceding siblings ...)
2024-11-09 2:53 ` [PATCH bpf-next v11 4/7] bpf, x86: Support private stack in jit Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 6/7] bpf: Support private stack for struct_ops progs Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 7/7] selftests/bpf: Add struct_ops prog private stack tests Yonghong Song
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Some private stack tests are added including:
- main prog only with stack size greater than BPF_PSTACK_MIN_SIZE.
- main prog only with stack size smaller than BPF_PSTACK_MIN_SIZE.
- prog with one subprog having MAX_BPF_STACK stack size and another
subprog having non-zero small stack size.
- prog with callback function.
- prog with exception in main prog or subprog.
- prog with async callback without nesting
- prog with async callback with possible nesting
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
.../selftests/bpf/prog_tests/verifier.c | 2 +
.../bpf/progs/verifier_private_stack.c | 272 ++++++++++++++++++
2 files changed, 274 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/verifier_private_stack.c
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 75f7a2ce334b..d9f65adb456b 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -61,6 +61,7 @@
#include "verifier_or_jmp32_k.skel.h"
#include "verifier_precision.skel.h"
#include "verifier_prevent_map_lookup.skel.h"
+#include "verifier_private_stack.skel.h"
#include "verifier_raw_stack.skel.h"
#include "verifier_raw_tp_writable.skel.h"
#include "verifier_reg_equal.skel.h"
@@ -188,6 +189,7 @@ void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); }
void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); }
void test_verifier_precision(void) { RUN(verifier_precision); }
void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); }
+void test_verifier_private_stack(void) { RUN(verifier_private_stack); }
void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); }
void test_verifier_raw_tp_writable(void) { RUN(verifier_raw_tp_writable); }
void test_verifier_reg_equal(void) { RUN(verifier_reg_equal); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
new file mode 100644
index 000000000000..b1fbdf119553
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+/* From include/linux/filter.h */
+#define MAX_BPF_STACK 512
+
+#if defined(__TARGET_ARCH_x86)
+
+struct elem {
+ struct bpf_timer t;
+ char pad[256];
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct elem);
+} array SEC(".maps");
+
+SEC("kprobe")
+__description("Private stack, single prog")
+__success
+__arch_x86_64
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x100(%r9)")
+__naked void private_stack_single_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 256) = r1; \
+ r0 = 0; \
+ exit; \
+" ::: __clobber_all);
+}
+
+SEC("raw_tp")
+__description("No private stack")
+__success
+__arch_x86_64
+__jited(" subq $0x8, %rsp")
+__naked void no_private_stack_nested(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 8) = r1; \
+ r0 = 0; \
+ exit; \
+" ::: __clobber_all);
+}
+
+__used
+__naked static void cumulative_stack_depth_subprog(void)
+{
+ asm volatile (" \
+ r1 = 41; \
+ *(u64 *)(r10 - 32) = r1; \
+ call %[bpf_get_smp_processor_id]; \
+ exit; \
+" :
+ : __imm(bpf_get_smp_processor_id)
+ : __clobber_all);
+}
+
+SEC("kprobe")
+__description("Private stack, subtree > MAX_BPF_STACK")
+__success
+__arch_x86_64
+/* private stack fp for the main prog */
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq 0x{{.*}}")
+__jited(" popq %r9")
+__jited(" xorl %eax, %eax")
+__naked void private_stack_nested_1(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - %[max_bpf_stack]) = r1; \
+ call cumulative_stack_depth_subprog; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm_const(max_bpf_stack, MAX_BPF_STACK)
+ : __clobber_all);
+}
+
+__naked __noinline __used
+static unsigned long loop_callback(void)
+{
+ asm volatile (" \
+ call %[bpf_get_prandom_u32]; \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+ call cumulative_stack_depth_subprog; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_common);
+}
+
+SEC("raw_tp")
+__description("Private stack, callback")
+__success
+__arch_x86_64
+/* for func loop_callback */
+__jited("func #1")
+__jited(" endbr64")
+__jited(" nopl (%rax,%rax)")
+__jited(" nopl (%rax)")
+__jited(" pushq %rbp")
+__jited(" movq %rsp, %rbp")
+__jited(" endbr64")
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+__naked void private_stack_callback(void)
+{
+ asm volatile (" \
+ r1 = 1; \
+ r2 = %[loop_callback]; \
+ r3 = 0; \
+ r4 = 0; \
+ call %[bpf_loop]; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm_ptr(loop_callback),
+ __imm(bpf_loop)
+ : __clobber_common);
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in main prog")
+__success __retval(0)
+__arch_x86_64
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+int private_stack_exception_main_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+" ::: __clobber_common);
+
+ bpf_throw(0);
+ return 0;
+}
+
+__used static int subprog_exception(void)
+{
+ bpf_throw(0);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in subprog")
+__success __retval(0)
+__arch_x86_64
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+int private_stack_exception_sub_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+ call subprog_exception; \
+" ::: __clobber_common);
+
+ return 0;
+}
+
+int glob;
+__noinline static void subprog2(int *val)
+{
+ glob += val[0] * 2;
+}
+
+__noinline static void subprog1(int *val)
+{
+ int tmp[64] = {};
+
+ tmp[0] = *val;
+ subprog2(tmp);
+}
+
+__noinline static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
+{
+ subprog1(key);
+ return 0;
+}
+
+__noinline static int timer_cb2(void *map, int *key, struct bpf_timer *timer)
+{
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, not nested")
+__success __retval(0)
+__arch_x86_64
+__jited(" movabsq $0x{{.*}}, %r9")
+int private_stack_async_callback_1(void)
+{
+ struct bpf_timer *arr_timer;
+ int array_key = 0;
+
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+
+ bpf_timer_init(arr_timer, &array, 1);
+ bpf_timer_set_callback(arr_timer, timer_cb2);
+ bpf_timer_start(arr_timer, 0, 0);
+ subprog1(&array_key);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, potential nesting")
+__success __retval(0)
+__arch_x86_64
+__jited(" subq $0x100, %rsp")
+int private_stack_async_callback_2(void)
+{
+ struct bpf_timer *arr_timer;
+ int array_key = 0;
+
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+
+ bpf_timer_init(arr_timer, &array, 1);
+ bpf_timer_set_callback(arr_timer, timer_cb1);
+ bpf_timer_start(arr_timer, 0, 0);
+ subprog1(&array_key);
+ return 0;
+}
+
+#else
+
+SEC("kprobe")
+__description("private stack is not supported, use a dummy test")
+__success
+int dummy_test(void)
+{
+ return 0;
+}
+
+#endif
+
+char _license[] SEC("license") = "GPL";
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH bpf-next v11 6/7] bpf: Support private stack for struct_ops progs
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (4 preceding siblings ...)
2024-11-09 2:53 ` [PATCH bpf-next v11 5/7] selftests/bpf: Add tracing prog private stack tests Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
2024-11-09 2:53 ` [PATCH bpf-next v11 7/7] selftests/bpf: Add struct_ops prog private stack tests Yonghong Song
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
For struct_ops progs, whether a particular prog uses private stack
depends on prog->aux->priv_stack_requested setting before actual
insn-level verification for that prog. One particular implementation
is to piggyback on struct_ops->check_member(). The next patch has
an example for this. The struct_ops->check_member() sets
prog->aux->priv_stack_requested to true, which enables private stack
usage.
The struct_ops prog follows the same rule as kprobe/tracing progs after
function bpf_enable_priv_stack(). For example, even if a struct_ops prog
requests a private stack, it could still use the normal kernel stack when
its stack size is small (< 64 bytes).
Similar to tracing progs, a nested run of the same prog on the same cpu
will be skipped. A field (recursion_detected()) is added to the
bpf_prog_aux structure. If bpf_prog->aux->recursion_detected is
implemented by the struct_ops subsystem and a nested same-cpu/same-prog
run happens, the callback will be triggered to report an error, collect
related info, etc.
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
include/linux/bpf.h | 2 ++
include/linux/bpf_verifier.h | 1 +
kernel/bpf/trampoline.c | 4 ++++
kernel/bpf/verifier.c | 7 ++++++-
4 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9cfb8f55d691..ae50826f9ace 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1525,9 +1525,11 @@ struct bpf_prog_aux {
bool exception_boundary;
bool is_extended; /* true if extended by freplace program */
bool jits_use_priv_stack;
+ bool priv_stack_requested;
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
struct bpf_arena *arena;
+ void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 456fd2265345..f013988d2e7f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -895,6 +895,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
case BPF_PROG_TYPE_TRACING:
return prog->expected_attach_type != BPF_TRACE_ITER;
case BPF_PROG_TYPE_STRUCT_OPS:
+ return prog->aux->jits_use_priv_stack;
case BPF_PROG_TYPE_LSM:
return false;
default:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9f36c049f4c2..a8d188b31da5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8a7576233814..cf69caf67d61 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6054,7 +6054,7 @@ static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS:
- if (bpf_prog_check_recur(prog))
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
return PRIV_STACK_ADAPTIVE;
fallthrough;
default:
@@ -22000,6 +22000,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
/* btf_ctx_access() used this to provide argument type info */
prog->aux->ctx_arg_info =
st_ops_desc->arg_info[member_idx].info;
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH bpf-next v11 7/7] selftests/bpf: Add struct_ops prog private stack tests
2024-11-09 2:53 [PATCH bpf-next v11 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (5 preceding siblings ...)
2024-11-09 2:53 ` [PATCH bpf-next v11 6/7] bpf: Support private stack for struct_ops progs Yonghong Song
@ 2024-11-09 2:53 ` Yonghong Song
6 siblings, 0 replies; 13+ messages in thread
From: Yonghong Song @ 2024-11-09 2:53 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Add three tests for struct_ops using private stack.
./test_progs -t struct_ops_private_stack
#333/1 struct_ops_private_stack/private_stack:OK
#333/2 struct_ops_private_stack/private_stack_fail:OK
#333/3 struct_ops_private_stack/private_stack_recur:OK
#333 struct_ops_private_stack:OK
The following is a snippet of a struct_ops check_member() implementation:
u32 moff = __btf_member_bit_offset(t, member) / 8;
switch (moff) {
case offsetof(struct bpf_testmod_ops3, test_1):
prog->aux->priv_stack_requested = true;
prog->aux->recursion_detected = test_1_recursion_detected;
fallthrough;
default:
break;
}
return 0;
The first test nests two different callback functions, where the first
prog has a stack size of more than 512 bytes (including subprogs) with
private stack enabled.
The second test is a negative test where the second prog has a stack size
of more than 512 bytes without private stack enabled.
The third test has the same callback function recursing into itself. At
run time, the jit trampoline recursion check kicks in to prevent the
recursion. The recursion_detected() callback function is implemented by
bpf_testmod; the following message in dmesg
 bpf_testmod: oh no, recursing into test_1, recursion_misses 1
demonstrates that the callback function is indeed triggered when a
recursion miss happens.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
.../selftests/bpf/bpf_testmod/bpf_testmod.c | 104 +++++++++++++++++
.../selftests/bpf/bpf_testmod/bpf_testmod.h | 5 +
.../bpf/prog_tests/struct_ops_private_stack.c | 106 ++++++++++++++++++
.../bpf/progs/struct_ops_private_stack.c | 62 ++++++++++
.../bpf/progs/struct_ops_private_stack_fail.c | 62 ++++++++++
.../progs/struct_ops_private_stack_recur.c | 50 +++++++++
6 files changed, 389 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 987d41af71d2..cc9dde507aba 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -245,6 +245,39 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx)
call_rcu(&ctx->rcu, testmod_free_cb);
}
+static struct bpf_testmod_ops3 *st_ops3;
+
+static int bpf_testmod_test_3(void)
+{
+ return 0;
+}
+
+static int bpf_testmod_test_4(void)
+{
+ return 0;
+}
+
+static struct bpf_testmod_ops3 __bpf_testmod_ops3 = {
+ .test_1 = bpf_testmod_test_3,
+ .test_2 = bpf_testmod_test_4,
+};
+
+static void bpf_testmod_test_struct_ops3(void)
+{
+ if (st_ops3)
+ st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_1(void)
+{
+ st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_2(void)
+{
+ st_ops3->test_2();
+}
+
struct bpf_testmod_btf_type_tag_1 {
int a;
};
@@ -382,6 +415,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
(void)trace_bpf_testmod_test_raw_tp_null(NULL);
+ bpf_testmod_test_struct_ops3();
+
struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) +
sizeof(int)), GFP_KERNEL);
if (struct_arg3 != NULL) {
@@ -586,6 +621,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU)
BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2)
BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
BTF_ID_LIST(bpf_testmod_dtor_ids)
@@ -1096,6 +1133,10 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = {
.is_valid_access = bpf_testmod_ops_is_valid_access,
};
+static const struct bpf_verifier_ops bpf_testmod_verifier_ops3 = {
+ .is_valid_access = bpf_testmod_ops_is_valid_access,
+};
+
static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
{
struct bpf_testmod_ops *ops = kdata;
@@ -1175,6 +1216,68 @@ struct bpf_struct_ops bpf_testmod_ops2 = {
.owner = THIS_MODULE,
};
+static int st_ops3_reg(void *kdata, struct bpf_link *link)
+{
+ int err = 0;
+
+ mutex_lock(&st_ops_mutex);
+ if (st_ops3) {
+ pr_err("st_ops has already been registered\n");
+ err = -EEXIST;
+ goto unlock;
+ }
+ st_ops3 = kdata;
+
+unlock:
+ mutex_unlock(&st_ops_mutex);
+ return err;
+}
+
+static void st_ops3_unreg(void *kdata, struct bpf_link *link)
+{
+ mutex_lock(&st_ops_mutex);
+ st_ops3 = NULL;
+ mutex_unlock(&st_ops_mutex);
+}
+
+static void test_1_recursion_detected(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+
+ stats = this_cpu_ptr(prog->stats);
+ printk("bpf_testmod: oh no, recursing into test_1, recursion_misses %llu",
+ u64_stats_read(&stats->misses));
+}
+
+static int st_ops3_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_testmod_ops3, test_1):
+ prog->aux->priv_stack_requested = true;
+ prog->aux->recursion_detected = test_1_recursion_detected;
+ fallthrough;
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct bpf_struct_ops bpf_testmod_ops3 = {
+ .verifier_ops = &bpf_testmod_verifier_ops3,
+ .init = bpf_testmod_ops_init,
+ .init_member = bpf_testmod_ops_init_member,
+ .reg = st_ops3_reg,
+ .unreg = st_ops3_unreg,
+ .check_member = st_ops3_check_member,
+ .cfi_stubs = &__bpf_testmod_ops3,
+ .name = "bpf_testmod_ops3",
+ .owner = THIS_MODULE,
+};
+
static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args)
{
return 0;
@@ -1333,6 +1436,7 @@ static int bpf_testmod_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set);
ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops);
ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2);
+ ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3);
ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops);
ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors,
ARRAY_SIZE(bpf_testmod_dtors),
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
index fb7dff47597a..356803d1c10e 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
@@ -94,6 +94,11 @@ struct bpf_testmod_ops2 {
int (*test_1)(void);
};
+struct bpf_testmod_ops3 {
+ int (*test_1)(void);
+ int (*test_2)(void);
+};
+
struct st_ops_args {
u64 a;
};
diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
new file mode 100644
index 000000000000..4006879ca3fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "struct_ops_private_stack.skel.h"
+#include "struct_ops_private_stack_fail.skel.h"
+#include "struct_ops_private_stack_recur.skel.h"
+
+static void test_private_stack(void)
+{
+ struct struct_ops_private_stack *skel;
+ struct bpf_link *link;
+ int err;
+
+ skel = struct_ops_private_stack__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack__load(skel);
+ if (!ASSERT_OK(err, "struct_ops_private_stack__load"))
+ goto cleanup;
+
+ link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+ if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+ goto cleanup;
+
+ ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+ ASSERT_EQ(skel->bss->val_i, 3, "val_i");
+ ASSERT_EQ(skel->bss->val_j, 8, "val_j");
+
+ bpf_link__destroy(link);
+
+cleanup:
+ struct_ops_private_stack__destroy(skel);
+}
+
+static void test_private_stack_fail(void)
+{
+ struct struct_ops_private_stack_fail *skel;
+ int err;
+
+ skel = struct_ops_private_stack_fail__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_fail__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack_fail__load(skel);
+ if (!ASSERT_ERR(err, "struct_ops_private_stack_fail__load"))
+ goto cleanup;
+ return;
+
+cleanup:
+ struct_ops_private_stack_fail__destroy(skel);
+}
+
+static void test_private_stack_recur(void)
+{
+ struct struct_ops_private_stack_recur *skel;
+ struct bpf_link *link;
+ int err;
+
+ skel = struct_ops_private_stack_recur__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_recur__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack_recur__load(skel);
+ if (!ASSERT_OK(err, "struct_ops_private_stack_recur__load"))
+ goto cleanup;
+
+ link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+ if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+ goto cleanup;
+
+ ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+ ASSERT_EQ(skel->bss->val_j, 3, "val_j");
+
+ bpf_link__destroy(link);
+
+cleanup:
+ struct_ops_private_stack_recur__destroy(skel);
+}
+
+void test_struct_ops_private_stack(void)
+{
+ if (test__start_subtest("private_stack"))
+ test_private_stack();
+ if (test__start_subtest("private_stack_fail"))
+ test_private_stack_fail();
+ if (test__start_subtest("private_stack_recur"))
+ test_private_stack_recur();
+}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
new file mode 100644
index 000000000000..8ea57e5348ab
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 200 bytes */
+ int b[50] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 400 bytes */
+ int a[100] = {};
+
+ a[10] = 1;
+ val_i = subprog1(a);
+ bpf_testmod_ops3_call_test_2();
+ return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+ /* stack size 200 bytes */
+ int a[50] = {};
+
+ a[10] = 3;
+ val_j = subprog1(a);
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+ .test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
new file mode 100644
index 000000000000..1f55ec4cee37
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 200 bytes */
+ int b[50] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 100 bytes */
+ int a[25] = {};
+
+ a[10] = 1;
+ val_i = subprog1(a);
+ bpf_testmod_ops3_call_test_2();
+ return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+ /* stack size 400 bytes */
+ int a[100] = {};
+
+ a[10] = 3;
+ val_j = subprog1(a);
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+ .test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
new file mode 100644
index 000000000000..f2f300d50988
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_1(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[1] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 400 bytes */
+ int b[100] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 20 bytes */
+ int a[5] = {};
+
+ a[1] = 1;
+ val_j += subprog1(a);
+ bpf_testmod_ops3_call_test_1();
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+};
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread