* [PATCH bpf-next v10 1/7] bpf: Find eligible subprogs for private stack support
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
@ 2024-11-07 2:41 ` Yonghong Song
2024-11-07 2:41 ` [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs Yonghong Song
` (5 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:41 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Private stack will be allocated with percpu allocator in jit time.
To avoid complexity at runtime, only one copy of private stack is
available per cpu per prog. So runtime recursion check is necessary
to avoid stack corruption.
Current private stack only supports kprobe/perf_event/tp/raw_tp
which has recursion check in the kernel, and prog types that use
bpf trampoline recursion check. For trampoline related prog types,
currently only tracing progs have recursion checking.
To avoid complexity, all async_cb subprogs use normal kernel stack
including those subprogs used by both main prog subtree and async_cb
subtree. Any prog having tail call also uses kernel stack.
To avoid jit penalty with private stack support, a subprog stack
size threshold is set such that only if the stack size is no less
than the threshold, private stack is supported. The current threshold
is 64 bytes. This avoids jit penality if the stack usage is small.
A useless 'continue' is also removed from a loop in func
check_max_stack_depth().
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
include/linux/bpf.h | 1 +
include/linux/bpf_verifier.h | 7 +++
include/linux/filter.h | 1 +
kernel/bpf/core.c | 5 ++
kernel/bpf/verifier.c | 96 ++++++++++++++++++++++++++++++++----
5 files changed, 100 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..dc1e58b90493 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1523,6 +1523,7 @@ struct bpf_prog_aux {
bool exception_cb;
bool exception_boundary;
bool is_extended; /* true if extended by freplace program */
+ bool priv_stack_requested;
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
struct bpf_arena *arena;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4513372c5bc8..456fd2265345 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -649,6 +649,12 @@ struct bpf_subprog_arg_info {
};
};
+enum priv_stack_mode {
+ PRIV_STACK_UNKNOWN,
+ NO_PRIV_STACK,
+ PRIV_STACK_ADAPTIVE,
+};
+
struct bpf_subprog_info {
/* 'start' has to be the first field otherwise find_subprog() won't work */
u32 start; /* insn idx of function entry point */
@@ -669,6 +675,7 @@ struct bpf_subprog_info {
/* true if bpf_fastcall stack region is used by functions that can't be inlined */
bool keep_fastcall_stack: 1;
+ enum priv_stack_mode priv_stack_mode;
u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..3a21947f2fd4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
u64 bpf_arch_uaddress_limit(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..14d9288441f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7958d6ff6b73..2284b909b499 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6034,6 +6036,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
if (env->prog->jit_requested)
@@ -6051,17 +6081,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -6088,11 +6121,31 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
depth);
return -EACCES;
}
- depth += round_up_stack_depth(env, subprog[idx].stack_depth);
- if (depth > MAX_BPF_STACK) {
- verbose(env, "combined stack size of %d calls is %d. Too large\n",
- frame + 1, depth);
- return -EACCES;
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
}
continue_func:
subprog_end = subprog[idx + 1].start;
@@ -6149,6 +6202,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
}
i = next_insn;
idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -6182,7 +6237,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
*/
if (frame == 0)
return 0;
- depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame];
@@ -6191,16 +6247,36 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
int ret;
for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i);
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
if (ret < 0)
return ret;
}
- continue;
}
return 0;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
2024-11-07 2:41 ` [PATCH bpf-next v10 1/7] bpf: Find eligible subprogs for private stack support Yonghong Song
@ 2024-11-07 2:41 ` Yonghong Song
2024-11-08 19:11 ` Alexei Starovoitov
2024-11-07 2:41 ` [PATCH bpf-next v10 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth Yonghong Song
` (4 subsequent siblings)
6 siblings, 1 reply; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:41 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
If private stack is requested by any subprog, set that subprog
prog->aux->priv_stack_requested to be true so later jit can allocate
private stack for that subprog properly.
Also set env->prog->aux->priv_stack_requested to be true if
subprog[0] requests private stack. This is a use case for a
single main prog (no subprogs) to use private stack.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
kernel/bpf/verifier.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2284b909b499..09bb9dc939d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6278,6 +6278,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
return ret;
}
}
+
+ if (si[0].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ env->prog->aux->priv_stack_requested = true;
+
return 0;
}
@@ -20211,6 +20215,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->priv_stack_requested = true;
+
func[i]->jit_requested = 1;
func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs
2024-11-07 2:41 ` [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs Yonghong Song
@ 2024-11-08 19:11 ` Alexei Starovoitov
2024-11-08 21:41 ` Yonghong Song
0 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2024-11-08 19:11 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On Wed, Nov 6, 2024 at 6:42 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2284b909b499..09bb9dc939d6 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6278,6 +6278,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
> return ret;
> }
> }
> +
and patch 6 adds this line here:
+ env->prog->aux->priv_stack_requested = false;
> + if (si[0].priv_stack_mode == PRIV_STACK_ADAPTIVE)
> + env->prog->aux->priv_stack_requested = true;
> +
which makes the above hard to reason about.
I think the root of the problem is the dual meaning of
the priv_stack_requested flag.
On one side it's a way for sched-ext to ask bpf core to enable priv stack,
and on the other side it's a request from bpf core to bpf jit
to allocate it.
I think it's better to split these two conditions.
Extra bool is cheap.
How about 'bool priv_stack_requested' will be used by sched-ext only
and patch 6 largely stays as-is.
While patch 1 drops the introduction of priv_stack_requested flag.
Instead 'bool jits_use_priv_stack' is introduced in the patch 2
and used by JITs to allocate priv stack.
I know we use 'jit_requested' to tell JITs to jit it,
so we can bike shed on alternative ways to name these two flags.
> return 0;
> }
>
> @@ -20211,6 +20215,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>
> func[i]->aux->name[0] = 'F';
> func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
> + if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
> + func[i]->aux->priv_stack_requested = true;
> +
> func[i]->jit_requested = 1;
> func[i]->blinding_requested = prog->blinding_requested;
> func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
> --
> 2.43.5
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs
2024-11-08 19:11 ` Alexei Starovoitov
@ 2024-11-08 21:41 ` Yonghong Song
0 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-08 21:41 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Kernel Team, Martin KaFai Lau, Tejun Heo
On 11/8/24 11:11 AM, Alexei Starovoitov wrote:
> On Wed, Nov 6, 2024 at 6:42 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 2284b909b499..09bb9dc939d6 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -6278,6 +6278,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>> return ret;
>> }
>> }
>> +
> and patch 6 adds this line here:
> + env->prog->aux->priv_stack_requested = false;
>
>> + if (si[0].priv_stack_mode == PRIV_STACK_ADAPTIVE)
>> + env->prog->aux->priv_stack_requested = true;
>> +
> which makes the above hard to reason about.
>
> I think the root of the problem is the dual meaning of
> the priv_stack_requested flag.
> On one side it's a way for sched-ext to ask bpf core to enable priv stack,
> and on the other side it's a request from bpf core to bpf jit
> to allocate it.
You are right. I use the same priv_stack_requested to do two things.
>
> I think it's better to split these two conditions.
> Extra bool is cheap.
>
> How about 'bool priv_stack_requested' will be used by sched-ext only
> and patch 6 largely stays as-is.
>
> While patch 1 drops the introduction of priv_stack_requested flag.
> Instead 'bool jits_use_priv_stack' is introduced in the patch 2
> and used by JITs to allocate priv stack.
>
> I know we use 'jit_requested' to tell JITs to jit it,
> so we can bike shed on alternative ways to name these two flags.
Two bool's approach sound good to me. As you mentioned, two bool's
are not expensive and can make logic cleaner. Will do in the next
revision.
>
>> return 0;
>> }
>>
>> @@ -20211,6 +20215,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>
>> func[i]->aux->name[0] = 'F';
>> func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
>> + if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
>> + func[i]->aux->priv_stack_requested = true;
>> +
>> func[i]->jit_requested = 1;
>> func[i]->blinding_requested = prog->blinding_requested;
>> func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
>> --
>> 2.43.5
>>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH bpf-next v10 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
2024-11-07 2:41 ` [PATCH bpf-next v10 1/7] bpf: Find eligible subprogs for private stack support Yonghong Song
2024-11-07 2:41 ` [PATCH bpf-next v10 2/7] bpf: Enable private stack for eligible subprogs Yonghong Song
@ 2024-11-07 2:41 ` Yonghong Song
2024-11-07 2:41 ` [PATCH bpf-next v10 4/7] bpf, x86: Support private stack in jit Yonghong Song
` (3 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:41 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Refactor the code to avoid repeated usage of bpf_prog->aux->stack_depth
in do_jit() func. If the private stack is used, the stack_depth will be
0 for that prog. Refactoring make it easy to adjust stack_depth.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
arch/x86/net/bpf_jit_comp.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..3ff638c37999 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1425,14 +1425,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
+ u32 stack_depth;
int err;
+ stack_depth = bpf_prog->aux->stack_depth;
+
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
detect_reg_usage(insn, insn_cnt, callee_regs_used);
- emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ emit_prologue(&prog, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
/* Exception callback will clobber callee regs for its own use, and
@@ -2128,7 +2131,7 @@ st: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
- LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
+ LOAD_TAIL_CALL_CNT_PTR(stack_depth);
ip += 7;
}
if (!imm32)
@@ -2145,13 +2148,13 @@ st: if (is_imm8(insn->off))
&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
ctx);
else
emit_bpf_tail_call_indirect(bpf_prog,
&prog,
callee_regs_used,
- bpf_prog->aux->stack_depth,
+ stack_depth,
image + addrs[i - 1],
ctx);
break;
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH bpf-next v10 4/7] bpf, x86: Support private stack in jit
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (2 preceding siblings ...)
2024-11-07 2:41 ` [PATCH bpf-next v10 3/7] bpf, x86: Avoid repeated usage of bpf_prog->aux->stack_depth Yonghong Song
@ 2024-11-07 2:41 ` Yonghong Song
2024-11-07 2:42 ` [PATCH bpf-next v10 5/7] selftests/bpf: Add tracing prog private stack tests Yonghong Song
` (2 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:41 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Private stack is allocated in function bpf_int_jit_compile() with
alignment 16. The x86 register 9 (X86_REG_R9) is used to replace
bpf frame register (BPF_REG_10). The private stack is used per
subprog per cpu. The X86_REG_R9 is saved and restored around every
func call (not including tailcall) to maintain correctness of
X86_REG_R9.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
arch/x86/net/bpf_jit_comp.c | 77 +++++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 1 +
2 files changed, 78 insertions(+)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3ff638c37999..a96ab2fac2b9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@ struct jit_context {
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
+static void push_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x51); /* push r9 */
+ *pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x59); /* pop r9 */
+ *pprog = prog;
+}
+
static void push_r12(u8 **pprog)
{
u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
*pprog = prog;
}
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+ u8 *prog = *pprog;
+
+ /* movabs r9, priv_frame_ptr */
+ emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+ (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+ /* add <r9>, gs:[<off>] */
+ EMIT2(0x65, 0x4c);
+ EMIT3(0x03, 0x0c, 0x25);
+ EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+ *pprog = prog;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
#define __LOAD_TCC_PTR(off) \
@@ -1421,6 +1455,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int insn_cnt = bpf_prog->len;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ void __percpu *priv_frame_ptr = NULL;
u64 arena_vm_start, user_vm_start;
int i, excnt = 0;
int ilen, proglen = 0;
@@ -1429,6 +1464,10 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
int err;
stack_depth = bpf_prog->aux->stack_depth;
+ if (bpf_prog->aux->priv_stack_ptr) {
+ priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
+ stack_depth = 0;
+ }
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
@@ -1457,6 +1496,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
+ if (priv_frame_ptr)
+ emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
ilen = prog - temp;
if (rw_image)
memcpy(rw_image + proglen, temp, ilen);
@@ -1476,6 +1518,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 *func;
int nops;
+ if (priv_frame_ptr) {
+ if (src_reg == BPF_REG_FP)
+ src_reg = X86_REG_R9;
+
+ if (dst_reg == BPF_REG_FP)
+ dst_reg = X86_REG_R9;
+ }
+
switch (insn->code) {
/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
@@ -2136,9 +2186,15 @@ st: if (is_imm8(insn->off))
}
if (!imm32)
return -EINVAL;
+ if (priv_frame_ptr) {
+ push_r9(&prog);
+ ip += 2;
+ }
ip += x86_call_depth_emit_accounting(&prog, func, ip);
if (emit_call(&prog, func, ip))
return -EINVAL;
+ if (priv_frame_ptr)
+ pop_r9(&prog);
break;
}
@@ -3323,6 +3379,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
+ void __percpu *priv_stack_ptr = NULL;
struct x64_jit_data *jit_data;
int proglen, oldproglen = 0;
struct jit_context ctx = {};
@@ -3359,6 +3416,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->priv_stack_requested) {
+ priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
+ if (!priv_stack_ptr) {
+ prog = orig_prog;
+ goto out_priv_stack;
+ }
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
addrs = jit_data->addrs;
if (addrs) {
ctx = jit_data->ctx;
@@ -3494,6 +3560,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
kvfree(addrs);
+ if (!image && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -3547,6 +3618,7 @@ void bpf_jit_free(struct bpf_prog *prog)
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ free_percpu(prog->aux->priv_stack_ptr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
@@ -3562,6 +3634,11 @@ bool bpf_jit_supports_exceptions(void)
return IS_ENABLED(CONFIG_UNWINDER_ORC);
}
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
#if defined(CONFIG_UNWINDER_ORC)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index dc1e58b90493..644ad35b88ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1507,6 +1507,7 @@ struct bpf_prog_aux {
u32 max_rdwr_access;
struct btf *attach_btf;
const struct bpf_ctx_arg_aux *ctx_arg_info;
+ void __percpu *priv_stack_ptr;
struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
struct bpf_prog *dst_prog;
struct bpf_trampoline *dst_trampoline;
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH bpf-next v10 5/7] selftests/bpf: Add tracing prog private stack tests
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (3 preceding siblings ...)
2024-11-07 2:41 ` [PATCH bpf-next v10 4/7] bpf, x86: Support private stack in jit Yonghong Song
@ 2024-11-07 2:42 ` Yonghong Song
2024-11-07 2:42 ` [PATCH bpf-next v10 6/7] bpf: Support private stack for struct_ops progs Yonghong Song
2024-11-07 2:42 ` [PATCH bpf-next v10 7/7] selftests/bpf: Add struct_ops prog private stack tests Yonghong Song
6 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:42 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Some private stack tests are added including:
- main prog only with stack size greater than BPF_PSTACK_MIN_SIZE.
- main prog only with stack size smaller than BPF_PSTACK_MIN_SIZE.
- prog with one subprog having MAX_BPF_STACK stack size and another
subprog having non-zero small stack size.
- prog with callback function.
- prog with exception in main prog or subprog.
- prog with async callback without nesting
- prog with async callback with possible nesting
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
.../selftests/bpf/prog_tests/verifier.c | 2 +
.../bpf/progs/verifier_private_stack.c | 272 ++++++++++++++++++
2 files changed, 274 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/verifier_private_stack.c
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 75f7a2ce334b..d9f65adb456b 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -61,6 +61,7 @@
#include "verifier_or_jmp32_k.skel.h"
#include "verifier_precision.skel.h"
#include "verifier_prevent_map_lookup.skel.h"
+#include "verifier_private_stack.skel.h"
#include "verifier_raw_stack.skel.h"
#include "verifier_raw_tp_writable.skel.h"
#include "verifier_reg_equal.skel.h"
@@ -188,6 +189,7 @@ void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); }
void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); }
void test_verifier_precision(void) { RUN(verifier_precision); }
void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); }
+void test_verifier_private_stack(void) { RUN(verifier_private_stack); }
void test_verifier_raw_stack(void) { RUN(verifier_raw_stack); }
void test_verifier_raw_tp_writable(void) { RUN(verifier_raw_tp_writable); }
void test_verifier_reg_equal(void) { RUN(verifier_reg_equal); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
new file mode 100644
index 000000000000..b1fbdf119553
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+/* From include/linux/filter.h */
+#define MAX_BPF_STACK 512
+
+#if defined(__TARGET_ARCH_x86)
+
+struct elem {
+ struct bpf_timer t;
+ char pad[256];
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct elem);
+} array SEC(".maps");
+
+SEC("kprobe")
+__description("Private stack, single prog")
+__success
+__arch_x86_64
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x100(%r9)")
+__naked void private_stack_single_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 256) = r1; \
+ r0 = 0; \
+ exit; \
+" ::: __clobber_all);
+}
+
+SEC("raw_tp")
+__description("No private stack")
+__success
+__arch_x86_64
+__jited(" subq $0x8, %rsp")
+__naked void no_private_stack_nested(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 8) = r1; \
+ r0 = 0; \
+ exit; \
+" ::: __clobber_all);
+}
+
+__used
+__naked static void cumulative_stack_depth_subprog(void)
+{
+ asm volatile (" \
+ r1 = 41; \
+ *(u64 *)(r10 - 32) = r1; \
+ call %[bpf_get_smp_processor_id]; \
+ exit; \
+" :
+ : __imm(bpf_get_smp_processor_id)
+ : __clobber_all);
+}
+
+SEC("kprobe")
+__description("Private stack, subtree > MAX_BPF_STACK")
+__success
+__arch_x86_64
+/* private stack fp for the main prog */
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq 0x{{.*}}")
+__jited(" popq %r9")
+__jited(" xorl %eax, %eax")
+__naked void private_stack_nested_1(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - %[max_bpf_stack]) = r1; \
+ call cumulative_stack_depth_subprog; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm_const(max_bpf_stack, MAX_BPF_STACK)
+ : __clobber_all);
+}
+
+__naked __noinline __used
+static unsigned long loop_callback(void)
+{
+ asm volatile (" \
+ call %[bpf_get_prandom_u32]; \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+ call cumulative_stack_depth_subprog; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_common);
+}
+
+SEC("raw_tp")
+__description("Private stack, callback")
+__success
+__arch_x86_64
+/* for func loop_callback */
+__jited("func #1")
+__jited(" endbr64")
+__jited(" nopl (%rax,%rax)")
+__jited(" nopl (%rax)")
+__jited(" pushq %rbp")
+__jited(" movq %rsp, %rbp")
+__jited(" endbr64")
+__jited(" movabsq $0x{{.*}}, %r9")
+__jited(" addq %gs:0x{{.*}}, %r9")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+__jited(" movl $0x2a, %edi")
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+__naked void private_stack_callback(void)
+{
+ asm volatile (" \
+ r1 = 1; \
+ r2 = %[loop_callback]; \
+ r3 = 0; \
+ r4 = 0; \
+ call %[bpf_loop]; \
+ r0 = 0; \
+ exit; \
+" :
+ : __imm_ptr(loop_callback),
+ __imm(bpf_loop)
+ : __clobber_common);
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in main prog")
+__success __retval(0)
+__arch_x86_64
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+int private_stack_exception_main_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+" ::: __clobber_common);
+
+ bpf_throw(0);
+ return 0;
+}
+
+__used static int subprog_exception(void)
+{
+ bpf_throw(0);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, exception in subprog")
+__success __retval(0)
+__arch_x86_64
+__jited(" movq %rdi, -0x200(%r9)")
+__jited(" pushq %r9")
+__jited(" callq")
+__jited(" popq %r9")
+int private_stack_exception_sub_prog(void)
+{
+ asm volatile (" \
+ r1 = 42; \
+ *(u64 *)(r10 - 512) = r1; \
+ call subprog_exception; \
+" ::: __clobber_common);
+
+ return 0;
+}
+
+int glob;
+__noinline static void subprog2(int *val)
+{
+ glob += val[0] * 2;
+}
+
+__noinline static void subprog1(int *val)
+{
+ int tmp[64] = {};
+
+ tmp[0] = *val;
+ subprog2(tmp);
+}
+
+__noinline static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
+{
+ subprog1(key);
+ return 0;
+}
+
+__noinline static int timer_cb2(void *map, int *key, struct bpf_timer *timer)
+{
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, not nested")
+__success __retval(0)
+__arch_x86_64
+__jited(" movabsq $0x{{.*}}, %r9")
+int private_stack_async_callback_1(void)
+{
+ struct bpf_timer *arr_timer;
+ int array_key = 0;
+
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+
+ bpf_timer_init(arr_timer, &array, 1);
+ bpf_timer_set_callback(arr_timer, timer_cb2);
+ bpf_timer_start(arr_timer, 0, 0);
+ subprog1(&array_key);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test9")
+__description("Private stack, async callback, potential nesting")
+__success __retval(0)
+__arch_x86_64
+__jited(" subq $0x100, %rsp")
+int private_stack_async_callback_2(void)
+{
+ struct bpf_timer *arr_timer;
+ int array_key = 0;
+
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+
+ bpf_timer_init(arr_timer, &array, 1);
+ bpf_timer_set_callback(arr_timer, timer_cb1);
+ bpf_timer_start(arr_timer, 0, 0);
+ subprog1(&array_key);
+ return 0;
+}
+
+#else
+
+SEC("kprobe")
+__description("private stack is not supported, use a dummy test")
+__success
+int dummy_test(void)
+{
+ return 0;
+}
+
+#endif
+
+char _license[] SEC("license") = "GPL";
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH bpf-next v10 6/7] bpf: Support private stack for struct_ops progs
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (4 preceding siblings ...)
2024-11-07 2:42 ` [PATCH bpf-next v10 5/7] selftests/bpf: Add tracing prog private stack tests Yonghong Song
@ 2024-11-07 2:42 ` Yonghong Song
2024-11-07 19:20 ` Tejun Heo
2024-11-07 2:42 ` [PATCH bpf-next v10 7/7] selftests/bpf: Add struct_ops prog private stack tests Yonghong Song
6 siblings, 1 reply; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:42 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
For struct_ops progs, whether a particular prog uses private stack
depends on prog->aux->priv_stack_requested setting before actual
insn-level verification for that prog. One particular implementation
is to piggyback on struct_ops->check_member(). The next patch has
an example for this. The struct_ops->check_member() sets
prog->aux->priv_stack_requested to be true which enables private stack
usage.
The struct_ops prog follows the same rule as kprobe/tracing progs after
function bpf_enable_priv_stack(). For example, even a struct_ops prog
requests private stack, it could still use normal kernel stack if
the stack size is small (< 64 bytes).
The prog->aux->priv_stack_requested is also used for recursion checking
for struct_ops progs. Similar to tracing progs, nested same cpu same
prog run will be skipped. A field (recursion_detected()) is added to
bpf_prog_aux structure. If bpf_prog->aux->recursion_detected
is implemented by the struct_ops subsystem and nested same cpu/prog
happens, the function will be triggered to report an error, collect
related info, etc.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
include/linux/bpf.h | 1 +
include/linux/bpf_verifier.h | 1 +
kernel/bpf/trampoline.c | 4 ++++
kernel/bpf/verifier.c | 6 ++++++
4 files changed, 12 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 644ad35b88ec..938e9c4fa9b1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1528,6 +1528,7 @@ struct bpf_prog_aux {
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
struct bpf_arena *arena;
+ void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 456fd2265345..6c1a173c30e6 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -895,6 +895,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
case BPF_PROG_TYPE_TRACING:
return prog->expected_attach_type != BPF_TRACE_ITER;
case BPF_PROG_TYPE_STRUCT_OPS:
+ return prog->aux->priv_stack_requested;
case BPF_PROG_TYPE_LSM:
return false;
default:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9f36c049f4c2..a8d188b31da5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 09bb9dc939d6..d0e5038a844a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6279,6 +6279,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
}
}
+ env->prog->aux->priv_stack_requested = false;
if (si[0].priv_stack_mode == PRIV_STACK_ADAPTIVE)
env->prog->aux->priv_stack_requested = true;
@@ -21986,6 +21987,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
}
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
/* btf_ctx_access() used this to provide argument type info */
prog->aux->ctx_arg_info =
st_ops_desc->arg_info[member_idx].info;
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread* Re: [PATCH bpf-next v10 6/7] bpf: Support private stack for struct_ops progs
2024-11-07 2:42 ` [PATCH bpf-next v10 6/7] bpf: Support private stack for struct_ops progs Yonghong Song
@ 2024-11-07 19:20 ` Tejun Heo
0 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2024-11-07 19:20 UTC (permalink / raw)
To: Yonghong Song
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
kernel-team, Martin KaFai Lau
On Wed, Nov 06, 2024 at 06:42:09PM -0800, Yonghong Song wrote:
> For struct_ops progs, whether a particular prog uses private stack
> depends on prog->aux->priv_stack_requested setting before actual
> insn-level verification for that prog. One particular implementation
> is to piggyback on struct_ops->check_member(). The next patch has
> an example for this. The struct_ops->check_member() sets
> prog->aux->priv_stack_requested to be true which enables private stack
> usage.
>
> The struct_ops prog follows the same rule as kprobe/tracing progs after
> function bpf_enable_priv_stack(). For example, even a struct_ops prog
> requests private stack, it could still use normal kernel stack if
> the stack size is small (< 64 bytes).
>
> The prog->aux->priv_stack_requested is also used for recursion checking
> for struct_ops progs. Similar to tracing progs, nested same cpu same
> prog run will be skipped. A field (recursion_detected()) is added to
> bpf_prog_aux structure. If bpf_prog->aux->recursion_detected
> is implemented by the struct_ops subsystem and nested same cpu/prog
> happens, the function will be triggered to report an error, collect
> related info, etc.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
From user's POV, this looks great:
Acked-by: Tejun Heo <tj@kernel.org>
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH bpf-next v10 7/7] selftests/bpf: Add struct_ops prog private stack tests
2024-11-07 2:41 [PATCH bpf-next v10 0/7] bpf: Support private stack for bpf progs Yonghong Song
` (5 preceding siblings ...)
2024-11-07 2:42 ` [PATCH bpf-next v10 6/7] bpf: Support private stack for struct_ops progs Yonghong Song
@ 2024-11-07 2:42 ` Yonghong Song
6 siblings, 0 replies; 11+ messages in thread
From: Yonghong Song @ 2024-11-07 2:42 UTC (permalink / raw)
To: bpf
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
Martin KaFai Lau, Tejun Heo
Add three tests for struct_ops using private stack.
./test_progs -t struct_ops_private_stack
#333/1 struct_ops_private_stack/private_stack:OK
#333/2 struct_ops_private_stack/private_stack_fail:OK
#333/3 struct_ops_private_stack/private_stack_recur:OK
#333 struct_ops_private_stack:OK
The following is a snippet of a struct_ops check_member() implementation:
u32 moff = __btf_member_bit_offset(t, member) / 8;
switch (moff) {
case offsetof(struct bpf_testmod_ops3, test_1):
prog->aux->priv_stack_requested = true;
prog->aux->recursion_detected = test_1_recursion_detected;
fallthrough;
default:
break;
}
return 0;
The first test is with nested two different callback functions where the
first prog has more than 512 byte stack size (including subprogs) with
private stack enabled.
The second test is a negative test where the second prog has more than 512
byte stack size without private stack enabled.
The third test is the same callback function recursing itself. At run time,
the jit trampoline recursion check kicks in to prevent the recursion. The
recursion_detected() callback function is implemented by the bpf_testmod,
the following message in dmesg
bpf_testmod: oh no, recursing into test_1, recursion_misses 1
demonstrates the callback function is indeed triggered when recursion miss
happens.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
.../selftests/bpf/bpf_testmod/bpf_testmod.c | 104 +++++++++++++++++
.../selftests/bpf/bpf_testmod/bpf_testmod.h | 5 +
.../bpf/prog_tests/struct_ops_private_stack.c | 106 ++++++++++++++++++
.../bpf/progs/struct_ops_private_stack.c | 62 ++++++++++
.../bpf/progs/struct_ops_private_stack_fail.c | 62 ++++++++++
.../progs/struct_ops_private_stack_recur.c | 50 +++++++++
6 files changed, 389 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 987d41af71d2..cc9dde507aba 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -245,6 +245,39 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx)
call_rcu(&ctx->rcu, testmod_free_cb);
}
+static struct bpf_testmod_ops3 *st_ops3;
+
+static int bpf_testmod_test_3(void)
+{
+ return 0;
+}
+
+static int bpf_testmod_test_4(void)
+{
+ return 0;
+}
+
+static struct bpf_testmod_ops3 __bpf_testmod_ops3 = {
+ .test_1 = bpf_testmod_test_3,
+ .test_2 = bpf_testmod_test_4,
+};
+
+static void bpf_testmod_test_struct_ops3(void)
+{
+ if (st_ops3)
+ st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_1(void)
+{
+ st_ops3->test_1();
+}
+
+__bpf_kfunc void bpf_testmod_ops3_call_test_2(void)
+{
+ st_ops3->test_2();
+}
+
struct bpf_testmod_btf_type_tag_1 {
int a;
};
@@ -382,6 +415,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
(void)trace_bpf_testmod_test_raw_tp_null(NULL);
+ bpf_testmod_test_struct_ops3();
+
struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) +
sizeof(int)), GFP_KERNEL);
if (struct_arg3 != NULL) {
@@ -586,6 +621,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU)
BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1)
+BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2)
BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
BTF_ID_LIST(bpf_testmod_dtor_ids)
@@ -1096,6 +1133,10 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = {
.is_valid_access = bpf_testmod_ops_is_valid_access,
};
+static const struct bpf_verifier_ops bpf_testmod_verifier_ops3 = {
+ .is_valid_access = bpf_testmod_ops_is_valid_access,
+};
+
static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
{
struct bpf_testmod_ops *ops = kdata;
@@ -1175,6 +1216,68 @@ struct bpf_struct_ops bpf_testmod_ops2 = {
.owner = THIS_MODULE,
};
+static int st_ops3_reg(void *kdata, struct bpf_link *link)
+{
+ int err = 0;
+
+ mutex_lock(&st_ops_mutex);
+ if (st_ops3) {
+ pr_err("st_ops has already been registered\n");
+ err = -EEXIST;
+ goto unlock;
+ }
+ st_ops3 = kdata;
+
+unlock:
+ mutex_unlock(&st_ops_mutex);
+ return err;
+}
+
+static void st_ops3_unreg(void *kdata, struct bpf_link *link)
+{
+ mutex_lock(&st_ops_mutex);
+ st_ops3 = NULL;
+ mutex_unlock(&st_ops_mutex);
+}
+
+static void test_1_recursion_detected(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+
+ stats = this_cpu_ptr(prog->stats);
+ printk("bpf_testmod: oh no, recursing into test_1, recursion_misses %llu",
+ u64_stats_read(&stats->misses));
+}
+
+static int st_ops3_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_testmod_ops3, test_1):
+ prog->aux->priv_stack_requested = true;
+ prog->aux->recursion_detected = test_1_recursion_detected;
+ fallthrough;
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct bpf_struct_ops bpf_testmod_ops3 = {
+ .verifier_ops = &bpf_testmod_verifier_ops3,
+ .init = bpf_testmod_ops_init,
+ .init_member = bpf_testmod_ops_init_member,
+ .reg = st_ops3_reg,
+ .unreg = st_ops3_unreg,
+ .check_member = st_ops3_check_member,
+ .cfi_stubs = &__bpf_testmod_ops3,
+ .name = "bpf_testmod_ops3",
+ .owner = THIS_MODULE,
+};
+
static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args)
{
return 0;
@@ -1333,6 +1436,7 @@ static int bpf_testmod_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set);
ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops);
ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2);
+ ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3);
ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops);
ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors,
ARRAY_SIZE(bpf_testmod_dtors),
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
index fb7dff47597a..356803d1c10e 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
@@ -94,6 +94,11 @@ struct bpf_testmod_ops2 {
int (*test_1)(void);
};
+struct bpf_testmod_ops3 {
+ int (*test_1)(void);
+ int (*test_2)(void);
+};
+
struct st_ops_args {
u64 a;
};
diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
new file mode 100644
index 000000000000..4006879ca3fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "struct_ops_private_stack.skel.h"
+#include "struct_ops_private_stack_fail.skel.h"
+#include "struct_ops_private_stack_recur.skel.h"
+
+static void test_private_stack(void)
+{
+ struct struct_ops_private_stack *skel;
+ struct bpf_link *link;
+ int err;
+
+ skel = struct_ops_private_stack__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack__load(skel);
+ if (!ASSERT_OK(err, "struct_ops_private_stack__load"))
+ goto cleanup;
+
+ link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+ if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+ goto cleanup;
+
+ ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+ ASSERT_EQ(skel->bss->val_i, 3, "val_i");
+ ASSERT_EQ(skel->bss->val_j, 8, "val_j");
+
+ bpf_link__destroy(link);
+
+cleanup:
+ struct_ops_private_stack__destroy(skel);
+}
+
+static void test_private_stack_fail(void)
+{
+ struct struct_ops_private_stack_fail *skel;
+ int err;
+
+ skel = struct_ops_private_stack_fail__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_fail__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack_fail__load(skel);
+ if (!ASSERT_ERR(err, "struct_ops_private_stack_fail__load"))
+ goto cleanup;
+ return;
+
+cleanup:
+ struct_ops_private_stack_fail__destroy(skel);
+}
+
+static void test_private_stack_recur(void)
+{
+ struct struct_ops_private_stack_recur *skel;
+ struct bpf_link *link;
+ int err;
+
+ skel = struct_ops_private_stack_recur__open();
+ if (!ASSERT_OK_PTR(skel, "struct_ops_private_stack_recur__open"))
+ return;
+
+ if (skel->data->skip) {
+ test__skip();
+ goto cleanup;
+ }
+
+ err = struct_ops_private_stack_recur__load(skel);
+ if (!ASSERT_OK(err, "struct_ops_private_stack_recur__load"))
+ goto cleanup;
+
+ link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+ if (!ASSERT_OK_PTR(link, "attach_struct_ops"))
+ goto cleanup;
+
+ ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+ ASSERT_EQ(skel->bss->val_j, 3, "val_j");
+
+ bpf_link__destroy(link);
+
+cleanup:
+ struct_ops_private_stack_recur__destroy(skel);
+}
+
+void test_struct_ops_private_stack(void)
+{
+ if (test__start_subtest("private_stack"))
+ test_private_stack();
+ if (test__start_subtest("private_stack_fail"))
+ test_private_stack_fail();
+ if (test__start_subtest("private_stack_recur"))
+ test_private_stack_recur();
+}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
new file mode 100644
index 000000000000..8ea57e5348ab
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 200 bytes */
+ int b[50] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 400 bytes */
+ int a[100] = {};
+
+ a[10] = 1;
+ val_i = subprog1(a);
+ bpf_testmod_ops3_call_test_2();
+ return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+ /* stack size 200 bytes */
+ int a[50] = {};
+
+ a[10] = 3;
+ val_j = subprog1(a);
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+ .test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
new file mode 100644
index 000000000000..1f55ec4cee37
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_fail.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_2(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 200 bytes */
+ int b[50] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 100 bytes */
+ int a[25] = {};
+
+ a[10] = 1;
+ val_i = subprog1(a);
+ bpf_testmod_ops3_call_test_2();
+ return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_2)
+{
+ /* stack size 400 bytes */
+ int a[100] = {};
+
+ a[10] = 3;
+ val_j = subprog1(a);
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+ .test_2 = (void *)test_2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
new file mode 100644
index 000000000000..15d4e914dc92
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_private_stack_recur.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#if defined(__TARGET_ARCH_x86)
+bool skip __attribute((__section__(".data"))) = false;
+#else
+bool skip = true;
+#endif
+
+void bpf_testmod_ops3_call_test_1(void) __ksym;
+
+int val_i, val_j;
+
+__noinline static int subprog2(int *a, int *b)
+{
+ return val_i + a[10] + b[20];
+}
+
+__noinline static int subprog1(int *a)
+{
+ /* stack size 400 bytes */
+ int b[100] = {};
+
+ b[20] = 2;
+ return subprog2(a, b);
+}
+
+
+SEC("struct_ops")
+int BPF_PROG(test_1)
+{
+ /* stack size 400 bytes */
+ int a[100] = {};
+
+ a[10] = 1;
+ val_j += subprog1(a);
+ bpf_testmod_ops3_call_test_1();
+ return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_testmod_ops3 testmod_1 = {
+ .test_1 = (void *)test_1,
+};
--
2.43.5
^ permalink raw reply related [flat|nested] 11+ messages in thread