From: Yonghong Song <yonghong.song@linux.dev>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: bpf <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
Andrii Nakryiko <andrii@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
Kernel Team <kernel-team@fb.com>,
Martin KaFai Lau <martin.lau@kernel.org>,
Tejun Heo <tj@kernel.org>
Subject: Re: [PATCH bpf-next v6 1/9] bpf: Allow each subprog having stack size of 512 bytes
Date: Mon, 21 Oct 2024 20:21:01 -0700 [thread overview]
Message-ID: <c6e5040b-9558-481f-b1fc-f77dc9ce90c1@linux.dev> (raw)
In-Reply-To: <CAADnVQ+ZXMh_QKy0nd-n7my1SETroockPjpVVJOAWsE3tB_5sg@mail.gmail.com>
On 10/21/24 6:18 PM, Alexei Starovoitov wrote:
> On Sun, Oct 20, 2024 at 12:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> With private stack support, each subprog can have stack with up to 512
>> bytes. The limit of 512 bytes per subprog is kept to avoid increasing
>> verifier complexity since greater than 512 bytes will cause big verifier
>> change and increase memory consumption and verification time.
>>
>> If private stack is supported, for a bpf prog, esp. when it has
>> subprogs, private stack will be allocated for the main prog
>> and for each callback subprog. For example,
>> main_prog
>> subprog1
>> calling helper
>> subprog10 (callback func)
>> subprog11
>> subprog2
>> calling helper
>> subprog10 (callback func)
>> subprog11
>>
>> Separate private allocations for main_prog and callback_fn subprog10
>> will make things easier since the helper function uses the kernel stack.
>>
>> In this patch, some tracing programs are allowed to use private
>> stack since tracing prog may be triggered in the middle of some other
>> prog runs. Additional subprog info is also collected for later to
>> allocate private stack for main prog and each callback functions.
>>
>> Note that if any tail_call is called in the prog (including all subprogs),
>> then private stack is not used.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>> include/linux/bpf.h | 1 +
>> include/linux/bpf_verifier.h | 3 ++
>> include/linux/filter.h | 1 +
>> kernel/bpf/core.c | 5 ++
>> kernel/bpf/verifier.c | 100 ++++++++++++++++++++++++++++++-----
>> 5 files changed, 97 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 0c216e71cec7..6ad8ace7075a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1490,6 +1490,7 @@ struct bpf_prog_aux {
>> bool exception_cb;
>> bool exception_boundary;
>> bool is_extended; /* true if extended by freplace program */
>> + bool priv_stack_eligible;
>> u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>> struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>> struct bpf_arena *arena;
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 4513372c5bc8..bcfe868e3801 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>> * are used for bpf_fastcall spills and fills.
>> */
>> s16 fastcall_stack_off;
>> + u16 subtree_stack_depth;
>> + u16 subtree_top_idx;
>> bool has_tail_call: 1;
>> bool tail_call_reachable: 1;
>> bool has_ld_abs: 1;
>> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>> bool args_cached: 1;
>> /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>> bool keep_fastcall_stack: 1;
>> + bool priv_stack_eligible: 1;
>>
>> u8 arg_cnt;
>> struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 7d7578a8eac1..3a21947f2fd4 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>> bool bpf_jit_supports_ptr_xchg(void);
>> bool bpf_jit_supports_arena(void);
>> bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
>> +bool bpf_jit_supports_private_stack(void);
>> u64 bpf_arch_uaddress_limit(void);
>> void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>> bool bpf_helper_changes_pkt_data(void *func);
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 233ea78f8f1b..14d9288441f2 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>> return false;
>> }
>>
>> +bool __weak bpf_jit_supports_private_stack(void)
>> +{
>> + return false;
>> +}
>> +
>> void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>> {
>> }
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index f514247ba8ba..45bea4066272 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>>
>> #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
>>
>> +#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE 128
>> +
>> static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>> static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>> static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
>> @@ -5982,6 +5984,41 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>> strict);
>> }
>>
>> +static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
>> +{
>> + if (!bpf_jit_supports_private_stack())
>> + return false;
>> +
>> + switch (env->prog->type) {
>> + case BPF_PROG_TYPE_KPROBE:
>> + case BPF_PROG_TYPE_TRACEPOINT:
>> + case BPF_PROG_TYPE_PERF_EVENT:
>> + case BPF_PROG_TYPE_RAW_TRACEPOINT:
>> + return true;
>> + case BPF_PROG_TYPE_TRACING:
>> + if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> + return true;
>> + fallthrough;
>> + default:
>> + return false;
>> + }
>> +}
>> +
>> +static bool is_priv_stack_supported(struct bpf_verifier_env *env)
>> +{
>> + struct bpf_subprog_info *si = env->subprog_info;
>> + bool has_tail_call = false;
>> +
>> + for (int i = 0; i < env->subprog_cnt; i++) {
>> + if (si[i].has_tail_call) {
>> + has_tail_call = true;
>> + break;
>> + }
>> + }
>> +
>> + return !has_tail_call && bpf_enable_private_stack(env);
>> +}
>> +
>> static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>> {
>> if (env->prog->jit_requested)
>> @@ -5999,16 +6036,21 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>> * Since recursion is prevented by check_cfg() this algorithm
>> * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>> */
>> -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>> + bool check_priv_stack, bool priv_stack_supported)
>> {
>> struct bpf_subprog_info *subprog = env->subprog_info;
>> struct bpf_insn *insn = env->prog->insnsi;
>> int depth = 0, frame = 0, i, subprog_end;
>> bool tail_call_reachable = false;
>> + bool priv_stack_eligible = false;
>> int ret_insn[MAX_CALL_FRAMES];
>> int ret_prog[MAX_CALL_FRAMES];
>> - int j;
>> + int j, subprog_stack_depth;
>> + int orig_idx = idx;
>>
>> + if (check_priv_stack)
>> + subprog[idx].subtree_top_idx = idx;
>> i = subprog[idx].start;
>> process_func:
>> /* protect against potential stack overflow that might happen when
>> @@ -6030,18 +6072,33 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> * tailcall will unwind the current stack frame but it will not get rid
>> * of caller's stack as shown on the example above.
>> */
>> - if (idx && subprog[idx].has_tail_call && depth >= 256) {
>> + if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
>> verbose(env,
>> "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
>> depth);
>> return -EACCES;
>> }
>> - depth += round_up_stack_depth(env, subprog[idx].stack_depth);
>> - if (depth > MAX_BPF_STACK) {
>> + subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
>> + depth += subprog_stack_depth;
>> + if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
>> verbose(env, "combined stack size of %d calls is %d. Too large\n",
>> frame + 1, depth);
>> return -EACCES;
>> }
>> + if (check_priv_stack) {
>> + if (subprog_stack_depth > MAX_BPF_STACK) {
>> + verbose(env, "stack size of subprog %d is %d. Too large\n",
>> + idx, subprog_stack_depth);
>> + return -EACCES;
>> + }
>> +
>> + if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
>> + subprog[orig_idx].priv_stack_eligible = true;
>> + env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
>> + }
>> + subprog[orig_idx].subtree_stack_depth =
>> + max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
>> + }
>> continue_func:
>> subprog_end = subprog[idx + 1].start;
>> for (; i < subprog_end; i++) {
>> @@ -6078,6 +6135,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> next_insn = i + insn[i].imm + 1;
>> sidx = find_subprog(env, next_insn);
>> if (sidx < 0) {
>> + /* It is possible that callback func has been removed as dead code after
>> + * instruction rewrites, e.g. bpf_loop with cnt 0.
>> + */
>> + if (check_priv_stack)
>> + continue;
>> +
> and this extra hack only because check_max_stack_depth() will
> be called the 2nd time ?
> Why call it twice at all ?
> Record everything in the first pass.
The individual stack size may increase between check_max_stack_depth() and jit.
So we have to go through second pass to compute precise subtree (prog + subprogs)
stack size, which is needed to allocate percpu private stack.
One thing we could do is to record the (sub)prog<->subprog relations in the first
pass, and then do another pass right before the jit to calculate the subtree stack size.
I guess that is what you suggest?
>
>> WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
>> next_insn);
>> return -EFAULT;
>> @@ -6097,8 +6160,10 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> }
>> i = next_insn;
>> idx = sidx;
>> + if (check_priv_stack)
>> + subprog[idx].subtree_top_idx = orig_idx;
>>
>> - if (subprog[idx].has_tail_call)
>> + if (!check_priv_stack && subprog[idx].has_tail_call)
>> tail_call_reachable = true;
>>
>> frame++;
>> @@ -6122,7 +6187,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> }
>> subprog[ret_prog[j]].tail_call_reachable = true;
>> }
>> - if (subprog[0].tail_call_reachable)
>> + if (!check_priv_stack && subprog[0].tail_call_reachable)
>> env->prog->aux->tail_call_reachable = true;
>>
>> /* end of for() loop means the last insn of the 'subprog'
>> @@ -6137,14 +6202,18 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> goto continue_func;
>> }
>>
>> -static int check_max_stack_depth(struct bpf_verifier_env *env)
>> +static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
>> + bool priv_stack_supported)
>> {
>> struct bpf_subprog_info *si = env->subprog_info;
>> + bool check_subprog;
>> int ret;
>>
>> for (int i = 0; i < env->subprog_cnt; i++) {
>> - if (!i || si[i].is_async_cb) {
>> - ret = check_max_stack_depth_subprog(env, i);
>> + check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> why?
> This looks very suspicious.
This is to simplify jit. For example,
main_prog <=== main_prog_priv_stack_ptr
subprog1 <=== there is a helper which has a callback_fn
<=== for example bpf_for_each_map_elem
callback_fn
subprog2
In callback_fn, we cannot simply do
r9 += stack_size_for_callback_fn
since r9 may have been clobbered between subprog1 and callback_fn.
That is why currently I allocate private_stack separately for callback_fn.
Alternatively we could do
callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
where off equals to (stack size tree main_prog+subprog1).
I can do this approach too with a little more information in prog->aux.
WDYT?
>
>> + if (check_subprog) {
>> + ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
>> + priv_stack_supported);
>> if (ret < 0)
>> return ret;
>> }
>> @@ -22303,7 +22372,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>> struct bpf_verifier_env *env;
>> int i, len, ret = -EINVAL, err;
>> u32 log_true_size;
>> - bool is_priv;
>> + bool is_priv, priv_stack_supported = false;
>>
>> /* no program is valid */
>> if (ARRAY_SIZE(bpf_verifier_ops) == 0)
>> @@ -22430,8 +22499,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>> if (ret == 0)
>> ret = remove_fastcall_spills_fills(env);
>>
>> - if (ret == 0)
>> - ret = check_max_stack_depth(env);
>> + if (ret == 0) {
>> + priv_stack_supported = is_priv_stack_supported(env);
>> + ret = check_max_stack_depth(env, false, priv_stack_supported);
>> + }
>>
>> /* instruction rewrites happen after this point */
>> if (ret == 0)
>> @@ -22465,6 +22536,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>> : false;
>> }
>>
>> + if (ret == 0 && priv_stack_supported)
>> + ret = check_max_stack_depth(env, true, true);
>> +
>> if (ret == 0)
>> ret = fixup_call_args(env);
>>
>> --
>> 2.43.5
>>
next prev parent reply other threads:[~2024-10-22 3:21 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-20 19:13 [PATCH bpf-next v6 0/9] bpf: Support private stack for bpf progs Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 1/9] bpf: Allow each subprog having stack size of 512 bytes Yonghong Song
2024-10-22 1:18 ` Alexei Starovoitov
2024-10-22 3:21 ` Yonghong Song [this message]
2024-10-22 3:43 ` Alexei Starovoitov
2024-10-22 4:08 ` Yonghong Song
2024-10-22 20:13 ` Yonghong Song
2024-10-22 20:41 ` Alexei Starovoitov
2024-10-22 21:29 ` Kumar Kartikeya Dwivedi
2024-10-22 21:36 ` Kumar Kartikeya Dwivedi
2024-10-22 21:43 ` Yonghong Song
2024-10-22 21:57 ` Alexei Starovoitov
2024-10-22 22:41 ` Yonghong Song
2024-10-22 22:59 ` Alexei Starovoitov
2024-10-22 23:53 ` Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 2/9] bpf: Rename bpf_struct_ops_arg_info to bpf_struct_ops_func_info Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 3/9] bpf: Support private stack for struct ops programs Yonghong Song
2024-10-22 1:34 ` Alexei Starovoitov
2024-10-22 2:59 ` Yonghong Song
2024-10-22 17:26 ` Martin KaFai Lau
2024-10-22 20:19 ` Alexei Starovoitov
2024-10-23 21:00 ` Tejun Heo
2024-10-23 23:07 ` Alexei Starovoitov
2024-10-24 0:56 ` Tejun Heo
2024-10-20 19:14 ` [PATCH bpf-next v6 4/9] bpf: Mark each subprog with proper private stack modes Yonghong Song
2024-10-20 22:01 ` Jiri Olsa
2024-10-21 4:22 ` Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 5/9] bpf, x86: Refactor func emit_prologue Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 6/9] bpf, x86: Create a helper for certain "reg <op>= imm" operations Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 7/9] bpf, x86: Add jit support for private stack Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 8/9] selftests/bpf: Add tracing prog private stack tests Yonghong Song
2024-10-20 21:59 ` Jiri Olsa
2024-10-21 4:32 ` Yonghong Song
2024-10-21 10:40 ` Jiri Olsa
2024-10-21 16:19 ` Yonghong Song
2024-10-21 21:13 ` Jiri Olsa
2024-10-20 19:14 ` [PATCH bpf-next v6 9/9] selftests/bpf: Add struct_ops " Yonghong Song
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c6e5040b-9558-481f-b1fc-f77dc9ce90c1@linux.dev \
--to=yonghong.song@linux.dev \
--cc=alexei.starovoitov@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=kernel-team@fb.com \
--cc=martin.lau@kernel.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox