BPF List
 help / color / mirror / Atom feed
From: Yonghong Song <yonghong.song@linux.dev>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: bpf <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Kernel Team <kernel-team@fb.com>,
	Martin KaFai Lau <martin.lau@kernel.org>,
	Tejun Heo <tj@kernel.org>
Subject: Re: [PATCH bpf-next v6 1/9] bpf: Allow each subprog having stack size of 512 bytes
Date: Mon, 21 Oct 2024 20:21:01 -0700	[thread overview]
Message-ID: <c6e5040b-9558-481f-b1fc-f77dc9ce90c1@linux.dev> (raw)
In-Reply-To: <CAADnVQ+ZXMh_QKy0nd-n7my1SETroockPjpVVJOAWsE3tB_5sg@mail.gmail.com>


On 10/21/24 6:18 PM, Alexei Starovoitov wrote:
> On Sun, Oct 20, 2024 at 12:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> With private stack support, each subprog can have stack with up to 512
>> bytes. The limit of 512 bytes per subprog is kept to avoid increasing
>> verifier complexity since greater than 512 bytes will cause big verifier
>> change and increase memory consumption and verification time.
>>
>> If private stack is supported, for a bpf prog, esp. when it has
>> subprogs, private stack will be allocated for the main prog
>> and for each callback subprog. For example,
>>    main_prog
>>      subprog1
>>        calling helper
>>          subprog10 (callback func)
>>            subprog11
>>      subprog2
>>        calling helper
>>          subprog10 (callback func)
>>            subprog11
>>
>> Separate private allocations for main_prog and callback_fn subprog10
>> will make things easier since the helper function uses the kernel stack.
>>
>> In this patch, some tracing programs are allowed to use private
>> stack since tracing prog may be triggered in the middle of some other
>> prog runs. Additional subprog info is also collected for later to
>> allocate private stack for main prog and each callback functions.
>>
>> Note that if any tail_call is called in the prog (including all subprogs),
>> then private stack is not used.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h          |   1 +
>>   include/linux/bpf_verifier.h |   3 ++
>>   include/linux/filter.h       |   1 +
>>   kernel/bpf/core.c            |   5 ++
>>   kernel/bpf/verifier.c        | 100 ++++++++++++++++++++++++++++++-----
>>   5 files changed, 97 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 0c216e71cec7..6ad8ace7075a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1490,6 +1490,7 @@ struct bpf_prog_aux {
>>          bool exception_cb;
>>          bool exception_boundary;
>>          bool is_extended; /* true if extended by freplace program */
>> +       bool priv_stack_eligible;
>>          u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>>          struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>>          struct bpf_arena *arena;
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 4513372c5bc8..bcfe868e3801 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>>           * are used for bpf_fastcall spills and fills.
>>           */
>>          s16 fastcall_stack_off;
>> +       u16 subtree_stack_depth;
>> +       u16 subtree_top_idx;
>>          bool has_tail_call: 1;
>>          bool tail_call_reachable: 1;
>>          bool has_ld_abs: 1;
>> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>>          bool args_cached: 1;
>>          /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>>          bool keep_fastcall_stack: 1;
>> +       bool priv_stack_eligible: 1;
>>
>>          u8 arg_cnt;
>>          struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 7d7578a8eac1..3a21947f2fd4 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>>   bool bpf_jit_supports_ptr_xchg(void);
>>   bool bpf_jit_supports_arena(void);
>>   bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
>> +bool bpf_jit_supports_private_stack(void);
>>   u64 bpf_arch_uaddress_limit(void);
>>   void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>>   bool bpf_helper_changes_pkt_data(void *func);
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 233ea78f8f1b..14d9288441f2 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>>          return false;
>>   }
>>
>> +bool __weak bpf_jit_supports_private_stack(void)
>> +{
>> +       return false;
>> +}
>> +
>>   void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>>   {
>>   }
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index f514247ba8ba..45bea4066272 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>>
>>   #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>>
>> +#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE        128
>> +
>>   static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>>   static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>>   static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
>> @@ -5982,6 +5984,41 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>>                                             strict);
>>   }
>>
>> +static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
>> +{
>> +       if (!bpf_jit_supports_private_stack())
>> +               return false;
>> +
>> +       switch (env->prog->type) {
>> +       case BPF_PROG_TYPE_KPROBE:
>> +       case BPF_PROG_TYPE_TRACEPOINT:
>> +       case BPF_PROG_TYPE_PERF_EVENT:
>> +       case BPF_PROG_TYPE_RAW_TRACEPOINT:
>> +               return true;
>> +       case BPF_PROG_TYPE_TRACING:
>> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> +                       return true;
>> +               fallthrough;
>> +       default:
>> +               return false;
>> +       }
>> +}
>> +
>> +static bool is_priv_stack_supported(struct bpf_verifier_env *env)
>> +{
>> +       struct bpf_subprog_info *si = env->subprog_info;
>> +       bool has_tail_call = false;
>> +
>> +       for (int i = 0; i < env->subprog_cnt; i++) {
>> +               if (si[i].has_tail_call) {
>> +                       has_tail_call = true;
>> +                       break;
>> +               }
>> +       }
>> +
>> +       return !has_tail_call && bpf_enable_private_stack(env);
>> +}
>> +
>>   static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>>   {
>>          if (env->prog->jit_requested)
>> @@ -5999,16 +6036,21 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>>    * Since recursion is prevented by check_cfg() this algorithm
>>    * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>>    */
>> -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>> +                                        bool check_priv_stack, bool priv_stack_supported)
>>   {
>>          struct bpf_subprog_info *subprog = env->subprog_info;
>>          struct bpf_insn *insn = env->prog->insnsi;
>>          int depth = 0, frame = 0, i, subprog_end;
>>          bool tail_call_reachable = false;
>> +       bool priv_stack_eligible = false;
>>          int ret_insn[MAX_CALL_FRAMES];
>>          int ret_prog[MAX_CALL_FRAMES];
>> -       int j;
>> +       int j, subprog_stack_depth;
>> +       int orig_idx = idx;
>>
>> +       if (check_priv_stack)
>> +               subprog[idx].subtree_top_idx = idx;
>>          i = subprog[idx].start;
>>   process_func:
>>          /* protect against potential stack overflow that might happen when
>> @@ -6030,18 +6072,33 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>           * tailcall will unwind the current stack frame but it will not get rid
>>           * of caller's stack as shown on the example above.
>>           */
>> -       if (idx && subprog[idx].has_tail_call && depth >= 256) {
>> +       if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
>>                  verbose(env,
>>                          "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
>>                          depth);
>>                  return -EACCES;
>>          }
>> -       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
>> -       if (depth > MAX_BPF_STACK) {
>> +       subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
>> +       depth += subprog_stack_depth;
>> +       if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
>>                  verbose(env, "combined stack size of %d calls is %d. Too large\n",
>>                          frame + 1, depth);
>>                  return -EACCES;
>>          }
>> +       if (check_priv_stack) {
>> +               if (subprog_stack_depth > MAX_BPF_STACK) {
>> +                       verbose(env, "stack size of subprog %d is %d. Too large\n",
>> +                               idx, subprog_stack_depth);
>> +                       return -EACCES;
>> +               }
>> +
>> +               if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
>> +                       subprog[orig_idx].priv_stack_eligible = true;
>> +                       env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
>> +               }
>> +               subprog[orig_idx].subtree_stack_depth =
>> +                       max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
>> +       }
>>   continue_func:
>>          subprog_end = subprog[idx + 1].start;
>>          for (; i < subprog_end; i++) {
>> @@ -6078,6 +6135,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                  next_insn = i + insn[i].imm + 1;
>>                  sidx = find_subprog(env, next_insn);
>>                  if (sidx < 0) {
>> +                       /* It is possible that callback func has been removed as dead code after
>> +                        * instruction rewrites, e.g. bpf_loop with cnt 0.
>> +                        */
>> +                       if (check_priv_stack)
>> +                               continue;
>> +
> and this extra hack only because check_max_stack_depth() will
> be called the 2nd time ?
> Why call it twice at all ?
> Record everything in the first pass.

The individual stack size may increase between check_max_stack_depth() and jit.
So we have to go through second pass to compute precise subtree (prog + subprogs)
stack size, which is needed to allocate percpu private stack.

One thing we could do is to record the (sub)prog<->subprog relations in the first
pass and right before the jit do another pass to calculate subtree stack size.
I guess that is what you suggest?

>
>>                          WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
>>                                    next_insn);
>>                          return -EFAULT;
>> @@ -6097,8 +6160,10 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                  }
>>                  i = next_insn;
>>                  idx = sidx;
>> +               if (check_priv_stack)
>> +                       subprog[idx].subtree_top_idx = orig_idx;
>>
>> -               if (subprog[idx].has_tail_call)
>> +               if (!check_priv_stack && subprog[idx].has_tail_call)
>>                          tail_call_reachable = true;
>>
>>                  frame++;
>> @@ -6122,7 +6187,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                          }
>>                          subprog[ret_prog[j]].tail_call_reachable = true;
>>                  }
>> -       if (subprog[0].tail_call_reachable)
>> +       if (!check_priv_stack && subprog[0].tail_call_reachable)
>>                  env->prog->aux->tail_call_reachable = true;
>>
>>          /* end of for() loop means the last insn of the 'subprog'
>> @@ -6137,14 +6202,18 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>          goto continue_func;
>>   }
>>
>> -static int check_max_stack_depth(struct bpf_verifier_env *env)
>> +static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
>> +                                bool priv_stack_supported)
>>   {
>>          struct bpf_subprog_info *si = env->subprog_info;
>> +       bool check_subprog;
>>          int ret;
>>
>>          for (int i = 0; i < env->subprog_cnt; i++) {
>> -               if (!i || si[i].is_async_cb) {
>> -                       ret = check_max_stack_depth_subprog(env, i);
>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> why?
> This looks very suspicious.

This is to simplify jit. For example,
    main_prog   <=== main_prog_priv_stack_ptr
      subprog1  <=== there is a helper which has a callback_fn
                <=== for example bpf_for_each_map_elem

        callback_fn
          subprog2

In callback_fn, we cannot simplify do
    r9 += stack_size_for_callback_fn
since r9 may have been clobbered between subprog1 and callback_fn.
That is why currently I allocate private_stack separately for callback_fn.

Alternatively we could do
    callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
where off equals to (stack size tree main_prog+subprog1).
I can do this approach too with a little more information in prog->aux.
WDYT?

>
>> +               if (check_subprog) {
>> +                       ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
>> +                                                           priv_stack_supported);
>>                          if (ret < 0)
>>                                  return ret;
>>                  }
>> @@ -22303,7 +22372,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>          struct bpf_verifier_env *env;
>>          int i, len, ret = -EINVAL, err;
>>          u32 log_true_size;
>> -       bool is_priv;
>> +       bool is_priv, priv_stack_supported = false;
>>
>>          /* no program is valid */
>>          if (ARRAY_SIZE(bpf_verifier_ops) == 0)
>> @@ -22430,8 +22499,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>          if (ret == 0)
>>                  ret = remove_fastcall_spills_fills(env);
>>
>> -       if (ret == 0)
>> -               ret = check_max_stack_depth(env);
>> +       if (ret == 0) {
>> +               priv_stack_supported = is_priv_stack_supported(env);
>> +               ret = check_max_stack_depth(env, false, priv_stack_supported);
>> +       }
>>
>>          /* instruction rewrites happen after this point */
>>          if (ret == 0)
>> @@ -22465,6 +22536,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>                                                                       : false;
>>          }
>>
>> +       if (ret == 0 && priv_stack_supported)
>> +               ret = check_max_stack_depth(env, true, true);
>> +
>>          if (ret == 0)
>>                  ret = fixup_call_args(env);
>>
>> --
>> 2.43.5
>>

  reply	other threads:[~2024-10-22  3:21 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-20 19:13 [PATCH bpf-next v6 0/9] bpf: Support private stack for bpf progs Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 1/9] bpf: Allow each subprog having stack size of 512 bytes Yonghong Song
2024-10-22  1:18   ` Alexei Starovoitov
2024-10-22  3:21     ` Yonghong Song [this message]
2024-10-22  3:43       ` Alexei Starovoitov
2024-10-22  4:08         ` Yonghong Song
2024-10-22 20:13         ` Yonghong Song
2024-10-22 20:41           ` Alexei Starovoitov
2024-10-22 21:29             ` Kumar Kartikeya Dwivedi
2024-10-22 21:36               ` Kumar Kartikeya Dwivedi
2024-10-22 21:43             ` Yonghong Song
2024-10-22 21:57               ` Alexei Starovoitov
2024-10-22 22:41                 ` Yonghong Song
2024-10-22 22:59                   ` Alexei Starovoitov
2024-10-22 23:53                     ` Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 2/9] bpf: Rename bpf_struct_ops_arg_info to bpf_struct_ops_func_info Yonghong Song
2024-10-20 19:13 ` [PATCH bpf-next v6 3/9] bpf: Support private stack for struct ops programs Yonghong Song
2024-10-22  1:34   ` Alexei Starovoitov
2024-10-22  2:59     ` Yonghong Song
2024-10-22 17:26     ` Martin KaFai Lau
2024-10-22 20:19       ` Alexei Starovoitov
2024-10-23 21:00         ` Tejun Heo
2024-10-23 23:07           ` Alexei Starovoitov
2024-10-24  0:56             ` Tejun Heo
2024-10-20 19:14 ` [PATCH bpf-next v6 4/9] bpf: Mark each subprog with proper private stack modes Yonghong Song
2024-10-20 22:01   ` Jiri Olsa
2024-10-21  4:22     ` Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 5/9] bpf, x86: Refactor func emit_prologue Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 6/9] bpf, x86: Create a helper for certain "reg <op>= imm" operations Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 7/9] bpf, x86: Add jit support for private stack Yonghong Song
2024-10-20 19:14 ` [PATCH bpf-next v6 8/9] selftests/bpf: Add tracing prog private stack tests Yonghong Song
2024-10-20 21:59   ` Jiri Olsa
2024-10-21  4:32     ` Yonghong Song
2024-10-21 10:40       ` Jiri Olsa
2024-10-21 16:19         ` Yonghong Song
2024-10-21 21:13           ` Jiri Olsa
2024-10-20 19:14 ` [PATCH bpf-next v6 9/9] selftests/bpf: Add struct_ops " Yonghong Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c6e5040b-9558-481f-b1fc-f77dc9ce90c1@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=martin.lau@kernel.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox