From: Martin KaFai Lau <martin.lau@linux.dev>
To: Amery Hung <amery.hung@bytedance.com>
Cc: bpf@vger.kernel.org, netdev@vger.kernel.org,
daniel@iogearbox.net, andrii@kernel.org,
alexei.starovoitov@gmail.com, martin.lau@kernel.org,
sinquersw@gmail.com, toke@redhat.com, jhs@mojatatu.com,
jiri@resnulli.us, stfomichev@gmail.com,
ekarani.silvestre@ccc.ufcg.edu.br, yangpeihao@sjtu.edu.cn,
xiyou.wangcong@gmail.com, yepeilin.cs@gmail.com,
ameryhung@gmail.com
Subject: Re: [PATCH bpf-next v1 01/13] bpf: Support getting referenced kptr from struct_ops argument
Date: Tue, 17 Dec 2024 16:58:11 -0800 [thread overview]
Message-ID: <65399ffd-da8a-436a-81fd-b5bd3e4b8a54@linux.dev> (raw)
In-Reply-To: <20241213232958.2388301-2-amery.hung@bytedance.com>
On 12/13/24 3:29 PM, Amery Hung wrote:
> Allows struct_ops programs to acqurie referenced kptrs from arguments
> by directly reading the argument.
>
> The verifier will acquire a reference for struct_ops a argument tagged
> with "__ref" in the stub function in the beginning of the main program.
> The user will be able to access the referenced kptr directly by reading
> the context as long as it has not been released by the program.
>
> This new mechanism to acquire referenced kptr (compared to the existing
> "kfunc with KF_ACQUIRE") is introduced for ergonomic and semantic reasons.
> In the first use case, Qdisc_ops, an skb is passed to .enqueue in the
> first argument. This mechanism provides a natural way for users to get a
> referenced kptr in the .enqueue struct_ops programs and makes sure that a
> qdisc will always enqueue or drop the skb.
>
> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> ---
> include/linux/bpf.h | 3 +++
> kernel/bpf/bpf_struct_ops.c | 26 ++++++++++++++++++++------
> kernel/bpf/btf.c | 1 +
> kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++++++++++---
> 4 files changed, 56 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 1b84613b10ac..72bf941d1daf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -968,6 +968,7 @@ struct bpf_insn_access_aux {
> struct {
> struct btf *btf;
> u32 btf_id;
> + u32 ref_obj_id;
> };
> };
> struct bpf_verifier_log *log; /* for verbose logs */
> @@ -1480,6 +1481,8 @@ struct bpf_ctx_arg_aux {
> enum bpf_reg_type reg_type;
> struct btf *btf;
> u32 btf_id;
> + u32 ref_obj_id;
> + bool refcounted;
> };
>
> struct btf_mod_pair {
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index fda3dd2ee984..6e7795744f6a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -145,6 +145,7 @@ void bpf_struct_ops_image_free(void *image)
> }
>
> #define MAYBE_NULL_SUFFIX "__nullable"
> +#define REFCOUNTED_SUFFIX "__ref"
> #define MAX_STUB_NAME 128
>
> /* Return the type info of a stub function, if it exists.
> @@ -206,9 +207,11 @@ static int prepare_arg_info(struct btf *btf,
> struct bpf_struct_ops_arg_info *arg_info)
> {
> const struct btf_type *stub_func_proto, *pointed_type;
> + bool is_nullable = false, is_refcounted = false;
> const struct btf_param *stub_args, *args;
> struct bpf_ctx_arg_aux *info, *info_buf;
> u32 nargs, arg_no, info_cnt = 0;
> + const char *suffix;
> u32 arg_btf_id;
> int offset;
>
> @@ -240,12 +243,19 @@ static int prepare_arg_info(struct btf *btf,
> info = info_buf;
> for (arg_no = 0; arg_no < nargs; arg_no++) {
> /* Skip arguments that is not suffixed with
> - * "__nullable".
> + * "__nullable or __ref".
> */
> - if (!btf_param_match_suffix(btf, &stub_args[arg_no],
> - MAYBE_NULL_SUFFIX))
> + is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
> + MAYBE_NULL_SUFFIX);
> + is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
> + REFCOUNTED_SUFFIX);
> + if (!is_nullable && !is_refcounted)
> continue;
>
> + if (is_nullable)
> + suffix = MAYBE_NULL_SUFFIX;
> + else if (is_refcounted)
> + suffix = REFCOUNTED_SUFFIX;
> /* Should be a pointer to struct */
> pointed_type = btf_type_resolve_ptr(btf,
> args[arg_no].type,
> @@ -253,7 +263,7 @@ static int prepare_arg_info(struct btf *btf,
> if (!pointed_type ||
> !btf_type_is_struct(pointed_type)) {
> pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
> - st_ops_name, member_name, MAYBE_NULL_SUFFIX);
> + st_ops_name, member_name, suffix);
> goto err_out;
> }
>
> @@ -271,11 +281,15 @@ static int prepare_arg_info(struct btf *btf,
> }
>
> /* Fill the information of the new argument */
> - info->reg_type =
> - PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
> info->btf_id = arg_btf_id;
> info->btf = btf;
> info->offset = offset;
> + if (is_nullable) {
> + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
> + } else if (is_refcounted) {
> + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
> + info->refcounted = true;
> + }
>
> info++;
> info_cnt++;
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index e7a59e6462a9..a05ccf9ee032 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6580,6 +6580,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> info->reg_type = ctx_arg_info->reg_type;
> info->btf = ctx_arg_info->btf ? : btf_vmlinux;
> info->btf_id = ctx_arg_info->btf_id;
> + info->ref_obj_id = ctx_arg_info->ref_obj_id;
> return true;
> }
> }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 9f5de8d4fbd0..69753096075f 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1402,6 +1402,17 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
> return -EINVAL;
> }
>
> +static bool find_reference_state(struct bpf_func_state *state, int ptr_id)
> +{
> + int i;
> +
> + for (i = 0; i < state->acquired_refs; i++)
> + if (state->refs[i].id == ptr_id)
> + return true;
> +
> + return false;
> +}
> +
> static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr)
> {
> int i, last_idx;
> @@ -5798,7 +5809,8 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
> /* check access to 'struct bpf_context' fields. Supports fixed offsets only */
> static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
> enum bpf_access_type t, enum bpf_reg_type *reg_type,
> - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx)
> + struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx,
> + u32 *ref_obj_id)
> {
> struct bpf_insn_access_aux info = {
> .reg_type = *reg_type,
> @@ -5820,8 +5832,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
> *is_retval = info.is_retval;
>
> if (base_type(*reg_type) == PTR_TO_BTF_ID) {
> + if (info.ref_obj_id &&
> + !find_reference_state(cur_func(env), info.ref_obj_id)) {
> + verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
> + off);
> + return -EACCES;
> + }
> +
> *btf = info.btf;
> *btf_id = info.btf_id;
> + *ref_obj_id = info.ref_obj_id;
> } else {
> env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
> }
> @@ -7135,7 +7155,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> struct bpf_retval_range range;
> enum bpf_reg_type reg_type = SCALAR_VALUE;
> struct btf *btf = NULL;
> - u32 btf_id = 0;
> + u32 btf_id = 0, ref_obj_id = 0;
>
> if (t == BPF_WRITE && value_regno >= 0 &&
> is_pointer_value(env, value_regno)) {
> @@ -7148,7 +7168,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> return err;
>
> err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf,
> - &btf_id, &is_retval, is_ldsx);
> + &btf_id, &is_retval, is_ldsx, &ref_obj_id);
> if (err)
> verbose_linfo(env, insn_idx, "; ");
> if (!err && t == BPF_READ && value_regno >= 0) {
> @@ -7179,6 +7199,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> if (base_type(reg_type) == PTR_TO_BTF_ID) {
> regs[value_regno].btf = btf;
> regs[value_regno].btf_id = btf_id;
> + regs[value_regno].ref_obj_id = ref_obj_id;
> }
> }
> regs[value_regno].type = reg_type;
> @@ -21662,6 +21683,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
> {
> bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
> struct bpf_subprog_info *sub = subprog_info(env, subprog);
> + struct bpf_ctx_arg_aux *ctx_arg_info;
> struct bpf_verifier_state *state;
> struct bpf_reg_state *regs;
> int ret, i;
> @@ -21769,6 +21791,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
> mark_reg_known_zero(env, regs, BPF_REG_1);
> }
>
> + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
> + ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
> + for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
> + if (ctx_arg_info[i].refcounted)
> + ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);
There is a conflict in the bpf-next/master. acquire_reference_state has been
refactored in commit 769b0f1c8214. From looking at the net/sched/sch_*.c
changes, they should not have conflict with the net-next/main. I would suggest
to rebase this set on bpf-next/master.
At the first glance, the ref_obj_id assignment looks racy because ctx_arg_info
is shared by different bpf progs that may be verified in parallel. After another
thought, this should be fine because it should always end up having the same
ref_obj_id for the same arg-no, right? Not sure if UBSAN can understand this
without using the READ/WRITE_ONCE. but adding READ/WRITE_ONCE when using
ref_obj_id will be quite puzzling when reading the verifier code. Any better idea?
Other than the subprog, afaik, the bpf prog triggered by the bpf_tail_call can
also take the 'u64 *ctx' array. May be disallow using tailcall in all ops in the
bpf qdisc. env->subprog_info[i].has_tail_call has already tracked whether the
tail_call is used.
> + }
> +
> ret = do_check(env);
> out:
> /* check for NULL is necessary, since cur_state can be freed inside
next prev parent reply other threads:[~2024-12-18 0:58 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-13 23:29 [PATCH bpf-next v1 00/13] bpf qdisc Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 01/13] bpf: Support getting referenced kptr from struct_ops argument Amery Hung
2024-12-18 0:58 ` Martin KaFai Lau [this message]
2024-12-18 1:24 ` Alexei Starovoitov
2024-12-18 16:09 ` Amery Hung
2024-12-18 17:20 ` Alexei Starovoitov
2024-12-18 1:44 ` Jakub Kicinski
2024-12-18 16:57 ` Amery Hung
2024-12-19 23:06 ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 02/13] selftests/bpf: Test referenced kptr arguments of struct_ops programs Amery Hung
2024-12-18 1:17 ` Martin KaFai Lau
2024-12-18 16:10 ` Amery Hung
2024-12-19 3:40 ` Yonghong Song
2024-12-19 20:49 ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 03/13] bpf: Allow struct_ops prog to return referenced kptr Amery Hung
2024-12-18 22:29 ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 04/13] selftests/bpf: Test returning referenced kptr from struct_ops programs Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 05/13] bpf: net_sched: Support implementation of Qdisc_ops in bpf Amery Hung
2024-12-14 4:51 ` Cong Wang
2024-12-18 23:37 ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 06/13] bpf: net_sched: Add basic bpf qdisc kfuncs Amery Hung
2024-12-18 17:11 ` Amery Hung
2024-12-19 7:37 ` Martin KaFai Lau
2024-12-20 0:32 ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 07/13] bpf: net_sched: Add a qdisc watchdog timer Amery Hung
2024-12-19 1:16 ` Martin KaFai Lau
2024-12-20 19:24 ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 08/13] bpf: net_sched: Support updating bstats Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 09/13] bpf: net_sched: Support updating qstats Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 10/13] bpf: net_sched: Allow writing to more Qdisc members Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 11/13] libbpf: Support creating and destroying qdisc Amery Hung
2024-12-17 18:32 ` Andrii Nakryiko
2024-12-17 19:08 ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 12/13] selftests: Add a basic fifo qdisc test Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 13/13] selftests: Add a bpf fq qdisc to selftest Amery Hung
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=65399ffd-da8a-436a-81fd-b5bd3e4b8a54@linux.dev \
--to=martin.lau@linux.dev \
--cc=alexei.starovoitov@gmail.com \
--cc=amery.hung@bytedance.com \
--cc=ameryhung@gmail.com \
--cc=andrii@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=ekarani.silvestre@ccc.ufcg.edu.br \
--cc=jhs@mojatatu.com \
--cc=jiri@resnulli.us \
--cc=martin.lau@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=sinquersw@gmail.com \
--cc=stfomichev@gmail.com \
--cc=toke@redhat.com \
--cc=xiyou.wangcong@gmail.com \
--cc=yangpeihao@sjtu.edu.cn \
--cc=yepeilin.cs@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.