netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Martin KaFai Lau <martin.lau@linux.dev>
To: Amery Hung <amery.hung@bytedance.com>
Cc: bpf@vger.kernel.org, netdev@vger.kernel.org,
	daniel@iogearbox.net, andrii@kernel.org,
	alexei.starovoitov@gmail.com, martin.lau@kernel.org,
	sinquersw@gmail.com, toke@redhat.com, jhs@mojatatu.com,
	jiri@resnulli.us, stfomichev@gmail.com,
	ekarani.silvestre@ccc.ufcg.edu.br, yangpeihao@sjtu.edu.cn,
	xiyou.wangcong@gmail.com, yepeilin.cs@gmail.com,
	ameryhung@gmail.com
Subject: Re: [PATCH bpf-next v1 01/13] bpf: Support getting referenced kptr from struct_ops argument
Date: Tue, 17 Dec 2024 16:58:11 -0800	[thread overview]
Message-ID: <65399ffd-da8a-436a-81fd-b5bd3e4b8a54@linux.dev> (raw)
In-Reply-To: <20241213232958.2388301-2-amery.hung@bytedance.com>

On 12/13/24 3:29 PM, Amery Hung wrote:
> Allows struct_ops programs to acqurie referenced kptrs from arguments
> by directly reading the argument.
> 
> The verifier will acquire a reference for struct_ops a argument tagged
> with "__ref" in the stub function in the beginning of the main program.
> The user will be able to access the referenced kptr directly by reading
> the context as long as it has not been released by the program.
> 
> This new mechanism to acquire referenced kptr (compared to the existing
> "kfunc with KF_ACQUIRE") is introduced for ergonomic and semantic reasons.
> In the first use case, Qdisc_ops, an skb is passed to .enqueue in the
> first argument. This mechanism provides a natural way for users to get a
> referenced kptr in the .enqueue struct_ops programs and makes sure that a
> qdisc will always enqueue or drop the skb.
> 
> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> ---
>   include/linux/bpf.h         |  3 +++
>   kernel/bpf/bpf_struct_ops.c | 26 ++++++++++++++++++++------
>   kernel/bpf/btf.c            |  1 +
>   kernel/bpf/verifier.c       | 35 ++++++++++++++++++++++++++++++++---
>   4 files changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 1b84613b10ac..72bf941d1daf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -968,6 +968,7 @@ struct bpf_insn_access_aux {
>   		struct {
>   			struct btf *btf;
>   			u32 btf_id;
> +			u32 ref_obj_id;
>   		};
>   	};
>   	struct bpf_verifier_log *log; /* for verbose logs */
> @@ -1480,6 +1481,8 @@ struct bpf_ctx_arg_aux {
>   	enum bpf_reg_type reg_type;
>   	struct btf *btf;
>   	u32 btf_id;
> +	u32 ref_obj_id;
> +	bool refcounted;
>   };
>   
>   struct btf_mod_pair {
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index fda3dd2ee984..6e7795744f6a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -145,6 +145,7 @@ void bpf_struct_ops_image_free(void *image)
>   }
>   
>   #define MAYBE_NULL_SUFFIX "__nullable"
> +#define REFCOUNTED_SUFFIX "__ref"
>   #define MAX_STUB_NAME 128
>   
>   /* Return the type info of a stub function, if it exists.
> @@ -206,9 +207,11 @@ static int prepare_arg_info(struct btf *btf,
>   			    struct bpf_struct_ops_arg_info *arg_info)
>   {
>   	const struct btf_type *stub_func_proto, *pointed_type;
> +	bool is_nullable = false, is_refcounted = false;
>   	const struct btf_param *stub_args, *args;
>   	struct bpf_ctx_arg_aux *info, *info_buf;
>   	u32 nargs, arg_no, info_cnt = 0;
> +	const char *suffix;
>   	u32 arg_btf_id;
>   	int offset;
>   
> @@ -240,12 +243,19 @@ static int prepare_arg_info(struct btf *btf,
>   	info = info_buf;
>   	for (arg_no = 0; arg_no < nargs; arg_no++) {
>   		/* Skip arguments that is not suffixed with
> -		 * "__nullable".
> +		 * "__nullable or __ref".
>   		 */
> -		if (!btf_param_match_suffix(btf, &stub_args[arg_no],
> -					    MAYBE_NULL_SUFFIX))
> +		is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
> +						     MAYBE_NULL_SUFFIX);
> +		is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
> +						       REFCOUNTED_SUFFIX);
> +		if (!is_nullable && !is_refcounted)
>   			continue;
>   
> +		if (is_nullable)
> +			suffix = MAYBE_NULL_SUFFIX;
> +		else if (is_refcounted)
> +			suffix = REFCOUNTED_SUFFIX;
>   		/* Should be a pointer to struct */
>   		pointed_type = btf_type_resolve_ptr(btf,
>   						    args[arg_no].type,
> @@ -253,7 +263,7 @@ static int prepare_arg_info(struct btf *btf,
>   		if (!pointed_type ||
>   		    !btf_type_is_struct(pointed_type)) {
>   			pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
> -				st_ops_name, member_name, MAYBE_NULL_SUFFIX);
> +				st_ops_name, member_name, suffix);
>   			goto err_out;
>   		}
>   
> @@ -271,11 +281,15 @@ static int prepare_arg_info(struct btf *btf,
>   		}
>   
>   		/* Fill the information of the new argument */
> -		info->reg_type =
> -			PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
>   		info->btf_id = arg_btf_id;
>   		info->btf = btf;
>   		info->offset = offset;
> +		if (is_nullable) {
> +			info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
> +		} else if (is_refcounted) {
> +			info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
> +			info->refcounted = true;
> +		}
>   
>   		info++;
>   		info_cnt++;
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index e7a59e6462a9..a05ccf9ee032 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6580,6 +6580,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
>   			info->reg_type = ctx_arg_info->reg_type;
>   			info->btf = ctx_arg_info->btf ? : btf_vmlinux;
>   			info->btf_id = ctx_arg_info->btf_id;
> +			info->ref_obj_id = ctx_arg_info->ref_obj_id;
>   			return true;
>   		}
>   	}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 9f5de8d4fbd0..69753096075f 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1402,6 +1402,17 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
>   	return -EINVAL;
>   }
>   
> +static bool find_reference_state(struct bpf_func_state *state, int ptr_id)
> +{
> +	int i;
> +
> +	for (i = 0; i < state->acquired_refs; i++)
> +		if (state->refs[i].id == ptr_id)
> +			return true;
> +
> +	return false;
> +}
> +
>   static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr)
>   {
>   	int i, last_idx;
> @@ -5798,7 +5809,8 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
>   /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
>   static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
>   			    enum bpf_access_type t, enum bpf_reg_type *reg_type,
> -			    struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx)
> +			    struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx,
> +			    u32 *ref_obj_id)
>   {
>   	struct bpf_insn_access_aux info = {
>   		.reg_type = *reg_type,
> @@ -5820,8 +5832,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
>   		*is_retval = info.is_retval;
>   
>   		if (base_type(*reg_type) == PTR_TO_BTF_ID) {
> +			if (info.ref_obj_id &&
> +			    !find_reference_state(cur_func(env), info.ref_obj_id)) {
> +				verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
> +					off);
> +				return -EACCES;
> +			}
> +
>   			*btf = info.btf;
>   			*btf_id = info.btf_id;
> +			*ref_obj_id = info.ref_obj_id;
>   		} else {
>   			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
>   		}
> @@ -7135,7 +7155,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>   		struct bpf_retval_range range;
>   		enum bpf_reg_type reg_type = SCALAR_VALUE;
>   		struct btf *btf = NULL;
> -		u32 btf_id = 0;
> +		u32 btf_id = 0, ref_obj_id = 0;
>   
>   		if (t == BPF_WRITE && value_regno >= 0 &&
>   		    is_pointer_value(env, value_regno)) {
> @@ -7148,7 +7168,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>   			return err;
>   
>   		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
> -				       &btf_id, &is_retval, is_ldsx);
> +				       &btf_id, &is_retval, is_ldsx, &ref_obj_id);
>   		if (err)
>   			verbose_linfo(env, insn_idx, "; ");
>   		if (!err && t == BPF_READ && value_regno >= 0) {
> @@ -7179,6 +7199,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>   				if (base_type(reg_type) == PTR_TO_BTF_ID) {
>   					regs[value_regno].btf = btf;
>   					regs[value_regno].btf_id = btf_id;
> +					regs[value_regno].ref_obj_id = ref_obj_id;
>   				}
>   			}
>   			regs[value_regno].type = reg_type;
> @@ -21662,6 +21683,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
>   {
>   	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
>   	struct bpf_subprog_info *sub = subprog_info(env, subprog);
> +	struct bpf_ctx_arg_aux *ctx_arg_info;
>   	struct bpf_verifier_state *state;
>   	struct bpf_reg_state *regs;
>   	int ret, i;
> @@ -21769,6 +21791,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
>   		mark_reg_known_zero(env, regs, BPF_REG_1);
>   	}
>   
> +	if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
> +		ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
> +		for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
> +			if (ctx_arg_info[i].refcounted)
> +				ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);

There is a conflict in the bpf-next/master. acquire_reference_state has been 
refactored in commit 769b0f1c8214. From looking at the net/sched/sch_*.c 
changes, they should not have conflict with the net-next/main. I would suggest 
to rebase this set on bpf-next/master.

At the first glance, the ref_obj_id assignment looks racy because ctx_arg_info 
is shared by different bpf progs that may be verified in parallel. After another 
thought, this should be fine because it should always end up having the same 
ref_obj_id for the same arg-no, right? Not sure if UBSAN can understand this 
without using the READ/WRITE_ONCE. but adding READ/WRITE_ONCE when using 
ref_obj_id will be quite puzzling when reading the verifier code. Any better idea?

Other than the subprog, afaik, the bpf prog triggered by the bpf_tail_call can 
also take the 'u64 *ctx' array. May be disallow using tailcall in all ops in the 
bpf qdisc. env->subprog_info[i].has_tail_call has already tracked whether the 
tail_call is used.

> +	}
> +
>   	ret = do_check(env);
>   out:
>   	/* check for NULL is necessary, since cur_state can be freed inside


  reply	other threads:[~2024-12-18  0:58 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-13 23:29 [PATCH bpf-next v1 00/13] bpf qdisc Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 01/13] bpf: Support getting referenced kptr from struct_ops argument Amery Hung
2024-12-18  0:58   ` Martin KaFai Lau [this message]
2024-12-18  1:24     ` Alexei Starovoitov
2024-12-18 16:09       ` Amery Hung
2024-12-18 17:20         ` Alexei Starovoitov
2024-12-18  1:44     ` Jakub Kicinski
2024-12-18 16:57     ` Amery Hung
2024-12-19 23:06       ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 02/13] selftests/bpf: Test referenced kptr arguments of struct_ops programs Amery Hung
2024-12-18  1:17   ` Martin KaFai Lau
2024-12-18 16:10     ` Amery Hung
2024-12-19  3:40   ` Yonghong Song
2024-12-19 20:49     ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 03/13] bpf: Allow struct_ops prog to return referenced kptr Amery Hung
2024-12-18 22:29   ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 04/13] selftests/bpf: Test returning referenced kptr from struct_ops programs Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 05/13] bpf: net_sched: Support implementation of Qdisc_ops in bpf Amery Hung
2024-12-14  4:51   ` Cong Wang
2024-12-18 23:37   ` Martin KaFai Lau
2024-12-13 23:29 ` [PATCH bpf-next v1 06/13] bpf: net_sched: Add basic bpf qdisc kfuncs Amery Hung
2024-12-18 17:11   ` Amery Hung
2024-12-19  7:37   ` Martin KaFai Lau
2024-12-20  0:32     ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 07/13] bpf: net_sched: Add a qdisc watchdog timer Amery Hung
2024-12-19  1:16   ` Martin KaFai Lau
2024-12-20 19:24     ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 08/13] bpf: net_sched: Support updating bstats Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 09/13] bpf: net_sched: Support updating qstats Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 10/13] bpf: net_sched: Allow writing to more Qdisc members Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 11/13] libbpf: Support creating and destroying qdisc Amery Hung
2024-12-17 18:32   ` Andrii Nakryiko
2024-12-17 19:08     ` Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 12/13] selftests: Add a basic fifo qdisc test Amery Hung
2024-12-13 23:29 ` [PATCH bpf-next v1 13/13] selftests: Add a bpf fq qdisc to selftest Amery Hung

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=65399ffd-da8a-436a-81fd-b5bd3e4b8a54@linux.dev \
    --to=martin.lau@linux.dev \
    --cc=alexei.starovoitov@gmail.com \
    --cc=amery.hung@bytedance.com \
    --cc=ameryhung@gmail.com \
    --cc=andrii@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=ekarani.silvestre@ccc.ufcg.edu.br \
    --cc=jhs@mojatatu.com \
    --cc=jiri@resnulli.us \
    --cc=martin.lau@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=sinquersw@gmail.com \
    --cc=stfomichev@gmail.com \
    --cc=toke@redhat.com \
    --cc=xiyou.wangcong@gmail.com \
    --cc=yangpeihao@sjtu.edu.cn \
    --cc=yepeilin.cs@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).