Re: [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Martin KaFai Lau <martin.lau@linux.dev>
To: kuifeng@meta.com
Cc: sinquersw@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
	song@kernel.org, kernel-team@meta.com, andrii@kernel.org
Subject: Re: [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
Date: Wed, 2 Aug 2023 12:25:02 -0700	[thread overview]
Message-ID: <39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev> (raw)
In-Reply-To: <20230722052248.1062582-2-kuifeng@meta.com>

On 7/21/23 10:22 PM, kuifeng@meta.com wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 739c15906a65..b2f81193f97b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7135,6 +7135,15 @@ struct bpf_sockopt {
>   	__s32	optname;
>   	__s32	optlen;
>   	__s32	retval;
> +
> +	__bpf_md_ptr(void *, user_optval);
> +	__bpf_md_ptr(void *, user_optval_end);
> +	__u32	flags;

I will follow up on the UAPI discussion in the other existing thread.

> +};
> +
> +enum bpf_sockopt_flags {
> +	/* optval is a pointer to user space memory */
> +	BPF_SOCKOPT_FLAG_OPTVAL_USER	= (1U << 0),
>   };
>   
>   struct bpf_pidns_info {
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..b268bbfa6c53 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>   	const struct bpf_prog_array *array;
>   	struct bpf_run_ctx *old_run_ctx;
>   	struct bpf_cg_run_ctx run_ctx;
> +	bool do_sleepable;
>   	u32 func_ret;
>   
> +	do_sleepable =
> +		atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
> +
>   	run_ctx.retval = retval;
>   	migrate_disable();
> -	rcu_read_lock();
> +	if (do_sleepable) {
> +		might_fault();
> +		rcu_read_lock_trace();
> +	} else
> +		rcu_read_lock();
>   	array = rcu_dereference(cgrp->effective[atype]);

array is now under rcu_read_lock_trace for the "do_sleepable" case.

The array free side should be changed also to wait for the tasks_trace gp. 
Please check if any bpf_prog_array_free() usages in cgroup.c should be replaced 
with bpf_prog_array_free_sleepable().

Another high level comment is, other cgroup hooks may get sleepable support in 
the future. In particular the SEC("lsm_cgroup") when considering other lsm hooks 
in SEC("lsm.s") have sleepable support already. just want to bring up here for 
awareness. This set can focus on get/setsockopt for now.

>   	item = &array->items[0];
>   	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
>   	while ((prog = READ_ONCE(item->prog))) {
> +		if (do_sleepable && !prog->aux->sleepable)
> +			rcu_read_lock();
> +
>   		run_ctx.prog_item = item;
>   		func_ret = run_prog(prog, ctx);
>   		if (ret_flags) {
> @@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>   		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
>   			run_ctx.retval = -EPERM;
>   		item++;
> +
> +		if (do_sleepable && !prog->aux->sleepable)
> +			rcu_read_unlock();
>   	}
>   	bpf_reset_run_ctx(old_run_ctx);
> -	rcu_read_unlock();
> +	if (do_sleepable)
> +		rcu_read_unlock_trace();
> +	else
> +		rcu_read_unlock();
>   	migrate_enable();
>   	return run_ctx.retval;
>   }
>   
> +static __always_inline bool
> +has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
> +			 enum cgroup_bpf_attach_type atype)
> +{
> +	const struct bpf_prog_array_item *item;
> +	const struct bpf_prog *prog;
> +	int cnt = 0;
> +	bool ret = true;
> +
> +	rcu_read_lock();
> +	item = &rcu_dereference(cgrp->effective[atype])->items[0];
> +	while (ret && (prog = READ_ONCE(item->prog))) {
> +		if (!prog->aux->sleepable)
> +			ret = false;
> +		item++;
> +		cnt++;
> +	}
> +	rcu_read_unlock();
> +	if (cnt == 0)
> +		ret = false;
> +
> +	return ret;
> +}
> +
>   unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
>   				       const struct bpf_insn *insn)
>   {
> @@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>   static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>   			     struct bpf_sockopt_buf *buf)
>   {
> -	if (ctx->optval == buf->data)
> +	if (ctx->optval == buf->data ||
> +	    ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
>   		return;
>   	kfree(ctx->optval);
>   }
> @@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>   static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
>   				  struct bpf_sockopt_buf *buf)
>   {
> -	return ctx->optval != buf->data;
> +	return ctx->optval != buf->data &&
> +		!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
>   }
>   
>   int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>   		.optname = *optname,
>   	};
>   	int ret, max_optlen;
> +	bool alloc_mem;
> +
> +	alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);

hmm... I am not sure if this would work. The cgroup->effective[atype] could have 
been changed after this has_only_sleepable_prog_cg() check. For example, a 
non-sleepable prog is attached after this check. The latter 
bpf_prog_run_array_cg() may end up having an incorrect ctx.

A quick thought is, this alloc decision should be postponed to the 
bpf_prog_run_array_cg()? It may be better to have a different 
bpf_prog_run_array_cg for set/getsockopt here, not sure.

> +	if (!alloc_mem) {
> +		max_optlen = *optlen;
> +		ctx.optlen = *optlen;
> +		ctx.user_optval = optval;
> +		ctx.user_optval_end = optval + *optlen;
> +		ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
> +	} else {
> +		/* Allocate a bit more than the initial user buffer for
> +		 * BPF program. The canonical use case is overriding
> +		 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> +		 */
> +		max_optlen = max_t(int, 16, *optlen);
> +		max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> +		if (max_optlen < 0)
> +			return max_optlen;
>   
> -	/* Allocate a bit more than the initial user buffer for
> -	 * BPF program. The canonical use case is overriding
> -	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> -	 */
> -	max_optlen = max_t(int, 16, *optlen);
> -	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> -	if (max_optlen < 0)
> -		return max_optlen;
> -
> -	ctx.optlen = *optlen;
> +		ctx.optlen = *optlen;
>   
> -	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> -		ret = -EFAULT;
> -		goto out;
> +		if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
>   	}

next prev parent reply	other threads:[~2023-08-02 19:25 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-22  5:22 [RFC bpf-next 0/5] Sleepable BPF programs on cgroup {get,set}sockopt kuifeng
2023-07-22  5:22 ` [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt kuifeng
2023-07-22 16:09   ` kernel test robot
2023-07-22 18:22   ` kernel test robot
2023-07-24 18:36   ` Stanislav Fomichev
2023-07-31 22:02     ` Kui-Feng Lee
2023-07-31 23:35       ` Stanislav Fomichev
2023-08-01 17:31         ` Kui-Feng Lee
2023-08-01 18:08           ` Stanislav Fomichev
2023-08-02 22:28             ` Martin KaFai Lau
2023-08-02 19:25   ` Martin KaFai Lau [this message]
2023-07-22  5:22 ` [RFC bpf-next 2/5] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() kuifeng
2023-07-22  6:39   ` kernel test robot
2023-07-22  6:49   ` kernel test robot
2023-07-22  7:41   ` kernel test robot
2023-07-22 20:26   ` kernel test robot
2023-08-02 19:59   ` Martin KaFai Lau
2023-07-22  5:22 ` [RFC bpf-next 3/5] bpf: Add a new dynptr type for CGRUP_SOCKOPT kuifeng
2023-07-22  5:22 ` [RFC bpf-next 4/5] bpf: Prevent BPF programs from access the buffer pointed by user_optval kuifeng
2023-07-22  5:22 ` [RFC bpf-next 5/5] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type kuifeng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev \
    --to=martin.lau@linux.dev \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=kernel-team@meta.com \
    --cc=kuifeng@meta.com \
    --cc=sinquersw@gmail.com \
    --cc=song@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.