From: Martin KaFai Lau <martin.lau@linux.dev>
To: kuifeng@meta.com
Cc: sinquersw@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
song@kernel.org, kernel-team@meta.com, andrii@kernel.org
Subject: Re: [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
Date: Wed, 2 Aug 2023 12:25:02 -0700 [thread overview]
Message-ID: <39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev> (raw)
In-Reply-To: <20230722052248.1062582-2-kuifeng@meta.com>
On 7/21/23 10:22 PM, kuifeng@meta.com wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 739c15906a65..b2f81193f97b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7135,6 +7135,15 @@ struct bpf_sockopt {
> __s32 optname;
> __s32 optlen;
> __s32 retval;
> +
> + __bpf_md_ptr(void *, user_optval);
> + __bpf_md_ptr(void *, user_optval_end);
> + __u32 flags;
I will follow up on the UAPI discussion in the other existing thread.
> +};
> +
> +enum bpf_sockopt_flags {
> + /* optval is a pointer to user space memory */
> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
> };
>
> struct bpf_pidns_info {
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..b268bbfa6c53 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> const struct bpf_prog_array *array;
> struct bpf_run_ctx *old_run_ctx;
> struct bpf_cg_run_ctx run_ctx;
> + bool do_sleepable;
> u32 func_ret;
>
> + do_sleepable =
> + atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
> +
> run_ctx.retval = retval;
> migrate_disable();
> - rcu_read_lock();
> + if (do_sleepable) {
> + might_fault();
> + rcu_read_lock_trace();
> + } else
> + rcu_read_lock();
> array = rcu_dereference(cgrp->effective[atype]);
array is now under rcu_read_lock_trace for the "do_sleepable" case.
The array free side should be changed also to wait for the tasks_trace gp.
Please check if any bpf_prog_array_free() usages in cgroup.c should be replaced
with bpf_prog_array_free_sleepable().
Another high level comment is, other cgroup hooks may get sleepable support in
the future. In particular the SEC("lsm_cgroup") when considering other lsm hooks
in SEC("lsm.s") have sleepable support already. just want to bring up here for
awareness. This set can focus on get/setsockopt for now.
> item = &array->items[0];
> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> while ((prog = READ_ONCE(item->prog))) {
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_lock();
> +
> run_ctx.prog_item = item;
> func_ret = run_prog(prog, ctx);
> if (ret_flags) {
> @@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
> run_ctx.retval = -EPERM;
> item++;
> +
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_unlock();
> }
> bpf_reset_run_ctx(old_run_ctx);
> - rcu_read_unlock();
> + if (do_sleepable)
> + rcu_read_unlock_trace();
> + else
> + rcu_read_unlock();
> migrate_enable();
> return run_ctx.retval;
> }
>
> +static __always_inline bool
> +has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
> + enum cgroup_bpf_attach_type atype)
> +{
> + const struct bpf_prog_array_item *item;
> + const struct bpf_prog *prog;
> + int cnt = 0;
> + bool ret = true;
> +
> + rcu_read_lock();
> + item = &rcu_dereference(cgrp->effective[atype])->items[0];
> + while (ret && (prog = READ_ONCE(item->prog))) {
> + if (!prog->aux->sleepable)
> + ret = false;
> + item++;
> + cnt++;
> + }
> + rcu_read_unlock();
> + if (cnt == 0)
> + ret = false;
> +
> + return ret;
> +}
> +
> unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
> const struct bpf_insn *insn)
> {
> @@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - if (ctx->optval == buf->data)
> + if (ctx->optval == buf->data ||
> + ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
> return;
> kfree(ctx->optval);
> }
> @@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - return ctx->optval != buf->data;
> + return ctx->optval != buf->data &&
> + !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
> }
>
> int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> .optname = *optname,
> };
> int ret, max_optlen;
> + bool alloc_mem;
> +
> + alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);
hmm... I am not sure if this would work. The cgroup->effective[atype] could have
been changed after this has_only_sleepable_prog_cg() check. For example, a
non-sleepable prog is attached after this check. The latter
bpf_prog_run_array_cg() may end up having an incorrect ctx.
A quick thought is, this alloc decision should be postponed to the
bpf_prog_run_array_cg()? It may be better to have a different
bpf_prog_run_array_cg for set/getsockopt here, not sure.
> + if (!alloc_mem) {
> + max_optlen = *optlen;
> + ctx.optlen = *optlen;
> + ctx.user_optval = optval;
> + ctx.user_optval_end = optval + *optlen;
> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
> + } else {
> + /* Allocate a bit more than the initial user buffer for
> + * BPF program. The canonical use case is overriding
> + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> + */
> + max_optlen = max_t(int, 16, *optlen);
> + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> + if (max_optlen < 0)
> + return max_optlen;
>
> - /* Allocate a bit more than the initial user buffer for
> - * BPF program. The canonical use case is overriding
> - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> - */
> - max_optlen = max_t(int, 16, *optlen);
> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> - if (max_optlen < 0)
> - return max_optlen;
> -
> - ctx.optlen = *optlen;
> + ctx.optlen = *optlen;
>
> - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> - ret = -EFAULT;
> - goto out;
> + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> + ret = -EFAULT;
> + goto out;
> + }
> }
next prev parent reply other threads:[~2023-08-02 19:25 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-07-22 5:22 [RFC bpf-next 0/5] Sleepable BPF programs on cgroup {get,set}sockopt kuifeng
2023-07-22 5:22 ` [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt kuifeng
2023-07-22 16:09 ` kernel test robot
2023-07-22 18:22 ` kernel test robot
2023-07-24 18:36 ` Stanislav Fomichev
2023-07-31 22:02 ` Kui-Feng Lee
2023-07-31 23:35 ` Stanislav Fomichev
2023-08-01 17:31 ` Kui-Feng Lee
2023-08-01 18:08 ` Stanislav Fomichev
2023-08-02 22:28 ` Martin KaFai Lau
2023-08-02 19:25 ` Martin KaFai Lau [this message]
2023-07-22 5:22 ` [RFC bpf-next 2/5] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() kuifeng
2023-07-22 6:39 ` kernel test robot
2023-07-22 6:49 ` kernel test robot
2023-07-22 7:41 ` kernel test robot
2023-07-22 20:26 ` kernel test robot
2023-08-02 19:59 ` Martin KaFai Lau
2023-07-22 5:22 ` [RFC bpf-next 3/5] bpf: Add a new dynptr type for CGRUP_SOCKOPT kuifeng
2023-07-22 5:22 ` [RFC bpf-next 4/5] bpf: Prevent BPF programs from access the buffer pointed by user_optval kuifeng
2023-07-22 5:22 ` [RFC bpf-next 5/5] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type kuifeng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev \
--to=martin.lau@linux.dev \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=kernel-team@meta.com \
--cc=kuifeng@meta.com \
--cc=sinquersw@gmail.com \
--cc=song@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.