From: Martin KaFai Lau <martin.lau@linux.dev>
To: kuifeng@meta.com
Cc: sinquersw@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
song@kernel.org, kernel-team@meta.com, andrii@kernel.org
Subject: Re: [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
Date: Wed, 2 Aug 2023 12:25:02 -0700 [thread overview]
Message-ID: <39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev> (raw)
In-Reply-To: <20230722052248.1062582-2-kuifeng@meta.com>
On 7/21/23 10:22 PM, kuifeng@meta.com wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 739c15906a65..b2f81193f97b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7135,6 +7135,15 @@ struct bpf_sockopt {
> __s32 optname;
> __s32 optlen;
> __s32 retval;
> +
> + __bpf_md_ptr(void *, user_optval);
> + __bpf_md_ptr(void *, user_optval_end);
> + __u32 flags;
I will follow up on the UAPI discussion in the other existing thread.
> +};
> +
> +enum bpf_sockopt_flags {
> + /* optval is a pointer to user space memory */
> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
> };
>
> struct bpf_pidns_info {
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..b268bbfa6c53 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> const struct bpf_prog_array *array;
> struct bpf_run_ctx *old_run_ctx;
> struct bpf_cg_run_ctx run_ctx;
> + bool do_sleepable;
> u32 func_ret;
>
> + do_sleepable =
> + atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
> +
> run_ctx.retval = retval;
> migrate_disable();
> - rcu_read_lock();
> + if (do_sleepable) {
> + might_fault();
> + rcu_read_lock_trace();
> + } else
> + rcu_read_lock();
> array = rcu_dereference(cgrp->effective[atype]);
array is now under rcu_read_lock_trace for the "do_sleepable" case.
The array free side should be changed also to wait for the tasks_trace gp.
Please check if any bpf_prog_array_free() usages in cgroup.c should be replaced
with bpf_prog_array_free_sleepable().
Another high level comment is, other cgroup hooks may get sleepable support in
the future. In particular the SEC("lsm_cgroup") when considering other lsm hooks
in SEC("lsm.s") have sleepable support already. just want to bring up here for
awareness. This set can focus on get/setsockopt for now.
> item = &array->items[0];
> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> while ((prog = READ_ONCE(item->prog))) {
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_lock();
> +
> run_ctx.prog_item = item;
> func_ret = run_prog(prog, ctx);
> if (ret_flags) {
> @@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
> run_ctx.retval = -EPERM;
> item++;
> +
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_unlock();
> }
> bpf_reset_run_ctx(old_run_ctx);
> - rcu_read_unlock();
> + if (do_sleepable)
> + rcu_read_unlock_trace();
> + else
> + rcu_read_unlock();
> migrate_enable();
> return run_ctx.retval;
> }
>
> +static __always_inline bool
> +has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
> + enum cgroup_bpf_attach_type atype)
> +{
> + const struct bpf_prog_array_item *item;
> + const struct bpf_prog *prog;
> + int cnt = 0;
> + bool ret = true;
> +
> + rcu_read_lock();
> + item = &rcu_dereference(cgrp->effective[atype])->items[0];
> + while (ret && (prog = READ_ONCE(item->prog))) {
> + if (!prog->aux->sleepable)
> + ret = false;
> + item++;
> + cnt++;
> + }
> + rcu_read_unlock();
> + if (cnt == 0)
> + ret = false;
> +
> + return ret;
> +}
> +
> unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
> const struct bpf_insn *insn)
> {
> @@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - if (ctx->optval == buf->data)
> + if (ctx->optval == buf->data ||
> + ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
> return;
> kfree(ctx->optval);
> }
> @@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - return ctx->optval != buf->data;
> + return ctx->optval != buf->data &&
> + !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
> }
>
> int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> .optname = *optname,
> };
> int ret, max_optlen;
> + bool alloc_mem;
> +
> + alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);
hmm... I am not sure if this would work. The cgroup->effective[atype] could have
been changed after this has_only_sleepable_prog_cg() check. For example, a
non-sleepable prog is attached after this check. The latter
bpf_prog_run_array_cg() may end up having an incorrect ctx.
A quick thought is, this alloc decision should be postponed to the
bpf_prog_run_array_cg()? It may be better to have a different
bpf_prog_run_array_cg for set/getsockopt here, not sure.
> + if (!alloc_mem) {
> + max_optlen = *optlen;
> + ctx.optlen = *optlen;
> + ctx.user_optval = optval;
> + ctx.user_optval_end = optval + *optlen;
> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
> + } else {
> + /* Allocate a bit more than the initial user buffer for
> + * BPF program. The canonical use case is overriding
> + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> + */
> + max_optlen = max_t(int, 16, *optlen);
> + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> + if (max_optlen < 0)
> + return max_optlen;
>
> - /* Allocate a bit more than the initial user buffer for
> - * BPF program. The canonical use case is overriding
> - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> - */
> - max_optlen = max_t(int, 16, *optlen);
> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> - if (max_optlen < 0)
> - return max_optlen;
> -
> - ctx.optlen = *optlen;
> + ctx.optlen = *optlen;
>
> - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> - ret = -EFAULT;
> - goto out;
> + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> + ret = -EFAULT;
> + goto out;
> + }
> }
next prev parent reply other threads:[~2023-08-02 19:25 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-07-22 5:22 [RFC bpf-next 0/5] Sleepable BPF programs on cgroup {get,set}sockopt kuifeng
2023-07-22 5:22 ` [RFC bpf-next 1/5] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt kuifeng
2023-07-24 18:36 ` Stanislav Fomichev
2023-07-31 22:02 ` Kui-Feng Lee
2023-07-31 23:35 ` Stanislav Fomichev
2023-08-01 17:31 ` Kui-Feng Lee
2023-08-01 18:08 ` Stanislav Fomichev
2023-08-02 22:28 ` Martin KaFai Lau
2023-08-02 19:25 ` Martin KaFai Lau [this message]
2023-07-22 5:22 ` [RFC bpf-next 2/5] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() kuifeng
2023-08-02 19:59 ` Martin KaFai Lau
2023-07-22 5:22 ` [RFC bpf-next 3/5] bpf: Add a new dynptr type for CGRUP_SOCKOPT kuifeng
2023-07-22 5:22 ` [RFC bpf-next 4/5] bpf: Prevent BPF programs from access the buffer pointed by user_optval kuifeng
2023-07-22 5:22 ` [RFC bpf-next 5/5] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type kuifeng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=39bad6b2-5498-b2e3-c70c-e27a74834026@linux.dev \
--to=martin.lau@linux.dev \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=kernel-team@meta.com \
--cc=kuifeng@meta.com \
--cc=sinquersw@gmail.com \
--cc=song@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox