* [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
@ 2023-08-11 4:31 ` thinker.li
2023-08-11 23:01 ` Stanislav Fomichev
2023-08-11 4:31 ` [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval thinker.li
` (4 subsequent siblings)
5 siblings, 1 reply; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <kuifeng@meta.com>
Enable sleepable cgroup/{get,set}sockopt hooks.
The sleepable BPF programs attached to cgroup/{get,set}sockopt hooks may
receive a pointer to the optval in user space instead of a kernel
copy. ctx->user_optval and ctx->user_optval_end are the pointers to the
beginning and end of the user space buffer if receiving a user space
buffer. ctx->optval and ctx->optval_end will be a kernel copy if receiving
a kernel space buffer.
A program receives a user space buffer if ctx->flags &
BPF_SOCKOPT_FLAG_OPTVAL_USER is true, otherwise it receives a kernel space
buffer. The BPF programs should not read/write from/to a user space buffer
directly. It should access the buffer through bpf_copy_from_user() and
bpf_copy_to_user() provided in the following patches.
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
include/linux/filter.h | 1 +
include/uapi/linux/bpf.h | 7 +
kernel/bpf/cgroup.c | 229 ++++++++++++++++++++++++++-------
kernel/bpf/verifier.c | 7 +-
tools/include/uapi/linux/bpf.h | 7 +
tools/lib/bpf/libbpf.c | 2 +
6 files changed, 206 insertions(+), 47 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 761af6b3cf2b..54169a641d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1337,6 +1337,7 @@ struct bpf_sockopt_kern {
s32 level;
s32 optname;
s32 optlen;
+ u32 flags;
/* for retval in struct bpf_cg_run_ctx */
struct task_struct *current_task;
/* Temporary "register" for indirect stores to ppos. */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70da85200695..fff6f7dff408 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7138,6 +7138,13 @@ struct bpf_sockopt {
__s32 optname;
__s32 optlen;
__s32 retval;
+
+ __u32 flags;
+};
+
+enum bpf_sockopt_flags {
+ /* optval is a pointer to user space memory */
+ BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
};
struct bpf_pidns_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..59489d9619a3 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -28,25 +28,46 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
* function pointer.
*/
static __always_inline int
-bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
- enum cgroup_bpf_attach_type atype,
- const void *ctx, bpf_prog_run_fn run_prog,
- int retval, u32 *ret_flags)
+bpf_prog_run_array_cg_cb(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags,
+ int (*progs_cb)(void *, const struct bpf_prog_array_item *),
+ void *progs_cb_arg)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
+ bool do_sleepable;
u32 func_ret;
+ int err;
+
+ do_sleepable =
+ atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
run_ctx.retval = retval;
migrate_disable();
- rcu_read_lock();
+ if (do_sleepable) {
+ might_fault();
+ rcu_read_lock_trace();
+ } else
+ rcu_read_lock();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
+
+ if (progs_cb) {
+ err = progs_cb(progs_cb_arg, item);
+ if (err)
+ return err;
+ }
+
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
+ if (do_sleepable && !prog->aux->sleepable)
+ rcu_read_lock();
+
run_ctx.prog_item = item;
func_ret = run_prog(prog, ctx);
if (ret_flags) {
@@ -56,13 +77,48 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
item++;
+
+ if (do_sleepable && !prog->aux->sleepable)
+ rcu_read_unlock();
}
bpf_reset_run_ctx(old_run_ctx);
- rcu_read_unlock();
+ if (do_sleepable)
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}
+static __always_inline void
+count_sleepable_prog_cg(const struct bpf_prog_array_item *item,
+ int *sleepable, int *non_sleepable)
+{
+ const struct bpf_prog *prog;
+ bool ret = true;
+
+ *sleepable = 0;
+ *non_sleepable = 0;
+
+ while (ret && (prog = READ_ONCE(item->prog))) {
+ if (prog->aux->sleepable)
+ (*sleepable)++;
+ else
+ (*non_sleepable)++;
+ item++;
+ }
+}
+
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ return bpf_prog_run_array_cg_cb(cgrp, atype, ctx, run_prog, retval,
+ ret_flags, NULL, NULL);
+}
+
unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
const struct bpf_insn *insn)
{
@@ -307,7 +363,7 @@ static void cgroup_bpf_release(struct work_struct *work)
old_array = rcu_dereference_protected(
cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
}
list_for_each_entry_safe(storage, stmp, storages, list_cg) {
@@ -451,7 +507,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
}
/**
@@ -491,7 +547,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
return 0;
cleanup:
for (i = 0; i < NR; i++)
- bpf_prog_array_free(arrays[i]);
+ bpf_prog_array_free_sleepable(arrays[i]);
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -525,7 +581,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
if (unlikely(desc->bpf.inactive)) {
- bpf_prog_array_free(desc->bpf.inactive);
+ bpf_prog_array_free_sleepable(desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
continue;
@@ -544,7 +600,7 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
- bpf_prog_array_free(desc->bpf.inactive);
+ bpf_prog_array_free_sleepable(desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -1740,7 +1796,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
#ifdef CONFIG_NET
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
- struct bpf_sockopt_buf *buf)
+ struct bpf_sockopt_buf *buf, bool force_alloc)
{
if (unlikely(max_optlen < 0))
return -EINVAL;
@@ -1752,7 +1808,7 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
max_optlen = PAGE_SIZE;
}
- if (max_optlen <= sizeof(buf->data)) {
+ if (max_optlen <= sizeof(buf->data) && !force_alloc) {
/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
* bytes avoid the cost of kzalloc.
*/
@@ -1773,7 +1829,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
struct bpf_sockopt_buf *buf)
{
- if (ctx->optval == buf->data)
+ if (ctx->optval == buf->data ||
+ ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
return;
kfree(ctx->optval);
}
@@ -1781,7 +1838,50 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
struct bpf_sockopt_buf *buf)
{
- return ctx->optval != buf->data;
+ return ctx->optval != buf->data &&
+ !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
+}
+
+struct filter_sockopt_cb_args {
+ struct bpf_sockopt_kern *ctx;
+ struct bpf_sockopt_buf *buf;
+ int max_optlen;
+};
+
+static int filter_setsockopt_progs_cb(void *arg,
+ const struct bpf_prog_array_item *item)
+{
+ struct filter_sockopt_cb_args *cb_args = arg;
+ struct bpf_sockopt_kern *ctx = cb_args->ctx;
+ char *optval = ctx->optval;
+ int sleepable_cnt, non_sleepable_cnt;
+ int max_optlen;
+
+ count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
+
+ if (!non_sleepable_cnt)
+ return 0;
+
+ /* Allocate a bit more than the initial user buffer for
+ * BPF program. The canonical use case is overriding
+ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+ */
+ max_optlen = max_t(int, 16, ctx->optlen);
+ /* We need to force allocating from heap if there are sleepable
+ * programs since they may created dynptrs from ctx->optval. In
+ * this case, dynptrs will try to free the buffer that is actually
+ * on the stack without this flag.
+ */
+ max_optlen = sockopt_alloc_buf(ctx, max_optlen, cb_args->buf,
+ !!sleepable_cnt);
+ if (max_optlen < 0)
+ return max_optlen;
+
+ if (copy_from_user(ctx->optval, optval,
+ min(ctx->optlen, max_optlen)) != 0)
+ return -EFAULT;
+
+ return 0;
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1795,27 +1895,22 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
.level = *level,
.optname = *optname,
};
+ struct filter_sockopt_cb_args cb_args = {
+ .ctx = &ctx,
+ .buf = &buf,
+ };
int ret, max_optlen;
- /* Allocate a bit more than the initial user buffer for
- * BPF program. The canonical use case is overriding
- * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
- */
- max_optlen = max_t(int, 16, *optlen);
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
- if (max_optlen < 0)
- return max_optlen;
-
+ max_optlen = *optlen;
ctx.optlen = *optlen;
-
- if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
- ret = -EFAULT;
- goto out;
- }
+ ctx.optval = optval;
+ ctx.optval_end = optval + *optlen;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
lock_sock(sk);
- ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
- &ctx, bpf_prog_run, 0, NULL);
+ ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL,
+ filter_setsockopt_progs_cb, &cb_args);
release_sock(sk);
if (ret)
@@ -1824,7 +1919,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
ret = 1;
- } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
+ } else if (ctx.optlen > (ctx.optval_end - ctx.optval) ||
+ ctx.optlen < -1) {
/* optlen is out of bounds */
if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
@@ -1846,6 +1942,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
if (ctx.optlen != 0) {
*optlen = ctx.optlen;
+ if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
+ return 0;
/* We've used bpf_sockopt_kern->buf as an intermediary
* storage, but the BPF program indicates that we need
* to pass this data to the kernel setsockopt handler.
@@ -1874,6 +1972,36 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
return ret;
}
+static int filter_getsockopt_progs_cb(void *arg,
+ const struct bpf_prog_array_item *item)
+{
+ struct filter_sockopt_cb_args *cb_args = arg;
+ struct bpf_sockopt_kern *ctx = cb_args->ctx;
+ int sleepable_cnt, non_sleepable_cnt;
+ int max_optlen;
+ char *optval;
+
+ count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
+
+ if (!non_sleepable_cnt)
+ return 0;
+
+ optval = ctx->optval;
+ max_optlen = sockopt_alloc_buf(ctx, cb_args->max_optlen,
+ cb_args->buf, false);
+ if (max_optlen < 0)
+ return max_optlen;
+
+ if (copy_from_user(ctx->optval, optval,
+ min(ctx->optlen, max_optlen)) != 0)
+ return -EFAULT;
+
+ ctx->flags = 0;
+ cb_args->max_optlen = max_optlen;
+
+ return 0;
+}
+
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
int __user *optlen, int max_optlen,
@@ -1887,15 +2015,16 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.optname = optname,
.current_task = current,
};
+ struct filter_sockopt_cb_args cb_args = {
+ .ctx = &ctx,
+ .buf = &buf,
+ .max_optlen = max_optlen,
+ };
int orig_optlen;
int ret;
orig_optlen = max_optlen;
ctx.optlen = max_optlen;
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
- if (max_optlen < 0)
- return max_optlen;
-
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1914,18 +2043,19 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
orig_optlen = ctx.optlen;
-
- if (copy_from_user(ctx.optval, optval,
- min(ctx.optlen, max_optlen)) != 0) {
- ret = -EFAULT;
- goto out;
- }
}
+ ctx.optval = optval;
+ ctx.optval_end = optval + max_optlen;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+
lock_sock(sk);
- ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
- &ctx, bpf_prog_run, retval, NULL);
+ ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL,
+ filter_getsockopt_progs_cb,
+ &cb_args);
release_sock(sk);
+ max_optlen = cb_args.max_optlen;
if (ret < 0)
goto out;
@@ -1942,7 +2072,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
if (ctx.optlen != 0) {
- if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+ if (optval &&
+ !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
+ copy_to_user(optval, ctx.optval, ctx.optlen)) {
ret = -EFAULT;
goto out;
}
@@ -2388,6 +2520,10 @@ static bool cg_sockopt_is_valid_access(int off, int size,
if (size != size_default)
return false;
return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, flags):
+ if (size != sizeof(__u32))
+ return false;
+ break;
default:
if (size != size_default)
return false;
@@ -2481,6 +2617,9 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sockopt, optval_end):
*insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
+ case offsetof(struct bpf_sockopt, flags):
+ *insn++ = CG_SOCKOPT_READ_FIELD(flags);
+ break;
}
return insn - insn_buf;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 132f25dab931..fbc0096693e7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19538,9 +19538,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
return false;
}
}
+
return prog->type == BPF_PROG_TYPE_LSM ||
prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
- prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
}
static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19562,7 +19564,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->aux->sleepable && !can_be_sleepable(prog)) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe,"
+ "cgroup, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 70da85200695..fff6f7dff408 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7138,6 +7138,13 @@ struct bpf_sockopt {
__s32 optname;
__s32 optlen;
__s32 retval;
+
+ __u32 flags;
+};
+
+enum bpf_sockopt_flags {
+ /* optval is a pointer to user space memory */
+ BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
};
struct bpf_pidns_info {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 17883f5a44b9..3be9270bbc33 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8766,7 +8766,9 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE),
SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE),
SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE),
+ SEC_DEF("cgroup/getsockopt.s", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
+ SEC_DEF("cgroup/setsockopt.s", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE),
SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE),
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
2023-08-11 4:31 ` [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt thinker.li
@ 2023-08-11 23:01 ` Stanislav Fomichev
2023-08-11 23:17 ` Kui-Feng Lee
2023-08-11 23:22 ` Kui-Feng Lee
0 siblings, 2 replies; 18+ messages in thread
From: Stanislav Fomichev @ 2023-08-11 23:01 UTC (permalink / raw)
To: thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
sinquersw, kuifeng
On 08/10, thinker.li@gmail.com wrote:
> From: Kui-Feng Lee <kuifeng@meta.com>
>
> Enable sleepable cgroup/{get,set}sockopt hooks.
>
> The sleepable BPF programs attached to cgroup/{get,set}sockopt hooks may
> received a pointer to the optval in user space instead of a kernel
> copy. ctx->user_optval and ctx->user_optval_end are the pointers to the
> begin and end of the user space buffer if receiving a user space
> buffer. ctx->optval and ctx->optval_end will be a kernel copy if receiving
> a kernel space buffer.
>
> A program receives a user space buffer if ctx->flags &
> BPF_SOCKOPT_FLAG_OPTVAL_USER is true, otherwise it receives a kernel space
> buffer. The BPF programs should not read/write from/to a user space buffer
> dirrectly. It should access the buffer through bpf_copy_from_user() and
> bpf_copy_to_user() provided in the following patches.
>
> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> ---
> include/linux/filter.h | 1 +
> include/uapi/linux/bpf.h | 7 +
> kernel/bpf/cgroup.c | 229 ++++++++++++++++++++++++++-------
> kernel/bpf/verifier.c | 7 +-
> tools/include/uapi/linux/bpf.h | 7 +
> tools/lib/bpf/libbpf.c | 2 +
> 6 files changed, 206 insertions(+), 47 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 761af6b3cf2b..54169a641d40 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1337,6 +1337,7 @@ struct bpf_sockopt_kern {
> s32 level;
> s32 optname;
> s32 optlen;
> + u32 flags;
> /* for retval in struct bpf_cg_run_ctx */
> struct task_struct *current_task;
> /* Temporary "register" for indirect stores to ppos. */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70da85200695..fff6f7dff408 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
> __s32 optname;
> __s32 optlen;
> __s32 retval;
> +
> + __u32 flags;
> +};
> +
> +enum bpf_sockopt_flags {
> + /* optval is a pointer to user space memory */
> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
> };
>
> struct bpf_pidns_info {
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5b2741aa0d9b..59489d9619a3 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -28,25 +28,46 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
> * function pointer.
> */
> static __always_inline int
> -bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> - enum cgroup_bpf_attach_type atype,
> - const void *ctx, bpf_prog_run_fn run_prog,
> - int retval, u32 *ret_flags)
> +bpf_prog_run_array_cg_cb(const struct cgroup_bpf *cgrp,
> + enum cgroup_bpf_attach_type atype,
> + const void *ctx, bpf_prog_run_fn run_prog,
> + int retval, u32 *ret_flags,
> + int (*progs_cb)(void *, const struct bpf_prog_array_item *),
> + void *progs_cb_arg)
> {
> const struct bpf_prog_array_item *item;
> const struct bpf_prog *prog;
> const struct bpf_prog_array *array;
> struct bpf_run_ctx *old_run_ctx;
> struct bpf_cg_run_ctx run_ctx;
> + bool do_sleepable;
> u32 func_ret;
> + int err;
> +
> + do_sleepable =
> + atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
>
> run_ctx.retval = retval;
> migrate_disable();
> - rcu_read_lock();
> + if (do_sleepable) {
> + might_fault();
> + rcu_read_lock_trace();
> + } else
> + rcu_read_lock();
> array = rcu_dereference(cgrp->effective[atype]);
> item = &array->items[0];
> +
> + if (progs_cb) {
> + err = progs_cb(progs_cb_arg, item);
> + if (err)
> + return err;
> + }
> +
> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> while ((prog = READ_ONCE(item->prog))) {
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_lock();
> +
> run_ctx.prog_item = item;
> func_ret = run_prog(prog, ctx);
> if (ret_flags) {
> @@ -56,13 +77,48 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
> run_ctx.retval = -EPERM;
> item++;
> +
> + if (do_sleepable && !prog->aux->sleepable)
> + rcu_read_unlock();
> }
> bpf_reset_run_ctx(old_run_ctx);
> - rcu_read_unlock();
> + if (do_sleepable)
> + rcu_read_unlock_trace();
> + else
> + rcu_read_unlock();
> migrate_enable();
> return run_ctx.retval;
> }
>
> +static __always_inline void
> +count_sleepable_prog_cg(const struct bpf_prog_array_item *item,
> + int *sleepable, int *non_sleepable)
> +{
> + const struct bpf_prog *prog;
> + bool ret = true;
> +
> + *sleepable = 0;
> + *non_sleepable = 0;
> +
> + while (ret && (prog = READ_ONCE(item->prog))) {
> + if (prog->aux->sleepable)
> + (*sleepable)++;
> + else
> + (*non_sleepable)++;
> + item++;
> + }
> +}
Can we maintain this info in the slow attach/detach path?
Running the loop every time we get a sockopt syscall another time
seems like something we can avoid? (even though it's a slow path,
seems like we can do better?)
> +
> +static __always_inline int
> +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
> + enum cgroup_bpf_attach_type atype,
> + const void *ctx, bpf_prog_run_fn run_prog,
> + int retval, u32 *ret_flags)
> +{
> + return bpf_prog_run_array_cg_cb(cgrp, atype, ctx, run_prog, retval,
> + ret_flags, NULL, NULL);
> +}
> +
> unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
> const struct bpf_insn *insn)
> {
> @@ -307,7 +363,7 @@ static void cgroup_bpf_release(struct work_struct *work)
> old_array = rcu_dereference_protected(
> cgrp->bpf.effective[atype],
> lockdep_is_held(&cgroup_mutex));
> - bpf_prog_array_free(old_array);
> + bpf_prog_array_free_sleepable(old_array);
> }
>
> list_for_each_entry_safe(storage, stmp, storages, list_cg) {
> @@ -451,7 +507,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
> /* free prog array after grace period, since __cgroup_bpf_run_*()
> * might be still walking the array
> */
> - bpf_prog_array_free(old_array);
> + bpf_prog_array_free_sleepable(old_array);
> }
>
> /**
> @@ -491,7 +547,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
> return 0;
> cleanup:
> for (i = 0; i < NR; i++)
> - bpf_prog_array_free(arrays[i]);
> + bpf_prog_array_free_sleepable(arrays[i]);
>
> for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
> cgroup_bpf_put(p);
> @@ -525,7 +581,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>
> if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
> if (unlikely(desc->bpf.inactive)) {
> - bpf_prog_array_free(desc->bpf.inactive);
> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
> desc->bpf.inactive = NULL;
> }
> continue;
> @@ -544,7 +600,7 @@ static int update_effective_progs(struct cgroup *cgrp,
> css_for_each_descendant_pre(css, &cgrp->self) {
> struct cgroup *desc = container_of(css, struct cgroup, self);
>
> - bpf_prog_array_free(desc->bpf.inactive);
> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
> desc->bpf.inactive = NULL;
> }
>
> @@ -1740,7 +1796,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>
> #ifdef CONFIG_NET
> static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> - struct bpf_sockopt_buf *buf)
> + struct bpf_sockopt_buf *buf, bool force_alloc)
> {
> if (unlikely(max_optlen < 0))
> return -EINVAL;
> @@ -1752,7 +1808,7 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> max_optlen = PAGE_SIZE;
> }
>
> - if (max_optlen <= sizeof(buf->data)) {
> + if (max_optlen <= sizeof(buf->data) && !force_alloc) {
> /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
> * bytes avoid the cost of kzalloc.
> */
> @@ -1773,7 +1829,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - if (ctx->optval == buf->data)
> + if (ctx->optval == buf->data ||
> + ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
> return;
> kfree(ctx->optval);
> }
> @@ -1781,7 +1838,50 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> struct bpf_sockopt_buf *buf)
> {
> - return ctx->optval != buf->data;
> + return ctx->optval != buf->data &&
> + !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
> +}
> +
> +struct filter_sockopt_cb_args {
> + struct bpf_sockopt_kern *ctx;
> + struct bpf_sockopt_buf *buf;
> + int max_optlen;
> +};
> +
> +static int filter_setsockopt_progs_cb(void *arg,
> + const struct bpf_prog_array_item *item)
> +{
> + struct filter_sockopt_cb_args *cb_args = arg;
> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
> + char *optval = ctx->optval;
> + int sleepable_cnt, non_sleepable_cnt;
> + int max_optlen;
> +
> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
> +
> + if (!non_sleepable_cnt)
> + return 0;
> +
> + /* Allocate a bit more than the initial user buffer for
> + * BPF program. The canonical use case is overriding
> + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> + */
> + max_optlen = max_t(int, 16, ctx->optlen);
> + /* We need to force allocating from heap if there are sleepable
> + * programs since they may created dynptrs from ctx->optval. In
> + * this case, dynptrs will try to free the buffer that is actually
> + * on the stack without this flag.
> + */
> + max_optlen = sockopt_alloc_buf(ctx, max_optlen, cb_args->buf,
> + !!sleepable_cnt);
> + if (max_optlen < 0)
> + return max_optlen;
> +
> + if (copy_from_user(ctx->optval, optval,
> + min(ctx->optlen, max_optlen)) != 0)
> + return -EFAULT;
> +
> + return 0;
> }
>
> int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1795,27 +1895,22 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> .level = *level,
> .optname = *optname,
> };
> + struct filter_sockopt_cb_args cb_args = {
> + .ctx = &ctx,
> + .buf = &buf,
> + };
> int ret, max_optlen;
>
> - /* Allocate a bit more than the initial user buffer for
> - * BPF program. The canonical use case is overriding
> - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
> - */
> - max_optlen = max_t(int, 16, *optlen);
> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> - if (max_optlen < 0)
> - return max_optlen;
> -
> + max_optlen = *optlen;
> ctx.optlen = *optlen;
> -
> - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
> - ret = -EFAULT;
> - goto out;
> - }
> + ctx.optval = optval;
> + ctx.optval_end = optval + *optlen;
> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
>
> lock_sock(sk);
> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
> - &ctx, bpf_prog_run, 0, NULL);
> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
> + &ctx, bpf_prog_run, 0, NULL,
> + filter_setsockopt_progs_cb, &cb_args);
> release_sock(sk);
>
> if (ret)
> @@ -1824,7 +1919,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> if (ctx.optlen == -1) {
> /* optlen set to -1, bypass kernel */
> ret = 1;
> - } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
> + } else if (ctx.optlen > (ctx.optval_end - ctx.optval) ||
> + ctx.optlen < -1) {
> /* optlen is out of bounds */
> if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
> pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
> @@ -1846,6 +1942,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> */
> if (ctx.optlen != 0) {
> *optlen = ctx.optlen;
> + if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
> + return 0;
> /* We've used bpf_sockopt_kern->buf as an intermediary
> * storage, but the BPF program indicates that we need
> * to pass this data to the kernel setsockopt handler.
> @@ -1874,6 +1972,36 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> return ret;
> }
>
> +static int filter_getsockopt_progs_cb(void *arg,
> + const struct bpf_prog_array_item *item)
> +{
> + struct filter_sockopt_cb_args *cb_args = arg;
> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
> + int sleepable_cnt, non_sleepable_cnt;
> + int max_optlen;
> + char *optval;
> +
> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
> +
> + if (!non_sleepable_cnt)
> + return 0;
> +
> + optval = ctx->optval;
> + max_optlen = sockopt_alloc_buf(ctx, cb_args->max_optlen,
> + cb_args->buf, false);
> + if (max_optlen < 0)
> + return max_optlen;
> +
> + if (copy_from_user(ctx->optval, optval,
> + min(ctx->optlen, max_optlen)) != 0)
> + return -EFAULT;
> +
> + ctx->flags = 0;
> + cb_args->max_optlen = max_optlen;
> +
> + return 0;
> +}
> +
> int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
> int optname, char __user *optval,
> int __user *optlen, int max_optlen,
> @@ -1887,15 +2015,16 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
> .optname = optname,
> .current_task = current,
> };
> + struct filter_sockopt_cb_args cb_args = {
> + .ctx = &ctx,
> + .buf = &buf,
> + .max_optlen = max_optlen,
> + };
> int orig_optlen;
> int ret;
>
> orig_optlen = max_optlen;
> ctx.optlen = max_optlen;
> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
> - if (max_optlen < 0)
> - return max_optlen;
> -
> if (!retval) {
> /* If kernel getsockopt finished successfully,
> * copy whatever was returned to the user back
> @@ -1914,18 +2043,19 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
> goto out;
> }
> orig_optlen = ctx.optlen;
> -
> - if (copy_from_user(ctx.optval, optval,
> - min(ctx.optlen, max_optlen)) != 0) {
> - ret = -EFAULT;
> - goto out;
> - }
> }
>
> + ctx.optval = optval;
> + ctx.optval_end = optval + max_optlen;
> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
> +
> lock_sock(sk);
> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
> - &ctx, bpf_prog_run, retval, NULL);
> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_GETSOCKOPT,
> + &ctx, bpf_prog_run, retval, NULL,
> + filter_getsockopt_progs_cb,
> + &cb_args);
> release_sock(sk);
> + max_optlen = cb_args.max_optlen;
>
> if (ret < 0)
> goto out;
> @@ -1942,7 +2072,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
> }
>
> if (ctx.optlen != 0) {
> - if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
> + if (optval &&
> + !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
> + copy_to_user(optval, ctx.optval, ctx.optlen)) {
> ret = -EFAULT;
> goto out;
> }
> @@ -2388,6 +2520,10 @@ static bool cg_sockopt_is_valid_access(int off, int size,
> if (size != size_default)
> return false;
> return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
> + case offsetof(struct bpf_sockopt, flags):
> + if (size != sizeof(__u32))
> + return false;
> + break;
> default:
> if (size != size_default)
> return false;
> @@ -2481,6 +2617,9 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
> case offsetof(struct bpf_sockopt, optval_end):
> *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
> break;
> + case offsetof(struct bpf_sockopt, flags):
> + *insn++ = CG_SOCKOPT_READ_FIELD(flags);
> + break;
> }
>
> return insn - insn_buf;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 132f25dab931..fbc0096693e7 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -19538,9 +19538,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
> return false;
> }
> }
> +
Extra newline?
> return prog->type == BPF_PROG_TYPE_LSM ||
> prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
> - prog->type == BPF_PROG_TYPE_STRUCT_OPS;
> + prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
> + prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
> }
>
> static int check_attach_btf_id(struct bpf_verifier_env *env)
> @@ -19562,7 +19564,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> }
>
> if (prog->aux->sleepable && !can_be_sleepable(prog)) {
> - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
> + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe,"
> + "cgroup, and struct_ops programs can be sleepable\n");
I don't think we split the lines even if they are too long.
Makes them ungreppable :-(
> return -EINVAL;
> }
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 70da85200695..fff6f7dff408 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
> __s32 optname;
> __s32 optlen;
> __s32 retval;
> +
> + __u32 flags;
> +};
> +
> +enum bpf_sockopt_flags {
> + /* optval is a pointer to user space memory */
> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
> };
Why do we need to export the flag? Do we still want the users
to do something based on the presence/absence of
BPF_SOCKOPT_FLAG_OPTVAL_USER?
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
2023-08-11 23:01 ` Stanislav Fomichev
@ 2023-08-11 23:17 ` Kui-Feng Lee
2023-08-11 23:22 ` Kui-Feng Lee
1 sibling, 0 replies; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-11 23:17 UTC (permalink / raw)
To: Stanislav Fomichev, thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
kuifeng
On 8/11/23 16:01, Stanislav Fomichev wrote:
> On 08/10, thinker.li@gmail.com wrote:
>> From: Kui-Feng Lee <kuifeng@meta.com>
>>
>> Enable sleepable cgroup/{get,set}sockopt hooks.
>>
>> The sleepable BPF programs attached to cgroup/{get,set}sockopt hooks may
>> receive a pointer to the optval in user space instead of a kernel
>> copy. ctx->user_optval and ctx->user_optval_end are the pointers to the
>> begin and end of the user space buffer if receiving a user space
>> buffer. ctx->optval and ctx->optval_end will be a kernel copy if receiving
>> a kernel space buffer.
>>
>> A program receives a user space buffer if ctx->flags &
>> BPF_SOCKOPT_FLAG_OPTVAL_USER is true, otherwise it receives a kernel space
>> buffer. The BPF programs should not read/write from/to a user space buffer
>> directly. They should access the buffer through bpf_copy_from_user() and
>> bpf_copy_to_user() provided in the following patches.
>>
>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>> ---
>> include/linux/filter.h | 1 +
>> include/uapi/linux/bpf.h | 7 +
>> kernel/bpf/cgroup.c | 229 ++++++++++++++++++++++++++-------
>> kernel/bpf/verifier.c | 7 +-
>> tools/include/uapi/linux/bpf.h | 7 +
>> tools/lib/bpf/libbpf.c | 2 +
>> 6 files changed, 206 insertions(+), 47 deletions(-)
>>
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 761af6b3cf2b..54169a641d40 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1337,6 +1337,7 @@ struct bpf_sockopt_kern {
>> s32 level;
>> s32 optname;
>> s32 optlen;
>> + u32 flags;
>> /* for retval in struct bpf_cg_run_ctx */
>> struct task_struct *current_task;
>> /* Temporary "register" for indirect stores to ppos. */
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 70da85200695..fff6f7dff408 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
>> __s32 optname;
>> __s32 optlen;
>> __s32 retval;
>> +
>> + __u32 flags;
>> +};
>> +
>> +enum bpf_sockopt_flags {
>> + /* optval is a pointer to user space memory */
>> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
>> };
>>
>> struct bpf_pidns_info {
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 5b2741aa0d9b..59489d9619a3 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -28,25 +28,46 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
>> * function pointer.
>> */
>> static __always_inline int
>> -bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> - enum cgroup_bpf_attach_type atype,
>> - const void *ctx, bpf_prog_run_fn run_prog,
>> - int retval, u32 *ret_flags)
>> +bpf_prog_run_array_cg_cb(const struct cgroup_bpf *cgrp,
>> + enum cgroup_bpf_attach_type atype,
>> + const void *ctx, bpf_prog_run_fn run_prog,
>> + int retval, u32 *ret_flags,
>> + int (*progs_cb)(void *, const struct bpf_prog_array_item *),
>> + void *progs_cb_arg)
>> {
>> const struct bpf_prog_array_item *item;
>> const struct bpf_prog *prog;
>> const struct bpf_prog_array *array;
>> struct bpf_run_ctx *old_run_ctx;
>> struct bpf_cg_run_ctx run_ctx;
>> + bool do_sleepable;
>> u32 func_ret;
>> + int err;
>> +
>> + do_sleepable =
>> + atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
>>
>> run_ctx.retval = retval;
>> migrate_disable();
>> - rcu_read_lock();
>> + if (do_sleepable) {
>> + might_fault();
>> + rcu_read_lock_trace();
>> + } else
>> + rcu_read_lock();
>> array = rcu_dereference(cgrp->effective[atype]);
>> item = &array->items[0];
>> +
>> + if (progs_cb) {
>> + err = progs_cb(progs_cb_arg, item);
>> + if (err)
>> + return err;
>> + }
>> +
>> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
>> while ((prog = READ_ONCE(item->prog))) {
>> + if (do_sleepable && !prog->aux->sleepable)
>> + rcu_read_lock();
>> +
>> run_ctx.prog_item = item;
>> func_ret = run_prog(prog, ctx);
>> if (ret_flags) {
>> @@ -56,13 +77,48 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
>> run_ctx.retval = -EPERM;
>> item++;
>> +
>> + if (do_sleepable && !prog->aux->sleepable)
>> + rcu_read_unlock();
>> }
>> bpf_reset_run_ctx(old_run_ctx);
>> - rcu_read_unlock();
>> + if (do_sleepable)
>> + rcu_read_unlock_trace();
>> + else
>> + rcu_read_unlock();
>> migrate_enable();
>> return run_ctx.retval;
>> }
>>
>> +static __always_inline void
>> +count_sleepable_prog_cg(const struct bpf_prog_array_item *item,
>> + int *sleepable, int *non_sleepable)
>> +{
>> + const struct bpf_prog *prog;
>> + bool ret = true;
>> +
>> + *sleepable = 0;
>> + *non_sleepable = 0;
>> +
>> + while (ret && (prog = READ_ONCE(item->prog))) {
>> + if (prog->aux->sleepable)
>> + (*sleepable)++;
>> + else
>> + (*non_sleepable)++;
>> + item++;
>> + }
>> +}
>
> Can we maintain this info in the slow attach/detach path?
> Running the loop every time we get a sockopt syscall another time
> seems like something we can avoid? (even though it's a slow path,
> seems like we can do better?)
I can move this function into compute_effective_progs().
>
>> +
>> +static __always_inline int
>> +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> + enum cgroup_bpf_attach_type atype,
>> + const void *ctx, bpf_prog_run_fn run_prog,
>> + int retval, u32 *ret_flags)
>> +{
>> + return bpf_prog_run_array_cg_cb(cgrp, atype, ctx, run_prog, retval,
>> + ret_flags, NULL, NULL);
>> +}
>> +
>> unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
>> const struct bpf_insn *insn)
>> {
>> @@ -307,7 +363,7 @@ static void cgroup_bpf_release(struct work_struct *work)
>> old_array = rcu_dereference_protected(
>> cgrp->bpf.effective[atype],
>> lockdep_is_held(&cgroup_mutex));
>> - bpf_prog_array_free(old_array);
>> + bpf_prog_array_free_sleepable(old_array);
>> }
>>
>> list_for_each_entry_safe(storage, stmp, storages, list_cg) {
>> @@ -451,7 +507,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
>> /* free prog array after grace period, since __cgroup_bpf_run_*()
>> * might be still walking the array
>> */
>> - bpf_prog_array_free(old_array);
>> + bpf_prog_array_free_sleepable(old_array);
>> }
>>
>> /**
>> @@ -491,7 +547,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
>> return 0;
>> cleanup:
>> for (i = 0; i < NR; i++)
>> - bpf_prog_array_free(arrays[i]);
>> + bpf_prog_array_free_sleepable(arrays[i]);
>>
>> for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
>> cgroup_bpf_put(p);
>> @@ -525,7 +581,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>>
>> if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
>> if (unlikely(desc->bpf.inactive)) {
>> - bpf_prog_array_free(desc->bpf.inactive);
>> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
>> desc->bpf.inactive = NULL;
>> }
>> continue;
>> @@ -544,7 +600,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>> css_for_each_descendant_pre(css, &cgrp->self) {
>> struct cgroup *desc = container_of(css, struct cgroup, self);
>>
>> - bpf_prog_array_free(desc->bpf.inactive);
>> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
>> desc->bpf.inactive = NULL;
>> }
>>
>> @@ -1740,7 +1796,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>>
>> #ifdef CONFIG_NET
>> static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> - struct bpf_sockopt_buf *buf)
>> + struct bpf_sockopt_buf *buf, bool force_alloc)
>> {
>> if (unlikely(max_optlen < 0))
>> return -EINVAL;
>> @@ -1752,7 +1808,7 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> max_optlen = PAGE_SIZE;
>> }
>>
>> - if (max_optlen <= sizeof(buf->data)) {
>> + if (max_optlen <= sizeof(buf->data) && !force_alloc) {
>> /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
>> * bytes avoid the cost of kzalloc.
>> */
>> @@ -1773,7 +1829,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>> struct bpf_sockopt_buf *buf)
>> {
>> - if (ctx->optval == buf->data)
>> + if (ctx->optval == buf->data ||
>> + ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
>> return;
>> kfree(ctx->optval);
>> }
>> @@ -1781,7 +1838,50 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>> static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
>> struct bpf_sockopt_buf *buf)
>> {
>> - return ctx->optval != buf->data;
>> + return ctx->optval != buf->data &&
>> + !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
>> +}
>> +
>> +struct filter_sockopt_cb_args {
>> + struct bpf_sockopt_kern *ctx;
>> + struct bpf_sockopt_buf *buf;
>> + int max_optlen;
>> +};
>> +
>> +static int filter_setsockopt_progs_cb(void *arg,
>> + const struct bpf_prog_array_item *item)
>> +{
>> + struct filter_sockopt_cb_args *cb_args = arg;
>> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
>> + char *optval = ctx->optval;
>> + int sleepable_cnt, non_sleepable_cnt;
>> + int max_optlen;
>> +
>> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
>> +
>> + if (!non_sleepable_cnt)
>> + return 0;
>> +
>> + /* Allocate a bit more than the initial user buffer for
>> + * BPF program. The canonical use case is overriding
>> + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
>> + */
>> + max_optlen = max_t(int, 16, ctx->optlen);
>> + /* We need to force allocating from heap if there are sleepable
>> + * programs since they may created dynptrs from ctx->optval. In
>> + * this case, dynptrs will try to free the buffer that is actually
>> + * on the stack without this flag.
>> + */
>> + max_optlen = sockopt_alloc_buf(ctx, max_optlen, cb_args->buf,
>> + !!sleepable_cnt);
>> + if (max_optlen < 0)
>> + return max_optlen;
>> +
>> + if (copy_from_user(ctx->optval, optval,
>> + min(ctx->optlen, max_optlen)) != 0)
>> + return -EFAULT;
>> +
>> + return 0;
>> }
>>
>> int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> @@ -1795,27 +1895,22 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> .level = *level,
>> .optname = *optname,
>> };
>> + struct filter_sockopt_cb_args cb_args = {
>> + .ctx = &ctx,
>> + .buf = &buf,
>> + };
>> int ret, max_optlen;
>>
>> - /* Allocate a bit more than the initial user buffer for
>> - * BPF program. The canonical use case is overriding
>> - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
>> - */
>> - max_optlen = max_t(int, 16, *optlen);
>> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>> - if (max_optlen < 0)
>> - return max_optlen;
>> -
>> + max_optlen = *optlen;
>> ctx.optlen = *optlen;
>> -
>> - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
>> - ret = -EFAULT;
>> - goto out;
>> - }
>> + ctx.optval = optval;
>> + ctx.optval_end = optval + *optlen;
>> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
>>
>> lock_sock(sk);
>> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
>> - &ctx, bpf_prog_run, 0, NULL);
>> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
>> + &ctx, bpf_prog_run, 0, NULL,
>> + filter_setsockopt_progs_cb, &cb_args);
>> release_sock(sk);
>>
>> if (ret)
>> @@ -1824,7 +1919,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> if (ctx.optlen == -1) {
>> /* optlen set to -1, bypass kernel */
>> ret = 1;
>> - } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
>> + } else if (ctx.optlen > (ctx.optval_end - ctx.optval) ||
>> + ctx.optlen < -1) {
>> /* optlen is out of bounds */
>> if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
>> pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
>> @@ -1846,6 +1942,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> */
>> if (ctx.optlen != 0) {
>> *optlen = ctx.optlen;
>> + if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
>> + return 0;
>> /* We've used bpf_sockopt_kern->buf as an intermediary
>> * storage, but the BPF program indicates that we need
>> * to pass this data to the kernel setsockopt handler.
>> @@ -1874,6 +1972,36 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> return ret;
>> }
>>
>> +static int filter_getsockopt_progs_cb(void *arg,
>> + const struct bpf_prog_array_item *item)
>> +{
>> + struct filter_sockopt_cb_args *cb_args = arg;
>> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
>> + int sleepable_cnt, non_sleepable_cnt;
>> + int max_optlen;
>> + char *optval;
>> +
>> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
>> +
>> + if (!non_sleepable_cnt)
>> + return 0;
>> +
>> + optval = ctx->optval;
>> + max_optlen = sockopt_alloc_buf(ctx, cb_args->max_optlen,
>> + cb_args->buf, false);
>> + if (max_optlen < 0)
>> + return max_optlen;
>> +
>> + if (copy_from_user(ctx->optval, optval,
>> + min(ctx->optlen, max_optlen)) != 0)
>> + return -EFAULT;
>> +
>> + ctx->flags = 0;
>> + cb_args->max_optlen = max_optlen;
>> +
>> + return 0;
>> +}
>> +
>> int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> int optname, char __user *optval,
>> int __user *optlen, int max_optlen,
>> @@ -1887,15 +2015,16 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> .optname = optname,
>> .current_task = current,
>> };
>> + struct filter_sockopt_cb_args cb_args = {
>> + .ctx = &ctx,
>> + .buf = &buf,
>> + .max_optlen = max_optlen,
>> + };
>> int orig_optlen;
>> int ret;
>>
>> orig_optlen = max_optlen;
>> ctx.optlen = max_optlen;
>> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>> - if (max_optlen < 0)
>> - return max_optlen;
>> -
>> if (!retval) {
>> /* If kernel getsockopt finished successfully,
>> * copy whatever was returned to the user back
>> @@ -1914,18 +2043,19 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> goto out;
>> }
>> orig_optlen = ctx.optlen;
>> -
>> - if (copy_from_user(ctx.optval, optval,
>> - min(ctx.optlen, max_optlen)) != 0) {
>> - ret = -EFAULT;
>> - goto out;
>> - }
>> }
>>
>> + ctx.optval = optval;
>> + ctx.optval_end = optval + max_optlen;
>> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
>> +
>> lock_sock(sk);
>> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
>> - &ctx, bpf_prog_run, retval, NULL);
>> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_GETSOCKOPT,
>> + &ctx, bpf_prog_run, retval, NULL,
>> + filter_getsockopt_progs_cb,
>> + &cb_args);
>> release_sock(sk);
>> + max_optlen = cb_args.max_optlen;
>>
>> if (ret < 0)
>> goto out;
>> @@ -1942,7 +2072,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> }
>>
>> if (ctx.optlen != 0) {
>> - if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
>> + if (optval &&
>> + !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
>> + copy_to_user(optval, ctx.optval, ctx.optlen)) {
>> ret = -EFAULT;
>> goto out;
>> }
>> @@ -2388,6 +2520,10 @@ static bool cg_sockopt_is_valid_access(int off, int size,
>> if (size != size_default)
>> return false;
>> return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
>> + case offsetof(struct bpf_sockopt, flags):
>> + if (size != sizeof(__u32))
>> + return false;
>> + break;
>> default:
>> if (size != size_default)
>> return false;
>> @@ -2481,6 +2617,9 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
>> case offsetof(struct bpf_sockopt, optval_end):
>> *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
>> break;
>> + case offsetof(struct bpf_sockopt, flags):
>> + *insn++ = CG_SOCKOPT_READ_FIELD(flags);
>> + break;
>> }
>>
>> return insn - insn_buf;
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 132f25dab931..fbc0096693e7 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -19538,9 +19538,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
>> return false;
>> }
>> }
>> +
>
> Extra newline?
>
>> return prog->type == BPF_PROG_TYPE_LSM ||
>> prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
>> - prog->type == BPF_PROG_TYPE_STRUCT_OPS;
>> + prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
>> + prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
>> }
>>
>> static int check_attach_btf_id(struct bpf_verifier_env *env)
>> @@ -19562,7 +19564,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
>> }
>>
>> if (prog->aux->sleepable && !can_be_sleepable(prog)) {
>> - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
>> + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe,"
>> + "cgroup, and struct_ops programs can be sleepable\n");
>
> I don't think we split the lines even if they are too long.
> Makes them ungreppable :-(
>
>> return -EINVAL;
>> }
>>
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 70da85200695..fff6f7dff408 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
>> __s32 optname;
>> __s32 optlen;
>> __s32 retval;
>> +
>> + __u32 flags;
>> +};
>> +
>> +enum bpf_sockopt_flags {
>> + /* optval is a pointer to user space memory */
>> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
>> };
>
> Why do we need to export the flag? Do we still want the users
> to do something based on the presence/absence of
> BPF_SOCKOPT_FLAG_OPTVAL_USER?
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt.
2023-08-11 23:01 ` Stanislav Fomichev
2023-08-11 23:17 ` Kui-Feng Lee
@ 2023-08-11 23:22 ` Kui-Feng Lee
1 sibling, 0 replies; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-11 23:22 UTC (permalink / raw)
To: Stanislav Fomichev, thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
kuifeng
On 8/11/23 16:01, Stanislav Fomichev wrote:
> On 08/10, thinker.li@gmail.com wrote:
>> From: Kui-Feng Lee <kuifeng@meta.com>
>>
>> Enable sleepable cgroup/{get,set}sockopt hooks.
>>
>> The sleepable BPF programs attached to cgroup/{get,set}sockopt hooks may
>> receive a pointer to the optval in user space instead of a kernel
>> copy. ctx->user_optval and ctx->user_optval_end are the pointers to the
>> begin and end of the user space buffer if receiving a user space
>> buffer. ctx->optval and ctx->optval_end will be a kernel copy if receiving
>> a kernel space buffer.
>>
>> A program receives a user space buffer if ctx->flags &
>> BPF_SOCKOPT_FLAG_OPTVAL_USER is true, otherwise it receives a kernel space
>> buffer. The BPF programs should not read/write from/to a user space buffer
>> directly. They should access the buffer through bpf_copy_from_user() and
>> bpf_copy_to_user() provided in the following patches.
>>
>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>> ---
>> include/linux/filter.h | 1 +
>> include/uapi/linux/bpf.h | 7 +
>> kernel/bpf/cgroup.c | 229 ++++++++++++++++++++++++++-------
>> kernel/bpf/verifier.c | 7 +-
>> tools/include/uapi/linux/bpf.h | 7 +
>> tools/lib/bpf/libbpf.c | 2 +
>> 6 files changed, 206 insertions(+), 47 deletions(-)
>>
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 761af6b3cf2b..54169a641d40 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1337,6 +1337,7 @@ struct bpf_sockopt_kern {
>> s32 level;
>> s32 optname;
>> s32 optlen;
>> + u32 flags;
>> /* for retval in struct bpf_cg_run_ctx */
>> struct task_struct *current_task;
>> /* Temporary "register" for indirect stores to ppos. */
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 70da85200695..fff6f7dff408 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
>> __s32 optname;
>> __s32 optlen;
>> __s32 retval;
>> +
>> + __u32 flags;
>> +};
>> +
>> +enum bpf_sockopt_flags {
>> + /* optval is a pointer to user space memory */
>> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
>> };
>>
>> struct bpf_pidns_info {
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 5b2741aa0d9b..59489d9619a3 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -28,25 +28,46 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
>> * function pointer.
>> */
>> static __always_inline int
>> -bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> - enum cgroup_bpf_attach_type atype,
>> - const void *ctx, bpf_prog_run_fn run_prog,
>> - int retval, u32 *ret_flags)
>> +bpf_prog_run_array_cg_cb(const struct cgroup_bpf *cgrp,
>> + enum cgroup_bpf_attach_type atype,
>> + const void *ctx, bpf_prog_run_fn run_prog,
>> + int retval, u32 *ret_flags,
>> + int (*progs_cb)(void *, const struct bpf_prog_array_item *),
>> + void *progs_cb_arg)
>> {
>> const struct bpf_prog_array_item *item;
>> const struct bpf_prog *prog;
>> const struct bpf_prog_array *array;
>> struct bpf_run_ctx *old_run_ctx;
>> struct bpf_cg_run_ctx run_ctx;
>> + bool do_sleepable;
>> u32 func_ret;
>> + int err;
>> +
>> + do_sleepable =
>> + atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
>>
>> run_ctx.retval = retval;
>> migrate_disable();
>> - rcu_read_lock();
>> + if (do_sleepable) {
>> + might_fault();
>> + rcu_read_lock_trace();
>> + } else
>> + rcu_read_lock();
>> array = rcu_dereference(cgrp->effective[atype]);
>> item = &array->items[0];
>> +
>> + if (progs_cb) {
>> + err = progs_cb(progs_cb_arg, item);
>> + if (err)
>> + return err;
>> + }
>> +
>> old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
>> while ((prog = READ_ONCE(item->prog))) {
>> + if (do_sleepable && !prog->aux->sleepable)
>> + rcu_read_lock();
>> +
>> run_ctx.prog_item = item;
>> func_ret = run_prog(prog, ctx);
>> if (ret_flags) {
>> @@ -56,13 +77,48 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
>> run_ctx.retval = -EPERM;
>> item++;
>> +
>> + if (do_sleepable && !prog->aux->sleepable)
>> + rcu_read_unlock();
>> }
>> bpf_reset_run_ctx(old_run_ctx);
>> - rcu_read_unlock();
>> + if (do_sleepable)
>> + rcu_read_unlock_trace();
>> + else
>> + rcu_read_unlock();
>> migrate_enable();
>> return run_ctx.retval;
>> }
>>
>> +static __always_inline void
>> +count_sleepable_prog_cg(const struct bpf_prog_array_item *item,
>> + int *sleepable, int *non_sleepable)
>> +{
>> + const struct bpf_prog *prog;
>> + bool ret = true;
>> +
>> + *sleepable = 0;
>> + *non_sleepable = 0;
>> +
>> + while (ret && (prog = READ_ONCE(item->prog))) {
>> + if (prog->aux->sleepable)
>> + (*sleepable)++;
>> + else
>> + (*non_sleepable)++;
>> + item++;
>> + }
>> +}
>
> Can we maintain this info in the slow attach/detach path?
> Running the loop every time we get a sockopt syscall another time
> seems like something we can avoid? (even though it's a slow path,
> seems like we can do better?)
>
>> +
>> +static __always_inline int
>> +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
>> + enum cgroup_bpf_attach_type atype,
>> + const void *ctx, bpf_prog_run_fn run_prog,
>> + int retval, u32 *ret_flags)
>> +{
>> + return bpf_prog_run_array_cg_cb(cgrp, atype, ctx, run_prog, retval,
>> + ret_flags, NULL, NULL);
>> +}
>> +
>> unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
>> const struct bpf_insn *insn)
>> {
>> @@ -307,7 +363,7 @@ static void cgroup_bpf_release(struct work_struct *work)
>> old_array = rcu_dereference_protected(
>> cgrp->bpf.effective[atype],
>> lockdep_is_held(&cgroup_mutex));
>> - bpf_prog_array_free(old_array);
>> + bpf_prog_array_free_sleepable(old_array);
>> }
>>
>> list_for_each_entry_safe(storage, stmp, storages, list_cg) {
>> @@ -451,7 +507,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
>> /* free prog array after grace period, since __cgroup_bpf_run_*()
>> * might be still walking the array
>> */
>> - bpf_prog_array_free(old_array);
>> + bpf_prog_array_free_sleepable(old_array);
>> }
>>
>> /**
>> @@ -491,7 +547,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
>> return 0;
>> cleanup:
>> for (i = 0; i < NR; i++)
>> - bpf_prog_array_free(arrays[i]);
>> + bpf_prog_array_free_sleepable(arrays[i]);
>>
>> for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
>> cgroup_bpf_put(p);
>> @@ -525,7 +581,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>>
>> if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
>> if (unlikely(desc->bpf.inactive)) {
>> - bpf_prog_array_free(desc->bpf.inactive);
>> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
>> desc->bpf.inactive = NULL;
>> }
>> continue;
>> @@ -544,7 +600,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>> css_for_each_descendant_pre(css, &cgrp->self) {
>> struct cgroup *desc = container_of(css, struct cgroup, self);
>>
>> - bpf_prog_array_free(desc->bpf.inactive);
>> + bpf_prog_array_free_sleepable(desc->bpf.inactive);
>> desc->bpf.inactive = NULL;
>> }
>>
>> @@ -1740,7 +1796,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>>
>> #ifdef CONFIG_NET
>> static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> - struct bpf_sockopt_buf *buf)
>> + struct bpf_sockopt_buf *buf, bool force_alloc)
>> {
>> if (unlikely(max_optlen < 0))
>> return -EINVAL;
>> @@ -1752,7 +1808,7 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> max_optlen = PAGE_SIZE;
>> }
>>
>> - if (max_optlen <= sizeof(buf->data)) {
>> + if (max_optlen <= sizeof(buf->data) && !force_alloc) {
>> /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
>> * bytes avoid the cost of kzalloc.
>> */
>> @@ -1773,7 +1829,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
>> static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>> struct bpf_sockopt_buf *buf)
>> {
>> - if (ctx->optval == buf->data)
>> + if (ctx->optval == buf->data ||
>> + ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
>> return;
>> kfree(ctx->optval);
>> }
>> @@ -1781,7 +1838,50 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
>> static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
>> struct bpf_sockopt_buf *buf)
>> {
>> - return ctx->optval != buf->data;
>> + return ctx->optval != buf->data &&
>> + !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
>> +}
>> +
>> +struct filter_sockopt_cb_args {
>> + struct bpf_sockopt_kern *ctx;
>> + struct bpf_sockopt_buf *buf;
>> + int max_optlen;
>> +};
>> +
>> +static int filter_setsockopt_progs_cb(void *arg,
>> + const struct bpf_prog_array_item *item)
>> +{
>> + struct filter_sockopt_cb_args *cb_args = arg;
>> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
>> + char *optval = ctx->optval;
>> + int sleepable_cnt, non_sleepable_cnt;
>> + int max_optlen;
>> +
>> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
>> +
>> + if (!non_sleepable_cnt)
>> + return 0;
>> +
>> + /* Allocate a bit more than the initial user buffer for
>> + * BPF program. The canonical use case is overriding
>> + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
>> + */
>> + max_optlen = max_t(int, 16, ctx->optlen);
>> + /* We need to force allocating from heap if there are sleepable
>> + * programs since they may created dynptrs from ctx->optval. In
>> + * this case, dynptrs will try to free the buffer that is actually
>> + * on the stack without this flag.
>> + */
>> + max_optlen = sockopt_alloc_buf(ctx, max_optlen, cb_args->buf,
>> + !!sleepable_cnt);
>> + if (max_optlen < 0)
>> + return max_optlen;
>> +
>> + if (copy_from_user(ctx->optval, optval,
>> + min(ctx->optlen, max_optlen)) != 0)
>> + return -EFAULT;
>> +
>> + return 0;
>> }
>>
>> int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> @@ -1795,27 +1895,22 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> .level = *level,
>> .optname = *optname,
>> };
>> + struct filter_sockopt_cb_args cb_args = {
>> + .ctx = &ctx,
>> + .buf = &buf,
>> + };
>> int ret, max_optlen;
>>
>> - /* Allocate a bit more than the initial user buffer for
>> - * BPF program. The canonical use case is overriding
>> - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
>> - */
>> - max_optlen = max_t(int, 16, *optlen);
>> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>> - if (max_optlen < 0)
>> - return max_optlen;
>> -
>> + max_optlen = *optlen;
>> ctx.optlen = *optlen;
>> -
>> - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
>> - ret = -EFAULT;
>> - goto out;
>> - }
>> + ctx.optval = optval;
>> + ctx.optval_end = optval + *optlen;
>> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
>>
>> lock_sock(sk);
>> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
>> - &ctx, bpf_prog_run, 0, NULL);
>> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
>> + &ctx, bpf_prog_run, 0, NULL,
>> + filter_setsockopt_progs_cb, &cb_args);
>> release_sock(sk);
>>
>> if (ret)
>> @@ -1824,7 +1919,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> if (ctx.optlen == -1) {
>> /* optlen set to -1, bypass kernel */
>> ret = 1;
>> - } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
>> + } else if (ctx.optlen > (ctx.optval_end - ctx.optval) ||
>> + ctx.optlen < -1) {
>> /* optlen is out of bounds */
>> if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
>> pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
>> @@ -1846,6 +1942,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> */
>> if (ctx.optlen != 0) {
>> *optlen = ctx.optlen;
>> + if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
>> + return 0;
>> /* We've used bpf_sockopt_kern->buf as an intermediary
>> * storage, but the BPF program indicates that we need
>> * to pass this data to the kernel setsockopt handler.
>> @@ -1874,6 +1972,36 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>> return ret;
>> }
>>
>> +static int filter_getsockopt_progs_cb(void *arg,
>> + const struct bpf_prog_array_item *item)
>> +{
>> + struct filter_sockopt_cb_args *cb_args = arg;
>> + struct bpf_sockopt_kern *ctx = cb_args->ctx;
>> + int sleepable_cnt, non_sleepable_cnt;
>> + int max_optlen;
>> + char *optval;
>> +
>> + count_sleepable_prog_cg(item, &sleepable_cnt, &non_sleepable_cnt);
>> +
>> + if (!non_sleepable_cnt)
>> + return 0;
>> +
>> + optval = ctx->optval;
>> + max_optlen = sockopt_alloc_buf(ctx, cb_args->max_optlen,
>> + cb_args->buf, false);
>> + if (max_optlen < 0)
>> + return max_optlen;
>> +
>> + if (copy_from_user(ctx->optval, optval,
>> + min(ctx->optlen, max_optlen)) != 0)
>> + return -EFAULT;
>> +
>> + ctx->flags = 0;
>> + cb_args->max_optlen = max_optlen;
>> +
>> + return 0;
>> +}
>> +
>> int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> int optname, char __user *optval,
>> int __user *optlen, int max_optlen,
>> @@ -1887,15 +2015,16 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> .optname = optname,
>> .current_task = current,
>> };
>> + struct filter_sockopt_cb_args cb_args = {
>> + .ctx = &ctx,
>> + .buf = &buf,
>> + .max_optlen = max_optlen,
>> + };
>> int orig_optlen;
>> int ret;
>>
>> orig_optlen = max_optlen;
>> ctx.optlen = max_optlen;
>> - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>> - if (max_optlen < 0)
>> - return max_optlen;
>> -
>> if (!retval) {
>> /* If kernel getsockopt finished successfully,
>> * copy whatever was returned to the user back
>> @@ -1914,18 +2043,19 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> goto out;
>> }
>> orig_optlen = ctx.optlen;
>> -
>> - if (copy_from_user(ctx.optval, optval,
>> - min(ctx.optlen, max_optlen)) != 0) {
>> - ret = -EFAULT;
>> - goto out;
>> - }
>> }
>>
>> + ctx.optval = optval;
>> + ctx.optval_end = optval + max_optlen;
>> + ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
>> +
>> lock_sock(sk);
>> - ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
>> - &ctx, bpf_prog_run, retval, NULL);
>> + ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_GETSOCKOPT,
>> + &ctx, bpf_prog_run, retval, NULL,
>> + filter_getsockopt_progs_cb,
>> + &cb_args);
>> release_sock(sk);
>> + max_optlen = cb_args.max_optlen;
>>
>> if (ret < 0)
>> goto out;
>> @@ -1942,7 +2072,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>> }
>>
>> if (ctx.optlen != 0) {
>> - if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
>> + if (optval &&
>> + !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
>> + copy_to_user(optval, ctx.optval, ctx.optlen)) {
>> ret = -EFAULT;
>> goto out;
>> }
>> @@ -2388,6 +2520,10 @@ static bool cg_sockopt_is_valid_access(int off, int size,
>> if (size != size_default)
>> return false;
>> return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
>> + case offsetof(struct bpf_sockopt, flags):
>> + if (size != sizeof(__u32))
>> + return false;
>> + break;
>> default:
>> if (size != size_default)
>> return false;
>> @@ -2481,6 +2617,9 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
>> case offsetof(struct bpf_sockopt, optval_end):
>> *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
>> break;
>> + case offsetof(struct bpf_sockopt, flags):
>> + *insn++ = CG_SOCKOPT_READ_FIELD(flags);
>> + break;
>> }
>>
>> return insn - insn_buf;
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 132f25dab931..fbc0096693e7 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -19538,9 +19538,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
>> return false;
>> }
>> }
>> +
>
> Extra newline?
>
>> return prog->type == BPF_PROG_TYPE_LSM ||
>> prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
>> - prog->type == BPF_PROG_TYPE_STRUCT_OPS;
>> + prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
>> + prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
>> }
>>
>> static int check_attach_btf_id(struct bpf_verifier_env *env)
>> @@ -19562,7 +19564,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
>> }
>>
>> if (prog->aux->sleepable && !can_be_sleepable(prog)) {
>> - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
>> + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe,"
>> + "cgroup, and struct_ops programs can be sleepable\n");
>
> I don't think we split the lines even if they are too long.
> Makes them ungreppable :-(
Got it!
>
>> return -EINVAL;
>> }
>>
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 70da85200695..fff6f7dff408 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -7138,6 +7138,13 @@ struct bpf_sockopt {
>> __s32 optname;
>> __s32 optlen;
>> __s32 retval;
>> +
>> + __u32 flags;
>> +};
>> +
>> +enum bpf_sockopt_flags {
>> + /* optval is a pointer to user space memory */
>> + BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
>> };
>
> Why do we need to export the flag? Do we still want the users
> to do something based on the presence/absence of
> BPF_SOCKOPT_FLAG_OPTVAL_USER?
I will remove this flag from the view of BPF programs.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval.
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt thinker.li
@ 2023-08-11 4:31 ` thinker.li
2023-08-11 6:27 ` Yonghong Song
2023-08-11 4:31 ` [RFC bpf-next v2 3/6] bpf: rename bpf_copy_to_user() thinker.li
` (3 subsequent siblings)
5 siblings, 1 reply; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <kuifeng@meta.com>
Since the buffer pointed by ctx->user_optval is in user space, BPF programs
in kernel space should not access it directly. They should use
bpf_copy_from_user() and bpf_copy_to_user() to move data between user and
kernel space.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
kernel/bpf/cgroup.c | 16 +++++++++--
kernel/bpf/verifier.c | 66 +++++++++++++++++++++----------------------
2 files changed, 47 insertions(+), 35 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 59489d9619a3..5bf3115b265c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2509,12 +2509,24 @@ static bool cg_sockopt_is_valid_access(int off, int size,
case offsetof(struct bpf_sockopt, optval):
if (size != sizeof(__u64))
return false;
- info->reg_type = PTR_TO_PACKET;
+ if (prog->aux->sleepable)
+ /* Prohibit access to the memory pointed by optval
+ * in sleepable programs.
+ */
+ info->reg_type = PTR_TO_PACKET | MEM_USER;
+ else
+ info->reg_type = PTR_TO_PACKET;
break;
case offsetof(struct bpf_sockopt, optval_end):
if (size != sizeof(__u64))
return false;
- info->reg_type = PTR_TO_PACKET_END;
+ if (prog->aux->sleepable)
+ /* Prohibit access to the memory pointed by
+ * optval_end in sleepable programs.
+ */
+ info->reg_type = PTR_TO_PACKET_END | MEM_USER;
+ else
+ info->reg_type = PTR_TO_PACKET_END;
break;
case offsetof(struct bpf_sockopt, retval):
if (size != size_default)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbc0096693e7..ca27be76207a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13364,7 +13364,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->type == type && reg->id == dst_reg->id)
+ if (base_type(reg->type) == type && reg->id == dst_reg->id)
/* keep the maximum range already checked */
reg->range = max(reg->range, new_range);
}));
@@ -13917,84 +13917,84 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
switch (BPF_OP(insn->code)) {
case BPF_JGT:
- if ((dst_reg->type == PTR_TO_PACKET &&
- src_reg->type == PTR_TO_PACKET_END) ||
- (dst_reg->type == PTR_TO_PACKET_META &&
+ if ((base_type(dst_reg->type) == PTR_TO_PACKET &&
+ base_type(src_reg->type) == PTR_TO_PACKET_END) ||
+ (base_type(dst_reg->type) == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
find_good_pkt_pointers(this_branch, dst_reg,
- dst_reg->type, false);
+ base_type(dst_reg->type), false);
mark_pkt_end(other_branch, insn->dst_reg, true);
- } else if ((dst_reg->type == PTR_TO_PACKET_END &&
- src_reg->type == PTR_TO_PACKET) ||
+ } else if ((base_type(dst_reg->type) == PTR_TO_PACKET_END &&
+ base_type(src_reg->type) == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
- src_reg->type == PTR_TO_PACKET_META)) {
+ base_type(src_reg->type) == PTR_TO_PACKET_META)) {
/* pkt_end > pkt_data', pkt_data > pkt_meta' */
find_good_pkt_pointers(other_branch, src_reg,
- src_reg->type, true);
+ base_type(src_reg->type), true);
mark_pkt_end(this_branch, insn->src_reg, false);
} else {
return false;
}
break;
case BPF_JLT:
- if ((dst_reg->type == PTR_TO_PACKET &&
- src_reg->type == PTR_TO_PACKET_END) ||
- (dst_reg->type == PTR_TO_PACKET_META &&
+ if ((base_type(dst_reg->type) == PTR_TO_PACKET &&
+ base_type(src_reg->type) == PTR_TO_PACKET_END) ||
+ (base_type(dst_reg->type) == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
find_good_pkt_pointers(other_branch, dst_reg,
- dst_reg->type, true);
+ base_type(dst_reg->type), true);
mark_pkt_end(this_branch, insn->dst_reg, false);
- } else if ((dst_reg->type == PTR_TO_PACKET_END &&
- src_reg->type == PTR_TO_PACKET) ||
+ } else if ((base_type(dst_reg->type) == PTR_TO_PACKET_END &&
+ base_type(src_reg->type) == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
- src_reg->type == PTR_TO_PACKET_META)) {
+ base_type(src_reg->type) == PTR_TO_PACKET_META)) {
/* pkt_end < pkt_data', pkt_data > pkt_meta' */
find_good_pkt_pointers(this_branch, src_reg,
- src_reg->type, false);
+ base_type(src_reg->type), false);
mark_pkt_end(other_branch, insn->src_reg, true);
} else {
return false;
}
break;
case BPF_JGE:
- if ((dst_reg->type == PTR_TO_PACKET &&
- src_reg->type == PTR_TO_PACKET_END) ||
- (dst_reg->type == PTR_TO_PACKET_META &&
+ if ((base_type(dst_reg->type) == PTR_TO_PACKET &&
+ base_type(src_reg->type) == PTR_TO_PACKET_END) ||
+ (base_type(dst_reg->type) == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
find_good_pkt_pointers(this_branch, dst_reg,
- dst_reg->type, true);
+ base_type(dst_reg->type), true);
mark_pkt_end(other_branch, insn->dst_reg, false);
- } else if ((dst_reg->type == PTR_TO_PACKET_END &&
- src_reg->type == PTR_TO_PACKET) ||
+ } else if ((base_type(dst_reg->type) == PTR_TO_PACKET_END &&
+ base_type(src_reg->type) == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
- src_reg->type == PTR_TO_PACKET_META)) {
+ base_type(src_reg->type) == PTR_TO_PACKET_META)) {
/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
find_good_pkt_pointers(other_branch, src_reg,
- src_reg->type, false);
+ base_type(src_reg->type), false);
mark_pkt_end(this_branch, insn->src_reg, true);
} else {
return false;
}
break;
case BPF_JLE:
- if ((dst_reg->type == PTR_TO_PACKET &&
- src_reg->type == PTR_TO_PACKET_END) ||
- (dst_reg->type == PTR_TO_PACKET_META &&
+ if ((base_type(dst_reg->type) == PTR_TO_PACKET &&
+ base_type(src_reg->type) == PTR_TO_PACKET_END) ||
+ (base_type(dst_reg->type) == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
find_good_pkt_pointers(other_branch, dst_reg,
- dst_reg->type, false);
+ base_type(dst_reg->type), false);
mark_pkt_end(this_branch, insn->dst_reg, true);
- } else if ((dst_reg->type == PTR_TO_PACKET_END &&
- src_reg->type == PTR_TO_PACKET) ||
+ } else if ((base_type(dst_reg->type) == PTR_TO_PACKET_END &&
+ base_type(src_reg->type) == PTR_TO_PACKET) ||
(reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
- src_reg->type == PTR_TO_PACKET_META)) {
+ base_type(src_reg->type) == PTR_TO_PACKET_META)) {
/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
find_good_pkt_pointers(this_branch, src_reg,
- src_reg->type, true);
+ base_type(src_reg->type), true);
mark_pkt_end(other_branch, insn->src_reg, false);
} else {
return false;
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval.
2023-08-11 4:31 ` [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval thinker.li
@ 2023-08-11 6:27 ` Yonghong Song
2023-08-11 16:01 ` Kui-Feng Lee
0 siblings, 1 reply; 18+ messages in thread
From: Yonghong Song @ 2023-08-11 6:27 UTC (permalink / raw)
To: thinker.li, bpf, ast, martin.lau, song, kernel-team, andrii, sdf
Cc: sinquersw, kuifeng
On 8/10/23 9:31 PM, thinker.li@gmail.com wrote:
> From: Kui-Feng Lee <kuifeng@meta.com>
>
> Since the buffer pointed by ctx->user_optval is in user space, BPF programs
> in kernel space should not access it directly. They should use
> bpf_copy_from_user() and bpf_copy_to_user() to move data between user and
> kernel space.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
You probably only want
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
since it matches email 'From'. Also,
From: Kui-Feng Lee <kuifeng@meta.com>
is different from email 'From'. Strange.
[...]
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval.
2023-08-11 6:27 ` Yonghong Song
@ 2023-08-11 16:01 ` Kui-Feng Lee
0 siblings, 0 replies; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-11 16:01 UTC (permalink / raw)
To: yonghong.song, thinker.li, bpf, ast, martin.lau, song,
kernel-team, andrii, sdf
Cc: kuifeng
On 8/10/23 23:27, Yonghong Song wrote:
>
>
> On 8/10/23 9:31 PM, thinker.li@gmail.com wrote:
>> From: Kui-Feng Lee <kuifeng@meta.com>
>>
>> Since the buffer pointed by ctx->user_optval is in user space, BPF
>> programs
>> in kernel space should not access it directly. They should use
>> bpf_copy_from_user() and bpf_copy_to_user() to move data between user and
>> kernel space.
>>
>> Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>
> You probably only want
> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> since it matches email 'From'. Also,
> From: Kui-Feng Lee <kuifeng@meta.com>
> is different from email 'From'. Strange.
Got it!
>
> [...]
^ permalink raw reply [flat|nested] 18+ messages in thread
* [RFC bpf-next v2 3/6] bpf: rename bpf_copy_to_user().
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 2/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval thinker.li
@ 2023-08-11 4:31 ` thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() thinker.li
` (2 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <thinker.li@gmail.com>
Rename bpf_copy_to_user() to avoid a conflict with the BPF kfunc.
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
kernel/bpf/syscall.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7f4e8c357a6a..81b6e60cc030 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3297,8 +3297,8 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
raw_tp_link->btp->tp->name);
}
-static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
- u32 len)
+static int _bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
{
if (ulen >= len + 1) {
if (copy_to_user(ubuf, buf, len + 1))
@@ -3334,7 +3334,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
if (!ubuf)
return 0;
- return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
+ return _bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
}
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
@@ -3388,7 +3388,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
if (buf) {
len = strlen(buf);
- err = bpf_copy_to_user(uname, buf, ulen, len);
+ err = _bpf_copy_to_user(uname, buf, ulen, len);
if (err)
return err;
} else {
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread* [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
` (2 preceding siblings ...)
2023-08-11 4:31 ` [RFC bpf-next v2 3/6] bpf: rename bpf_copy_to_user() thinker.li
@ 2023-08-11 4:31 ` thinker.li
2023-08-11 23:05 ` Stanislav Fomichev
2023-08-11 4:31 ` [RFC bpf-next v2 5/6] bpf: Add a new dynptr type for CGRUP_SOCKOPT thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 6/6] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type thinker.li
5 siblings, 1 reply; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <kuifeng@meta.com>
Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new kfunc to
copy data from a kernel space buffer to a user space buffer. They are only
available for sleepable BPF programs. bpf_copy_to_user() is only available
to the BPF programs attached to cgroup/getsockopt.
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
kernel/bpf/cgroup.c | 6 ++++++
kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5bf3115b265c..c15a72860d2a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
+
+ case BPF_FUNC_copy_from_user:
+ if (prog->aux->sleepable)
+ return &bpf_copy_from_user_proto;
+ return NULL;
+
default:
return bpf_base_func_proto(func_id);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..ff240db1512c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -669,6 +669,26 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
.arg3_type = ARG_ANYTHING,
};
+/**
+ * int bpf_copy_to_user(void *dst, u32 size, const void *kern_ptr)
+ * Description
+ * Read *size* bytes from kernel space address *kern_ptr* and
+ * store the data in user space address *dst*. This is a
+ * wrapper of **copy_to_user**\ ().
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ */
+__bpf_kfunc int bpf_copy_to_user(void *dst__uninit, u32 dst__sz,
+ const void *src__ign)
+{
+ int ret = copy_to_user(dst__uninit, src__ign, dst__sz);
+
+ if (unlikely(ret))
+ return -EFAULT;
+
+ return ret;
+}
+
BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
{
@@ -2456,6 +2476,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_copy_to_user, KF_SLEEPABLE)
BTF_SET8_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -2494,6 +2515,15 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
.set = &common_btf_ids,
};
+BTF_SET8_START(cgroup_common_btf_ids)
+BTF_ID_FLAGS(func, bpf_copy_to_user, KF_SLEEPABLE)
+BTF_SET8_END(cgroup_common_btf_ids)
+
+static const struct btf_kfunc_id_set cgroup_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cgroup_common_btf_ids,
+};
+
static int __init kfunc_init(void)
{
int ret;
@@ -2513,6 +2543,7 @@ static int __init kfunc_init(void)
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCKOPT, &cgroup_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-11 4:31 ` [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() thinker.li
@ 2023-08-11 23:05 ` Stanislav Fomichev
2023-08-11 23:27 ` Kui-Feng Lee
0 siblings, 1 reply; 18+ messages in thread
From: Stanislav Fomichev @ 2023-08-11 23:05 UTC (permalink / raw)
To: thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
sinquersw, kuifeng
On 08/10, thinker.li@gmail.com wrote:
> From: Kui-Feng Lee <kuifeng@meta.com>
>
> Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
> attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new kfunc to
> copy data from an kernel space buffer to a user space buffer. They are only
> available for sleepable BPF programs. bpf_copy_to_user() is only available
> to the BPF programs attached to cgroup/getsockopt.
>
> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> ---
> kernel/bpf/cgroup.c | 6 ++++++
> kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
> 2 files changed, 37 insertions(+)
>
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 5bf3115b265c..c15a72860d2a 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> #endif
> case BPF_FUNC_perf_event_output:
> return &bpf_event_output_data_proto;
> +
> + case BPF_FUNC_copy_from_user:
> + if (prog->aux->sleepable)
> + return &bpf_copy_from_user_proto;
> + return NULL;
If we just allow copy to/from, I'm not sure I understand how the buffer
sharing between sleepable/non-sleepable works.
Let's assume I have two progs in the chain:
1. non-sleepable - copies the buffer, does some modifications; since
we don't copy the buffer back after every prog run, the modifications
stay in the kernel buffer
2. sleepable - runs and just gets the user pointer? does it mean this
sleepable program doesn't see the changes from (1)?
IOW, do we need some custom sockopt copy_to/from that handle this
potential buffer location transparently or am I missing something?
Assuming we want to support this at all. If we do, might deserve a
selftest.
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-11 23:05 ` Stanislav Fomichev
@ 2023-08-11 23:27 ` Kui-Feng Lee
2023-08-11 23:31 ` Kui-Feng Lee
0 siblings, 1 reply; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-11 23:27 UTC (permalink / raw)
To: Stanislav Fomichev, thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
kuifeng
On 8/11/23 16:05, Stanislav Fomichev wrote:
> On 08/10, thinker.li@gmail.com wrote:
>> From: Kui-Feng Lee <kuifeng@meta.com>
>>
>> Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
>> attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new kfunc to
>> copy data from an kernel space buffer to a user space buffer. They are only
>> available for sleepable BPF programs. bpf_copy_to_user() is only available
>> to the BPF programs attached to cgroup/getsockopt.
>>
>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>> ---
>> kernel/bpf/cgroup.c | 6 ++++++
>> kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
>> 2 files changed, 37 insertions(+)
>>
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 5bf3115b265c..c15a72860d2a 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>> #endif
>> case BPF_FUNC_perf_event_output:
>> return &bpf_event_output_data_proto;
>> +
>> + case BPF_FUNC_copy_from_user:
>> + if (prog->aux->sleepable)
>> + return &bpf_copy_from_user_proto;
>> + return NULL;
>
> If we just allow copy to/from, I'm not sure I understand how the buffer
> sharing between sleepable/non-sleepable works.
>
> Let's assume I have two progs in the chain:
> 1. non-sleepable - copies the buffer, does some modifications; since
> we don't copy the buffer back after every prog run, the modifications
> stay in the kernel buffer
> 2. sleepable - runs and just gets the user pointer? does it mean this
> sleepable program doesn't see the changes from (1)?
>
> IOW, do we need some custom sockopt copy_to/from that handle this
> potential buffer location transparently or am I missing something?
>
> Assuming we want to support this at all. If we do, might deserve a
> selftest.
It is why BPF_SOCKOPT_FLAG_OPTVAL_USER is there.
It helps programs to make a right decision.
However, I am going to remove bpf_copy_from_user()
since we have bpf_so_optval_copy_to() and bpf_so_optval_copy_to_r().
Does it make sense to you?
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-11 23:27 ` Kui-Feng Lee
@ 2023-08-11 23:31 ` Kui-Feng Lee
2023-08-14 17:07 ` Stanislav Fomichev
0 siblings, 1 reply; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-11 23:31 UTC (permalink / raw)
To: Stanislav Fomichev, thinker.li
Cc: bpf, ast, martin.lau, song, kernel-team, andrii, yonghong.song,
kuifeng
On 8/11/23 16:27, Kui-Feng Lee wrote:
>
>
> On 8/11/23 16:05, Stanislav Fomichev wrote:
>> On 08/10, thinker.li@gmail.com wrote:
>>> From: Kui-Feng Lee <kuifeng@meta.com>
>>>
>>> Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
>>> attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new
>>> kfunc to
>>> copy data from an kernel space buffer to a user space buffer. They
>>> are only
>>> available for sleepable BPF programs. bpf_copy_to_user() is only
>>> available
>>> to the BPF programs attached to cgroup/getsockopt.
>>>
>>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>>> ---
>>> kernel/bpf/cgroup.c | 6 ++++++
>>> kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
>>> 2 files changed, 37 insertions(+)
>>>
>>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>>> index 5bf3115b265c..c15a72860d2a 100644
>>> --- a/kernel/bpf/cgroup.c
>>> +++ b/kernel/bpf/cgroup.c
>>> @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id
>>> func_id, const struct bpf_prog *prog)
>>> #endif
>>> case BPF_FUNC_perf_event_output:
>>> return &bpf_event_output_data_proto;
>>> +
>>> + case BPF_FUNC_copy_from_user:
>>> + if (prog->aux->sleepable)
>>> + return &bpf_copy_from_user_proto;
>>> + return NULL;
>>
>> If we just allow copy to/from, I'm not sure I understand how the buffer
>> sharing between sleepable/non-sleepable works.
>>
>> Let's assume I have two progs in the chain:
>> 1. non-sleepable - copies the buffer, does some modifications; since
>> we don't copy the buffer back after every prog run, the modifications
>> stay in the kernel buffer
>> 2. sleepable - runs and just gets the user pointer? does it mean this
>> sleepable program doesn't see the changes from (1)?
It is still visible from sleepable programs. Sleepable programs
will receive a pointer to the buffer in the kernel.
And, BPF_SOCKOPT_FLAG_OPTVAL_USER is clear.
>>
>> IOW, do we need some custom sockopt copy_to/from that handle this
>> potential buffer location transparently or am I missing something?
>>
>> Assuming we want to support this at all. If we do, might deserve a
>> selftest.
>
> It is why BPF_SOCKOPT_FLAG_OPTVAL_USER is there.
> It helps programs to make a right decision.
> However, I am going to remove bpf_copy_from_user()
> since we have bpf_so_optval_copy_to() and bpf_so_optval_copy_to_r().
> Does it make sense to you?
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-11 23:31 ` Kui-Feng Lee
@ 2023-08-14 17:07 ` Stanislav Fomichev
2023-08-14 19:20 ` Kui-Feng Lee
0 siblings, 1 reply; 18+ messages in thread
From: Stanislav Fomichev @ 2023-08-14 17:07 UTC (permalink / raw)
To: Kui-Feng Lee
Cc: thinker.li, bpf, ast, martin.lau, song, kernel-team, andrii,
yonghong.song, kuifeng
On 08/11, Kui-Feng Lee wrote:
>
>
> On 8/11/23 16:27, Kui-Feng Lee wrote:
> >
> >
> > On 8/11/23 16:05, Stanislav Fomichev wrote:
> > > On 08/10, thinker.li@gmail.com wrote:
> > > > From: Kui-Feng Lee <kuifeng@meta.com>
> > > >
> > > > Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
> > > > attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new
> > > > kfunc to
> > > > copy data from an kernel space buffer to a user space buffer.
> > > > They are only
> > > > available for sleepable BPF programs. bpf_copy_to_user() is only
> > > > available
> > > > to the BPF programs attached to cgroup/getsockopt.
> > > >
> > > > Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> > > > ---
> > > > kernel/bpf/cgroup.c | 6 ++++++
> > > > kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
> > > > 2 files changed, 37 insertions(+)
> > > >
> > > > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > > > index 5bf3115b265c..c15a72860d2a 100644
> > > > --- a/kernel/bpf/cgroup.c
> > > > +++ b/kernel/bpf/cgroup.c
> > > > @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id
> > > > func_id, const struct bpf_prog *prog)
> > > > #endif
> > > > case BPF_FUNC_perf_event_output:
> > > > return &bpf_event_output_data_proto;
> > > > +
> > > > + case BPF_FUNC_copy_from_user:
> > > > + if (prog->aux->sleepable)
> > > > + return &bpf_copy_from_user_proto;
> > > > + return NULL;
> > >
> > > If we just allow copy to/from, I'm not sure I understand how the buffer
> > > sharing between sleepable/non-sleepable works.
> > >
> > > Let's assume I have two progs in the chain:
> > > 1. non-sleepable - copies the buffer, does some modifications; since
> > > we don't copy the buffer back after every prog run, the modifications
> > > stay in the kernel buffer
> > > 2. sleepable - runs and just gets the user pointer? does it mean this
> > > sleepable program doesn't see the changes from (1)?
>
> It is still visible from sleepable programs. Sleepable programs
> will receive a pointer to the buffer in the kernel.
> And, BPF_SOCKOPT_FLAG_OPTVAL_USER is clear.
>
> > >
> > > IOW, do we need some custom sockopt copy_to/from that handle this
> > > potential buffer location transparently or am I missing something?
> > >
> > > Assuming we want to support this at all. If we do, might deserve a
> > > selftest.
> >
> > It is why BPF_SOCKOPT_FLAG_OPTVAL_USER is there.
> > It helps programs to make a right decision.
> > However, I am going to remove bpf_copy_from_user()
> > since we have bpf_so_optval_copy_to() and bpf_so_optval_copy_to_r().
> > Does it make sense to you?
Ah, so that's where it's handled. I didn't read that far :-)
In this case yes, let's have only those helpers.
Btw, do we also really need bpf_so_optval_copy_to_r? If we are doing
dynptr, let's only have bpf_so_optval_copy_to version?
I'd also call them something like bpf_sockopt_copy_{to,from}. That
"_so_optval_" is not super intuitive imho.
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-14 17:07 ` Stanislav Fomichev
@ 2023-08-14 19:20 ` Kui-Feng Lee
2023-08-14 20:16 ` Stanislav Fomichev
0 siblings, 1 reply; 18+ messages in thread
From: Kui-Feng Lee @ 2023-08-14 19:20 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: thinker.li, bpf, ast, martin.lau, song, kernel-team, andrii,
yonghong.song, kuifeng
On 8/14/23 10:07, Stanislav Fomichev wrote:
> On 08/11, Kui-Feng Lee wrote:
>>
>>
>> On 8/11/23 16:27, Kui-Feng Lee wrote:
>>>
>>>
>>> On 8/11/23 16:05, Stanislav Fomichev wrote:
>>>> On 08/10, thinker.li@gmail.com wrote:
>>>>> From: Kui-Feng Lee <kuifeng@meta.com>
>>>>>
>>>>> Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
>>>>> attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new
>>>>> kfunc to
>>>>> copy data from an kernel space buffer to a user space buffer.
>>>>> They are only
>>>>> available for sleepable BPF programs. bpf_copy_to_user() is only
>>>>> available
>>>>> to the BPF programs attached to cgroup/getsockopt.
>>>>>
>>>>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
>>>>> ---
>>>>> kernel/bpf/cgroup.c | 6 ++++++
>>>>> kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
>>>>> 2 files changed, 37 insertions(+)
>>>>>
>>>>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>>>>> index 5bf3115b265c..c15a72860d2a 100644
>>>>> --- a/kernel/bpf/cgroup.c
>>>>> +++ b/kernel/bpf/cgroup.c
>>>>> @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id
>>>>> func_id, const struct bpf_prog *prog)
>>>>> #endif
>>>>> case BPF_FUNC_perf_event_output:
>>>>> return &bpf_event_output_data_proto;
>>>>> +
>>>>> + case BPF_FUNC_copy_from_user:
>>>>> + if (prog->aux->sleepable)
>>>>> + return &bpf_copy_from_user_proto;
>>>>> + return NULL;
>>>>
>>>> If we just allow copy to/from, I'm not sure I understand how the buffer
>>>> sharing between sleepable/non-sleepable works.
>>>>
>>>> Let's assume I have two progs in the chain:
>>>> 1. non-sleepable - copies the buffer, does some modifications; since
>>>> we don't copy the buffer back after every prog run, the modifications
>>>> stay in the kernel buffer
>>>> 2. sleepable - runs and just gets the user pointer? does it mean this
>>>> sleepable program doesn't see the changes from (1)?
>>
>> It is still visible from sleepable programs. Sleepable programs
>> will receive a pointer to the buffer in the kernel.
>> And, BPF_SOCKOPT_FLAG_OPTVAL_USER is clear.
>>
>>>>
>>>> IOW, do we need some custom sockopt copy_to/from that handle this
>>>> potential buffer location transparently or am I missing something?
>>>>
>>>> Assuming we want to support this at all. If we do, might deserve a
>>>> selftest.
>>>
>>> It is why BPF_SOCKOPT_FLAG_OPTVAL_USER is there.
>>> It helps programs to make a right decision.
>>> However, I am going to remove bpf_copy_from_user()
>>> since we have bpf_so_optval_copy_to() and bpf_so_optval_copy_to_r().
>>> Does it make sense to you?
>
> Ah, so that's where it's handled. I didn't read that far :-)
> In this case yes, let's have only those helpers.
>
> Btw, do we also really need bpf_so_optval_copy_to_r? If we are doing
> dynptr, let's only have bpf_so_optval_copy_to version?
Don't you think bpf_so_optval_copy_to_r() is handy to copy
a simple string to the user space?
>
> I'd also call them something like bpf_sockopt_copy_{to,from}. That
> "_so_optval_" is not super intuitive imho.
Agree!
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user().
2023-08-14 19:20 ` Kui-Feng Lee
@ 2023-08-14 20:16 ` Stanislav Fomichev
0 siblings, 0 replies; 18+ messages in thread
From: Stanislav Fomichev @ 2023-08-14 20:16 UTC (permalink / raw)
To: Kui-Feng Lee
Cc: thinker.li, bpf, ast, martin.lau, song, kernel-team, andrii,
yonghong.song, kuifeng
On Mon, Aug 14, 2023 at 12:20 PM Kui-Feng Lee <sinquersw@gmail.com> wrote:
>
>
>
> On 8/14/23 10:07, Stanislav Fomichev wrote:
> > On 08/11, Kui-Feng Lee wrote:
> >>
> >>
> >> On 8/11/23 16:27, Kui-Feng Lee wrote:
> >>>
> >>>
> >>> On 8/11/23 16:05, Stanislav Fomichev wrote:
> >>>> On 08/10, thinker.li@gmail.com wrote:
> >>>>> From: Kui-Feng Lee <kuifeng@meta.com>
> >>>>>
> >>>>> Provide bpf_copy_from_user() and bpf_copy_to_user() to the BPF programs
> >>>>> attached to cgroup/{set,get}sockopt. bpf_copy_to_user() is a new
> >>>>> kfunc to
> >>>>> copy data from an kernel space buffer to a user space buffer.
> >>>>> They are only
> >>>>> available for sleepable BPF programs. bpf_copy_to_user() is only
> >>>>> available
> >>>>> to the BPF programs attached to cgroup/getsockopt.
> >>>>>
> >>>>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
> >>>>> ---
> >>>>> kernel/bpf/cgroup.c | 6 ++++++
> >>>>> kernel/bpf/helpers.c | 31 +++++++++++++++++++++++++++++++
> >>>>> 2 files changed, 37 insertions(+)
> >>>>>
> >>>>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> >>>>> index 5bf3115b265c..c15a72860d2a 100644
> >>>>> --- a/kernel/bpf/cgroup.c
> >>>>> +++ b/kernel/bpf/cgroup.c
> >>>>> @@ -2461,6 +2461,12 @@ cg_sockopt_func_proto(enum bpf_func_id
> >>>>> func_id, const struct bpf_prog *prog)
> >>>>> #endif
> >>>>> case BPF_FUNC_perf_event_output:
> >>>>> return &bpf_event_output_data_proto;
> >>>>> +
> >>>>> + case BPF_FUNC_copy_from_user:
> >>>>> + if (prog->aux->sleepable)
> >>>>> + return &bpf_copy_from_user_proto;
> >>>>> + return NULL;
> >>>>
> >>>> If we just allow copy to/from, I'm not sure I understand how the buffer
> >>>> sharing between sleepable/non-sleepable works.
> >>>>
> >>>> Let's assume I have two progs in the chain:
> >>>> 1. non-sleepable - copies the buffer, does some modifications; since
> >>>> we don't copy the buffer back after every prog run, the modifications
> >>>> stay in the kernel buffer
> >>>> 2. sleepable - runs and just gets the user pointer? does it mean this
> >>>> sleepable program doesn't see the changes from (1)?
> >>
> >> It is still visible from sleepable programs. Sleepable programs
> >> will receive a pointer to the buffer in the kernel.
> >> And, BPF_SOCKOPT_FLAG_OPTVAL_USER is clear.
> >>
> >>>>
> >>>> IOW, do we need some custom sockopt copy_to/from that handle this
> >>>> potential buffer location transparently or am I missing something?
> >>>>
> >>>> Assuming we want to support this at all. If we do, might deserve a
> >>>> selftest.
> >>>
> >>> It is why BPF_SOCKOPT_FLAG_OPTVAL_USER is there.
> >>> It helps programs to make a right decision.
> >>> However, I am going to remove bpf_copy_from_user()
> >>> since we have bpf_so_optval_copy_to() and bpf_so_optval_copy_to_r().
> >>> Does it make sense to you?
> >
> > Ah, so that's where it's handled. I didn't read that far :-)
> > In this case yes, let's have only those helpers.
> >
> > Btw, do we also really need bpf_so_optval_copy_to_r? If we are doing
> > dynptr, let's only have bpf_so_optval_copy_to version?
>
> Don't you think bpf_so_optval_copy_to_r() is handy to copy
> a simple string to the user space?
Yeah, it is convenient, but it feels like something we can avoid if we
are using dynptrs (exclusively)?
Can the programs have a simple wrapper around
bpf_so_optval_from+bpf_so_optval_copy_to to provide the same ptr-based
api?
static inline void bpf_so_optval_copy_to_r(sopt, ptr, sz) {
bpf_so_optval_alloc(sopt, sz, &dynptr);
some_dynptr_copy_to(&dynptr, ptr, sz);
bpf_so_optval_copy_to(sopt, &dynptr);
bpf_so_optval_release(&dynptr);
}
Or maybe we should instead have a new kfunc that turns sopt ptr+sz
into a dynptr?
static inline void bpf_so_optval_copy_to_r(sopt, ptr, sz) {
bpf_so_optval_from_user_ptr(sopt, &dynptr);
bpf_so_optval_copy_to(sopt, &dynptr);
bpf_so_optval_release(&dynptr);
}
That probably fits better into dynptr world?
^ permalink raw reply [flat|nested] 18+ messages in thread
* [RFC bpf-next v2 5/6] bpf: Add a new dynptr type for CGRUP_SOCKOPT.
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
` (3 preceding siblings ...)
2023-08-11 4:31 ` [RFC bpf-next v2 4/6] bpf: Provide bpf_copy_from_user() and bpf_copy_to_user() thinker.li
@ 2023-08-11 4:31 ` thinker.li
2023-08-11 4:31 ` [RFC bpf-next v2 6/6] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type thinker.li
5 siblings, 0 replies; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <kuifeng@meta.com>
The new dynptr type (BPF_DYNPTR_TYPE_CGROUP_SOCKOPT) will be used by BPF
programs to create a buffer that can be installed on ctx to replace
existing optval or user_optval.
Installation is only allowed if ctx->flags &
BPF_SOCKOPT_FLAG_OPTVAL_REPLACE is true. It is enabled only for sleepable
programs on the cgroup/setsockopt hook. BPF programs can install a new
buffer held by a dynptr to increase the size of optval passed to
setsockopt().
Installation is not enabled for cgroup/getsockopt since you cannot
increase a buffer created by the user program to return data from
getsockopt().
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
include/linux/bpf.h | 7 +-
include/uapi/linux/bpf.h | 4 +
kernel/bpf/btf.c | 3 +
kernel/bpf/cgroup.c | 5 +-
kernel/bpf/helpers.c | 202 +++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 47 +++++++-
tools/include/uapi/linux/bpf.h | 4 +
7 files changed, 268 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index abe75063630b..618ca061f319 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -663,12 +663,15 @@ enum bpf_type_flag {
/* DYNPTR points to xdp_buff */
DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+ /* DYNPTR points to optval buffer of bpf_sockopt */
+ DYNPTR_TYPE_CGROUP_SOCKOPT = BIT(17 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
- | DYNPTR_TYPE_XDP)
+ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_CGROUP_SOCKOPT)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -1206,6 +1209,8 @@ enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_SKB,
/* Underlying data is a xdp_buff */
BPF_DYNPTR_TYPE_XDP,
+ /* Underlying data is for the optval of a cgroup sock */
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT,
};
int bpf_dynptr_check_size(u32 size);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fff6f7dff408..c648a7a2b985 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7145,6 +7145,10 @@ struct bpf_sockopt {
enum bpf_sockopt_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* able to install new optval */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is referenced by a dynptr */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
struct bpf_pidns_info {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 249657c466dd..6d6a040688be 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_CGROUP_SOCKOPT,
BTF_KFUNC_HOOK_MAX,
};
@@ -7846,6 +7847,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ return BTF_KFUNC_HOOK_CGROUP_SOCKOPT;
default:
return BTF_KFUNC_HOOK_MAX;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c15a72860d2a..196391c6716a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1877,6 +1877,8 @@ static int filter_setsockopt_progs_cb(void *arg,
if (max_optlen < 0)
return max_optlen;
+ ctx->flags = BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
+
if (copy_from_user(ctx->optval, optval,
min(ctx->optlen, max_optlen)) != 0)
return -EFAULT;
@@ -1905,7 +1907,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ctx.optlen = *optlen;
ctx.optval = optval;
ctx.optval_end = optval + *optlen;
- ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
lock_sock(sk);
ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ff240db1512c..981dde97460b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1557,6 +1557,7 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
/* Source and destination may possibly overlap, hence use memmove to
* copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
* pointing to overlapping PTR_TO_MAP_VALUE regions.
@@ -1602,6 +1603,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
if (flags)
return -EINVAL;
/* Source and destination may possibly overlap, hence use memmove to
@@ -1654,6 +1656,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
@@ -2281,6 +2284,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
if (buffer__opt)
@@ -2449,6 +2453,198 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+/* Create a buffer of the given size for a {set,get}sockopt BPF filter.
+ *
+ * This kfunc is only available for sleepable contexts. The dynptr should be
+ * released by bpf_so_optval_install() or bpf_so_optval_release().
+ */
+__bpf_kfunc int bpf_so_optval_alloc(struct bpf_sockopt *sopt, int size,
+ struct bpf_dynptr_kern *ptr__uninit)
+{
+ void *optval;
+ int err;
+
+ bpf_dynptr_set_null(ptr__uninit);
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ return err;
+
+ optval = kzalloc(size, GFP_KERNEL);
+ if (!optval)
+ return -ENOMEM;
+
+ bpf_dynptr_init(ptr__uninit, optval,
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0, size);
+
+ return size;
+}
+
+/* Install the buffer of the dynptr into the sockopt context.
+ *
+ * This kfunc is only available for sleepable contexts. The dynptr should be
+ * allocated by bpf_so_optval_alloc(). The dynptr is invalid after
+ * returning from this function successfully.
+ */
+__bpf_kfunc int bpf_so_optval_install(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+
+ if (!(sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_REPLACE) ||
+ bpf_dynptr_get_type(ptr) != BPF_DYNPTR_TYPE_CGROUP_SOCKOPT ||
+ !ptr->data)
+ return -EINVAL;
+
+ if (sopt_kern->optval == ptr->data &&
+ !(sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)) {
+ /* This dynptr is initialized by bpf_so_optval_from() and
+ * the optval is not overwritten by bpf_so_optval_install()
+ * yet.
+ */
+ bpf_dynptr_set_null(ptr);
+ sopt_kern->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+ return 0;
+ }
+
+ if (sopt_kern->optval &&
+ !(sopt_kern->flags & (BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR)))
+ kfree(sopt_kern->optval);
+
+ sopt_kern->optval = ptr->data;
+ sopt_kern->optval_end = ptr->data + __bpf_dynptr_size(ptr);
+ sopt_kern->optlen = __bpf_dynptr_size(ptr);
+ sopt_kern->flags &= ~(BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_so_optval_release(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+
+ if (bpf_dynptr_get_type(ptr) != BPF_DYNPTR_TYPE_CGROUP_SOCKOPT ||
+ !ptr->data)
+ return -EINVAL;
+
+ if (sopt_kern->optval == ptr->data &&
+ !(sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER))
+ /* This dynptr is initialized by bpf_so_optval_from() and
+ * the optval is not overwritten by bpf_so_optval_install()
+ * yet.
+ */
+ sopt_kern->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+ else
+ kfree(ptr->data);
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+/* Initialize a sockopt dynptr from a user or installed optval pointer.
+ *
+ * sopt->optval can be a user pointer or a kernel pointer. A kernel pointer
+ * can be a buffer allocated by the caller of the BPF program or a buffer
+ * installed by other BPF programs through bpf_so_optval_install().
+ *
+ * At most one dynptr shall be created by this function at any moment, or
+ * it will return -EINVAL. You can create another dynptr with this function
+ * after releasing the previous one with bpf_so_optval_release().
+ *
+ * A dynptr that is initialized when optval is a user pointer is an
+ * exception. In this case, the dynptr will point to a kernel buffer with
+ * the same content as the user buffer. To simplify the code, users should
+ * always make sure to have only one dynptr initialized by this function at
+ * any moment.
+ */
+__bpf_kfunc int bpf_so_optval_from(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr__uninit,
+ unsigned int size)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+ int err;
+
+ bpf_dynptr_set_null(ptr__uninit);
+
+ if (size > (sopt_kern->optval_end - sopt_kern->optval))
+ return -EINVAL;
+
+ if (size == 0)
+ size = min(sopt_kern->optlen,
+ (int)(sopt_kern->optval_end - sopt_kern->optval));
+
+ if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR)
+ return -EINVAL;
+
+ if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ err = bpf_so_optval_alloc(sopt, sopt_kern->optlen, ptr__uninit);
+ if (err >= 0)
+ err = copy_from_user(ptr__uninit->data,
+ sopt_kern->optval,
+ size);
+ return err;
+ }
+
+ bpf_dynptr_init(ptr__uninit, sopt_kern->optval,
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0,
+ size);
+ sopt_kern->flags |= BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+
+ return size;
+}
+
+/**
+ * int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+ * void *ptr, u32 ptr__sz)
+ * Description
+ * Copy data from *ptr* to *sopt->optval*.
+ * Return
+ * >= 0 on success, or a negative error in case of failure.
+ */
+__bpf_kfunc int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+ void *ptr, u32 ptr__sz)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+ int ret;
+
+ if (ptr__sz > (sopt_kern->optval_end - sopt_kern->optval))
+ return -EINVAL;
+
+ if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ ret = copy_to_user(sopt_kern->optval, ptr,
+ ptr__sz);
+ if (unlikely(ret))
+ return -EFAULT;
+ } else {
+ /* Use memmove() in case of optval & ptr overlap. */
+ memmove(sopt_kern->optval, ptr, ptr__sz);
+ ret = ptr__sz;
+ }
+
+ return ret;
+}
+
+/**
+ * int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+ * struct bpf_dynptr_kern *ptr)
+ * Description
+ * Copy data from *ptr* to *sopt->optval*.
+ * Return
+ * >= 0 on success, or a negative error in case of failure.
+ */
+__bpf_kfunc int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr)
+{
+ __u32 size = bpf_dynptr_size(ptr);
+
+ return bpf_so_optval_copy_to_r(sopt, ptr->data, size);
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2517,6 +2713,12 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
BTF_SET8_START(cgroup_common_btf_ids)
BTF_ID_FLAGS(func, bpf_copy_to_user, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_copy_to_r, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_copy_to, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_alloc, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_install, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_release, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_from, KF_SLEEPABLE)
BTF_SET8_END(cgroup_common_btf_ids)
static const struct btf_kfunc_id_set cgroup_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ca27be76207a..a65e0117139e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -745,6 +745,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return "cgroup_sockopt";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -826,6 +828,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -842,6 +846,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return 0;
}
@@ -849,7 +855,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF ||
+ type == BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -10271,6 +10278,10 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_sockopt_alloc_optval,
+ KF_bpf_so_optval_install,
+ KF_bpf_so_optval_release,
+ KF_bpf_so_optval_from,
};
BTF_SET_START(special_kfunc_set)
@@ -10291,6 +10302,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_so_optval_alloc)
+BTF_ID(func, bpf_so_optval_install)
+BTF_ID(func, bpf_so_optval_release)
+BTF_ID(func, bpf_so_optval_from)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10313,6 +10328,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_so_optval_alloc)
+BTF_ID(func, bpf_so_optval_install)
+BTF_ID(func, bpf_so_optval_release)
+BTF_ID(func, bpf_so_optval_from)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10966,6 +10985,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
arg_type |= OBJ_RELEASE;
break;
case KF_ARG_PTR_TO_DYNPTR:
+ if (meta->func_id == special_kfunc_list[KF_bpf_so_optval_install] ||
+ meta->func_id == special_kfunc_list[KF_bpf_so_optval_release]) {
+ int ref_obj_id = dynptr_ref_obj_id(env, reg);
+
+ if (ref_obj_id < 0) {
+ verbose(env, "R%d is not a valid dynptr\n", regno);
+ return -EINVAL;
+ }
+
+ /* Required by check_func_arg_reg_off() */
+ arg_type |= ARG_PTR_TO_DYNPTR | OBJ_RELEASE;
+ meta->release_regno = regno;
+ }
+ break;
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
@@ -11053,6 +11086,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
return -EFAULT;
}
+ } else if ((meta->func_id == special_kfunc_list[KF_bpf_sockopt_alloc_optval] ||
+ meta->func_id == special_kfunc_list[KF_bpf_so_optval_from]) &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ dynptr_arg_type |= DYNPTR_TYPE_CGROUP_SOCKOPT;
}
ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
@@ -11361,7 +11398,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ verbose(env, "release refcounted PTR_TO_BTF_ID %s\n",
+ meta.func_name);
+ if (meta.func_id == special_kfunc_list[KF_bpf_so_optval_install] ||
+ meta.func_id == special_kfunc_list[KF_bpf_so_optval_release])
+ err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]);
+ else
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fff6f7dff408..c648a7a2b985 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7145,6 +7145,10 @@ struct bpf_sockopt {
enum bpf_sockopt_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* able to install new optval */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is referenced by a dynptr */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
struct bpf_pidns_info {
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread* [RFC bpf-next v2 6/6] bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type.
2023-08-11 4:31 [RFC bpf-next v2 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
` (4 preceding siblings ...)
2023-08-11 4:31 ` [RFC bpf-next v2 5/6] bpf: Add a new dynptr type for CGRUP_SOCKOPT thinker.li
@ 2023-08-11 4:31 ` thinker.li
5 siblings, 0 replies; 18+ messages in thread
From: thinker.li @ 2023-08-11 4:31 UTC (permalink / raw)
To: bpf, ast, martin.lau, song, kernel-team, andrii, sdf,
yonghong.song
Cc: sinquersw, kuifeng, Kui-Feng Lee
From: Kui-Feng Lee <kuifeng@meta.com>
Run the same tests as the non-sleepable ones.
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
.../testing/selftests/bpf/bpf_experimental.h | 44 ++++
tools/testing/selftests/bpf/bpf_kfuncs.h | 47 ++++
.../selftests/bpf/prog_tests/sockopt_sk.c | 34 ++-
.../testing/selftests/bpf/progs/sockopt_sk.c | 242 ++++++++++++++++++
.../selftests/bpf/verifier/sleepable.c | 2 +-
5 files changed, 366 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 209811b1993a..553c60818996 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -131,4 +131,48 @@ extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *nod
*/
extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+/*
+ * Description
+ * Copy data from *ptr* to *sopt->optval*.
+ * Return
+ * >= 0 on success, or a negative error in case of failure.
+ */
+extern int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+ void *ptr, u32 ptr__sz) __ksym;
+/*
+ * Description
+ * Copy data from *ptr* to *sopt->optval*.
+ * Return
+ * >= 0 on success, or a negative error in case of failure.
+ */
+extern int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
+/* Description
+ * Allocate a buffer of 'size' bytes for being installed as optval.
+ * Returns
+ * > 0 on success, the size of the allocated buffer
+ * -ENOMEM or -EINVAL on failure
+ */
+extern int bpf_so_optval_alloc(struct bpf_sockopt *sopt, int size,
+ struct bpf_dynptr *ptr__uninit) __ksym;
+
+/* Description
+ * Install the buffer pointed to by 'ptr' as optval.
+ * Returns
+ * 0 on success
+ * -EINVAL if the buffer is too small
+ */
+extern int bpf_so_optval_install(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
+/* Description
+ * Release the buffer allocated by bpf_so_optval_alloc.
+ * Returns
+ * 0 on success
+ * -EINVAL if the buffer was not allocated by bpf_so_optval_alloc
+ */
+extern int bpf_so_optval_release(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
#endif
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 642dda0e758a..7c01825b10c8 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -41,4 +41,51 @@ extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym;
extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym;
extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym;
+extern int bpf_copy_to_user(void *dst__uninit, __u32 dst__sz,
+ const void *src_ign) __ksym;
+
+extern int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+ void *ptr, __u32 ptr__sz) __ksym;
+
+extern int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
+/* Description
+ * Allocate a buffer of 'size' bytes for being installed as optval.
+ * Returns
+ * > 0 on success, the size of the allocated buffer
+ * -ENOMEM or -EINVAL on failure
+ */
+extern int bpf_so_optval_alloc(struct bpf_sockopt *sopt, int size,
+ struct bpf_dynptr *ptr__uninit) __ksym;
+
+/* Description
+ * Install the buffer pointed to by 'ptr' as optval.
+ * Returns
+ * 0 on success
+ * -EINVAL if the buffer is too small
+ */
+extern int bpf_so_optval_install(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
+/* Description
+ * Release the buffer allocated by bpf_so_optval_alloc.
+ * Returns
+ * 0 on success
+ * -EINVAL if the buffer was not allocated by bpf_so_optval_alloc
+ */
+extern int bpf_so_optval_release(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr) __ksym;
+
+/* Description
+ * Initialize a dynptr to access the content of optval passing
+ * to {get,set}sockopt()s.
+ * Returns
+ * > 0 on success, the size of the allocated buffer
+ * -ENOMEM or -EINVAL on failure
+ */
+extern int bpf_so_optval_from(struct bpf_sockopt *sopt,
+ struct bpf_dynptr *ptr__uninit,
+ unsigned int size) __ksym;
+
#endif
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index 05d0e07da394..e18a40d89860 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -92,6 +92,7 @@ static int getsetsockopt(void)
}
if (buf.u8[0] != 0x01) {
log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]);
+ log_err("optlen %d", optlen);
goto err;
}
@@ -220,7 +221,7 @@ static int getsetsockopt(void)
return -1;
}
-static void run_test(int cgroup_fd)
+static void run_test_nonsleepable(int cgroup_fd)
{
struct sockopt_sk *skel;
@@ -246,6 +247,32 @@ static void run_test(int cgroup_fd)
sockopt_sk__destroy(skel);
}
+static void run_test_sleepable(int cgroup_fd)
+{
+ struct sockopt_sk *skel;
+
+ skel = sockopt_sk__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_load"))
+ goto cleanup;
+
+ skel->bss->page_size = getpagesize();
+
+ skel->links._setsockopt_s =
+ bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd);
+ if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link"))
+ goto cleanup;
+
+ skel->links._getsockopt_s =
+ bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd);
+ if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link"))
+ goto cleanup;
+
+ ASSERT_OK(getsetsockopt(), "getsetsockopt");
+
+cleanup:
+ sockopt_sk__destroy(skel);
+}
+
void test_sockopt_sk(void)
{
int cgroup_fd;
@@ -254,6 +281,9 @@ void test_sockopt_sk(void)
if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk"))
return;
- run_test(cgroup_fd);
+ if (test__start_subtest("nonsleepable"))
+ run_test_nonsleepable(cgroup_fd);
+ if (test__start_subtest("sleepable"))
+ run_test_sleepable(cgroup_fd);
close(cgroup_fd);
}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index cb990a7d3d45..e1aacefd301e 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -5,6 +5,9 @@
#include <netinet/in.h>
#include <bpf/bpf_helpers.h>
+typedef int bool;
+#include "bpf_kfuncs.h"
+
char _license[] SEC("license") = "GPL";
int page_size = 0; /* userspace should set it */
@@ -136,6 +139,129 @@ int _getsockopt(struct bpf_sockopt *ctx)
return 1;
}
+SEC("cgroup/getsockopt.s")
+int _getsockopt_s(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+ struct tcp_zerocopy_receive *zcvr;
+ struct bpf_dynptr optval_dynptr;
+ struct sockopt_sk *storage;
+ struct bpf_sock *sk;
+ char buf[1];
+ __u64 addr;
+ int ret;
+
+ if (ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ optval_end = ctx->optval_end;
+ optval = ctx->optval;
+ }
+
+ /* Bypass AF_NETLINK. */
+ sk = ctx->sk;
+ if (sk && sk->family == AF_NETLINK)
+ return 1;
+
+ /* Make sure bpf_get_netns_cookie is callable.
+ */
+ if (bpf_get_netns_cookie(NULL) == 0)
+ return 0;
+
+ if (bpf_get_netns_cookie(ctx) == 0)
+ return 0;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Not interested in SOL_SOCKET:SO_SNDBUF;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
+ /* Not interested in SOL_TCP:TCP_CONGESTION;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+ /* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+ * It has a custom implementation for performance
+ * reasons.
+ */
+
+ bpf_so_optval_from(ctx, &optval_dynptr, sizeof(*zcvr));
+ zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr));
+ addr = zcvr ? zcvr->address : 0;
+ bpf_so_optval_release(ctx, &optval_dynptr);
+
+ return addr != 0 ? 0 : 1;
+ }
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ if (optval + 1 > optval_end)
+ return 0; /* bounds check */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ /* Always export 0x55 */
+ buf[0] = 0x55;
+ ret = bpf_so_optval_alloc(ctx, 1, &optval_dynptr);
+ if (ret >= 0) {
+ bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0);
+ ret = bpf_so_optval_copy_to(ctx, &optval_dynptr);
+ }
+ bpf_so_optval_release(ctx, &optval_dynptr);
+ if (ret < 0)
+ return 0;
+ ctx->optlen = 1;
+
+ /* Userspace buffer is PAGE_SIZE * 2, but BPF
+ * program can only see the first PAGE_SIZE
+ * bytes of data.
+ */
+ if (optval_end - optval != page_size && 0)
+ return 0; /* unexpected data size */
+
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0; /* couldn't get sk storage */
+
+ if (!ctx->retval)
+ return 0; /* kernel should not have handled
+ * SOL_CUSTOM, something is wrong!
+ */
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ buf[0] = storage->val;
+ ret = bpf_so_optval_copy_to_r(ctx, buf, 1);
+ if (ret < 0)
+ return 0;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
SEC("cgroup/setsockopt")
int _setsockopt(struct bpf_sockopt *ctx)
{
@@ -236,3 +362,119 @@ int _setsockopt(struct bpf_sockopt *ctx)
ctx->optlen = 0;
return 1;
}
+
+SEC("cgroup/setsockopt.s")
+int _setsockopt_s(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct bpf_dynptr optval_buf;
+ __u8 *optval = ctx->optval;
+ struct sockopt_sk *storage;
+ struct bpf_sock *sk;
+ __u8 tmp_u8;
+ __u32 tmp;
+ int ret;
+
+ if (ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ optval_end = ctx->optval_end;
+ optval = ctx->optval;
+ }
+
+ /* Bypass AF_NETLINK. */
+ sk = ctx->sk;
+ if (sk && sk->family == AF_NETLINK)
+ return -1;
+
+ /* Make sure bpf_get_netns_cookie is callable.
+ */
+ if (bpf_get_netns_cookie(NULL) == 0)
+ return 0;
+
+ if (bpf_get_netns_cookie(ctx) == 0)
+ return 0;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
+ return 1;
+ }
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Overwrite SO_SNDBUF value */
+
+ ret = bpf_so_optval_alloc(ctx, sizeof(__u32), &optval_buf);
+ if (ret < 0)
+ bpf_so_optval_release(ctx, &optval_buf);
+ else {
+ tmp = 0x55AA;
+ bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0);
+ ret = bpf_so_optval_install(ctx, &optval_buf);
+ }
+
+ return ret >= 0 ? 1 : 0;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
+ /* Always use cubic */
+
+ ret = bpf_so_optval_alloc(ctx, 5, &optval_buf);
+ if (ret < 0) {
+ bpf_so_optval_release(ctx, &optval_buf);
+ return 0;
+ }
+ bpf_dynptr_write(&optval_buf, 0, "cubic", 5, 0);
+ ret = bpf_so_optval_install(ctx, &optval_buf);
+ if (ret < 0)
+ return 0;
+ ctx->optlen = 5;
+
+ return 1;
+ }
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ /* Original optlen is larger than PAGE_SIZE. */
+ if (ctx->optlen != page_size * 2)
+ return 0; /* unexpected data size */
+
+ ret = bpf_so_optval_alloc(ctx, 1, &optval_buf);
+ if (ret < 0) {
+ bpf_so_optval_release(ctx, &optval_buf);
+ return 0;
+ }
+ tmp_u8 = 0;
+ bpf_dynptr_write(&optval_buf, 0, &tmp_u8, 1, 0);
+ ret = bpf_so_optval_install(ctx, &optval_buf);
+ if (ret < 0)
+ return 0;
+ ctx->optlen = 1;
+
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0; /* couldn't get sk storage */
+
+ bpf_so_optval_from(ctx, &optval_buf, sizeof(__u8));
+ optval = bpf_dynptr_data(&optval_buf, 0, sizeof(__u8));
+ if (optval) {
+ storage->val = *optval;
+ ctx->optlen = -1; /* BPF has consumed this option, don't call
+ * kernel setsockopt handler.
+ */
+ }
+ bpf_so_optval_release(ctx, &optval_buf);
+
+ return optval ? 1 : 0;
+}
+
diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c
index 1f0d2bdc673f..4b6c1117ec9f 100644
--- a/tools/testing/selftests/bpf/verifier/sleepable.c
+++ b/tools/testing/selftests/bpf/verifier/sleepable.c
@@ -85,7 +85,7 @@
.expected_attach_type = BPF_TRACE_RAW_TP,
.kfunc = "sched_switch",
.result = REJECT,
- .errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable",
+ .errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, cgroup, and struct_ops programs can be sleepable",
.flags = BPF_F_SLEEPABLE,
.runs = -1,
},
--
2.34.1
^ permalink raw reply related [flat|nested] 18+ messages in thread