From: thinker.li@gmail.com
To: bpf@vger.kernel.org, ast@kernel.org, martin.lau@linux.dev,
song@kernel.org, kernel-team@meta.com, andrii@kernel.org,
sdf@google.com, yonghong.song@linux.dev
Cc: sinquersw@gmail.com, kuifeng@meta.com,
Kui-Feng Lee <thinker.li@gmail.com>
Subject: [RFC bpf-next v4 5/6] bpf: Add a new dynptr type for CGRUP_SOCKOPT.
Date: Fri, 18 Aug 2023 20:01:42 -0700 [thread overview]
Message-ID: <20230819030143.419729-6-thinker.li@gmail.com> (raw)
In-Reply-To: <20230819030143.419729-1-thinker.li@gmail.com>
From: Kui-Feng Lee <thinker.li@gmail.com>
The new dynptr type (BPF_DYNPTR_TYPE_CGROUP_SOCKOPT) will be used by BPF
programs to create a buffer that can be installed on ctx to replace
exisiting optval or user_optval.
Installation is only allowed if ctx->flags &
BPF_SOCKOPT_FLAG_OPTVAL_REPLACE is true. It is enabled only for sleepable
programs on the cgroup/setsockopt hook. BPF programs can install a new
buffer holding by a dynptr to increase the size of optval passed to
setsockopt().
Installation is not enabled for cgroup/getsockopt since you can not
increased a buffer created, by user program, to return data from
getsockopt().
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
include/linux/bpf.h | 7 ++-
include/linux/filter.h | 4 ++
kernel/bpf/btf.c | 3 +
kernel/bpf/cgroup.c | 5 +-
kernel/bpf/helpers.c | 140 +++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 38 ++++++++++-
6 files changed, 193 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40a3d392b7f1..aad34298bfd3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -663,12 +663,15 @@ enum bpf_type_flag {
/* DYNPTR points to xdp_buff */
DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+ /* DYNPTR points to optval buffer of bpf_sockopt */
+ DYNPTR_TYPE_CGROUP_SOCKOPT = BIT(17 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
- | DYNPTR_TYPE_XDP)
+ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_CGROUP_SOCKOPT)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -1208,6 +1211,8 @@ enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_SKB,
/* Underlying data is a xdp_buff */
BPF_DYNPTR_TYPE_XDP,
+ /* Underlying data is for the optval of a cgroup sock */
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT,
};
int bpf_dynptr_check_size(u32 size);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2aa2a96526de..df12fddd2f21 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1347,6 +1347,10 @@ struct bpf_sockopt_kern {
enum bpf_sockopt_kern_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* able to install new optval */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is referenced by a dynptr */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 249657c466dd..6d6a040688be 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_CGROUP_SOCKOPT,
BTF_KFUNC_HOOK_MAX,
};
@@ -7846,6 +7847,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ return BTF_KFUNC_HOOK_CGROUP_SOCKOPT;
default:
return BTF_KFUNC_HOOK_MAX;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 1b2006dac4d5..b27a4fbc6ffe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1866,6 +1866,8 @@ static int filter_setsockopt_progs_cb(void *arg,
if (max_optlen < 0)
return max_optlen;
+ ctx->flags = BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
+
if (copy_from_user(ctx->optval, optval,
min(ctx->optlen, max_optlen)) != 0)
return -EFAULT;
@@ -1894,7 +1896,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ctx.optlen = *optlen;
ctx.optval = optval;
ctx.optval_end = optval + *optlen;
- ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
lock_sock(sk);
ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..5be1fd9e64f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1519,6 +1519,51 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
+static int __bpf_sockopt_store_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+ void *src, u32 len)
+{
+ int buf_len, err;
+ void *buf;
+
+ if (!src)
+ return 0;
+
+ if (sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ if (!(sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_REPLACE))
+ return copy_to_user(sopt->optval + offset, src, len) ?
+ -EFAULT : 0;
+ buf_len = sopt->optval_end - sopt->optval;
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ err = copy_from_user(buf, sopt->optval, buf_len) ? -EFAULT : 0;
+ if (err < 0) {
+ kfree(buf);
+ return err;
+ }
+ sopt->optval = buf;
+ sopt->optval_end = buf + len;
+ sopt->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ memcpy(buf + offset, src, len);
+ }
+
+ memcpy(sopt->optval + offset, src, len);
+
+ return 0;
+}
+
+static int __bpf_sockopt_load_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+ void *dst, u32 len)
+{
+ if (sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
+ return copy_from_user(dst, sopt->optval + offset, len) ?
+ -EFAULT : 0;
+
+ memcpy(dst, sopt->optval + offset, len);
+
+ return 0;
+}
+
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
@@ -1547,6 +1592,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return __bpf_sockopt_load_bytes(src->data, src->offset + offset, dst, len);
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1597,6 +1644,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return __bpf_sockopt_store_bytes(dst->data,
+ dst->offset + offset,
+ src, len);
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1634,6 +1685,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
@@ -2278,6 +2330,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return NULL;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2429,6 +2483,80 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+__bpf_kfunc int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr)
+{
+ bpf_dynptr_set_null(ptr);
+ return 0;
+}
+
+/* Initialize a sockopt dynptr from a user or installed optval pointer.
+ *
+ * sopt->optval can be a user pointer or a kernel pointer. A kernel pointer
+ * can be a buffer allocated by the caller of the BPF program or a buffer
+ * installed by other BPF programs through bpf_sockopt_dynptr_install().
+ *
+ * Atmost one dynptr shall be created by this function at any moment, or
+ * it will return -EINVAL. You can create another dypptr by this function
+ * after release the previous one by bpf_sockopt_dynptr_release().
+ *
+ * A dynptr that is initialized when optval is a user pointer is an
+ * exception. In this case, the dynptr will point to a kernel buffer with
+ * the same content as the user buffer. To simplify the code, users should
+ * always make sure having only one dynptr initialized by this function at
+ * any moment.
+ */
+__bpf_kfunc int bpf_dynptr_from_sockopt(struct bpf_sockopt *sopt,
+ struct bpf_dynptr_kern *ptr__uninit)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+ unsigned int size;
+
+ size = sopt_kern->optval_end - sopt_kern->optval;
+
+ bpf_dynptr_init(ptr__uninit, sopt,
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0,
+ size);
+
+ return size;
+}
+
+__bpf_kfunc int bpf_sockopt_grow_to(struct bpf_sockopt *sopt,
+ u32 newsize)
+{
+ struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+ void *newoptval;
+ int err;
+
+ if (newsize > DYNPTR_MAX_SIZE)
+ return -EINVAL;
+
+ if (newsize <= sopt_kern->optlen)
+ return 0;
+
+ if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+ newoptval = kmalloc(newsize, GFP_KERNEL);
+ if (!newoptval)
+ return -ENOMEM;
+ err = copy_from_user(newoptval, sopt_kern->optval,
+ sopt_kern->optval_end - sopt_kern->optval);
+ if (err < 0) {
+ kfree(newoptval);
+ return err;
+ }
+ sopt_kern->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ } else {
+ newoptval = krealloc(sopt_kern->optval, newsize, GFP_KERNEL);
+ if (!newoptval)
+ return -ENOMEM;
+ }
+
+ sopt_kern->optval = newoptval;
+ sopt_kern->optval_end = newoptval + newsize;
+
+ return 0;
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2494,6 +2622,17 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
.set = &common_btf_ids,
};
+BTF_SET8_START(cgroup_common_btf_ids)
+BTF_ID_FLAGS(func, bpf_sockopt_dynptr_release, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_sockopt, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_sockopt_grow_to, KF_SLEEPABLE)
+BTF_SET8_END(cgroup_common_btf_ids)
+
+static const struct btf_kfunc_id_set cgroup_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cgroup_common_btf_ids,
+};
+
static int __init kfunc_init(void)
{
int ret;
@@ -2513,6 +2652,7 @@ static int __init kfunc_init(void)
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCKOPT, &cgroup_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83731e998b09..15119ff90bff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -755,6 +755,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return "cgroup_sockopt";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -836,6 +838,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -852,6 +856,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return 0;
}
@@ -859,7 +865,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF ||
+ type == BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -10300,6 +10307,8 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_sockopt_dynptr_release,
+ KF_bpf_dynptr_from_sockopt,
};
BTF_SET_START(special_kfunc_set)
@@ -10320,6 +10329,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10342,6 +10353,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10995,6 +11008,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
arg_type |= OBJ_RELEASE;
break;
case KF_ARG_PTR_TO_DYNPTR:
+ if (meta->func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release]) {
+ int ref_obj_id = dynptr_ref_obj_id(env, reg);
+
+ if (ref_obj_id < 0) {
+ verbose(env, "R%d is not a valid dynptr\n", regno);
+ return -EINVAL;
+ }
+
+ /* Required by check_func_arg_reg_off() */
+ arg_type |= ARG_PTR_TO_DYNPTR | OBJ_RELEASE;
+ meta->release_regno = regno;
+ }
+ break;
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
@@ -11082,6 +11108,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
return -EFAULT;
}
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_sockopt] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ dynptr_arg_type |= DYNPTR_TYPE_CGROUP_SOCKOPT;
}
ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
@@ -11390,7 +11419,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ verbose(env, "release refcounted PTR_TO_BTF_ID %s\n",
+ meta.func_name);
+ if (meta.func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release])
+ err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]);
+ else
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id);
--
2.34.1
next prev parent reply other threads:[~2023-08-19 3:01 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-08-19 3:01 [RFC bpf-next v4 0/6] Sleepable BPF programs on cgroup {get,set}sockopt thinker.li
2023-08-19 3:01 ` [RFC bpf-next v4 1/6] bpf: enable sleepable BPF programs attached to cgroup/{get,set}sockopt thinker.li
2023-08-19 3:01 ` [RFC bpf-next v4 2/6] libbpf: add sleepable sections for {get,set}sockopt() thinker.li
2023-08-19 3:01 ` [RFC bpf-next v4 3/6] Add PTR_TO_AUX thinker.li
2023-08-19 3:01 ` [RFC bpf-next v4 4/6] bpf: Prevent BPF programs from access the buffer pointed by user_optval thinker.li
2023-08-19 3:01 ` thinker.li [this message]
2023-08-19 3:01 ` [RFC bpf-next v4 6/6] selftests/bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type thinker.li
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230819030143.419729-6-thinker.li@gmail.com \
--to=thinker.li@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=kernel-team@meta.com \
--cc=kuifeng@meta.com \
--cc=martin.lau@linux.dev \
--cc=sdf@google.com \
--cc=sinquersw@gmail.com \
--cc=song@kernel.org \
--cc=yonghong.song@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox