From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, io-uring@vger.kernel.org
Cc: Caleb Sander Mateos <csander@purestorage.com>,
Akilesh Kailash <akailash@google.com>,
bpf@vger.kernel.org, Alexei Starovoitov <ast@kernel.org>,
Ming Lei <ming.lei@redhat.com>
Subject: [PATCH 3/5] io_uring: bpf: extend io_uring with bpf struct_ops
Date: Wed, 5 Nov 2025 00:21:18 +0800
Message-ID: <20251104162123.1086035-4-ming.lei@redhat.com>
In-Reply-To: <20251104162123.1086035-1-ming.lei@redhat.com>
io_uring can be extended with bpf struct_ops in the following ways:

1) add new io_uring operations from the application
   - one typical use case is operating on a device zero-copy buffer,
     which belongs to the kernel and is either not visible to userspace
     or too expensive to export to it, for example copying data from
     the buffer to userspace, decompressing data into the zero-copy
     buffer in the Android case[1][2], or checksumming/decrypting it

   [1] https://lpc.events/event/18/contributions/1710/attachments/1440/3070/LPC2024_ublk_zero_copy.pdf

2) extend the 64-byte SQE, since a bpf map can conveniently store
   extra IO data

3) communicate within an IO chain: a bpf map can be shared among IOs,
   so when one bpf IO completes it can write data to a chain-wide bpf
   map and the following bpf IO can retrieve the data from there; this
   is more flexible than the io_uring built-in buffer

4) inject errors easily for test purposes

bpf struct_ops is a handy way to attach a bpf prog to the kernel. This
patch wires the existing io_uring operation callbacks to the newly
added uring bpf struct_ops, so an application can define its own uring
bpf operations.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
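For illustration, here is a minimal sketch of what the program side
could look like on top of this patch. struct uring_bpf_ops, struct
uring_bpf_data, the prep_fn/issue_fn members and the
uring_bpf_set_result() kfunc are what this patch adds; the SEC() names,
BPF_PROG() usage, the ".struct_ops.link" map and the demo_* identifiers
follow the usual libbpf struct_ops conventions and are only a sketch of
possible usage, not a userspace API defined by this patch:

    /* SPDX-License-Identifier: GPL-2.0 */
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    /* kfunc exported by io_uring/bpf.c in this patch */
    extern void uring_bpf_set_result(struct uring_bpf_data *data, int res) __ksym;

    SEC("struct_ops/prep_fn")
    int BPF_PROG(demo_prep, struct uring_bpf_data *data,
                 const struct io_uring_sqe *sqe)
    {
            /* per-IO state can be kept in the writable data->pdu[] area */
            return 0;
    }

    SEC("struct_ops/issue_fn")
    int BPF_PROG(demo_issue, struct uring_bpf_data *data)
    {
            /* do the actual work, then post the CQE result for this request */
            uring_bpf_set_result(data, 0);
            return 0;
    }

    SEC(".struct_ops.link")
    struct uring_bpf_ops demo_ops = {
            .id       = 0,  /* slot in the kernel-side bpf_ops[] table */
            .prep_fn  = (void *)demo_prep,
            .issue_fn = (void *)demo_issue,
    };

    char LICENSE[] SEC("license") = "GPL";

On the submission side, the application would then select this ops
slot via the new sqe->bpf_op_flags field, roughly
(id << IORING_BPF_OP_SHIFT) | flags, with the low 24 bits passed
through to the bpf prog, and the ring has to be set up with
IORING_SETUP_BPF for such requests to be accepted.
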
include/uapi/linux/io_uring.h | 9 ++
io_uring/bpf.c | 271 +++++++++++++++++++++++++++++++++-
io_uring/io_uring.c | 1 +
io_uring/io_uring.h | 3 +-
io_uring/uring_bpf.h | 30 ++++
5 files changed, 311 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b8c49813b4e5..94d2050131ac 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -74,6 +74,7 @@ struct io_uring_sqe {
__u32 install_fd_flags;
__u32 nop_flags;
__u32 pipe_flags;
+ __u32 bpf_op_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -427,6 +428,13 @@ enum io_uring_op {
#define IORING_RECVSEND_BUNDLE (1U << 4)
#define IORING_SEND_VECTORIZED (1U << 5)
+/*
+ * The top 8 bits of sqe->bpf_op_flags store the bpf op index;
+ * the remaining 24 bits are flags passed to the bpf prog.
+ */
+#define IORING_BPF_OP_BITS (8)
+#define IORING_BPF_OP_SHIFT (24)
+
/*
* cqe.res for IORING_CQE_F_NOTIF if
* IORING_SEND_ZC_REPORT_USAGE was requested
@@ -631,6 +639,7 @@ struct io_uring_params {
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
#define IORING_FEAT_RW_ATTR (1U << 16)
#define IORING_FEAT_NO_IOWAIT (1U << 17)
+#define IORING_FEAT_BPF (1U << 18)
/*
* io_uring_register(2) opcodes and arguments
diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index bb1e37d1e804..8227be6d5a10 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -4,28 +4,95 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <uapi/linux/io_uring.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
#include "io_uring.h"
#include "uring_bpf.h"
+#define MAX_BPF_OPS_COUNT (1 << IORING_BPF_OP_BITS)
+
static DEFINE_MUTEX(uring_bpf_ctx_lock);
static LIST_HEAD(uring_bpf_ctx_list);
+DEFINE_STATIC_SRCU(uring_bpf_srcu);
+static struct uring_bpf_ops bpf_ops[MAX_BPF_OPS_COUNT];
-int io_uring_bpf_issue(struct io_kiocb *req, unsigned int issue_flags)
+static inline unsigned char uring_bpf_get_op(unsigned int op_flags)
{
- return -ECANCELED;
+ return (unsigned char)(op_flags >> IORING_BPF_OP_SHIFT);
+}
+
+static inline unsigned int uring_bpf_get_flags(unsigned int op_flags)
+{
+ return op_flags & ((1U << IORING_BPF_OP_SHIFT) - 1);
+}
+
+static inline struct uring_bpf_ops *uring_bpf_get_ops(struct uring_bpf_data *data)
+{
+ return &bpf_ops[uring_bpf_get_op(data->opf)];
}
int io_uring_bpf_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ struct uring_bpf_data *data = io_kiocb_to_cmd(req, struct uring_bpf_data);
+ unsigned int op_flags = READ_ONCE(sqe->bpf_op_flags);
+ struct uring_bpf_ops *ops;
+
+ if (!(req->ctx->flags & IORING_SETUP_BPF))
+ return -EACCES;
+
+ data->opf = op_flags;
+ ops = &bpf_ops[uring_bpf_get_op(data->opf)];
+
+ if (ops->prep_fn)
+ return ops->prep_fn(data, sqe);
return -EOPNOTSUPP;
}
+static int __io_uring_bpf_issue(struct io_kiocb *req)
+{
+ struct uring_bpf_data *data = io_kiocb_to_cmd(req, struct uring_bpf_data);
+ struct uring_bpf_ops *ops = uring_bpf_get_ops(data);
+
+ if (ops->issue_fn)
+ return ops->issue_fn(data);
+ return -ECANCELED;
+}
+
+int io_uring_bpf_issue(struct io_kiocb *req, unsigned int issue_flags)
+{
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ int idx, ret;
+
+ idx = srcu_read_lock(&uring_bpf_srcu);
+ ret = __io_uring_bpf_issue(req);
+ srcu_read_unlock(&uring_bpf_srcu, idx);
+
+ return ret;
+ }
+ return __io_uring_bpf_issue(req);
+}
+
void io_uring_bpf_fail(struct io_kiocb *req)
{
+ struct uring_bpf_data *data = io_kiocb_to_cmd(req, struct uring_bpf_data);
+ struct uring_bpf_ops *ops = uring_bpf_get_ops(data);
+
+ if (ops->fail_fn)
+ ops->fail_fn(data);
}
void io_uring_bpf_cleanup(struct io_kiocb *req)
{
+ struct uring_bpf_data *data = io_kiocb_to_cmd(req, struct uring_bpf_data);
+ struct uring_bpf_ops *ops = uring_bpf_get_ops(data);
+
+ if (ops->cleanup_fn)
+ ops->cleanup_fn(data);
}
void uring_bpf_add_ctx(struct io_ring_ctx *ctx)
@@ -39,3 +106,203 @@ void uring_bpf_del_ctx(struct io_ring_ctx *ctx)
guard(mutex)(&uring_bpf_ctx_lock);
list_del(&ctx->bpf_node);
}
+
+static const struct btf_type *uring_bpf_data_type;
+
+static bool uring_bpf_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int uring_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != uring_bpf_data_type) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ if (off < offsetof(struct uring_bpf_data, pdu) ||
+ off + size >= sizeof(struct uring_bpf_data))
+ return -EACCES;
+
+ return NOT_INIT;
+}
+
+static const struct bpf_verifier_ops io_bpf_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = uring_bpf_ops_is_valid_access,
+ .btf_struct_access = uring_bpf_ops_btf_struct_access,
+};
+
+static int uring_bpf_ops_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "uring_bpf_data", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ uring_bpf_data_type = btf_type_by_id(btf, type_id);
+ return 0;
+}
+
+static int uring_bpf_ops_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ return 0;
+}
+
+static int uring_bpf_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct uring_bpf_ops *uuring_bpf_ops;
+ struct uring_bpf_ops *kuring_bpf_ops;
+ u32 moff;
+
+ uuring_bpf_ops = (const struct uring_bpf_ops *)udata;
+ kuring_bpf_ops = (struct uring_bpf_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct uring_bpf_ops, id):
+ /* For the id field, this function has to copy it and return 1 to
+ * indicate that the data has been handled by the struct_ops
+ * type, or the verifier will reject the map if the value of
+ * this field is not zero.
+ */
+ kuring_bpf_ops->id = uuring_bpf_ops->id;
+ return 1;
+ }
+ return 0;
+}
+
+static int io_bpf_reg_unreg(struct uring_bpf_ops *ops, bool reg)
+{
+ struct io_ring_ctx *ctx;
+ int ret = 0;
+
+ guard(mutex)(&uring_bpf_ctx_lock);
+ list_for_each_entry(ctx, &uring_bpf_ctx_list, bpf_node)
+ mutex_lock(&ctx->uring_lock);
+
+ if (reg) {
+ if (bpf_ops[ops->id].issue_fn)
+ ret = -EBUSY;
+ else
+ bpf_ops[ops->id] = *ops;
+ } else {
+ bpf_ops[ops->id] = (struct uring_bpf_ops) {0};
+ }
+
+ synchronize_srcu(&uring_bpf_srcu);
+
+ list_for_each_entry(ctx, &uring_bpf_ctx_list, bpf_node)
+ mutex_unlock(&ctx->uring_lock);
+
+ return ret;
+}
+
+static int io_bpf_reg(void *kdata, struct bpf_link *link)
+{
+ struct uring_bpf_ops *ops = kdata;
+
+ return io_bpf_reg_unreg(ops, true);
+}
+
+static void io_bpf_unreg(void *kdata, struct bpf_link *link)
+{
+ struct uring_bpf_ops *ops = kdata;
+
+ io_bpf_reg_unreg(ops, false);
+}
+
+static int io_bpf_prep_io(struct uring_bpf_data *data, const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_bpf_issue_io(struct uring_bpf_data *data)
+{
+ return -ECANCELED;
+}
+
+static void io_bpf_fail_io(struct uring_bpf_data *data)
+{
+}
+
+static void io_bpf_cleanup_io(struct uring_bpf_data *data)
+{
+}
+
+static struct uring_bpf_ops __bpf_uring_bpf_ops = {
+ .prep_fn = io_bpf_prep_io,
+ .issue_fn = io_bpf_issue_io,
+ .fail_fn = io_bpf_fail_io,
+ .cleanup_fn = io_bpf_cleanup_io,
+};
+
+static struct bpf_struct_ops bpf_uring_bpf_ops = {
+ .verifier_ops = &io_bpf_verifier_ops,
+ .init = uring_bpf_ops_init,
+ .check_member = uring_bpf_ops_check_member,
+ .init_member = uring_bpf_ops_init_member,
+ .reg = io_bpf_reg,
+ .unreg = io_bpf_unreg,
+ .name = "uring_bpf_ops",
+ .cfi_stubs = &__bpf_uring_bpf_ops,
+ .owner = THIS_MODULE,
+};
+
+__bpf_kfunc_start_defs();
+__bpf_kfunc void uring_bpf_set_result(struct uring_bpf_data *data, int res)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(data);
+
+ if (res < 0)
+ req_set_fail(req);
+ io_req_set_res(req, res, 0);
+}
+
+/* io_kiocb layout might be changed */
+__bpf_kfunc struct io_kiocb *uring_bpf_data_to_req(struct uring_bpf_data *data)
+{
+ return cmd_to_io_kiocb(data);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(uring_bpf_kfuncs)
+BTF_ID_FLAGS(func, uring_bpf_set_result)
+BTF_ID_FLAGS(func, uring_bpf_data_to_req)
+BTF_KFUNCS_END(uring_bpf_kfuncs)
+
+static const struct btf_kfunc_id_set uring_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &uring_bpf_kfuncs,
+};
+
+int __init io_bpf_init(void)
+{
+ int err;
+
+ err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &uring_kfunc_set);
+ if (err) {
+ pr_warn("error while setting UBLK BPF tracing kfuncs: %d", err);
+ return err;
+ }
+
+ err = register_bpf_struct_ops(&bpf_uring_bpf_ops, uring_bpf_ops);
+ if (err)
+ pr_warn("error while registering io_uring bpf struct ops: %d", err);
+
+ return 0;
+}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 38f03f6c28cb..d2517e09407a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3851,6 +3851,7 @@ static int __init io_uring_init(void)
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif
+ io_bpf_init();
return 0;
};
__initcall(io_uring_init);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 4baf21a9e1ee..3f19bb079bcc 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -34,7 +34,8 @@
IORING_FEAT_RECVSEND_BUNDLE |\
IORING_FEAT_MIN_TIMEOUT |\
IORING_FEAT_RW_ATTR |\
- IORING_FEAT_NO_IOWAIT)
+ IORING_FEAT_NO_IOWAIT |\
+ IORING_FEAT_BPF)
#define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\
IORING_SETUP_SQPOLL |\
diff --git a/io_uring/uring_bpf.h b/io_uring/uring_bpf.h
index b6cda6df99b1..c76eba887d22 100644
--- a/io_uring/uring_bpf.h
+++ b/io_uring/uring_bpf.h
@@ -2,6 +2,29 @@
#ifndef IOU_BPF_H
#define IOU_BPF_H
+struct uring_bpf_data {
+ /* readonly for bpf prog */
+ struct file *file;
+ u32 opf;
+
+ /* writeable for bpf prog */
+ u8 pdu[64 - sizeof(struct file *) - sizeof(u32)];
+};
+
+typedef int (*uring_io_prep_t)(struct uring_bpf_data *data,
+ const struct io_uring_sqe *sqe);
+typedef int (*uring_io_issue_t)(struct uring_bpf_data *data);
+typedef void (*uring_io_fail_t)(struct uring_bpf_data *data);
+typedef void (*uring_io_cleanup_t)(struct uring_bpf_data *data);
+
+struct uring_bpf_ops {
+ unsigned short id;
+ uring_io_prep_t prep_fn;
+ uring_io_issue_t issue_fn;
+ uring_io_fail_t fail_fn;
+ uring_io_cleanup_t cleanup_fn;
+};
+
#ifdef CONFIG_IO_URING_BPF
int io_uring_bpf_issue(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_bpf_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -11,6 +34,8 @@ void io_uring_bpf_cleanup(struct io_kiocb *req);
void uring_bpf_add_ctx(struct io_ring_ctx *ctx);
void uring_bpf_del_ctx(struct io_ring_ctx *ctx);
+int __init io_bpf_init(void);
+
#else
static inline int io_uring_bpf_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -33,5 +58,10 @@ static inline void uring_bpf_add_ctx(struct io_ring_ctx *ctx)
static inline void uring_bpf_del_ctx(struct io_ring_ctx *ctx)
{
}
+
+static inline int __init io_bpf_init(void)
+{
+ return 0;
+}
#endif
#endif
--
2.47.0