All of lore.kernel.org
 help / color / mirror / Atom feed
From: Keith Busch <kbusch@meta.com>
To: <ming.lei@redhat.com>, <asml.silence@gmail.com>,
	<axboe@kernel.dk>, <linux-block@vger.kernel.org>,
	<io-uring@vger.kernel.org>
Cc: <bernd@bsbernd.com>, Keith Busch <kbusch@kernel.org>
Subject: [PATCHv2 3/6] io_uring: add support for kernel registered bvecs
Date: Mon, 10 Feb 2025 16:56:43 -0800	[thread overview]
Message-ID: <20250211005646.222452-4-kbusch@meta.com> (raw)
In-Reply-To: <20250211005646.222452-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

Provide an interface for the kernel to register its own bvecs in the
fixed (pre-registered) buffer table that io_uring already provides. User
space can then reference these buffers in later requests to achieve
zero-copy IO.

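User space must first register a fixed buffer table with io_uring and
leave the target slot empty in order for the kernel to make use of it:
registration returns -EINVAL if there is no table or the index is out of
range, and -EBUSY if the slot is already occupied.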

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/io_uring.h       |   1 +
 include/linux/io_uring_types.h |   4 ++
 io_uring/rsrc.c                | 100 +++++++++++++++++++++++++++++++--
 io_uring/rsrc.h                |   1 +
 4 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 85fe4e6b275c7..b5637a2aae340 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 #include <uapi/linux/io_uring.h>
+#include <linux/blk-mq.h>
 
 #if defined(CONFIG_IO_URING)
 void __io_uring_cancel(bool cancel_all);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e2fef264ff8b8..99aac2d52fbae 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -693,4 +693,8 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
 	return ctx->flags & IORING_SETUP_CQE32;
 }
 
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+			    void (*release)(void *), unsigned int index);
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int tag);
+
 #endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 30f08cf13ef60..14efec8587888 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -110,8 +110,9 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
-		for (i = 0; i < imu->nr_bvecs; i++)
-			unpin_user_page(imu->bvec[i].bv_page);
+		if (node->type == IORING_RSRC_BUFFER)
+			for (i = 0; i < imu->nr_bvecs; i++)
+				unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
 		kvfree(imu);
@@ -240,6 +241,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
+		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+		node = io_rsrc_node_lookup(&ctx->buf_table, i);
+		if (node && node->type != IORING_RSRC_BUFFER) {
+			err = -EBUSY;
+			break;
+		}
+
 		uvec = u64_to_user_ptr(user_data);
 		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
 		if (IS_ERR(iov)) {
@@ -265,7 +273,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			}
 			node->tag = tag;
 		}
-		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
 		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
 		ctx->buf_table.nodes[i] = node;
 		if (ctx->compat)
@@ -452,6 +459,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 		if (io_slot_file(node))
 			fput(io_slot_file(node));
 		break;
+	case IORING_RSRC_KBUFFER:
 	case IORING_RSRC_BUFFER:
 		if (node->buf)
 			io_buffer_unmap(ctx, node);
@@ -862,6 +870,79 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+			    void (*release)(void *), unsigned int index)
+{
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct req_iterator rq_iter;
+	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
+	struct bio_vec bv;
+	u16 nr_bvecs;
+	int i = 0;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (!data->nr)
+		return -EINVAL;
+	if (index >= data->nr)
+		return -EINVAL;
+
+	node = data->nodes[index];
+	if (node)
+		return -EBUSY;
+
+	node = io_rsrc_node_alloc(IORING_RSRC_KBUFFER);
+	if (!node)
+		return -ENOMEM;
+
+	node->release = release;
+	node->priv = rq;
+
+	nr_bvecs = blk_rq_nr_phys_segments(rq);
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+	if (!imu) {
+		kfree(node);
+		return -ENOMEM;
+	}
+
+	imu->ubuf = 0;
+	imu->len = blk_rq_bytes(rq);
+	imu->acct_pages = 0;
+	imu->nr_bvecs = nr_bvecs;
+	refcount_set(&imu->refs, 1);
+	node->buf = imu;
+
+	rq_for_each_bvec(bv, rq, rq_iter)
+		bvec_set_page(&node->buf->bvec[i++], bv.bv_page, bv.bv_len,
+			      bv.bv_offset);
+	data->nodes[index] = node;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index)
+{
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct io_rsrc_node *node;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (!data->nr)
+		return;
+	if (index >= data->nr)
+		return;
+
+	node = data->nodes[index];
+	if (!node || !node->buf)
+		return;
+	if (node->type != IORING_RSRC_KBUFFER)
+		return;
+	io_reset_rsrc_node(ctx, data, index);
+}
+EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+
 int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
 		    u64 buf_addr, size_t len)
 {
@@ -888,8 +969,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
 		/*
 		 * Don't use iov_iter_advance() here, as it's really slow for
 		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here, because
-		 * we know that:
+		 * over each segment manually. We can cheat a bit here for user
+		 * registered nodes, because we know that:
 		 *
 		 * 1) it's a BVEC iter, we set it up
 		 * 2) all bvecs are the same in size, except potentially the
@@ -903,8 +984,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
 		 */
 		const struct bio_vec *bvec = imu->bvec;
 
+		/*
+		 * Kernel buffer bvecs, on the other hand, don't necessarily
+		 * have the size property of user registered ones, so we have
+		 * to use the slow iter advance.
+		 */
 		if (offset < bvec->bv_len) {
 			iter->iov_offset = offset;
+		} else if (node->type == IORING_RSRC_KBUFFER) {
+			iov_iter_advance(iter, offset);
 		} else {
 			unsigned long seg_skip;
 
@@ -1004,7 +1092,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		if (!src_node) {
 			dst_node = NULL;
 		} else {
-			dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+			dst_node = io_rsrc_node_alloc(src_node->type);
 			if (!dst_node) {
 				ret = -ENOMEM;
 				goto out_free;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index a3826ab84e666..8147dfc26f737 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,6 +13,7 @@
 enum {
 	IORING_RSRC_FILE		= 0,
 	IORING_RSRC_BUFFER		= 1,
+	IORING_RSRC_KBUFFER		= 2,
 };
 
 struct io_rsrc_node {
-- 
2.43.5


  parent reply	other threads:[~2025-02-11  0:57 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-11  0:56 [PATCHv2 0/6] ublk zero-copy support Keith Busch
2025-02-11  0:56 ` [PATCHv2 1/6] io_uring: use node for import Keith Busch
2025-02-11  0:56 ` [PATCHv2 2/6] io_uring: create resource release callback Keith Busch
2025-02-13  1:31   ` Pavel Begunkov
2025-02-13  1:58     ` Keith Busch
2025-02-13 13:06       ` Pavel Begunkov
2025-02-11  0:56 ` Keith Busch [this message]
2025-02-13  1:33   ` [PATCHv2 3/6] io_uring: add support for kernel registered bvecs Pavel Begunkov
2025-02-14  3:30   ` Ming Lei
2025-02-14 15:26     ` Keith Busch
2025-02-15  1:34       ` Ming Lei
2025-02-18 20:34         ` Keith Busch
2025-02-11  0:56 ` [PATCHv2 4/6] ublk: zc register/unregister bvec Keith Busch
2025-02-12  2:49   ` Ming Lei
2025-02-12  4:11     ` Keith Busch
2025-02-12  9:24       ` Ming Lei
2025-02-12 14:59         ` Keith Busch
2025-02-13  2:12   ` Pavel Begunkov
2025-02-11  0:56 ` [PATCHv2 5/6] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-11  0:56 ` [PATCHv2 6/6] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-11 15:17   ` kernel test robot
2025-02-11 16:47   ` Keith Busch
2025-02-12  1:42   ` kernel test robot
2025-02-12  2:29 ` [PATCHv2 0/6] ublk zero-copy support Ming Lei
2025-02-12 15:28   ` Keith Busch
2025-02-12 16:06     ` Pavel Begunkov
2025-02-13  1:52       ` Ming Lei
2025-02-13 15:12 ` lizetao
2025-02-13 16:06   ` Keith Busch
2025-02-14  3:39     ` lizetao
2025-02-14  2:41   ` Ming Lei
2025-02-14  4:21     ` lizetao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250211005646.222452-4-kbusch@meta.com \
    --to=kbusch@meta.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=bernd@bsbernd.com \
    --cc=io-uring@vger.kernel.org \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.