All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Keith Busch <kbusch@meta.com>
Cc: asml.silence@gmail.com, axboe@kernel.dk,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	bernd@bsbernd.com, csander@purestorage.com,
	Keith Busch <kbusch@kernel.org>
Subject: Re: [PATCHv5 07/11] io_uring: add support for kernel registered bvecs
Date: Tue, 25 Feb 2025 17:40:14 +0800	[thread overview]
Message-ID: <Z72P_nnZD9i-ya-1@fedora> (raw)
In-Reply-To: <20250224213116.3509093-8-kbusch@meta.com>

On Mon, Feb 24, 2025 at 01:31:12PM -0800, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Provide an interface for the kernel to leverage the existing
> pre-registered buffers that io_uring provides. User space can reference
> these later to achieve zero-copy IO.
> 
> User space must register an empty fixed buffer table with io_uring in
> order for the kernel to make use of it.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  include/linux/io_uring/cmd.h |   7 ++
>  io_uring/rsrc.c              | 123 +++++++++++++++++++++++++++++++++--
>  io_uring/rsrc.h              |   8 +++
>  3 files changed, 131 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
> index 87150dc0a07cf..cf8d80d847344 100644
> --- a/include/linux/io_uring/cmd.h
> +++ b/include/linux/io_uring/cmd.h
> @@ -4,6 +4,7 @@
>  
>  #include <uapi/linux/io_uring.h>
>  #include <linux/io_uring_types.h>
> +#include <linux/blk-mq.h>
>  
>  /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
>  #define IORING_URING_CMD_CANCELABLE	(1U << 30)
> @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
>  	return cmd_to_io_kiocb(cmd)->async_data;
>  }
>  
> +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
> +			    void (*release)(void *), unsigned int index,
> +			    unsigned int issue_flags);
> +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
> +			       unsigned int issue_flags);
> +
>  #endif /* _LINUX_IO_URING_CMD_H */
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index f814526982c36..e0c6ed3aef5b5 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -9,6 +9,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/compat.h>
>  #include <linux/io_uring.h>
> +#include <linux/io_uring/cmd.h>
>  
>  #include <uapi/linux/io_uring.h>
>  
> @@ -104,14 +105,21 @@ int io_buffer_validate(struct iovec *iov)
>  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
>  	struct io_mapped_ubuf *imu = node->buf;
> -	unsigned int i;
>  
>  	if (!refcount_dec_and_test(&imu->refs))
>  		return;
> -	for (i = 0; i < imu->nr_bvecs; i++)
> -		unpin_user_page(imu->bvec[i].bv_page);
> -	if (imu->acct_pages)
> -		io_unaccount_mem(ctx, imu->acct_pages);
> +
> +	if (imu->release) {
> +		imu->release(imu->priv);
> +	} else {
> +		unsigned int i;
> +
> +		for (i = 0; i < imu->nr_bvecs; i++)
> +			unpin_user_page(imu->bvec[i].bv_page);
> +		if (imu->acct_pages)
> +			io_unaccount_mem(ctx, imu->acct_pages);
> +	}
> +
>  	kvfree(imu);
>  }
>  
> @@ -761,6 +769,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  	imu->len = iov->iov_len;
>  	imu->nr_bvecs = nr_pages;
>  	imu->folio_shift = PAGE_SHIFT;
> +	imu->release = NULL;
> +	imu->priv = NULL;
> +	imu->perm = IO_IMU_READABLE | IO_IMU_WRITEABLE;
>  	if (coalesced)
>  		imu->folio_shift = data.folio_shift;
>  	refcount_set(&imu->refs, 1);
> @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>  	return ret;
>  }
>  
> +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
> +			    void (*release)(void *), unsigned int index,
> +			    unsigned int issue_flags)
> +{
> +	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
> +	struct io_rsrc_data *data = &ctx->buf_table;
> +	struct req_iterator rq_iter;
> +	struct io_mapped_ubuf *imu;
> +	struct io_rsrc_node *node;
> +	struct bio_vec bv, *bvec;
> +	u16 nr_bvecs;
> +	int ret = 0;
> +
> +
> +	io_ring_submit_lock(ctx, issue_flags);
> +	if (index >= data->nr) {
> +		ret = -EINVAL;
> +		goto unlock;
> +	}
> +	index = array_index_nospec(index, data->nr);
> +
> +	if (data->nodes[index] ) {
> +		ret = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +	if (!node) {
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +
> +	nr_bvecs = blk_rq_nr_phys_segments(rq);
> +	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +	if (!imu) {
> +		kfree(node);
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +
> +	imu->ubuf = 0;
> +	imu->len = blk_rq_bytes(rq);
> +	imu->acct_pages = 0;
> +	imu->folio_shift = PAGE_SHIFT;
> +	imu->nr_bvecs = nr_bvecs;
> +	refcount_set(&imu->refs, 1);
> +	imu->release = release;
> +	imu->priv = rq;
> +
> +	if (op_is_write(req_op(rq)))
> +		imu->perm = IO_IMU_WRITEABLE;
> +	else
> +		imu->perm = IO_IMU_READABLE;

The above looks wrong: if the request is a write op, the buffer
should be readable and not writeable.

IO_IMU_WRITEABLE is supposed to mean the buffer is writeable, isn't it?

> +
> +	bvec = imu->bvec;
> +	rq_for_each_bvec(bv, rq, rq_iter)
> +		*bvec++ = bv;
> +
> +	node->buf = imu;
> +	data->nodes[index] = node;
> +unlock:
> +	io_ring_submit_unlock(ctx, issue_flags);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
> +
> +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
> +			       unsigned int issue_flags)
> +{
> +	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
> +	struct io_rsrc_data *data = &ctx->buf_table;
> +	struct io_rsrc_node *node;
> +
> +	io_ring_submit_lock(ctx, issue_flags);
> +	if (index >= data->nr)
> +		goto unlock;
> +	index = array_index_nospec(index, data->nr);
> +
> +	node = data->nodes[index];
> +	if (!node || !node->buf->release)
> +		goto unlock;
> +
> +	io_put_rsrc_node(ctx, node);
> +	data->nodes[index] = NULL;
> +unlock:
> +	io_ring_submit_unlock(ctx, issue_flags);
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
> +
>  static int io_import_fixed(int ddir, struct iov_iter *iter,
>  			   struct io_mapped_ubuf *imu,
>  			   u64 buf_addr, size_t len)
> @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>  	/* not inside the mapped region */
>  	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
>  		return -EFAULT;
> +	if (!(imu->perm & (1 << ddir)))
> +		return -EFAULT;
>  
>  	/*
>  	 * Might not be a start of buffer, set size appropriately
> @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>  		/*
>  		 * Don't use iov_iter_advance() here, as it's really slow for
>  		 * using the latter parts of a big fixed buffer - it iterates
> -		 * over each segment manually. We can cheat a bit here, because
> -		 * we know that:
> +		 * over each segment manually. We can cheat a bit here for user
> +		 * registered nodes, because we know that:
>  		 *
>  		 * 1) it's a BVEC iter, we set it up
>  		 * 2) all bvecs are the same in size, except potentially the
> @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
>  		 */
>  		const struct bio_vec *bvec = imu->bvec;
>  
> +		/*
> +		 * Kernel buffer bvecs, on the other hand, don't necessarily
> +		 * have the size property of user registered ones, so we have
> +		 * to use the slow iter advance.
> +		 */
>  		if (offset < bvec->bv_len) {
>  			iter->iov_offset = offset;
> +		} else if (imu->release) {
> +			iov_iter_advance(iter, offset);
>  		} else {
>  			unsigned long seg_skip;
>  
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index f0e9080599646..64bf35667cf9c 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -20,6 +20,11 @@ struct io_rsrc_node {
>  	};
>  };
>  
> +enum {
> +	IO_IMU_READABLE		= 1 << 0,
> +	IO_IMU_WRITEABLE	= 1 << 1,
> +};
> +

The above definition could be wrong too: IO_IMU_READABLE is supposed to
mean that the buffer is readable, but its value matches 1 << ITER_DEST,
which is the direction in which the buffer is written to.


Thanks,
Ming


  reply	other threads:[~2025-02-25  9:40 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-24 21:31 [PATCHv5 00/11] ublk zero copy support Keith Busch
2025-02-24 21:31 ` [PATCHv5 01/11] io_uring/rsrc: remove redundant check for valid imu Keith Busch
2025-02-25  8:37   ` Ming Lei
2025-02-25 13:13   ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 02/11] io_uring/nop: reuse req->buf_index Keith Busch
2025-02-24 23:30   ` Jens Axboe
2025-02-25  0:02     ` Keith Busch
2025-02-25  8:43   ` Ming Lei
2025-02-25 13:13   ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 03/11] io_uring/net: reuse req->buf_index for sendzc Keith Busch
2025-02-25  8:44   ` Ming Lei
2025-02-25 13:14   ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 04/11] io_uring/nvme: pass issue_flags to io_uring_cmd_import_fixed() Keith Busch
2025-02-25  8:52   ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 05/11] io_uring: combine buffer lookup and import Keith Busch
2025-02-25  8:55   ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 06/11] io_uring/rw: move fixed buffer import to issue path Keith Busch
2025-02-25  9:26   ` Ming Lei
2025-02-25 13:57   ` Pavel Begunkov
2025-02-25 20:57   ` Caleb Sander Mateos
2025-02-25 21:16     ` Keith Busch
2025-02-24 21:31 ` [PATCHv5 07/11] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-25  9:40   ` Ming Lei [this message]
2025-02-25 17:32     ` Keith Busch
2025-02-25 22:47       ` Ming Lei
2025-02-25 22:55         ` Keith Busch
2025-02-25 14:00   ` Pavel Begunkov
2025-02-25 14:05     ` Pavel Begunkov
2025-02-25 20:58   ` Caleb Sander Mateos
2025-02-24 21:31 ` [PATCHv5 08/11] nvme: map uring_cmd data even if address is 0 Keith Busch
2025-02-25  9:41   ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 09/11] ublk: zc register/unregister bvec Keith Busch
2025-02-25 11:00   ` Ming Lei
2025-02-25 16:35     ` Keith Busch
2025-02-25 22:56       ` Ming Lei
2025-02-25 16:19   ` Pavel Begunkov
2025-02-25 16:27     ` Keith Busch
2025-02-25 16:42       ` Pavel Begunkov
2025-02-25 16:52         ` Keith Busch
2025-02-27  4:16           ` Ming Lei
2025-02-25 21:14   ` Caleb Sander Mateos
2025-02-26  8:15   ` Ming Lei
2025-02-26 17:10     ` Keith Busch
2025-02-27  4:19       ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 10/11] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-25 16:04   ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 11/11] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-25 13:11   ` Pavel Begunkov
2025-02-25 14:10 ` [PATCHv5 00/11] ublk zero copy support Pavel Begunkov
2025-02-25 14:47   ` Jens Axboe
2025-02-25 15:07 ` (subset) " Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z72P_nnZD9i-ya-1@fedora \
    --to=ming.lei@redhat.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=bernd@bsbernd.com \
    --cc=csander@purestorage.com \
    --cc=io-uring@vger.kernel.org \
    --cc=kbusch@kernel.org \
    --cc=kbusch@meta.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.