From: Jens Axboe <axboe@kernel.dk>
To: io-uring@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 6/7] io_uring/kbuf: add support for mapping type KBUF_MODE_BVEC
Date: Wed, 23 Oct 2024 10:07:39 -0600 [thread overview]
Message-ID: <20241023161522.1126423-7-axboe@kernel.dk> (raw)
In-Reply-To: <20241023161522.1126423-1-axboe@kernel.dk>
The provided buffer helpers always map to iovecs. Add a new mode,
KBUF_MODE_BVEC, which maps it to a bio_vec array instead. For
use with zero-copy scenarios, where the caller would want to turn it
into a bio_vec anyway, and this avoids first iterating and filling out
an iovec array, only for the caller to then iterate it again and turn
it into a bio_vec array.
Since it's now managing both iovecs and bvecs, change the naming of
buf_sel_arg->nr_iovs member to nr_vecs instead.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/kbuf.c | 170 +++++++++++++++++++++++++++++++++++++++++++-----
io_uring/kbuf.h | 9 ++-
io_uring/net.c | 10 +--
3 files changed, 165 insertions(+), 24 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 42579525c4bd..10a3a7a27e9a 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -16,6 +16,7 @@
#include "opdef.h"
#include "kbuf.h"
#include "memmap.h"
+#include "rsrc.h"
/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)
@@ -117,20 +118,135 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
return NULL;
}
+static struct io_mapped_ubuf *io_ubuf_from_buf(struct io_ring_ctx *ctx,
+ u64 addr, unsigned int *offset)
+{
+ struct io_mapped_ubuf *imu;
+ u16 idx;
+
+ /*
+ * Get registered buffer index and offset, encoded into the
+ * addr base value.
+ */
+ idx = addr & ((1ULL << IOU_BUF_REGBUF_BITS) - 1);
+ addr >>= IOU_BUF_REGBUF_BITS;
+ *offset = addr & ((1ULL << IOU_BUF_OFFSET_BITS) - 1);
+
+ if (unlikely(idx >= ctx->nr_user_bufs))
+ return ERR_PTR(-EFAULT);
+
+ idx = array_index_nospec(idx, ctx->nr_user_bufs);
+ imu = READ_ONCE(ctx->user_bufs[idx]);
+ if (unlikely(*offset >= imu->len))
+ return ERR_PTR(-EFAULT);
+
+ return imu;
+}
+
+static bool io_expand_bvecs(struct buf_sel_arg *arg)
+{
+ int nvecs = arg->nr_vecs + 8;
+ struct bio_vec *bv;
+
+ if (!(arg->mode & KBUF_MODE_EXPAND))
+ return false;
+
+ bv = kmalloc_array(nvecs, sizeof(struct bio_vec), GFP_KERNEL);
+ if (unlikely(!bv))
+ return false;
+ memcpy(bv, arg->bvecs, arg->nr_vecs * sizeof(*bv));
+ if (arg->mode & KBUF_MODE_FREE)
+ kfree(arg->bvecs);
+ arg->bvecs = bv;
+ arg->nr_vecs = nvecs;
+ arg->mode |= KBUF_MODE_FREE;
+ return true;
+}
+
+static int io_fill_bvecs(struct io_ring_ctx *ctx, u64 addr,
+ struct buf_sel_arg *arg, unsigned int len,
+ int *vec_off)
+{
+ struct bio_vec *src, *src_prv = NULL;
+ struct io_mapped_ubuf *imu;
+ unsigned int llen = len;
+ unsigned int offset;
+
+ imu = io_ubuf_from_buf(ctx, addr, &offset);
+ if (unlikely(IS_ERR(imu)))
+ return PTR_ERR(imu);
+
+ if (unlikely(offset >= imu->len || len > imu->len))
+ return -EOVERFLOW;
+ if (unlikely(offset > imu->len - len))
+ return -EOVERFLOW;
+
+ src = imu->bvec;
+ if (offset > src->bv_len) {
+ unsigned long seg_skip;
+
+ offset -= src->bv_len;
+ seg_skip = 1 + (offset >> imu->folio_shift);
+ offset &= ((1UL << imu->folio_shift) - 1);
+ src += seg_skip;
+ }
+
+ do {
+ unsigned int this_len = len;
+
+ if (this_len + offset > src->bv_len)
+ this_len = src->bv_len - offset;
+
+ /*
+ * If contig with previous bio_vec, merge it to minimize the
+ * number of segments needed. If not, then add a new segment,
+ * expanding the number of available slots, if needed.
+ */
+ if (src_prv &&
+ page_folio(src_prv->bv_page) == page_folio(src->bv_page) &&
+ src_prv->bv_page + 1 == src->bv_page) {
+ arg->bvecs[*vec_off - 1].bv_len += this_len;
+ } else {
+ struct bio_vec *dst;
+
+ if (*vec_off == arg->nr_vecs && !io_expand_bvecs(arg))
+ break;
+
+ dst = &arg->bvecs[*vec_off];
+ dst->bv_page = src->bv_page;
+ dst->bv_len = this_len;
+ dst->bv_offset = offset;
+ (*vec_off)++;
+ }
+ offset = 0;
+ len -= this_len;
+ src_prv = src++;
+ } while (len);
+
+ return llen - len;
+}
+
static int io_provided_buffers_select(struct io_kiocb *req,
struct buf_sel_arg *arg,
struct io_buffer_list *bl, size_t *len)
{
- struct iovec *iov = arg->iovs;
void __user *buf;
+ int ret;
buf = io_provided_buffer_select(req, len, bl);
if (unlikely(!buf))
return -ENOBUFS;
- iov[0].iov_base = buf;
- iov[0].iov_len = *len;
- return 1;
+ if (arg->mode & KBUF_MODE_BVEC) {
+ u64 addr = (unsigned long)(uintptr_t) buf;
+
+ *len = io_fill_bvecs(req->ctx, addr, arg, *len, &ret);
+ } else {
+ arg->iovs[0].iov_base = buf;
+ arg->iovs[0].iov_len = *len;
+ ret = 1;
+ }
+ return ret;
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
@@ -196,13 +312,16 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
#define PEEK_MAX_IMPORT 256
static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
- struct io_buffer_list *bl)
+ struct io_buffer_list *bl, int *nbufs)
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct iovec *iov = arg->iovs;
- int nr_iovs = arg->nr_iovs;
+ int nr_iovs = arg->nr_vecs;
__u16 nr_avail, tail, head;
struct io_uring_buf *buf;
+ int vec_off;
+
+ BUILD_BUG_ON(sizeof(struct iovec) > sizeof(struct bio_vec));
tail = smp_load_acquire(&br->tail);
head = bl->head;
@@ -236,10 +355,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
/*
* only alloc a bigger array if we know we have data to map, eg not
- * a speculative peek operation.
+	 * a speculative peek operation. Note that struct bio_vec and
+	 * struct iovec are the same size, so we can use them interchangeably
+	 * here as it's just for sizing purposes.
*/
if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
- iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
+ iov = kmalloc_array(nr_avail, sizeof(struct bio_vec), GFP_KERNEL);
if (unlikely(!iov))
return -ENOMEM;
if (arg->mode & KBUF_MODE_FREE)
@@ -255,6 +376,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
if (!arg->max_len)
arg->max_len = INT_MAX;
+ vec_off = 0;
req->buf_index = buf->bid;
do {
u32 len = buf->len;
@@ -266,15 +388,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf->len = len;
}
- iov->iov_base = u64_to_user_ptr(buf->addr);
- iov->iov_len = len;
- iov++;
+ if (arg->mode & KBUF_MODE_BVEC) {
+ int ret;
+
+ ret = io_fill_bvecs(req->ctx, buf->addr, arg, len, &vec_off);
+ if (unlikely(ret < 0))
+ return ret;
+ len = ret;
+ } else {
+ iov->iov_base = u64_to_user_ptr(buf->addr);
+ iov->iov_len = len;
+ iov++;
+ vec_off++;
+ }
arg->out_len += len;
arg->max_len -= len;
+ (*nbufs)++;
if (!arg->max_len)
break;
-
buf = io_ring_head_to_buf(br, ++head, bl->mask);
} while (--nr_iovs);
@@ -283,7 +415,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->flags |= REQ_F_BUFFER_RING;
req->buf_list = bl;
- return iov - arg->iovs;
+ return vec_off;
}
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
@@ -299,7 +431,9 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
goto out_unlock;
if (bl->flags & IOBL_BUF_RING) {
- ret = io_ring_buffers_peek(req, arg, bl);
+ int nbufs = 0;
+
+ ret = io_ring_buffers_peek(req, arg, bl, &nbufs);
/*
* Don't recycle these buffers if we need to go through poll.
* Nobody else can use them anyway, and holding on to provided
@@ -307,9 +441,9 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
* side anyway with normal buffers. Besides, we already
* committed them, they cannot be put back in the queue.
*/
- if (ret > 0) {
+ if (nbufs) {
req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
- io_kbuf_commit(req, bl, arg->out_len, ret);
+ io_kbuf_commit(req, bl, arg->out_len, nbufs);
}
} else {
ret = io_provided_buffers_select(req, arg, bl, &arg->out_len);
@@ -332,7 +466,9 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
return -ENOENT;
if (bl->flags & IOBL_BUF_RING) {
- ret = io_ring_buffers_peek(req, arg, bl);
+ int nbufs = 0;
+
+ ret = io_ring_buffers_peek(req, arg, bl, &nbufs);
if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT;
return ret;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..7c56ba994f21 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -53,13 +53,18 @@ enum {
KBUF_MODE_EXPAND = 1,
/* if bigger vec allocated, free old one */
KBUF_MODE_FREE = 2,
+ /* turn into bio_vecs, not iovecs */
+ KBUF_MODE_BVEC = 4,
};
struct buf_sel_arg {
- struct iovec *iovs;
+ union {
+ struct iovec *iovs;
+ struct bio_vec *bvecs;
+ };
size_t out_len;
size_t max_len;
- unsigned short nr_iovs;
+ unsigned short nr_vecs;
unsigned short mode;
};
diff --git a/io_uring/net.c b/io_uring/net.c
index dbef14aa50f9..154756762a46 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -643,17 +643,17 @@ static int io_send_import(struct io_kiocb *req, unsigned int issue_flags)
struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
- .nr_iovs = 1,
+ .nr_vecs = 1,
};
if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
+ arg.nr_vecs = kmsg->free_iov_nr;
arg.iovs = kmsg->free_iov;
arg.mode = KBUF_MODE_FREE;
}
if (!(sr->flags & IORING_RECVSEND_BUNDLE))
- arg.nr_iovs = 1;
+ arg.nr_vecs = 1;
else
arg.mode |= KBUF_MODE_EXPAND;
@@ -1140,12 +1140,12 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
sr->flags & IORING_RECVSEND_BUNDLE) {
struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov,
- .nr_iovs = 1,
+ .nr_vecs = 1,
.mode = KBUF_MODE_EXPAND,
};
if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
+ arg.nr_vecs = kmsg->free_iov_nr;
arg.iovs = kmsg->free_iov;
arg.mode |= KBUF_MODE_FREE;
}
--
2.45.2
next prev parent reply other threads:[~2024-10-23 16:15 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-23 16:07 [PATCHSET RFC 0/7] Add support for provided registered buffers Jens Axboe
2024-10-23 16:07 ` [PATCH 1/7] io_uring/kbuf: mark buf_sel_arg mode as KBUF_MODE_FREE once allocated Jens Axboe
2024-10-23 16:07 ` [PATCH 2/7] io_uring/kbuf: change io_provided_buffers_select() calling convention Jens Axboe
2024-10-23 16:07 ` [PATCH 3/7] io_uring/net: abstract out io_send_import() helper Jens Axboe
2024-10-23 16:07 ` [PATCH 4/7] io_uring/net: move send zc fixed buffer import into helper Jens Axboe
2024-10-23 16:07 ` [PATCH 5/7] io_uring: add ability for provided buffer to index registered buffers Jens Axboe
2024-10-24 15:44 ` Pavel Begunkov
2024-10-24 15:57 ` Jens Axboe
2024-10-24 16:17 ` Pavel Begunkov
2024-10-24 17:16 ` Jens Axboe
2024-10-24 18:20 ` Pavel Begunkov
2024-10-24 19:53 ` Jens Axboe
2024-10-24 22:46 ` Jens Axboe
2024-10-23 16:07 ` Jens Axboe [this message]
2024-10-24 15:22 ` [PATCH 6/7] io_uring/kbuf: add support for mapping type KBUF_MODE_BVEC Pavel Begunkov
2024-10-24 15:27 ` Jens Axboe
2024-10-24 15:40 ` Pavel Begunkov
2024-10-24 15:49 ` Jens Axboe
2024-10-23 16:07 ` [PATCH 7/7] io_uring/net: add provided buffer and bundle support to send zc Jens Axboe
2024-10-24 14:44 ` Pavel Begunkov
2024-10-24 14:48 ` Jens Axboe
2024-10-24 15:36 ` Pavel Begunkov
2024-10-24 14:36 ` [PATCHSET RFC 0/7] Add support for provided registered buffers Pavel Begunkov
2024-10-24 14:43 ` Jens Axboe
2024-10-24 15:04 ` Pavel Begunkov
2024-10-24 15:11 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241023161522.1126423-7-axboe@kernel.dk \
--to=axboe@kernel.dk \
--cc=io-uring@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox