From: David Wei <dw@davidwei.uk>
To: io-uring@vger.kernel.org, netdev@vger.kernel.org
Cc: David Wei <dw@davidwei.uk>, Jens Axboe <axboe@kernel.dk>,
Pavel Begunkov <asml.silence@gmail.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jesper Dangaard Brouer <hawk@kernel.org>,
David Ahern <dsahern@kernel.org>,
Mina Almasry <almasrymina@google.com>
Subject: [PATCH v1 10/15] io_uring/zcrx: add io_zcrx_area
Date: Mon, 7 Oct 2024 15:15:58 -0700 [thread overview]
Message-ID: <20241007221603.1703699-11-dw@davidwei.uk> (raw)
In-Reply-To: <20241007221603.1703699-1-dw@davidwei.uk>
From: David Wei <davidhwei@meta.com>
Add io_zcrx_area that represents a region of userspace memory that is
used for zero copy. During ifq registration, userspace passes in the
uaddr and len of userspace memory, which is then pinned by the kernel.
Each net_iov is mapped to one of these pages.
The freelist is a spinlock protected list that keeps track of all the
net_iovs/pages that aren't used.
For now, there is only one area per ifq and area registration happens
implicitly as part of ifq registration. There is no API for
adding/removing areas yet. The struct for area registration is there for
future extensibility once we support multiple areas and TCP devmem.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
include/uapi/linux/io_uring.h | 9 ++++
io_uring/rsrc.c | 2 +-
io_uring/rsrc.h | 1 +
io_uring/zcrx.c | 93 ++++++++++++++++++++++++++++++++++-
io_uring/zcrx.h | 16 ++++++
5 files changed, 118 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 567cdb89711e..ffd315d8c6b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -831,6 +831,15 @@ struct io_uring_zcrx_offsets {
__u64 __resv[2];
};
+struct io_uring_zcrx_area_reg {
+ __u64 addr;
+ __u64 len;
+ __u64 rq_area_token;
+ __u32 flags;
+ __u32 __resv1;
+ __u64 __resv2[2];
+};
+
/*
* Argument for IORING_REGISTER_ZCRX_IFQ
*/
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 453867add7ca..42606404019e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -85,7 +85,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-static int io_buffer_validate(struct iovec *iov)
+int io_buffer_validate(struct iovec *iov)
{
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..e691e8ed849b 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -74,6 +74,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
+int io_buffer_validate(struct iovec *iov);
static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 79d79b9b8df8..8382129402ac 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -10,6 +10,7 @@
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
+#include "rsrc.h"
#define IO_RQ_MAX_ENTRIES 32768
@@ -40,6 +41,83 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
ifq->rqes = NULL;
}
+static void io_zcrx_free_area(struct io_zcrx_area *area)
+{
+ if (area->freelist)
+ kvfree(area->freelist);
+ if (area->nia.niovs)
+ kvfree(area->nia.niovs);
+ if (area->pages) {
+ unpin_user_pages(area->pages, area->nia.num_niovs);
+ kvfree(area->pages);
+ }
+ kfree(area);
+}
+
+static int io_zcrx_create_area(struct io_ring_ctx *ctx,
+ struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area **res,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ struct io_zcrx_area *area;
+ int i, ret, nr_pages;
+ struct iovec iov;
+
+ if (area_reg->flags || area_reg->rq_area_token)
+ return -EINVAL;
+ if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+ return -EINVAL;
+ if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ return -EINVAL;
+
+ iov.iov_base = u64_to_user_ptr(area_reg->addr);
+ iov.iov_len = area_reg->len;
+ ret = io_buffer_validate(&iov);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
+ if (!area)
+ goto err;
+
+ area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+ &nr_pages);
+ if (IS_ERR(area->pages)) {
+ ret = PTR_ERR(area->pages);
+ area->pages = NULL;
+ goto err;
+ }
+ area->nia.num_niovs = nr_pages;
+
+ area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!area->nia.niovs)
+ goto err;
+
+ area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!area->freelist)
+ goto err;
+
+ for (i = 0; i < nr_pages; i++) {
+ area->freelist[i] = i;
+ }
+
+ area->free_count = nr_pages;
+ area->ifq = ifq;
+ /* we're only supporting one area per ifq for now */
+ area->area_id = 0;
+ area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
+ spin_lock_init(&area->freelist_lock);
+ *res = area;
+ return 0;
+err:
+ if (area)
+ io_zcrx_free_area(area);
+ return ret;
+}
+
static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
@@ -55,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
+ if (ifq->area)
+ io_zcrx_free_area(ifq->area);
+
io_free_rbuf_ring(ifq);
kfree(ifq);
}
@@ -62,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
+ struct io_uring_zcrx_area_reg area;
struct io_uring_zcrx_ifq_reg reg;
struct io_zcrx_ifq *ifq;
size_t ring_sz, rqes_sz;
@@ -93,7 +175,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
}
reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
- if (!reg.area_ptr)
+ if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
return -EFAULT;
ifq = io_zcrx_ifq_alloc(ctx);
@@ -104,6 +186,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (ret)
goto err;
+ ret = io_zcrx_create_area(ctx, ifq, &ifq->area, &area);
+ if (ret)
+ goto err;
+
ifq->rq_entries = reg.rq_entries;
ifq->if_rxq = reg.if_rxq;
@@ -118,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
-
+ if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
+ ret = -EFAULT;
+ goto err;
+ }
ctx->ifq = ifq;
return 0;
err:
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 4ef94e19d36b..2fcbeb3d5501 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,10 +3,26 @@
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>
+#include <net/page_pool/types.h>
+
+struct io_zcrx_area {
+ struct net_iov_area nia;
+ struct io_zcrx_ifq *ifq;
+
+ u16 area_id;
+ struct page **pages;
+
+ /* freelist */
+ spinlock_t freelist_lock ____cacheline_aligned_in_smp;
+ u32 free_count;
+ u32 *freelist;
+};
struct io_zcrx_ifq {
struct io_ring_ctx *ctx;
struct net_device *dev;
+ struct io_zcrx_area *area;
+
struct io_uring *rq_ring;
struct io_uring_zcrx_rqe *rqes;
u32 rq_entries;
--
2.43.5
next prev parent reply other threads:[~2024-10-07 22:16 UTC|newest]
Thread overview: 126+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-07 22:15 [PATCH v1 00/15] io_uring zero copy rx David Wei
2024-10-07 22:15 ` [PATCH v1 01/15] net: devmem: pull struct definitions out of ifdef David Wei
2024-10-09 20:17 ` Mina Almasry
2024-10-09 23:16 ` Pavel Begunkov
2024-10-10 18:01 ` Mina Almasry
2024-10-10 18:57 ` Pavel Begunkov
2024-10-13 22:38 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 02/15] net: prefix devmem specific helpers David Wei
2024-10-09 20:19 ` Mina Almasry
2024-10-07 22:15 ` [PATCH v1 03/15] net: generalise net_iov chunk owners David Wei
2024-10-08 15:46 ` Stanislav Fomichev
2024-10-08 16:34 ` Pavel Begunkov
2024-10-09 16:28 ` Stanislav Fomichev
2024-10-11 18:44 ` David Wei
2024-10-11 22:02 ` Pavel Begunkov
2024-10-11 22:25 ` Mina Almasry
2024-10-11 23:12 ` Pavel Begunkov
2024-10-09 20:44 ` Mina Almasry
2024-10-09 22:13 ` Pavel Begunkov
2024-10-09 22:19 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 04/15] net: page_pool: create hooks for custom page providers David Wei
2024-10-09 20:49 ` Mina Almasry
2024-10-09 22:02 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 05/15] net: prepare for non devmem TCP memory providers David Wei
2024-10-09 20:56 ` Mina Almasry
2024-10-09 21:45 ` Pavel Begunkov
2024-10-13 22:33 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 06/15] net: page_pool: add ->scrub mem provider callback David Wei
2024-10-09 21:00 ` Mina Almasry
2024-10-09 21:59 ` Pavel Begunkov
2024-10-10 17:54 ` Mina Almasry
2024-10-13 17:25 ` David Wei
2024-10-14 13:37 ` Pavel Begunkov
2024-10-14 22:58 ` Mina Almasry
2024-10-16 17:42 ` Pavel Begunkov
2024-11-01 17:18 ` Mina Almasry
2024-11-01 18:35 ` Pavel Begunkov
2024-11-01 19:24 ` Mina Almasry
2024-11-01 21:38 ` Pavel Begunkov
2024-11-04 20:42 ` Mina Almasry
2024-11-04 21:27 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 07/15] net: page pool: add helper creating area from pages David Wei
2024-10-09 21:11 ` Mina Almasry
2024-10-09 21:34 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 08/15] net: add helper executing custom callback from napi David Wei
2024-10-08 22:25 ` Joe Damato
2024-10-09 15:09 ` Pavel Begunkov
2024-10-09 16:13 ` Joe Damato
2024-10-09 19:12 ` Pavel Begunkov
2024-10-07 22:15 ` [PATCH v1 09/15] io_uring/zcrx: add interface queue and refill queue David Wei
2024-10-09 17:50 ` Jens Axboe
2024-10-09 18:09 ` Jens Axboe
2024-10-09 19:08 ` Pavel Begunkov
2024-10-11 22:11 ` Pavel Begunkov
2024-10-13 17:32 ` David Wei
2024-10-07 22:15 ` David Wei [this message]
2024-10-09 18:02 ` [PATCH v1 10/15] io_uring/zcrx: add io_zcrx_area Jens Axboe
2024-10-09 19:05 ` Pavel Begunkov
2024-10-09 19:06 ` Jens Axboe
2024-10-09 21:29 ` Mina Almasry
2024-10-07 22:15 ` [PATCH v1 11/15] io_uring/zcrx: implement zerocopy receive pp memory provider David Wei
2024-10-09 18:10 ` Jens Axboe
2024-10-09 22:01 ` Mina Almasry
2024-10-09 22:58 ` Pavel Begunkov
2024-10-10 18:19 ` Mina Almasry
2024-10-10 20:26 ` Pavel Begunkov
2024-10-10 20:53 ` Mina Almasry
2024-10-10 20:58 ` Mina Almasry
2024-10-10 21:22 ` Pavel Begunkov
2024-10-11 0:32 ` Mina Almasry
2024-10-11 1:49 ` Pavel Begunkov
2024-10-07 22:16 ` [PATCH v1 12/15] io_uring/zcrx: add io_recvzc request David Wei
2024-10-09 18:28 ` Jens Axboe
2024-10-09 18:51 ` Pavel Begunkov
2024-10-09 19:01 ` Jens Axboe
2024-10-09 19:27 ` Pavel Begunkov
2024-10-09 19:42 ` Jens Axboe
2024-10-09 19:47 ` Pavel Begunkov
2024-10-09 19:50 ` Jens Axboe
2024-10-07 22:16 ` [PATCH v1 13/15] io_uring/zcrx: add copy fallback David Wei
2024-10-08 15:58 ` Stanislav Fomichev
2024-10-08 16:39 ` Pavel Begunkov
2024-10-08 16:40 ` David Wei
2024-10-09 16:30 ` Stanislav Fomichev
2024-10-09 23:05 ` Pavel Begunkov
2024-10-11 6:22 ` David Wei
2024-10-11 14:43 ` Stanislav Fomichev
2024-10-09 18:38 ` Jens Axboe
2024-10-07 22:16 ` [PATCH v1 14/15] io_uring/zcrx: set pp memory provider for an rx queue David Wei
2024-10-09 18:42 ` Jens Axboe
2024-10-10 13:09 ` Pavel Begunkov
2024-10-10 13:19 ` Jens Axboe
2024-10-07 22:16 ` [PATCH v1 15/15] io_uring/zcrx: throttle receive requests David Wei
2024-10-09 18:43 ` Jens Axboe
2024-10-07 22:20 ` [PATCH v1 00/15] io_uring zero copy rx David Wei
2024-10-08 23:10 ` Joe Damato
2024-10-09 15:07 ` Pavel Begunkov
2024-10-09 16:10 ` Joe Damato
2024-10-09 16:12 ` Jens Axboe
2024-10-11 6:15 ` David Wei
2024-10-09 15:27 ` Jens Axboe
2024-10-09 15:38 ` David Ahern
2024-10-09 15:43 ` Jens Axboe
2024-10-09 15:49 ` Pavel Begunkov
2024-10-09 15:50 ` Jens Axboe
2024-10-09 16:35 ` David Ahern
2024-10-09 16:50 ` Jens Axboe
2024-10-09 16:53 ` Jens Axboe
2024-10-09 17:12 ` Jens Axboe
2024-10-10 14:21 ` Jens Axboe
2024-10-10 15:03 ` David Ahern
2024-10-10 15:15 ` Jens Axboe
2024-10-10 18:11 ` Jens Axboe
2024-10-14 8:42 ` David Laight
2024-10-09 16:55 ` Mina Almasry
2024-10-09 16:57 ` Jens Axboe
2024-10-09 19:32 ` Mina Almasry
2024-10-09 19:43 ` Pavel Begunkov
2024-10-09 19:47 ` Jens Axboe
2024-10-09 17:19 ` David Ahern
2024-10-09 18:21 ` Pedro Tammela
2024-10-10 13:19 ` Pavel Begunkov
2024-10-11 0:35 ` David Wei
2024-10-11 14:28 ` Pedro Tammela
2024-10-11 0:29 ` David Wei
2024-10-11 19:43 ` Mina Almasry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241007221603.1703699-11-dw@davidwei.uk \
--to=dw@davidwei.uk \
--cc=almasrymina@google.com \
--cc=asml.silence@gmail.com \
--cc=axboe@kernel.dk \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=hawk@kernel.org \
--cc=io-uring@vger.kernel.org \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox