* [PATCH 1/8] io_uring/zcrx: make scrubbing more reliable
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 2/8] io_uring/zcrx: poison pointers on unregistration Pavel Begunkov
` (6 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
Currently, scrubbing is done once, before killing all recvzc requests.
That's fine as those are cancelled and don't return buffers afterwards,
but it is more robust not to depend on cancellations that much.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/zcrx.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 7b93c87b8371..60cef10dc491 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -986,6 +986,12 @@ void io_unregister_zcrx(struct io_ring_ctx *ctx)
}
if (!ifq)
break;
+ /*
+ * io_uring can run requests and return buffers to the user
+ * after termination, scrub it again.
+ */
+ if (refcount_read(&ifq->user_refs) == 0)
+ io_zcrx_scrub(ifq);
io_put_zcrx_ifq(ifq);
}
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 2/8] io_uring/zcrx: poison pointers on unregistration
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
2026-05-19 11:44 ` [PATCH 1/8] io_uring/zcrx: make scrubbing more reliable Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 3/8] io_uring/zcrx: remove extra ifq close Pavel Begunkov
` (5 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
Nobody should be touching area and other pointers after zcrx
destruction, poison them instead of zeroing.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/zcrx.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 60cef10dc491..4bf6635c222f 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -245,14 +245,13 @@ static void io_release_area_mem(struct io_zcrx_mem *mem)
{
if (mem->is_dmabuf) {
io_release_dmabuf(mem);
- return;
- }
- if (mem->pages) {
+ } else if (mem->pages) {
unpin_user_pages(mem->pages, mem->nr_folios);
sg_free_table(mem->sgt);
- mem->sgt = NULL;
kvfree(mem->pages);
}
+ mem->pages = IO_URING_PTR_POISON;
+ mem->sgt = IO_URING_PTR_POISON;
}
static int io_import_area(struct io_zcrx_ifq *ifq,
@@ -403,8 +402,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
io_free_region(ifq->user, &ifq->rq_region);
- ifq->rq.ring = NULL;
- ifq->rq.rqes = NULL;
+ ifq->rq.ring = IO_URING_PTR_POISON;
+ ifq->rq.rqes = IO_URING_PTR_POISON;
}
static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 3/8] io_uring/zcrx: remove extra ifq close
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
2026-05-19 11:44 ` [PATCH 1/8] io_uring/zcrx: make scrubbing more reliable Pavel Begunkov
2026-05-19 11:44 ` [PATCH 2/8] io_uring/zcrx: poison pointers on unregistration Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 4/8] io_uring/zcrx: reorder fd allocation in zcrx_export() Pavel Begunkov
` (4 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
By the time io_zcrx_ifq_free() is called the interface queue should
already be closed, so io_close_queue() will be a no-op. Remove the call
and add a couple of warnings.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/zcrx.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 4bf6635c222f..f4440881960f 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -575,7 +575,10 @@ static void io_close_queue(struct io_zcrx_ifq *ifq)
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
- io_close_queue(ifq);
+ if (WARN_ON_ONCE(ifq->if_rxq != -1))
+ return;
+ if (WARN_ON_ONCE(ifq->netdev != NULL))
+ return;
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 4/8] io_uring/zcrx: reorder fd allocation in zcrx_export()
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
` (2 preceding siblings ...)
2026-05-19 11:44 ` [PATCH 3/8] io_uring/zcrx: remove extra ifq close Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 5/8] io_uring/zcrx: add ctx pointer to zcrx Pavel Begunkov
` (3 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
From: Bertie Tryner <bertietryner@gmail.com>
Currently, zcrx_export() allocates a file descriptor and copies the
control structure to userspace before the backing file is created.
While the operation returns an error on failure, it is cleaner to
follow the standard kernel pattern of performing the copy_to_user()
and fd_install() only after all resource allocations (like the
anon_inode) have succeeded. This aligns the code with other
fd-publishing paths in the VFS.
Signed-off-by: Bertie Tryner <Bertie.Tryner@warwick.ac.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/zcrx.c | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index f4440881960f..24a9ebbd9d8f 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -699,19 +699,10 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
{
struct zcrx_ctrl_export *ce = &ctrl->zc_export;
struct file *file;
- int fd = -1;
+ int fd;
if (!mem_is_zero(ce, sizeof(*ce)))
return -EINVAL;
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- ce->zcrx_fd = fd;
- if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
- put_unused_fd(fd);
- return -EFAULT;
- }
refcount_inc(&ifq->refs);
refcount_inc(&ifq->user_refs);
@@ -719,11 +710,23 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
ifq, O_CLOEXEC, NULL);
if (IS_ERR(file)) {
- put_unused_fd(fd);
zcrx_unregister(ifq);
return PTR_ERR(file);
}
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ fput(file);
+ return fd;
+ }
+
+ ce->zcrx_fd = fd;
+ if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+ fput(file);
+ put_unused_fd(fd);
+ return -EFAULT;
+ }
+
fd_install(fd, file);
return 0;
}
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 5/8] io_uring/zcrx: add ctx pointer to zcrx
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
` (3 preceding siblings ...)
2026-05-19 11:44 ` [PATCH 4/8] io_uring/zcrx: reorder fd allocation in zcrx_export() Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:46 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 6/8] io_uring/zcrx: notify user when out of buffers Pavel Begunkov
` (2 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
zcrx will need to have a pointer to an owning ctx to communicate
different events. Reference the ctx while it's attached to zcrx, and
rely on zcrx termination to drop the ctx to avoid circular ref deps.
Co-developed-by: Vishwanath Seshagiri <vishs@meta.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/zcrx.c | 39 +++++++++++++++++++++++++++++++--------
io_uring/zcrx.h | 3 +++
2 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 24a9ebbd9d8f..2d8a0c453212 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,6 +44,17 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
return container_of(owner, struct io_zcrx_area, nia);
}
+static bool zcrx_set_ring_ctx(struct io_zcrx_ifq *zcrx,
+ struct io_ring_ctx *ctx)
+{
+ guard(spinlock_bh)(&zcrx->ctx_lock);
+ if (zcrx->master_ctx)
+ return false;
+ percpu_ref_get(&ctx->refs);
+ zcrx->master_ctx = ctx;
+ return true;
+}
+
static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -530,6 +541,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
return NULL;
ifq->if_rxq = -1;
+ spin_lock_init(&ifq->ctx_lock);
spin_lock_init(&ifq->rq.lock);
mutex_init(&ifq->pp_lock);
refcount_set(&ifq->refs, 1);
@@ -579,6 +591,8 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
return;
if (WARN_ON_ONCE(ifq->netdev != NULL))
return;
+ if (WARN_ON_ONCE(ifq->master_ctx))
+ return;
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
@@ -655,17 +669,24 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
}
}
-static void zcrx_unregister_user(struct io_zcrx_ifq *ifq)
+static void zcrx_unregister_user(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx)
{
+ scoped_guard(spinlock_bh, &ifq->ctx_lock) {
+ if (ctx && ifq->master_ctx == ctx) {
+ ifq->master_ctx = NULL;
+ percpu_ref_put(&ctx->refs);
+ }
+ }
+
if (refcount_dec_and_test(&ifq->user_refs)) {
io_close_queue(ifq);
io_zcrx_scrub(ifq);
}
}
-static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+static void zcrx_unregister(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx)
{
- zcrx_unregister_user(ifq);
+ zcrx_unregister_user(ifq, ctx);
io_put_zcrx_ifq(ifq);
}
@@ -685,7 +706,7 @@ static int zcrx_box_release(struct inode *inode, struct file *file)
if (WARN_ON_ONCE(!ifq))
return -EFAULT;
- zcrx_unregister(ifq);
+ zcrx_unregister(ifq, NULL);
return 0;
}
@@ -710,7 +731,7 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
ifq, O_CLOEXEC, NULL);
if (IS_ERR(file)) {
- zcrx_unregister(ifq);
+ zcrx_unregister(ifq, NULL);
return PTR_ERR(file);
}
@@ -786,7 +807,7 @@ static int import_zcrx(struct io_ring_ctx *ctx,
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->zcrx_ctxs, id);
err:
- zcrx_unregister(ifq);
+ zcrx_unregister(ifq, ctx);
return ret;
}
@@ -931,12 +952,14 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
+
+ zcrx_set_ring_ctx(ifq, ctx);
return 0;
err:
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
- zcrx_unregister(ifq);
+ zcrx_unregister(ifq, ctx);
return ret;
}
@@ -966,7 +989,7 @@ void io_terminate_zcrx(struct io_ring_ctx *ctx)
break;
set_zcrx_entry_mark(ctx, id);
id++;
- zcrx_unregister_user(ifq);
+ zcrx_unregister_user(ifq, ctx);
}
}
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 75e0a4e6ef6e..76389a5dd50f 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -72,6 +72,9 @@ struct io_zcrx_ifq {
*/
struct mutex pp_lock;
struct io_mapped_region rq_region;
+
+ spinlock_t ctx_lock;
+ struct io_ring_ctx *master_ctx;
};
#if defined(CONFIG_IO_URING_ZCRX)
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
` (4 preceding siblings ...)
2026-05-19 11:44 ` [PATCH 5/8] io_uring/zcrx: add ctx pointer to zcrx Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 15:26 ` Jens Axboe
2026-05-19 11:44 ` [PATCH 7/8] io_uring/zcrx: notify user on frag copy fallback Pavel Begunkov
2026-05-19 11:44 ` [PATCH 8/8] io_uring/zcrx: add shared-memory notification statistics Pavel Begunkov
7 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
There is currently no easy way for the user to know that zcrx is out of
buffers and the page pool fails to allocate. Add uapi for zcrx to communicate
it back.
It's implemented as a separate CQE, which for now is posted to the creator
ctx. To use it, on registration the user space needs to pass an instance
of struct zcrx_notification_desc, which tells the kernel the user_data
for resulting CQEs and which event types are expected / allowed.
When an allowed event happens, zcrx will post a CQE containing the
specified user_data, and lower bits of cqe->res will be set to the event
mask. Before the kernel can post another notification of the given
type, the user needs to acknowledge that it processed the previous one
by issuing IORING_REGISTER_ZCRX_CTRL with ZCRX_CTRL_ARM_NOTIFICATION.
The only notification type the patch implements is
ZCRX_NOTIF_NO_BUFFERS, but we'll need more of them in the future.
Co-developed-by: Vishwanath Seshagiri <vishs@meta.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/uapi/linux/io_uring/zcrx.h | 24 ++++++++-
io_uring/io_uring.c | 2 +-
io_uring/io_uring.h | 1 +
io_uring/zcrx.c | 86 +++++++++++++++++++++++++++++-
io_uring/zcrx.h | 7 ++-
5 files changed, 115 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index 5ce02c7a6096..67185566ad3c 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -65,6 +65,20 @@ enum zcrx_features {
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
*/
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
+ ZCRX_FEATURE_NOTIFICATION = 1 << 1,
+};
+
+enum zcrx_notification_type {
+ ZCRX_NOTIF_NO_BUFFERS,
+
+ __ZCRX_NOTIF_TYPE_LAST,
+};
+
+struct zcrx_notification_desc {
+ __u64 user_data;
+ __u32 type_mask;
+ __u32 __resv1;
+ __u64 __resv2[10];
};
/*
@@ -82,12 +96,14 @@ struct io_uring_zcrx_ifq_reg {
struct io_uring_zcrx_offsets offsets;
__u32 zcrx_id;
__u32 rx_buf_len;
- __u64 __resv[3];
+ __u64 notif_desc; /* see struct zcrx_notification_desc */
+ __u64 __resv[2];
};
enum zcrx_ctrl_op {
ZCRX_CTRL_FLUSH_RQ,
ZCRX_CTRL_EXPORT,
+ ZCRX_CTRL_ARM_NOTIFICATION,
__ZCRX_CTRL_LAST,
};
@@ -101,6 +117,11 @@ struct zcrx_ctrl_export {
__u32 __resv1[11];
};
+struct zcrx_ctrl_arm_notif {
+ __u32 notif_type;
+ __u32 __resv[11];
+};
+
struct zcrx_ctrl {
__u32 zcrx_id;
__u32 op; /* see enum zcrx_ctrl_op */
@@ -109,6 +130,7 @@ struct zcrx_ctrl {
union {
struct zcrx_ctrl_export zc_export;
struct zcrx_ctrl_flush_rq zc_flush;
+ struct zcrx_ctrl_arm_notif zc_arm_notif;
};
};
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2ebb0ba37c4f..c5972274cce1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -160,7 +160,7 @@ static void io_poison_cached_req(struct io_kiocb *req)
req->apoll = IO_URING_PTR_POISON;
}
-static void io_poison_req(struct io_kiocb *req)
+void io_poison_req(struct io_kiocb *req)
{
io_poison_cached_req(req);
req->async_data = IO_URING_PTR_POISON;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e612a66ee80e..de0a3bed58d1 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -213,6 +213,7 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
void io_activate_pollwq(struct io_ring_ctx *ctx);
void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
+void io_poison_req(struct io_kiocb *req);
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 2d8a0c453212..455226790553 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -767,6 +767,8 @@ static int import_zcrx(struct io_ring_ctx *ctx,
return -EINVAL;
if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
return -EINVAL;
+ if (reg->notif_desc)
+ return -EINVAL;
if (reg->flags & ~ZCRX_REG_IMPORT)
return -EINVAL;
@@ -855,6 +857,7 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
+ struct zcrx_notification_desc notif;
struct io_uring_zcrx_area_reg area;
struct io_uring_zcrx_ifq_reg reg;
struct io_uring_region_desc rd;
@@ -898,10 +901,22 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
return -EFAULT;
+ memset(&notif, 0, sizeof(notif));
+ if (reg.notif_desc && copy_from_user(&notif, u64_to_user_ptr(reg.notif_desc),
+ sizeof(notif)))
+ return -EFAULT;
+ if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
+ return -EINVAL;
+ if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
+ return -EINVAL;
+
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+ ifq->notif_data = notif.user_data;
+ ifq->allowed_notif_mask = notif.type_mask;
+
if (ctx->user) {
get_uid(ctx->user);
ifq->user = ctx->user;
@@ -953,7 +968,8 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
goto err;
}
- zcrx_set_ring_ctx(ifq, ctx);
+ if (notif.type_mask)
+ zcrx_set_ring_ctx(ifq, ctx);
return 0;
err:
scoped_guard(mutex, &ctx->mmap_lock)
@@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
return allocated;
}
+static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
+{
+ struct io_kiocb *req = tw_req.req;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
+ percpu_ref_put(&ctx->refs);
+ io_poison_req(req);
+ kmem_cache_free(req_cachep, req);
+}
+
+static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
+{
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
+ u32 type_mask = 1 << type;
+ struct io_kiocb *req;
+
+ if (!(type_mask & ifq->allowed_notif_mask))
+ return;
+
+ guard(spinlock_bh)(&ifq->ctx_lock);
+ if (!ifq->master_ctx)
+ return;
+ if (type_mask & ifq->fired_notifs)
+ return;
+
+ req = kmem_cache_alloc(req_cachep, gfp);
+ if (unlikely(!req))
+ return;
+
+ ifq->fired_notifs |= type_mask;
+
+ req->opcode = IORING_OP_NOP;
+ req->cqe.user_data = ifq->notif_data;
+ req->cqe.res = type;
+ req->ctx = ifq->master_ctx;
+ percpu_ref_get(&req->ctx->refs);
+ req->tctx = NULL;
+ req->io_task_work.func = zcrx_notif_tw;
+ io_req_task_work_add(req);
+}
+
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
@@ -1142,8 +1200,10 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
goto out_return;
allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
- if (!allocated)
+ if (!allocated) {
+ zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS);
return 0;
+ }
out_return:
zcrx_sync_for_device(pp, ifq, netmems, allocated);
allocated--;
@@ -1292,12 +1352,32 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
return 0;
}
+static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+ struct zcrx_ctrl *ctrl)
+{
+ const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif;
+ unsigned type_mask;
+
+ if (an->notif_type >= __ZCRX_NOTIF_TYPE_LAST)
+ return -EINVAL;
+ if (!mem_is_zero(&an->__resv, sizeof(an->__resv)))
+ return -EINVAL;
+
+ guard(spinlock_bh)(&zcrx->ctx_lock);
+ type_mask = 1U << an->notif_type;
+ if (type_mask & ~zcrx->fired_notifs)
+ return -EINVAL;
+ zcrx->fired_notifs &= ~type_mask;
+ return 0;
+}
+
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
struct zcrx_ctrl ctrl;
struct io_zcrx_ifq *zcrx;
BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
+ BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif));
if (nr_args)
return -EINVAL;
@@ -1315,6 +1395,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
return zcrx_flush_rq(ctx, zcrx, &ctrl);
case ZCRX_CTRL_EXPORT:
return zcrx_export(ctx, zcrx, &ctrl, arg);
+ case ZCRX_CTRL_ARM_NOTIFICATION:
+ return zcrx_arm_notif(ctx, zcrx, &ctrl);
}
return -EOPNOTSUPP;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 76389a5dd50f..e8b7717d6adf 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -9,7 +9,9 @@
#include <net/net_trackers.h>
#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
-#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE)
+#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE |\
+ ZCRX_FEATURE_NOTIFICATION)
+#define ZCRX_NOTIF_TYPE_MASK (1U << ZCRX_NOTIF_NO_BUFFERS)
struct io_zcrx_mem {
unsigned long size;
@@ -75,6 +77,9 @@ struct io_zcrx_ifq {
spinlock_t ctx_lock;
struct io_ring_ctx *master_ctx;
+ u32 allowed_notif_mask;
+ u32 fired_notifs;
+ u64 notif_data;
};
#if defined(CONFIG_IO_URING_ZCRX)
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 11:44 ` [PATCH 6/8] io_uring/zcrx: notify user when out of buffers Pavel Begunkov
@ 2026-05-19 15:26 ` Jens Axboe
2026-05-19 15:30 ` Pavel Begunkov
0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2026-05-19 15:26 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: netdev
On 5/19/26 5:44 AM, Pavel Begunkov wrote:
> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
> return allocated;
> }
>
> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
> +{
> + struct io_kiocb *req = tw_req.req;
> + struct io_ring_ctx *ctx = req->ctx;
> +
> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
> + percpu_ref_put(&ctx->refs);
> + io_poison_req(req);
> + kmem_cache_free(req_cachep, req);
> +}
> +
> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
> +{
> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
> + u32 type_mask = 1 << type;
> + struct io_kiocb *req;
> +
> + if (!(type_mask & ifq->allowed_notif_mask))
> + return;
> +
> + guard(spinlock_bh)(&ifq->ctx_lock);
> + if (!ifq->master_ctx)
> + return;
> + if (type_mask & ifq->fired_notifs)
> + return;
> +
> + req = kmem_cache_alloc(req_cachep, gfp);
> + if (unlikely(!req))
> + return;
It'd be nice to avoid an allocation here inside ctx_lock and with bh's
disabled, which looks like is also the only reason why GFP_ATOMIC is
being used here.
Maybe opportunistically check ->fired_notifs early? Might also avoid the
lock in the first place if we get back-to-back of these.
--
Jens Axboe
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 15:26 ` Jens Axboe
@ 2026-05-19 15:30 ` Pavel Begunkov
2026-05-19 15:37 ` Jens Axboe
0 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 15:30 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: netdev
On 5/19/26 16:26, Jens Axboe wrote:
> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>> return allocated;
>> }
>>
>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>> +{
>> + struct io_kiocb *req = tw_req.req;
>> + struct io_ring_ctx *ctx = req->ctx;
>> +
>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>> + percpu_ref_put(&ctx->refs);
>> + io_poison_req(req);
>> + kmem_cache_free(req_cachep, req);
>> +}
>> +
>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>> +{
>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>> + u32 type_mask = 1 << type;
>> + struct io_kiocb *req;
>> +
>> + if (!(type_mask & ifq->allowed_notif_mask))
>> + return;
>> +
>> + guard(spinlock_bh)(&ifq->ctx_lock);
>> + if (!ifq->master_ctx)
>> + return;
>> + if (type_mask & ifq->fired_notifs)
>> + return;
>> +
>> + req = kmem_cache_alloc(req_cachep, gfp);
>> + if (unlikely(!req))
>> + return;
>
> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
> disabled, which looks like is also the only reason why GFP_ATOMIC is
> being used here.
I thought about it, but it's already bh, it'd need to do pre
allocations + caching to be reliable, but that's left out for now.
> Maybe opportunistically check ->fired_notifs early? Might also avoid the
> lock in the first place if we get back-to-back of these.
Slow path, doesn't matter
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 15:30 ` Pavel Begunkov
@ 2026-05-19 15:37 ` Jens Axboe
2026-05-19 15:40 ` Pavel Begunkov
0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2026-05-19 15:37 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: netdev
On 5/19/26 9:30 AM, Pavel Begunkov wrote:
> On 5/19/26 16:26, Jens Axboe wrote:
>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>> return allocated;
>>> }
>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>> +{
>>> + struct io_kiocb *req = tw_req.req;
>>> + struct io_ring_ctx *ctx = req->ctx;
>>> +
>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>> + percpu_ref_put(&ctx->refs);
>>> + io_poison_req(req);
>>> + kmem_cache_free(req_cachep, req);
>>> +}
>>> +
>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>> +{
>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>> + u32 type_mask = 1 << type;
>>> + struct io_kiocb *req;
>>> +
>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>> + return;
>>> +
>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>> + if (!ifq->master_ctx)
>>> + return;
>>> + if (type_mask & ifq->fired_notifs)
>>> + return;
>>> +
>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>> + if (unlikely(!req))
>>> + return;
>>
>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>> being used here.
>
> I thought about it, but it's already bh, it'd need to do pre
> allocations + caching to be reliable, but that's left out for now.
Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
What's the contract in terms of the notification? If we fail the alloc,
then userspace can't rely on the notification on the refill failure.
Are we under bh save already here, before doing it ourselves? If so,
then how does the guard work?
>> Maybe opportunistically check ->fired_notifs early? Might also avoid the
>> lock in the first place if we get back-to-back of these.
>
> Slow path, doesn't matter
Agree, not a huge deal as we hope to not hit the notif path.
--
Jens Axboe
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 15:37 ` Jens Axboe
@ 2026-05-19 15:40 ` Pavel Begunkov
2026-05-19 15:43 ` Jens Axboe
0 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 15:40 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: netdev
On 5/19/26 16:37, Jens Axboe wrote:
> On 5/19/26 9:30 AM, Pavel Begunkov wrote:
>> On 5/19/26 16:26, Jens Axboe wrote:
>>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>>> return allocated;
>>>> }
>>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>>> +{
>>>> + struct io_kiocb *req = tw_req.req;
>>>> + struct io_ring_ctx *ctx = req->ctx;
>>>> +
>>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>>> + percpu_ref_put(&ctx->refs);
>>>> + io_poison_req(req);
>>>> + kmem_cache_free(req_cachep, req);
>>>> +}
>>>> +
>>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>>> +{
>>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>>> + u32 type_mask = 1 << type;
>>>> + struct io_kiocb *req;
>>>> +
>>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>>> + return;
>>>> +
>>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>>> + if (!ifq->master_ctx)
>>>> + return;
>>>> + if (type_mask & ifq->fired_notifs)
>>>> + return;
>>>> +
>>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>>> + if (unlikely(!req))
>>>> + return;
>>>
>>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>>> being used here.
>>
>> I thought about it, but it's already bh, it'd need to do pre
>> allocations + caching to be reliable, but that's left out for now.
>
> Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
> What's the contract in terms of the notification? If we fail the alloc,
> then userspace can't rely on the notification on the refill failure.
>
> Are we under bh save already here, before doing it ourselves? If so,
> then how does the guard work?
In 99% of cases it's called from softirq, not sure what you mean
by how it works.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 15:40 ` Pavel Begunkov
@ 2026-05-19 15:43 ` Jens Axboe
2026-05-19 16:04 ` Pavel Begunkov
0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2026-05-19 15:43 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: netdev
On 5/19/26 9:40 AM, Pavel Begunkov wrote:
> On 5/19/26 16:37, Jens Axboe wrote:
>> On 5/19/26 9:30 AM, Pavel Begunkov wrote:
>>> On 5/19/26 16:26, Jens Axboe wrote:
>>>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>>>> return allocated;
>>>>> }
>>>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>>>> +{
>>>>> + struct io_kiocb *req = tw_req.req;
>>>>> + struct io_ring_ctx *ctx = req->ctx;
>>>>> +
>>>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>>>> + percpu_ref_put(&ctx->refs);
>>>>> + io_poison_req(req);
>>>>> + kmem_cache_free(req_cachep, req);
>>>>> +}
>>>>> +
>>>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>>>> +{
>>>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>>>> + u32 type_mask = 1 << type;
>>>>> + struct io_kiocb *req;
>>>>> +
>>>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>>>> + return;
>>>>> +
>>>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>>>> + if (!ifq->master_ctx)
>>>>> + return;
>>>>> + if (type_mask & ifq->fired_notifs)
>>>>> + return;
>>>>> +
>>>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>>>> + if (unlikely(!req))
>>>>> + return;
>>>>
>>>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>>>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>>>> being used here.
>>>
>>> I thought about it, but it's already bh, it'd need to do pre
>>> allocations + caching to be reliable, but that's left out for now.
>>
>> Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
>> What's the contract in terms of the notification? If we fail the alloc,
>> then userspace can't rely on the notification on the refill failure.
>>
>> Are we under bh save already here, before doing it ourselves? If so,
>> then how does the guard work?
>
> In 99% of cases it's called from softirq, not sure what you mean
> by how it works.
Ah ok, I thought you meant it was already called with softirqs disabled.
In which case the guard would seem broken, as we'd enable softirqs when
exiting. But if we're just inside softirq yeah it's fine, and there's no
point shuffling the allocation either.
Question on the contract still stands, in terms of missing a
notification. I guess since it's a hint basically it doesn't really
matter, just something that should be documented on the userspace side.
Do you have test cases for these?
--
Jens Axboe
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 15:43 ` Jens Axboe
@ 2026-05-19 16:04 ` Pavel Begunkov
2026-05-19 16:08 ` Pavel Begunkov
2026-05-19 16:09 ` Jens Axboe
0 siblings, 2 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 16:04 UTC (permalink / raw)
To: Jens Axboe, io-uring
Cc: netdev, Clément Léger, Vishwanath Seshagiri
On 5/19/26 16:43, Jens Axboe wrote:
> On 5/19/26 9:40 AM, Pavel Begunkov wrote:
>> On 5/19/26 16:37, Jens Axboe wrote:
>>> On 5/19/26 9:30 AM, Pavel Begunkov wrote:
>>>> On 5/19/26 16:26, Jens Axboe wrote:
>>>>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>>>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>>>>> return allocated;
>>>>>> }
>>>>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>>>>> +{
>>>>>> + struct io_kiocb *req = tw_req.req;
>>>>>> + struct io_ring_ctx *ctx = req->ctx;
>>>>>> +
>>>>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>>>>> + percpu_ref_put(&ctx->refs);
>>>>>> + io_poison_req(req);
>>>>>> + kmem_cache_free(req_cachep, req);
>>>>>> +}
>>>>>> +
>>>>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>>>>> +{
>>>>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>>>>> + u32 type_mask = 1 << type;
>>>>>> + struct io_kiocb *req;
>>>>>> +
>>>>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>>>>> + return;
>>>>>> +
>>>>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>>>>> + if (!ifq->master_ctx)
>>>>>> + return;
>>>>>> + if (type_mask & ifq->fired_notifs)
>>>>>> + return;
>>>>>> +
>>>>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>>>>> + if (unlikely(!req))
>>>>>> + return;
>>>>>
>>>>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>>>>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>>>>> being used here.
>>>>
>>>> I thought about it, but it's already bh, it'd need to do pre
>>>> allocations + caching to be reliable, but that's left out for now.
>>>
>>> Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
>>> What's the contract in terms of the notification? If we fail the alloc,
>>> then userspace can't rely on the notification on the refill failure.
>>>
>>> Are we under bh save already here, before doing it ourselves? If so,
>>> then how does the guard work?
>>
>> In 99% of cases it's called from softirq, not sure what you mean
>> by how it works.
>
> Ah ok, I thought you meant it was already called with softirqs disabled.
> In which case the guard would seem broken, as we'd enable softirqs when
> exiting. But if we're just inside softirq yeah it's fine, and there's no
> point shuffling the allocation either.
Softirqs are run with bh disabled, but bh_disable()/enable() are
reenterable.
> Question on the contract still stands, in terms of missing a
> notification. I guess since it's a hint basically it doesn't really
> matter, just something that should be documented on the userspace side.
Should rather be improved than documented, I'd say, but it's still
better than not getting anything at all. And it's the only place
where it can in theory be dropped, e.g. CQE overflow handling,
though different GFP.
> Do you have test cases for these?
Clement needs to resend them. Actually, seems I forgot to CC Vish
and Clement here, my bad.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 16:04 ` Pavel Begunkov
@ 2026-05-19 16:08 ` Pavel Begunkov
2026-05-19 16:09 ` Jens Axboe
1 sibling, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 16:08 UTC (permalink / raw)
To: Jens Axboe, io-uring
Cc: netdev, Clément Léger, Vishwanath Seshagiri
On 5/19/26 17:04, Pavel Begunkov wrote:
> On 5/19/26 16:43, Jens Axboe wrote:
>> On 5/19/26 9:40 AM, Pavel Begunkov wrote:
>>> On 5/19/26 16:37, Jens Axboe wrote:
>>>> On 5/19/26 9:30 AM, Pavel Begunkov wrote:
>>>>> On 5/19/26 16:26, Jens Axboe wrote:
>>>>>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>>>>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>>>>>> return allocated;
>>>>>>> }
>>>>>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>>>>>> +{
>>>>>>> + struct io_kiocb *req = tw_req.req;
>>>>>>> + struct io_ring_ctx *ctx = req->ctx;
>>>>>>> +
>>>>>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>>>>>> + percpu_ref_put(&ctx->refs);
>>>>>>> + io_poison_req(req);
>>>>>>> + kmem_cache_free(req_cachep, req);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>>>>>> +{
>>>>>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>>>>>> + u32 type_mask = 1 << type;
>>>>>>> + struct io_kiocb *req;
>>>>>>> +
>>>>>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>>>>>> + return;
>>>>>>> +
>>>>>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>>>>>> + if (!ifq->master_ctx)
>>>>>>> + return;
>>>>>>> + if (type_mask & ifq->fired_notifs)
>>>>>>> + return;
>>>>>>> +
>>>>>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>>>>>> + if (unlikely(!req))
>>>>>>> + return;
>>>>>>
>>>>>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>>>>>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>>>>>> being used here.
>>>>>
>>>>> I thought about it, but it's already bh, it'd need to do pre
>>>>> allocations + caching to be reliable, but that's left out for now.
>>>>
>>>> Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
>>>> What's the contract in terms of the notification? If we fail the alloc,
>>>> then userspace can't rely on the notification on the refill failure.
>>>>
>>>> Are we under bh save already here, before doing it ourselves? If so,
>>>> then how does the guard work?
>>>
>>> In 99% of cases it's called from softirq, not sure what you mean
>>> by how it works.
>>
>> Ah ok, I thought you meant it was already called with softirqs disabled.
>> In which case the guard would seem broken, as we'd enable softirqs when
>> exiting. But if we're just inside softirq yeah it's fine, and there's no
>> point shuffling the allocation either.
>
> Softirqs are run with bh disabled, but bh_disable()/enable() are
> reenterable.
Better to say they're counting nesting
>> Question on the contract still stands, in terms of missing a
>> notification. I guess since it's a hint basically it doesn't really
>> matter, just something that should be documented on the userspace side.
>
> Should rather be improved than documented, I'd say, but it's still
> better than not getting anything at all. And it's the only place
> where it can in theory be dropped, e.g. CQE overflow handling,
> though different GFP.
>
>> Do you have test cases for these?
>
> Clement needs to resend them. Actually, seems I forgot to CC Vish
> and Clement here, my bad.
>
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 6/8] io_uring/zcrx: notify user when out of buffers
2026-05-19 16:04 ` Pavel Begunkov
2026-05-19 16:08 ` Pavel Begunkov
@ 2026-05-19 16:09 ` Jens Axboe
1 sibling, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2026-05-19 16:09 UTC (permalink / raw)
To: Pavel Begunkov, io-uring
Cc: netdev, Clément Léger, Vishwanath Seshagiri
On 5/19/26 10:04 AM, Pavel Begunkov wrote:
> On 5/19/26 16:43, Jens Axboe wrote:
>> On 5/19/26 9:40 AM, Pavel Begunkov wrote:
>>> On 5/19/26 16:37, Jens Axboe wrote:
>>>> On 5/19/26 9:30 AM, Pavel Begunkov wrote:
>>>>> On 5/19/26 16:26, Jens Axboe wrote:
>>>>>> On 5/19/26 5:44 AM, Pavel Begunkov wrote:
>>>>>>> @@ -1126,6 +1142,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>>>>>>> return allocated;
>>>>>>> }
>>>>>>> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
>>>>>>> +{
>>>>>>> + struct io_kiocb *req = tw_req.req;
>>>>>>> + struct io_ring_ctx *ctx = req->ctx;
>>>>>>> +
>>>>>>> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
>>>>>>> + percpu_ref_put(&ctx->refs);
>>>>>>> + io_poison_req(req);
>>>>>>> + kmem_cache_free(req_cachep, req);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
>>>>>>> +{
>>>>>>> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
>>>>>>> + u32 type_mask = 1 << type;
>>>>>>> + struct io_kiocb *req;
>>>>>>> +
>>>>>>> + if (!(type_mask & ifq->allowed_notif_mask))
>>>>>>> + return;
>>>>>>> +
>>>>>>> + guard(spinlock_bh)(&ifq->ctx_lock);
>>>>>>> + if (!ifq->master_ctx)
>>>>>>> + return;
>>>>>>> + if (type_mask & ifq->fired_notifs)
>>>>>>> + return;
>>>>>>> +
>>>>>>> + req = kmem_cache_alloc(req_cachep, gfp);
>>>>>>> + if (unlikely(!req))
>>>>>>> + return;
>>>>>>
>>>>>> It'd be nice to avoid an allocation here inside ctx_lock and with bh's
>>>>>> disabled, which looks like is also the only reason why GFP_ATOMIC is
>>>>>> being used here.
>>>>>
>>>>> I thought about it, but it's already bh, it'd need to do pre
>>>>> allocations + caching to be reliable, but that's left out for now.
>>>>
>>>> Not sure I follow - GFP_KERNEL would be more reliable than GFP_ATOMIC.
>>>> What's the contract in terms of the notification? If we fail the alloc,
>>>> then userspace can't rely on the notification on the refill failure.
>>>>
>>>> Are we under bh save already here, before doing it ourselves? If so,
>>>> then how does the guard work?
>>>
>>> In 99% of cases it's called from softirq, not sure what you mean
>>> by how it works.
>>
>> Ah ok, I thought you meant it was already called with softirqs disabled.
>> In which case the guard would seem broken, as we'd enable softirqs when
>> exiting. But if we're just inside softirq yeah it's fine, and there's no
>> point shuffling the allocation either.
>
> Softirqs are run with bh disabled, but bh_disable()/enable() are
> reenterable.
No worries on that then.
>> Question on the contract still stands, in terms of missing a
>> notification. I guess since it's a hint basically it doesn't really
>> matter, just something that should be documented on the userspace side.
>
> Should rather be improved than documented, I'd say, but it's still
Of course, that's why I was originally asking about what the contract is
here - is it a hint, or is it more than that? In either case, should be
documented what the application can rely on. And might not be too bad to
harden, since it also really doesn't make sense to have more than one of
these inflight at the time anyway.
> better than not getting anything at all. And it's the only place
> where it can in theory be dropped, e.g. CQE overflow handling,
> though different GFP.
>
>> Do you have test cases for these?
>
> Clement needs to resend them. Actually, seems I forgot to CC Vish
> and Clement here, my bad.
Sounds good.
--
Jens Axboe
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 7/8] io_uring/zcrx: notify user on frag copy fallback
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
` (5 preceding siblings ...)
2026-05-19 11:44 ` [PATCH 6/8] io_uring/zcrx: notify user when out of buffers Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
2026-05-19 11:44 ` [PATCH 8/8] io_uring/zcrx: add shared-memory notification statistics Pavel Begunkov
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
From: Clément Léger <cleger@meta.com>
Add a ZCRX_NOTIF_COPY notification type to signal userspace when a
received fragment could not be delivered using zero-copy and was
instead copied into a buffer.
Signed-off-by: Clément Léger <cleger@meta.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/uapi/linux/io_uring/zcrx.h | 1 +
io_uring/zcrx.c | 7 ++++++-
io_uring/zcrx.h | 2 +-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index 67185566ad3c..3f7b72b09878 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -70,6 +70,7 @@ enum zcrx_features {
enum zcrx_notification_type {
ZCRX_NOTIF_NO_BUFFERS,
+ ZCRX_NOTIF_COPY,
__ZCRX_NOTIF_TYPE_LAST,
};
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 455226790553..1e7c305da0d0 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -1533,8 +1533,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
const skb_frag_t *frag, int off, int len)
{
struct page *page = skb_frag_page(frag);
+ int ret;
+
+ ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
+ if (ret > 0)
+ zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
- return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
+ return ret;
}
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index e8b7717d6adf..54d91b580eaf 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -11,7 +11,7 @@
#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE |\
ZCRX_FEATURE_NOTIFICATION)
-#define ZCRX_NOTIF_TYPE_MASK (1U << ZCRX_NOTIF_NO_BUFFERS)
+#define ZCRX_NOTIF_TYPE_MASK ((1U << ZCRX_NOTIF_NO_BUFFERS) | (1U << ZCRX_NOTIF_COPY))
struct io_zcrx_mem {
unsigned long size;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 8/8] io_uring/zcrx: add shared-memory notification statistics
2026-05-19 11:44 [PATCH 0/8] first zcrx updates for 7.2 Pavel Begunkov
` (6 preceding siblings ...)
2026-05-19 11:44 ` [PATCH 7/8] io_uring/zcrx: notify user on frag copy fallback Pavel Begunkov
@ 2026-05-19 11:44 ` Pavel Begunkov
7 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2026-05-19 11:44 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, netdev
From: Clément Léger <cleger@meta.com>
Add support for an optional stats struct embedded in the refill queue
region, allowing userspace to monitor copy-fallback in real-time.
Userspace queries the stats struct size and alignment via
IO_URING_QUERY_ZCRX_NOTIF (notif_stats_size / notif_stats_off_alignment),
then provides a stats_offset in zcrx_notification_desc pointing to a
location within the refill queue region.
The kernel updates the stats counters in-place on every copy-fallback
event.
Signed-off-by: Clément Léger <cleger@meta.com>
[pavel: rename io_uring_zcrx_notif_stats]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/uapi/linux/io_uring/query.h | 12 +++++++
include/uapi/linux/io_uring/zcrx.h | 15 ++++++--
io_uring/query.c | 16 +++++++++
io_uring/zcrx.c | 54 +++++++++++++++++++++++++++--
io_uring/zcrx.h | 1 +
5 files changed, 94 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 95500759cc13..1a68eca7c6b4 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -23,6 +23,7 @@ enum {
IO_URING_QUERY_OPCODES = 0,
IO_URING_QUERY_ZCRX = 1,
IO_URING_QUERY_SCQ = 2,
+ IO_URING_QUERY_ZCRX_NOTIF = 3,
__IO_URING_QUERY_MAX,
};
@@ -62,6 +63,17 @@ struct io_uring_query_zcrx {
__u64 __resv2;
};
+struct io_uring_query_zcrx_notif {
+ /* Bitmask of supported ZCRX_NOTIF_* flags */
+ __u32 notif_flags;
+ /* Size of io_uring_zcrx_notif_stats */
+ __u32 notif_stats_size;
+ /* Required alignment for the stats struct within the region (ie stats_offset) */
+ __u32 notif_stats_off_alignment;
+ __u32 __resv1;
+ __u64 __resv2[4];
+};
+
struct io_uring_query_scq {
/* The SQ/CQ rings header size */
__u64 hdr_size;
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index 3f7b72b09878..15c05c45ce36 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -75,11 +75,22 @@ enum zcrx_notification_type {
__ZCRX_NOTIF_TYPE_LAST,
};
+enum zcrx_notification_desc_flags {
+ /* If set, stats_offset holds a valid offset to a notif_stats struct */
+ ZCRX_NOTIF_DESC_FLAG_STATS = 1 << 0,
+};
+
+struct zcrx_notif_stats {
+ __u64 copy_count; /* cumulative copy-fallback CQEs */
+ __u64 copy_bytes; /* cumulative bytes copied */
+};
+
struct zcrx_notification_desc {
__u64 user_data;
__u32 type_mask;
- __u32 __resv1;
- __u64 __resv2[10];
+ __u32 flags; /* see enum zcrx_notification_desc_flags */
+ __u64 stats_offset; /* offset from the beginning of refill ring region for stats */
+ __u64 __resv2[9];
};
/*
diff --git a/io_uring/query.c b/io_uring/query.c
index c1704d088374..d529d94aa8f4 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -9,6 +9,7 @@
union io_query_data {
struct io_uring_query_opcode opcodes;
struct io_uring_query_zcrx zcrx;
+ struct io_uring_query_zcrx_notif zcrx_notif;
struct io_uring_query_scq scq;
};
@@ -44,6 +45,18 @@ static ssize_t io_query_zcrx(union io_query_data *data)
return sizeof(*e);
}
+static ssize_t io_query_zcrx_notif(union io_query_data *data)
+{
+ struct io_uring_query_zcrx_notif *e = &data->zcrx_notif;
+
+ e->notif_flags = ZCRX_NOTIF_TYPE_MASK;
+ e->notif_stats_size = sizeof(struct zcrx_notif_stats);
+ e->notif_stats_off_alignment = __alignof__(struct zcrx_notif_stats);
+ e->__resv1 = 0;
+ memset(&e->__resv2, 0, sizeof(e->__resv2));
+ return sizeof(*e);
+}
+
static ssize_t io_query_scq(union io_query_data *data)
{
struct io_uring_query_scq *e = &data->scq;
@@ -83,6 +96,9 @@ static int io_handle_query_entry(union io_query_data *data, void __user *uhdr,
case IO_URING_QUERY_ZCRX:
ret = io_query_zcrx(data);
break;
+ case IO_URING_QUERY_ZCRX_NOTIF:
+ ret = io_query_zcrx_notif(data);
+ break;
case IO_URING_QUERY_SCQ:
ret = io_query_scq(data);
break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 1e7c305da0d0..6d078beaf0ca 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -415,6 +415,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
io_free_region(ifq->user, &ifq->rq_region);
ifq->rq.ring = IO_URING_PTR_POISON;
ifq->rq.rqes = IO_URING_PTR_POISON;
+ ifq->notif_stats = IO_URING_PTR_POISON;
}
static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -854,6 +855,33 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
return ret;
}
+static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq,
+ const struct io_uring_zcrx_ifq_reg *reg,
+ const struct zcrx_notification_desc *notif)
+{
+ size_t stats_off = notif->stats_offset;
+ size_t used, end;
+
+ used = reg->offsets.rqes +
+ sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+
+ if (!IS_ALIGNED(stats_off, __alignof__(struct zcrx_notif_stats)))
+ return -EINVAL;
+ if (stats_off < used)
+ return -ERANGE;
+ if (check_add_overflow(stats_off,
+ sizeof(struct zcrx_notif_stats),
+ &end))
+ return -ERANGE;
+ if (end > io_region_size(&ifq->rq_region))
+ return -ERANGE;
+
+ ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off;
+ memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats));
+
+ return 0;
+}
+
int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -907,7 +935,13 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
return -EFAULT;
if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
return -EINVAL;
- if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
+ if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS)
+ return -EINVAL;
+ if (!(notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS)) {
+ if (notif.stats_offset)
+ return -EINVAL;
+ }
+ if (!mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
return -EINVAL;
ifq = io_zcrx_ifq_alloc(ctx);
@@ -938,6 +972,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
if (ret)
goto err;
+ if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) {
+ ret = zcrx_validate_notif_stats(ifq, &reg, &notif);
+ if (ret)
+ goto err;
+ }
+
ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
if (!(reg.flags & ZCRX_REG_NODEV)) {
@@ -1153,6 +1193,11 @@ static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
kmem_cache_free(req_cachep, req);
}
+static void zcrx_stat_add(__u64 *p, s64 v)
+{
+ WRITE_ONCE(*p, READ_ONCE(*p) + v);
+}
+
static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
{
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
@@ -1536,8 +1581,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
int ret;
ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
- if (ret > 0)
+ if (ret > 0) {
+ if (ifq->notif_stats) {
+ zcrx_stat_add(&ifq->notif_stats->copy_count, 1);
+ zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret);
+ }
zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
+ }
return ret;
}
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 54d91b580eaf..fa00900e479e 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -80,6 +80,7 @@ struct io_zcrx_ifq {
u32 allowed_notif_mask;
u32 fired_notifs;
u64 notif_data;
+ struct zcrx_notif_stats *notif_stats;
};
#if defined(CONFIG_IO_URING_ZCRX)
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread