Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/5] io_uring/zcrx: notify user when out of buffers
From: Clément Léger @ 2026-04-22 11:25 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: linux-doc, linux-kernel, linux-kselftest, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Jonathan Corbet, Shuah Khan, Vishwanath Seshagiri,
	Vishwanath Seshagiri
In-Reply-To: <20260422112522.3316660-1-cleger@meta.com>

From: Pavel Begunkov <asml.silence@gmail.com>

There is currently no easy way for the user to know when zcrx is out of
buffers and the page pool fails to allocate. Add uapi for zcrx to
communicate it back.

It's implemented as a separate CQE, which for now is posted to the creator
ctx. To use it, on registration the user space needs to pass an instance
of struct zcrx_notification_desc, which tells the kernel the user_data
for resulting CQEs and which event types are expected / allowed.

When an allowed event happens, zcrx will post a CQE containing the
specified user_data, and the lower bits of cqe->res will be set to the
event mask. Before the kernel can post another notification of the given
type, the user needs to acknowledge that it processed the previous one
by issuing IORING_REGISTER_ZCRX_CTRL with ZCRX_CTRL_ARM_NOTIFICATION.

The only notification type this patch implements so far is
ZCRX_NOTIF_NO_BUFFERS. The next commit adds copy fallback signaling.

Co-developed-by: Vishwanath Seshagiri <vishs@meta.com>
Signed-off-by: Vishwanath Seshagiri <vishs@meta.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/uapi/linux/io_uring/zcrx.h | 22 ++++++-
 io_uring/zcrx.c                    | 98 +++++++++++++++++++++++++++++-
 io_uring/zcrx.h                    | 11 +++-
 3 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index 5ce02c7a6096..b8596d7d47b6 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -65,6 +65,18 @@ enum zcrx_features {
 	 * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
 	 */
 	ZCRX_FEATURE_RX_PAGE_SIZE	= 1 << 0,
+	ZCRX_FEATURE_NOTIFICATION	= 1 << 1,
+};
+
+enum zcrx_notification_type {
+	ZCRX_NOTIF_NO_BUFFERS = 1 << 0,
+};
+
+struct zcrx_notification_desc {
+	__u64	user_data;
+	__u32	type_mask;
+	__u32	__resv1;
+	__u64	__resv2[10];
 };
 
 /*
@@ -82,12 +94,14 @@ struct io_uring_zcrx_ifq_reg {
 	struct io_uring_zcrx_offsets offsets;
 	__u32	zcrx_id;
 	__u32	rx_buf_len;
-	__u64	__resv[3];
+	__u64	notif_desc; /* see struct zcrx_notification_desc */
+	__u64	__resv[2];
 };
 
 enum zcrx_ctrl_op {
 	ZCRX_CTRL_FLUSH_RQ,
 	ZCRX_CTRL_EXPORT,
+	ZCRX_CTRL_ARM_NOTIFICATION,
 
 	__ZCRX_CTRL_LAST,
 };
@@ -101,6 +115,11 @@ struct zcrx_ctrl_export {
 	__u32 		__resv1[11];
 };
 
+struct zcrx_ctrl_arm_notif {
+	__u32		type_mask;
+	__u32		__resv[11];
+};
+
 struct zcrx_ctrl {
 	__u32	zcrx_id;
 	__u32	op; /* see enum zcrx_ctrl_op */
@@ -109,6 +128,7 @@ struct zcrx_ctrl {
 	union {
 		struct zcrx_ctrl_export		zc_export;
 		struct zcrx_ctrl_flush_rq	zc_flush;
+		struct zcrx_ctrl_arm_notif	zc_arm_notif;
 	};
 };
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 9a83d7eb4210..35ca28cb6583 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,6 +44,16 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
 	return container_of(owner, struct io_zcrx_area, nia);
 }
 
+static bool zcrx_set_ring_ctx(struct io_zcrx_ifq *zcrx, struct io_ring_ctx *ctx)
+{
+	guard(spinlock_bh)(&zcrx->ctx_lock);
+	if (zcrx->master_ctx)
+		return false;
+	percpu_ref_get(&ctx->refs);
+	zcrx->master_ctx = ctx;
+	return true;
+}
+
 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 {
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -531,6 +541,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 
 	ifq->if_rxq = -1;
 	spin_lock_init(&ifq->rq.lock);
+	spin_lock_init(&ifq->ctx_lock);
 	mutex_init(&ifq->pp_lock);
 	refcount_set(&ifq->refs, 1);
 	refcount_set(&ifq->user_refs, 1);
@@ -585,6 +596,11 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 	if (ifq->dev)
 		put_device(ifq->dev);
 
+	scoped_guard(spinlock_bh, &ifq->ctx_lock) {
+		if (ifq->master_ctx)
+			percpu_ref_put(&ifq->master_ctx->refs);
+	}
+
 	io_free_rbuf_ring(ifq);
 	mutex_destroy(&ifq->pp_lock);
 	kfree(ifq);
@@ -738,6 +754,8 @@ static int import_zcrx(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
 		return -EINVAL;
+	if (reg->notif_desc)
+		return -EINVAL;
 	if (reg->flags & ~ZCRX_REG_IMPORT)
 		return -EINVAL;
 
@@ -826,6 +844,7 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
 int io_register_zcrx(struct io_ring_ctx *ctx,
 		     struct io_uring_zcrx_ifq_reg __user *arg)
 {
+	struct zcrx_notification_desc notif;
 	struct io_uring_zcrx_area_reg area;
 	struct io_uring_zcrx_ifq_reg reg;
 	struct io_uring_region_desc rd;
@@ -869,10 +888,22 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
 		return -EFAULT;
 
+	memset(&notif, 0, sizeof(notif));
+	if (reg.notif_desc && copy_from_user(&notif, u64_to_user_ptr(reg.notif_desc),
+					     sizeof(notif)))
+		return -EFAULT;
+	if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
+		return -EINVAL;
+	if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
+		return -EINVAL;
+
 	ifq = io_zcrx_ifq_alloc(ctx);
 	if (!ifq)
 		return -ENOMEM;
 
+	ifq->notif_data = notif.user_data;
+	ifq->allowed_notif_mask = notif.type_mask;
+
 	if (ctx->user) {
 		get_uid(ctx->user);
 		ifq->user = ctx->user;
@@ -923,6 +954,9 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 		ret = -EFAULT;
 		goto err;
 	}
+
+	if (notif.type_mask)
+		zcrx_set_ring_ctx(ifq, ctx);
 	return 0;
 err:
 	scoped_guard(mutex, &ctx->mmap_lock)
@@ -1089,6 +1123,46 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
 	return allocated;
 }
 
+static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
+{
+	struct io_kiocb *req = tw_req.req;
+	struct io_ring_ctx *ctx = req->ctx;
+
+	io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
+	percpu_ref_put(&ctx->refs);
+	kfree_rcu(req, rcu_head);
+}
+
+static void zcrx_send_notif(struct io_zcrx_ifq *ifq, u32 type_mask)
+{
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
+	struct io_kiocb *req;
+
+	if (!(type_mask & ifq->allowed_notif_mask))
+		return;
+
+	guard(spinlock_bh)(&ifq->ctx_lock);
+	if (!ifq->master_ctx)
+		return;
+	if (type_mask & ifq->fired_notifs)
+		return;
+
+	req = kmem_cache_alloc(req_cachep, gfp);
+	if (unlikely(!req))
+		return;
+
+	ifq->fired_notifs |= type_mask;
+
+	req->opcode = IORING_OP_NOP;
+	req->cqe.user_data = ifq->notif_data;
+	req->cqe.res = type_mask;
+	req->ctx = ifq->master_ctx;
+	percpu_ref_get(&req->ctx->refs);
+	req->tctx = NULL;
+	req->io_task_work.func = zcrx_notif_tw;
+	io_req_task_work_add(req);
+}
+
 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 {
 	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
@@ -1105,8 +1179,10 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 		goto out_return;
 
 	allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
-	if (!allocated)
+	if (!allocated) {
+		zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS);
 		return 0;
+	}
 out_return:
 	zcrx_sync_for_device(pp, ifq, netmems, allocated);
 	allocated--;
@@ -1255,12 +1331,30 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
 	return 0;
 }
 
+static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			  struct zcrx_ctrl *ctrl)
+{
+	const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif;
+
+	if (an->type_mask & ~ZCRX_NOTIF_TYPE_MASK)
+		return -EINVAL;
+	if (!mem_is_zero(&an->__resv, sizeof(an->__resv)))
+		return -EINVAL;
+
+	guard(spinlock_bh)(&zcrx->ctx_lock);
+	if (an->type_mask & ~zcrx->fired_notifs)
+		return -EINVAL;
+	zcrx->fired_notifs &= ~an->type_mask;
+	return 0;
+}
+
 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 {
 	struct zcrx_ctrl ctrl;
 	struct io_zcrx_ifq *zcrx;
 
 	BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
+	BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif));
 
 	if (nr_args)
 		return -EINVAL;
@@ -1278,6 +1372,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 		return zcrx_flush_rq(ctx, zcrx, &ctrl);
 	case ZCRX_CTRL_EXPORT:
 		return zcrx_export(ctx, zcrx, &ctrl, arg);
+	case ZCRX_CTRL_ARM_NOTIFICATION:
+		return zcrx_arm_notif(ctx, zcrx, &ctrl);
 	}
 
 	return -EOPNOTSUPP;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 75e0a4e6ef6e..3ddebed06d57 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -9,7 +9,9 @@
 #include <net/net_trackers.h>
 
 #define ZCRX_SUPPORTED_REG_FLAGS	(ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
-#define ZCRX_FEATURES			(ZCRX_FEATURE_RX_PAGE_SIZE)
+#define ZCRX_FEATURES			(ZCRX_FEATURE_RX_PAGE_SIZE |\
+					 ZCRX_FEATURE_NOTIFICATION)
+#define ZCRX_NOTIF_TYPE_MASK		(ZCRX_NOTIF_NO_BUFFERS)
 
 struct io_zcrx_mem {
 	unsigned long			size;
@@ -72,6 +74,13 @@ struct io_zcrx_ifq {
 	 */
 	struct mutex			pp_lock;
 	struct io_mapped_region		rq_region;
+
+	/* Protects access to the notification context data */
+	spinlock_t			ctx_lock;
+	struct io_ring_ctx		*master_ctx;
+	u32				allowed_notif_mask;
+	u32				fired_notifs;
+	u64				notif_data;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
-- 
2.52.0


^ permalink raw reply related

* [PATCH 2/5] io_uring/zcrx: notify user on frag copy fallback
From: Clément Léger @ 2026-04-22 11:25 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260422112522.3316660-1-cleger@meta.com>

Add a ZCRX_NOTIF_COPY notification type to signal userspace when a
received fragment could not be delivered using zero-copy and was
instead copied into a buffer.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 include/uapi/linux/io_uring/zcrx.h | 1 +
 io_uring/zcrx.c                    | 7 ++++++-
 io_uring/zcrx.h                    | 3 ++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index b8596d7d47b6..e0c0079626c8 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -70,6 +70,7 @@ enum zcrx_features {
 
 enum zcrx_notification_type {
 	ZCRX_NOTIF_NO_BUFFERS = 1 << 0,
+	ZCRX_NOTIF_COPY = 1 << 1
 };
 
 struct zcrx_notification_desc {
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 35ca28cb6583..732e585aa13a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -1510,8 +1510,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			     const skb_frag_t *frag, int off, int len)
 {
 	struct page *page = skb_frag_page(frag);
+	int ret;
+
+	ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
+	if (ret > 0)
+		zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
 
-	return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
+	return ret;
 }
 
 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 3ddebed06d57..1bd63adaa711 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -11,7 +11,8 @@
 #define ZCRX_SUPPORTED_REG_FLAGS	(ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
 #define ZCRX_FEATURES			(ZCRX_FEATURE_RX_PAGE_SIZE |\
 					 ZCRX_FEATURE_NOTIFICATION)
-#define ZCRX_NOTIF_TYPE_MASK		(ZCRX_NOTIF_NO_BUFFERS)
+#define ZCRX_NOTIF_TYPE_MASK		(ZCRX_NOTIF_NO_BUFFERS |\
+					 ZCRX_NOTIF_COPY)
 
 struct io_zcrx_mem {
 	unsigned long			size;
-- 
2.52.0


^ permalink raw reply related

* [PATCH 3/5] io_uring/zcrx: add shared-memory notification statistics
From: Clément Léger @ 2026-04-22 11:25 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260422112522.3316660-1-cleger@meta.com>

Add support for an optional stats struct embedded in the refill queue
region, allowing userspace to monitor copy-fallback events (count and
bytes copied) in real time.

Userspace queries the stats struct size and alignment via
IO_URING_QUERY_ZCRX_NOTIF (notif_stats_size / notif_stats_off_alignment),
then sets ZCRX_NOTIF_DESC_FLAG_STATS and provides a stats_offset in
zcrx_notification_desc pointing to a location within the refill queue
region, past the refill ring entries.

The kernel updates the stats counters in place, using
READ_ONCE()/WRITE_ONCE() accesses rather than atomic read-modify-write
operations, on every copy-fallback event.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 include/uapi/linux/io_uring/query.h | 12 +++++++
 include/uapi/linux/io_uring/zcrx.h  | 15 +++++++--
 io_uring/query.c                    | 14 ++++++++
 io_uring/zcrx.c                     | 50 +++++++++++++++++++++++++++--
 io_uring/zcrx.h                     |  1 +
 5 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 95500759cc13..738c35c7d05c 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -23,6 +23,7 @@ enum {
 	IO_URING_QUERY_OPCODES			= 0,
 	IO_URING_QUERY_ZCRX			= 1,
 	IO_URING_QUERY_SCQ			= 2,
+	IO_URING_QUERY_ZCRX_NOTIF		= 3,
 
 	__IO_URING_QUERY_MAX,
 };
@@ -62,6 +63,17 @@ struct io_uring_query_zcrx {
 	__u64 __resv2;
 };
 
+struct io_uring_query_zcrx_notif {
+	/* Bitmask of supported ZCRX_NOTIF_* flags */
+	__u32 notif_flags;
+	/* Size of io_uring_zcrx_notif_stats */
+	__u32 notif_stats_size;
+	/* Required alignment for the stats struct within the region (ie stats_offset) */
+	__u32 notif_stats_off_alignment;
+	__u32 resv1;
+	__u64 __resv2[10];
+};
+
 struct io_uring_query_scq {
 	/* The SQ/CQ rings header size */
 	__u64 hdr_size;
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
index e0c0079626c8..ae9bbca3004c 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -73,11 +73,22 @@ enum zcrx_notification_type {
 	ZCRX_NOTIF_COPY = 1 << 1
 };
 
+enum zcrx_notification_desc_flags {
+	/* If set, stats_offset holds a valid offset to a notif_stats struct */
+	ZCRX_NOTIF_DESC_FLAG_STATS = 1 << 0,
+};
+
+struct io_uring_zcrx_notif_stats {
+	__u64	copy_count;	/* cumulative copy-fallback CQEs */
+	__u64	copy_bytes;	/* cumulative bytes copied */
+};
+
 struct zcrx_notification_desc {
 	__u64	user_data;
 	__u32	type_mask;
-	__u32	__resv1;
-	__u64	__resv2[10];
+	__u32	flags; /* see enum zcrx_notification_desc_flags */
+	__u64	stats_offset; /* offset from the beginning of refill ring region for stats */
+	__u64	__resv2[9];
 };
 
 /*
diff --git a/io_uring/query.c b/io_uring/query.c
index c1704d088374..3591106e139d 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -9,6 +9,7 @@
 union io_query_data {
 	struct io_uring_query_opcode opcodes;
 	struct io_uring_query_zcrx zcrx;
+	struct io_uring_query_zcrx_notif zcrx_notif;
 	struct io_uring_query_scq scq;
 };
 
@@ -44,6 +45,16 @@ static ssize_t io_query_zcrx(union io_query_data *data)
 	return sizeof(*e);
 }
 
+static ssize_t io_query_zcrx_notif(union io_query_data *data)
+{
+	struct io_uring_query_zcrx_notif *e = &data->zcrx_notif;
+
+	e->notif_flags = ZCRX_NOTIF_TYPE_MASK;
+	e->notif_stats_size = sizeof(struct io_uring_zcrx_notif_stats);
+	e->notif_stats_off_alignment = __alignof__(struct io_uring_zcrx_notif_stats);
+	return sizeof(*e);
+}
+
 static ssize_t io_query_scq(union io_query_data *data)
 {
 	struct io_uring_query_scq *e = &data->scq;
@@ -83,6 +94,9 @@ static int io_handle_query_entry(union io_query_data *data, void __user *uhdr,
 	case IO_URING_QUERY_ZCRX:
 		ret = io_query_zcrx(data);
 		break;
+	case IO_URING_QUERY_ZCRX_NOTIF:
+		ret = io_query_zcrx_notif(data);
+		break;
 	case IO_URING_QUERY_SCQ:
 		ret = io_query_scq(data);
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 732e585aa13a..c61f94fb14c3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -414,6 +414,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 	io_free_region(ifq->user, &ifq->rq_region);
 	ifq->rq.ring = NULL;
 	ifq->rq.rqes = NULL;
+	ifq->notif_stats = NULL;
 }
 
 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -841,6 +842,33 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
 	return ret;
 }
 
+static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq,
+				     const struct io_uring_zcrx_ifq_reg *reg,
+				     const struct zcrx_notification_desc *notif)
+{
+	size_t stats_off = notif->stats_offset;
+	size_t used, end;
+
+	used = reg->offsets.rqes +
+	       sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+
+	if (!IS_ALIGNED(stats_off, __alignof__(struct io_uring_zcrx_notif_stats)))
+		return -EINVAL;
+	if (stats_off < used)
+		return -ERANGE;
+	if (check_add_overflow(stats_off,
+			       sizeof(struct io_uring_zcrx_notif_stats),
+			       &end))
+		return -ERANGE;
+	if (end > io_region_size(&ifq->rq_region))
+		return -ERANGE;
+
+	ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off;
+	memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats));
+
+	return 0;
+}
+
 int io_register_zcrx(struct io_ring_ctx *ctx,
 		     struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -894,7 +922,9 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
 		return -EINVAL;
-	if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
+	if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS)
+		return -EINVAL;
+	if (!mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
 		return -EINVAL;
 
 	ifq = io_zcrx_ifq_alloc(ctx);
@@ -925,6 +955,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
 	if (ret)
 		goto err;
 
+	if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) {
+		ret = zcrx_validate_notif_stats(ifq, &reg, &notif);
+		if (ret)
+			goto err;
+	}
+
 	ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
 
 	if (!(reg.flags & ZCRX_REG_NODEV)) {
@@ -1133,6 +1169,11 @@ static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
 	kfree_rcu(req, rcu_head);
 }
 
+static void zcrx_stat_add(__u64 *p, s64 v)
+{
+	WRITE_ONCE(*p, READ_ONCE(*p) + v);
+}
+
 static void zcrx_send_notif(struct io_zcrx_ifq *ifq, u32 type_mask)
 {
 	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
@@ -1513,8 +1554,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 	int ret;
 
 	ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
-	if (ret > 0)
+	if (ret > 0) {
+		if (ifq->notif_stats) {
+			zcrx_stat_add(&ifq->notif_stats->copy_count, 1);
+			zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret);
+		}
 		zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
+	}
 
 	return ret;
 }
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 1bd63adaa711..0dcf486ff530 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -82,6 +82,7 @@ struct io_zcrx_ifq {
 	u32				allowed_notif_mask;
 	u32				fired_notifs;
 	u64				notif_data;
+	struct io_uring_zcrx_notif_stats *notif_stats;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
-- 
2.52.0


^ permalink raw reply related

* [PATCH 4/5] Documentation: networking: document zcrx notifications and statistics
From: Clément Léger @ 2026-04-22 11:25 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260422112522.3316660-1-cleger@meta.com>

Document the zcrx notification system and shared-memory statistics
that were introduced to let userspace monitor zero-copy receive health.
The notification section covers the two notification types
(ZCRX_NOTIF_NO_BUFFERS, ZCRX_NOTIF_COPY), registration via
zcrx_notification_desc, and the fire-once / re-arm mechanism via
ZCRX_CTRL_ARM_NOTIFICATION. The statistics section covers the optional
shared-memory io_uring_zcrx_notif_stats structure placed in the refill
ring region, including how to query its layout via
IO_URING_QUERY_ZCRX_NOTIF.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 Documentation/networking/iou-zcrx.rst | 106 ++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst
index 7f3f4b2e6cf2..b17205fe55aa 100644
--- a/Documentation/networking/iou-zcrx.rst
+++ b/Documentation/networking/iou-zcrx.rst
@@ -196,6 +196,112 @@ Return buffers back to the kernel to be used again::
   rqe->len = cqe->res;
   IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail);
 
+Notifications
+-------------
+
+When zero-copy receive encounters conditions that affect performance or
+functionality, the kernel can notify userspace via dedicated CQE notifications.
+The application must register a notification descriptor during
+``IORING_REGISTER_ZCRX_IFQ`` to receive them.
+
+Supported features can be detected by checking for ``ZCRX_FEATURE_NOTIFICATION``
+in the features bitmask returned by ``IO_URING_QUERY_ZCRX``.
+
+**Notification types**
+
+``ZCRX_NOTIF_NO_BUFFERS``
+  Fired when the page pool fails to allocate because the zcrx buffer area is
+  exhausted.
+
+``ZCRX_NOTIF_COPY``
+  Fired when a received fragment could not be delivered zero-copy and was
+  instead copied into a buffer.
+
+**Registering notifications**
+
+Allocate and fill a ``struct zcrx_notification_desc``::
+
+  struct zcrx_notification_desc notif = {
+    .user_data = MY_NOTIF_USER_DATA,
+    .type_mask = ZCRX_NOTIF_NO_BUFFERS | ZCRX_NOTIF_COPY,
+  };
+
+  reg.notif_desc = (__u64)(unsigned long)&notif;
+
+``user_data`` is the value that will appear in the notification CQE's
+``user_data`` field. ``type_mask`` selects which notification types the
+application wants to receive.
+
+When a registered event occurs, the kernel posts a CQE with the specified
+``user_data`` and ``cqe->res`` set to a bitmask of the triggered notification
+types.
+
+**Rate limiting**
+
+Each notification type fires once until the application explicitly re-arms it.
+To re-arm, issue ``IORING_REGISTER_ZCRX_CTRL`` with
+``ZCRX_CTRL_ARM_NOTIFICATION``::
+
+  struct zcrx_ctrl ctrl = {
+    .zcrx_id = zcrx_id,
+    .op = ZCRX_CTRL_ARM_NOTIFICATION,
+    .zc_arm_notif = {
+      .type_mask = ZCRX_NOTIF_NO_BUFFERS | ZCRX_NOTIF_COPY,
+    },
+  };
+
+  io_uring_register(ring_fd, IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
+
+Only notification types that have previously fired can be re-armed.
+
+Notification statistics
+-----------------------
+
+In addition to CQE-based notifications, the kernel can maintain a shared-memory
+statistics structure that is updated on every relevant event. All stats are
+updated regardless of which notification flags were registered.
+
+The statistics structure size and alignment requirements can be queried via
+``IO_URING_QUERY_ZCRX_NOTIF``. The application must use these values so that
+it allocates enough memory for the region to fit both the refill ring and the
+stats structure, and places the structure at a suitably aligned offset.
+
+To enable statistics, place the stats structure after the refill ring entries
+within the same mapped region, and set the ``ZCRX_NOTIF_DESC_FLAG_STATS`` flag
+in the notification descriptor::
+
+  /* Compute offset for the stats struct (after refill ring entries) */
+  size_t stats_offset = ring_size;
+  ring_size += ALIGN_UP(sizeof(struct io_uring_zcrx_notif_stats), PAGE_SIZE);
+
+  /* Map the region with the extra space */
+  ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+  struct zcrx_notification_desc notif = {
+    .user_data = MY_NOTIF_USER_DATA,
+    .type_mask = ZCRX_NOTIF_COPY,
+    .flags = ZCRX_NOTIF_DESC_FLAG_STATS,
+    .stats_offset = stats_offset,
+  };
+
+The ``stats_offset`` must satisfy the alignment reported by
+``notif_stats_off_alignment`` and must point to a location within the mapped
+region that does not overlap with the refill ring header or entries.
+
+Applications can read the stat counters at any time::
+
+  volatile struct io_uring_zcrx_notif_stats *stats =
+    (void *)((char *)ring_ptr + stats_offset);
+
+  printf("copy fallbacks: %llu (%llu bytes)\n",
+         IO_URING_READ_ONCE(stats->copy_count),
+	 IO_URING_READ_ONCE(stats->copy_bytes));
+
+``copy_count`` is incremented each time a fragment is copied instead of being
+delivered via zero-copy. ``copy_bytes`` accumulates the total number of bytes
+copied.
+
 Area chunking
 -------------
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH 5/5] selftests: iou-zcrx: add notification and stats test for zcrx
From: Clément Léger @ 2026-04-22 11:25 UTC (permalink / raw)
  To: io-uring, Pavel Begunkov, Jens Axboe
  Cc: Clément Léger, linux-doc, linux-kernel, linux-kselftest,
	netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Vishwanath Seshagiri
In-Reply-To: <20260422112522.3316660-1-cleger@meta.com>

Add a selftest to verify that ZCRX notifications are properly delivered
to userspace when zero-copy RX falls back to copying or runs out of
buffers, and that the shared-memory notification stats (copy_count,
copy_bytes) are correctly incremented on copy fallback.

The test registers a notification descriptor during
IORING_REGISTER_ZCRX_IFQ with a stats region placed after the refill
queue entries. A new -n flag verifies that the copy fallback is
triggered, and the -b/-a flags allow checking for the out-of-buffers
notification.

To reliably trigger copy fallback, the Python test uses a new
single_no_flow() setup variant that configures tcp-data-split and RSS
but without ethtool flow rule. Without flow steering, traffic arrives
on non-zcrx queues as regular pages, forcing the kernel copy-fallback
path in io_zcrx_copy_frag().

The out-of-buffers notification is verified by using a smaller receive
area and by not recycling the buffers, so that the kernel runs out of
buffers quickly.

Signed-off-by: Clément Léger <cleger@meta.com>
---
 .../selftests/drivers/net/hw/iou-zcrx.c       | 112 ++++++++++++++++--
 .../selftests/drivers/net/hw/iou-zcrx.py      |  49 +++++++-
 2 files changed, 149 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 240d13dbc54e..3c95e6460c24 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -52,7 +52,27 @@ struct t_io_uring_zcrx_ifq_reg {
 	struct io_uring_zcrx_offsets offsets;
 	__u32	zcrx_id;
 	__u32	rx_buf_len;
-	__u64	__resv[3];
+	__u64	notif_desc;
+	__u64	__resv[2];
+};
+
+#define ZCRX_NOTIF_NO_BUFFERS		(1 << 0)
+#define ZCRX_NOTIF_COPY			(1 << 1)
+#define ZCRX_NOTIF_DESC_FLAG_STATS	(1 << 0)
+
+#define NOTIF_USER_DATA			3
+
+struct t_zcrx_notification_desc {
+	__u64	user_data;
+	__u32	type_mask;
+	__u32	flags;
+	__u64	stats_offset;
+	__u64	__resv2[9];
+};
+
+struct t_io_uring_zcrx_notif_stats {
+	__u64	copy_count;
+	__u64	copy_bytes;
 };
 
 static long page_size;
@@ -84,7 +104,10 @@ static int cfg_oneshot_recvs;
 static int cfg_send_size = SEND_SIZE;
 static struct sockaddr_in6 cfg_addr;
 static unsigned int cfg_rx_buf_len;
+static size_t cfg_area_size;
 static bool cfg_dry_run;
+static bool cfg_copy_fallback;
+static bool cfg_no_buffers;
 
 static char *payload;
 static void *area_ptr;
@@ -95,6 +118,8 @@ static unsigned long area_token;
 static int connfd;
 static bool stop;
 static size_t received;
+static unsigned int notif_received_mask;
+static size_t notif_stats_offset;
 
 static unsigned long gettimeofday_ms(void)
 {
@@ -142,6 +167,7 @@ static void setup_zcrx(struct io_uring *ring)
 {
 	unsigned int ifindex;
 	unsigned int rq_entries = 4096;
+	size_t area_size = cfg_area_size ? cfg_area_size : AREA_SIZE;
 	int ret;
 
 	ifindex = if_nametoindex(cfg_ifname);
@@ -150,7 +176,7 @@ static void setup_zcrx(struct io_uring *ring)
 
 	if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) {
 		area_ptr = mmap(NULL,
-				AREA_SIZE,
+				area_size,
 				PROT_READ | PROT_WRITE,
 				MAP_ANONYMOUS | MAP_PRIVATE |
 				MAP_HUGETLB | MAP_HUGE_2MB,
@@ -162,7 +188,7 @@ static void setup_zcrx(struct io_uring *ring)
 		}
 	} else {
 		area_ptr = mmap(NULL,
-				AREA_SIZE,
+				area_size,
 				PROT_READ | PROT_WRITE,
 				MAP_ANONYMOUS | MAP_PRIVATE,
 				0,
@@ -172,6 +198,12 @@ static void setup_zcrx(struct io_uring *ring)
 	}
 
 	ring_size = get_refill_ring_size(rq_entries);
+
+	if (cfg_copy_fallback) {
+		notif_stats_offset = ring_size;
+		ring_size += ALIGN_UP(sizeof(struct t_io_uring_zcrx_notif_stats), page_size);
+	}
+
 	ring_ptr = mmap(NULL,
 			ring_size,
 			PROT_READ | PROT_WRITE,
@@ -187,10 +219,11 @@ static void setup_zcrx(struct io_uring *ring)
 
 	struct io_uring_zcrx_area_reg area_reg = {
 		.addr = (__u64)(unsigned long)area_ptr,
-		.len = AREA_SIZE,
+		.len = area_size,
 		.flags = 0,
 	};
 
+	struct t_zcrx_notification_desc notif_desc;
 	struct t_io_uring_zcrx_ifq_reg reg = {
 		.if_idx = ifindex,
 		.if_rxq = cfg_queue_id,
@@ -200,11 +233,32 @@ static void setup_zcrx(struct io_uring *ring)
 		.rx_buf_len = cfg_rx_buf_len,
 	};
 
+	if (cfg_copy_fallback || cfg_no_buffers) {
+		__u32 type_mask = 0;
+
+		if (cfg_copy_fallback)
+			type_mask = ZCRX_NOTIF_COPY;
+		if (cfg_no_buffers)
+			type_mask = ZCRX_NOTIF_NO_BUFFERS;
+
+		memset(&notif_desc, 0, sizeof(notif_desc));
+		notif_desc.user_data = NOTIF_USER_DATA;
+		notif_desc.type_mask = type_mask;
+		if (cfg_copy_fallback) {
+			notif_desc.flags = ZCRX_NOTIF_DESC_FLAG_STATS;
+			notif_desc.stats_offset = notif_stats_offset;
+		}
+		reg.notif_desc = (__u64)(unsigned long)&notif_desc;
+	}
+
 	ret = io_uring_register_ifq(ring, (void *)&reg);
 	if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP ||
 			       ret == -ERANGE)) {
 		printf("Large chunks are not supported %i\n", ret);
 		exit(SKIP_CODE);
+	} else if ((cfg_copy_fallback || cfg_no_buffers) && ret == -EINVAL) {
+		printf("Notifications not supported %i\n", ret);
+		exit(SKIP_CODE);
 	} else if (ret) {
 		error(1, 0, "io_uring_register_ifq(): %d", ret);
 	}
@@ -304,10 +358,13 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
 	}
 	received += n;
 
-	rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)];
-	rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token;
-	rqe->len = cqe->res;
-	io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail);
+	/* Skip ring refill so that we run out of buffers quickly */
+	if (!cfg_no_buffers) {
+		rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)];
+		rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token;
+		rqe->len = cqe->res;
+		io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail);
+	}
 }
 
 static void server_loop(struct io_uring *ring)
@@ -324,8 +381,15 @@ static void server_loop(struct io_uring *ring)
 			process_accept(ring, cqe);
 		else if (cqe->user_data == 2)
 			process_recvzc(ring, cqe);
-		else
+		else if ((cfg_copy_fallback || cfg_no_buffers) &&
+			 cqe->user_data == NOTIF_USER_DATA) {
+			notif_received_mask |= cqe->res;
+			if (cfg_no_buffers &&
+			    (cqe->res & ZCRX_NOTIF_NO_BUFFERS))
+				stop = true;
+		} else {
 			error(1, 0, "unknown cqe");
+		}
 		count++;
 	}
 	io_uring_cq_advance(ring, count);
@@ -374,6 +438,23 @@ static void run_server(void)
 
 	if (!stop)
 		error(1, 0, "test failed\n");
+
+	if (cfg_copy_fallback) {
+		struct t_io_uring_zcrx_notif_stats *stats =
+			(void *)((char *)ring_ptr + notif_stats_offset);
+
+		if (!(notif_received_mask & ZCRX_NOTIF_COPY))
+			error(1, 0, "expected copy fallback notification");
+		if (!IO_URING_READ_ONCE(stats->copy_count))
+			error(1, 0, "expected copy_count > 0");
+		if (!IO_URING_READ_ONCE(stats->copy_bytes))
+			error(1, 0, "expected copy_bytes > 0");
+	}
+
+	if (cfg_no_buffers) {
+		if (!(notif_received_mask & ZCRX_NOTIF_NO_BUFFERS))
+			error(1, 0, "expected no-buffers notification");
+	}
 }
 
 static void run_client(void)
@@ -425,7 +506,7 @@ static void parse_opts(int argc, char **argv)
 		usage(argv[0]);
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) {
+	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:a:dnb")) != -1) {
 		switch (c) {
 		case 's':
 			if (cfg_client)
@@ -466,8 +547,19 @@ static void parse_opts(int argc, char **argv)
 		case 'd':
 			cfg_dry_run = true;
 			break;
+		case 'n':
+			cfg_copy_fallback = true;
+			break;
+		case 'b':
+			cfg_no_buffers = true;
+			break;
+		case 'a':
+			cfg_area_size = strtoul(optarg, NULL, 0) * page_size;
+			break;
 		}
 	}
+	if (cfg_copy_fallback && cfg_no_buffers)
+		error(1, 0, "Pass one of -n or -b");
 
 	if (cfg_server && addr)
 		error(1, 0, "Receiver cannot have -h specified");
diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
index e81724cb5542..f7f1cbff5959 100755
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
@@ -41,7 +41,9 @@ def set_flow_rule_rss(cfg, rss_ctx_id):
     return int(values)
 
 
-def single(cfg):
+def single_no_flow(cfg):
+    """Like single() but without a flow rule."""
+
     channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
     channels = channels['combined-count']
     if channels < 2:
@@ -65,6 +67,9 @@ def single(cfg):
     ethtool(f"-X {cfg.ifname} equal {cfg.target}")
     defer(ethtool, f"-X {cfg.ifname} default")
 
+def single(cfg):
+    single_no_flow(cfg)
+
     flow_rule_id = set_flow_rule(cfg)
     defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
 
@@ -130,6 +135,26 @@ def test_zcrx_oneshot(cfg, setup) -> None:
         cmd(tx_cmd, host=cfg.remote)
 
 
+@ksft_variants([
+    KsftNamedVariant("single", single_no_flow),
+])
+def test_zcrx_notif(cfg, setup) -> None:
+    """Test zcrx copy fallback notification.
+
+    Omits the flow rule so traffic arrives on non-zcrx queues as regular
+    pages, forcing the kernel copy-fallback path. Asserts that the
+    ZCRX_NOTIF_COPY notification CQE is delivered."""
+
+    cfg.require_ipver('6')
+
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -n"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(cfg.port, proto="tcp")
+        cmd(tx_cmd, host=cfg.remote)
+
+
 def test_zcrx_large_chunks(cfg) -> None:
     """Test zcrx with large buffer chunks."""
 
@@ -157,6 +182,25 @@ def test_zcrx_large_chunks(cfg) -> None:
         cmd(tx_cmd, host=cfg.remote)
 
 
+@ksft_variants([
+    KsftNamedVariant("single", single),
+])
+def test_zcrx_notif_no_buffers(cfg, setup) -> None:
+    """Test zcrx out-of-buffer notification.
+
+    Skips buffer refill so the pool is quickly exhausted, triggering
+    a ZCRX_NOTIF_NO_BUFFERS notification CQE."""
+
+    cfg.require_ipver('6')
+
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -b -a 64"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(cfg.port, proto="tcp")
+        cmd(tx_cmd, host=cfg.remote, fail=False)
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__) as cfg:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
@@ -166,7 +210,8 @@ def main() -> None:
         cfg.netnl = NetdevFamily()
         cfg.port = rand_port()
         ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot,
-                                        test_zcrx_large_chunks], args=(cfg, ))
+                                        test_zcrx_large_chunks, test_zcrx_notif,
+                                        test_zcrx_notif_no_buffers], args=(cfg, ))
     ksft_exit()
 
 
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH] ipv6: udp: fix memory leak in udpv6_sendmsg error path
From: Sabrina Dubroca @ 2026-04-22 11:55 UTC (permalink / raw)
  To: Mingyu Wang
  Cc: willemdebruijn.kernel, davem, dsahern, edumazet, kuba, pabeni,
	horms, netdev, linux-kernel
In-Reply-To: <20260422105802.486216-1-25181214217@stu.xidian.edu.cn>

2026-04-22, 18:58:02 +0800, Mingyu Wang wrote:
> During fuzzing with failslab enabled, a memory leak was observed in the
> IPv6 UDP send path.
> 
> When sending via the lockless fast path (!corkreq), udpv6_sendmsg()
> calls ip6_make_skb() and assumes that the routing entry (dst_entry)
> reference has been stolen by the callee. However, if ip6_make_skb()
> fails early (e.g., due to an ENOMEM from memory allocation failure),
> it returns an error pointer without consuming the dst reference.

Not in all cases? If ip6_setup_cork() fails, we call
ip6_cork_release() which will release the dst. The MSG_PROBE path also
releases the dst. __ip6_flush_pending_frames() also looks like it does
that.

> Since udpv6_sendmsg() unconditionally jumps to the 'out_no_dst' label,
> the unconsumed dst_entry is never released, resulting in a memory leak.
> 
> Fix this by explicitly calling dst_release(dst) when ip6_make_skb()
> returns an error.
> 
> Signed-off-by: Mingyu Wang <25181214217@stu.xidian.edu.cn>

And this is missing a Fixes tag.

>  net/ipv6/udp.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 15e032194ecc..b83ecfd729af 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1706,8 +1706,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  				   dst_rt6_info(dst),
>  				   msg->msg_flags, &cork);
>  		err = PTR_ERR(skb);
> -		if (!IS_ERR_OR_NULL(skb))
> +		if (!IS_ERR_OR_NULL(skb)) {
>  			err = udp_v6_send_skb(skb, fl6, &cork.base);
> +		} else {
> +			dst_release(dst);
> +		}
>  		/* ip6_make_skb steals dst reference */

This comment becomes really confusing after your patch.

>  		goto out_no_dst;
>  	}
> -- 
> 2.34.1
> 
> 

-- 
Sabrina

^ permalink raw reply

* Re: [PATCH net-next] Documentation: net/smc: correct old value of smcr_max_recv_wr
From: Alexandra Winter @ 2026-04-22 12:08 UTC (permalink / raw)
  To: Mahanta Jambigi, andrew+netdev, davem, edumazet, kuba, pabeni,
	alibuda, dust.li, sidraya, wenjia
  Cc: pasic, horms, tonylu, guwen, netdev, linux-s390
In-Reply-To: <20260422085159.459678-1-mjambigi@linux.ibm.com>

net-next is closed!

On 22.04.26 10:51, Mahanta Jambigi wrote:
> The smc-sysctl.rst documentation incorrectly stated that the previous
> hardcoded maximum number of WR buffers on the receive path (smcr_max_recv_wr)
> was 16. The correct historical value used before the introduction of the sysctl
> control was 48. Update the documentation to reflect the accurate default value.

s/default/historical/
or remove the last sentence

> 
> Fixes: aef3cdb47bbb net/smc: make wr buffer count configurable
> Reviewed-by: Halil Pasic <pasic@linux.ibm.com>
> Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
> Signed-off-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
> ---
>  Documentation/networking/smc-sysctl.rst | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
> index 904a910f198e..279d15e61899 100644
> --- a/Documentation/networking/smc-sysctl.rst
> +++ b/Documentation/networking/smc-sysctl.rst
> @@ -100,14 +100,14 @@ smcr_max_recv_wr - INTEGER
>  	depending on the workload it can be a bottleneck in a sense that threads
>  	have to wait for work request buffers to become available. Before the
>  	introduction of this control the maximal number of work request buffers
> -	available on the receive path used to be hard coded to 16. With this control
> +	available on the receive path used to be hard coded to 48. With this control
>  	it becomes configurable. The acceptable range is between 2 and 2048.
>  
>  	Please be aware that all the buffers need to be allocated as a physically
>  	continuous array in which each element is a single buffer and has the size
>  	of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails, we keep retrying
>  	with half of the buffer count until it is ether successful or (unlikely)
		typo in old documentation: s/ether/either/g

> -	we dip below the old hard coded value which is 16 where we give up much
> +	we dip below the old hard coded value which is 48 where we give up much
>  	like before having this control.
>  
>  	Default: 48


^ permalink raw reply

* [PATCH] mptcp: do not drop partial packets
From: Shardul Bankar @ 2026-04-22 12:09 UTC (permalink / raw)
  To: matttbe, martineau
  Cc: geliang, pabeni, davem, edumazet, kuba, horms, netdev, mptcp,
	linux-kernel, janak, kalpan.jani, shardulsb08, Shardul Bankar

When a packet arrives with map_seq < ack_seq < end_seq, the beginning
of the packet has already been acknowledged but the end contains new
data.  Currently the entire packet is dropped as "old data," forcing
the sender to retransmit.

Instead, skip the already-acked bytes by adjusting the skb offset and
enqueue only the new portion.  Update bytes_received and ack_seq to
reflect the new data consumed.

A previous attempt at this fix (commit 1d2ce718811a ("mptcp: do not
drop partial packets"), reverted in commit bf39160c4218 ("Revert
"mptcp: do not drop partial packets"")) also added a zero-window
check and changed rcv_wnd_sent initialization, which caused test
regressions.  This version addresses only the partial packet handling
without modifying receive window accounting.

Fixes: ab174ad8ef76 ("mptcp: move ooo skbs into msk out of order queue.")
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/600
Signed-off-by: Shardul Bankar <shardul.b@mpiricsoftware.com>
---
 net/mptcp/protocol.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 614c3f583ca0..6858e6e283e3 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -397,12 +397,27 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
 		return false;
 	}
 
-	/* old data, keep it simple and drop the whole pkt, sender
-	 * will retransmit as needed, if needed.
+	/* Completely old data? */
+	if (!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq)) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+		mptcp_drop(sk, skb);
+		return false;
+	}
+
+	/* Partial packet: map_seq < ack_seq < end_seq.
+	 * Skip the already-acked bytes and enqueue the new data.
 	 */
-	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
-	mptcp_drop(sk, skb);
-	return false;
+	copy_len = MPTCP_SKB_CB(skb)->end_seq - msk->ack_seq;
+	MPTCP_SKB_CB(skb)->offset += msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+	msk->bytes_received += copy_len;
+	WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+	tail = skb_peek_tail(&sk->sk_receive_queue);
+	if (tail && mptcp_try_coalesce(sk, tail, skb))
+		return true;
+
+	skb_set_owner_r(skb, sk);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	return true;
 }
 
 static void mptcp_stop_rtx_timer(struct sock *sk)
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net 00/18] Remove a number of ISA and PCMCIA Ethernet drivers
From: Andrew Lunn @ 2026-04-22 12:11 UTC (permalink / raw)
  To: Byron Stanoszek
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	linux-kernel, netdev, linux-doc
In-Reply-To: <9a0bc592-fb74-f646-1752-4359c0ac31a2@polinggroup.com>

On Tue, Apr 21, 2026 at 11:03:28PM -0400, Byron Stanoszek wrote:
> On Wed, 22 Apr 2026, Andrew Lunn wrote:
> > 
> > Could you live with v6.18, which has an expected EOL of December 2028?
> > If you are only updating once per year, security is not an issue, you
> > just want stability.
> 
> I could for the time being, but this hasn't worked for me in the past. Usually
> what happens is the PC breaks down, and the customer swaps in a new
> backplane+SBC and moves all their PCI cards over. I then find I need to update
> the kernel just to get the Intel DRM to work properly on the new CPU. Some of
> these systems were installed back in the Linux 2.6 era, so I've gone through
> several "Intel DRM not working" steps ever since CPUs started getting
> integrated graphics. 2028 will come fast.

Hi Byron

I will drop this driver from the patchset.

	Andrew

^ permalink raw reply

* Re: [PATCH net 11/18] drivers: net: cirrus: cs89x0: Remove this driver
From: Andrew Lunn @ 2026-04-22 12:13 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	linux-kernel, netdev, linux-doc
In-Reply-To: <CAMuHMdWz=ucmKxHXmzKj=oTn6yMVxPnkNxtG6X2C3ts_ZCg4Cw@mail.gmail.com>

> > -config CS89x0_PLATFORM
> > -       tristate "CS89x0 platform driver support"
> > -       depends on ARM || (COMPILE_TEST && !PPC)
> > -       select CS89x0
> > -       help
> > -         Say Y to compile the cs89x0 platform driver. This makes this driver
> > -         suitable for use on certain evaluation boards such as the iMX21ADS.
> > -
> > -         To compile this driver as a module, choose M here. The module
> > -         will be called cs89x0.
> 
> This is the more modern DT-based part...

No dependency on OF?

> However, no users of these compatible values ever appeared upstream.

Thanks for the information. That helps with the removal.

       Andrew

^ permalink raw reply

* Re: [PATCH net 14/18] drivers: net: xircom: xirc2ps: Remove this driver
From: Andrew Lunn @ 2026-04-22 12:15 UTC (permalink / raw)
  To: Michael Fritscher
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Jonathan Corbet, Shuah Khan, linux-kernel, netdev,
	linux-doc
In-Reply-To: <73e3a34c-f1dc-403b-b007-18ff85d66ea1@fritscher.net>

On Wed, Apr 22, 2026 at 08:21:23AM +0200, Michael Fritscher wrote:
> Good day,
> 
> actually, I do use Xircom PCMCIA network cards (yes, the 16 bit ones) on
> Lenovo X60/X61 laptops as a second LAN card for server maintenances with
> current 64 bit distros (e.g. Debian Trixie, which I plan to update to
> Trixie+1 when available). Why? Because I have them and they are working ;-)
 
Hi Michael

I will drop this from the patchset for the moment.

Would you be willing to take up the Maintainer role for it?

	Andrew


^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH iwl-next] ice: init desired_dcbx_cfg in default DCB config
From: Arland, ArpanaX @ 2026-04-22 12:16 UTC (permalink / raw)
  To: Loktionov, Aleksandr, intel-wired-lan@lists.osuosl.org,
	Nguyen, Anthony L, Loktionov, Aleksandr
  Cc: netdev@vger.kernel.org, Czapnik, Lukasz
In-Reply-To: <20260320050541.422592-1-aleksandr.loktionov@intel.com>

> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Aleksandr Loktionov
> Sent: Friday, March 20, 2026 10:36 AM
> To: intel-wired-lan@lists.osuosl.org; Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Loktionov, Aleksandr <aleksandr.loktionov@intel.com>
> Cc: netdev@vger.kernel.org; Czapnik, Lukasz <lukasz.czapnik@intel.com>
> Subject: [Intel-wired-lan] [PATCH iwl-next] ice: init desired_dcbx_cfg in default DCB config
>
> From: Lukasz Czapnik <lukasz.czapnik@intel.com>
>
> When DCBX is disabled in firmware the driver falls back to software LLDP mode and applies a default DCB configuration via ice_dcb_sw_dflt_cfg().
> This function properly initializes local_dcbx_cfg with valid parameters including etscfg.maxtcs from hardware capabilities. However, desired_dcbx_cfg was never initialized in this path.
>
> All DCB netlink functions (ice_dcbnl_setpfc, ice_dcbnl_setets, etc.) use desired_dcbx_cfg as the base configuration for user-requested changes.
> When desired_dcbx_cfg remains uninitialized with etscfg.maxtcs=0, the firmware rejects the configuration for 4+ port NICs, causing DCB configuration commands to fail.
>
> It is not a problem for 1 or 2 port NICs where we support 8 TCs - in that case FW accepts maxtc=0, treating it as 8.
>
> Fix it by copying local_dcbx_cfg (which was freshly initialized) into desired_dcbx_cfg after the default config is applied.
>
> Fixes: b94b013eb626 ("ice: Implement DCBNL support")
> Signed-off-by: Lukasz Czapnik <lukasz.czapnik@intel.com>
> Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> ---
>  drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 3 +++
>  1 file changed, 3 insertions(+)
> 

Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)


^ permalink raw reply

* Re: [PATCH net 11/18] drivers: net: cirrus: cs89x0: Remove this driver
From: Geert Uytterhoeven @ 2026-04-22 12:17 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	linux-kernel, netdev, linux-doc
In-Reply-To: <c6696785-6f1f-4747-996a-d86a60a23d0b@lunn.ch>

Hi Andrew,

On Wed, 22 Apr 2026 at 14:13, Andrew Lunn <andrew@lunn.ch> wrote:
> > > -config CS89x0_PLATFORM
> > > -       tristate "CS89x0 platform driver support"
> > > -       depends on ARM || (COMPILE_TEST && !PPC)
> > > -       select CS89x0
> > > -       help
> > > -         Say Y to compile the cs89x0 platform driver. This makes this driver
> > > -         suitable for use on certain evaluation boards such as the iMX21ADS.
> > > -
> > > -         To compile this driver as a module, choose M here. The module
> > > -         will be called cs89x0.
> >
> > This is the more modern DT-based part...
>
> No dependency on OF?

I guess no one bothered to add it, as it presumably builds fine without.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH iwl-next] ice: reduce loglevel to debug for 'Can't delete DSCP' message
From: Arland, ArpanaX @ 2026-04-22 12:17 UTC (permalink / raw)
  To: Loktionov, Aleksandr, intel-wired-lan@lists.osuosl.org,
	Nguyen, Anthony L, Loktionov, Aleksandr
  Cc: netdev@vger.kernel.org
In-Reply-To: <20260320050548.422684-1-aleksandr.loktionov@intel.com>

> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of Aleksandr Loktionov
> Sent: Friday, March 20, 2026 10:36 AM
> To: intel-wired-lan@lists.osuosl.org; Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Loktionov, Aleksandr <aleksandr.loktionov@intel.com>
> Cc: netdev@vger.kernel.org
> Subject: [Intel-wired-lan] [PATCH iwl-next] ice: reduce loglevel to debug for 'Can't delete DSCP' message
>
> From: Grzegorz Nitka <grzegorz.nitka@intel.com>
>
> Reduce netdev message "Can't delete DSCP netlink app ..." from error to debug level when in FW (firmware) DCB mode.
> This is not a real error and a kind of expected behavior. The device is fully operational and error message might be wrongly interpreted by the user.
>
> Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
> Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> ---
>  drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>

Tested-by: Arpana Arland <arpanax.arland@intel.com> (A Contingent worker at Intel)


^ permalink raw reply

* [PATCH net 0/1] pull request: fixes for ovpn 2026-04-22
From: Antonio Quartulli @ 2026-04-22 12:32 UTC (permalink / raw)
  To: netdev
  Cc: Antonio Quartulli, Sabrina Dubroca, Ralf Lici, Jakub Kicinski,
	Paolo Abeni, Andrew Lunn, David S. Miller, Eric Dumazet

Hello netdev team,

This PR is a new version of the one sent on March 20th.
We are trying to address a remote, yet possible, race condition between
tearing down an ovpn interface and adding new peers via netlink.

After further discussion including Jakub and Sabrina, the following
patch came up.

It should address all raised concerns.
Sorry for taking so long to get back to this.

Please pull or let me know of any issue!

Thanks a lot,
	Antonio


The following changes since commit 478ed6b7d2577439c610f91fa8759a4c878a4264:

  net/sched: sch_dualpi2: drain both C-queue and L-queue in dualpi2_change() (2026-04-21 15:00:39 +0200)

are available in the Git repository at:

  https://github.com/OpenVPN/ovpn-net-next.git tags/ovpn-net-20260422

for you to fetch changes up to eac0dfc41d179d5f78d6d7ee401c63f48ba04b5a:

  ovpn: fix race between deleting interface and adding new peer (2026-04-22 14:30:17 +0200)

----------------------------------------------------------------
Included change:
* fix race condition between interface teardown and new peer being
  added via netlink

----------------------------------------------------------------
Antonio Quartulli (1):
      ovpn: fix race between deleting interface and adding new peer

 drivers/net/ovpn/main.c | 12 ++----------
 drivers/net/ovpn/peer.c |  8 ++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

^ permalink raw reply

* [PATCH net 1/1] ovpn: fix race between deleting interface and adding new peer
From: Antonio Quartulli @ 2026-04-22 12:32 UTC (permalink / raw)
  To: netdev
  Cc: Antonio Quartulli, Sabrina Dubroca, Ralf Lici, Jakub Kicinski,
	Paolo Abeni, Andrew Lunn, David S. Miller, Eric Dumazet,
	Hyunwoo Kim
In-Reply-To: <20260422123242.530882-1-antonio@openvpn.net>

While deleting an existing ovpn interface, there is a very
narrow window where adding a new peer via netlink may cause
the netdevice to hang and prevent its unregistration.

It may happen during ovpn_dellink(), when all existing peers are
freed and the device is queued for deregistration, but a
CMD_PEER_NEW message comes in adding a new peer that takes again
a reference to the netdev.

At this point there is no way to release the device because we are
under the assumption that all peers were already released.

Fix the race condition by releasing all peers in ndo_uninit(),
when the netdevice has already been removed from the netdev
list. Also, ovpn_peer_add() now has an extra check that
forces the function to bail out if the device reg_state is
UNREGISTERING (or later state).

This way any incoming CMD_PEER_NEW racing with the interface
deletion routine will simply stop before adding the peer.

At this point ovpn_dellink() becomes empty and can just be
removed.

Reported-by: Hyunwoo Kim <imv4bel@gmail.com>
Closes: https://lore.kernel.org/netdev/aaVgJ16edTfQkYbx@v4bel/
Suggested-by: Sabrina Dubroca <sd@queasysnail.net>
Fixes: 80747caef33d ("ovpn: introduce the ovpn_peer object")
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
---
 drivers/net/ovpn/main.c | 12 ++----------
 drivers/net/ovpn/peer.c |  8 ++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 2e0420febda0..0eab305780c7 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -92,6 +92,8 @@ static void ovpn_net_uninit(struct net_device *dev)
 {
 	struct ovpn_priv *ovpn = netdev_priv(dev);
 
+	cancel_delayed_work_sync(&ovpn->keepalive_work);
+	ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN);
 	gro_cells_destroy(&ovpn->gro_cells);
 }
 
@@ -208,15 +210,6 @@ static int ovpn_newlink(struct net_device *dev,
 	return register_netdevice(dev);
 }
 
-static void ovpn_dellink(struct net_device *dev, struct list_head *head)
-{
-	struct ovpn_priv *ovpn = netdev_priv(dev);
-
-	cancel_delayed_work_sync(&ovpn->keepalive_work);
-	ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN);
-	unregister_netdevice_queue(dev, head);
-}
-
 static int ovpn_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ovpn_priv *ovpn = netdev_priv(dev);
@@ -235,7 +228,6 @@ static struct rtnl_link_ops ovpn_link_ops = {
 	.policy = ovpn_policy,
 	.maxtype = IFLA_OVPN_MAX,
 	.newlink = ovpn_newlink,
-	.dellink = ovpn_dellink,
 	.fill_info = ovpn_fill_info,
 };
 
diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c
index c02dfab51a6e..8ef485a8c851 100644
--- a/drivers/net/ovpn/peer.c
+++ b/drivers/net/ovpn/peer.c
@@ -1034,6 +1034,14 @@ static int ovpn_peer_add_p2p(struct ovpn_priv *ovpn, struct ovpn_peer *peer)
  */
 int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer)
 {
+	/* Prevent adding new peers while destroying the ovpn interface.
+	 * Failing to do so would end up holding the device reference
+	 * endlessly hostage of the new peer object with no chance of
+	 * release.
+	 */
+	if (ovpn->dev->reg_state >= NETREG_UNREGISTERING)
+		return -ENODEV;
+
 	switch (ovpn->mode) {
 	case OVPN_MODE_MP:
 		return ovpn_peer_add_mp(ovpn, peer);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 0/2] ice: ptp: fix E825 timer synchronization and locking
From: Grzegorz Nitka @ 2026-04-22 12:31 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, anthony.l.nguyen, przemyslaw.kitszel, Grzegorz Nitka

These two fixes address E825 PTP synchronization issues in
ice_ptp_hw.c.

The first patch serializes PHY timer start against concurrent PTP
command paths by holding the global PTP semaphore while programming
TIMETUS registers and issuing INIT_INCVAL.

The second patch fixes semaphore access for E825 2xNAC configurations by
making ice_ptp_lock() and ice_ptp_unlock() use the primary NAC register
block, matching the rest of the primary-only PTP register access path.

Together, the series closes two locking gaps in E825 timer control: one
during PHY timer initialization and one in 2xNAC semaphore selection.

Grzegorz Nitka (2):
  ice: ptp: serialize E825 PHY timer start with PTP lock
  ice: ptp: use primary NAC semaphore on E825

 drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 24 +++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)


base-commit: 3662329d3f304a421e0230ee3913dab021ec3a3d
-- 
2.39.3


^ permalink raw reply

* [PATCH iwl-net 1/2] ice: ptp: serialize E825 PHY timer start with PTP lock
From: Grzegorz Nitka @ 2026-04-22 12:31 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, anthony.l.nguyen, przemyslaw.kitszel, Grzegorz Nitka,
	Arkadiusz Kubalewski
In-Reply-To: <20260422123144.485930-1-grzegorz.nitka@intel.com>

ice_start_phy_timer_eth56g() programs TIMETUS registers and issues
INIT_INCVAL without holding the global PTP semaphore.

This allows concurrent PTP command paths to interleave with PHY timer
start, which can make the sequence fail and leave timer initialization
inconsistent.

Take the PTP lock around TIMETUS register programming and INIT_INCVAL
command execution, and make sure the lock is released on all error paths.

Keep the subsequent sync step outside of this critical section, since
ice_sync_phy_timer_eth56g() takes the same semaphore internally.

Fixes: 7cab44f1c35f ("ice: Introduce ETH56G PHY model for E825C products")
Reviewed-by: Arkadiusz Kubalewski <Arkadiusz.kubalewski@intel.com>
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
index 672218e5d1f9..8bb94e785f2a 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -2141,16 +2141,23 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port)
 	}
 	incval = (u64)hi << 32 | lo;
 
+	if (!ice_ptp_lock(hw)) {
+		dev_err(ice_hw_to_dev(hw), "Failed to acquire PTP semaphore\n");
+		return -EBUSY;
+	}
+
 	err = ice_write_40b_ptp_reg_eth56g(hw, port, PHY_REG_TIMETUS_L, incval);
 	if (err)
-		return err;
+		goto err_ptp_unlock;
 
 	err = ice_ptp_one_port_cmd(hw, port, ICE_PTP_INIT_INCVAL);
 	if (err)
-		return err;
+		goto err_ptp_unlock;
 
 	ice_ptp_exec_tmr_cmd(hw);
 
+	ice_ptp_unlock(hw);
+
 	err = ice_sync_phy_timer_eth56g(hw, port);
 	if (err)
 		return err;
@@ -2166,6 +2173,10 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port)
 	ice_debug(hw, ICE_DBG_PTP, "Enabled clock on PHY port %u\n", port);
 
 	return 0;
+
+err_ptp_unlock:
+	ice_ptp_unlock(hw);
+	return err;
 }
 
 /**
-- 
2.39.3


^ permalink raw reply related

* [PATCH iwl-net 2/2] ice: ptp: use primary NAC semaphore on E825
From: Grzegorz Nitka @ 2026-04-22 12:31 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, anthony.l.nguyen, przemyslaw.kitszel, Grzegorz Nitka,
	Arkadiusz Kubalewski
In-Reply-To: <20260422123144.485930-1-grzegorz.nitka@intel.com>

For E825 2xNAC configurations, PTP semaphore operations must hit the
primary NAC register block so both sides coordinate on the same lock.

Commit e2193f9f9ec9 ("ice: enable timesync operation on 2xNAC E825
devices") updated other primary-only PTP register accesses to
use the primary NAC on non-primary functions, but left ice_ptp_lock()
and ice_ptp_unlock() operating on the local NAC. As a result, secondary
NAC PTP paths can take a different semaphore than the primary side.

Select the primary hardware in ice_ptp_lock() and ice_ptp_unlock() when
the current function is not primary, keeping semaphore operations
symmetric and consistent with the rest of the 2xNAC PTP register access
path.

Fixes: e2193f9f9ec9 ("ice: enable timesync operation on 2xNAC E825 devices")
Reviewed-by: Arkadiusz Kubalewski <Arkadiusz.kubalewski@intel.com>
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
index 8bb94e785f2a..2c18e16fe053 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -5264,9 +5264,13 @@ static void ice_ptp_init_phy_e830(struct ice_ptp_hw *ptp)
  */
 bool ice_ptp_lock(struct ice_hw *hw)
 {
+	struct ice_pf *pf = container_of(hw, struct ice_pf, hw);
 	u32 hw_lock;
 	int i;
 
+	if (!ice_is_primary(hw))
+		hw = ice_get_primary_hw(pf);
+
 #define MAX_TRIES 15
 
 	for (i = 0; i < MAX_TRIES; i++) {
@@ -5293,6 +5297,11 @@ bool ice_ptp_lock(struct ice_hw *hw)
  */
 void ice_ptp_unlock(struct ice_hw *hw)
 {
+	struct ice_pf *pf = container_of(hw, struct ice_pf, hw);
+
+	if (!ice_is_primary(hw))
+		hw = ice_get_primary_hw(pf);
+
 	wr32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), 0);
 }
 
-- 
2.39.3


^ permalink raw reply related

* [PATCH net v3 0/2] tcp: symmetric challenge ACK for SEG.ACK > SND.NXT
From: Jiayuan Chen @ 2026-04-22 12:35 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima,
	David S. Miller, David Ahern, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Shuah Khan, linux-kernel, linux-kselftest

Commit 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack
Mitigation") quotes RFC 5961 Section 5.2 in full, which requires
that any incoming segment whose ACK value falls outside
[SND.UNA - MAX.SND.WND, SND.NXT] MUST be discarded and an ACK sent
back.  Linux currently sends that challenge ACK only on the lower
edge (SEG.ACK < SND.UNA - MAX.SND.WND); on the symmetric upper edge
(SEG.ACK > SND.NXT) the segment is silently dropped with
SKB_DROP_REASON_TCP_ACK_UNSENT_DATA.

Patch 1 completes the mitigation by emitting a rate-limited challenge
ACK on that branch, reusing tcp_send_challenge_ack() and honouring
FLAG_NO_CHALLENGE_ACK for consistency with the lower-edge case.  It
also updates the existing tcp_ts_recent_invalid_ack.pkt selftest,
which drives this exact path, to consume the new challenge ACK so
bisect stays clean.

Patch 2 adds a new packetdrill selftest that exercises RFC 5961
Section 5.2 on both edges of the acceptable window, filling a gap in
the selftests tree (neither edge had dedicated coverage before).

---

Changelog
=========
v2 -> v3:
  - I don't think the AI's concern holds, but I think I can split write(2000)
    into two write(1000) calls so the test doesn't rely on kernel internals.
    sashiko: https://sashiko.dev/#/patchset/20260421014128.289362-1-jiayuan.chen%40linux.dev
v1 -> v2:
  - Add Reviewed-by tag.
  - Fold the tcp_ts_recent_invalid_ack.pkt update into patch 1 so
    that bisect stays clean and the fix is self-contained for
    backport.
  - Extend the new selftest to cover both edges of RFC 5961
    Section 5.2 (SEG.ACK > SND.NXT and SEG.ACK < SND.UNA -
    MAX.SND.WND) in a single connection, and rename it to
    tcp_rfc5961_ack-out-of-window.pkt.  Neither edge had explicit
    packetdrill coverage before.

v1: https://lore.kernel.org/netdev/20260420025428.101192-1-jiayuan.chen@linux.dev/

Jiayuan Chen (2):
  tcp: send a challenge ACK on SEG.ACK > SND.NXT
  selftests/net: packetdrill: cover RFC 5961 5.2 challenge ACK on both
    edges

 net/ipv4/tcp_input.c                          | 10 ++--
 .../tcp_rfc5961_ack-out-of-window.pkt         | 48 +++++++++++++++++++
 .../packetdrill/tcp_ts_recent_invalid_ack.pkt |  4 +-
 3 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rfc5961_ack-out-of-window.pkt

-- 
2.43.0


^ permalink raw reply

* [PATCH net v3 1/2] tcp: send a challenge ACK on SEG.ACK > SND.NXT
From: Jiayuan Chen @ 2026-04-22 12:35 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima,
	David S. Miller, David Ahern, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Shuah Khan, linux-kernel, linux-kselftest
In-Reply-To: <20260422123605.320000-1-jiayuan.chen@linux.dev>

RFC 5961 Section 5.2 validates an incoming segment's ACK value
against the range [SND.UNA - MAX.SND.WND, SND.NXT] and states:

  "All incoming segments whose ACK value doesn't satisfy the above
   condition MUST be discarded and an ACK sent back."

Commit 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack
Mitigation") opted Linux into this mitigation and implements the
challenge ACK on the lower side (SEG.ACK < SND.UNA - MAX.SND.WND),
but the symmetric upper side (SEG.ACK > SND.NXT) still takes the
pre-RFC-5961 path and silently returns
SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, even though RFC 793 Section 3.9
(now RFC 9293 Section 3.10.7.4) has always required:

  "If the ACK acknowledges something not yet sent (SEG.ACK > SND.NXT)
   then send an ACK, drop the segment, and return."

Complete the mitigation by sending a challenge ACK on that branch,
reusing the existing tcp_send_challenge_ack() path which already
enforces the per-socket RFC 5961 Section 7 rate limit via
__tcp_oow_rate_limited().  FLAG_NO_CHALLENGE_ACK is honoured for
symmetry with the lower-edge case.

Update the existing tcp_ts_recent_invalid_ack.pkt selftest, which
drives this exact path, to consume the new challenge ACK.

Fixes: 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_input.c                                   | 10 +++++++---
 .../net/packetdrill/tcp_ts_recent_invalid_ack.pkt      |  4 +++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 021f745747c5..c2b6f05acdfa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4284,11 +4284,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		goto old_ack;
 	}
 
-	/* If the ack includes data we haven't sent yet, discard
-	 * this segment (RFC793 Section 3.9).
+	/* If the ack includes data we haven't sent yet, drop the
+	 * segment.  RFC 793 Section 3.9 and RFC 5961 Section 5.2
+	 * require us to send an ACK back in that case.
 	 */
-	if (after(ack, tp->snd_nxt))
+	if (after(ack, tp->snd_nxt)) {
+		if (!(flag & FLAG_NO_CHALLENGE_ACK))
+			tcp_send_challenge_ack(sk, false);
 		return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
+	}
 
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt
index 174ce9a1bfc0..ee6baf7c36cf 100644
--- a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt
+++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt
@@ -19,7 +19,9 @@
 
 // bad packet with high tsval (its ACK sequence is above our sndnxt)
    +0 < F. 1:1(0) ack 9999 win 20000 <nop,nop,TS val 200000 ecr 100>
-
+// Challenge ACK for SEG.ACK > SND.NXT (RFC 5961 5.2 / RFC 793 3.9).
+// ecr=200 (not 200000) proves ts_recent was not updated from the bad packet.
+   +0 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 200>
 
    +0 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 201 ecr 100>
    +0 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 201>
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v3 2/2] selftests/net: packetdrill: cover RFC 5961 5.2 challenge ACK on both edges
From: Jiayuan Chen @ 2026-04-22 12:35 UTC (permalink / raw)
  To: netdev
  Cc: Jiayuan Chen, Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima,
	David S. Miller, David Ahern, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Shuah Khan, linux-kernel, linux-kselftest
In-Reply-To: <20260422123605.320000-1-jiayuan.chen@linux.dev>

RFC 5961 Section 5.2 / RFC 793 Section 3.9 require a challenge ACK
whenever an incoming SEG.ACK falls outside
[SND.UNA - MAX.SND.WND, SND.NXT].  There is currently no packetdrill
coverage for either edge.

Add tcp_rfc5961_ack-out-of-window.pkt, which in a single passive-open
connection exercises:

  - Upper edge (SEG.ACK > SND.NXT): peer ACKs data that was never
    sent before the server has transmitted anything.
  - Lower edge (SEG.ACK < SND.UNA - MAX.SND.WND): after the server
    has sent 2000 bytes (two 1000-byte writes, each sent as a single
    segment and acknowledged), peer sends an ACK that is older
    than the acceptable window.

Both cases must elicit a challenge ACK
<SEQ = SND.NXT, ACK = RCV.NXT, CTL = ACK>.  The per-socket RFC 5961
Section 7 rate limit is disabled for the duration of the test so that
both challenge ACKs can fire back-to-back.

Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 .../tcp_rfc5961_ack-out-of-window.pkt         | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rfc5961_ack-out-of-window.pkt

diff --git a/tools/testing/selftests/net/packetdrill/tcp_rfc5961_ack-out-of-window.pkt b/tools/testing/selftests/net/packetdrill/tcp_rfc5961_ack-out-of-window.pkt
new file mode 100644
index 000000000000..2776b8728085
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rfc5961_ack-out-of-window.pkt
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// RFC 5961 Section 5.2 / RFC 793 Section 3.9: an incoming segment's
+// ACK value must lie in [SND.UNA - MAX.SND.WND, SND.NXT]; otherwise
+// the receiver MUST discard the segment and send a challenge ACK
+// back.  Exercise both edges of that window in a single connection.
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_invalid_ratelimit=0
+`
+
+   0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+  +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+  +0 bind(3, ..., ...) = 0
+  +0 listen(3, 1) = 0
+
+// Three-way handshake.  Peer advertises rwnd = 1000 (no wscale), so
+// MAX.SND.WND is tracked as 1000.
+  +0 < S 0:0(0) win 1000 <mss 1000,sackOK,nop,nop,nop,wscale 0>
+  +0 > S. 0:0(0) ack 1 <...>
++.1 < . 1:1(0) ack 1 win 1000
+  +0 accept(3, ..., ...) = 4
+
+// ---- Upper edge: SEG.ACK > SND.NXT --------------------------------
+// Server has sent nothing yet, so SND.UNA = SND.NXT = 1.
+// Peer sends a pure ACK with SEG.ACK = 2, beyond SND.NXT.
+  +0 < . 1:1(0) ack 2 win 1000
+// Expect a challenge ACK: <SEQ = SND.NXT = 1, ACK = RCV.NXT = 1>.
+  +0 > . 1:1(0) ack 1
+
+// Advance SND.UNA past MAX.SND.WND so that the lower edge becomes
+// reachable.  Issue two 1-MSS writes so each skb is exactly one MSS
+// and PSH is set by tcp_push() at the end of each sendmsg, keeping
+// the setup independent of the TSO / tcp_fragment split path.
+  +0 write(4, ..., 1000) = 1000
+  +0 > P. 1:1001(1000) ack 1
++.01 < . 1:1(0) ack 1001 win 1000
+  +0 write(4, ..., 1000) = 1000
+  +0 > P. 1001:2001(1000) ack 1
++.01 < . 1:1(0) ack 2001 win 1000
+// Now SND.UNA = SND.NXT = 2001, MAX.SND.WND = 1000, bytes_acked = 2000.
+
+// ---- Lower edge: SEG.ACK < SND.UNA - MAX.SND.WND ------------------
+// SND.UNA - MAX.SND.WND = 2001 - 1000 = 1001, so SEG.ACK = 1000 falls
+// below the acceptable range.
+  +0 < . 1:1(0) ack 1000 win 1000
+// Expect a challenge ACK: <SEQ = SND.NXT = 2001, ACK = RCV.NXT = 1>.
+  +0 > . 2001:2001(0) ack 1
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] rxrpc: fix missing validation of ticket length in
From: David Howells @ 2026-04-22 12:37 UTC (permalink / raw)
  To: Anderson Nascimento
  Cc: dhowells, Marc Dionne, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Steve Dickson,
	linux-afs, netdev, linux-kernel
In-Reply-To: <20260422003206.1017863-1-anderson@allelesecurity.com>

> Subject: [PATCH] rxrpc: fix missing validation of ticket length in

...in rxrpc_preparse(), I presume?

David


^ permalink raw reply

* Re: [PATCH net v3 1/1] net: hsr: limit node table growth
From: Felix Maurer @ 2026-04-22 12:38 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Ren Wei, netdev, davem, edumazet, kuba, pabeni, horms, kees,
	kexinsun, luka.gejak, m-karicheri2, yuantan098, yifanwucs,
	tomapufckgml, bird, xuyuqiabc, royenheart
In-Reply-To: <20260422105854.trLbmAmZ@linutronix.de>

On Wed, Apr 22, 2026 at 12:58:54PM +0200, Sebastian Andrzej Siewior wrote:
> On 2026-04-22 11:45:38 [+0200], Felix Maurer wrote:
> > > I don't think the node count exceeds 100 in production. So having a
> > > counter which is incremented while adding to the list and decremented
> > > while removing items from the list would optimize the "worst case". So
> > > instead traversing the list with 1000 we would just give up.
> >
> > The counter is what I had in mind. I agree that allocating under the
> > lock isn't what we want.
> >
> > I'd argue counting through the whole list is the normal case.
>
> yeah but counting here is just a register increment which is cheap.
>
> > hsr_add_node() is only called after the node table has been searched
> > already (without the lock). Here we go through the whole list again
> > under the lock to prevent TOCTOU-type situations.
> >
> > I agree that, overall, it would be optimizing the worst case, but I
> > think it may be worth it to prevent the memory allocations and walking
> > the whole list. But I'd go along with the (current) on-the-fly counting
> > as well.
>
> Yeah. But then you have to manage the counter on add and removal just
> for this "we have too many nodes" case. And theoretically you would have
> to hold the list_lock while checking the counter because nodes might be
> added on both sides in the RX path (unless you check early lockless &
> optimistic and then again before adding under the lock).

Alright, I agree. Let's keep this part as it is (counting while iterating
through the list).

Thanks,
   Felix


^ permalink raw reply

* Re: [PATCH net 14/18] drivers: net: xircom: xirc2ps: Remove this driver
From: michael @ 2026-04-22 12:46 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Jonathan Corbet, Shuah Khan, linux-kernel, netdev,
	linux-doc
In-Reply-To: <408d1987-6ecb-4d3b-afed-8e202c8ff21d@lunn.ch>

Am 2026-04-22 14:15, schrieb Andrew Lunn:
> On Wed, Apr 22, 2026 at 08:21:23AM +0200, Michael Fritscher wrote:
>> Good day,
>> 
>> actually, I do use Xircom PCMCIA network cards (yes, the 16 bit ones) 
>> on
>> Lenovo X60/X61 laptops as a second LAN card for server maintenances 
>> with
>> current 64 bit distros (e.g. Debian Trixie, which I plan to update to
>> Trixie+1 when available). Why? Because I have them and they are 
>> working ;-)
> 
> Hi Michael
> 
> I will drop this from the patchset for the moment.
> 
> Would you be willing to take up the Maintainer role for it?
> 
> 	Andrew

Hello Andrew,

thanks! If someone mentors me I could try... I have experience with C 
programming, esp. in the embedded world, but (almost) no experience with 
the Linux kernel development.
But regression tests etc. can be conducted by me - on 32 and 64 bit 
machines. The oldest being a P120, the newest the mentioned X61 with its 
C2D CPU.

Best regards
Michael

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox