Netdev List
 help / color / mirror / Atom feed
* [PATCH RFC net-next v4 09/14] xsk: extend xskq_cons_read_desc_batch to count nb_pkts
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Add a new parameter nb_pkts to count how many packets are needed
practically by copy mode with the help of XDP_PKT_CONTD option.

Add descs to provide a way to pass xs->desc_cache to store the
descriptors for copy mode.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk_queue.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index ec08d9c102b1..354f6fe86893 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -263,12 +263,12 @@ static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
 	parsed->mb = xp_mb_desc(desc);
 }
 
-static inline
-u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
-			      u32 max)
+static inline u32
+__xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			    struct xdp_desc *descs, u32 max, u32 *nb_pkts,
+			    u32 max_segs)
 {
 	u32 cached_cons = q->cached_cons, nb_entries = 0;
-	struct xdp_desc *descs = pool->tx_descs;
 	u32 total_descs = 0, nr_frags = 0;
 
 	/* track first entry, if stumble upon *any* invalid descriptor, rewind
@@ -288,9 +288,11 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 		if (likely(!parsed.mb)) {
 			total_descs += (nr_frags + 1);
 			nr_frags = 0;
+			if (nb_pkts)
+				(*nb_pkts)++;
 		} else {
 			nr_frags++;
-			if (nr_frags == pool->xdp_zc_max_segs) {
+			if (nr_frags == max_segs) {
 				nr_frags = 0;
 				break;
 			}
@@ -304,6 +306,14 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 	return total_descs;
 }
 
+static inline u32
+xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			  u32 max)
+{
+	return __xskq_cons_read_desc_batch(q, pool, pool->tx_descs, max,
+					   NULL, pool->xdp_zc_max_segs);
+}
+
 /* Functions for consumers */
 
 static inline void __xskq_cons_release(struct xsk_queue *q)
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 08/14] xsk: rename nb_pkts to nb_descs in xsk_tx_peek_release_desc_batch
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Rename the last parameter to nb_descs for more accurate naming. The next
patch will add a real nb_pkts parameter to help copy mode count how
many packets are needed.

No functional change here.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7a6991bc19a8..6cd2e58e170c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -485,16 +485,16 @@ EXPORT_SYMBOL(xsk_tx_peek_desc);
 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
 {
 	struct xdp_desc *descs = pool->tx_descs;
-	u32 nb_pkts = 0;
+	u32 nb_descs = 0;
 
-	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
-		nb_pkts++;
+	while (nb_descs < max_entries && xsk_tx_peek_desc(pool, &descs[nb_descs]))
+		nb_descs++;
 
 	xsk_tx_release(pool);
-	return nb_pkts;
+	return nb_descs;
 }
 
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_descs)
 {
 	struct xdp_sock *xs;
 
@@ -502,16 +502,16 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
 	if (!list_is_singular(&pool->xsk_tx_list)) {
 		/* Fallback to the non-batched version */
 		rcu_read_unlock();
-		return xsk_tx_peek_release_fallback(pool, nb_pkts);
+		return xsk_tx_peek_release_fallback(pool, nb_descs);
 	}
 
 	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
 	if (!xs) {
-		nb_pkts = 0;
+		nb_descs = 0;
 		goto out;
 	}
 
-	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
+	nb_descs = xskq_cons_nb_entries(xs->tx, nb_descs);
 
 	/* This is the backpressure mechanism for the Tx path. Try to
 	 * reserve space in the completion queue for all packets, but
@@ -519,23 +519,23 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
 	 * packets. This avoids having to implement any buffering in
 	 * the Tx path.
 	 */
-	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
-	if (!nb_pkts)
+	nb_descs = xskq_prod_nb_free(pool->cq, nb_descs);
+	if (!nb_descs)
 		goto out;
 
-	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
-	if (!nb_pkts) {
+	nb_descs = xskq_cons_read_desc_batch(xs->tx, pool, nb_descs);
+	if (!nb_descs) {
 		xs->tx->queue_empty_descs++;
 		goto out;
 	}
 
 	__xskq_cons_release(xs->tx);
-	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_descs);
 	xs->sk.sk_write_space(&xs->sk);
 
 out:
 	rcu_read_unlock();
-	return nb_pkts;
+	return nb_descs;
 }
 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
 
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 07/14] xsk: try to skip validating skb list in xmit path
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

This patch moves the SG check ahead, which is the only place we need to
handle very carefully because either in xsk_build_skb_zerocopy() or in
multi-buffer mode nr_frags (in skb_needs_linearize()) is used[1].

In most cases, for xsk, it's totally not needed to validate and check
the skb in validate_xmit_skb_list() that adds numerous checks in the
extremely hot path. In this kind of workload, even the overhead of
mathematical operations is not trivial.

Performance-wise, I ran './xdpsock -i enp2s0f0np0 -t  -S -s 64' on the 1Gb/sec
ixgbe driver to verify. It stably goes up by 5.48%.

[1]: https://lore.kernel.org/all/20251125115754.46793-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/core/dev.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index a6abd621a7f3..aa38993b9dd4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4899,6 +4899,7 @@ int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev)
 	u16 queue_id = xs->queue_id;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_id);
 	struct sk_buff_head *send_queue = &xs->batch.send_queue;
+	bool need_validate = !(dev->features & NETIF_F_SG);
 	int ret = NETDEV_TX_BUSY;
 	struct sk_buff *skb;
 	bool more = true;
@@ -4906,15 +4907,17 @@ int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev)
 	local_bh_disable();
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	while ((skb = __skb_dequeue(send_queue)) != NULL) {
-		struct sk_buff *orig_skb = skb;
-		bool again = false;
-
-		skb = validate_xmit_skb_list(skb, dev, &again);
-		if (skb != orig_skb) {
-			dev_core_stats_tx_dropped_inc(dev);
-			kfree_skb_list(skb);
-			ret = NET_XMIT_DROP;
-			break;
+		if (unlikely(need_validate)) {
+			struct sk_buff *orig_skb = skb;
+			bool again = false;
+
+			skb = validate_xmit_skb_list(skb, dev, &again);
+			if (skb != orig_skb) {
+				dev_core_stats_tx_dropped_inc(dev);
+				kfree_skb_list(skb);
+				ret = NET_XMIT_DROP;
+				break;
+			}
 		}
 
 		if (netif_xmit_frozen_or_drv_stopped(txq)) {
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 06/14] xsk: support dynamic xmit.more control for batch xmit
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Only set xmit.more false for the last skb.

In theory, only making xmit.more false for the last packet to be
sent in each round can bring much benefit, like avoiding triggering too
many irqs.

Compared to the numbers for batch mode, a huge improvement (26%) can
be seen on i40e/ixgbe driver since the cost of triggering irqs is
expensive.

Suggested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/core/dev.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index e33a2406d8ca..a6abd621a7f3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4901,6 +4901,7 @@ int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev)
 	struct sk_buff_head *send_queue = &xs->batch.send_queue;
 	int ret = NETDEV_TX_BUSY;
 	struct sk_buff *skb;
+	bool more = true;
 
 	local_bh_disable();
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
@@ -4920,8 +4921,12 @@ int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev)
 			__skb_queue_head(send_queue, skb);
 			break;
 		}
+
+		if (!skb_peek(send_queue))
+			more = false;
+
 		skb_set_queue_mapping(skb, queue_id);
-		ret = netdev_start_xmit(skb, dev, txq, false);
+		ret = netdev_start_xmit(skb, dev, txq, more);
 		if (ret != NETDEV_TX_OK) {
 			if (ret == NETDEV_TX_BUSY)
 				__skb_queue_head(send_queue, skb);
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 05/14] xsk: add direct xmit in batch function
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Add batch xmit logic.

Grab the lock and disable the bottom half only once, and send all
the aggregated packets in one loop. Via skb->list, the already built
skbs can be handled one by one.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xdp_sock.h |  1 +
 net/core/dev.c         | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 2151aab8f0a1..0609e3b04279 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -141,6 +141,7 @@ struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			      struct sk_buff *allocated_skb,
 			      struct xdp_desc *desc);
 int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err);
+int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev);
 
 /**
  *  xsk_tx_metadata_to_compl - Save enough relevant metadata information
diff --git a/net/core/dev.c b/net/core/dev.c
index 4519f0e59beb..e33a2406d8ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -163,6 +163,7 @@
 #include <net/page_pool/memory_provider.h>
 #include <net/rps.h>
 #include <linux/phy_link_topology.h>
+#include <net/xdp_sock.h>
 
 #include "dev.h"
 #include "devmem.h"
@@ -4893,6 +4894,46 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 }
 EXPORT_SYMBOL(__dev_queue_xmit);
 
+int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev)
+{
+	u16 queue_id = xs->queue_id;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_id);
+	struct sk_buff_head *send_queue = &xs->batch.send_queue;
+	int ret = NETDEV_TX_BUSY;
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	while ((skb = __skb_dequeue(send_queue)) != NULL) {
+		struct sk_buff *orig_skb = skb;
+		bool again = false;
+
+		skb = validate_xmit_skb_list(skb, dev, &again);
+		if (skb != orig_skb) {
+			dev_core_stats_tx_dropped_inc(dev);
+			kfree_skb_list(skb);
+			ret = NET_XMIT_DROP;
+			break;
+		}
+
+		if (netif_xmit_frozen_or_drv_stopped(txq)) {
+			__skb_queue_head(send_queue, skb);
+			break;
+		}
+		skb_set_queue_mapping(skb, queue_id);
+		ret = netdev_start_xmit(skb, dev, txq, false);
+		if (ret != NETDEV_TX_OK) {
+			if (ret == NETDEV_TX_BUSY)
+				__skb_queue_head(send_queue, skb);
+			break;
+		}
+	}
+	HARD_TX_UNLOCK(dev, txq);
+	local_bh_enable();
+
+	return ret;
+}
+
 int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 {
 	struct net_device *dev = skb->dev;
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 04/14] xsk: cache data buffers to avoid frequently calling kmalloc_reserve
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

It's beneficial for small data transmission.

Replace per-SKB kmalloc_reserve() with on-demand bulk allocation from
skb_small_head_cache for small packets.

Add a persistent per-socket data buffer cache (batch.data_cache /
batch.data_count) that survives across batch cycles, similar to how
batch.send_queue caches built SKBs.

Inside the Phase-1 per-descriptor loop, when a small packet needs a
data buffer and the cache is empty, a single kmem_cache_alloc_bulk()
refills it with generic_xmit_batch objects. Subsequent small packets
pop directly from the cache. Large packets bypass the cache entirely
and fall back to kmalloc_reserve(). Unused buffers remain in the
cache for the next batch.

I observed that kmalloc_reserve() consumes nearly 40%, which seems
unavoidable at first glance, so I thought adding the bulk mechanism
should contribute to the performance. That's the motivation of this
patch. Now, the feature gives us around 10% improvement.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xdp_sock.h |  2 ++
 net/core/skbuff.c      | 27 ++++++++++++++++++++++-----
 net/xdp/xsk.c          | 24 ++++++++++++++++++++----
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 84f0aee3fb10..2151aab8f0a1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -51,6 +51,8 @@ struct xsk_batch {
 	struct sk_buff **skb_cache;
 	struct xdp_desc *desc_cache;
 	struct sk_buff_head send_queue;
+	unsigned int data_count;
+	void **data_cache;
 };
 
 struct xdp_sock {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f29cecacd8bb..5726b1566b2b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -661,9 +661,11 @@ int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err
 	unsigned int total_truesize = 0;
 	struct sk_buff *skb = NULL;
 	int node = NUMA_NO_NODE;
+	void **dc = batch->data_cache;
+	unsigned int dc_count = batch->data_count;
 	u32 i = 0, j, k = 0;
 	bool need_alloc;
-	u8 *data;
+	void *data;
 
 	base_len = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 	if (!(dev->priv_flags & IFF_TX_SKB_NO_LINEAR))
@@ -683,6 +685,13 @@ int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err
 		nb_pkts = skb_count;
 
 alloc_data:
+	if (dc_count < nb_pkts && !(gfp_mask & KMALLOC_NOT_NORMAL_BITS))
+		dc_count += kmem_cache_alloc_bulk(
+				net_hotdata.skb_small_head_cache,
+				gfp_mask | __GFP_NOMEMALLOC | __GFP_NOWARN,
+				batch->generic_xmit_batch - dc_count,
+				&dc[dc_count]);
+
 	/*
 	 * Phase 1: Allocate data buffers and initialize SKBs.
 	 * Pre-scan descriptors to determine packet boundaries, so we can
@@ -710,10 +719,17 @@ int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err
 
 			skb = skbs[skb_count - 1 - i];
 			skbuff_clear(skb);
-			data = kmalloc_reserve(&size, gfp_mask, node, skb);
-			if (unlikely(!data)) {
-				*err = -ENOBUFS;
-				break;
+			if (dc_count &&
+			    SKB_HEAD_ALIGN(size) <= SKB_SMALL_HEAD_CACHE_SIZE) {
+				data = dc[--dc_count];
+				size = SKB_SMALL_HEAD_CACHE_SIZE;
+			} else {
+				data = kmalloc_reserve(&size, gfp_mask,
+						       node, skb);
+				if (unlikely(!data)) {
+					*err = -ENOBUFS;
+					break;
+				}
 			}
 			__finalize_skb_around(skb, data, size);
 			/* Replace skb_set_owner_w() with the following */
@@ -762,6 +778,7 @@ int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err
 	while (k < i)
 		kfree_skb(skbs[skb_count - 1 - k++]);
 
+	batch->data_count = dc_count;
 	batch->skb_count = skb_count - i;
 
 	return j;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f97bc9cf9b9a..7a6991bc19a8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1229,14 +1229,22 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 }
 
 static void xsk_batch_reset(struct xsk_batch *batch, struct sk_buff **skbs,
-			    struct xdp_desc *descs, unsigned int size)
-{
+			    struct xdp_desc *descs, void **data,
+			    unsigned int size)
+{
+	if (batch->data_count)
+		kmem_cache_free_bulk(net_hotdata.skb_small_head_cache,
+				     batch->data_count,
+				     batch->data_cache);
+	kfree(batch->data_cache);
 	if (batch->skb_count)
 		kmem_cache_free_bulk(net_hotdata.skbuff_cache,
 				     batch->skb_count,
 				     (void **)batch->skb_cache);
 	kfree(batch->skb_cache);
 	kvfree(batch->desc_cache);
+	batch->data_cache = data;
+	batch->data_count = 0;
 	batch->skb_cache = skbs;
 	batch->desc_cache = descs;
 	batch->skb_count = 0;
@@ -1272,7 +1280,7 @@ static int xsk_release(struct socket *sock)
 	xskq_destroy(xs->tx);
 	xskq_destroy(xs->fq_tmp);
 	xskq_destroy(xs->cq_tmp);
-	xsk_batch_reset(&xs->batch, NULL, NULL, 0);
+	xsk_batch_reset(&xs->batch, NULL, NULL, NULL, 0);
 
 	sock_orphan(sk);
 	sock->sk = NULL;
@@ -1620,6 +1628,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		struct xsk_batch *batch = &xs->batch;
 		struct xdp_desc *descs;
 		struct sk_buff **skbs;
+		void **data;
 		unsigned int size;
 		int ret = 0;
 
@@ -1638,14 +1647,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			ret = -ENOMEM;
 			goto out;
 		}
+		data = kmalloc_array(size, sizeof(void *), GFP_KERNEL);
+		if (!data) {
+			kfree(skbs);
+			ret = -ENOMEM;
+			goto out;
+		}
 		descs = kvcalloc(size, sizeof(struct xdp_desc), GFP_KERNEL);
 		if (!descs) {
+			kfree(data);
 			kfree(skbs);
 			ret = -ENOMEM;
 			goto out;
 		}
 
-		xsk_batch_reset(batch, skbs, descs, size);
+		xsk_batch_reset(batch, skbs, descs, data, size);
 out:
 		mutex_unlock(&xs->mutex);
 		return ret;
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 03/14] xsk: add xsk_alloc_batch_skb() to build skbs in batch
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Support allocating and building skbs in batch.

There are three steps for one batched allocation:
1. Reserve the skb and count the skb->truesize. It provides a way
   for a later patch to speed up small data transmission by
   diminishing the impact of kmalloc_reserve().
2. Add the total of truesize to sk_wmem_alloc at one time. The load and
   store of sk_wmem_alloc is time-consuming, so this batch process makes
   it gain the performance improvement.
3. Copy data and then finish initialization of each skb.

This patch uses kmem_cache_alloc_bulk() to complete the batch allocation
which relies on the global common cache 'net_hotdata.skbuff_cache'. Use
a xsk standalone skb cache (namely, xs->skb_cache) to store allocated
skbs instead of resorting to napi_alloc_cache that was designed for
softirq condition.

After allocating memory for each of skbs, in a 'for' loop, the patch
borrows part of __alloc_skb() to initialize skb and then calls
xsk_build_skb() to complete the rest of initialization process, like
copying data and stuff. To achieve a better result, the allocation
function only uses the function we need to keep it super clean, like
skb_set_owner_w() that is simplified into two lines of codes.

Add batch.send_queue and use the skb->list to make skbs into one chain
so that they can be easily sent which is shown in the subsequent patches.

In terms of freeing skbs process, napi_consume_skb() in the tx completion
would put the skb into global cache 'net_hotdata.skbuff_cache' that
implements the deferred freeing skb feature to avoid freeing skb one
by one to improve the performance.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xdp_sock.h |   3 +
 net/core/skbuff.c      | 121 +++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk.c          |   7 +++
 3 files changed, 131 insertions(+)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 90c709fd1239..84f0aee3fb10 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -47,8 +47,10 @@ struct xsk_map {
 
 struct xsk_batch {
 	u32 generic_xmit_batch;
+	unsigned int skb_count;
 	struct sk_buff **skb_cache;
 	struct xdp_desc *desc_cache;
+	struct sk_buff_head send_queue;
 };
 
 struct xdp_sock {
@@ -136,6 +138,7 @@ INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
 struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			      struct sk_buff *allocated_skb,
 			      struct xdp_desc *desc);
+int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err);
 
 /**
  *  xsk_tx_metadata_to_compl - Save enough relevant metadata information
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4045d7c484a1..f29cecacd8bb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -83,6 +83,7 @@
 #include <net/psp/types.h>
 #include <net/dropreason.h>
 #include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -647,6 +648,126 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 	return obj;
 }
 
+#ifdef CONFIG_XDP_SOCKETS
+int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err)
+{
+	struct xsk_batch *batch = &xs->batch;
+	struct xdp_desc *descs = batch->desc_cache;
+	struct sk_buff **skbs = batch->skb_cache;
+	u32 alloc_descs, base_len, wmem, sndbuf;
+	gfp_t gfp_mask = xs->sk.sk_allocation;
+	u32 skb_count = batch->skb_count;
+	struct net_device *dev = xs->dev;
+	unsigned int total_truesize = 0;
+	struct sk_buff *skb = NULL;
+	int node = NUMA_NO_NODE;
+	u32 i = 0, j, k = 0;
+	bool need_alloc;
+	u8 *data;
+
+	base_len = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+	if (!(dev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+		base_len += dev->needed_tailroom;
+
+	if (xs->skb)
+		nb_pkts--;
+
+	if (skb_count >= nb_pkts)
+		goto alloc_data;
+
+	skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+					   gfp_mask,
+					   nb_pkts - skb_count,
+					   (void **)&skbs[skb_count]);
+	if (skb_count < nb_pkts)
+		nb_pkts = skb_count;
+
+alloc_data:
+	/*
+	 * Phase 1: Allocate data buffers and initialize SKBs.
+	 * Pre-scan descriptors to determine packet boundaries, so we can
+	 * batch the sk_wmem_alloc charge in Phase 2.
+	 */
+	need_alloc = !xs->skb;
+	wmem = sk_wmem_alloc_get(&xs->sk);
+	sndbuf = READ_ONCE(xs->sk.sk_sndbuf);
+	for (j = 0; j < nb_descs; j++) {
+		if (need_alloc) {
+			u32 size = base_len;
+
+			if (!(dev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+				size += descs[j].len;
+
+			if (i >= nb_pkts) {
+				*err = -EAGAIN;
+				break;
+			}
+
+			if (wmem + size + total_truesize > sndbuf) {
+				*err = -EAGAIN;
+				break;
+			}
+
+			skb = skbs[skb_count - 1 - i];
+			skbuff_clear(skb);
+			data = kmalloc_reserve(&size, gfp_mask, node, skb);
+			if (unlikely(!data)) {
+				*err = -ENOBUFS;
+				break;
+			}
+			__finalize_skb_around(skb, data, size);
+			/* Replace skb_set_owner_w() with the following */
+			skb->sk = &xs->sk;
+			skb->destructor = sock_wfree;
+			total_truesize += skb->truesize;
+			i++;
+			need_alloc = false;
+		}
+		if (!xp_mb_desc(&descs[j]))
+			need_alloc = true;
+	}
+	alloc_descs = j;
+
+	/*
+	 * Phase 2: Batch charge sk_wmem_alloc.
+	 * One refcount_add() replaces N per-SKB skb_set_owner_w() calls,
+	 * which gains much performance improvement.
+	 */
+	if (total_truesize)
+		refcount_add(total_truesize, &xs->sk.sk_wmem_alloc);
+
+	/* Phase 3: Build SKBs with packet data */
+	for (j = 0; j < alloc_descs; j++) {
+		if (!xs->skb) {
+			skb = skbs[skb_count - 1 - k];
+			k++;
+		}
+
+		skb = xsk_build_skb(xs, skb, &descs[j]);
+		if (IS_ERR(skb)) {
+			*err = PTR_ERR(skb);
+			break;
+		}
+
+		if (xp_mb_desc(&descs[j])) {
+			xs->skb = skb;
+			continue;
+		}
+
+		xs->skb = NULL;
+		__skb_queue_tail(&batch->send_queue, skb);
+	}
+
+	/* Phase 4: Reclaim unused allocated SKBs */
+	while (k < i)
+		kfree_skb(skbs[skb_count - 1 - k++]);
+
+	batch->skb_count = skb_count - i;
+
+	return j;
+}
+#endif
+
 /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
  *	'private' fields and also do memory statistics to find all the
  *	[BEEP] leaks.
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ecd5b9c424b8..f97bc9cf9b9a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -25,6 +25,7 @@
 #include <linux/vmalloc.h>
 #include <net/xdp_sock_drv.h>
 #include <net/busy_poll.h>
+#include <net/hotdata.h>
 #include <net/netdev_lock.h>
 #include <net/netdev_rx_queue.h>
 #include <net/xdp.h>
@@ -1230,10 +1231,15 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 static void xsk_batch_reset(struct xsk_batch *batch, struct sk_buff **skbs,
 			    struct xdp_desc *descs, unsigned int size)
 {
+	if (batch->skb_count)
+		kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+				     batch->skb_count,
+				     (void **)batch->skb_cache);
 	kfree(batch->skb_cache);
 	kvfree(batch->desc_cache);
 	batch->skb_cache = skbs;
 	batch->desc_cache = descs;
+	batch->skb_count = 0;
 	batch->generic_xmit_batch = size;
 }
 
@@ -1946,6 +1952,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 
 	INIT_LIST_HEAD(&xs->map_list);
 	spin_lock_init(&xs->map_list_lock);
+	__skb_queue_head_init(&xs->batch.send_queue);
 
 	mutex_lock(&net->xdp.lock);
 	sk_add_node_rcu(sk, &net->xdp.list);
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 02/14] xsk: extend xsk_build_skb() to support passing an already allocated skb
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

To avoid reinventing the wheel, the patch provides a way to let the batch
feature reuse xsk_build_skb() for the rest of the whole
initialization process just after the skb is allocated.

The original xsk_build_skb() itself allocates a new skb by calling
sock_alloc_send_skb whether in copy mode or zerocopy mode. Add a new
parameter, allocated_skb, to let other callers pass an already
allocated skb to support the later xmit batch feature. It replaces the
previous memory allocation function with a bulk one.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xdp_sock.h |  3 +++
 net/xdp/xsk.c          | 23 ++++++++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 965cab9a0465..90c709fd1239 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -133,6 +133,9 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(struct list_head *flush_list);
 INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
+struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+			      struct sk_buff *allocated_skb,
+			      struct xdp_desc *desc);
 
 /**
  *  xsk_tx_metadata_to_compl - Save enough relevant metadata information
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 6122db8606fe..ecd5b9c424b8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -718,6 +718,7 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
 }
 
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+					      struct sk_buff *allocated_skb,
 					      struct xdp_desc *desc)
 {
 	struct xsk_buff_pool *pool = xs->pool;
@@ -734,7 +735,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 	if (!skb) {
 		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 
-		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+		if (!allocated_skb)
+			skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+		else
+			skb = allocated_skb;
 		if (unlikely(!skb))
 			return ERR_PTR(err);
 
@@ -799,15 +803,16 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 	return skb;
 }
 
-static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
-				     struct xdp_desc *desc)
+struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+			      struct sk_buff *allocated_skb,
+			      struct xdp_desc *desc)
 {
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
-		skb = xsk_build_skb_zerocopy(xs, desc);
+		skb = xsk_build_skb_zerocopy(xs, allocated_skb, desc);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
 			skb = NULL;
@@ -822,8 +827,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 		if (!skb) {
 			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
-			tr = dev->needed_tailroom;
-			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			if (!allocated_skb) {
+				tr = dev->needed_tailroom;
+				skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			} else {
+				skb = allocated_skb;
+			}
 			if (unlikely(!skb))
 				goto free_err;
 
@@ -943,7 +952,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 			goto out;
 		}
 
-		skb = xsk_build_skb(xs, &desc);
+		skb = xsk_build_skb(xs, NULL, &desc);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
 			if (err != -EOVERFLOW)
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 01/14] xsk: introduce XDP_GENERIC_XMIT_BATCH setsockopt
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Add a new socket option to provide an alternative to achieve a higher
overall throughput with the rest of the series applied. As the corresponding
documentation I added says, it might increase the latency because the
heavy allocation cannot be avoided, especially when a shortage of
memory occurs. So this patch doesn't turn this feature on by default.

Add generic_xmit_batch to determine how many descriptors are handled
at one time. It shouldn't be larger than max_tx_budget or smaller than
one, which is the default value (disabling batch mode).

Introduce skb_cache when setting setsockopt with xs->mutex protection to
store newly allocated skbs at one time.

Introduce desc_cache to temporarily cache what descriptors the xsk is
about to send each round.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 Documentation/networking/af_xdp.rst | 17 +++++++++++
 include/net/xdp_sock.h              |  7 +++++
 include/uapi/linux/if_xdp.h         |  1 +
 net/xdp/xsk.c                       | 47 +++++++++++++++++++++++++++++
 tools/include/uapi/linux/if_xdp.h   |  1 +
 5 files changed, 73 insertions(+)

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 50d92084a49c..7a8d219efe71 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -447,6 +447,23 @@ mode to allow application to tune the per-socket maximum iteration for
 better throughput and less frequency of send syscall.
 Allowed range is [32, xs->tx->nentries].
 
+XDP_GENERIC_XMIT_BATCH
+----------------------
+
+It provides an option that allows application to use batch xmit in the copy
+mode. Batch process tries to allocate a certain number skbs through bulk
+mechanism first and then initialize them and finally send them out at one
+time.
+It applies efficient bulk allocation/deallocation function, avoid frequently
+grabbing/releasing a few locks (like cache lock and queue lock), minimizing
+triggering IRQs from the driver side, which generally gain the overall
+performance improvement as observed by xdpsock benchmark.
+Potential side effect is that it might increase the latency of per packet
+due to memory allocation that is unavoidable and time-consuming.
+Setting a relatively large value of batch size could benefit scenarios
+like bulk transmission. The maximum value shouldn't be larger than
+xs->max_tx_budget.
+
 XDP_STATISTICS getsockopt
 -------------------------
 
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 23e8861e8b25..965cab9a0465 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -45,6 +45,12 @@ struct xsk_map {
 	struct xdp_sock __rcu *xsk_map[];
 };
 
+struct xsk_batch {
+	u32 generic_xmit_batch;
+	struct sk_buff **skb_cache;
+	struct xdp_desc *desc_cache;
+};
+
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
@@ -89,6 +95,7 @@ struct xdp_sock {
 	struct mutex mutex;
 	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
 	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
+	struct xsk_batch batch;
 };
 
 /*
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 23a062781468..44cb72cd328e 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -80,6 +80,7 @@ struct xdp_mmap_offsets {
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
 #define XDP_MAX_TX_SKB_BUDGET		9
+#define XDP_GENERIC_XMIT_BATCH		10
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 6149f6a79897..6122db8606fe 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1218,6 +1218,16 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 	}
 }
 
+static void xsk_batch_reset(struct xsk_batch *batch, struct sk_buff **skbs,
+			    struct xdp_desc *descs, unsigned int size)
+{
+	kfree(batch->skb_cache);
+	kvfree(batch->desc_cache);
+	batch->skb_cache = skbs;
+	batch->desc_cache = descs;
+	batch->generic_xmit_batch = size;
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -1247,6 +1257,7 @@ static int xsk_release(struct socket *sock)
 	xskq_destroy(xs->tx);
 	xskq_destroy(xs->fq_tmp);
 	xskq_destroy(xs->cq_tmp);
+	xsk_batch_reset(&xs->batch, NULL, NULL, 0);
 
 	sock_orphan(sk);
 	sock->sk = NULL;
@@ -1588,6 +1599,42 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		WRITE_ONCE(xs->max_tx_budget, budget);
 		return 0;
 	}
+	case XDP_GENERIC_XMIT_BATCH:
+	{
+		struct xsk_buff_pool *pool = xs->pool;
+		struct xsk_batch *batch = &xs->batch;
+		struct xdp_desc *descs;
+		struct sk_buff **skbs;
+		unsigned int size;
+		int ret = 0;
+
+		if (optlen != sizeof(size))
+			return -EINVAL;
+		if (copy_from_sockptr(&size, optval, sizeof(size)))
+			return -EFAULT;
+		if (size == batch->generic_xmit_batch)
+			return 0;
+		if (!size || size > xs->max_tx_budget || !pool)
+			return -EACCES;
+
+		mutex_lock(&xs->mutex);
+		skbs = kmalloc(size * sizeof(struct sk_buff *), GFP_KERNEL);
+		if (!skbs) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		descs = kvcalloc(size, sizeof(struct xdp_desc), GFP_KERNEL);
+		if (!descs) {
+			kfree(skbs);
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		xsk_batch_reset(batch, skbs, descs, size);
+out:
+		mutex_unlock(&xs->mutex);
+		return ret;
+	}
 	default:
 		break;
 	}
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 23a062781468..44cb72cd328e 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -80,6 +80,7 @@ struct xdp_mmap_offsets {
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
 #define XDP_MAX_TX_SKB_BUDGET		9
+#define XDP_GENERIC_XMIT_BATCH		10
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 00/14] xsk: batch xmit in copy mode
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing

From: Jason Xing <kernelxing@tencent.com>

Greetings, everyone. This is the batch feature series. Even though
net-next is closed, I would appreciate any feedback and suggestions
on this! Many thanks!

Bottom line up front: it improves the performance by 88.2% stably.

# Background
This series is focused on the performance improvement in copy mode. As
observed in the physical servers, there is much room left to ramp up
the transmission for copy mode, compared to zerocopy mode.

Even though we can apply zerocopy to achieve a much better performance,
some limitations are still there especially for virtio and veth cases
due to the implementation in the host. In the real world, hundreds and
thousands of hosts like at Tencent still don't support zerocopy mode
for VMs, so copy mode is the only way we can resort to. Being general
is its strong advantage.

Zerocopy has a good function name xskq_cons_read_desc_batch() which
reads descriptors in batch and then sends them out at a time, rather
than just read and send the descriptor one by one in a loop. Similar
batch ideas can be seen from classic mechanisms like GSO/GRO which
also try to handle as many packets as they can at one time. So the
motivation and idea of the series actually originated from them.

# AF_PACKET Comparison
Looking back to the initial design and implementation of AF_XDP, it's
not hard to find the big difference it made is to speed up the
transmission when zerocopy mode is enabled. So the conclusion is that
zerocopy mode of AF_XDP outperforms AF_PACKET that still uses copy mode.
As to the whole logic of copy mode for both of them, they look quite
similar, especially when an application using AF_PACKET sets
PACKET_QDISC_BYPASS option. Digging into the details of AF_PACKET, we
can find the implementation is comparatively heavy which can also be
proved by the real test as shown below. The numbers of AF_PACKET test
are a little bit lower.

# Batch Mode
At the current moment, I consider copy mode of AF_XDP as a half bypass
mechanism to some extent in comparison with the well known bypass
mechanism like DPDK. To reduce the overhead in the kernel as much as
possible, batch xmit is proposed to aggregate descriptors in a
certain small group and then read/allocate/build/send them in individual
loops.

Applications are allowed to use setsockopt to enlarge the default value.
Please note that since memory allocation can be time consuming and heavy
due to lack of memory that results in complicated memory reclaim, it
might not be that good to hold one descriptor for too long, which brings
high latency for one skb.

# Experiments
Tested on ixgbe at 10Gb/sec with the following settings:
1. mitigations off
2. ethtool -G enp2s0f1 tx 512
3. sysctl -w net.core.skb_defer_max=0
4. sysctl -w net.core.wmem_max=21299200 and sndbuf is the same value
5. XDP_MAX_TX_SKB_BUDGET 512

taskset -c 1 ./xdpsock -i enp2s0f1 -t  -S -s 64

copy mode(before):          1,801,007 pps (baseline)
AF_PACKET:                  1,375,808 pps (-23.6%)
zc mode:                   13,333,593 pps (+640.3%)
batch mode(batch 1):        1,976,821 pps (+9.8%)
batch mode(batch 64):       3,389,704 pps (+88.2%)
batch mode(batch 256):      3,387,563 pps (+88.0%)

---
RFC v4
Link: https://lore.kernel.org/all/20251021131209.41491-1-kerneljasonxing@gmail.com/
1. fix a few bugs in v3
2. add a few optimizations
The series is built on top of commit 2ce8a41113ed (net: hsr: emit
notification for PRP slave2 changed hw addr on port deletion). Since the
changes compared to v3 are too many, please review the series from scratch.
Thanks!

v3
Link: https://lore.kernel.org/all/20250825135342.53110-1-kerneljasonxing@gmail.com/
1. I retested and got different test numbers. Previous test is not that
right because my env has two NUMA nodes and only the first one has a
faster speed.
2. To achieve a stable performance result, the development and
evaluation are also finished in physical servers just like the numbers
that I share.
3. I didn't use pool->tx_descs because sockets can share the same umem
pool.
4. Use skb list to chain the allocated and built skbs to send.
5. Add AF_PACKET test numbers.

V2
Link: https://lore.kernel.org/all/20250811131236.56206-1-kerneljasonxing@gmail.com/
1. add xmit.more sub-feature (Jesper)
2. add kmem_cache_alloc_bulk (Jesper and Maciej)

Jason Xing (14):
  xsk: introduce XDP_GENERIC_XMIT_BATCH setsockopt
  xsk: extend xsk_build_skb() to support passing an already allocated
    skb
  xsk: add xsk_alloc_batch_skb() to build skbs in batch
  xsk: cache data buffers to avoid frequently calling kmalloc_reserve
  xsk: add direct xmit in batch function
  xsk: support dynamic xmit.more control for batch xmit
  xsk: try to skip validating skb list in xmit path
  xsk: rename nb_pkts to nb_descs in xsk_tx_peek_release_desc_batch
  xsk: extend xskq_cons_read_desc_batch to count nb_pkts
  xsk: extend xsk_cq_reserve_locked() to reserve n slots
  xsk: support batch xmit main logic
  xsk: separate read-mostly and write-heavy fields in xsk_buff_pool
  xsk: retire old xmit path in copy mode
  xsk: optimize xsk_build_skb for batch copy-mode fast path

 Documentation/networking/af_xdp.rst |  17 ++
 include/net/xdp_sock.h              |  17 ++
 include/net/xsk_buff_pool.h         |  10 +-
 include/uapi/linux/if_xdp.h         |   1 +
 net/core/dev.c                      |  49 +++++
 net/core/skbuff.c                   | 152 +++++++++++++++
 net/xdp/xsk.c                       | 279 ++++++++++++++++++++--------
 net/xdp/xsk_queue.h                 |  40 +++-
 tools/include/uapi/linux/if_xdp.h   |   1 +
 9 files changed, 473 insertions(+), 93 deletions(-)

-- 
2.41.3


^ permalink raw reply

* Re: [PATCH net v1] net/mlx5: Fix HCA caps leak on notifier init failure
From: Cosmin Ratiu @ 2026-04-15  8:26 UTC (permalink / raw)
  To: Saeed Mahameed, prathameshdeshpande7@gmail.com, Carolina Jubran,
	leon@kernel.org
  Cc: Tariq Toukan, kuba@kernel.org, netdev@vger.kernel.org,
	linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260415005022.34764-1-prathameshdeshpande7@gmail.com>

On Wed, 2026-04-15 at 01:49 +0100, Prathamesh Deshpande wrote:
> mlx5_mdev_init() allocates HCA caps via mlx5_hca_caps_alloc() before
> calling mlx5_notifiers_init(). If notifier initialization fails, the
> error path jumps to err_hca_caps and skips mlx5_hca_caps_free(),
> leaking
> allocated caps.
> 
> Add a dedicated unwind label for notifier-init failure that frees HCA
> caps before continuing the existing cleanup sequence.
> 
> Fixes: b6b03097f982 ("net/mlx5: Initialize events outside devlink
> lock")

Thank you for the fix, LGTM!

> Signed-off-by: Prathamesh Deshpande <prathameshdeshpande7@gmail.com>

Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>

^ permalink raw reply

* Re: [PATCH net] net: pse-pd: fix out-of-bounds bitmap access in pse_isr() on 32-bit
From: Kory Maincent @ 2026-04-15  8:22 UTC (permalink / raw)
  To: Oleksij Rempel
  Cc: Jakub Kicinski, netdev, linux-kernel, Carlo Szelinsky,
	thomas.petazzoni, Andrew Lunn, David S. Miller, Eric Dumazet,
	Paolo Abeni
In-Reply-To: <ad6LqkKrXwNJfN0B@pengutronix.de>

On Tue, 14 Apr 2026 20:47:06 +0200
Oleksij Rempel <o.rempel@pengutronix.de> wrote:

> Hi Kory,
> 
> On Tue, Apr 14, 2026 at 05:13:30PM +0200, Kory Maincent wrote:
> > @@ -1340,6 +1341,11 @@ int devm_pse_irq_helper(struct pse_controller_dev
> > *pcdev, int irq, if (!h->notifs)
> >  		return -ENOMEM;
> >  
> > +	h->notifs_mask = devm_kcalloc(dev, BITS_TO_LONGS(pcdev->nr_lines),
> > +				      sizeof(*h->notifs_mask),
> > GFP_KERNEL);  
> 
> May be better devm_bitmap_zalloc() instead of devm_kcalloc()?

Oh didn't know this one. Yes that's better. I will send v2 with it.

Regards,
-- 
Köry Maincent, Bootlin
Embedded Linux and kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH net-next v3 2/2] e1000e: limit endianness conversion to boundary words
From: Dahan, AvigailX @ 2026-04-15  8:14 UTC (permalink / raw)
  To: Agalakov Daniil, Tony Nguyen
  Cc: Przemek Kitszel, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, intel-wired-lan, netdev,
	linux-kernel, lvc-project, Daniil Iskhakov, Roman Razov
In-Reply-To: <20260401120919.282668-3-ade@amicon.ru>



On 01/04/2026 15:08, Agalakov Daniil wrote:
> [Why]
> In e1000_set_eeprom(), the eeprom_buff is allocated to hold a range of
> words. However, only the boundary words (the first and the last) are
> populated from the EEPROM if the write request is not word-aligned.
> The words in the middle of the buffer remain uninitialized because they
> are intended to be completely overwritten by the new data via memcpy().
> 
> The previous implementation had a loop that performed le16_to_cpus()
> on the entire buffer. This resulted in endianness conversion being
> performed on uninitialized memory for all interior words.
> 
> Fix this by converting the endianness only for the boundary words
> immediately after they are successfully read from the EEPROM.
> 
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Co-developed-by: Iskhakov Daniil <dish@amicon.ru>
> Signed-off-by: Iskhakov Daniil <dish@amicon.ru>
> Signed-off-by: Agalakov Daniil <ade@amicon.ru>
> ---
> v3:
>   - Reverted to v1's "check-then-convert" logic: the return value of
>     e1000_read_nvm() is now checked before performing le16_to_cpus().
>   - Removed the redundant full-buffer loops that caused double endianness
>     conversion in v2.
> 
> v2:
>   - Split from the original bugfix series and targeted at 'net-next'.
>   - Removed the Fixes: tag; limiting the conversion scope is an
>     improvement to avoid unnecessary processing of uninitialized memory.
>   - Improved commit description for clarity.
>   - Note on e1000e: this driver already contains the necessary return
>     value checks for EEPROM reads, so only the endianness conversion
>     cleanup is included for e1000e.
> 
>   drivers/net/ethernet/intel/e1000e/ethtool.c | 19 ++++++++++++-------
>   1 file changed, 12 insertions(+), 7 deletions(-)
> 

Tested-by: Avigail Dahan <avigailx.dahan@intel.com>

^ permalink raw reply

* [PATCH net v3 5/5] net: mana: Fix EQ leak in mana_remove on NULL port
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260415080944.732901-1-ernis@linux.microsoft.com>

In mana_remove(), when a NULL port is encountered in the port iteration
loop, 'goto out' skips the mana_destroy_eq(ac) call, leaking the event
queues allocated earlier by mana_create_eq().

This can happen when mana_probe_port() fails for port 0, leaving
ac->ports[0] as NULL. On driver unload or error cleanup, mana_remove()
hits the NULL entry and jumps past mana_destroy_eq().

Change 'goto out' to 'break' so the for-loop exits normally and
mana_destroy_eq() is always reached. Remove the now-unreferenced out:
label.

Fixes: 1e2d0824a9c3 ("net: mana: Add support for EQ sharing")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v3:
* Update Fixes tag to appropriate commit id.
Changes in v2:
* Apply the patch in net instead of net-next.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 39b18577fb51..98e2fcc797ca 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3752,7 +3752,7 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		if (!ndev) {
 			if (i == 0)
 				dev_err(dev, "No net device to remove\n");
-			goto out;
+			break;
 		}
 
 		apc = netdev_priv(ndev);
@@ -3783,7 +3783,7 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	}
 
 	mana_destroy_eq(ac);
-out:
+
 	if (ac->per_port_queue_reset_wq) {
 		destroy_workqueue(ac->per_port_queue_reset_wq);
 		ac->per_port_queue_reset_wq = NULL;
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v3 4/5] net: mana: Don't overwrite port probe error with add_adev result
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260415080944.732901-1-ernis@linux.microsoft.com>

In mana_probe(), if mana_probe_port() fails for any port, the error
is stored in 'err' and the loop breaks. However, the subsequent
unconditional 'err = add_adev(gd, "eth")' overwrites this error.
If add_adev() succeeds, mana_probe() returns success despite ports
being left in a partially initialized state (ac->ports[i] == NULL).

Only call add_adev() when there is no prior error, so the probe
correctly fails and triggers mana_remove() cleanup.

Fixes: ced82fce77e9 ("net: mana: Probe rdma device in mana driver")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v3:
*  Fix inaccurate comments.
Changes in v2:
* Apply the patch in net instead of net-next.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ce1b7ec46a27..39b18577fb51 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3680,10 +3680,9 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 	if (!resuming) {
 		for (i = 0; i < ac->num_ports; i++) {
 			err = mana_probe_port(ac, i, &ac->ports[i]);
-			/* we log the port for which the probe failed and stop
-			 * probes for subsequent ports.
-			 * Note that we keep running ports, for which the probes
-			 * were successful, unless add_adev fails too
+			/* Log the port for which the probe failed, stop probing
+			 * subsequent ports, and skip add_adev.
+			 * mana_remove() will clean up already-probed ports.
 			 */
 			if (err) {
 				dev_err(dev, "Probe Failed for port %d\n", i);
@@ -3697,10 +3696,9 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 			enable_work(&apc->queue_reset_work);
 			err = mana_attach(ac->ports[i]);
 			rtnl_unlock();
-			/* we log the port for which the attach failed and stop
-			 * attach for subsequent ports
-			 * Note that we keep running ports, for which the attach
-			 * were successful, unless add_adev fails too
+			/* Log the port for which the attach failed, stop
+			 * attaching subsequent ports, and skip add_adev.
+			 * mana_remove() will clean up already-attached ports.
 			 */
 			if (err) {
 				dev_err(dev, "Attach Failed for port %d\n", i);
@@ -3709,7 +3707,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		}
 	}
 
-	err = add_adev(gd, "eth");
+	if (!err)
+		err = add_adev(gd, "eth");
 
 	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v3 3/5] net: mana: Guard mana_remove against double invocation
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260415080944.732901-1-ernis@linux.microsoft.com>

If PM resume fails (e.g., mana_attach() returns an error), mana_probe()
calls mana_remove(), which tears down the device and sets
gd->gdma_context = NULL and gd->driver_data = NULL.

However, a failed resume callback does not automatically unbind the
driver. When the device is eventually unbound, mana_remove() is invoked
a second time. Without a NULL check, it dereferences gc->dev with
gc == NULL, causing a kernel panic.

Add an early return if gdma_context or driver_data is NULL so the second
invocation is harmless. Move the dev = gc->dev assignment after the
guard so it cannot dereference NULL.

Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v3:
* Add this patch to the patchset
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 468ed60a8a00..ce1b7ec46a27 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3731,11 +3731,16 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 	struct gdma_context *gc = gd->gdma_context;
 	struct mana_context *ac = gd->driver_data;
 	struct mana_port_context *apc;
-	struct device *dev = gc->dev;
+	struct device *dev;
 	struct net_device *ndev;
 	int err;
 	int i;
 
+	if (!gc || !ac)
+		return;
+
+	dev = gc->dev;
+
 	disable_work_sync(&ac->link_change_work);
 	cancel_delayed_work_sync(&ac->gf_stats_work);
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v3 2/5] net: mana: Init gf_stats_work before potential error paths in probe
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260415080944.732901-1-ernis@linux.microsoft.com>

Move INIT_DELAYED_WORK(gf_stats_work) to before mana_create_eq(),
while keeping schedule_delayed_work() at its original location.

Previously, if any function between mana_create_eq() and the
INIT_DELAYED_WORK call failed, mana_probe() would call mana_remove()
which unconditionally calls cancel_delayed_work_sync(gf_stats_work)
on the still-uninitialized work struct. This can trigger a WARN_ON
in __flush_work() or debug object warnings with
CONFIG_DEBUG_OBJECTS_WORK enabled.

Fixes: be4f1d67ec56 ("net: mana: Add standard counter rx_missed_errors")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v3:
* No change
Changes in v2:
* Apply the patch in net instead of net-next.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index e3e4b6de6668..468ed60a8a00 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3635,6 +3635,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		INIT_WORK(&ac->link_change_work, mana_link_state_handle);
 	}
 
+	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
+
 	err = mana_create_eq(ac);
 	if (err) {
 		dev_err(dev, "Failed to create EQs: %d\n", err);
@@ -3709,7 +3711,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	err = add_adev(gd, "eth");
 
-	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 	schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
 
 out:
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v3 1/5] net: mana: Init link_change_work before potential error paths in probe
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260415080944.732901-1-ernis@linux.microsoft.com>

Move INIT_WORK(link_change_work) to right after the mana_context
allocation, before any error path that could reach mana_remove().

Previously, if mana_create_eq() or mana_query_device_cfg() failed,
mana_probe() would jump to the error path which calls mana_remove().
mana_remove() unconditionally calls disable_work_sync(link_change_work),
but the work struct had not been initialized yet. This can trigger
a WARN_ON in __flush_work() or debug object warnings with
CONFIG_DEBUG_OBJECTS_WORK enabled.

Fixes: 54133f9b4b53 ("net: mana: Support HW link state events")
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v3:
* No change.
Changes in v2:
* Apply the patch in net instead of net-next.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 6302432b9bf6..e3e4b6de6668 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3631,6 +3631,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 		ac->gdma_dev = gd;
 		gd->driver_data = ac;
+
+		INIT_WORK(&ac->link_change_work, mana_link_state_handle);
 	}
 
 	err = mana_create_eq(ac);
@@ -3648,8 +3650,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	if (!resuming) {
 		ac->num_ports = num_ports;
-
-		INIT_WORK(&ac->link_change_work, mana_link_state_handle);
 	} else {
 		if (ac->num_ports != num_ports) {
 			dev_err(dev, "The number of vPorts changed: %d->%d\n",
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v3 0/5] net: mana: Fix probe/remove error path bugs
From: Erni Sri Satya Vennela @ 2026-04-15  8:09 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel

Fix five bugs in mana_probe()/mana_remove() error handling that can
cause warnings on uninitialized work structs, NULL pointer dereferences,
masked errors, and resource leaks when early probe steps fail.

Patches 1-2 move work struct initialization (link_change_work and
gf_stats_work) to before any error path that could trigger
mana_remove(), preventing WARN_ON in __flush_work() or debug object
warnings when sync cancellation runs on uninitialized work structs.

Patch 3 guards mana_remove() against double invocation. If PM resume
fails, mana_probe() calls mana_remove() which sets gdma_context and
driver_data to NULL. A failed resume does not unbind the driver, so
when the device is eventually unbound, mana_remove() is called again
and dereferences NULL, causing a kernel panic. An early return on
NULL gdma_context or driver_data makes the second call harmless.

Patch 4 prevents add_adev() from overwriting a port probe error,
which could leave the driver in a broken state with NULL ports while
reporting success.

Patch 5 changes 'goto out' to 'break' in mana_remove()'s port loop
so that mana_destroy_eq() is always reached, preventing EQ leaks when
a NULL port is encountered.
---
Changes in v3:
* Add patch 3: net: mana: Guard mana_remove against double invocation.
* Fix inaccurate comments.
* Correct Fixes tag from ca9c54d2d6a5 to 1e2d0824a9c3.
Changes in v2:
* Apply the patchset in net instead of net-next.
---
Erni Sri Satya Vennela (5):
  net: mana: Init link_change_work before potential error paths in probe
  net: mana: Init gf_stats_work before potential error paths in probe
  net: mana: Guard mana_remove against double invocation
  net: mana: Don't overwrite port probe error with add_adev result
  net: mana: Fix EQ leak in mana_remove on NULL port

 drivers/net/ethernet/microsoft/mana/mana_en.c | 35 +++++++++++--------
 1 file changed, 20 insertions(+), 15 deletions(-)

-- 
2.34.1


^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH iwl-next] e1000e: use ktime_get_real_ns() in e1000e_systim_reset()
From: Dahan, AvigailX @ 2026-04-15  8:04 UTC (permalink / raw)
  To: Aleksandr Loktionov, intel-wired-lan, anthony.l.nguyen
  Cc: netdev, Jacob Keller, Simon Horman
In-Reply-To: <20260408083644.1621317-1-aleksandr.loktionov@intel.com>



On 08/04/2026 11:36, Aleksandr Loktionov wrote:
> Replace ktime_to_ns(ktime_get_real()) with the direct equivalent
> ktime_get_real_ns() in e1000e_systim_reset().  Using the combined helper
> avoids the unnecessary intermediate ktime_t variable and makes the
> intent clearer.
> 
> Suggested-by: Jacob Keller <jacob.e.keller@intel.com>
> Suggested-by: Simon Horman <horms@kernel.org>
> Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> ---
>   drivers/net/ethernet/intel/e1000e/netdev.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 

Tested-by: Avigail Dahan <avigailx.dahan@intel.com>

^ permalink raw reply

* Re: [PATCH v3 net] ax25: fix OOB read after address header strip in ax25_rcv()
From: David Laight @ 2026-04-15  7:59 UTC (permalink / raw)
  To: Ashutosh Desai
  Cc: netdev, linux-hams, jreuter, davem, edumazet, kuba, pabeni, horms,
	stable, linux-kernel
In-Reply-To: <20260415063654.3831353-1-ashutoshdesai993@gmail.com>

On Wed, 15 Apr 2026 06:36:54 +0000
Ashutosh Desai <ashutoshdesai993@gmail.com> wrote:

> A remote station can send a crafted KISS frame that is just long enough
> to pass ax25_addr_parse() (minimum 14 address bytes) but carries no
> control or PID bytes. After ax25_kiss_rcv() strips the KISS framing
> byte and ax25_rcv() strips the address header with skb_pull(), skb->len
> drops to zero. The subsequent reads of skb->data[0] (control byte) and
> skb->data[1] (PID byte) are then out of bounds, which can crash the
> kernel or leak heap memory to a remote attacker.
> 
> Use pskb_may_pull(skb, 2) after the skb_pull() to ensure both bytes
> are in the linear area before reading them. Discard malformed frames
> that carry no control/PID pair.

Is it just worth linearising the skb on entry to all this code?
I believe all the frames are relatively short and low frequency.
So the actual overhead is insignificant, but it makes all the sanity
checks trivial.
It is even likely (hand waving) that the extra copy for non-linear data
is faster than all the checks for non-linear data.

	David


^ permalink raw reply

* Re: [RFC PATCH 1/2] kernel/notifier: replace single-linked list with double-linked list for reverse traversal
From: Christoph Hellwig @ 2026-04-15  7:40 UTC (permalink / raw)
  To: chensong_2000
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260415070137.17860-1-chensong_2000@189.cn>

On Wed, Apr 15, 2026 at 03:01:37PM +0800, chensong_2000@189.cn wrote:
> diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
> index 132a9df98471..b776dbd5a382 100644
> --- a/drivers/acpi/sleep.c
> +++ b/drivers/acpi/sleep.c
> @@ -56,7 +56,6 @@ static int tts_notify_reboot(struct notifier_block *this,
>  
>  static struct notifier_block tts_notifier = {
>  	.notifier_call	= tts_notify_reboot,
> -	.next		= NULL,
>  	.priority	= 0,

IFF this becomes important for some reason (and right now I don't see
it), please start by using proper wrappers for notifiers so that the
implementation details don't leak into the users.  That would actually
be useful on its own even.


^ permalink raw reply

* Re: [RFC PATCH 0/2] Decouple ftrace/livepatch from module loader via notifier priority and reverse traversal
From: Christoph Hellwig @ 2026-04-15  7:38 UTC (permalink / raw)
  To: chensong_2000
  Cc: rafael, lenb, mturquette, sboyd, viresh.kumar, agk, snitzer,
	mpatocka, bmarzins, song, yukuai, linan122, jason.wessel, danielt,
	dianders, horms, davem, edumazet, kuba, pabeni, paulmck, frederic,
	mcgrof, petr.pavlu, da.gomez, samitolvanen, atomlin, jpoimboe,
	jikos, mbenes, pmladek, joe.lawrence, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers, linux-modules, linux-kernel,
	linux-trace-kernel, linux-acpi, linux-clk, linux-pm,
	live-patching, dm-devel, linux-raid, kgdb-bugreport, netdev
In-Reply-To: <20260413080140.180616-1-chensong_2000@189.cn>

On Mon, Apr 13, 2026 at 04:01:40PM +0800, chensong_2000@189.cn wrote:
> From: Song Chen <chensong_2000@189.cn>
> 
> This patchset addresses a long-standing tight coupling between the
> module loader and two of its key consumers: ftrace and livepatch.
> 
> Background:
> 
> The module loader currently hard-codes direct calls to
> ftrace_module_enable(), klp_module_coming(), klp_module_going() and
> ftrace_release_mod() inside prepare_coming_module() and the module
> unload path.

And that is bad why?

>  13 files changed, 290 insertions(+), 74 deletions(-)

This is a lot of new complex code touching a lot of places for no obvious
gain.  What is the reason for this series?  Does it prepare for something
else?


^ permalink raw reply

* Re: [PATCH net-next v9 04/10] net: phy: Create SFP phy_port before registering upstream
From: Maxime Chevallier @ 2026-04-15  7:19 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: davem, Jakub Kicinski, Eric Dumazet, Paolo Abeni, Russell King,
	Heiner Kallweit, netdev, linux-kernel, thomas.petazzoni,
	Christophe Leroy, Herve Codina, Florian Fainelli, Vladimir Oltean,
	Köry Maincent, Marek Behún, Oleksij Rempel,
	Nicolò Veronese, Simon Horman, mwojtas, Romain Gantois,
	Daniel Golle, Dimitri Fedrau
In-Reply-To: <2bdd6224-6acf-49db-955a-5552e64d4b73@lunn.ch>

Hi Andrew,

On 15/04/2026 01:46, Andrew Lunn wrote:
> On Fri, Apr 03, 2026 at 02:37:48PM +0200, Maxime Chevallier wrote:
>> When dealing with PHY-driven SFP, we create a phy_port representing the
>> SFP bus when we know we have such a bus.
> 
> I'm missing the big picture here.
> 
> Do we have three different things represented in the topology:
> 
> SFP bus-> SFP cage-> SFP module
> 
> 	Andrew

Ah my bad, this is a wording issue; this is indeed the port for the cage.

The model I ended up with is to represent the SFP cage itself as a PHY
port, but I've been calling that the "sfp bus port" in the code so far :/

Let me try to summarize the phy_port usage, especially with SFP :


         phylink->sfp_bus_port (should be renamed sfp_cage_port)
          vacant = true
              |
              v
+-----+   +----------+
| MAC | - | SFP cage |
+-----+   +----------+


           phylink->sfp_bus_port, vacant = false
                |
                |             phylink->mod_port
                v                v
+-----+     +----------+     +------------+
| MAC | --- | SFP cage | --- | SFP Module |
+-----+     +----------+     +------------+


                    one of phydev->ports
                           v
+-----+     +-----+     +------+
| MAC | --- | PHY | --- | RJ45 |
+-----+     +-----+     +------+


                      one of phydev->ports
                             v
                          +------+
+-----+     +-----+   /-- | RJ45 |
| MAC | --- | PHY | -+    +------+
+-----+     |     | -+    +----------+
            +-----+   \-- | SFP Cage |
                          +----------+
                             ^
                      phydev->sfp_bus_port +
                   is also in phydev->ports
                     port->vacant = true


                      one of phydev->ports
                             v
                          +------+
+-----+     +-----+   /-- | RJ45 |         phydev->mod_port
| MAC | --- | PHY | -+    +------+              v
+-----+     |     | -+    +----------+     +------------+
            +-----+   \-- | SFP Cage | --- | SFP Module |
                          +----------+     +------------+
                             ^
                      phydev->sfp_bus_port +
                   is also in phydev->ports
                     port->vacant = false


I should probably add that at least to the cover letter. Does it clarify the
picture a little bit?

Thanks a lot for the reviews :)

Maxime

^ permalink raw reply

* Re: [PATCH net v2 3/4] net: mana: Don't overwrite port probe error with add_adev result
From: Erni Sri Satya Vennela @ 2026-04-15  7:04 UTC (permalink / raw)
  To: Simon Horman
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ssengar, dipayanroy, gargaditya,
	shirazsaleem, kees, kotaranov, leon, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260414153503.769476-1-horms@kernel.org>

On Tue, Apr 14, 2026 at 04:35:03PM +0100, Simon Horman wrote:
> From: 'Simon Horman' <horms@kernel.org>
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev
> ---
> net: mana: Don't overwrite port probe error with add_adev result
> 
> This commit ensures that if mana_probe_port() or mana_attach() fails, the
> error is not overwritten by the result of add_adev(). By preserving the
> error, it triggers mana_remove() cleanup instead of leaving ports partially
> initialized.
> 
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 9d095a6fb56c7..4d42cbfb3f15b 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -3634,10 +3634,9 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
> >  	if (!resuming) {
> >  		for (i = 0; i < ac->num_ports; i++) {
> >  			err = mana_probe_port(ac, i, &ac->ports[i]);
> > -			/* we log the port for which the probe failed and stop
> > -			 * probes for subsequent ports.
> > -			 * Note that we keep running ports, for which the probes
> > -			 * were successful, unless add_adev fails too
> > +			/* Log the port for which the probe failed, stop probing
> > +			 * subsequent ports, and skip add_adev.
> > +			 * Already-probed ports remain functional.
> >  			 */
> 
> Are these comments accurate regarding ports remaining functional?
> 
> Since this patch preserves the error code, execution falls through to the end
> of mana_probe() where it checks if err is set and calls mana_remove().
> mana_remove() will iterate over the successfully allocated ports, completely
> detach them, unregister their net_devices, and destroy event queues.
> 
> It appears no ports remain functional because the device is completely torn
> down.
>
I will update the patch with the updated comment message. 
> [ ... ]
> 
> > @@ -3651,10 +3650,9 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
> >  			enable_work(&apc->queue_reset_work);
> >  			err = mana_attach(ac->ports[i]);
> >  			rtnl_unlock();
> > -			/* we log the port for which the attach failed and stop
> > -			 * attach for subsequent ports
> > -			 * Note that we keep running ports, for which the attach
> > -			 * were successful, unless add_adev fails too
> > +			/* Log the port for which the attach failed, stop
> > +			 * attaching subsequent ports, and skip add_adev.
> > +			 * Already-attached ports remain functional.
> >  			 */
> >  			if (err) {
> >  				dev_err(dev, "Attach Failed for port %d\n", i);
> > @@ -3663,7 +3661,8 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
> 
> Does propagating this error during PM resume introduce a NULL pointer
> dereference?
> 
> If PM resume fails here (resuming == true), the preserved error code causes
> mana_probe() to call mana_remove(), which tears down the device and sets
> gd->gdma_context = NULL.
> 
> However, a failed resume callback does not automatically unbind the driver.
> When the device is eventually unbound later, mana_remove() is invoked again:
> 
> drivers/net/ethernet/microsoft/mana/mana_en.c:mana_remove() {
>     struct gdma_context *gc = gd->gdma_context;
>     struct device *dev = gc->dev;
>     ...
> }
> 
> In the second invocation, gd->gdma_context is NULL, meaning gc is NULL,
> which could lead to a kernel panic when dereferencing gc->dev.

Thank you for pointing it out, Simon.
Since this is a pre-existing bug, I will create a separate patch for
this change and make it part of this patchset.

- Vennela

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox