[PATCH RFC net-next 0/2] xsk: fix immature cq descriptor production (II)

bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH RFC net-next 0/2] xsk: fix immature cq descriptor production (II)
@ 2025-10-31  9:32 Jason Xing
  2025-10-31  9:32 ` [PATCH RFC net-next 1/2] Revert "xsk: Fix immature cq descriptor production" Jason Xing
  2025-10-31  9:32 ` [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs Jason Xing
  0 siblings, 2 replies; 12+ messages in thread
From: Jason Xing @ 2025-10-31  9:32 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate
  Cc: bpf, netdev, Jason Xing

From: Jason Xing <kernelxing@tencent.com>

This series was made based on the previous work[1] to fix the issue
without causing too much performance impact.

[1]: commit 30f241fcf52a ("xsk: Fix immature cq descriptor production")

Jason Xing (2):
  Revert "xsk: Fix immature cq descriptor production"
  xsk: introduce a cached cq to temporarily store descriptor addrs

 include/net/xdp_sock.h      |   1 +
 include/net/xsk_buff_pool.h |   1 +
 net/xdp/xsk.c               | 182 ++++++++++++++++--------------------
 net/xdp/xsk_buff_pool.c     |   1 +
 net/xdp/xsk_queue.h         |  12 ---
 5 files changed, 84 insertions(+), 113 deletions(-)

-- 
2.41.3


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH RFC net-next 1/2] Revert "xsk: Fix immature cq descriptor production"
  2025-10-31  9:32 [PATCH RFC net-next 0/2] xsk: fix immature cq descriptor production (II) Jason Xing
@ 2025-10-31  9:32 ` Jason Xing
  2025-10-31  9:32 ` [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs Jason Xing
  1 sibling, 0 replies; 12+ messages in thread
From: Jason Xing @ 2025-10-31  9:32 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate
  Cc: bpf, netdev, Jason Xing

From: Jason Xing <kernelxing@tencent.com>

This patch was made manually to revert previous fix, in order to
easily implement a new logic based on the previous code base.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk.c       | 118 ++++++++------------------------------------
 net/xdp/xsk_queue.h |  12 -----
 2 files changed, 20 insertions(+), 110 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..05010c1bbfbd 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,20 +36,6 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET 32
 
-struct xsk_addr_node {
-	u64 addr;
-	struct list_head addr_node;
-};
-
-struct xsk_addr_head {
-	u32 num_descs;
-	struct list_head addrs_list;
-};
-
-static struct kmem_cache *xsk_tx_generic_cache;
-
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -546,43 +532,24 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
+static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve(pool->cq);
+	ret = xskq_prod_reserve_addr(pool->cq, addr);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
-				      struct sk_buff *skb)
+static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
 {
-	struct xsk_addr_node *pos, *tmp;
-	u32 descs_processed = 0;
 	unsigned long flags;
-	u32 idx;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	idx = xskq_get_prod(pool->cq);
-
-	xskq_prod_write_addr(pool->cq, idx,
-			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
-	descs_processed++;
-
-	if (unlikely(XSKCB(skb)->num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
-			xskq_prod_write_addr(pool->cq, idx + descs_processed,
-					     pos->addr);
-			descs_processed++;
-			list_del(&pos->addr_node);
-			kmem_cache_free(xsk_tx_generic_cache, pos);
-		}
-	}
-	xskq_prod_submit_n(pool->cq, descs_processed);
+	xskq_prod_submit_n(pool->cq, n);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
@@ -595,14 +562,9 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static void xsk_inc_num_desc(struct sk_buff *skb)
-{
-	XSKCB(skb)->num_descs++;
-}
-
 static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	return XSKCB(skb)->num_descs;
+	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
 }
 
 static void xsk_destruct_skb(struct sk_buff *skb)
@@ -614,38 +576,31 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
+	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
 	sock_wfree(skb);
 }
 
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
-			      u64 addr)
+static void xsk_set_destructor_arg(struct sk_buff *skb)
+{
+	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+
+	skb_shinfo(skb)->destructor_arg = (void *)num;
+}
+
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs)
 {
-	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
-	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
-	XSKCB(skb)->num_descs = 0;
 	skb->destructor = xsk_destruct_skb;
-	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
 	struct xdp_sock *xs = xdp_sk(skb->sk);
-	u32 num_descs = xsk_get_num_desc(skb);
-	struct xsk_addr_node *pos, *tmp;
-
-	if (unlikely(num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
-			list_del(&pos->addr_node);
-			kmem_cache_free(xsk_tx_generic_cache, pos);
-		}
-	}
 
 	skb->destructor = sock_wfree;
-	xsk_cq_cancel_locked(xs->pool, num_descs);
+	xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
 	/* Free skb without triggering the perf drop trace */
 	consume_skb(skb);
 	xs->skb = NULL;
@@ -701,7 +656,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
-	struct xsk_addr_node *xsk_addr;
 	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
@@ -720,23 +674,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		skb_reserve(skb, hr);
 
-		xsk_skb_init_misc(skb, xs, desc->addr);
+		xsk_skb_init_misc(skb, xs);
 		if (desc->options & XDP_TX_METADATA) {
 			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
 			if (unlikely(err))
 				return ERR_PTR(err);
 		}
-	} else {
-		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-		if (!xsk_addr)
-			return ERR_PTR(-ENOMEM);
-
-		/* in case of -EOVERFLOW that could happen below,
-		 * xsk_consume_skb() will release this node as whole skb
-		 * would be dropped, which implies freeing all list elements
-		 */
-		xsk_addr->addr = desc->addr;
-		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
 	}
 
 	len = desc->len;
@@ -804,7 +747,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			if (unlikely(err))
 				goto free_err;
 
-			xsk_skb_init_misc(skb, xs, desc->addr);
+			xsk_skb_init_misc(skb, xs);
 			if (desc->options & XDP_TX_METADATA) {
 				err = xsk_skb_metadata(skb, buffer, desc,
 						       xs->pool, hr);
@@ -813,7 +756,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			}
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
-			struct xsk_addr_node *xsk_addr;
 			struct page *page;
 			u8 *vaddr;
 
@@ -828,26 +770,16 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				goto free_err;
 			}
 
-			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-			if (!xsk_addr) {
-				__free_page(page);
-				err = -ENOMEM;
-				goto free_err;
-			}
-
 			vaddr = kmap_local_page(page);
 			memcpy(vaddr, buffer, len);
 			kunmap_local(vaddr);
 
 			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
 			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
-
-			xsk_addr->addr = desc->addr;
-			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
 		}
 	}
 
-	xsk_inc_num_desc(skb);
+	xsk_set_destructor_arg(skb);
 
 	return skb;
 
@@ -857,7 +789,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_inc_num_desc(xs->skb);
+		xsk_set_destructor_arg(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
@@ -900,7 +832,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_locked(xs->pool);
+		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
@@ -1903,18 +1835,8 @@ static int __init xsk_init(void)
 	if (err)
 		goto out_pernet;
 
-	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
-						 sizeof(struct xsk_addr_node),
-						 0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!xsk_tx_generic_cache) {
-		err = -ENOMEM;
-		goto out_unreg_notif;
-	}
-
 	return 0;
 
-out_unreg_notif:
-	unregister_netdevice_notifier(&xsk_netdev_notifier);
 out_pernet:
 	unregister_pernet_subsys(&xsk_net_ops);
 out_sk:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 1eb8d9f8b104..23c02066caaf 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -369,11 +369,6 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 
 /* Functions for producers */
 
-static inline u32 xskq_get_prod(struct xsk_queue *q)
-{
-	return READ_ONCE(q->ring->producer);
-}
-
 static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
 {
 	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -420,13 +415,6 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
-static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
-{
-	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-
-	ring->desc[idx & q->ring_mask] = addr;
-}
-
 static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
 					      u32 nb_entries)
 {
-- 
2.41.3


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-10-31  9:32 [PATCH RFC net-next 0/2] xsk: fix immature cq descriptor production (II) Jason Xing
  2025-10-31  9:32 ` [PATCH RFC net-next 1/2] Revert "xsk: Fix immature cq descriptor production" Jason Xing
@ 2025-10-31  9:32 ` Jason Xing
  2025-10-31 14:02   ` Maciej Fijalkowski
  1 sibling, 1 reply; 12+ messages in thread
From: Jason Xing @ 2025-10-31  9:32 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate
  Cc: bpf, netdev, Jason Xing

From: Jason Xing <kernelxing@tencent.com>

Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), there is one issue[1] which causes the wrong publish
of descriptors in race condidtion. The above commit fixes the issue
but adds more memory operations in the xmit hot path and interrupt
context, which can cause side effect in performance.

This patch tries to propose a new solution to fix the problem
without manipulating the allocation and deallocation of memory. One
of the key points is that I borrowed the idea from the above commit
that postpones updating the ring->descs in xsk_destruct_skb()
instead of in __xsk_generic_xmit().

The core logics are as show below:
1. allocate a new local queue. Only its cached_prod member is used.
2. write the descriptors into the local queue in the xmit path. And
   record the cached_prod as @start_addr that reflects the
   start position of this queue so that later the skb can easily
   find where its addrs are written in the destruction phase.
3. initialize the upper 24 bits of destructor_arg to store @start_addr
   in xsk_skb_init_misc().
4. Initialize the lower 8 bits of destructor_arg to store how many
   descriptors the skb owns in xsk_update_num_desc().
5. write the desc addr(s) from the @start_addr from the cached cq
   one by one into the real cq in xsk_destruct_skb(). In turn sync
   the global state of the cq.

The format of destructor_arg is designed as:
 ------------------------ --------
|       start_addr       |  num   |
 ------------------------ --------
Using upper 24 bits is enough to keep the temporary descriptors. And
it's also enough to use lower 8 bits to show the number of descriptors
that one skb owns.

[1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
I posted the series as an RFC because I'd like to hear more opinions on
the current rought approach so that the fix[2] can be avoided and
mitigate the impact of performance. This patch might have bugs because
I decided to spend more time on it after we come to an agreement. Please
review the overall concepts. Thanks!

Maciej, could you share with me the way you tested jumbo frame? I used
./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
nic more than 90%, which means I cannot see the performance impact.

[2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
---
 include/net/xdp_sock.h      |   1 +
 include/net/xsk_buff_pool.h |   1 +
 net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
 net/xdp/xsk_buff_pool.c     |   1 +
 4 files changed, 84 insertions(+), 23 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce587a225661..0d90d97e0a62 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -89,6 +89,7 @@ struct xdp_sock {
 	struct mutex mutex;
 	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
 	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
+	struct xsk_queue *cached_cq;
 };
 
 /*
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index cac56e6b0869..b52491f93c7d 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -63,6 +63,7 @@ struct xsk_buff_pool {
 	/* Data path members as close to free_heads at the end as possible. */
 	struct xsk_queue *fq ____cacheline_aligned_in_smp;
 	struct xsk_queue *cq;
+	struct xsk_queue *cached_cq;
 	/* For performance reasons, each buff pool has its own array of dma_pages
 	 * even when they are identical.
 	 */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 05010c1bbfbd..3951c2bc9d97 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -532,25 +532,33 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr,
+				      u32 *start_addr)
 {
+	struct xdp_umem_ring *ring;
 	unsigned long flags;
 	int ret;
 
+	ring = (struct xdp_umem_ring *)pool->cached_cq->ring;
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve_addr(pool->cq, addr);
+	ret = xskq_prod_reserve(pool->cq);
+	if (!ret) {
+		*start_addr = pool->cached_cq->cached_prod++ & pool->cq->ring_mask;
+		ring->desc[*start_addr] = addr;
+	}
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_write_addr(struct xsk_buff_pool *pool, u32 addr)
 {
-	unsigned long flags;
+	struct xsk_queue *cq = pool->cq;
+	struct xsk_queue *ccq = pool->cached_cq;
+	struct xdp_umem_ring * ccqr = (struct xdp_umem_ring *)ccq->ring;
+	struct xdp_umem_ring * cqr = (struct xdp_umem_ring *)cq->ring;
 
-	spin_lock_irqsave(&pool->cq_lock, flags);
-	xskq_prod_submit_n(pool->cq, n);
-	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	cqr->desc[cq->cached_prod++ & cq->ring_mask] = ccqr->desc[addr++];
 }
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,9 +570,41 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static u32 xsk_get_num_desc(struct sk_buff *skb)
+#define XSK_DESTRUCTOR_DESCS_SHIFT 8
+#define XSK_DESTRUCTOR_DESCS_MASK \
+	((1ULL << XSK_DESTRUCTOR_DESCS_SHIFT) - 1)
+
+static u8 xsk_get_num_desc(struct sk_buff *skb)
+{
+	long val = (long)skb_shinfo(skb)->destructor_arg;
+
+	return (u8)val & XSK_DESTRUCTOR_DESCS_MASK;
+}
+
+static u32 xsk_get_start_addr(struct sk_buff *skb)
 {
-	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+	long val = (long)skb_shinfo(skb)->destructor_arg;
+
+	return val >> XSK_DESTRUCTOR_DESCS_SHIFT;
+}
+
+static long xsk_get_destructor_arg(struct sk_buff *skb)
+{
+	return (long)skb_shinfo(skb)->destructor_arg;
+}
+
+static void xsk_cq_submit_locked(struct sk_buff *skb)
+{
+	struct xsk_buff_pool *pool = xdp_sk(skb->sk)->pool;
+	u32 addr = xsk_get_start_addr(skb);
+	u8 num = xsk_get_num_desc(skb), i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->cq_lock, flags);
+	for (i = 0; i < num; i++)
+		xsk_cq_write_addr(pool, addr);
+	xskq_prod_submit_n(pool->cq, num);
+	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
 static void xsk_destruct_skb(struct sk_buff *skb)
@@ -576,23 +616,30 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+	xsk_cq_submit_locked(skb);
 	sock_wfree(skb);
 }
 
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static void xsk_update_num_desc(struct sk_buff *skb)
 {
-	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+	long val = xsk_get_destructor_arg(skb);
+	u8 num = xsk_get_num_desc(skb) + 1;
 
-	skb_shinfo(skb)->destructor_arg = (void *)num;
+	val = (val & ~(XSK_DESTRUCTOR_DESCS_MASK)) | num;
+	skb_shinfo(skb)->destructor_arg = (void *)val;
 }
 
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+			      u32 start_addr)
 {
+	long val;
+
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
+	val = start_addr << XSK_DESTRUCTOR_DESCS_SHIFT;
+	skb_shinfo(skb)->destructor_arg = (void *)val;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
@@ -652,7 +699,8 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
 }
 
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
-					      struct xdp_desc *desc)
+					      struct xdp_desc *desc,
+					      u32 start_addr)
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
@@ -674,7 +722,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		skb_reserve(skb, hr);
 
-		xsk_skb_init_misc(skb, xs);
+		xsk_skb_init_misc(skb, xs, start_addr);
 		if (desc->options & XDP_TX_METADATA) {
 			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
 			if (unlikely(err))
@@ -713,14 +761,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 }
 
 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
-				     struct xdp_desc *desc)
+				     struct xdp_desc *desc,
+				     u32 start_addr)
 {
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
-		skb = xsk_build_skb_zerocopy(xs, desc);
+		skb = xsk_build_skb_zerocopy(xs, desc, start_addr);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
 			skb = NULL;
@@ -747,7 +796,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			if (unlikely(err))
 				goto free_err;
 
-			xsk_skb_init_misc(skb, xs);
+			xsk_skb_init_misc(skb, xs, start_addr);
 			if (desc->options & XDP_TX_METADATA) {
 				err = xsk_skb_metadata(skb, buffer, desc,
 						       xs->pool, hr);
@@ -779,7 +828,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		}
 	}
 
-	xsk_set_destructor_arg(skb);
+	xsk_update_num_desc(skb);
 
 	return skb;
 
@@ -789,7 +838,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_set_destructor_arg(xs->skb);
+		xsk_update_num_desc(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
@@ -822,6 +871,8 @@ static int __xsk_generic_xmit(struct sock *sk)
 
 	max_batch = READ_ONCE(xs->max_tx_budget);
 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
+		u32 start_addr;
+
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
@@ -832,13 +883,14 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr,
+						 &start_addr);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		skb = xsk_build_skb(xs, &desc);
+		skb = xsk_build_skb(xs, &desc, start_addr);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
 			if (err != -EOVERFLOW)
@@ -1142,6 +1194,7 @@ static int xsk_release(struct socket *sock)
 	xskq_destroy(xs->tx);
 	xskq_destroy(xs->fq_tmp);
 	xskq_destroy(xs->cq_tmp);
+	xskq_destroy(xs->cached_cq);
 
 	sock_orphan(sk);
 	sock->sk = NULL;
@@ -1321,6 +1374,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
 	xs->fq_tmp = NULL;
 	xs->cq_tmp = NULL;
+	xs->cached_cq = NULL;
 
 	xs->dev = dev;
 	xs->zc = xs->umem->zc;
@@ -1458,6 +1512,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
 			&xs->cq_tmp;
 		err = xsk_init_queue(entries, q, true);
+		if (!err && optname == XDP_UMEM_COMPLETION_RING) {
+			q = &xs->cached_cq;
+			err = xsk_init_queue(entries, q, true);
+		}
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index aa9788f20d0d..6e170107dec7 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 
 	pool->fq = xs->fq_tmp;
 	pool->cq = xs->cq_tmp;
+	pool->cached_cq = xs->cached_cq;
 
 	for (i = 0; i < pool->free_heads_cnt; i++) {
 		xskb = &pool->heads[i];
-- 
2.41.3


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-10-31  9:32 ` [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs Jason Xing
@ 2025-10-31 14:02   ` Maciej Fijalkowski
  2025-10-31 23:59     ` Jason Xing
  0 siblings, 1 reply; 12+ messages in thread
From: Maciej Fijalkowski @ 2025-10-31 14:02 UTC (permalink / raw)
  To: Jason Xing
  Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	jonathan.lemon, sdf, ast, daniel, hawk, john.fastabend, joe,
	willemdebruijn.kernel, fmancera, csmate, bpf, netdev, Jason Xing

On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
> 
> Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), there is one issue[1] which causes the wrong publish
> of descriptors in race condidtion. The above commit fixes the issue
> but adds more memory operations in the xmit hot path and interrupt
> context, which can cause side effect in performance.
> 
> This patch tries to propose a new solution to fix the problem
> without manipulating the allocation and deallocation of memory. One
> of the key points is that I borrowed the idea from the above commit
> that postpones updating the ring->descs in xsk_destruct_skb()
> instead of in __xsk_generic_xmit().
> 
> The core logics are as show below:
> 1. allocate a new local queue. Only its cached_prod member is used.
> 2. write the descriptors into the local queue in the xmit path. And
>    record the cached_prod as @start_addr that reflects the
>    start position of this queue so that later the skb can easily
>    find where its addrs are written in the destruction phase.
> 3. initialize the upper 24 bits of destructor_arg to store @start_addr
>    in xsk_skb_init_misc().
> 4. Initialize the lower 8 bits of destructor_arg to store how many
>    descriptors the skb owns in xsk_update_num_desc().
> 5. write the desc addr(s) from the @start_addr from the cached cq
>    one by one into the real cq in xsk_destruct_skb(). In turn sync
>    the global state of the cq.
> 
> The format of destructor_arg is designed as:
>  ------------------------ --------
> |       start_addr       |  num   |
>  ------------------------ --------
> Using upper 24 bits is enough to keep the temporary descriptors. And
> it's also enough to use lower 8 bits to show the number of descriptors
> that one skb owns.
> 
> [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> 
> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
> I posted the series as an RFC because I'd like to hear more opinions on
> the current rought approach so that the fix[2] can be avoided and
> mitigate the impact of performance. This patch might have bugs because
> I decided to spend more time on it after we come to an agreement. Please
> review the overall concepts. Thanks!
> 
> Maciej, could you share with me the way you tested jumbo frame? I used
> ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> nic more than 90%, which means I cannot see the performance impact.
> 
> [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> ---
>  include/net/xdp_sock.h      |   1 +
>  include/net/xsk_buff_pool.h |   1 +
>  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
>  net/xdp/xsk_buff_pool.c     |   1 +
>  4 files changed, 84 insertions(+), 23 deletions(-)

(...)

> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index aa9788f20d0d..6e170107dec7 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>  
>  	pool->fq = xs->fq_tmp;
>  	pool->cq = xs->cq_tmp;
> +	pool->cached_cq = xs->cached_cq;

Jason,

pool can be shared between multiple sockets that bind to same <netdev,qid>
tuple. I believe here you're opening up for the very same issue Eryk
initially reported.

>  
>  	for (i = 0; i < pool->free_heads_cnt; i++) {
>  		xskb = &pool->heads[i];
> -- 
> 2.41.3
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-10-31 14:02   ` Maciej Fijalkowski
@ 2025-10-31 23:59     ` Jason Xing
  2025-11-03 15:00       ` Maciej Fijalkowski
  0 siblings, 1 reply; 12+ messages in thread
From: Jason Xing @ 2025-10-31 23:59 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	jonathan.lemon, sdf, ast, daniel, hawk, john.fastabend, joe,
	willemdebruijn.kernel, fmancera, csmate, bpf, netdev, Jason Xing

On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > production"), there is one issue[1] which causes the wrong publish
> > of descriptors in race condidtion. The above commit fixes the issue
> > but adds more memory operations in the xmit hot path and interrupt
> > context, which can cause side effect in performance.
> >
> > This patch tries to propose a new solution to fix the problem
> > without manipulating the allocation and deallocation of memory. One
> > of the key points is that I borrowed the idea from the above commit
> > that postpones updating the ring->descs in xsk_destruct_skb()
> > instead of in __xsk_generic_xmit().
> >
> > The core logics are as show below:
> > 1. allocate a new local queue. Only its cached_prod member is used.
> > 2. write the descriptors into the local queue in the xmit path. And
> >    record the cached_prod as @start_addr that reflects the
> >    start position of this queue so that later the skb can easily
> >    find where its addrs are written in the destruction phase.
> > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> >    in xsk_skb_init_misc().
> > 4. Initialize the lower 8 bits of destructor_arg to store how many
> >    descriptors the skb owns in xsk_update_num_desc().
> > 5. write the desc addr(s) from the @start_addr from the cached cq
> >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> >    the global state of the cq.
> >
> > The format of destructor_arg is designed as:
> >  ------------------------ --------
> > |       start_addr       |  num   |
> >  ------------------------ --------
> > Using upper 24 bits is enough to keep the temporary descriptors. And
> > it's also enough to use lower 8 bits to show the number of descriptors
> > that one skb owns.
> >
> > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> >
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> > I posted the series as an RFC because I'd like to hear more opinions on
> > the current rought approach so that the fix[2] can be avoided and
> > mitigate the impact of performance. This patch might have bugs because
> > I decided to spend more time on it after we come to an agreement. Please
> > review the overall concepts. Thanks!
> >
> > Maciej, could you share with me the way you tested jumbo frame? I used
> > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > nic more than 90%, which means I cannot see the performance impact.

Could you provide the command you used? Thanks :)

> >
> > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > ---
> >  include/net/xdp_sock.h      |   1 +
> >  include/net/xsk_buff_pool.h |   1 +
> >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> >  net/xdp/xsk_buff_pool.c     |   1 +
> >  4 files changed, 84 insertions(+), 23 deletions(-)
>
> (...)
>
> > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > index aa9788f20d0d..6e170107dec7 100644
> > --- a/net/xdp/xsk_buff_pool.c
> > +++ b/net/xdp/xsk_buff_pool.c
> > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> >
> >       pool->fq = xs->fq_tmp;
> >       pool->cq = xs->cq_tmp;
> > +     pool->cached_cq = xs->cached_cq;
>
> Jason,
>
> pool can be shared between multiple sockets that bind to same <netdev,qid>
> tuple. I believe here you're opening up for the very same issue Eryk
> initially reported.

Actually it shouldn't happen because the cached_cq is more of the
temporary array that helps the skb store its start position. The
cached_prod of cached_cq can only be increased, not decreased. In the
skb destruction phase, only those skbs that go to the end of life need
to sync its desc from cached_cq to cq. For some skbs that are released
before the tx completion, we don't need to clear its record in
cached_cq at all and cq remains untouched.

To put it in a simple way, the patch you proposed uses kmem_cached*
helpers to store the addr and write the addr into cq at the end of
lifecycle while the current patch uses a pre-allocated memory to
store. So it avoids the allocation and deallocation.

Unless I'm missing something important. If so, I'm still convinced
this temporary queue can solve the problem since essentially it's a
better substitute for kmem cache to retain high performance.

Thanks,
Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-10-31 23:59     ` Jason Xing
@ 2025-11-03 15:00       ` Maciej Fijalkowski
  2025-11-03 23:29         ` Jason Xing
  2025-11-11 13:03         ` Jason Xing
  0 siblings, 2 replies; 12+ messages in thread
From: Maciej Fijalkowski @ 2025-11-03 15:00 UTC (permalink / raw)
  To: Jason Xing
  Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	jonathan.lemon, sdf, ast, daniel, hawk, john.fastabend, joe,
	willemdebruijn.kernel, fmancera, csmate, bpf, netdev, Jason Xing

On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> >
> > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > production"), there is one issue[1] which causes the wrong publish
> > > of descriptors in race condidtion. The above commit fixes the issue
> > > but adds more memory operations in the xmit hot path and interrupt
> > > context, which can cause side effect in performance.
> > >
> > > This patch tries to propose a new solution to fix the problem
> > > without manipulating the allocation and deallocation of memory. One
> > > of the key points is that I borrowed the idea from the above commit
> > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > instead of in __xsk_generic_xmit().
> > >
> > > The core logics are as show below:
> > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > 2. write the descriptors into the local queue in the xmit path. And
> > >    record the cached_prod as @start_addr that reflects the
> > >    start position of this queue so that later the skb can easily
> > >    find where its addrs are written in the destruction phase.
> > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > >    in xsk_skb_init_misc().
> > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > >    descriptors the skb owns in xsk_update_num_desc().
> > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > >    the global state of the cq.
> > >
> > > The format of destructor_arg is designed as:
> > >  ------------------------ --------
> > > |       start_addr       |  num   |
> > >  ------------------------ --------
> > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > it's also enough to use lower 8 bits to show the number of descriptors
> > > that one skb owns.
> > >
> > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > >
> > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > ---
> > > I posted the series as an RFC because I'd like to hear more opinions on
> > > the current rought approach so that the fix[2] can be avoided and
> > > mitigate the impact of performance. This patch might have bugs because
> > > I decided to spend more time on it after we come to an agreement. Please
> > > review the overall concepts. Thanks!
> > >
> > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > nic more than 90%, which means I cannot see the performance impact.
> 
> Could you provide the command you used? Thanks :)
> 
> > >
> > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > ---
> > >  include/net/xdp_sock.h      |   1 +
> > >  include/net/xsk_buff_pool.h |   1 +
> > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > >  net/xdp/xsk_buff_pool.c     |   1 +
> > >  4 files changed, 84 insertions(+), 23 deletions(-)
> >
> > (...)
> >
> > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > index aa9788f20d0d..6e170107dec7 100644
> > > --- a/net/xdp/xsk_buff_pool.c
> > > +++ b/net/xdp/xsk_buff_pool.c
> > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > >
> > >       pool->fq = xs->fq_tmp;
> > >       pool->cq = xs->cq_tmp;
> > > +     pool->cached_cq = xs->cached_cq;
> >
> > Jason,
> >
> > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > tuple. I believe here you're opening up for the very same issue Eryk
> > initially reported.
> 
> Actually it shouldn't happen because the cached_cq is more of the
> temporary array that helps the skb store its start position. The
> cached_prod of cached_cq can only be increased, not decreased. In the
> skb destruction phase, only those skbs that go to the end of life need
> to sync its desc from cached_cq to cq. For some skbs that are released
> before the tx completion, we don't need to clear its record in
> cached_cq at all and cq remains untouched.
> 
> To put it in a simple way, the patch you proposed uses kmem_cached*
> helpers to store the addr and write the addr into cq at the end of
> lifecycle while the current patch uses a pre-allocated memory to
> store. So it avoids the allocation and deallocation.
> 
> Unless I'm missing something important. If so, I'm still convinced
> this temporary queue can solve the problem since essentially it's a
> better substitute for kmem cache to retain high performance.

I need a bit more time on this, probably I'll respond tomorrow.

> 
> Thanks,
> Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-03 15:00       ` Maciej Fijalkowski
@ 2025-11-03 23:29         ` Jason Xing
  2025-11-11 13:03         ` Jason Xing
  1 sibling, 0 replies; 12+ messages in thread
From: Jason Xing @ 2025-11-03 23:29 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	jonathan.lemon, sdf, ast, daniel, hawk, john.fastabend, joe,
	willemdebruijn.kernel, fmancera, csmate, bpf, netdev, Jason Xing

On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > >
> > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > production"), there is one issue[1] which causes the wrong publish
> > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > but adds more memory operations in the xmit hot path and interrupt
> > > > context, which can cause side effect in performance.
> > > >
> > > > This patch tries to propose a new solution to fix the problem
> > > > without manipulating the allocation and deallocation of memory. One
> > > > of the key points is that I borrowed the idea from the above commit
> > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > instead of in __xsk_generic_xmit().
> > > >
> > > > The core logics are as show below:
> > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > 2. write the descriptors into the local queue in the xmit path. And
> > > >    record the cached_prod as @start_addr that reflects the
> > > >    start position of this queue so that later the skb can easily
> > > >    find where its addrs are written in the destruction phase.
> > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > >    in xsk_skb_init_misc().
> > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > >    the global state of the cq.
> > > >
> > > > The format of destructor_arg is designed as:
> > > >  ------------------------ --------
> > > > |       start_addr       |  num   |
> > > >  ------------------------ --------
> > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > that one skb owns.
> > > >
> > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > >
> > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > ---
> > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > the current rought approach so that the fix[2] can be avoided and
> > > > mitigate the impact of performance. This patch might have bugs because
> > > > I decided to spend more time on it after we come to an agreement. Please
> > > > review the overall concepts. Thanks!
> > > >
> > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > nic more than 90%, which means I cannot see the performance impact.
> >
> > Could you provide the command you used? Thanks :)
> >
> > > >
> > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > ---
> > > >  include/net/xdp_sock.h      |   1 +
> > > >  include/net/xsk_buff_pool.h |   1 +
> > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > >
> > > (...)
> > >
> > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > index aa9788f20d0d..6e170107dec7 100644
> > > > --- a/net/xdp/xsk_buff_pool.c
> > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > >
> > > >       pool->fq = xs->fq_tmp;
> > > >       pool->cq = xs->cq_tmp;
> > > > +     pool->cached_cq = xs->cached_cq;
> > >
> > > Jason,
> > >
> > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > tuple. I believe here you're opening up for the very same issue Eryk
> > > initially reported.
> >
> > Actually it shouldn't happen because the cached_cq is more of the
> > temporary array that helps the skb store its start position. The
> > cached_prod of cached_cq can only be increased, not decreased. In the
> > skb destruction phase, only those skbs that go to the end of life need
> > to sync its desc from cached_cq to cq. For some skbs that are released
> > before the tx completion, we don't need to clear its record in
> > cached_cq at all and cq remains untouched.
> >
> > To put it in a simple way, the patch you proposed uses kmem_cached*
> > helpers to store the addr and write the addr into cq at the end of
> > lifecycle while the current patch uses a pre-allocated memory to
> > store. So it avoids the allocation and deallocation.
> >
> > Unless I'm missing something important. If so, I'm still convinced
> > this temporary queue can solve the problem since essentially it's a
> > better substitute for kmem cache to retain high performance.
>
> I need a bit more time on this, probably I'll respond tomorrow.

Take your time - if we all come to an agreement on this series, then I
can proceed to take care of some minor details to avoid bugs
happening. Thanks in advance.

Thanks,
Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-03 15:00       ` Maciej Fijalkowski
  2025-11-03 23:29         ` Jason Xing
@ 2025-11-11 13:03         ` Jason Xing
  2025-11-11 13:44           ` Magnus Karlsson
  1 sibling, 1 reply; 12+ messages in thread
From: Jason Xing @ 2025-11-11 13:03 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	jonathan.lemon, sdf, ast, daniel, hawk, john.fastabend, joe,
	willemdebruijn.kernel, fmancera, csmate, bpf, netdev, Jason Xing

Hi Maciej,

On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > >
> > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > production"), there is one issue[1] which causes the wrong publish
> > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > but adds more memory operations in the xmit hot path and interrupt
> > > > context, which can cause side effect in performance.
> > > >
> > > > This patch tries to propose a new solution to fix the problem
> > > > without manipulating the allocation and deallocation of memory. One
> > > > of the key points is that I borrowed the idea from the above commit
> > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > instead of in __xsk_generic_xmit().
> > > >
> > > > The core logics are as show below:
> > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > 2. write the descriptors into the local queue in the xmit path. And
> > > >    record the cached_prod as @start_addr that reflects the
> > > >    start position of this queue so that later the skb can easily
> > > >    find where its addrs are written in the destruction phase.
> > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > >    in xsk_skb_init_misc().
> > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > >    the global state of the cq.
> > > >
> > > > The format of destructor_arg is designed as:
> > > >  ------------------------ --------
> > > > |       start_addr       |  num   |
> > > >  ------------------------ --------
> > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > that one skb owns.
> > > >
> > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > >
> > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > ---
> > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > the current rought approach so that the fix[2] can be avoided and
> > > > mitigate the impact of performance. This patch might have bugs because
> > > > I decided to spend more time on it after we come to an agreement. Please
> > > > review the overall concepts. Thanks!
> > > >
> > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > nic more than 90%, which means I cannot see the performance impact.
> >
> > Could you provide the command you used? Thanks :)
> >
> > > >
> > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > ---
> > > >  include/net/xdp_sock.h      |   1 +
> > > >  include/net/xsk_buff_pool.h |   1 +
> > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > >
> > > (...)
> > >
> > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > index aa9788f20d0d..6e170107dec7 100644
> > > > --- a/net/xdp/xsk_buff_pool.c
> > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > >
> > > >       pool->fq = xs->fq_tmp;
> > > >       pool->cq = xs->cq_tmp;
> > > > +     pool->cached_cq = xs->cached_cq;
> > >
> > > Jason,
> > >
> > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > tuple. I believe here you're opening up for the very same issue Eryk
> > > initially reported.
> >
> > Actually it shouldn't happen because the cached_cq is more of the
> > temporary array that helps the skb store its start position. The
> > cached_prod of cached_cq can only be increased, not decreased. In the
> > skb destruction phase, only those skbs that go to the end of life need
> > to sync its desc from cached_cq to cq. For some skbs that are released
> > before the tx completion, we don't need to clear its record in
> > cached_cq at all and cq remains untouched.
> >
> > To put it in a simple way, the patch you proposed uses kmem_cached*
> > helpers to store the addr and write the addr into cq at the end of
> > lifecycle while the current patch uses a pre-allocated memory to
> > store. So it avoids the allocation and deallocation.
> >
> > Unless I'm missing something important. If so, I'm still convinced
> > this temporary queue can solve the problem since essentially it's a
> > better substitute for kmem cache to retain high performance.
>
> I need a bit more time on this, probably I'll respond tomorrow.

I'd like to know if you have any further comments on this? And should
I continue to post as an official series?

Thanks,
Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-11 13:03         ` Jason Xing
@ 2025-11-11 13:44           ` Magnus Karlsson
  2025-11-11 14:02             ` Jason Xing
  0 siblings, 1 reply; 12+ messages in thread
From: Magnus Karlsson @ 2025-11-11 13:44 UTC (permalink / raw)
  To: Jason Xing
  Cc: Maciej Fijalkowski, davem, edumazet, kuba, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate, bpf,
	netdev, Jason Xing

On Tue, 11 Nov 2025 at 14:06, Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hi Maciej,
>
> On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> >
> > On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > > <maciej.fijalkowski@intel.com> wrote:
> > > >
> > > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > >
> > > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > > production"), there is one issue[1] which causes the wrong publish
> > > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > > but adds more memory operations in the xmit hot path and interrupt
> > > > > context, which can cause side effect in performance.
> > > > >
> > > > > This patch tries to propose a new solution to fix the problem
> > > > > without manipulating the allocation and deallocation of memory. One
> > > > > of the key points is that I borrowed the idea from the above commit
> > > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > > instead of in __xsk_generic_xmit().
> > > > >
> > > > > The core logics are as show below:
> > > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > > 2. write the descriptors into the local queue in the xmit path. And
> > > > >    record the cached_prod as @start_addr that reflects the
> > > > >    start position of this queue so that later the skb can easily
> > > > >    find where its addrs are written in the destruction phase.
> > > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > > >    in xsk_skb_init_misc().
> > > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > > >    the global state of the cq.
> > > > >
> > > > > The format of destructor_arg is designed as:
> > > > >  ------------------------ --------
> > > > > |       start_addr       |  num   |
> > > > >  ------------------------ --------
> > > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > > that one skb owns.
> > > > >
> > > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > > >
> > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > ---
> > > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > > the current rought approach so that the fix[2] can be avoided and
> > > > > mitigate the impact of performance. This patch might have bugs because
> > > > > I decided to spend more time on it after we come to an agreement. Please
> > > > > review the overall concepts. Thanks!
> > > > >
> > > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > > nic more than 90%, which means I cannot see the performance impact.
> > >
> > > Could you provide the command you used? Thanks :)
> > >
> > > > >
> > > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > > ---
> > > > >  include/net/xdp_sock.h      |   1 +
> > > > >  include/net/xsk_buff_pool.h |   1 +
> > > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > > >
> > > > (...)
> > > >
> > > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > > index aa9788f20d0d..6e170107dec7 100644
> > > > > --- a/net/xdp/xsk_buff_pool.c
> > > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > > >
> > > > >       pool->fq = xs->fq_tmp;
> > > > >       pool->cq = xs->cq_tmp;
> > > > > +     pool->cached_cq = xs->cached_cq;
> > > >
> > > > Jason,
> > > >
> > > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > > tuple. I believe here you're opening up for the very same issue Eryk
> > > > initially reported.
> > >
> > > Actually it shouldn't happen because the cached_cq is more of the
> > > temporary array that helps the skb store its start position. The
> > > cached_prod of cached_cq can only be increased, not decreased. In the
> > > skb destruction phase, only those skbs that go to the end of life need
> > > to sync its desc from cached_cq to cq. For some skbs that are released
> > > before the tx completion, we don't need to clear its record in
> > > cached_cq at all and cq remains untouched.
> > >
> > > To put it in a simple way, the patch you proposed uses kmem_cached*
> > > helpers to store the addr and write the addr into cq at the end of
> > > lifecycle while the current patch uses a pre-allocated memory to
> > > store. So it avoids the allocation and deallocation.
> > >
> > > Unless I'm missing something important. If so, I'm still convinced
> > > this temporary queue can solve the problem since essentially it's a
> > > better substitute for kmem cache to retain high performance.
> >
> > I need a bit more time on this, probably I'll respond tomorrow.
>
> I'd like to know if you have any further comments on this? And should
> I continue to post as an official series?

Hi Jason,

Maciej has been out-of-office for a couple of days. He should
hopefully be back later this week, so please wait for his comments.

> Thanks,
> Jason
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-11 13:44           ` Magnus Karlsson
@ 2025-11-11 14:02             ` Jason Xing
  2025-11-14 15:52               ` Maciej Fijalkowski
  0 siblings, 1 reply; 12+ messages in thread
From: Jason Xing @ 2025-11-11 14:02 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: Maciej Fijalkowski, davem, edumazet, kuba, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate, bpf,
	netdev, Jason Xing

Hi Magnus,

On Tue, Nov 11, 2025 at 9:44 PM Magnus Karlsson
<magnus.karlsson@gmail.com> wrote:
>
> On Tue, 11 Nov 2025 at 14:06, Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > Hi Maciej,
> >
> > On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > >
> > > On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > > > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > > > <maciej.fijalkowski@intel.com> wrote:
> > > > >
> > > > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > > >
> > > > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > > > production"), there is one issue[1] which causes the wrong publish
> > > > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > > > but adds more memory operations in the xmit hot path and interrupt
> > > > > > context, which can cause side effect in performance.
> > > > > >
> > > > > > This patch tries to propose a new solution to fix the problem
> > > > > > without manipulating the allocation and deallocation of memory. One
> > > > > > of the key points is that I borrowed the idea from the above commit
> > > > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > > > instead of in __xsk_generic_xmit().
> > > > > >
> > > > > > The core logics are as show below:
> > > > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > > > 2. write the descriptors into the local queue in the xmit path. And
> > > > > >    record the cached_prod as @start_addr that reflects the
> > > > > >    start position of this queue so that later the skb can easily
> > > > > >    find where its addrs are written in the destruction phase.
> > > > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > > > >    in xsk_skb_init_misc().
> > > > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > > > >    the global state of the cq.
> > > > > >
> > > > > > The format of destructor_arg is designed as:
> > > > > >  ------------------------ --------
> > > > > > |       start_addr       |  num   |
> > > > > >  ------------------------ --------
> > > > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > > > that one skb owns.
> > > > > >
> > > > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > > > >
> > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > > ---
> > > > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > > > the current rought approach so that the fix[2] can be avoided and
> > > > > > mitigate the impact of performance. This patch might have bugs because
> > > > > > I decided to spend more time on it after we come to an agreement. Please
> > > > > > review the overall concepts. Thanks!
> > > > > >
> > > > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > > > nic more than 90%, which means I cannot see the performance impact.
> > > >
> > > > Could you provide the command you used? Thanks :)
> > > >
> > > > > >
> > > > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > > > ---
> > > > > >  include/net/xdp_sock.h      |   1 +
> > > > > >  include/net/xsk_buff_pool.h |   1 +
> > > > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > > > >
> > > > > (...)
> > > > >
> > > > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > > > index aa9788f20d0d..6e170107dec7 100644
> > > > > > --- a/net/xdp/xsk_buff_pool.c
> > > > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > > > >
> > > > > >       pool->fq = xs->fq_tmp;
> > > > > >       pool->cq = xs->cq_tmp;
> > > > > > +     pool->cached_cq = xs->cached_cq;
> > > > >
> > > > > Jason,
> > > > >
> > > > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > > > tuple. I believe here you're opening up for the very same issue Eryk
> > > > > initially reported.
> > > >
> > > > Actually it shouldn't happen because the cached_cq is more of the
> > > > temporary array that helps the skb store its start position. The
> > > > cached_prod of cached_cq can only be increased, not decreased. In the
> > > > skb destruction phase, only those skbs that go to the end of life need
> > > > to sync its desc from cached_cq to cq. For some skbs that are released
> > > > before the tx completion, we don't need to clear its record in
> > > > cached_cq at all and cq remains untouched.
> > > >
> > > > To put it in a simple way, the patch you proposed uses kmem_cached*
> > > > helpers to store the addr and write the addr into cq at the end of
> > > > lifecycle while the current patch uses a pre-allocated memory to
> > > > store. So it avoids the allocation and deallocation.
> > > >
> > > > Unless I'm missing something important. If so, I'm still convinced
> > > > this temporary queue can solve the problem since essentially it's a
> > > > better substitute for kmem cache to retain high performance.
> > >
> > > I need a bit more time on this, probably I'll respond tomorrow.
> >
> > I'd like to know if you have any further comments on this? And should
> > I continue to post as an official series?
>
> Hi Jason,
>
> Maciej has been out-of-office for a couple of days. He should
> hopefully be back later this week, so please wait for his comments.

Thanks for letting me know. I will wait :)

Thanks,
Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-11 14:02             ` Jason Xing
@ 2025-11-14 15:52               ` Maciej Fijalkowski
  2025-11-14 23:46                 ` Jason Xing
  0 siblings, 1 reply; 12+ messages in thread
From: Maciej Fijalkowski @ 2025-11-14 15:52 UTC (permalink / raw)
  To: Jason Xing
  Cc: Magnus Karlsson, davem, edumazet, kuba, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate, bpf,
	netdev, Jason Xing

On Tue, Nov 11, 2025 at 10:02:58PM +0800, Jason Xing wrote:
> Hi Magnus,
> 
> On Tue, Nov 11, 2025 at 9:44 PM Magnus Karlsson
> <magnus.karlsson@gmail.com> wrote:
> >
> > On Tue, 11 Nov 2025 at 14:06, Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >
> > > Hi Maciej,
> > >
> > > On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
> > > <maciej.fijalkowski@intel.com> wrote:
> > > >
> > > > On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > > > > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > > > > <maciej.fijalkowski@intel.com> wrote:
> > > > > >
> > > > > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > > > >
> > > > > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > > > > production"), there is one issue[1] which causes the wrong publish
> > > > > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > > > > but adds more memory operations in the xmit hot path and interrupt
> > > > > > > context, which can cause side effect in performance.
> > > > > > >
> > > > > > > This patch tries to propose a new solution to fix the problem
> > > > > > > without manipulating the allocation and deallocation of memory. One
> > > > > > > of the key points is that I borrowed the idea from the above commit
> > > > > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > > > > instead of in __xsk_generic_xmit().
> > > > > > >
> > > > > > > The core logics are as show below:
> > > > > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > > > > 2. write the descriptors into the local queue in the xmit path. And
> > > > > > >    record the cached_prod as @start_addr that reflects the
> > > > > > >    start position of this queue so that later the skb can easily
> > > > > > >    find where its addrs are written in the destruction phase.
> > > > > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > > > > >    in xsk_skb_init_misc().
> > > > > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > > > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > > > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > > > > >    the global state of the cq.
> > > > > > >
> > > > > > > The format of destructor_arg is designed as:
> > > > > > >  ------------------------ --------
> > > > > > > |       start_addr       |  num   |
> > > > > > >  ------------------------ --------
> > > > > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > > > > that one skb owns.
> > > > > > >
> > > > > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > > > > >
> > > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > > > ---
> > > > > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > > > > the current rought approach so that the fix[2] can be avoided and
> > > > > > > mitigate the impact of performance. This patch might have bugs because
> > > > > > > I decided to spend more time on it after we come to an agreement. Please
> > > > > > > review the overall concepts. Thanks!
> > > > > > >
> > > > > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > > > > nic more than 90%, which means I cannot see the performance impact.
> > > > >
> > > > > Could you provide the command you used? Thanks :)
> > > > >
> > > > > > >
> > > > > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > > > > ---
> > > > > > >  include/net/xdp_sock.h      |   1 +
> > > > > > >  include/net/xsk_buff_pool.h |   1 +
> > > > > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > > > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > > > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > > > > >
> > > > > > (...)
> > > > > >
> > > > > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > > > > index aa9788f20d0d..6e170107dec7 100644
> > > > > > > --- a/net/xdp/xsk_buff_pool.c
> > > > > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > > > > >
> > > > > > >       pool->fq = xs->fq_tmp;
> > > > > > >       pool->cq = xs->cq_tmp;
> > > > > > > +     pool->cached_cq = xs->cached_cq;
> > > > > >
> > > > > > Jason,
> > > > > >
> > > > > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > > > > tuple. I believe here you're opening up for the very same issue Eryk
> > > > > > initially reported.
> > > > >
> > > > > Actually it shouldn't happen because the cached_cq is more of the
> > > > > temporary array that helps the skb store its start position. The
> > > > > cached_prod of cached_cq can only be increased, not decreased. In the
> > > > > skb destruction phase, only those skbs that go to the end of life need
> > > > > to sync its desc from cached_cq to cq. For some skbs that are released
> > > > > before the tx completion, we don't need to clear its record in
> > > > > cached_cq at all and cq remains untouched.
> > > > >
> > > > > To put it in a simple way, the patch you proposed uses kmem_cached*
> > > > > helpers to store the addr and write the addr into cq at the end of
> > > > > lifecycle while the current patch uses a pre-allocated memory to
> > > > > store. So it avoids the allocation and deallocation.
> > > > >
> > > > > Unless I'm missing something important. If so, I'm still convinced
> > > > > this temporary queue can solve the problem since essentially it's a
> > > > > better substitute for kmem cache to retain high performance.

Back after health issues!

Jason, I am still not convinced about this solution.

In shared pool setups, the temp cq will also be shared, which means that
two parallel processes can produce addresses onto temp cq and therefore
expose address to a socket that it does not belong to. In order to make
this work you would have to know upfront the descriptor count of given
frame and reserve this during processing the first descriptor.

socket 0			socket 1
prod addr 0xAA
prod addr 0xBB
				prod addr 0xDD
prod addr 0xCC
				prod addr 0xEE

socket 0 calls skb destructor with num desc == 3, placing 0xDD onto cq
which has not been sent yet, therefore potentially corrupting it.

For now, I think we should move forward with Fernando's fix as there have
been multiple reports already regarding broken state of xsk copy mode.

> > > >
> > > > I need a bit more time on this, probably I'll respond tomorrow.
> > >
> > > I'd like to know if you have any further comments on this? And should
> > > I continue to post as an official series?
> >
> > Hi Jason,
> >
> > Maciej has been out-of-office for a couple of days. He should
> > hopefully be back later this week, so please wait for his comments.
> 
> Thanks for letting me know. I will wait :)
> 
> Thanks,
> Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs
  2025-11-14 15:52               ` Maciej Fijalkowski
@ 2025-11-14 23:46                 ` Jason Xing
  0 siblings, 0 replies; 12+ messages in thread
From: Jason Xing @ 2025-11-14 23:46 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: Magnus Karlsson, davem, edumazet, kuba, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend, joe, willemdebruijn.kernel, fmancera, csmate, bpf,
	netdev, Jason Xing

On Fri, Nov 14, 2025 at 11:53 PM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Tue, Nov 11, 2025 at 10:02:58PM +0800, Jason Xing wrote:
> > Hi Magnus,
> >
> > On Tue, Nov 11, 2025 at 9:44 PM Magnus Karlsson
> > <magnus.karlsson@gmail.com> wrote:
> > >
> > > On Tue, 11 Nov 2025 at 14:06, Jason Xing <kerneljasonxing@gmail.com> wrote:
> > > >
> > > > Hi Maciej,
> > > >
> > > > On Mon, Nov 3, 2025 at 11:00 PM Maciej Fijalkowski
> > > > <maciej.fijalkowski@intel.com> wrote:
> > > > >
> > > > > On Sat, Nov 01, 2025 at 07:59:36AM +0800, Jason Xing wrote:
> > > > > > On Fri, Oct 31, 2025 at 10:02 PM Maciej Fijalkowski
> > > > > > <maciej.fijalkowski@intel.com> wrote:
> > > > > > >
> > > > > > > On Fri, Oct 31, 2025 at 05:32:30PM +0800, Jason Xing wrote:
> > > > > > > > From: Jason Xing <kernelxing@tencent.com>
> > > > > > > >
> > > > > > > > Before the commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> > > > > > > > production"), there is one issue[1] which causes the wrong publish
> > > > > > > > of descriptors in race condidtion. The above commit fixes the issue
> > > > > > > > but adds more memory operations in the xmit hot path and interrupt
> > > > > > > > context, which can cause side effect in performance.
> > > > > > > >
> > > > > > > > This patch tries to propose a new solution to fix the problem
> > > > > > > > without manipulating the allocation and deallocation of memory. One
> > > > > > > > of the key points is that I borrowed the idea from the above commit
> > > > > > > > that postpones updating the ring->descs in xsk_destruct_skb()
> > > > > > > > instead of in __xsk_generic_xmit().
> > > > > > > >
> > > > > > > > The core logics are as show below:
> > > > > > > > 1. allocate a new local queue. Only its cached_prod member is used.
> > > > > > > > 2. write the descriptors into the local queue in the xmit path. And
> > > > > > > >    record the cached_prod as @start_addr that reflects the
> > > > > > > >    start position of this queue so that later the skb can easily
> > > > > > > >    find where its addrs are written in the destruction phase.
> > > > > > > > 3. initialize the upper 24 bits of destructor_arg to store @start_addr
> > > > > > > >    in xsk_skb_init_misc().
> > > > > > > > 4. Initialize the lower 8 bits of destructor_arg to store how many
> > > > > > > >    descriptors the skb owns in xsk_update_num_desc().
> > > > > > > > 5. write the desc addr(s) from the @start_addr from the cached cq
> > > > > > > >    one by one into the real cq in xsk_destruct_skb(). In turn sync
> > > > > > > >    the global state of the cq.
> > > > > > > >
> > > > > > > > The format of destructor_arg is designed as:
> > > > > > > >  ------------------------ --------
> > > > > > > > |       start_addr       |  num   |
> > > > > > > >  ------------------------ --------
> > > > > > > > Using upper 24 bits is enough to keep the temporary descriptors. And
> > > > > > > > it's also enough to use lower 8 bits to show the number of descriptors
> > > > > > > > that one skb owns.
> > > > > > > >
> > > > > > > > [1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > > > > > > > ---
> > > > > > > > I posted the series as an RFC because I'd like to hear more opinions on
> > > > > > > > the current rought approach so that the fix[2] can be avoided and
> > > > > > > > mitigate the impact of performance. This patch might have bugs because
> > > > > > > > I decided to spend more time on it after we come to an agreement. Please
> > > > > > > > review the overall concepts. Thanks!
> > > > > > > >
> > > > > > > > Maciej, could you share with me the way you tested jumbo frame? I used
> > > > > > > > ./xdpsock -i enp2s0f1 -t -q 1 -S -s 9728 but the xdpsock utilizes the
> > > > > > > > nic more than 90%, which means I cannot see the performance impact.
> > > > > >
> > > > > > Could you provide the command you used? Thanks :)
> > > > > >
> > > > > > > >
> > > > > > > > [2]:https://lore.kernel.org/all/20251030140355.4059-1-fmancera@suse.de/
> > > > > > > > ---
> > > > > > > >  include/net/xdp_sock.h      |   1 +
> > > > > > > >  include/net/xsk_buff_pool.h |   1 +
> > > > > > > >  net/xdp/xsk.c               | 104 ++++++++++++++++++++++++++++--------
> > > > > > > >  net/xdp/xsk_buff_pool.c     |   1 +
> > > > > > > >  4 files changed, 84 insertions(+), 23 deletions(-)
> > > > > > >
> > > > > > > (...)
> > > > > > >
> > > > > > > > diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> > > > > > > > index aa9788f20d0d..6e170107dec7 100644
> > > > > > > > --- a/net/xdp/xsk_buff_pool.c
> > > > > > > > +++ b/net/xdp/xsk_buff_pool.c
> > > > > > > > @@ -99,6 +99,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> > > > > > > >
> > > > > > > >       pool->fq = xs->fq_tmp;
> > > > > > > >       pool->cq = xs->cq_tmp;
> > > > > > > > +     pool->cached_cq = xs->cached_cq;
> > > > > > >
> > > > > > > Jason,
> > > > > > >
> > > > > > > pool can be shared between multiple sockets that bind to same <netdev,qid>
> > > > > > > tuple. I believe here you're opening up for the very same issue Eryk
> > > > > > > initially reported.
> > > > > >
> > > > > > Actually it shouldn't happen because the cached_cq is more of the
> > > > > > temporary array that helps the skb store its start position. The
> > > > > > cached_prod of cached_cq can only be increased, not decreased. In the
> > > > > > skb destruction phase, only those skbs that go to the end of life need
> > > > > > to sync its desc from cached_cq to cq. For some skbs that are released
> > > > > > before the tx completion, we don't need to clear its record in
> > > > > > cached_cq at all and cq remains untouched.
> > > > > >
> > > > > > To put it in a simple way, the patch you proposed uses kmem_cached*
> > > > > > helpers to store the addr and write the addr into cq at the end of
> > > > > > lifecycle while the current patch uses a pre-allocated memory to
> > > > > > store. So it avoids the allocation and deallocation.
> > > > > >
> > > > > > Unless I'm missing something important. If so, I'm still convinced
> > > > > > this temporary queue can solve the problem since essentially it's a
> > > > > > better substitute for kmem cache to retain high performance.
>
> Back after health issues!

Hi Maciej,

Hope you're fully recovered:)

>
> Jason, I am still not convinced about this solution.
>
> In shared pool setups, the temp cq will also be shared, which means that
> two parallel processes can produce addresses onto temp cq and therefore
> expose address to a socket that it does not belong to. In order to make
> this work you would have to know upfront the descriptor count of given
> frame and reserve this during processing the first descriptor.
>
> socket 0                        socket 1
> prod addr 0xAA
> prod addr 0xBB
>                                 prod addr 0xDD
> prod addr 0xCC
>                                 prod addr 0xEE
>
> socket 0 calls skb destructor with num desc == 3, placing 0xDD onto cq
> which has not been sent yet, therefore potentially corrupting it.

Thanks for spotting this case!

Yes, it can happen, so let's turn into a per-xsk granularity? If each
xsk has its own temp queue, then the problem would disappear and good
news is that we don't need extra locks like pool->cq_lock to prevent
multiple parallel xsks accessing the temp queue.

Hope you can agree with this method. It borrows your idea and then
only uses a _pre-allocated buffer_ to replace kmem_cache_alloc() in
the hot path. This solution will direct us more to a high performance
direction. IMHO, I‘d rather not see any degradation in performance
because of some issues.

Thanks,
Jason

>
> For now, I think we should move forward with Fernando's fix as there have
> been multiple reports already regarding broken state of xsk copy mode.
>
> > > > >
> > > > > I need a bit more time on this, probably I'll respond tomorrow.
> > > >
> > > > I'd like to know if you have any further comments on this? And should
> > > > I continue to post as an official series?
> > >
> > > Hi Jason,
> > >
> > > Maciej has been out-of-office for a couple of days. He should
> > > hopefully be back later this week, so please wait for his comments.
> >
> > Thanks for letting me know. I will wait :)
> >
> > Thanks,
> > Jason

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2025-11-14 23:47 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-31  9:32 [PATCH RFC net-next 0/2] xsk: fix immature cq descriptor production (II) Jason Xing
2025-10-31  9:32 ` [PATCH RFC net-next 1/2] Revert "xsk: Fix immature cq descriptor production" Jason Xing
2025-10-31  9:32 ` [PATCH RFC net-next 2/2] xsk: introduce a cached cq to temporarily store descriptor addrs Jason Xing
2025-10-31 14:02   ` Maciej Fijalkowski
2025-10-31 23:59     ` Jason Xing
2025-11-03 15:00       ` Maciej Fijalkowski
2025-11-03 23:29         ` Jason Xing
2025-11-11 13:03         ` Jason Xing
2025-11-11 13:44           ` Magnus Karlsson
2025-11-11 14:02             ` Jason Xing
2025-11-14 15:52               ` Maciej Fijalkowski
2025-11-14 23:46                 ` Jason Xing

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).