All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v6 bpf] xsk: fix immature cq descriptor production
@ 2025-08-20 15:44 Maciej Fijalkowski
  2025-08-20 17:06 ` Stanislav Fomichev
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Maciej Fijalkowski @ 2025-08-20 15:44 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii
  Cc: netdev, magnus.karlsson, stfomichev, aleksander.lobakin,
	Maciej Fijalkowski, Eryk Kubanski

Eryk reported an issue that I have put under Closes: tag, related to
umem addrs being prematurely produced onto pool's completion queue.
Let us make the skb's destructor responsible for producing all addrs
that given skb used.

Introduce struct xsk_addrs which will carry descriptor count with array
of addresses taken from processed descriptors that will be carried via
skb_shared_info::destructor_arg. This way we can refer to it within
xsk_destruct_skb(). In order to mitigate the overhead that will be
coming from memory allocations, let us introduce kmem_cache of
xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
system.

Commit from fixes tag introduced the buggy behavior, it was not broken
from day 1, but rather when xsk multi-buffer got introduced.

Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---

v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
v2:
https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
v3:
https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
v4:
https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
v5:
https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/

v1->v2:
* store addrs in array carried via destructor_arg instead having them
  stored in skb headroom; cleaner and less hacky approach;
v2->v3:
* use kmem_cache for xsk_addrs allocation (Stan/Olek)
* set err when xsk_addrs allocation fails (Dan)
* change xsk_addrs layout to avoid holes
* free xsk_addrs on error path
* rebase
v3->v4:
* have kmem_cache as percpu vars
* don't drop unnecessary braces (unrelated) (Stan)
* use idx + i in xskq_prod_write_addr (Stan)
* alloc kmem_cache on bind (Stan)
* keep num_descs as first member in xsk_addrs (Magnus)
* add ack from Magnus
v4->v5:
* have a single kmem_cache per xsk subsystem (Stan)
v5->v6:
* free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
  (Stan)
* unregister netdev notifier if creating kmem_cache fails (Stan)

---
 net/xdp/xsk.c       | 95 +++++++++++++++++++++++++++++++++++++--------
 net/xdp/xsk_queue.h | 12 ++++++
 2 files changed, 91 insertions(+), 16 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..989d5ffb4273 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,13 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET 32
 
+struct xsk_addrs {
+	u32 num_descs;
+	u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
+
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve_addr(pool->cq, addr);
+	ret = xskq_prod_reserve(pool->cq);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
+				      struct sk_buff *skb)
 {
+	struct xsk_buff_pool *pool = xs->pool;
+	struct xsk_addrs *xsk_addrs;
 	unsigned long flags;
+	u32 num_desc, i;
+	u32 idx;
+
+	xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	num_desc = xsk_addrs->num_descs;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	xskq_prod_submit_n(pool->cq, n);
+	idx = xskq_get_prod(pool->cq);
+
+	for (i = 0; i < num_desc; i++)
+		xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
+	xskq_prod_submit_n(pool->cq, num_desc);
+
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
 }
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
-	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+	xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
 	sock_wfree(skb);
 }
 
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+	struct xsk_addrs *addrs;
 
-	skb_shinfo(skb)->destructor_arg = (void *)num;
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	return addrs->num_descs;
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
+{
+	skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+	struct xsk_addrs *addrs;
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->num_descs++;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 
+	kmem_cache_free(xsk_tx_generic_cache,
+			(struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
 	skb->destructor = sock_wfree;
 	xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
 	/* Free skb without triggering the perf drop trace */
@@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
@@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 			return ERR_PTR(err);
 
 		skb_reserve(skb, hr);
+
+		addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+		if (!addrs) {
+			kfree(skb);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		xsk_set_destructor_arg(skb, addrs);
 	}
 
 	addr = desc->addr;
@@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 {
 	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	bool first_frag = false;
 	int err;
@@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			err = skb_store_bits(skb, 0, buffer, len);
 			if (unlikely(err))
 				goto free_err;
+
+			addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+			if (!addrs) {
+				err = -ENOMEM;
+				goto free_err;
+			}
+
+			xsk_set_destructor_arg(skb, addrs);
+
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct page *page;
@@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
 	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
-	xsk_set_destructor_arg(skb);
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->addrs[addrs->num_descs++] = desc->addr;
 
 	return skb;
 
@@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_set_destructor_arg(xs->skb);
+		xsk_inc_skb_descs(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
@@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+		err = xsk_cq_reserve_locked(xs->pool);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
@@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
 	if (err)
 		goto out_pernet;
 
+	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+						 sizeof(struct xsk_addrs), 0,
+						 SLAB_HWCACHE_ALIGN, NULL);
+	if (!xsk_tx_generic_cache) {
+		err = -ENOMEM;
+		goto out_unreg_notif;
+	}
+
 	return 0;
 
+out_unreg_notif:
+	unregister_netdevice_notifier(&xsk_netdev_notifier);
 out_pernet:
 	unregister_pernet_subsys(&xsk_net_ops);
 out_sk:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 
 /* Functions for producers */
 
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+	return READ_ONCE(q->ring->producer);
+}
+
 static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
 {
 	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	ring->desc[idx & q->ring_mask] = addr;
+}
+
 static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
 					      u32 nb_entries)
 {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread
* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
@ 2025-08-27 23:11 kernel test robot
  0 siblings, 0 replies; 14+ messages in thread
From: kernel test robot @ 2025-08-27 23:11 UTC (permalink / raw)
  To: oe-kbuild; +Cc: lkp, Dan Carpenter

BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20250820154416.2248012-1-maciej.fijalkowski@intel.com>
References: <20250820154416.2248012-1-maciej.fijalkowski@intel.com>
TO: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
TO: bpf@vger.kernel.org
TO: ast@kernel.org
TO: daniel@iogearbox.net
TO: andrii@kernel.org
CC: netdev@vger.kernel.org
CC: magnus.karlsson@intel.com
CC: stfomichev@gmail.com
CC: aleksander.lobakin@intel.com
CC: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
CC: Eryk Kubanski <e.kubanski@partner.samsung.com>

Hi Maciej,

kernel test robot noticed the following build warnings:

[auto build test WARNING on bpf/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Maciej-Fijalkowski/xsk-fix-immature-cq-descriptor-production/20250820-235001
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link:    https://lore.kernel.org/r/20250820154416.2248012-1-maciej.fijalkowski%40intel.com
patch subject: [PATCH v6 bpf] xsk: fix immature cq descriptor production
:::::: branch date: 7 days ago
:::::: commit date: 7 days ago
config: powerpc-randconfig-r072-20250827 (https://download.01.org/0day-ci/archive/20250828/202508280638.Dy15CCOf-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 8.5.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>
| Closes: https://lore.kernel.org/r/202508280638.Dy15CCOf-lkp@intel.com/

smatch warnings:
net/xdp/xsk.c:662 xsk_build_skb_zerocopy() error: use kfree_skb() here instead of kfree(skb)

vim +662 net/xdp/xsk.c

cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  638  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  639  static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  640  					      struct xdp_desc *desc)
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  641  {
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  642  	struct xsk_buff_pool *pool = xs->pool;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  643  	u32 hr, len, ts, offset, copy, copied;
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  644  	struct xsk_addrs *addrs = NULL;
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  645  	struct sk_buff *skb = xs->skb;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  646  	struct page *page;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  647  	void *buffer;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  648  	int err, i;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  649  	u64 addr;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  650  
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  651  	if (!skb) {
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  652  		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  653  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  654  		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  655  		if (unlikely(!skb))
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  656  			return ERR_PTR(err);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  657  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  658  		skb_reserve(skb, hr);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  659  
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  660  		addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  661  		if (!addrs) {
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 @662  			kfree(skb);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  663  			return ERR_PTR(-ENOMEM);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  664  		}
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  665  
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20  666  		xsk_set_destructor_arg(skb, addrs);
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  667  	}
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  668  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  669  	addr = desc->addr;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  670  	len = desc->len;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  671  	ts = pool->unaligned ? len : pool->chunk_size;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  672  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  673  	buffer = xsk_buff_raw_get_data(pool, addr);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  674  	offset = offset_in_page(buffer);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  675  	addr = buffer - pool->addrs;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  676  
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  677  	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  678  		if (unlikely(i >= MAX_SKB_FRAGS))
9d0a67b9d42c630 Tirthendu Sarkar   2023-08-23  679  			return ERR_PTR(-EOVERFLOW);
cf24f5a5feeaae3 Tirthendu Sarkar   2023-07-19  680  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  681  		page = pool->umem->pgs[addr >> PAGE_SHIFT];
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  682  		get_page(page);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  683  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  684  		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  685  		skb_fill_page_desc(skb, i, page, offset, copy);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  686  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  687  		copied += copy;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  688  		addr += copy;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  689  		offset = 0;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  690  	}
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  691  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  692  	skb->len += len;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  693  	skb->data_len += len;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  694  	skb->truesize += ts;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  695  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  696  	refcount_add(ts, &xs->sk.sk_wmem_alloc);
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  697  
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  698  	return skb;
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  699  }
9c8f21e6f8856a9 Xuan Zhuo          2021-02-18  700  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2025-08-30 10:33 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-20 17:06 ` Stanislav Fomichev
2025-08-22 16:20 ` patchwork-bot+netdevbpf
2025-08-26  8:14 ` Dan Carpenter
2025-08-26 12:23   ` Daniel Borkmann
2025-08-26 13:40     ` Jason Xing
2025-08-29 16:14     ` Maciej Fijalkowski
2025-08-30 10:32       ` Jason Xing
2025-08-26 16:00 ` Jason Xing
2025-08-26 18:23   ` Magnus Karlsson
2025-08-26 19:03     ` Maciej Fijalkowski
2025-08-26 19:13       ` Maciej Fijalkowski
2025-08-27  0:25         ` Jason Xing
  -- strict thread matches above, loose matches on Subject: below --
2025-08-27 23:11 kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.