netdev.vger.kernel.org archive mirror
* [PATCH v4 bpf] xsk: fix immature cq descriptor production
@ 2025-08-13 17:12 Maciej Fijalkowski
  2025-08-13 21:31 ` Stanislav Fomichev
  2025-08-17 22:05 ` kernel test robot
  0 siblings, 2 replies; 3+ messages in thread
From: Maciej Fijalkowski @ 2025-08-13 17:12 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii
  Cc: netdev, magnus.karlsson, stfomichev, aleksander.lobakin,
	Maciej Fijalkowski, Eryk Kubanski

Eryk reported an issue, referenced in the Closes: tag, where umem addrs
are prematurely produced onto the pool's completion queue. Let us make
the skb's destructor responsible for producing all addrs that a given
skb used.

Introduce struct xsk_addrs, which carries the descriptor count along
with an array of addresses taken from the processed descriptors, and
pass it via skb_shared_info::destructor_arg. This way we can refer to
it within xsk_destruct_skb(). In order to mitigate the overhead coming
from memory allocations, introduce a kmem_cache of xsk_addrs, but be
smart about scalability, as assigning a unique cache to each socket
might be expensive.

Store the kmem_cache pointers as percpu variables with embedded
refcounting and let the xsk code index them by queue id. In case NIC#0
already runs a copy mode xsk socket on queue #10 and the user starts a
socket on the <NIC#1, qid#10> tuple, the kmem_cache is reused. Keep the
pointer to the kmem_cache in xdp_sock to avoid accessing bss in the
data path.

The commit from the Fixes: tag introduced the buggy behavior; it was
not broken from day 1, but rather when xsk multi-buffer support got
introduced.

Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---

v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
v2:
https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
v3:
https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/

v1->v2:
* store addrs in array carried via destructor_arg instead of having
  them stored in skb headroom; cleaner and less hacky approach;
v2->v3:
* use kmem_cache for xsk_addrs allocation (Stan/Olek)
* set err when xsk_addrs allocation fails (Dan)
* change xsk_addrs layout to avoid holes
* free xsk_addrs on error path
* rebase
v3->v4:
* have kmem_cache as percpu vars
* don't drop unnecessary braces (unrelated) (Stan)
* use idx + i in xskq_prod_write_addr (Stan)
* alloc kmem_cache on bind (Stan)
* keep num_descs as first member in xsk_addrs (Magnus)
* add ack from Magnus

---
 include/net/xdp_sock.h |   1 +
 net/xdp/xsk.c          | 140 +++++++++++++++++++++++++++++++++++------
 net/xdp/xsk_queue.h    |  12 ++++
 3 files changed, 135 insertions(+), 18 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce587a225661..4701e7f41bee 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -61,6 +61,7 @@ struct xdp_sock {
 		XSK_BOUND,
 		XSK_UNBOUND,
 	} state;
+	struct kmem_cache *generic_cache;
 
 	struct xsk_queue *tx ____cacheline_aligned_in_smp;
 	struct list_head tx_list;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..d245784163ec 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,18 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET 32
 
+struct xsk_addrs {
+	u32 num_descs;
+	u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
+struct xsk_generic_cache {
+	struct kmem_cache *cache;
+	refcount_t users;
+};
+
+DEFINE_PER_CPU(struct xsk_generic_cache, system_xsk_generic_cache);
+
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,25 +544,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve_addr(pool->cq, addr);
+	ret = xskq_prod_reserve(pool->cq);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
+				      struct sk_buff *skb)
 {
+	struct xsk_buff_pool *pool = xs->pool;
+	struct xsk_addrs *xsk_addrs;
 	unsigned long flags;
+	u32 num_desc, i;
+	u32 idx;
+
+	xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	num_desc = xsk_addrs->num_descs;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	xskq_prod_submit_n(pool->cq, n);
+	idx = xskq_get_prod(pool->cq);
+
+	for (i = 0; i < num_desc; i++)
+		xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
+	xskq_prod_submit_n(pool->cq, num_desc);
+
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	kmem_cache_free(xs->generic_cache, xsk_addrs);
 }
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,11 +588,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
-	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -576,21 +597,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+	xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
 	sock_wfree(skb);
 }
 
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+	struct xsk_addrs *addrs;
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	return addrs->num_descs;
+}
 
-	skb_shinfo(skb)->destructor_arg = (void *)num;
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
+{
+	skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+	struct xsk_addrs *addrs;
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->num_descs++;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 
+	kmem_cache_free(xs->generic_cache,
+			(struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
 	skb->destructor = sock_wfree;
 	xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
 	/* Free skb without triggering the perf drop trace */
@@ -605,10 +642,12 @@ static void xsk_drop_skb(struct sk_buff *skb)
 }
 
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
-					      struct xdp_desc *desc)
+					      struct xdp_desc *desc,
+					      struct kmem_cache *cache)
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
@@ -623,6 +662,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 			return ERR_PTR(err);
 
 		skb_reserve(skb, hr);
+
+		addrs = kmem_cache_zalloc(cache, GFP_KERNEL);
+		if (!addrs)
+			return ERR_PTR(-ENOMEM);
+
+		xsk_set_destructor_arg(skb, addrs);
 	}
 
 	addr = desc->addr;
@@ -662,12 +707,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 {
 	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	bool first_frag = false;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
-		skb = xsk_build_skb_zerocopy(xs, desc);
+		skb = xsk_build_skb_zerocopy(xs, desc, xs->generic_cache);
 		if (IS_ERR(skb)) {
 			err = PTR_ERR(skb);
 			goto free_err;
@@ -694,6 +740,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			err = skb_store_bits(skb, 0, buffer, len);
 			if (unlikely(err))
 				goto free_err;
+
+			addrs = kmem_cache_zalloc(xs->generic_cache, GFP_KERNEL);
+			if (!addrs) {
+				err = -ENOMEM;
+				goto free_err;
+			}
+
+			xsk_set_destructor_arg(skb, addrs);
+
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct page *page;
@@ -759,7 +814,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
 	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
-	xsk_set_destructor_arg(skb);
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->addrs[addrs->num_descs++] = desc->addr;
 
 	return skb;
 
@@ -769,7 +826,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_set_destructor_arg(xs->skb);
+		xsk_inc_skb_descs(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
@@ -812,7 +869,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+		err = xsk_cq_reserve_locked(xs->pool);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
@@ -1095,6 +1152,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 
 static int xsk_release(struct socket *sock)
 {
+	struct xsk_generic_cache *pcpu_cache;
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct net *net;
@@ -1123,6 +1181,15 @@ static int xsk_release(struct socket *sock)
 	xskq_destroy(xs->fq_tmp);
 	xskq_destroy(xs->cq_tmp);
 
+	pcpu_cache = per_cpu_ptr(&system_xsk_generic_cache, xs->queue_id);
+	if (pcpu_cache->cache) {
+		if (refcount_dec_and_test(&pcpu_cache->users)) {
+			kmem_cache_destroy(pcpu_cache->cache);
+			pcpu_cache->cache = NULL;
+			xs->generic_cache = NULL;
+		}
+	}
+
 	sock_orphan(sk);
 	sock->sk = NULL;
 
@@ -1153,6 +1220,33 @@ static bool xsk_validate_queues(struct xdp_sock *xs)
 	return xs->fq_tmp && xs->cq_tmp;
 }
 
+static int xsk_alloc_generic_xmit_cache(struct xdp_sock *xs, u16 qid)
+{
+	struct xsk_generic_cache *pcpu_cache =
+		per_cpu_ptr(&system_xsk_generic_cache, qid);
+	struct kmem_cache *cache;
+	char cache_name[32];
+
+	if (refcount_read(&pcpu_cache->users) > 0) {
+		refcount_inc(&pcpu_cache->users);
+		xs->generic_cache = pcpu_cache->cache;
+		return 0;
+	}
+
+	snprintf(cache_name, sizeof(cache_name),
+		 "xsk_generic_xmit_cache%d", qid);
+	cache = kmem_cache_create(cache_name, sizeof(struct xsk_addrs), 0,
+				  SLAB_HWCACHE_ALIGN, NULL);
+	if (!cache)
+		return -ENOMEM;
+
+	refcount_set(&pcpu_cache->users, 1);
+	pcpu_cache->cache = cache;
+	xs->generic_cache = pcpu_cache->cache;
+
+	return 0;
+}
+
 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -1306,6 +1400,16 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xs->zc = xs->umem->zc;
 	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
 	xs->queue_id = qid;
+
+	if (!xs->zc) {
+		err = xsk_alloc_generic_xmit_cache(xs, qid);
+		if (err) {
+			xp_destroy(xs->pool);
+			xs->pool = NULL;
+			goto out_unlock;
+		}
+	}
+
 	xp_add_xsk(xs->pool, xs);
 
 	if (qid < dev->real_num_rx_queues) {
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 
 /* Functions for producers */
 
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+	return READ_ONCE(q->ring->producer);
+}
+
 static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
 {
 	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	ring->desc[idx & q->ring_mask] = addr;
+}
+
 static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
 					      u32 nb_entries)
 {
-- 
2.34.1



* Re: [PATCH v4 bpf] xsk: fix immature cq descriptor production
  2025-08-13 17:12 [PATCH v4 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
@ 2025-08-13 21:31 ` Stanislav Fomichev
  2025-08-17 22:05 ` kernel test robot
  1 sibling, 0 replies; 3+ messages in thread
From: Stanislav Fomichev @ 2025-08-13 21:31 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
	aleksander.lobakin, Eryk Kubanski

On 08/13, Maciej Fijalkowski wrote:

[..]

> v3->v4:

[..]

> * have kmem_cache as percpu vars

I wonder whether we're overthinking it and should just have one "global"
kmem_cache for the xsk subsystem. The mm layer should, presumably, do
all that percpu stuff internally already. Have you seen any perf issues
with that approach?
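
Something along these lines, i.e. a single subsystem-wide cache created
once at init time (completely untested sketch; the xsk_addrs_cache name
and the init helper are illustrative, e.g. called from xsk_init()):

static struct kmem_cache *xsk_addrs_cache __read_mostly;

static int xsk_addrs_cache_init(void)
{
	/* one cache shared by all copy-mode sockets; the slab allocator
	 * maintains its own per-cpu caches internally anyway
	 */
	xsk_addrs_cache = kmem_cache_create("xsk_addrs",
					    sizeof(struct xsk_addrs), 0,
					    SLAB_HWCACHE_ALIGN, NULL);
	return xsk_addrs_cache ? 0 : -ENOMEM;
}

and xsk_build_skb() would then simply do:

	addrs = kmem_cache_zalloc(xsk_addrs_cache, GFP_KERNEL);

which would also let the per-socket generic_cache pointer and the
bind/release refcounting go away entirely.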


* Re: [PATCH v4 bpf] xsk: fix immature cq descriptor production
  2025-08-13 17:12 [PATCH v4 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
  2025-08-13 21:31 ` Stanislav Fomichev
@ 2025-08-17 22:05 ` kernel test robot
  1 sibling, 0 replies; 3+ messages in thread
From: kernel test robot @ 2025-08-17 22:05 UTC (permalink / raw)
  To: Maciej Fijalkowski, bpf, ast, daniel, andrii
  Cc: oe-kbuild-all, netdev, magnus.karlsson, stfomichev,
	aleksander.lobakin, Maciej Fijalkowski, Eryk Kubanski

Hi Maciej,

kernel test robot noticed the following build warnings:

[auto build test WARNING on bpf/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Maciej-Fijalkowski/xsk-fix-immature-cq-descriptor-production/20250814-012129
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link:    https://lore.kernel.org/r/20250813171210.2205259-1-maciej.fijalkowski%40intel.com
patch subject: [PATCH v4 bpf] xsk: fix immature cq descriptor production
config: x86_64-randconfig-121-20250817 (https://download.01.org/0day-ci/archive/20250818/202508180712.ZeSSNfkM-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250818/202508180712.ZeSSNfkM-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202508180712.ZeSSNfkM-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> net/xdp/xsk.c:49:1: sparse: sparse: symbol '__pcpu_scope_system_xsk_generic_cache' was not declared. Should it be static?

vim +/__pcpu_scope_system_xsk_generic_cache +49 net/xdp/xsk.c

    48	
  > 49	DEFINE_PER_CPU(struct xsk_generic_cache, system_xsk_generic_cache);
    50	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


