* [PATCH bpf-next v1 0/2] xsk: introduce pre-allocated memory per xsk CQ
@ 2025-12-09 8:59 Jason Xing
2025-12-09 8:59 ` [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket Jason Xing
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Jason Xing @ 2025-12-09 8:59 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
john.fastabend
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
This series builds on the previous work[1] and fixes the issue without
much performance impact, by adding a pre-allocated memory area to each
xsk.
[1]: commit 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Jason Xing (2):
xsk: introduce local_cq for each af_xdp socket
xsk: introduce a dedicated local completion queue for each xsk
include/net/xdp_sock.h | 8 ++
net/xdp/xsk.c | 202 ++++++++++++++++++++---------------------
2 files changed, 105 insertions(+), 105 deletions(-)
--
2.41.3
* [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket
2025-12-09 8:59 [PATCH bpf-next v1 0/2] xsk: introduce pre-allocated memory per xsk CQ Jason Xing
@ 2025-12-09 8:59 ` Jason Xing
2025-12-09 9:28 ` bot+bpf-ci
2025-12-09 8:59 ` [PATCH bpf-next v1 2/2] xsk: introduce a dedicated local completion queue for each xsk Jason Xing
2025-12-09 14:29 ` [syzbot ci] Re: xsk: introduce pre-allocated memory per xsk CQ syzbot ci
2 siblings, 1 reply; 7+ messages in thread
From: Jason Xing @ 2025-12-09 8:59 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
john.fastabend
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
This is a preparation patch that adds storage for descriptor addr(s),
so that each skb reaching the end of its life can publish the
corresponding addr(s) to its completion queue, where userspace can
read them.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/xdp_sock.h | 8 ++++++++
net/xdp/xsk.c | 44 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 23e8861e8b25..c53ab2609d8c 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -45,6 +45,12 @@ struct xsk_map {
struct xdp_sock __rcu *xsk_map[];
};
+struct local_cq {
+ u32 prod ____cacheline_aligned_in_smp;
+ u32 ring_mask ____cacheline_aligned_in_smp;
+ u64 desc[] ____cacheline_aligned_in_smp;
+};
+
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@@ -89,6 +95,8 @@ struct xdp_sock {
struct mutex mutex;
struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
+ /* Maintain addr(s) of descriptors locally */
+ struct local_cq *lcq;
};
/*
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f093c3453f64..ce165d093105 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1212,6 +1212,24 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
}
}
+static int xsk_init_local_cq(struct xdp_sock *xs)
+{
+ u32 nentries = xs->pool->cq->nentries;
+ size_t size = struct_size_t(struct local_cq, desc, nentries);
+
+ xs->lcq = vmalloc(size);
+ if (!xs->lcq)
+ return -ENOMEM;
+ xs->lcq->ring_mask = nentries - 1;
+
+ return 0;
+}
+
+static void xsk_clear_local_cq(struct xdp_sock *xs)
+{
+ vfree(xs->lcq);
+}
+
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -1241,6 +1259,7 @@ static int xsk_release(struct socket *sock)
xskq_destroy(xs->tx);
xskq_destroy(xs->fq_tmp);
xskq_destroy(xs->cq_tmp);
+ xsk_clear_local_cq(xs);
sock_orphan(sk);
sock->sk = NULL;
@@ -1360,9 +1379,18 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
goto out_unlock;
}
+ err = xsk_init_local_cq(xs);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
qid);
if (err) {
+ xsk_clear_local_cq(xs);
xp_destroy(xs->pool);
xs->pool = NULL;
sockfd_put(sock);
@@ -1380,6 +1408,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
xp_get_pool(umem_xs->pool);
xs->pool = umem_xs->pool;
+ err = xsk_init_local_cq(xs);
+ if (err) {
+ xp_put_pool(xs->pool);
+ xs->pool = NULL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
/* If underlying shared umem was created without Tx
* ring, allocate Tx descs array that Tx batching API
* utilizes
@@ -1387,6 +1422,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
if (xs->tx && !xs->pool->tx_descs) {
err = xp_alloc_tx_descs(xs->pool, xs);
if (err) {
+ xsk_clear_local_cq(xs);
xp_put_pool(xs->pool);
xs->pool = NULL;
sockfd_put(sock);
@@ -1409,8 +1445,16 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
goto out_unlock;
}
+ err = xsk_init_local_cq(xs);
+ if (err) {
+ xp_destroy(xs->pool);
+ xs->pool = NULL;
+ goto out_unlock;
+ }
+
err = xp_assign_dev(xs->pool, dev, qid, flags);
if (err) {
+ xsk_clear_local_cq(xs);
xp_destroy(xs->pool);
xs->pool = NULL;
goto out_unlock;
--
2.41.3
* [PATCH bpf-next v1 2/2] xsk: introduce a dedicated local completion queue for each xsk
2025-12-09 8:59 [PATCH bpf-next v1 0/2] xsk: introduce pre-allocated memory per xsk CQ Jason Xing
2025-12-09 8:59 ` [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket Jason Xing
@ 2025-12-09 8:59 ` Jason Xing
2025-12-09 14:29 ` [syzbot ci] Re: xsk: introduce pre-allocated memory per xsk CQ syzbot ci
2 siblings, 0 replies; 7+ messages in thread
From: Jason Xing @ 2025-12-09 8:59 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
john.fastabend
Cc: bpf, netdev, Jason Xing
From: Jason Xing <kernelxing@tencent.com>
Before commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), there was an issue[1] that caused descriptors to be
published incorrectly under a race condition. That commit fixes the
issue but adds more memory operations to the xmit hot path and the
interrupt context, which hurts performance.
Building on the existing infrastructure, this patch proposes a new
solution: use a pre-allocated local completion queue to avoid frequent
memory allocation and freeing. The benefit comes from replacing
xsk_tx_generic_cache with the local cq.
The core logic is as follows:
1. Allocate a new local completion queue when the real queue is set up.
2. Write the descriptors into the local cq in the xmit path, and
   record prod as @start_pos, the start position of the skb in this
   queue, so that in the destruction phase the skb can easily copy its
   desc addr(s) from the local cq into the real cq.
3. Initialize the upper 24 bits of destructor_arg with @start_pos in
   xsk_skb_init_misc().
4. Initialize the lower 8 bits of destructor_arg with the number of
   descriptors the skb owns in xsk_inc_num_desc().
5. Write the desc addr(s) starting at @start_pos from the local cq
   one by one into the real cq in xsk_destruct_skb(), then sync the
   global state of the cq as before.
The format of destructor_arg is designed as:

 ------------------------ --------
 |        start_pos       |  num  |
 ------------------------ --------

The upper 24 bits are enough to index the temporary descriptors, and
the lower 8 bits are enough to hold the number of descriptors a single
skb owns.
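For illustration only (the pack/unpack helper names below are
hypothetical and not part of the patch; the shift and mask macros are
the ones the patch defines), the encoding can be sketched as:

	#define XSK_DESTRUCTOR_DESCS_SHIFT	8
	#define XSK_DESTRUCTOR_DESCS_MASK \
		((1ULL << XSK_DESTRUCTOR_DESCS_SHIFT) - 1)

	/* Pack the start position in the local cq together with the
	 * number of descriptors the skb owns, mirroring how the patch
	 * encodes destructor_arg.
	 */
	static long pack_destructor_arg(u32 start_pos, u8 num)
	{
		return ((long)start_pos << XSK_DESTRUCTOR_DESCS_SHIFT) | num;
	}

	static u32 arg_start_pos(long arg)
	{
		return arg >> XSK_DESTRUCTOR_DESCS_SHIFT;
	}

	static u8 arg_num_descs(long arg)
	{
		return arg & XSK_DESTRUCTOR_DESCS_MASK;
	}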
[1]: https://lore.kernel.org/all/20250530095957.43248-1-e.kubanski@partner.samsung.com/
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
net/xdp/xsk.c | 158 +++++++++++++++++---------------------------------
1 file changed, 53 insertions(+), 105 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ce165d093105..c51ea9526e5f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,8 +41,6 @@ struct xsk_addrs {
u64 addrs[MAX_SKB_FRAGS + 1];
};
-static struct kmem_cache *xsk_tx_generic_cache;
-
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -539,81 +537,87 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
+static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
{
+ struct xsk_buff_pool *pool = xs->pool;
+ struct local_cq *lcq = xs->lcq;
int ret;
spin_lock(&pool->cq_cached_prod_lock);
ret = xskq_prod_reserve(pool->cq);
spin_unlock(&pool->cq_cached_prod_lock);
+ if (!ret)
+ lcq->desc[lcq->prod++ & lcq->ring_mask] = addr;
return ret;
}
-static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+#define XSK_DESTRUCTOR_DESCS_SHIFT 8
+#define XSK_DESTRUCTOR_DESCS_MASK \
+ ((1ULL << XSK_DESTRUCTOR_DESCS_SHIFT) - 1)
+
+static long xsk_get_destructor_arg(struct sk_buff *skb)
{
- return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+ return (long)skb_shinfo(skb)->destructor_arg;
}
-static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+static u8 xsk_get_num_desc(struct sk_buff *skb)
{
- return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+ long val = xsk_get_destructor_arg(skb);
+
+ return (u8)val & XSK_DESTRUCTOR_DESCS_MASK;
}
-static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+/* Record the position of first desc in local cq */
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb,
+ struct xdp_sock *xs)
{
- skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ long val;
+
+ val = ((xs->lcq->prod - 1) & xs->lcq->ring_mask) << XSK_DESTRUCTOR_DESCS_SHIFT;
+ skb_shinfo(skb)->destructor_arg = (void *)val;
}
+/* Only update the lower bits to adjust the number of descriptors the
+ * skb carries. The count stays within MAX_SKB_FRAGS, so the lower 8
+ * bits cannot overflow and the value can safely be increased by one
+ * directly.
+ */
static void xsk_inc_num_desc(struct sk_buff *skb)
{
- struct xsk_addrs *xsk_addr;
+ long val = xsk_get_destructor_arg(skb) + 1;
- if (!xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- xsk_addr->num_descs++;
- }
+ skb_shinfo(skb)->destructor_arg = (void *)val;
}
-static u32 xsk_get_num_desc(struct sk_buff *skb)
+static u32 xsk_get_start_addr(struct sk_buff *skb)
{
- struct xsk_addrs *xsk_addr;
+ long val = xsk_get_destructor_arg(skb);
- if (xsk_skb_destructor_is_addr(skb))
- return 1;
+ return val >> XSK_DESTRUCTOR_DESCS_SHIFT;
+}
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+static void xsk_cq_write_addr(struct sk_buff *skb, u32 desc_processed)
+{
+ struct xsk_buff_pool *pool = xdp_sk(skb->sk)->pool;
+ u32 idx, addr, pos = xsk_get_start_addr(skb);
+ struct xdp_sock *xs = xdp_sk(skb->sk);
- return xsk_addr->num_descs;
+ idx = xskq_get_prod(pool->cq) + desc_processed;
+ addr = xs->lcq->desc[(pos + desc_processed) & xs->lcq->ring_mask];
+ xskq_prod_write_addr(pool->cq, idx, addr);
}
-static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
- struct sk_buff *skb)
+static void xsk_cq_submit_addr_locked(struct sk_buff *skb)
{
- u32 num_descs = xsk_get_num_desc(skb);
- struct xsk_addrs *xsk_addr;
- u32 descs_processed = 0;
+ struct xsk_buff_pool *pool = xdp_sk(skb->sk)->pool;
+ u8 i, num = xsk_get_num_desc(skb);
unsigned long flags;
- u32 idx, i;
spin_lock_irqsave(&pool->cq_prod_lock, flags);
- idx = xskq_get_prod(pool->cq);
-
- if (unlikely(num_descs > 1)) {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-
- for (i = 0; i < num_descs; i++) {
- xskq_prod_write_addr(pool->cq, idx + descs_processed,
- xsk_addr->addrs[i]);
- descs_processed++;
- }
- kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
- } else {
- xskq_prod_write_addr(pool->cq, idx,
- xsk_skb_destructor_get_addr(skb));
- descs_processed++;
- }
- xskq_prod_submit_n(pool->cq, descs_processed);
+ for (i = 0; i < num; i++)
+ xsk_cq_write_addr(skb, i);
+ xskq_prod_submit_n(pool->cq, num);
spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
}
@@ -634,30 +638,23 @@ void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
+ xsk_cq_submit_addr_locked(skb);
sock_wfree(skb);
}
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
- u64 addr)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs)
{
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
- xsk_skb_destructor_set_addr(skb, addr);
+ xsk_skb_destructor_set_addr(skb, xs);
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
u32 num_descs = xsk_get_num_desc(skb);
- struct xsk_addrs *xsk_addr;
-
- if (unlikely(num_descs > 1)) {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
- }
skb->destructor = sock_wfree;
xsk_cq_cancel_locked(xs->pool, num_descs);
@@ -734,33 +731,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
skb_reserve(skb, hr);
- xsk_skb_init_misc(skb, xs, desc->addr);
+ xsk_skb_init_misc(skb, xs);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
if (unlikely(err))
return ERR_PTR(err);
}
- } else {
- struct xsk_addrs *xsk_addr;
-
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- }
-
- /* in case of -EOVERFLOW that could happen below,
- * xsk_consume_skb() will release this node as whole skb
- * would be dropped, which implies freeing all list elements
- */
- xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
len = desc->len;
@@ -828,7 +804,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (unlikely(err))
goto free_err;
- xsk_skb_init_misc(skb, xs, desc->addr);
+ xsk_skb_init_misc(skb, xs);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc,
xs->pool, hr);
@@ -837,25 +813,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
- struct xsk_addrs *xsk_addr;
struct page *page;
u8 *vaddr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr) {
- err = -ENOMEM;
- goto free_err;
- }
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- }
-
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
err = -EOVERFLOW;
goto free_err;
@@ -873,8 +833,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
-
- xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
}
@@ -931,7 +889,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_locked(xs->pool);
+ err = xsk_cq_reserve_addr_locked(xs, desc.addr);
if (err) {
err = -EAGAIN;
goto out;
@@ -1978,18 +1936,8 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
- xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
- sizeof(struct xsk_addrs),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!xsk_tx_generic_cache) {
- err = -ENOMEM;
- goto out_unreg_notif;
- }
-
return 0;
-out_unreg_notif:
- unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk:
--
2.41.3
* Re: [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket
2025-12-09 8:59 ` [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket Jason Xing
@ 2025-12-09 9:28 ` bot+bpf-ci
2025-12-09 14:19 ` Jason Xing
0 siblings, 1 reply; 7+ messages in thread
From: bot+bpf-ci @ 2025-12-09 9:28 UTC (permalink / raw)
To: kerneljasonxing, davem, edumazet, kuba, pabeni, bjorn,
magnus.karlsson, maciej.fijalkowski, jonathan.lemon, sdf, ast,
daniel, hawk, john.fastabend
Cc: bpf, netdev, kernelxing, ast, andrii, daniel, martin.lau, eddyz87,
yonghong.song, clm, ihor.solodrai
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 23e8861e8b25..c53ab2609d8c 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -45,6 +45,12 @@ struct xsk_map {
> struct xdp_sock __rcu *xsk_map[];
> };
>
> +struct local_cq {
> + u32 prod ____cacheline_aligned_in_smp;
> + u32 ring_mask ____cacheline_aligned_in_smp;
> + u64 desc[] ____cacheline_aligned_in_smp;
> +};
The prod field is not initialized in xsk_init_local_cq(). Since vmalloc()
does not zero memory, prod will contain garbage values when first used.
The next commit in this series (722af5ef5295 "xsk: introduce a dedicated
local completion queue for each xsk") uses prod with lcq->prod++ before
any initialization occurs.
Should prod be initialized to 0 in xsk_init_local_cq()?
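For example, switching the allocation to vzalloc() would zero prod
implicitly (a sketch, not a tested fix):

	static int xsk_init_local_cq(struct xdp_sock *xs)
	{
		u32 nentries = xs->pool->cq->nentries;
		size_t size = struct_size_t(struct local_cq, desc, nentries);

		/* vzalloc() zeroes the area, so prod starts at 0 */
		xs->lcq = vzalloc(size);
		if (!xs->lcq)
			return -ENOMEM;
		xs->lcq->ring_mask = nentries - 1;

		return 0;
	}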
> +
> struct xdp_sock {
> /* struct sock must be the first member of struct xdp_sock */
> struct sock sk;
[ ... ]
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index f093c3453f64..ce165d093105 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
[ ... ]
> +static int xsk_init_local_cq(struct xdp_sock *xs)
> +{
> + u32 nentries = xs->pool->cq->nentries;
> + size_t size = struct_size_t(struct local_cq, desc, nentries);
> +
> + xs->lcq = vmalloc(size);
> + if (!xs->lcq)
> + return -ENOMEM;
> + xs->lcq->ring_mask = nentries - 1;
> +
> + return 0;
> +}
Can xs->pool->cq be NULL here? In the XDP_SHARED_UMEM path with
different queue_id or device (xsk_bind()->xp_create_and_assign_umem()->
xsk_init_local_cq()), xp_create_and_assign_umem() sets pool->cq =
xs->cq_tmp, but there's no validation that xs->cq_tmp is non-NULL
before calling xsk_init_local_cq().
For comparison, the non-shared umem path validates cq_tmp via
xsk_validate_queues() before reaching xsk_init_local_cq(). Should the
XDP_SHARED_UMEM path with different qid/dev also validate that cq_tmp
is set before calling xp_create_and_assign_umem()?
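One possible shape for such a guard, placed in xsk_init_local_cq()
itself (a sketch only; validating cq_tmp earlier in xsk_bind() may be
the better placement):

	static int xsk_init_local_cq(struct xdp_sock *xs)
	{
		u32 nentries;

		/* Reject binds that never configured a completion queue */
		if (!xs->pool->cq)
			return -EINVAL;

		nentries = xs->pool->cq->nentries;
		xs->lcq = vzalloc(struct_size_t(struct local_cq, desc, nentries));
		if (!xs->lcq)
			return -ENOMEM;
		xs->lcq->ring_mask = nentries - 1;

		return 0;
	}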
> +
> +static void xsk_clear_local_cq(struct xdp_sock *xs)
> +{
> + vfree(xs->lcq);
> +}
[ ... ]
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20058064407
* Re: [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket
2025-12-09 9:28 ` bot+bpf-ci
@ 2025-12-09 14:19 ` Jason Xing
0 siblings, 0 replies; 7+ messages in thread
From: Jason Xing @ 2025-12-09 14:19 UTC (permalink / raw)
To: bot+bpf-ci
Cc: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
john.fastabend, bpf, netdev, kernelxing, andrii, martin.lau,
eddyz87, yonghong.song, clm, ihor.solodrai
On Tue, Dec 9, 2025 at 5:29 PM <bot+bpf-ci@kernel.org> wrote:
>
> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > index 23e8861e8b25..c53ab2609d8c 100644
> > --- a/include/net/xdp_sock.h
> > +++ b/include/net/xdp_sock.h
> > @@ -45,6 +45,12 @@ struct xsk_map {
> > struct xdp_sock __rcu *xsk_map[];
> > };
> >
> > +struct local_cq {
> > + u32 prod ____cacheline_aligned_in_smp;
> > + u32 ring_mask ____cacheline_aligned_in_smp;
> > + u64 desc[] ____cacheline_aligned_in_smp;
> > +};
>
> The prod field is not initialized in xsk_init_local_cq(). Since vmalloc()
> does not zero memory, prod will contain garbage values when first used.
> The next commit in this series (722af5ef5295 "xsk: introduce a dedicated
> local completion queue for each xsk") uses prod with lcq->prod++ before
> any initialization occurs.
>
> Should prod be initialized to 0 in xsk_init_local_cq()?
Yes, it can be done here. Will fix it. Thanks.
>
> > +
> > struct xdp_sock {
> > /* struct sock must be the first member of struct xdp_sock */
> > struct sock sk;
>
> [ ... ]
>
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index f093c3453f64..ce165d093105 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
>
> [ ... ]
>
> > +static int xsk_init_local_cq(struct xdp_sock *xs)
> > +{
> > + u32 nentries = xs->pool->cq->nentries;
> > + size_t size = struct_size_t(struct local_cq, desc, nentries);
> > +
> > + xs->lcq = vmalloc(size);
> > + if (!xs->lcq)
> > + return -ENOMEM;
> > + xs->lcq->ring_mask = nentries - 1;
> > +
> > + return 0;
> > +}
>
> Can xs->pool->cq be NULL here? In the XDP_SHARED_UMEM path with
> different queue_id or device (xsk_bind()->xp_create_and_assign_umem()->
> xsk_init_local_cq()), xp_create_and_assign_umem() sets pool->cq =
> xs->cq_tmp, but there's no validation that xs->cq_tmp is non-NULL
> before calling xsk_init_local_cq().
Yes, it can happen here, though in theory it should not. I did a
quick analysis at the following link:
https://lore.kernel.org/all/CAL+tcoDQ6MyuGRE8mJi_cafqyO0wfgOw5WTqnCvKGbQBhKOGpA@mail.gmail.com/.
I'm feeling tired and sleepy right now and will dig deeper into this
tomorrow. Hopefully I'm right.
My take is that this NULL cq pointer case should be avoided.
Thanks,
Jason
>
> For comparison, the non-shared umem path validates cq_tmp via
> xsk_validate_queues() before reaching xsk_init_local_cq(). Should the
> XDP_SHARED_UMEM path with different qid/dev also validate that cq_tmp
> is set before calling xp_create_and_assign_umem()?
>
> > +
> > +static void xsk_clear_local_cq(struct xdp_sock *xs)
> > +{
> > + vfree(xs->lcq);
> > +}
>
> [ ... ]
>
>
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
>
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20058064407
* [syzbot ci] Re: xsk: introduce pre-allocated memory per xsk CQ
2025-12-09 8:59 [PATCH bpf-next v1 0/2] xsk: introduce pre-allocated memory per xsk CQ Jason Xing
2025-12-09 8:59 ` [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket Jason Xing
2025-12-09 8:59 ` [PATCH bpf-next v1 2/2] xsk: introduce a dedicated local completion queue for each xsk Jason Xing
@ 2025-12-09 14:29 ` syzbot ci
2025-12-09 23:35 ` Jason Xing
2 siblings, 1 reply; 7+ messages in thread
From: syzbot ci @ 2025-12-09 14:29 UTC (permalink / raw)
To: ast, bjorn, bpf, daniel, davem, edumazet, hawk, john.fastabend,
jonathan.lemon, kerneljasonxing, kernelxing, kuba,
maciej.fijalkowski, magnus.karlsson, netdev, pabeni, sdf
Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v1] xsk: introduce pre-allocated memory per xsk CQ
https://lore.kernel.org/all/20251209085950.96231-1-kerneljasonxing@gmail.com
* [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket
* [PATCH bpf-next v1 2/2] xsk: introduce a dedicated local completion queue for each xsk
and found the following issue:
WARNING in vfree
Full report is available here:
https://ci.syzbot.org/series/5df45d2b-41d6-4675-b3ad-4503516a9ae1
***
WARNING in vfree
tree: bpf-next
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git
base: 835a50753579aa8368a08fca307e638723207768
arch: amd64
compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config: https://ci.syzbot.org/builds/726617c2-a613-4879-9987-91e65545dba1/config
C repro: https://ci.syzbot.org/findings/0c82a0b1-8cb3-49ae-9fbe-fa3bd02c2ba0/c_repro
syz repro: https://ci.syzbot.org/findings/0c82a0b1-8cb3-49ae-9fbe-fa3bd02c2ba0/syz_repro
------------[ cut here ]------------
Trying to vfree() nonexistent vm area (ffffc900034e6000)
WARNING: mm/vmalloc.c:3423 at 0x0, CPU#0: syz.0.19/5983
Modules linked in:
CPU: 0 UID: 0 PID: 5983 Comm: syz.0.19 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:vfree+0x393/0x400 mm/vmalloc.c:3422
Code: e8 72 1d ab ff 4c 89 f7 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 5f 5d e9 0c fa ff ff e8 57 1d ab ff 48 8d 3d 10 fc 6e 0d 4c 89 f6 <67> 48 0f b9 3a e9 fd fd ff ff e8 3e 1d ab ff 4c 89 e7 e8 66 00 00
RSP: 0018:ffffc90004e37c40 EFLAGS: 00010293
RAX: ffffffff82162d09 RBX: 0000000000000000 RCX: ffff88816c66d7c0
RDX: 0000000000000000 RSI: ffffc900034e6000 RDI: ffffffff8f852920
RBP: 1ffff1102289d8bf R08: ffff88810005f1bb R09: 1ffff1102000be37
R10: dffffc0000000000 R11: ffffed102000be38 R12: ffff88801eed1818
R13: dffffc0000000000 R14: ffffc900034e6000 R15: ffff8881144ec608
FS: 0000555574db2500(0000) GS:ffff88818eab1000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00002000000000c0 CR3: 0000000112a48000 CR4: 00000000000006f0
Call Trace:
<TASK>
xsk_clear_local_cq net/xdp/xsk.c:1188 [inline]
xsk_release+0x6b3/0x880 net/xdp/xsk.c:1220
__sock_release net/socket.c:653 [inline]
sock_close+0xc3/0x240 net/socket.c:1446
__fput+0x44c/0xa70 fs/file_table.c:468
task_work_run+0x1d4/0x260 kernel/task_work.c:233
resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
__exit_to_user_mode_loop kernel/entry/common.c:44 [inline]
exit_to_user_mode_loop+0xff/0x4f0 kernel/entry/common.c:75
__exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline]
syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline]
do_syscall_64+0x2e3/0xf80 arch/x86/entry/syscall_64.c:100
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7eff8518f7c9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffd6ca61b78 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
RAX: 0000000000000000 RBX: 000000000001253b RCX: 00007eff8518f7c9
RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
RBP: 0000000000000000 R08: 0000000000000001 R09: 000000086ca61e6f
R10: 0000001b2f920000 R11: 0000000000000246 R12: 00007eff853e5fac
R13: 00007eff853e5fa0 R14: ffffffffffffffff R15: 0000000000000003
</TASK>
----------------
Code disassembly (best guess):
0: e8 72 1d ab ff call 0xffab1d77
5: 4c 89 f7 mov %r14,%rdi
8: 48 83 c4 18 add $0x18,%rsp
c: 5b pop %rbx
d: 41 5c pop %r12
f: 41 5d pop %r13
11: 41 5e pop %r14
13: 41 5f pop %r15
15: 5d pop %rbp
16: e9 0c fa ff ff jmp 0xfffffa27
1b: e8 57 1d ab ff call 0xffab1d77
20: 48 8d 3d 10 fc 6e 0d lea 0xd6efc10(%rip),%rdi # 0xd6efc37
27: 4c 89 f6 mov %r14,%rsi
* 2a: 67 48 0f b9 3a ud1 (%edx),%rdi <-- trapping instruction
2f: e9 fd fd ff ff jmp 0xfffffe31
34: e8 3e 1d ab ff call 0xffab1d77
39: 4c 89 e7 mov %r12,%rdi
3c: e8 .byte 0xe8
3d: 66 00 00 data16 add %al,(%rax)
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
* Re: [syzbot ci] Re: xsk: introduce pre-allocated memory per xsk CQ
2025-12-09 14:29 ` [syzbot ci] Re: xsk: introduce pre-allocated memory per xsk CQ syzbot ci
@ 2025-12-09 23:35 ` Jason Xing
0 siblings, 0 replies; 7+ messages in thread
From: Jason Xing @ 2025-12-09 23:35 UTC (permalink / raw)
To: syzbot ci
Cc: ast, bjorn, bpf, daniel, davem, edumazet, hawk, john.fastabend,
jonathan.lemon, kernelxing, kuba, maciej.fijalkowski,
magnus.karlsson, netdev, pabeni, sdf, syzbot, syzkaller-bugs
On Tue, Dec 9, 2025 at 10:29 PM syzbot ci
<syzbot+cib018e69d32b0c0b5@syzkaller.appspotmail.com> wrote:
>
> syzbot ci has tested the following series
>
> [v1] xsk: introduce pre-allocated memory per xsk CQ
> https://lore.kernel.org/all/20251209085950.96231-1-kerneljasonxing@gmail.com
> * [PATCH bpf-next v1 1/2] xsk: introduce local_cq for each af_xdp socket
> * [PATCH bpf-next v1 2/2] xsk: introduce a dedicated local completion queue for each xsk
>
> and found the following issue:
> WARNING in vfree
>
> Full report is available here:
> https://ci.syzbot.org/series/5df45d2b-41d6-4675-b3ad-4503516a9ae1
>
> ***
>
> WARNING in vfree
>
> tree: bpf-next
> URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git
> base: 835a50753579aa8368a08fca307e638723207768
> arch: amd64
> compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
> config: https://ci.syzbot.org/builds/726617c2-a613-4879-9987-91e65545dba1/config
> C repro: https://ci.syzbot.org/findings/0c82a0b1-8cb3-49ae-9fbe-fa3bd02c2ba0/c_repro
> syz repro: https://ci.syzbot.org/findings/0c82a0b1-8cb3-49ae-9fbe-fa3bd02c2ba0/syz_repro
This is very similar to an issue I've explained in two other threads.
In this case the xsk (the r1 socket here) only initializes the rx
queue, not the completion queue, so xsk_init_local_cq() reads an
nentries of the pool cq that is actually zero. Clearing the resulting
zero-byte lcq area in xsk_release() then triggers the warning below.
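If that analysis holds, one conservative mitigation (a sketch, not the
final fix) is to clear the pointer after freeing so that any later
release path sees NULL, which vfree() handles as a harmless no-op;
rejecting a zero-entry cq in xsk_init_local_cq() would address the
root cause:

	static void xsk_clear_local_cq(struct xdp_sock *xs)
	{
		vfree(xs->lcq);
		/* avoid a second vfree() from a later error/release path */
		xs->lcq = NULL;
	}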
Thanks,
Jason
>
> ------------[ cut here ]------------
> Trying to vfree() nonexistent vm area (ffffc900034e6000)
> WARNING: mm/vmalloc.c:3423 at 0x0, CPU#0: syz.0.19/5983
> Modules linked in:
> CPU: 0 UID: 0 PID: 5983 Comm: syz.0.19 Not tainted syzkaller #0 PREEMPT(full)
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> RIP: 0010:vfree+0x393/0x400 mm/vmalloc.c:3422
> Code: e8 72 1d ab ff 4c 89 f7 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 5f 5d e9 0c fa ff ff e8 57 1d ab ff 48 8d 3d 10 fc 6e 0d 4c 89 f6 <67> 48 0f b9 3a e9 fd fd ff ff e8 3e 1d ab ff 4c 89 e7 e8 66 00 00
> RSP: 0018:ffffc90004e37c40 EFLAGS: 00010293
> RAX: ffffffff82162d09 RBX: 0000000000000000 RCX: ffff88816c66d7c0
> RDX: 0000000000000000 RSI: ffffc900034e6000 RDI: ffffffff8f852920
> RBP: 1ffff1102289d8bf R08: ffff88810005f1bb R09: 1ffff1102000be37
> R10: dffffc0000000000 R11: ffffed102000be38 R12: ffff88801eed1818
> R13: dffffc0000000000 R14: ffffc900034e6000 R15: ffff8881144ec608
> FS: 0000555574db2500(0000) GS:ffff88818eab1000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00002000000000c0 CR3: 0000000112a48000 CR4: 00000000000006f0
> Call Trace:
> <TASK>
> xsk_clear_local_cq net/xdp/xsk.c:1188 [inline]
> xsk_release+0x6b3/0x880 net/xdp/xsk.c:1220
> __sock_release net/socket.c:653 [inline]
> sock_close+0xc3/0x240 net/socket.c:1446
> __fput+0x44c/0xa70 fs/file_table.c:468
> task_work_run+0x1d4/0x260 kernel/task_work.c:233
> resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
> __exit_to_user_mode_loop kernel/entry/common.c:44 [inline]
> exit_to_user_mode_loop+0xff/0x4f0 kernel/entry/common.c:75
> __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
> syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
> syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline]
> syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline]
> do_syscall_64+0x2e3/0xf80 arch/x86/entry/syscall_64.c:100
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> RIP: 0033:0x7eff8518f7c9
> Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007ffd6ca61b78 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
> RAX: 0000000000000000 RBX: 000000000001253b RCX: 00007eff8518f7c9
> RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
> RBP: 0000000000000000 R08: 0000000000000001 R09: 000000086ca61e6f
> R10: 0000001b2f920000 R11: 0000000000000246 R12: 00007eff853e5fac
> R13: 00007eff853e5fa0 R14: ffffffffffffffff R15: 0000000000000003
> </TASK>
> ----------------
> Code disassembly (best guess):
> 0: e8 72 1d ab ff call 0xffab1d77
> 5: 4c 89 f7 mov %r14,%rdi
> 8: 48 83 c4 18 add $0x18,%rsp
> c: 5b pop %rbx
> d: 41 5c pop %r12
> f: 41 5d pop %r13
> 11: 41 5e pop %r14
> 13: 41 5f pop %r15
> 15: 5d pop %rbp
> 16: e9 0c fa ff ff jmp 0xfffffa27
> 1b: e8 57 1d ab ff call 0xffab1d77
> 20: 48 8d 3d 10 fc 6e 0d lea 0xd6efc10(%rip),%rdi # 0xd6efc37
> 27: 4c 89 f6 mov %r14,%rsi
> * 2a: 67 48 0f b9 3a ud1 (%edx),%rdi <-- trapping instruction
> 2f: e9 fd fd ff ff jmp 0xfffffe31
> 34: e8 3e 1d ab ff call 0xffab1d77
> 39: 4c 89 e7 mov %r12,%rdi
> 3c: e8 .byte 0xe8
> 3d: 66 00 00 data16 add %al,(%rax)
>
>
> ***
>
> If these findings have caused you to resend the series or submit a
> separate fix, please add the following tag to your commit message:
> Tested-by: syzbot@syzkaller.appspotmail.com
>
> ---
> This report is generated by a bot. It may contain errors.
> syzbot ci engineers can be reached at syzkaller@googlegroups.com.