* [PATCH v2 bpf] xsk: fix immature cq descriptor production
@ 2025-07-05 13:55 Maciej Fijalkowski
  2025-07-07 13:59 ` Alexander Lobakin
  2025-07-14 19:04 ` Dan Carpenter
  0 siblings, 2 replies; 10+ messages in thread
From: Maciej Fijalkowski @ 2025-07-05 13:55 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii
  Cc: netdev, magnus.karlsson, stfomichev, Maciej Fijalkowski,
	Eryk Kubanski

Eryk reported an issue that I have put under the Closes: tag, related to
umem addrs being prematurely produced onto the pool's completion queue.
Let us make the skb's destructor responsible for producing all addrs
that a given skb used.

The commit from the Fixes: tag introduced the buggy behavior; it was not
broken from day 1, but rather when xsk multi-buffer support got introduced.

Introduce a struct that carries the descriptor count along with an array
of addresses taken from the processed descriptors, and carry it via
skb_shared_info::destructor_arg. This way we can refer to it within
xsk_destruct_skb().

To summarize, behavior is changed from:
- produce addr to cq, increase cq's cached_prod
- increment descriptor count and store it on
  skb_shared_info::destructor_arg
- (xmit and rest of path...)
- use destructor_arg on skb destructor to update global state of cq
  producer

to the following:
- increment cq's cached_prod
- increment descriptor count, save xdp_desc::addr in custom array and
  store this custom array on skb_shared_info::destructor_arg
- (xmit and rest of path...)
- use destructor_arg on skb destructor to walk the array of addrs and
  write them to cq and finally update global state of cq producer

Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/

v1->v2:
* store addrs in array carried via destructor_arg instead of having them
  stored in skb headroom; cleaner and less hacky approach;
---
 net/xdp/xsk.c       | 79 ++++++++++++++++++++++++++++++++++-----------
 net/xdp/xsk_queue.h | 12 +++++++
 2 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72c000c0ae5f..9f0ce87d440f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,11 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
 
+struct xsk_addrs {
+	u32 num_descs;
+	u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -528,25 +533,38 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve_addr(pool->cq, addr);
+	ret = xskq_prod_reserve(pool->cq);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
+				      struct sk_buff *skb)
 {
+	struct xsk_addrs *xsk_addrs;
 	unsigned long flags;
+	u32 num_desc, i;
+	u32 idx;
+
+	xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	num_desc = xsk_addrs->num_descs;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	xskq_prod_submit_n(pool->cq, n);
+	idx = xskq_get_prod(pool->cq);
+
+	for (i = 0; i < num_desc; i++, idx++)
+		xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
+	xskq_prod_submit_n(pool->cq, num_desc);
+
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	kfree(xsk_addrs);
 }
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -558,29 +576,37 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
-	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
 
-	if (compl->tx_timestamp) {
+	if (compl->tx_timestamp)
 		/* sw completion timestamp, not a real one */
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
-	}
 
-	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
 	sock_wfree(skb);
 }
 
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+	struct xsk_addrs *addrs;
 
-	skb_shinfo(skb)->destructor_arg = (void *)num;
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	return addrs->num_descs;
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
+{
+	skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+	struct xsk_addrs *addrs;
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->num_descs++;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
@@ -605,6 +631,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
@@ -619,6 +646,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 			return ERR_PTR(err);
 
 		skb_reserve(skb, hr);
+
+		addrs = kzalloc(sizeof(*addrs), GFP_KERNEL);
+		if (!addrs)
+			return ERR_PTR(-ENOMEM);
+
+		xsk_set_destructor_arg(skb, addrs);
 	}
 
 	addr = desc->addr;
@@ -658,6 +691,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 {
 	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
+	struct xsk_addrs *addrs = NULL;
 	struct sk_buff *skb = xs->skb;
 	bool first_frag = false;
 	int err;
@@ -690,6 +724,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			err = skb_store_bits(skb, 0, buffer, len);
 			if (unlikely(err))
 				goto free_err;
+
+			addrs = kzalloc(sizeof(*addrs), GFP_KERNEL);
+			if (!addrs)
+				goto free_err;
+
+			xsk_set_destructor_arg(skb, addrs);
+
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct page *page;
@@ -755,7 +796,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
 	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
-	xsk_set_destructor_arg(skb);
+
+	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+	addrs->addrs[addrs->num_descs++] = desc->addr;
 
 	return skb;
 
@@ -765,7 +808,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_set_destructor_arg(xs->skb);
+		xsk_inc_skb_descs(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
@@ -807,7 +850,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+		err = xsk_cq_reserve_locked(xs->pool);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 
 /* Functions for producers */
 
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+	return READ_ONCE(q->ring->producer);
+}
+
 static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
 {
 	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	ring->desc[idx & q->ring_mask] = addr;
+}
+
 static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
 					      u32 nb_entries)
 {
-- 
2.34.1


* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-05 13:55 [PATCH v2 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
@ 2025-07-07 13:59 ` Alexander Lobakin
  2025-07-07 15:06   ` Stanislav Fomichev
  2025-07-14 19:04 ` Dan Carpenter
  1 sibling, 1 reply; 10+ messages in thread
From: Alexander Lobakin @ 2025-07-07 13:59 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, stfomichev,
	Eryk Kubanski

From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Sat,  5 Jul 2025 15:55:12 +0200

> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
> 
> Commit from fixes tag introduced the buggy behavior, it was not broken
> from day 1, but rather when xsk multi-buffer got introduced.
> 
> Introduce a struct which will carry descriptor count with array of
> addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb().
> 
> To summarize, behavior is changed from:
> - produce addr to cq, increase cq's cached_prod
> - increment descriptor count and store it on
> - (xmit and rest of path...)
>   skb_shared_info::destructor_arg
> - use destructor_arg on skb destructor to update global state of cq
>   producer
> 
> to the following:
> - increment cq's cached_prod
> - increment descriptor count, save xdp_desc::addr in custom array and
>   store this custom array on skb_shared_info::destructor_arg
> - (xmit and rest of path...)
> - use destructor_arg on skb destructor to walk the array of addrs and
>   write them to cq and finally update global state of cq producer
> 
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> 
> v1->v2:
> * store addrs in array carried via destructor_arg instead having them
>   stored in skb headroom; cleaner and less hacky approach;

Might look cleaner, but what about the performance given that you're
adding a memory allocation?

(I realize that's only for the skb mode, still)

Yeah we anyway allocate an skb and may even copy the whole frame, just
curious.
I could recommend using skb->cb for that, but its 48 bytes would cover
only 6 addresses =\

Headroom is not that hacky; we even place &xdp_frame there :D

> ---
>  net/xdp/xsk.c       | 79 ++++++++++++++++++++++++++++++++++-----------
>  net/xdp/xsk_queue.h | 12 +++++++
>  2 files changed, 73 insertions(+), 18 deletions(-)

[...]

> @@ -619,6 +646,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  			return ERR_PTR(err);
>  
>  		skb_reserve(skb, hr);
> +
> +		addrs = kzalloc(sizeof(*addrs), GFP_KERNEL);

Are you sure you can use sleeping GFP_KERNEL here, not GFP_ATOMIC?

> +		if (!addrs)
> +			return ERR_PTR(-ENOMEM);
> +
> +		xsk_set_destructor_arg(skb, addrs);
>  	}
>  
>  	addr = desc->addr;

Thanks,
Olek

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-07 13:59 ` Alexander Lobakin
@ 2025-07-07 15:06   ` Stanislav Fomichev
  2025-07-07 15:47     ` Alexander Lobakin
  0 siblings, 1 reply; 10+ messages in thread
From: Stanislav Fomichev @ 2025-07-07 15:06 UTC (permalink / raw)
  To: Alexander Lobakin
  Cc: Maciej Fijalkowski, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

On 07/07, Alexander Lobakin wrote:
> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> Date: Sat,  5 Jul 2025 15:55:12 +0200
> 
> > Eryk reported an issue that I have put under Closes: tag, related to
> > umem addrs being prematurely produced onto pool's completion queue.
> > Let us make the skb's destructor responsible for producing all addrs
> > that given skb used.
> > 
> > Commit from fixes tag introduced the buggy behavior, it was not broken
> > from day 1, but rather when xsk multi-buffer got introduced.
> > 
> > Introduce a struct which will carry descriptor count with array of
> > addresses taken from processed descriptors that will be carried via
> > skb_shared_info::destructor_arg. This way we can refer to it within
> > xsk_destruct_skb().
> > 
> > To summarize, behavior is changed from:
> > - produce addr to cq, increase cq's cached_prod
> > - increment descriptor count and store it on
> > - (xmit and rest of path...)
> >   skb_shared_info::destructor_arg
> > - use destructor_arg on skb destructor to update global state of cq
> >   producer
> > 
> > to the following:
> > - increment cq's cached_prod
> > - increment descriptor count, save xdp_desc::addr in custom array and
> >   store this custom array on skb_shared_info::destructor_arg
> > - (xmit and rest of path...)
> > - use destructor_arg on skb destructor to walk the array of addrs and
> >   write them to cq and finally update global state of cq producer
> > 
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > 
> > v1->v2:
> > * store addrs in array carried via destructor_arg instead having them
> >   stored in skb headroom; cleaner and less hacky approach;
> 
> Might look cleaner, but what about the performance given that you're
> adding a memory allocation?
> 
> (I realize that's only for the skb mode, still)
> 
> Yeah we anyway allocate an skb and may even copy the whole frame, just
> curious.
> I could recommend using skb->cb for that, but its 48 bytes would cover
> only 6 addresses =\

Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
xsk_addrs is bounded by the tx ring size)? Then we can remove the alloc on
tx and replace it with some code to manage that pool of xsk_addrs.
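
A minimal sketch of what such a per-socket pool could look like (purely
illustrative: the struct, its fields and the helpers below are hypothetical,
struct xsk_addrs is taken from the patch, and the locking needed between the
Tx path and the skb destructor is omitted):

/* hypothetical per-xsk pool of xsk_addrs, sized to the Tx ring at bind time */
struct xsk_addrs_pool {
	struct xsk_addrs *slots;	/* tx ring size entries */
	u32 *free_idx;			/* stack of free slot indices */
	u32 free_cnt;
};

static struct xsk_addrs *xsk_addrs_get(struct xsk_addrs_pool *p)
{
	/* pop a free slot, or NULL if the Tx ring worth of slots is in flight */
	return p->free_cnt ? &p->slots[p->free_idx[--p->free_cnt]] : NULL;
}

static void xsk_addrs_put(struct xsk_addrs_pool *p, struct xsk_addrs *a)
{
	/* return the slot from the skb destructor */
	p->free_idx[p->free_cnt++] = a - p->slots;
}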

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-07 15:06   ` Stanislav Fomichev
@ 2025-07-07 15:47     ` Alexander Lobakin
  2025-07-07 18:40       ` Stanislav Fomichev
  0 siblings, 1 reply; 10+ messages in thread
From: Alexander Lobakin @ 2025-07-07 15:47 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Maciej Fijalkowski, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

From: Stanislav Fomichev <stfomichev@gmail.com>
Date: Mon, 7 Jul 2025 08:06:21 -0700

> On 07/07, Alexander Lobakin wrote:
>> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
>> Date: Sat,  5 Jul 2025 15:55:12 +0200
>>
>>> Eryk reported an issue that I have put under Closes: tag, related to
>>> umem addrs being prematurely produced onto pool's completion queue.
>>> Let us make the skb's destructor responsible for producing all addrs
>>> that given skb used.
>>>
>>> Commit from fixes tag introduced the buggy behavior, it was not broken
>>> from day 1, but rather when xsk multi-buffer got introduced.
>>>
>>> Introduce a struct which will carry descriptor count with array of
>>> addresses taken from processed descriptors that will be carried via
>>> skb_shared_info::destructor_arg. This way we can refer to it within
>>> xsk_destruct_skb().
>>>
>>> To summarize, behavior is changed from:
>>> - produce addr to cq, increase cq's cached_prod
>>> - increment descriptor count and store it on
>>> - (xmit and rest of path...)
>>>   skb_shared_info::destructor_arg
>>> - use destructor_arg on skb destructor to update global state of cq
>>>   producer
>>>
>>> to the following:
>>> - increment cq's cached_prod
>>> - increment descriptor count, save xdp_desc::addr in custom array and
>>>   store this custom array on skb_shared_info::destructor_arg
>>> - (xmit and rest of path...)
>>> - use destructor_arg on skb destructor to walk the array of addrs and
>>>   write them to cq and finally update global state of cq producer
>>>
>>> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
>>> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
>>> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
>>> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
>>> ---
>>> v1:
>>> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
>>>
>>> v1->v2:
>>> * store addrs in array carried via destructor_arg instead having them
>>>   stored in skb headroom; cleaner and less hacky approach;
>>
>> Might look cleaner, but what about the performance given that you're
>> adding a memory allocation?
>>
>> (I realize that's only for the skb mode, still)
>>
>> Yeah we anyway allocate an skb and may even copy the whole frame, just
>> curious.
>> I could recommend using skb->cb for that, but its 48 bytes would cover
>> only 6 addresses =\

BTW, wouldn't num_descs from that new structure be the same as
shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?

> 
> Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
> xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
> and replace it with some code to manage that pool of xsk_addrs..

Nice idea BTW.

We could even use system per-cpu Page Pools to allocate these structs*
:D It wouldn't waste 1 page per one struct as PP is frag-aware and has
API for allocating only a small frag.

Headroom stuff was also ok to me: we either way allocate a new skb, so
we could allocate it with a bit bigger headroom and put that table there
being sure that nobody will overwrite it (some drivers insert special
headers or descriptors in front of the actual skb->data).

[*] Offtop: we could also use system PP to allocate skbs in
xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
buffers would be recycled then.

Thanks,
Olek

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-07 15:47     ` Alexander Lobakin
@ 2025-07-07 18:40       ` Stanislav Fomichev
  2025-07-08 14:14         ` Maciej Fijalkowski
  0 siblings, 1 reply; 10+ messages in thread
From: Stanislav Fomichev @ 2025-07-07 18:40 UTC (permalink / raw)
  To: Alexander Lobakin
  Cc: Maciej Fijalkowski, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

On 07/07, Alexander Lobakin wrote:
> From: Stanislav Fomichev <stfomichev@gmail.com>
> Date: Mon, 7 Jul 2025 08:06:21 -0700
> 
> > On 07/07, Alexander Lobakin wrote:
> >> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> >> Date: Sat,  5 Jul 2025 15:55:12 +0200
> >>
> >>> Eryk reported an issue that I have put under Closes: tag, related to
> >>> umem addrs being prematurely produced onto pool's completion queue.
> >>> Let us make the skb's destructor responsible for producing all addrs
> >>> that given skb used.
> >>>
> >>> Commit from fixes tag introduced the buggy behavior, it was not broken
> >>> from day 1, but rather when xsk multi-buffer got introduced.
> >>>
> >>> Introduce a struct which will carry descriptor count with array of
> >>> addresses taken from processed descriptors that will be carried via
> >>> skb_shared_info::destructor_arg. This way we can refer to it within
> >>> xsk_destruct_skb().
> >>>
> >>> To summarize, behavior is changed from:
> >>> - produce addr to cq, increase cq's cached_prod
> >>> - increment descriptor count and store it on
> >>> - (xmit and rest of path...)
> >>>   skb_shared_info::destructor_arg
> >>> - use destructor_arg on skb destructor to update global state of cq
> >>>   producer
> >>>
> >>> to the following:
> >>> - increment cq's cached_prod
> >>> - increment descriptor count, save xdp_desc::addr in custom array and
> >>>   store this custom array on skb_shared_info::destructor_arg
> >>> - (xmit and rest of path...)
> >>> - use destructor_arg on skb destructor to walk the array of addrs and
> >>>   write them to cq and finally update global state of cq producer
> >>>
> >>> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> >>> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> >>> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> >>> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> >>> ---
> >>> v1:
> >>> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> >>>
> >>> v1->v2:
> >>> * store addrs in array carried via destructor_arg instead having them
> >>>   stored in skb headroom; cleaner and less hacky approach;
> >>
> >> Might look cleaner, but what about the performance given that you're
> >> adding a memory allocation?
> >>
> >> (I realize that's only for the skb mode, still)
> >>
> >> Yeah we anyway allocate an skb and may even copy the whole frame, just
> >> curious.
> >> I could recommend using skb->cb for that, but its 48 bytes would cover
> >> only 6 addresses =\
> 
> BTW isn't num_descs from that new structure would be the same as
> shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?

So you're saying we don't need to store it? Agreed. But storing the rest
in cb still might be problematic with kconfig-configurable MAX_SKB_FRAGS?

> > Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
> > xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
> > and replace it with some code to manage that pool of xsk_addrs..
> 
> Nice idea BTW.
> 
> We could even use system per-cpu Page Pools to allocate these structs*
> :D It wouldn't waste 1 page per one struct as PP is frag-aware and has
> API for allocating only a small frag.
> 
> Headroom stuff was also ok to me: we either way allocate a new skb, so
> we could allocate it with a bit bigger headroom and put that table there
> being sure that nobody will overwrite it (some drivers insert special
> headers or descriptors in front of the actual skb->data).
> 
> [*] Offtop: we could also use system PP to allocate skbs in
> xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
> xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
> buffers would be recycled then.

Or maybe kmem_cache_alloc_node with a custom cache is good enough?
Headroom also feels ok if we store the whole xsk_addrs struct in it.
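
For the custom-cache idea, a minimal sketch (assuming a dedicated slab cache
for struct xsk_addrs from the patch; the cache name, init hook and helper
names are hypothetical):

#include <linux/slab.h>

static struct kmem_cache *xsk_addrs_cache;

/* e.g. called once from xsk_init() */
static int xsk_addrs_cache_init(void)
{
	xsk_addrs_cache = kmem_cache_create("xsk_addrs",
					    sizeof(struct xsk_addrs), 0,
					    SLAB_HWCACHE_ALIGN, NULL);
	return xsk_addrs_cache ? 0 : -ENOMEM;
}

static struct xsk_addrs *xsk_addrs_alloc(void)
{
	/* in xsk_build_skb(), replacing kzalloc(sizeof(*addrs), GFP_KERNEL) */
	return kmem_cache_zalloc(xsk_addrs_cache, GFP_KERNEL);
}

static void xsk_addrs_free(struct xsk_addrs *addrs)
{
	/* in xsk_cq_submit_addr_locked(), replacing kfree(xsk_addrs) */
	kmem_cache_free(xsk_addrs_cache, addrs);
}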

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-07 18:40       ` Stanislav Fomichev
@ 2025-07-08 14:14         ` Maciej Fijalkowski
  2025-07-08 15:54           ` Alexander Lobakin
  0 siblings, 1 reply; 10+ messages in thread
From: Maciej Fijalkowski @ 2025-07-08 14:14 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Alexander Lobakin, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

On Mon, Jul 07, 2025 at 11:40:48AM -0700, Stanislav Fomichev wrote:
> On 07/07, Alexander Lobakin wrote:
> > From: Stanislav Fomichev <stfomichev@gmail.com>
> > Date: Mon, 7 Jul 2025 08:06:21 -0700
> > 
> > > On 07/07, Alexander Lobakin wrote:
> > >> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > >> Date: Sat,  5 Jul 2025 15:55:12 +0200
> > >>
> > >>> Eryk reported an issue that I have put under Closes: tag, related to
> > >>> umem addrs being prematurely produced onto pool's completion queue.
> > >>> Let us make the skb's destructor responsible for producing all addrs
> > >>> that given skb used.
> > >>>
> > >>> Commit from fixes tag introduced the buggy behavior, it was not broken
> > >>> from day 1, but rather when xsk multi-buffer got introduced.
> > >>>
> > >>> Introduce a struct which will carry descriptor count with array of
> > >>> addresses taken from processed descriptors that will be carried via
> > >>> skb_shared_info::destructor_arg. This way we can refer to it within
> > >>> xsk_destruct_skb().
> > >>>
> > >>> To summarize, behavior is changed from:
> > >>> - produce addr to cq, increase cq's cached_prod
> > >>> - increment descriptor count and store it on
> > >>> - (xmit and rest of path...)
> > >>>   skb_shared_info::destructor_arg
> > >>> - use destructor_arg on skb destructor to update global state of cq
> > >>>   producer
> > >>>
> > >>> to the following:
> > >>> - increment cq's cached_prod
> > >>> - increment descriptor count, save xdp_desc::addr in custom array and
> > >>>   store this custom array on skb_shared_info::destructor_arg
> > >>> - (xmit and rest of path...)
> > >>> - use destructor_arg on skb destructor to walk the array of addrs and
> > >>>   write them to cq and finally update global state of cq producer
> > >>>
> > >>> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > >>> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > >>> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > >>> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > >>> ---
> > >>> v1:
> > >>> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > >>>
> > >>> v1->v2:
> > >>> * store addrs in array carried via destructor_arg instead having them
> > >>>   stored in skb headroom; cleaner and less hacky approach;
> > >>
> > >> Might look cleaner, but what about the performance given that you're
> > >> adding a memory allocation?
> > >>
> > >> (I realize that's only for the skb mode, still)
> > >>
> > >> Yeah we anyway allocate an skb and may even copy the whole frame, just
> > >> curious.
> > >> I could recommend using skb->cb for that, but its 48 bytes would cover
> > >> only 6 addresses =\
> > 
> > BTW isn't num_descs from that new structure would be the same as
> > shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?
> 
> So you're saying we don't need to store it? Agreed. But storing the rest
> in cb still might be problematic with kconfig-configurable MAX_SKB_FRAGS?

Hi Stan & Olek,

No, as said in v1, drivers might linearize the skb and all frags will be
lost. This storage is needed, unfortunately.

> 
> > > Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
> > > xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
> > > and replace it with some code to manage that pool of xsk_addrs..

That would be pool-bound, which makes it a shared resource, so I believe
we would repeat the problem being fixed here ;)

> > 
> > Nice idea BTW.
> > 
> > We could even use system per-cpu Page Pools to allocate these structs*
> > :D It wouldn't waste 1 page per one struct as PP is frag-aware and has
> > API for allocating only a small frag.
> > 
> > Headroom stuff was also ok to me: we either way allocate a new skb, so
> > we could allocate it with a bit bigger headroom and put that table there
> > being sure that nobody will overwrite it (some drivers insert special
> > headers or descriptors in front of the actual skb->data).

The headroom approach was causing one of the bpf selftests to fail, but I
didn't check the reason in depth. I didn't really like the v1 check in the
destructor for whether the addr array was corrupted, so I came up with v2,
which seems to me a cleaner fix.

> > 
> > [*] Offtop: we could also use system PP to allocate skbs in
> > xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
> > xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
> > buffers would be recycled then.
> 
> Or maybe kmem_cache_alloc_node with a custom cache is good enough?
> Headroom also feels ok if we store the whole xsk_addrs struct in it.

Yep, both of these approaches were something I considered, but keep in mind
it's a bugfix, so I didn't want to go with something flashy. I have not
observed a big performance impact, but I checked only with MAX_SKB_FRAGS
set to the standard value.

Would you guys be ok if I do the follow-up with a possible optimization
after my vacation, which would be a -next candidate?

Thanks,
MF

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-08 14:14         ` Maciej Fijalkowski
@ 2025-07-08 15:54           ` Alexander Lobakin
  2025-07-08 17:49             ` Stanislav Fomichev
  0 siblings, 1 reply; 10+ messages in thread
From: Alexander Lobakin @ 2025-07-08 15:54 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: Stanislav Fomichev, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 8 Jul 2025 16:14:39 +0200

> On Mon, Jul 07, 2025 at 11:40:48AM -0700, Stanislav Fomichev wrote:
>> On 07/07, Alexander Lobakin wrote:

[...]

>>> BTW isn't num_descs from that new structure would be the same as
>>> shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?
>>
>> So you're saying we don't need to store it? Agreed. But storing the rest
>> in cb still might be problematic with kconfig-configurable MAX_SKB_FRAGS?

For sure skb->cb is too small for 17+ u64s.

> 
> Hi Stan & Olek,
> 
> no, as said in v1 drivers might linearize the skb and all frags will be
> lost. This storage is needed unfortunately.

Aaah sorry. In this case yeah, you need this separate frag count.

> 
>>
>>>> Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
>>>> xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
>>>> and replace it with some code to manage that pool of xsk_addrs..
> 
> That would be pool-bound which makes it a shared resource so I believe
> that we would repeat the problem being fixed here ;)

Except the system Page Pool idea right below maybe :>

> 
>>>
>>> Nice idea BTW.
>>>
>>> We could even use system per-cpu Page Pools to allocate these structs*
>>> :D It wouldn't waste 1 page per one struct as PP is frag-aware and has
>>> API for allocating only a small frag.
>>>
>>> Headroom stuff was also ok to me: we either way allocate a new skb, so
>>> we could allocate it with a bit bigger headroom and put that table there
>>> being sure that nobody will overwrite it (some drivers insert special
>>> headers or descriptors in front of the actual skb->data).
> 
> headroom approach was causing one of bpf selftests to fail, but I didn't
> check in-depth the reason. I didn't really like the check in destructor if
> addr array was corrupted in v1 and I came up with v2 which seems to me a
> cleaner fix.
> 
>>>
>>> [*] Offtop: we could also use system PP to allocate skbs in
>>> xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
>>> xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
>>> buffers would be recycled then.
>>
>> Or maybe kmem_cache_alloc_node with a custom cache is good enough?
>> Headroom also feels ok if we store the whole xsk_addrs struct in it.
> 
> Yep both of these approaches was something I considered, but keep in mind
> it's a bugfix so I didn't want to go with something flashy. I have not
> observed big performance impact but I checked only MAX_SKB_FRAGS being set
> to standard value.
> 
> Would you guys be ok if I do the follow-up with possible optimization
> after my vacation which would be a -next candidate?

As a fix, it's totally fine for me to go in the current form, sure.

> 
> Thanks,
> MF

Thanks,
Olek

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-08 15:54           ` Alexander Lobakin
@ 2025-07-08 17:49             ` Stanislav Fomichev
  2025-07-09 10:00               ` Maciej Fijalkowski
  0 siblings, 1 reply; 10+ messages in thread
From: Stanislav Fomichev @ 2025-07-08 17:49 UTC (permalink / raw)
  To: Alexander Lobakin
  Cc: Maciej Fijalkowski, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

On 07/08, Alexander Lobakin wrote:
> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> Date: Tue, 8 Jul 2025 16:14:39 +0200
> 
> > On Mon, Jul 07, 2025 at 11:40:48AM -0700, Stanislav Fomichev wrote:
> >> On 07/07, Alexander Lobakin wrote:
> 
> [...]
> 
> >>> BTW isn't num_descs from that new structure would be the same as
> >>> shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?
> >>
> >> So you're saying we don't need to store it? Agreed. But storing the rest
> >> in cb still might be problematic with kconfig-configurable MAX_SKB_FRAGS?
> 
> For sure skb->cb is too small for 17+ u64s.
> 
> > 
> > Hi Stan & Olek,
> > 
> > no, as said in v1 drivers might linearize the skb and all frags will be
> > lost. This storage is needed unfortunately.
> 
> Aaah sorry. In this case yeah, you need this separate frag count.
> 
> > 
> >>
> >>>> Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
> >>>> xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
> >>>> and replace it with some code to manage that pool of xsk_addrs..
> > 
> > That would be pool-bound which makes it a shared resource so I believe
> > that we would repeat the problem being fixed here ;)
> 
> Except the system Page Pool idea right below maybe :>
 
It doesn't have to be a shared resource; the pool (in whatever form) can be
per xsk (unless I'm missing something).

> >>> Nice idea BTW.
> >>>
> >>> We could even use system per-cpu Page Pools to allocate these structs*
> >>> :D It wouldn't waste 1 page per one struct as PP is frag-aware and has
> >>> API for allocating only a small frag.
> >>>
> >>> Headroom stuff was also ok to me: we either way allocate a new skb, so
> >>> we could allocate it with a bit bigger headroom and put that table there
> >>> being sure that nobody will overwrite it (some drivers insert special
> >>> headers or descriptors in front of the actual skb->data).
> > 
> > headroom approach was causing one of bpf selftests to fail, but I didn't
> > check in-depth the reason. I didn't really like the check in destructor if
> > addr array was corrupted in v1 and I came up with v2 which seems to me a
> > cleaner fix.
> > 
> >>>
> >>> [*] Offtop: we could also use system PP to allocate skbs in
> >>> xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
> >>> xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
> >>> buffers would be recycled then.
> >>
> >> Or maybe kmem_cache_alloc_node with a custom cache is good enough?
> >> Headroom also feels ok if we store the whole xsk_addrs struct in it.
> > 
> > Yep both of these approaches was something I considered, but keep in mind
> > it's a bugfix so I didn't want to go with something flashy. I have not
> > observed big performance impact but I checked only MAX_SKB_FRAGS being set
> > to standard value.
> > 
> > Would you guys be ok if I do the follow-up with possible optimization
> > after my vacation which would be a -next candidate?
> 
> As a fix, it's totally fine for me to go in the current form, sure.

+1

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-08 17:49             ` Stanislav Fomichev
@ 2025-07-09 10:00               ` Maciej Fijalkowski
  0 siblings, 0 replies; 10+ messages in thread
From: Maciej Fijalkowski @ 2025-07-09 10:00 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Alexander Lobakin, bpf, ast, daniel, andrii, netdev,
	magnus.karlsson, Eryk Kubanski

On Tue, Jul 08, 2025 at 10:49:36AM -0700, Stanislav Fomichev wrote:
> On 07/08, Alexander Lobakin wrote:
> > From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > Date: Tue, 8 Jul 2025 16:14:39 +0200
> > 
> > > On Mon, Jul 07, 2025 at 11:40:48AM -0700, Stanislav Fomichev wrote:
> > >> On 07/07, Alexander Lobakin wrote:
> > 
> > [...]
> > 
> > >>> BTW isn't num_descs from that new structure would be the same as
> > >>> shinfo->nr_frags + 1 (or just nr_frags for xsk_build_skb_zerocopy())?
> > >>
> > >> So you're saying we don't need to store it? Agreed. But storing the rest
> > >> in cb still might be problematic with kconfig-configurable MAX_SKB_FRAGS?
> > 
> > For sure skb->cb is too small for 17+ u64s.
> > 
> > > 
> > > Hi Stan & Olek,
> > > 
> > > no, as said in v1 drivers might linearize the skb and all frags will be
> > > lost. This storage is needed unfortunately.
> > 
> > Aaah sorry. In this case yeah, you need this separate frag count.
> > 
> > > 
> > >>
> > >>>> Can we pre-allocate an array of xsk_addrs during xsk_bind (the number of
> > >>>> xsk_addrs is bound by the tx ring size)? Then we can remove the alloc on tx
> > >>>> and replace it with some code to manage that pool of xsk_addrs..
> > > 
> > > That would be pool-bound which makes it a shared resource so I believe
> > > that we would repeat the problem being fixed here ;)
> > 
> > Except the system Page Pool idea right below maybe :>
>  
>  It doesn't have to be a shared resource, the pool (in whatever form) can be
>  per xsk. (unless I'm missing something)

It can not. When you attach multiple xsks to the same {dev,qid} tuple, the
pool is shared.
https://elixir.bootlin.com/linux/v6.16-rc5/source/net/xdp/xsk.c#L1249

Not sure right now if we could maybe store the pointer to xsk_addrs in
xdp_sock. Let me get back to this after my time off.

BTW, I didn't realize you added yourself as an xsk reviewer. It would have
been nice to CC Magnus or me when doing so :P

> 
> > >>> Nice idea BTW.
> > >>>
> > >>> We could even use system per-cpu Page Pools to allocate these structs*
> > >>> :D It wouldn't waste 1 page per one struct as PP is frag-aware and has
> > >>> API for allocating only a small frag.
> > >>>
> > >>> Headroom stuff was also ok to me: we either way allocate a new skb, so
> > >>> we could allocate it with a bit bigger headroom and put that table there
> > >>> being sure that nobody will overwrite it (some drivers insert special
> > >>> headers or descriptors in front of the actual skb->data).
> > > 
> > > headroom approach was causing one of bpf selftests to fail, but I didn't
> > > check in-depth the reason. I didn't really like the check in destructor if
> > > addr array was corrupted in v1 and I came up with v2 which seems to me a
> > > cleaner fix.
> > > 
> > >>>
> > >>> [*] Offtop: we could also use system PP to allocate skbs in
> > >>> xsk_build_skb() just like it's done in xdp_build_skb_from_zc() +
> > >>> xdp_copy_frags_from_zc() -- no way to avoid memcpy(), but the payload
> > >>> buffers would be recycled then.
> > >>
> > >> Or maybe kmem_cache_alloc_node with a custom cache is good enough?
> > >> Headroom also feels ok if we store the whole xsk_addrs struct in it.
> > > 
> > > Yep both of these approaches was something I considered, but keep in mind
> > > it's a bugfix so I didn't want to go with something flashy. I have not
> > > observed big performance impact but I checked only MAX_SKB_FRAGS being set
> > > to standard value.
> > > 
> > > Would you guys be ok if I do the follow-up with possible optimization
> > > after my vacation which would be a -next candidate?
> > 
> > As a fix, it's totally fine for me to go in the current form, sure.
> 
> +1

* Re: [PATCH v2 bpf] xsk: fix immature cq descriptor production
  2025-07-05 13:55 [PATCH v2 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
  2025-07-07 13:59 ` Alexander Lobakin
@ 2025-07-14 19:04 ` Dan Carpenter
  1 sibling, 0 replies; 10+ messages in thread
From: Dan Carpenter @ 2025-07-14 19:04 UTC (permalink / raw)
  To: oe-kbuild, Maciej Fijalkowski, bpf, ast, daniel, andrii
  Cc: lkp, oe-kbuild-all, netdev, magnus.karlsson, stfomichev,
	Maciej Fijalkowski, Eryk Kubanski

Hi Maciej,

kernel test robot noticed the following build warnings:

url:    https://github.com/intel-lab-lkp/linux/commits/Maciej-Fijalkowski/xsk-fix-immature-cq-descriptor-production/20250705-215714
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link:    https://lore.kernel.org/r/20250705135512.1963216-1-maciej.fijalkowski%40intel.com
patch subject: [PATCH v2 bpf] xsk: fix immature cq descriptor production
config: x86_64-randconfig-r071-20250706 (https://download.01.org/0day-ci/archive/20250706/202507061447.DwFSGum1-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202507061447.DwFSGum1-lkp@intel.com/

smatch warnings:
net/xdp/xsk.c:819 xsk_build_skb() warn: passing zero to 'ERR_PTR'

vim +/ERR_PTR +819 net/xdp/xsk.c

9c8f21e6f8856a Xuan Zhuo                 2021-02-18  689  static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  690  				     struct xdp_desc *desc)
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  691  {
48eb03dd26304c Stanislav Fomichev        2023-11-27  692  	struct xsk_tx_metadata *meta = NULL;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  693  	struct net_device *dev = xs->dev;
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  694  	struct xsk_addrs *addrs = NULL;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  695  	struct sk_buff *skb = xs->skb;
48eb03dd26304c Stanislav Fomichev        2023-11-27  696  	bool first_frag = false;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  697  	int err;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  698  
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  699  	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  700  		skb = xsk_build_skb_zerocopy(xs, desc);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  701  		if (IS_ERR(skb)) {
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  702  			err = PTR_ERR(skb);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  703  			goto free_err;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  704  		}
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  705  	} else {
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  706  		u32 hr, tr, len;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  707  		void *buffer;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  708  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  709  		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  710  		len = desc->len;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  711  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  712  		if (!skb) {
0c0d0f42ffa6ac Felix Maurer              2024-11-14  713  			first_frag = true;
0c0d0f42ffa6ac Felix Maurer              2024-11-14  714  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  715  			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  716  			tr = dev->needed_tailroom;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  717  			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  718  			if (unlikely(!skb))
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  719  				goto free_err;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  720  
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  721  			skb_reserve(skb, hr);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  722  			skb_put(skb, len);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  723  
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  724  			err = skb_store_bits(skb, 0, buffer, len);
0c0d0f42ffa6ac Felix Maurer              2024-11-14  725  			if (unlikely(err))
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  726  				goto free_err;
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  727  
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  728  			addrs = kzalloc(sizeof(*addrs), GFP_KERNEL);
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  729  			if (!addrs)
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  730  				goto free_err;

err is not set on this path.
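
The warning points at the kzalloc() failure branch above; a minimal fix
sketch (assuming -ENOMEM is the intended error code) would be:

			addrs = kzalloc(sizeof(*addrs), GFP_KERNEL);
			if (!addrs) {
				err = -ENOMEM;	/* avoid returning ERR_PTR(0) below */
				goto free_err;
			}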

regards,
dan carpenter

67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  731  
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  732  			xsk_set_destructor_arg(skb, addrs);
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  733  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  734  		} else {
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  735  			int nr_frags = skb_shinfo(skb)->nr_frags;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  736  			struct page *page;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  737  			u8 *vaddr;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  738  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  739  			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  740  				err = -EOVERFLOW;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  741  				goto free_err;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  742  			}
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  743  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  744  			page = alloc_page(xs->sk.sk_allocation);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  745  			if (unlikely(!page)) {
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  746  				err = -EAGAIN;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  747  				goto free_err;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  748  			}
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  749  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  750  			vaddr = kmap_local_page(page);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  751  			memcpy(vaddr, buffer, len);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  752  			kunmap_local(vaddr);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  753  
2127c604383666 Sebastian Andrzej Siewior 2024-02-02  754  			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
2127c604383666 Sebastian Andrzej Siewior 2024-02-02  755  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  756  		}
48eb03dd26304c Stanislav Fomichev        2023-11-27  757  
48eb03dd26304c Stanislav Fomichev        2023-11-27  758  		if (first_frag && desc->options & XDP_TX_METADATA) {
48eb03dd26304c Stanislav Fomichev        2023-11-27  759  			if (unlikely(xs->pool->tx_metadata_len == 0)) {
48eb03dd26304c Stanislav Fomichev        2023-11-27  760  				err = -EINVAL;
48eb03dd26304c Stanislav Fomichev        2023-11-27  761  				goto free_err;
48eb03dd26304c Stanislav Fomichev        2023-11-27  762  			}
48eb03dd26304c Stanislav Fomichev        2023-11-27  763  
48eb03dd26304c Stanislav Fomichev        2023-11-27  764  			meta = buffer - xs->pool->tx_metadata_len;
ce59f9686e0eca Stanislav Fomichev        2023-11-27  765  			if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
ce59f9686e0eca Stanislav Fomichev        2023-11-27  766  				err = -EINVAL;
ce59f9686e0eca Stanislav Fomichev        2023-11-27  767  				goto free_err;
ce59f9686e0eca Stanislav Fomichev        2023-11-27  768  			}
48eb03dd26304c Stanislav Fomichev        2023-11-27  769  
48eb03dd26304c Stanislav Fomichev        2023-11-27  770  			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
48eb03dd26304c Stanislav Fomichev        2023-11-27  771  				if (unlikely(meta->request.csum_start +
48eb03dd26304c Stanislav Fomichev        2023-11-27  772  					     meta->request.csum_offset +
48eb03dd26304c Stanislav Fomichev        2023-11-27  773  					     sizeof(__sum16) > len)) {
48eb03dd26304c Stanislav Fomichev        2023-11-27  774  					err = -EINVAL;
48eb03dd26304c Stanislav Fomichev        2023-11-27  775  					goto free_err;
48eb03dd26304c Stanislav Fomichev        2023-11-27  776  				}
48eb03dd26304c Stanislav Fomichev        2023-11-27  777  
48eb03dd26304c Stanislav Fomichev        2023-11-27  778  				skb->csum_start = hr + meta->request.csum_start;
48eb03dd26304c Stanislav Fomichev        2023-11-27  779  				skb->csum_offset = meta->request.csum_offset;
48eb03dd26304c Stanislav Fomichev        2023-11-27  780  				skb->ip_summed = CHECKSUM_PARTIAL;
11614723af26e7 Stanislav Fomichev        2023-11-27  781  
11614723af26e7 Stanislav Fomichev        2023-11-27  782  				if (unlikely(xs->pool->tx_sw_csum)) {
11614723af26e7 Stanislav Fomichev        2023-11-27  783  					err = skb_checksum_help(skb);
11614723af26e7 Stanislav Fomichev        2023-11-27  784  					if (err)
11614723af26e7 Stanislav Fomichev        2023-11-27  785  						goto free_err;
11614723af26e7 Stanislav Fomichev        2023-11-27  786  				}
48eb03dd26304c Stanislav Fomichev        2023-11-27  787  			}
ca4419f15abd19 Song Yoong Siang          2025-02-16  788  
ca4419f15abd19 Song Yoong Siang          2025-02-16  789  			if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
ca4419f15abd19 Song Yoong Siang          2025-02-16  790  				skb->skb_mstamp_ns = meta->request.launch_time;
48eb03dd26304c Stanislav Fomichev        2023-11-27  791  		}
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  792  	}
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  793  
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  794  	skb->dev = dev;
10bbf1652c1cca Eric Dumazet              2023-09-21  795  	skb->priority = READ_ONCE(xs->sk.sk_priority);
3c5b4d69c358a9 Eric Dumazet              2023-07-28  796  	skb->mark = READ_ONCE(xs->sk.sk_mark);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  797  	skb->destructor = xsk_destruct_skb;
48eb03dd26304c Stanislav Fomichev        2023-11-27  798  	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  799  
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  800  	addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  801  	addrs->addrs[addrs->num_descs++] = desc->addr;
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  802  
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  803  	return skb;
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  804  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  805  free_err:
0c0d0f42ffa6ac Felix Maurer              2024-11-14  806  	if (first_frag && skb)
0c0d0f42ffa6ac Felix Maurer              2024-11-14  807  		kfree_skb(skb);
0c0d0f42ffa6ac Felix Maurer              2024-11-14  808  
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  809  	if (err == -EOVERFLOW) {
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  810  		/* Drop the packet */
67a37dcecabbf0 Maciej Fijalkowski        2025-07-05  811  		xsk_inc_skb_descs(xs->skb);
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  812  		xsk_drop_skb(xs->skb);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  813  		xskq_cons_release(xs->tx);
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  814  	} else {
9d0a67b9d42c63 Tirthendu Sarkar          2023-08-23  815  		/* Let application retry */
e6c4047f512280 Maciej Fijalkowski        2024-10-07  816  		xsk_cq_cancel_locked(xs->pool, 1);
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  817  	}
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19  818  
cf24f5a5feeaae Tirthendu Sarkar          2023-07-19 @819  	return ERR_PTR(err);
9c8f21e6f8856a Xuan Zhuo                 2021-02-18  820  }

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

