* [PATCH v6 bpf] xsk: fix immature cq descriptor production
@ 2025-08-20 15:44 Maciej Fijalkowski
2025-08-20 17:06 ` Stanislav Fomichev
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: Maciej Fijalkowski @ 2025-08-20 15:44 UTC (permalink / raw)
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, stfomichev, aleksander.lobakin,
Maciej Fijalkowski, Eryk Kubanski
Eryk reported an issue that I have put under Closes: tag, related to
umem addrs being prematurely produced onto pool's completion queue.
Let us make the skb's destructor responsible for producing all addrs
that given skb used.
Introduce struct xsk_addrs which will carry descriptor count with array
of addresses taken from processed descriptors that will be carried via
skb_shared_info::destructor_arg. This way we can refer to it within
xsk_destruct_skb(). In order to mitigate the overhead that will be
coming from memory allocations, let us introduce kmem_cache of
xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
system.
Commit from fixes tag introduced the buggy behavior, it was not broken
from day 1, but rather when xsk multi-buffer got introduced.
Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
v2:
https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
v3:
https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
v4:
https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
v5:
https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
v1->v2:
* store addrs in array carried via destructor_arg instead having them
stored in skb headroom; cleaner and less hacky approach;
v2->v3:
* use kmem_cache for xsk_addrs allocation (Stan/Olek)
* set err when xsk_addrs allocation fails (Dan)
* change xsk_addrs layout to avoid holes
* free xsk_addrs on error path
* rebase
v3->v4:
* have kmem_cache as percpu vars
* don't drop unnecessary braces (unrelated) (Stan)
* use idx + i in xskq_prod_write_addr (Stan)
* alloc kmem_cache on bind (Stan)
* keep num_descs as first member in xsk_addrs (Magnus)
* add ack from Magnus
v4->v5:
* have a single kmem_cache per xsk subsystem (Stan)
v5->v6:
* free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
(Stan)
* unregister netdev notifier if creating kmem_cache fails (Stan)
---
net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
net/xdp/xsk_queue.h | 12 ++++++
2 files changed, 91 insertions(+), 16 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..989d5ffb4273 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,13 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
+struct xsk_addrs {
+ u32 num_descs;
+ u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
+
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
+ struct sk_buff *skb)
{
+ struct xsk_buff_pool *pool = xs->pool;
+ struct xsk_addrs *xsk_addrs;
unsigned long flags;
+ u32 num_desc, i;
+ u32 idx;
+
+ xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ num_desc = xsk_addrs->num_descs;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ idx = xskq_get_prod(pool->cq);
+
+ for (i = 0; i < num_desc; i++)
+ xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
+ xskq_prod_submit_n(pool->cq, num_desc);
+
spin_unlock_irqrestore(&pool->cq_lock, flags);
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
}
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
static void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+ struct xsk_addrs *addrs;
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ return addrs->num_descs;
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
+{
+ skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+ struct xsk_addrs *addrs;
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->num_descs++;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ kmem_cache_free(xsk_tx_generic_cache,
+ (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
skb->destructor = sock_wfree;
xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
/* Free skb without triggering the perf drop trace */
@@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+
+ addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!addrs) {
+ kfree(skb);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ xsk_set_destructor_arg(skb, addrs);
}
addr = desc->addr;
@@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
{
struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
bool first_frag = false;
int err;
@@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!addrs) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+
+ xsk_set_destructor_arg(skb, addrs);
+
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct page *page;
@@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->addrs[addrs->num_descs++] = desc->addr;
return skb;
@@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
+ xsk_inc_skb_descs(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ err = xsk_cq_reserve_locked(xs->pool);
if (err) {
err = -EAGAIN;
goto out;
@@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addrs), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!xsk_tx_generic_cache) {
+ err = -ENOMEM;
+ goto out_unreg_notif;
+ }
+
return 0;
+out_unreg_notif:
+ unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
@ 2025-08-20 17:06 ` Stanislav Fomichev
2025-08-22 16:20 ` patchwork-bot+netdevbpf
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: Stanislav Fomichev @ 2025-08-20 17:06 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On 08/20, Maciej Fijalkowski wrote:
> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
>
> Introduce struct xsk_addrs which will carry descriptor count with array
> of addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb(). In order to mitigate the overhead that will be
> coming from memory allocations, let us introduce kmem_cache of
> xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> system.
>
> Commit from fixes tag introduced the buggy behavior, it was not broken
> from day 1, but rather when xsk multi-buffer got introduced.
>
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> v2:
> https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> v3:
> https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> v4:
> https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> v5:
> https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
>
> v1->v2:
> * store addrs in array carried via destructor_arg instead having them
> stored in skb headroom; cleaner and less hacky approach;
> v2->v3:
> * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> * set err when xsk_addrs allocation fails (Dan)
> * change xsk_addrs layout to avoid holes
> * free xsk_addrs on error path
> * rebase
> v3->v4:
> * have kmem_cache as percpu vars
> * don't drop unnecessary braces (unrelated) (Stan)
> * use idx + i in xskq_prod_write_addr (Stan)
> * alloc kmem_cache on bind (Stan)
> * keep num_descs as first member in xsk_addrs (Magnus)
> * add ack from Magnus
> v4->v5:
> * have a single kmem_cache per xsk subsystem (Stan)
> v5->v6:
> * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> (Stan)
> * unregister netdev notifier if creating kmem_cache fails (Stan)
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Thanks!
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-20 17:06 ` Stanislav Fomichev
@ 2025-08-22 16:20 ` patchwork-bot+netdevbpf
2025-08-26 8:14 ` Dan Carpenter
2025-08-26 16:00 ` Jason Xing
3 siblings, 0 replies; 14+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-08-22 16:20 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, stfomichev,
aleksander.lobakin, e.kubanski
Hello:
This patch was applied to bpf/bpf.git (master)
by Daniel Borkmann <daniel@iogearbox.net>:
On Wed, 20 Aug 2025 17:44:16 +0200 you wrote:
> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
>
> Introduce struct xsk_addrs which will carry descriptor count with array
> of addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb(). In order to mitigate the overhead that will be
> coming from memory allocations, let us introduce kmem_cache of
> xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> system.
>
> [...]
Here is the summary with links:
- [v6,bpf] xsk: fix immature cq descriptor production
https://git.kernel.org/bpf/bpf/c/dd9de524183a
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-20 17:06 ` Stanislav Fomichev
2025-08-22 16:20 ` patchwork-bot+netdevbpf
@ 2025-08-26 8:14 ` Dan Carpenter
2025-08-26 12:23 ` Daniel Borkmann
2025-08-26 16:00 ` Jason Xing
3 siblings, 1 reply; 14+ messages in thread
From: Dan Carpenter @ 2025-08-26 8:14 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, stfomichev,
aleksander.lobakin, Eryk Kubanski
On Wed, Aug 20, 2025 at 05:44:16PM +0200, Maciej Fijalkowski wrote:
> return ERR_PTR(err);
>
> skb_reserve(skb, hr);
> +
> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!addrs) {
> + kfree(skb);
This needs to be kfree_skb(skb);
regards,
dan carpenter
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> }
>
> addr = desc->addr;
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 8:14 ` Dan Carpenter
@ 2025-08-26 12:23 ` Daniel Borkmann
2025-08-26 13:40 ` Jason Xing
2025-08-29 16:14 ` Maciej Fijalkowski
0 siblings, 2 replies; 14+ messages in thread
From: Daniel Borkmann @ 2025-08-26 12:23 UTC (permalink / raw)
To: Dan Carpenter, Maciej Fijalkowski
Cc: bpf, ast, andrii, netdev, magnus.karlsson, stfomichev,
aleksander.lobakin, Eryk Kubanski
On 8/26/25 10:14 AM, Dan Carpenter wrote:
> On Wed, Aug 20, 2025 at 05:44:16PM +0200, Maciej Fijalkowski wrote:
>> return ERR_PTR(err);
>>
>> skb_reserve(skb, hr);
>> +
>> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
>> + if (!addrs) {
>> + kfree(skb);
>
> This needs to be kfree_skb(skb);
Oh well, good catch! Maciej, given this commit did not land yet in Linus' tree,
I can toss the commit from bpf tree assuming you send a v7?
Also, looking at xsk_build_skb(), do we similarly need to free that allocated
skb when we hit the ERR_PTR(-EOVERFLOW) ? Mentioned function has the following
in the free_err path:
if (first_frag && skb)
kfree_skb(skb);
Pls double check.
> regards,
> dan carpenter
>
>> + return ERR_PTR(-ENOMEM);
>> + }
>> +
>> + xsk_set_destructor_arg(skb, addrs);
>> }
>>
>> addr = desc->addr;
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 12:23 ` Daniel Borkmann
@ 2025-08-26 13:40 ` Jason Xing
2025-08-29 16:14 ` Maciej Fijalkowski
1 sibling, 0 replies; 14+ messages in thread
From: Jason Xing @ 2025-08-26 13:40 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Dan Carpenter, Maciej Fijalkowski, bpf, ast, andrii, netdev,
magnus.karlsson, stfomichev, aleksander.lobakin, Eryk Kubanski
On Tue, Aug 26, 2025 at 8:24 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 8/26/25 10:14 AM, Dan Carpenter wrote:
> > On Wed, Aug 20, 2025 at 05:44:16PM +0200, Maciej Fijalkowski wrote:
> >> return ERR_PTR(err);
> >>
> >> skb_reserve(skb, hr);
> >> +
> >> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> >> + if (!addrs) {
> >> + kfree(skb);
> >
> > This needs to be kfree_skb(skb);
>
> Oh well, good catch! Maciej, given this commit did not land yet in Linus' tree,
> I can toss the commit from bpf tree assuming you send a v7?
>
> Also, looking at xsk_build_skb(), do we similarly need to free that allocated
> skb when we hit the ERR_PTR(-EOVERFLOW) ? Mentioned function has the following
> in the free_err path:
>
> if (first_frag && skb)
> kfree_skb(skb);
>
> Pls double check.
Sorry to join the party late...
I have to bring bad news here. After I tested[1] on VM, I saw an
around 18.8% performance decrease (from 745,705 to 627,331) with this
patch applied.It can be reproduced stably :(
[1]: ./xdpsock -i eth1 -t -S -s 64
Thanks,
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 12:23 ` Daniel Borkmann
2025-08-26 13:40 ` Jason Xing
@ 2025-08-29 16:14 ` Maciej Fijalkowski
2025-08-30 10:32 ` Jason Xing
1 sibling, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2025-08-29 16:14 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Dan Carpenter, bpf, ast, andrii, netdev, magnus.karlsson,
stfomichev, aleksander.lobakin, Eryk Kubanski
On Tue, Aug 26, 2025 at 02:23:34PM +0200, Daniel Borkmann wrote:
> On 8/26/25 10:14 AM, Dan Carpenter wrote:
> > On Wed, Aug 20, 2025 at 05:44:16PM +0200, Maciej Fijalkowski wrote:
> > > return ERR_PTR(err);
> > > skb_reserve(skb, hr);
> > > +
> > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > + if (!addrs) {
> > > + kfree(skb);
> >
> > This needs to be kfree_skb(skb);
>
> Oh well, good catch! Maciej, given this commit did not land yet in Linus' tree,
> I can toss the commit from bpf tree assuming you send a v7?
>
> Also, looking at xsk_build_skb(), do we similarly need to free that allocated
> skb when we hit the ERR_PTR(-EOVERFLOW) ? Mentioned function has the following
> in the free_err path:
>
> if (first_frag && skb)
> kfree_skb(skb);
>
> Pls double check.
for EOVERFLOW we drop skb and then we continue with consuming next
descriptors from XSK Tx queue. Every other errno causes this loop
processing to stop and give the control back to application side. skb
pointer is kept within xdp_sock and on next syscall we will retry with
sending it.
if (err == -EOVERFLOW) {
xsk_drop_skb(xs->skb);
-> xsk_consume_skb(skb);
-> consume_skb(skb);
since it's a drop, i wonder if we should have a kfree_skb() with proper
drop reason for XSK subsystem, but that's for a different discussion.
I will now send a v7 which is supposed to address the reported perf impact
by Jason...keep your fingers crossed for me not messing anything again
around this code base:D
>
> > regards,
> > dan carpenter
> >
> > > + return ERR_PTR(-ENOMEM);
> > > + }
> > > +
> > > + xsk_set_destructor_arg(skb, addrs);
> > > }
> > > addr = desc->addr;
> >
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-29 16:14 ` Maciej Fijalkowski
@ 2025-08-30 10:32 ` Jason Xing
0 siblings, 0 replies; 14+ messages in thread
From: Jason Xing @ 2025-08-30 10:32 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: Daniel Borkmann, Dan Carpenter, bpf, ast, andrii, netdev,
magnus.karlsson, stfomichev, aleksander.lobakin, Eryk Kubanski
On Sat, Aug 30, 2025 at 12:15 AM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Tue, Aug 26, 2025 at 02:23:34PM +0200, Daniel Borkmann wrote:
> > On 8/26/25 10:14 AM, Dan Carpenter wrote:
> > > On Wed, Aug 20, 2025 at 05:44:16PM +0200, Maciej Fijalkowski wrote:
> > > > return ERR_PTR(err);
> > > > skb_reserve(skb, hr);
> > > > +
> > > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > > + if (!addrs) {
> > > > + kfree(skb);
> > >
> > > This needs to be kfree_skb(skb);
> >
> > Oh well, good catch! Maciej, given this commit did not land yet in Linus' tree,
> > I can toss the commit from bpf tree assuming you send a v7?
> >
> > Also, looking at xsk_build_skb(), do we similarly need to free that allocated
> > skb when we hit the ERR_PTR(-EOVERFLOW) ? Mentioned function has the following
> > in the free_err path:
> >
> > if (first_frag && skb)
> > kfree_skb(skb);
> >
> > Pls double check.
>
> for EOVERFLOW we drop skb and then we continue with consuming next
> descriptors from XSK Tx queue. Every other errno causes this loop
> processing to stop and give the control back to application side. skb
> pointer is kept within xdp_sock and on next syscall we will retry with
> sending it.
>
> if (err == -EOVERFLOW) {
> xsk_drop_skb(xs->skb);
> -> xsk_consume_skb(skb);
> -> consume_skb(skb);
>
> since it's a drop, i wonder if we should have a kfree_skb() with proper
> drop reason for XSK subsystem, but that's for a different discussion.
I agree on this point. A few days ago, I scanned the code over and
over again and stumbled on this. Sure, we can add reasons into it.
Thanks,
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
` (2 preceding siblings ...)
2025-08-26 8:14 ` Dan Carpenter
@ 2025-08-26 16:00 ` Jason Xing
2025-08-26 18:23 ` Magnus Karlsson
3 siblings, 1 reply; 14+ messages in thread
From: Jason Xing @ 2025-08-26 16:00 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, stfomichev,
aleksander.lobakin, Eryk Kubanski
On Wed, Aug 20, 2025 at 11:49 PM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
>
> Introduce struct xsk_addrs which will carry descriptor count with array
> of addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb(). In order to mitigate the overhead that will be
> coming from memory allocations, let us introduce kmem_cache of
> xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> system.
>
> Commit from fixes tag introduced the buggy behavior, it was not broken
> from day 1, but rather when xsk multi-buffer got introduced.
>
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> v2:
> https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> v3:
> https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> v4:
> https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> v5:
> https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
>
> v1->v2:
> * store addrs in array carried via destructor_arg instead having them
> stored in skb headroom; cleaner and less hacky approach;
> v2->v3:
> * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> * set err when xsk_addrs allocation fails (Dan)
> * change xsk_addrs layout to avoid holes
> * free xsk_addrs on error path
> * rebase
> v3->v4:
> * have kmem_cache as percpu vars
> * don't drop unnecessary braces (unrelated) (Stan)
> * use idx + i in xskq_prod_write_addr (Stan)
> * alloc kmem_cache on bind (Stan)
> * keep num_descs as first member in xsk_addrs (Magnus)
> * add ack from Magnus
> v4->v5:
> * have a single kmem_cache per xsk subsystem (Stan)
> v5->v6:
> * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> (Stan)
> * unregister netdev notifier if creating kmem_cache fails (Stan)
>
> ---
> net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
> net/xdp/xsk_queue.h | 12 ++++++
> 2 files changed, 91 insertions(+), 16 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 9c3acecc14b1..989d5ffb4273 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,6 +36,13 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> +struct xsk_addrs {
> + u32 num_descs;
> + u64 addrs[MAX_SKB_FRAGS + 1];
> +};
> +
> +static struct kmem_cache *xsk_tx_generic_cache;
IMHO, adding a few heavy operations of allocating and freeing from
cache in the hot path is not a good choice. What I've been trying so
hard lately is to minimize the times of manipulating memory as much as
possible :( Memory hotspot can be easily captured by perf.
We might provide an new option in setsockopt() to let users
specifically support this use case since it does harm to normal cases?
> +
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> }
>
> -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> {
> unsigned long flags;
> int ret;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - ret = xskq_prod_reserve_addr(pool->cq, addr);
> + ret = xskq_prod_reserve(pool->cq);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
>
> return ret;
> }
>
> -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> + struct sk_buff *skb)
> {
> + struct xsk_buff_pool *pool = xs->pool;
> + struct xsk_addrs *xsk_addrs;
> unsigned long flags;
> + u32 num_desc, i;
> + u32 idx;
> +
> + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + num_desc = xsk_addrs->num_descs;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - xskq_prod_submit_n(pool->cq, n);
> + idx = xskq_get_prod(pool->cq);
> +
> + for (i = 0; i < num_desc; i++)
> + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> + xskq_prod_submit_n(pool->cq, num_desc);
> +
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> }
>
> static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> *compl->tx_timestamp = ktime_get_tai_fast_ns();
> }
>
> - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> sock_wfree(skb);
> }
>
> -static void xsk_set_destructor_arg(struct sk_buff *skb)
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> {
> - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> + struct xsk_addrs *addrs;
>
> - skb_shinfo(skb)->destructor_arg = (void *)num;
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + return addrs->num_descs;
> +}
> +
> +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> +{
> + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> +}
> +
> +static void xsk_inc_skb_descs(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->num_descs++;
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
>
> + kmem_cache_free(xsk_tx_generic_cache,
> + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
Replying to Daniel here: when EOVERFLOW occurs, it will finally go to
above function and clear the allocated memory and skb.
> skb->destructor = sock_wfree;
> xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> /* Free skb without triggering the perf drop trace */
> @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> + struct xsk_addrs *addrs = NULL;
nit: no need to set to "NULL" at the begining.
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
>
> skb_reserve(skb, hr);
> +
> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!addrs) {
> + kfree(skb);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> }
>
> addr = desc->addr;
> @@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> {
> struct xsk_tx_metadata *meta = NULL;
> struct net_device *dev = xs->dev;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> bool first_frag = false;
> int err;
> @@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> err = skb_store_bits(skb, 0, buffer, len);
> if (unlikely(err))
> goto free_err;
> +
> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!addrs) {
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> +
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> struct page *page;
> @@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> skb->destructor = xsk_destruct_skb;
> xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> - xsk_set_destructor_arg(skb);
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->addrs[addrs->num_descs++] = desc->addr;
>
> return skb;
>
> @@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_set_destructor_arg(xs->skb);
> + xsk_inc_skb_descs(xs->skb);
> xsk_drop_skb(xs->skb);
> xskq_cons_release(xs->tx);
> } else {
> @@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> + err = xsk_cq_reserve_locked(xs->pool);
> if (err) {
> err = -EAGAIN;
> goto out;
> @@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
> if (err)
> goto out_pernet;
>
> + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> + sizeof(struct xsk_addrs), 0,
> + SLAB_HWCACHE_ALIGN, NULL);
> + if (!xsk_tx_generic_cache) {
> + err = -ENOMEM;
> + goto out_unreg_notif;
> + }
> +
> return 0;
>
> +out_unreg_notif:
> + unregister_netdevice_notifier(&xsk_netdev_notifier);
> out_pernet:
> unregister_pernet_subsys(&xsk_net_ops);
> out_sk:
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 46d87e961ad6..f16f390370dc 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
>
> /* Functions for producers */
>
> +static inline u32 xskq_get_prod(struct xsk_queue *q)
> +{
> + return READ_ONCE(q->ring->producer);
> +}
> +
> static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> {
> u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
> @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
> return 0;
> }
>
> +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
> +{
> + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> +
> + ring->desc[idx & q->ring_mask] = addr;
> +}
> +
> static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
> u32 nb_entries)
> {
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 16:00 ` Jason Xing
@ 2025-08-26 18:23 ` Magnus Karlsson
2025-08-26 19:03 ` Maciej Fijalkowski
0 siblings, 1 reply; 14+ messages in thread
From: Magnus Karlsson @ 2025-08-26 18:23 UTC (permalink / raw)
To: Jason Xing
Cc: Maciej Fijalkowski, bpf, ast, daniel, andrii, netdev,
magnus.karlsson, stfomichev, aleksander.lobakin, Eryk Kubanski
On Tue, 26 Aug 2025 at 18:07, Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Wed, Aug 20, 2025 at 11:49 PM Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> >
> > Eryk reported an issue that I have put under Closes: tag, related to
> > umem addrs being prematurely produced onto pool's completion queue.
> > Let us make the skb's destructor responsible for producing all addrs
> > that given skb used.
> >
> > Introduce struct xsk_addrs which will carry descriptor count with array
> > of addresses taken from processed descriptors that will be carried via
> > skb_shared_info::destructor_arg. This way we can refer to it within
> > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > coming from memory allocations, let us introduce kmem_cache of
> > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > system.
> >
> > Commit from fixes tag introduced the buggy behavior, it was not broken
> > from day 1, but rather when xsk multi-buffer got introduced.
> >
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> >
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > v2:
> > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > v3:
> > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> > v4:
> > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> > v5:
> > https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
> >
> > v1->v2:
> > * store addrs in array carried via destructor_arg instead having them
> > stored in skb headroom; cleaner and less hacky approach;
> > v2->v3:
> > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > * set err when xsk_addrs allocation fails (Dan)
> > * change xsk_addrs layout to avoid holes
> > * free xsk_addrs on error path
> > * rebase
> > v3->v4:
> > * have kmem_cache as percpu vars
> > * don't drop unnecessary braces (unrelated) (Stan)
> > * use idx + i in xskq_prod_write_addr (Stan)
> > * alloc kmem_cache on bind (Stan)
> > * keep num_descs as first member in xsk_addrs (Magnus)
> > * add ack from Magnus
> > v4->v5:
> > * have a single kmem_cache per xsk subsystem (Stan)
> > v5->v6:
> > * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> > (Stan)
> > * unregister netdev notifier if creating kmem_cache fails (Stan)
> >
> > ---
> > net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
> > net/xdp/xsk_queue.h | 12 ++++++
> > 2 files changed, 91 insertions(+), 16 deletions(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 9c3acecc14b1..989d5ffb4273 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -36,6 +36,13 @@
> > #define TX_BATCH_SIZE 32
> > #define MAX_PER_SOCKET_BUDGET 32
> >
> > +struct xsk_addrs {
> > + u32 num_descs;
> > + u64 addrs[MAX_SKB_FRAGS + 1];
> > +};
> > +
> > +static struct kmem_cache *xsk_tx_generic_cache;
>
> IMHO, adding a few heavy operations of allocating and freeing from
> cache in the hot path is not a good choice. What I've been trying so
> hard lately is to minimize the times of manipulating memory as much as
> possible :( Memory hotspot can be easily captured by perf.
>
> We might provide an new option in setsockopt() to let users
> specifically support this use case since it does harm to normal cases?
Agree with you that we should not harm the normal case here. Instead
of introducing a setsockopt, how about we detect the case when this
can happen in the code? If I remember correctly, it can only occur in
the XDP_SHARED_UMEM mode were the xsk pool is shared between
processes. If this can be tested (by introducing a new bit in the xsk
pool if that is necessary), we could have two potential skb
destructors: the old one for the "normal" case and the new one with
the list of addresses to complete (using the expensive allocations and
deallocations) when it is strictly required i.e., when the xsk pool is
shared. Maciej, you are more in to the details of this, so what do you
think? Would something like this be a potential path forward?
>
> > +
> > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > {
> > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > }
> >
> > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > {
> > unsigned long flags;
> > int ret;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > + ret = xskq_prod_reserve(pool->cq);
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> >
> > return ret;
> > }
> >
> > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > + struct sk_buff *skb)
> > {
> > + struct xsk_buff_pool *pool = xs->pool;
> > + struct xsk_addrs *xsk_addrs;
> > unsigned long flags;
> > + u32 num_desc, i;
> > + u32 idx;
> > +
> > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + num_desc = xsk_addrs->num_descs;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - xskq_prod_submit_n(pool->cq, n);
> > + idx = xskq_get_prod(pool->cq);
> > +
> > + for (i = 0; i < num_desc; i++)
> > + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > + xskq_prod_submit_n(pool->cq, num_desc);
> > +
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> > }
> >
> > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > }
> >
> > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > -{
> > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > -}
> > -
> > static void xsk_destruct_skb(struct sk_buff *skb)
> > {
> > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > }
> >
> > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > sock_wfree(skb);
> > }
> >
> > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > {
> > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > + struct xsk_addrs *addrs;
> >
> > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + return addrs->num_descs;
> > +}
> > +
> > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > +{
> > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > +}
> > +
> > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > +{
> > + struct xsk_addrs *addrs;
> > +
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + addrs->num_descs++;
> > }
> >
> > static void xsk_consume_skb(struct sk_buff *skb)
> > {
> > struct xdp_sock *xs = xdp_sk(skb->sk);
> >
> > + kmem_cache_free(xsk_tx_generic_cache,
> > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
>
> Replying to Daniel here: when EOVERFLOW occurs, it will finally go to
> above function and clear the allocated memory and skb.
>
> > skb->destructor = sock_wfree;
> > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > /* Free skb without triggering the perf drop trace */
> > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > {
> > struct xsk_buff_pool *pool = xs->pool;
> > u32 hr, len, ts, offset, copy, copied;
> > + struct xsk_addrs *addrs = NULL;
>
> nit: no need to set to "NULL" at the begining.
>
> > struct sk_buff *skb = xs->skb;
> > struct page *page;
> > void *buffer;
> > @@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > return ERR_PTR(err);
> >
> > skb_reserve(skb, hr);
> > +
> > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > + if (!addrs) {
> > + kfree(skb);
> > + return ERR_PTR(-ENOMEM);
> > + }
> > +
> > + xsk_set_destructor_arg(skb, addrs);
> > }
> >
> > addr = desc->addr;
> > @@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > {
> > struct xsk_tx_metadata *meta = NULL;
> > struct net_device *dev = xs->dev;
> > + struct xsk_addrs *addrs = NULL;
> > struct sk_buff *skb = xs->skb;
> > bool first_frag = false;
> > int err;
> > @@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > err = skb_store_bits(skb, 0, buffer, len);
> > if (unlikely(err))
> > goto free_err;
> > +
> > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > + if (!addrs) {
> > + err = -ENOMEM;
> > + goto free_err;
> > + }
> > +
> > + xsk_set_destructor_arg(skb, addrs);
> > +
> > } else {
> > int nr_frags = skb_shinfo(skb)->nr_frags;
> > struct page *page;
> > @@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > skb->destructor = xsk_destruct_skb;
> > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > - xsk_set_destructor_arg(skb);
> > +
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + addrs->addrs[addrs->num_descs++] = desc->addr;
> >
> > return skb;
> >
> > @@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >
> > if (err == -EOVERFLOW) {
> > /* Drop the packet */
> > - xsk_set_destructor_arg(xs->skb);
> > + xsk_inc_skb_descs(xs->skb);
> > xsk_drop_skb(xs->skb);
> > xskq_cons_release(xs->tx);
> > } else {
> > @@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > * if there is space in it. This avoids having to implement
> > * any buffering in the Tx path.
> > */
> > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > + err = xsk_cq_reserve_locked(xs->pool);
> > if (err) {
> > err = -EAGAIN;
> > goto out;
> > @@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
> > if (err)
> > goto out_pernet;
> >
> > + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > + sizeof(struct xsk_addrs), 0,
> > + SLAB_HWCACHE_ALIGN, NULL);
> > + if (!xsk_tx_generic_cache) {
> > + err = -ENOMEM;
> > + goto out_unreg_notif;
> > + }
> > +
> > return 0;
> >
> > +out_unreg_notif:
> > + unregister_netdevice_notifier(&xsk_netdev_notifier);
> > out_pernet:
> > unregister_pernet_subsys(&xsk_net_ops);
> > out_sk:
> > diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> > index 46d87e961ad6..f16f390370dc 100644
> > --- a/net/xdp/xsk_queue.h
> > +++ b/net/xdp/xsk_queue.h
> > @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
> >
> > /* Functions for producers */
> >
> > +static inline u32 xskq_get_prod(struct xsk_queue *q)
> > +{
> > + return READ_ONCE(q->ring->producer);
> > +}
> > +
> > static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> > {
> > u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
> > @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
> > return 0;
> > }
> >
> > +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
> > +{
> > + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> > +
> > + ring->desc[idx & q->ring_mask] = addr;
> > +}
> > +
> > static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
> > u32 nb_entries)
> > {
> > --
> > 2.34.1
> >
> >
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 18:23 ` Magnus Karlsson
@ 2025-08-26 19:03 ` Maciej Fijalkowski
2025-08-26 19:13 ` Maciej Fijalkowski
0 siblings, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2025-08-26 19:03 UTC (permalink / raw)
To: Magnus Karlsson
Cc: Jason Xing, bpf, ast, daniel, andrii, netdev, magnus.karlsson,
stfomichev, aleksander.lobakin, Eryk Kubanski
On Tue, Aug 26, 2025 at 08:23:04PM +0200, Magnus Karlsson wrote:
> On Tue, 26 Aug 2025 at 18:07, Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > On Wed, Aug 20, 2025 at 11:49 PM Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > >
> > > Eryk reported an issue that I have put under Closes: tag, related to
> > > umem addrs being prematurely produced onto pool's completion queue.
> > > Let us make the skb's destructor responsible for producing all addrs
> > > that given skb used.
> > >
> > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > of addresses taken from processed descriptors that will be carried via
> > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > coming from memory allocations, let us introduce kmem_cache of
> > > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > > system.
> > >
> > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > from day 1, but rather when xsk multi-buffer got introduced.
> > >
> > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > ---
> > >
> > > v1:
> > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > v2:
> > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > > v3:
> > > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> > > v4:
> > > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> > > v5:
> > > https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
> > >
> > > v1->v2:
> > > * store addrs in array carried via destructor_arg instead having them
> > > stored in skb headroom; cleaner and less hacky approach;
> > > v2->v3:
> > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > * set err when xsk_addrs allocation fails (Dan)
> > > * change xsk_addrs layout to avoid holes
> > > * free xsk_addrs on error path
> > > * rebase
> > > v3->v4:
> > > * have kmem_cache as percpu vars
> > > * don't drop unnecessary braces (unrelated) (Stan)
> > > * use idx + i in xskq_prod_write_addr (Stan)
> > > * alloc kmem_cache on bind (Stan)
> > > * keep num_descs as first member in xsk_addrs (Magnus)
> > > * add ack from Magnus
> > > v4->v5:
> > > * have a single kmem_cache per xsk subsystem (Stan)
> > > v5->v6:
> > > * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> > > (Stan)
> > > * unregister netdev notifier if creating kmem_cache fails (Stan)
> > >
> > > ---
> > > net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
> > > net/xdp/xsk_queue.h | 12 ++++++
> > > 2 files changed, 91 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > index 9c3acecc14b1..989d5ffb4273 100644
> > > --- a/net/xdp/xsk.c
> > > +++ b/net/xdp/xsk.c
> > > @@ -36,6 +36,13 @@
> > > #define TX_BATCH_SIZE 32
> > > #define MAX_PER_SOCKET_BUDGET 32
> > >
> > > +struct xsk_addrs {
> > > + u32 num_descs;
> > > + u64 addrs[MAX_SKB_FRAGS + 1];
> > > +};
> > > +
> > > +static struct kmem_cache *xsk_tx_generic_cache;
> >
> > IMHO, adding a few heavy operations of allocating and freeing from
> > cache in the hot path is not a good choice. What I've been trying so
> > hard lately is to minimize the times of manipulating memory as much as
> > possible :( Memory hotspot can be easily captured by perf.
> >
> > We might provide an new option in setsockopt() to let users
> > specifically support this use case since it does harm to normal cases?
>
> Agree with you that we should not harm the normal case here. Instead
> of introducing a setsockopt, how about we detect the case when this
> can happen in the code? If I remember correctly, it can only occur in
> the XDP_SHARED_UMEM mode were the xsk pool is shared between
> processes. If this can be tested (by introducing a new bit in the xsk
> pool if that is necessary), we could have two potential skb
> destructors: the old one for the "normal" case and the new one with
> the list of addresses to complete (using the expensive allocations and
> deallocations) when it is strictly required i.e., when the xsk pool is
> shared. Maciej, you are more in to the details of this, so what do you
> think? Would something like this be a potential path forward?
Meh, i was focused on 9k mtu impact, it was about 5% on my machine but now
i checked small packets and indeed i see 12-14% perf regression.
I'll look into this so Daniel, for now let's drop this unfortunate
patch...
>
> >
> > > +
> > > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > > {
> > > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > > }
> > >
> > > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > > {
> > > unsigned long flags;
> > > int ret;
> > >
> > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > > + ret = xskq_prod_reserve(pool->cq);
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > >
> > > return ret;
> > > }
> > >
> > > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > > + struct sk_buff *skb)
> > > {
> > > + struct xsk_buff_pool *pool = xs->pool;
> > > + struct xsk_addrs *xsk_addrs;
> > > unsigned long flags;
> > > + u32 num_desc, i;
> > > + u32 idx;
> > > +
> > > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + num_desc = xsk_addrs->num_descs;
> > >
> > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > - xskq_prod_submit_n(pool->cq, n);
> > > + idx = xskq_get_prod(pool->cq);
> > > +
> > > + for (i = 0; i < num_desc; i++)
> > > + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > > + xskq_prod_submit_n(pool->cq, num_desc);
> > > +
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> > > }
> > >
> > > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > }
> > >
> > > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > -{
> > > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > > -}
> > > -
> > > static void xsk_destruct_skb(struct sk_buff *skb)
> > > {
> > > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > > }
> > >
> > > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > > sock_wfree(skb);
> > > }
> > >
> > > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > {
> > > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > > + struct xsk_addrs *addrs;
> > >
> > > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + return addrs->num_descs;
> > > +}
> > > +
> > > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > > +{
> > > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > > +}
> > > +
> > > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > > +{
> > > + struct xsk_addrs *addrs;
> > > +
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + addrs->num_descs++;
> > > }
> > >
> > > static void xsk_consume_skb(struct sk_buff *skb)
> > > {
> > > struct xdp_sock *xs = xdp_sk(skb->sk);
> > >
> > > + kmem_cache_free(xsk_tx_generic_cache,
> > > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> >
> > Replying to Daniel here: when EOVERFLOW occurs, it will finally go to
> > above function and clear the allocated memory and skb.
> >
> > > skb->destructor = sock_wfree;
> > > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > > /* Free skb without triggering the perf drop trace */
> > > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > {
> > > struct xsk_buff_pool *pool = xs->pool;
> > > u32 hr, len, ts, offset, copy, copied;
> > > + struct xsk_addrs *addrs = NULL;
> >
> > nit: no need to set to "NULL" at the begining.
> >
> > > struct sk_buff *skb = xs->skb;
> > > struct page *page;
> > > void *buffer;
> > > @@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > return ERR_PTR(err);
> > >
> > > skb_reserve(skb, hr);
> > > +
> > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > + if (!addrs) {
> > > + kfree(skb);
> > > + return ERR_PTR(-ENOMEM);
> > > + }
> > > +
> > > + xsk_set_destructor_arg(skb, addrs);
> > > }
> > >
> > > addr = desc->addr;
> > > @@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > {
> > > struct xsk_tx_metadata *meta = NULL;
> > > struct net_device *dev = xs->dev;
> > > + struct xsk_addrs *addrs = NULL;
> > > struct sk_buff *skb = xs->skb;
> > > bool first_frag = false;
> > > int err;
> > > @@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > err = skb_store_bits(skb, 0, buffer, len);
> > > if (unlikely(err))
> > > goto free_err;
> > > +
> > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > + if (!addrs) {
> > > + err = -ENOMEM;
> > > + goto free_err;
> > > + }
> > > +
> > > + xsk_set_destructor_arg(skb, addrs);
> > > +
> > > } else {
> > > int nr_frags = skb_shinfo(skb)->nr_frags;
> > > struct page *page;
> > > @@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > > skb->destructor = xsk_destruct_skb;
> > > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > > - xsk_set_destructor_arg(skb);
> > > +
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + addrs->addrs[addrs->num_descs++] = desc->addr;
> > >
> > > return skb;
> > >
> > > @@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > >
> > > if (err == -EOVERFLOW) {
> > > /* Drop the packet */
> > > - xsk_set_destructor_arg(xs->skb);
> > > + xsk_inc_skb_descs(xs->skb);
> > > xsk_drop_skb(xs->skb);
> > > xskq_cons_release(xs->tx);
> > > } else {
> > > @@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > > * if there is space in it. This avoids having to implement
> > > * any buffering in the Tx path.
> > > */
> > > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > > + err = xsk_cq_reserve_locked(xs->pool);
> > > if (err) {
> > > err = -EAGAIN;
> > > goto out;
> > > @@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
> > > if (err)
> > > goto out_pernet;
> > >
> > > + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > > + sizeof(struct xsk_addrs), 0,
> > > + SLAB_HWCACHE_ALIGN, NULL);
> > > + if (!xsk_tx_generic_cache) {
> > > + err = -ENOMEM;
> > > + goto out_unreg_notif;
> > > + }
> > > +
> > > return 0;
> > >
> > > +out_unreg_notif:
> > > + unregister_netdevice_notifier(&xsk_netdev_notifier);
> > > out_pernet:
> > > unregister_pernet_subsys(&xsk_net_ops);
> > > out_sk:
> > > diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> > > index 46d87e961ad6..f16f390370dc 100644
> > > --- a/net/xdp/xsk_queue.h
> > > +++ b/net/xdp/xsk_queue.h
> > > @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
> > >
> > > /* Functions for producers */
> > >
> > > +static inline u32 xskq_get_prod(struct xsk_queue *q)
> > > +{
> > > + return READ_ONCE(q->ring->producer);
> > > +}
> > > +
> > > static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> > > {
> > > u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
> > > @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
> > > return 0;
> > > }
> > >
> > > +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
> > > +{
> > > + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> > > +
> > > + ring->desc[idx & q->ring_mask] = addr;
> > > +}
> > > +
> > > static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
> > > u32 nb_entries)
> > > {
> > > --
> > > 2.34.1
> > >
> > >
> >
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 19:03 ` Maciej Fijalkowski
@ 2025-08-26 19:13 ` Maciej Fijalkowski
2025-08-27 0:25 ` Jason Xing
0 siblings, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2025-08-26 19:13 UTC (permalink / raw)
To: Magnus Karlsson
Cc: Jason Xing, bpf, ast, daniel, andrii, netdev, magnus.karlsson,
stfomichev, aleksander.lobakin, Eryk Kubanski
On Tue, Aug 26, 2025 at 09:03:45PM +0200, Maciej Fijalkowski wrote:
> On Tue, Aug 26, 2025 at 08:23:04PM +0200, Magnus Karlsson wrote:
> > On Tue, 26 Aug 2025 at 18:07, Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >
> > > On Wed, Aug 20, 2025 at 11:49 PM Maciej Fijalkowski
> > > <maciej.fijalkowski@intel.com> wrote:
> > > >
> > > > Eryk reported an issue that I have put under Closes: tag, related to
> > > > umem addrs being prematurely produced onto pool's completion queue.
> > > > Let us make the skb's destructor responsible for producing all addrs
> > > > that given skb used.
> > > >
> > > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > > of addresses taken from processed descriptors that will be carried via
> > > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > > coming from memory allocations, let us introduce kmem_cache of
> > > > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > > > system.
> > > >
> > > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > > from day 1, but rather when xsk multi-buffer got introduced.
> > > >
> > > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > > ---
> > > >
> > > > v1:
> > > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > > v2:
> > > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > > > v3:
> > > > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> > > > v4:
> > > > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> > > > v5:
> > > > https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
> > > >
> > > > v1->v2:
> > > > * store addrs in array carried via destructor_arg instead having them
> > > > stored in skb headroom; cleaner and less hacky approach;
> > > > v2->v3:
> > > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > > * set err when xsk_addrs allocation fails (Dan)
> > > > * change xsk_addrs layout to avoid holes
> > > > * free xsk_addrs on error path
> > > > * rebase
> > > > v3->v4:
> > > > * have kmem_cache as percpu vars
> > > > * don't drop unnecessary braces (unrelated) (Stan)
> > > > * use idx + i in xskq_prod_write_addr (Stan)
> > > > * alloc kmem_cache on bind (Stan)
> > > > * keep num_descs as first member in xsk_addrs (Magnus)
> > > > * add ack from Magnus
> > > > v4->v5:
> > > > * have a single kmem_cache per xsk subsystem (Stan)
> > > > v5->v6:
> > > > * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> > > > (Stan)
> > > > * unregister netdev notifier if creating kmem_cache fails (Stan)
> > > >
> > > > ---
> > > > net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
> > > > net/xdp/xsk_queue.h | 12 ++++++
> > > > 2 files changed, 91 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > index 9c3acecc14b1..989d5ffb4273 100644
> > > > --- a/net/xdp/xsk.c
> > > > +++ b/net/xdp/xsk.c
> > > > @@ -36,6 +36,13 @@
> > > > #define TX_BATCH_SIZE 32
> > > > #define MAX_PER_SOCKET_BUDGET 32
> > > >
> > > > +struct xsk_addrs {
> > > > + u32 num_descs;
> > > > + u64 addrs[MAX_SKB_FRAGS + 1];
> > > > +};
> > > > +
> > > > +static struct kmem_cache *xsk_tx_generic_cache;
> > >
> > > IMHO, adding a few heavy operations of allocating and freeing from
> > > cache in the hot path is not a good choice. What I've been trying so
> > > hard lately is to minimize the times of manipulating memory as much as
> > > possible :( Memory hotspot can be easily captured by perf.
> > >
> > > We might provide an new option in setsockopt() to let users
> > > specifically support this use case since it does harm to normal cases?
> >
> > Agree with you that we should not harm the normal case here. Instead
> > of introducing a setsockopt, how about we detect the case when this
> > can happen in the code? If I remember correctly, it can only occur in
> > the XDP_SHARED_UMEM mode were the xsk pool is shared between
> > processes. If this can be tested (by introducing a new bit in the xsk
> > pool if that is necessary), we could have two potential skb
> > destructors: the old one for the "normal" case and the new one with
> > the list of addresses to complete (using the expensive allocations and
> > deallocations) when it is strictly required i.e., when the xsk pool is
> > shared. Maciej, you are more in to the details of this, so what do you
> > think? Would something like this be a potential path forward?
>
> Meh, i was focused on 9k mtu impact, it was about 5% on my machine but now
> i checked small packets and indeed i see 12-14% perf regression.
>
> I'll look into this so Daniel, for now let's drop this unfortunate
> patch...
One more thing - Jason, you still need to focus your work on this approach
where we produce cq entries from destructor. I just need to come up with
smarter way of producing descs to be consumed by destructor :<
>
> >
> > >
> > > > +
> > > > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > > > {
> > > > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > > > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > > > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > > > }
> > > >
> > > > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > > > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > > > {
> > > > unsigned long flags;
> > > > int ret;
> > > >
> > > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > > > + ret = xskq_prod_reserve(pool->cq);
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > >
> > > > return ret;
> > > > }
> > > >
> > > > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > > > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > > > + struct sk_buff *skb)
> > > > {
> > > > + struct xsk_buff_pool *pool = xs->pool;
> > > > + struct xsk_addrs *xsk_addrs;
> > > > unsigned long flags;
> > > > + u32 num_desc, i;
> > > > + u32 idx;
> > > > +
> > > > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + num_desc = xsk_addrs->num_descs;
> > > >
> > > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > > - xskq_prod_submit_n(pool->cq, n);
> > > > + idx = xskq_get_prod(pool->cq);
> > > > +
> > > > + for (i = 0; i < num_desc; i++)
> > > > + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > > > + xskq_prod_submit_n(pool->cq, num_desc);
> > > > +
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > > + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> > > > }
> > > >
> > > > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > > }
> > > >
> > > > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > > -{
> > > > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > > > -}
> > > > -
> > > > static void xsk_destruct_skb(struct sk_buff *skb)
> > > > {
> > > > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > > > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > > > }
> > > >
> > > > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > > > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > > > sock_wfree(skb);
> > > > }
> > > >
> > > > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > > > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > > {
> > > > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > > > + struct xsk_addrs *addrs;
> > > >
> > > > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + return addrs->num_descs;
> > > > +}
> > > > +
> > > > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > > > +{
> > > > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > > > +}
> > > > +
> > > > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > > > +{
> > > > + struct xsk_addrs *addrs;
> > > > +
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + addrs->num_descs++;
> > > > }
> > > >
> > > > static void xsk_consume_skb(struct sk_buff *skb)
> > > > {
> > > > struct xdp_sock *xs = xdp_sk(skb->sk);
> > > >
> > > > + kmem_cache_free(xsk_tx_generic_cache,
> > > > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> > >
> > > Replying to Daniel here: when EOVERFLOW occurs, it will finally go to
> > > above function and clear the allocated memory and skb.
> > >
> > > > skb->destructor = sock_wfree;
> > > > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > > > /* Free skb without triggering the perf drop trace */
> > > > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > {
> > > > struct xsk_buff_pool *pool = xs->pool;
> > > > u32 hr, len, ts, offset, copy, copied;
> > > > + struct xsk_addrs *addrs = NULL;
> > >
> > > nit: no need to set to "NULL" at the begining.
> > >
> > > > struct sk_buff *skb = xs->skb;
> > > > struct page *page;
> > > > void *buffer;
> > > > @@ -623,6 +656,14 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > return ERR_PTR(err);
> > > >
> > > > skb_reserve(skb, hr);
> > > > +
> > > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > > + if (!addrs) {
> > > > + kfree(skb);
> > > > + return ERR_PTR(-ENOMEM);
> > > > + }
> > > > +
> > > > + xsk_set_destructor_arg(skb, addrs);
> > > > }
> > > >
> > > > addr = desc->addr;
> > > > @@ -662,6 +703,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > {
> > > > struct xsk_tx_metadata *meta = NULL;
> > > > struct net_device *dev = xs->dev;
> > > > + struct xsk_addrs *addrs = NULL;
> > > > struct sk_buff *skb = xs->skb;
> > > > bool first_frag = false;
> > > > int err;
> > > > @@ -694,6 +736,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > err = skb_store_bits(skb, 0, buffer, len);
> > > > if (unlikely(err))
> > > > goto free_err;
> > > > +
> > > > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > > + if (!addrs) {
> > > > + err = -ENOMEM;
> > > > + goto free_err;
> > > > + }
> > > > +
> > > > + xsk_set_destructor_arg(skb, addrs);
> > > > +
> > > > } else {
> > > > int nr_frags = skb_shinfo(skb)->nr_frags;
> > > > struct page *page;
> > > > @@ -759,7 +810,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > > > skb->destructor = xsk_destruct_skb;
> > > > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > > > - xsk_set_destructor_arg(skb);
> > > > +
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + addrs->addrs[addrs->num_descs++] = desc->addr;
> > > >
> > > > return skb;
> > > >
> > > > @@ -769,7 +822,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > >
> > > > if (err == -EOVERFLOW) {
> > > > /* Drop the packet */
> > > > - xsk_set_destructor_arg(xs->skb);
> > > > + xsk_inc_skb_descs(xs->skb);
> > > > xsk_drop_skb(xs->skb);
> > > > xskq_cons_release(xs->tx);
> > > > } else {
> > > > @@ -812,7 +865,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > > > * if there is space in it. This avoids having to implement
> > > > * any buffering in the Tx path.
> > > > */
> > > > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > > > + err = xsk_cq_reserve_locked(xs->pool);
> > > > if (err) {
> > > > err = -EAGAIN;
> > > > goto out;
> > > > @@ -1815,8 +1868,18 @@ static int __init xsk_init(void)
> > > > if (err)
> > > > goto out_pernet;
> > > >
> > > > + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > > > + sizeof(struct xsk_addrs), 0,
> > > > + SLAB_HWCACHE_ALIGN, NULL);
> > > > + if (!xsk_tx_generic_cache) {
> > > > + err = -ENOMEM;
> > > > + goto out_unreg_notif;
> > > > + }
> > > > +
> > > > return 0;
> > > >
> > > > +out_unreg_notif:
> > > > + unregister_netdevice_notifier(&xsk_netdev_notifier);
> > > > out_pernet:
> > > > unregister_pernet_subsys(&xsk_net_ops);
> > > > out_sk:
> > > > diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> > > > index 46d87e961ad6..f16f390370dc 100644
> > > > --- a/net/xdp/xsk_queue.h
> > > > +++ b/net/xdp/xsk_queue.h
> > > > @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
> > > >
> > > > /* Functions for producers */
> > > >
> > > > +static inline u32 xskq_get_prod(struct xsk_queue *q)
> > > > +{
> > > > + return READ_ONCE(q->ring->producer);
> > > > +}
> > > > +
> > > > static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> > > > {
> > > > u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
> > > > @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
> > > > return 0;
> > > > }
> > > >
> > > > +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
> > > > +{
> > > > + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> > > > +
> > > > + ring->desc[idx & q->ring_mask] = addr;
> > > > +}
> > > > +
> > > > static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
> > > > u32 nb_entries)
> > > > {
> > > > --
> > > > 2.34.1
> > > >
> > > >
> > >
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
2025-08-26 19:13 ` Maciej Fijalkowski
@ 2025-08-27 0:25 ` Jason Xing
0 siblings, 0 replies; 14+ messages in thread
From: Jason Xing @ 2025-08-27 0:25 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: Magnus Karlsson, bpf, ast, daniel, andrii, netdev,
magnus.karlsson, stfomichev, aleksander.lobakin, Eryk Kubanski
On Wed, Aug 27, 2025 at 3:13 AM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Tue, Aug 26, 2025 at 09:03:45PM +0200, Maciej Fijalkowski wrote:
> > On Tue, Aug 26, 2025 at 08:23:04PM +0200, Magnus Karlsson wrote:
> > > On Tue, 26 Aug 2025 at 18:07, Jason Xing <kerneljasonxing@gmail.com> wrote:
> > > >
> > > > On Wed, Aug 20, 2025 at 11:49 PM Maciej Fijalkowski
> > > > <maciej.fijalkowski@intel.com> wrote:
> > > > >
> > > > > Eryk reported an issue that I have put under Closes: tag, related to
> > > > > umem addrs being prematurely produced onto pool's completion queue.
> > > > > Let us make the skb's destructor responsible for producing all addrs
> > > > > that given skb used.
> > > > >
> > > > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > > > of addresses taken from processed descriptors that will be carried via
> > > > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > > > coming from memory allocations, let us introduce kmem_cache of
> > > > > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > > > > system.
> > > > >
> > > > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > > > from day 1, but rather when xsk multi-buffer got introduced.
> > > > >
> > > > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > > > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > > > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > > > ---
> > > > >
> > > > > v1:
> > > > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > > > v2:
> > > > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > > > > v3:
> > > > > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> > > > > v4:
> > > > > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> > > > > v5:
> > > > > https://lore.kernel.org/bpf/aKXBHGPxjpBDKOHq@boxer/T/
> > > > >
> > > > > v1->v2:
> > > > > * store addrs in array carried via destructor_arg instead having them
> > > > > stored in skb headroom; cleaner and less hacky approach;
> > > > > v2->v3:
> > > > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > > > * set err when xsk_addrs allocation fails (Dan)
> > > > > * change xsk_addrs layout to avoid holes
> > > > > * free xsk_addrs on error path
> > > > > * rebase
> > > > > v3->v4:
> > > > > * have kmem_cache as percpu vars
> > > > > * don't drop unnecessary braces (unrelated) (Stan)
> > > > > * use idx + i in xskq_prod_write_addr (Stan)
> > > > > * alloc kmem_cache on bind (Stan)
> > > > > * keep num_descs as first member in xsk_addrs (Magnus)
> > > > > * add ack from Magnus
> > > > > v4->v5:
> > > > > * have a single kmem_cache per xsk subsystem (Stan)
> > > > > v5->v6:
> > > > > * free skb in xsk_build_skb_zerocopy() when xsk_addrs allocation fails
> > > > > (Stan)
> > > > > * unregister netdev notifier if creating kmem_cache fails (Stan)
> > > > >
> > > > > ---
> > > > > net/xdp/xsk.c | 95 +++++++++++++++++++++++++++++++++++++--------
> > > > > net/xdp/xsk_queue.h | 12 ++++++
> > > > > 2 files changed, 91 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > > index 9c3acecc14b1..989d5ffb4273 100644
> > > > > --- a/net/xdp/xsk.c
> > > > > +++ b/net/xdp/xsk.c
> > > > > @@ -36,6 +36,13 @@
> > > > > #define TX_BATCH_SIZE 32
> > > > > #define MAX_PER_SOCKET_BUDGET 32
> > > > >
> > > > > +struct xsk_addrs {
> > > > > + u32 num_descs;
> > > > > + u64 addrs[MAX_SKB_FRAGS + 1];
> > > > > +};
> > > > > +
> > > > > +static struct kmem_cache *xsk_tx_generic_cache;
> > > >
> > > > IMHO, adding a few heavy operations of allocating and freeing from
> > > > cache in the hot path is not a good choice. What I've been trying so
> > > > hard lately is to minimize the times of manipulating memory as much as
> > > > possible :( Memory hotspot can be easily captured by perf.
> > > >
> > > > We might provide an new option in setsockopt() to let users
> > > > specifically support this use case since it does harm to normal cases?
> > >
> > > Agree with you that we should not harm the normal case here. Instead
> > > of introducing a setsockopt, how about we detect the case when this
> > > can happen in the code? If I remember correctly, it can only occur in
> > > the XDP_SHARED_UMEM mode were the xsk pool is shared between
> > > processes. If this can be tested (by introducing a new bit in the xsk
> > > pool if that is necessary), we could have two potential skb
> > > destructors: the old one for the "normal" case and the new one with
> > > the list of addresses to complete (using the expensive allocations and
> > > deallocations) when it is strictly required i.e., when the xsk pool is
> > > shared. Maciej, you are more in to the details of this, so what do you
> > > think? Would something like this be a potential path forward?
> >
> > Meh, i was focused on 9k mtu impact, it was about 5% on my machine but now
> > i checked small packets and indeed i see 12-14% perf regression.
> >
> > I'll look into this so Daniel, for now let's drop this unfortunate
> > patch...
>
> One more thing - Jason, you still need to focus your work on this approach
> where we produce cq entries from destructor. I just need to come up with
> smarter way of producing descs to be consumed by destructor :<
No problem. Before getting to that batch feature, I'm dealing with
AF_PACKET implementation first which probably takes much time than
needed.
Thanks,
Jason
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v6 bpf] xsk: fix immature cq descriptor production
@ 2025-08-27 23:11 kernel test robot
0 siblings, 0 replies; 14+ messages in thread
From: kernel test robot @ 2025-08-27 23:11 UTC (permalink / raw)
To: oe-kbuild; +Cc: lkp, Dan Carpenter
BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20250820154416.2248012-1-maciej.fijalkowski@intel.com>
References: <20250820154416.2248012-1-maciej.fijalkowski@intel.com>
TO: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
TO: bpf@vger.kernel.org
TO: ast@kernel.org
TO: daniel@iogearbox.net
TO: andrii@kernel.org
CC: netdev@vger.kernel.org
CC: magnus.karlsson@intel.com
CC: stfomichev@gmail.com
CC: aleksander.lobakin@intel.com
CC: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
CC: Eryk Kubanski <e.kubanski@partner.samsung.com>
Hi Maciej,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf/master]
url: https://github.com/intel-lab-lkp/linux/commits/Maciej-Fijalkowski/xsk-fix-immature-cq-descriptor-production/20250820-235001
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git master
patch link: https://lore.kernel.org/r/20250820154416.2248012-1-maciej.fijalkowski%40intel.com
patch subject: [PATCH v6 bpf] xsk: fix immature cq descriptor production
:::::: branch date: 7 days ago
:::::: commit date: 7 days ago
config: powerpc-randconfig-r072-20250827 (https://download.01.org/0day-ci/archive/20250828/202508280638.Dy15CCOf-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 8.5.0
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>
| Closes: https://lore.kernel.org/r/202508280638.Dy15CCOf-lkp@intel.com/
smatch warnings:
net/xdp/xsk.c:662 xsk_build_skb_zerocopy() error: use kfree_skb() here instead of kfree(skb)
vim +662 net/xdp/xsk.c
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 638
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 639 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 640 struct xdp_desc *desc)
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 641 {
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 642 struct xsk_buff_pool *pool = xs->pool;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 643 u32 hr, len, ts, offset, copy, copied;
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 644 struct xsk_addrs *addrs = NULL;
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 645 struct sk_buff *skb = xs->skb;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 646 struct page *page;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 647 void *buffer;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 648 int err, i;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 649 u64 addr;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 650
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 651 if (!skb) {
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 652 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 653
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 654 skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 655 if (unlikely(!skb))
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 656 return ERR_PTR(err);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 657
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 658 skb_reserve(skb, hr);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 659
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 660 addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 661 if (!addrs) {
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 @662 kfree(skb);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 663 return ERR_PTR(-ENOMEM);
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 664 }
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 665
b3a8246d6ffa300 Maciej Fijalkowski 2025-08-20 666 xsk_set_destructor_arg(skb, addrs);
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 667 }
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 668
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 669 addr = desc->addr;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 670 len = desc->len;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 671 ts = pool->unaligned ? len : pool->chunk_size;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 672
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 673 buffer = xsk_buff_raw_get_data(pool, addr);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 674 offset = offset_in_page(buffer);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 675 addr = buffer - pool->addrs;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 676
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 677 for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 678 if (unlikely(i >= MAX_SKB_FRAGS))
9d0a67b9d42c630 Tirthendu Sarkar 2023-08-23 679 return ERR_PTR(-EOVERFLOW);
cf24f5a5feeaae3 Tirthendu Sarkar 2023-07-19 680
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 681 page = pool->umem->pgs[addr >> PAGE_SHIFT];
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 682 get_page(page);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 683
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 684 copy = min_t(u32, PAGE_SIZE - offset, len - copied);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 685 skb_fill_page_desc(skb, i, page, offset, copy);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 686
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 687 copied += copy;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 688 addr += copy;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 689 offset = 0;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 690 }
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 691
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 692 skb->len += len;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 693 skb->data_len += len;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 694 skb->truesize += ts;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 695
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 696 refcount_add(ts, &xs->sk.sk_wmem_alloc);
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 697
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 698 return skb;
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 699 }
9c8f21e6f8856a9 Xuan Zhuo 2021-02-18 700
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-08-30 10:33 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-20 15:44 [PATCH v6 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-20 17:06 ` Stanislav Fomichev
2025-08-22 16:20 ` patchwork-bot+netdevbpf
2025-08-26 8:14 ` Dan Carpenter
2025-08-26 12:23 ` Daniel Borkmann
2025-08-26 13:40 ` Jason Xing
2025-08-29 16:14 ` Maciej Fijalkowski
2025-08-30 10:32 ` Jason Xing
2025-08-26 16:00 ` Jason Xing
2025-08-26 18:23 ` Magnus Karlsson
2025-08-26 19:03 ` Maciej Fijalkowski
2025-08-26 19:13 ` Maciej Fijalkowski
2025-08-27 0:25 ` Jason Xing
-- strict thread matches above, loose matches on Subject: below --
2025-08-27 23:11 kernel test robot
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.