* [PATCH v5 bpf] xsk: fix immature cq descriptor production
From: Maciej Fijalkowski @ 2025-08-19 11:55 UTC
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, stfomichev, aleksander.lobakin,
Maciej Fijalkowski, Eryk Kubanski

Eryk reported an issue, which I have put under the Closes: tag, related
to umem addrs being prematurely produced onto the pool's completion
queue. Let us make the skb's destructor responsible for producing all
addrs that a given skb used.

Introduce struct xsk_addrs, which carries the descriptor count along
with an array of addresses taken from the processed descriptors, and
pass it via skb_shared_info::destructor_arg. This way we can refer to
it within xsk_destruct_skb(). In order to mitigate the overhead coming
from memory allocations, let us introduce a kmem_cache of xsk_addrs.
There will be a single kmem_cache for xsk generic xmit on the system.
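
For clarity, a condensed sketch of the resulting lifecycle (boiled down
from the diff below; locking and error handling omitted):

	/* carried via skb_shared_info::destructor_arg */
	struct xsk_addrs {
		u32 num_descs;
		u64 addrs[MAX_SKB_FRAGS + 1];
	};

	/* xsk_build_skb(), first frag: allocate and attach the container */
	addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
	xsk_set_destructor_arg(skb, addrs);

	/* per descriptor: only reserve cq space and record the addr;
	 * nothing is produced to the ring yet
	 */
	xskq_prod_reserve(pool->cq);
	addrs->addrs[addrs->num_descs++] = desc->addr;

	/* xsk_destruct_skb(): the skb is done, produce all addrs at once */
	idx = xskq_get_prod(pool->cq);
	for (i = 0; i < addrs->num_descs; i++)
		xskq_prod_write_addr(pool->cq, idx + i, addrs->addrs[i]);
	xskq_prod_submit_n(pool->cq, addrs->num_descs);
	kmem_cache_free(xsk_tx_generic_cache, addrs);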

The commit from the Fixes tag introduced the buggy behavior; it was not
broken from day 1, but rather once xsk multi-buffer support got
introduced.

Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
v2:
https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
v3:
https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
v4:
https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
v1->v2:
* store addrs in an array carried via destructor_arg instead of having
  them stored in skb headroom; cleaner and less hacky approach;
v2->v3:
* use kmem_cache for xsk_addrs allocation (Stan/Olek)
* set err when xsk_addrs allocation fails (Dan)
* change xsk_addrs layout to avoid holes
* free xsk_addrs on error path
* rebase
v3->v4:
* have kmem_cache as percpu vars
* don't drop unnecessary braces (unrelated) (Stan)
* use idx + i in xskq_prod_write_addr (Stan)
* alloc kmem_cache on bind (Stan)
* keep num_descs as first member in xsk_addrs (Magnus)
* add ack from Magnus
v4->v5:
* have a single kmem_cache per xsk subsystem (Stan)
---
net/xdp/xsk.c | 91 +++++++++++++++++++++++++++++++++++++--------
net/xdp/xsk_queue.h | 12 ++++++
2 files changed, 87 insertions(+), 16 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..012991de9df2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,13 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
+struct xsk_addrs {
+ u32 num_descs;
+ u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
+
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
+ struct sk_buff *skb)
{
+ struct xsk_buff_pool *pool = xs->pool;
+ struct xsk_addrs *xsk_addrs;
unsigned long flags;
+ u32 num_desc, i;
+ u32 idx;
+
+ xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ num_desc = xsk_addrs->num_descs;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ idx = xskq_get_prod(pool->cq);
+
+ for (i = 0; i < num_desc; i++)
+ xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
+ xskq_prod_submit_n(pool->cq, num_desc);
+
spin_unlock_irqrestore(&pool->cq_lock, flags);
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
}
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
static void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+ struct xsk_addrs *addrs;
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ return addrs->num_descs;
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
+{
+ skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+ struct xsk_addrs *addrs;
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->num_descs++;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ kmem_cache_free(xsk_tx_generic_cache,
+ (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
skb->destructor = sock_wfree;
xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
/* Free skb without triggering the perf drop trace */
@@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -623,6 +656,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+
+ addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ xsk_set_destructor_arg(skb, addrs);
}
addr = desc->addr;
@@ -662,6 +701,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
{
struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
bool first_frag = false;
int err;
@@ -694,6 +734,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!addrs) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+
+ xsk_set_destructor_arg(skb, addrs);
+
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct page *page;
@@ -759,7 +808,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->addrs[addrs->num_descs++] = desc->addr;
return skb;
@@ -769,7 +820,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
+ xsk_inc_skb_descs(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -812,7 +863,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ err = xsk_cq_reserve_locked(xs->pool);
if (err) {
err = -EAGAIN;
goto out;
@@ -1815,6 +1866,14 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addrs), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!xsk_tx_generic_cache) {
+ err = -ENOMEM;
+ goto out_pernet;
+ }
+
return 0;
out_pernet:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{
--
2.34.1
* Re: [PATCH v5 bpf] xsk: fix immature cq descriptor production
From: Stanislav Fomichev @ 2025-08-19 20:48 UTC
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On 08/19, Maciej Fijalkowski wrote:
> Eryk reported an issue, which I have put under the Closes: tag, related
> to umem addrs being prematurely produced onto the pool's completion
> queue. Let us make the skb's destructor responsible for producing all
> addrs that a given skb used.
>
> Introduce struct xsk_addrs, which carries the descriptor count along
> with an array of addresses taken from the processed descriptors, and
> pass it via skb_shared_info::destructor_arg. This way we can refer to
> it within xsk_destruct_skb(). In order to mitigate the overhead coming
> from memory allocations, let us introduce a kmem_cache of xsk_addrs.
> There will be a single kmem_cache for xsk generic xmit on the system.
>
> The commit from the Fixes tag introduced the buggy behavior; it was not
> broken from day 1, but rather once xsk multi-buffer support got
> introduced.
>
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> v2:
> https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> v3:
> https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> v4:
> https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
>
> v1->v2:
> * store addrs in an array carried via destructor_arg instead of having
>   them stored in skb headroom; cleaner and less hacky approach;
> v2->v3:
> * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> * set err when xsk_addrs allocation fails (Dan)
> * change xsk_addrs layout to avoid holes
> * free xsk_addrs on error path
> * rebase
> v3->v4:
> * have kmem_cache as percpu vars
> * don't drop unnecessary braces (unrelated) (Stan)
> * use idx + i in xskq_prod_write_addr (Stan)
> * alloc kmem_cache on bind (Stan)
> * keep num_descs as first member in xsk_addrs (Magnus)
> * add ack from Magnus
> v4->v5:
> * have a single kmem_cache per xsk subsystem (Stan)
>
> ---
> net/xdp/xsk.c | 91 +++++++++++++++++++++++++++++++++++++--------
> net/xdp/xsk_queue.h | 12 ++++++
> 2 files changed, 87 insertions(+), 16 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 9c3acecc14b1..012991de9df2 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,6 +36,13 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> +struct xsk_addrs {
> + u32 num_descs;
> + u64 addrs[MAX_SKB_FRAGS + 1];
> +};
> +
> +static struct kmem_cache *xsk_tx_generic_cache;
> +
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> }
>
> -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> {
> unsigned long flags;
> int ret;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - ret = xskq_prod_reserve_addr(pool->cq, addr);
> + ret = xskq_prod_reserve(pool->cq);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
>
> return ret;
> }
>
> -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> + struct sk_buff *skb)
> {
> + struct xsk_buff_pool *pool = xs->pool;
> + struct xsk_addrs *xsk_addrs;
> unsigned long flags;
> + u32 num_desc, i;
> + u32 idx;
> +
> + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + num_desc = xsk_addrs->num_descs;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - xskq_prod_submit_n(pool->cq, n);
> + idx = xskq_get_prod(pool->cq);
> +
> + for (i = 0; i < num_desc; i++)
> + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> + xskq_prod_submit_n(pool->cq, num_desc);
> +
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> }
>
> static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> *compl->tx_timestamp = ktime_get_tai_fast_ns();
> }
>
> - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> sock_wfree(skb);
> }
>
> -static void xsk_set_destructor_arg(struct sk_buff *skb)
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> {
> - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> + struct xsk_addrs *addrs;
>
> - skb_shinfo(skb)->destructor_arg = (void *)num;
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + return addrs->num_descs;
> +}
> +
> +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> +{
> + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> +}
> +
> +static void xsk_inc_skb_descs(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->num_descs++;
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
>
> + kmem_cache_free(xsk_tx_generic_cache,
> + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> skb->destructor = sock_wfree;
> xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> /* Free skb without triggering the perf drop trace */
> @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -623,6 +656,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
>
> skb_reserve(skb, hr);
> +
> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!addrs)
> + return ERR_PTR(-ENOMEM);

Do we need to kfree the skb that we allocated on line 621 above? (maybe
not because I always get confused by the mb/overflow handling here)

> +
> + xsk_set_destructor_arg(skb, addrs);
> }
>
> addr = desc->addr;
> @@ -662,6 +701,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> {
> struct xsk_tx_metadata *meta = NULL;
> struct net_device *dev = xs->dev;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> bool first_frag = false;
> int err;
> @@ -694,6 +734,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> err = skb_store_bits(skb, 0, buffer, len);
> if (unlikely(err))
> goto free_err;
> +
> + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!addrs) {
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> +
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> struct page *page;
> @@ -759,7 +808,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> skb->destructor = xsk_destruct_skb;
> xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> - xsk_set_destructor_arg(skb);
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->addrs[addrs->num_descs++] = desc->addr;
>
> return skb;
>
> @@ -769,7 +820,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_set_destructor_arg(xs->skb);
> + xsk_inc_skb_descs(xs->skb);
> xsk_drop_skb(xs->skb);
> xskq_cons_release(xs->tx);
> } else {
> @@ -812,7 +863,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> + err = xsk_cq_reserve_locked(xs->pool);
> if (err) {
> err = -EAGAIN;
> goto out;
> @@ -1815,6 +1866,14 @@ static int __init xsk_init(void)
> if (err)
> goto out_pernet;
>
> + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> + sizeof(struct xsk_addrs), 0,
> + SLAB_HWCACHE_ALIGN, NULL);
> + if (!xsk_tx_generic_cache) {
> + err = -ENOMEM;
> + goto out_pernet;

This probably needs an unregister_netdevice_notifier call?
* Re: [PATCH v5 bpf] xsk: fix immature cq descriptor production
From: Maciej Fijalkowski @ 2025-08-20 12:35 UTC
To: Stanislav Fomichev
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On Tue, Aug 19, 2025 at 01:48:00PM -0700, Stanislav Fomichev wrote:
> On 08/19, Maciej Fijalkowski wrote:
> > Eryk reported an issue, which I have put under the Closes: tag, related
> > to umem addrs being prematurely produced onto the pool's completion
> > queue. Let us make the skb's destructor responsible for producing all
> > addrs that a given skb used.
> >
> > Introduce struct xsk_addrs, which carries the descriptor count along
> > with an array of addresses taken from the processed descriptors, and
> > pass it via skb_shared_info::destructor_arg. This way we can refer to
> > it within xsk_destruct_skb(). In order to mitigate the overhead coming
> > from memory allocations, let us introduce a kmem_cache of xsk_addrs.
> > There will be a single kmem_cache for xsk generic xmit on the system.
> >
> > The commit from the Fixes tag introduced the buggy behavior; it was
> > not broken from day 1, but rather once xsk multi-buffer support got
> > introduced.
> >
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> >
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > v2:
> > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > v3:
> > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@intel.com/
> > v4:
> > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@intel.com/
> >
> > v1->v2:
> > * store addrs in an array carried via destructor_arg instead of having
> >   them stored in skb headroom; cleaner and less hacky approach;
> > v2->v3:
> > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > * set err when xsk_addrs allocation fails (Dan)
> > * change xsk_addrs layout to avoid holes
> > * free xsk_addrs on error path
> > * rebase
> > v3->v4:
> > * have kmem_cache as percpu vars
> > * don't drop unnecessary braces (unrelated) (Stan)
> > * use idx + i in xskq_prod_write_addr (Stan)
> > * alloc kmem_cache on bind (Stan)
> > * keep num_descs as first member in xsk_addrs (Magnus)
> > * add ack from Magnus
> > v4->v5:
> > * have a single kmem_cache per xsk subsystem (Stan)
> >
> > ---
> > net/xdp/xsk.c | 91 +++++++++++++++++++++++++++++++++++++--------
> > net/xdp/xsk_queue.h | 12 ++++++
> > 2 files changed, 87 insertions(+), 16 deletions(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 9c3acecc14b1..012991de9df2 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -36,6 +36,13 @@
> > #define TX_BATCH_SIZE 32
> > #define MAX_PER_SOCKET_BUDGET 32
> >
> > +struct xsk_addrs {
> > + u32 num_descs;
> > + u64 addrs[MAX_SKB_FRAGS + 1];
> > +};
> > +
> > +static struct kmem_cache *xsk_tx_generic_cache;
> > +
> > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > {
> > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > }
> >
> > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > {
> > unsigned long flags;
> > int ret;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > + ret = xskq_prod_reserve(pool->cq);
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> >
> > return ret;
> > }
> >
> > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > + struct sk_buff *skb)
> > {
> > + struct xsk_buff_pool *pool = xs->pool;
> > + struct xsk_addrs *xsk_addrs;
> > unsigned long flags;
> > + u32 num_desc, i;
> > + u32 idx;
> > +
> > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + num_desc = xsk_addrs->num_descs;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - xskq_prod_submit_n(pool->cq, n);
> > + idx = xskq_get_prod(pool->cq);
> > +
> > + for (i = 0; i < num_desc; i++)
> > + xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > + xskq_prod_submit_n(pool->cq, num_desc);
> > +
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > + kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> > }
> >
> > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > }
> >
> > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > -{
> > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > -}
> > -
> > static void xsk_destruct_skb(struct sk_buff *skb)
> > {
> > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > }
> >
> > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > sock_wfree(skb);
> > }
> >
> > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > {
> > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > + struct xsk_addrs *addrs;
> >
> > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + return addrs->num_descs;
> > +}
> > +
> > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > +{
> > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > +}
> > +
> > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > +{
> > + struct xsk_addrs *addrs;
> > +
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + addrs->num_descs++;
> > }
> >
> > static void xsk_consume_skb(struct sk_buff *skb)
> > {
> > struct xdp_sock *xs = xdp_sk(skb->sk);
> >
> > + kmem_cache_free(xsk_tx_generic_cache,
> > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> > skb->destructor = sock_wfree;
> > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > /* Free skb without triggering the perf drop trace */
> > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > {
> > struct xsk_buff_pool *pool = xs->pool;
> > u32 hr, len, ts, offset, copy, copied;
> > + struct xsk_addrs *addrs = NULL;
> > struct sk_buff *skb = xs->skb;
> > struct page *page;
> > void *buffer;
> > @@ -623,6 +656,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > return ERR_PTR(err);
> >
> > skb_reserve(skb, hr);
> > +
> > + addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > + if (!addrs)
> > + return ERR_PTR(-ENOMEM);
>
> Do we need to kfree the skb that we allocated on line 621 above? (maybe
> not because I always get confused by the mb/overflow handling here)

Awesome catches, Stan. I'm fed up with these changes and I managed to
introduce the two bugs you're pointing out :)

My reasoning was that even though we return the errno for the skb, I
assumed that the branch below:

free_err:
	if (first_frag && skb)
		kfree_skb(skb);

will be taken, but the thing is we don't set @first_frag = true in
xsk_build_skb_zerocopy().

I will explicitly free the skb where you're suggesting and, of course,
unregister the netdev notifier.
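
Roughly, a minimal sketch of what those two fixes could look like
(untested; the out_unreg_notif label is just illustrative and assumes
the existing xsk_netdev_notifier/xsk_net_ops names):

	/* xsk_build_skb_zerocopy(): first_frag is never set in this
	 * path, so free the skb explicitly on allocation failure
	 */
	addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
	if (!addrs) {
		kfree_skb(skb);
		return ERR_PTR(-ENOMEM);
	}

	/* xsk_init(): unwind the netdev notifier when cache creation
	 * fails
	 */
	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
						 sizeof(struct xsk_addrs), 0,
						 SLAB_HWCACHE_ALIGN, NULL);
	if (!xsk_tx_generic_cache) {
		err = -ENOMEM;
		goto out_unreg_notif;
	}
	...
	return 0;

out_unreg_notif:
	unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
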
Thanks!
>
> > +
> > + xsk_set_destructor_arg(skb, addrs);
> > }
> >
(...)