* [PATCH v3 bpf] xsk: fix immature cq descriptor production
@ 2025-08-06 15:41 Maciej Fijalkowski
2025-08-06 16:43 ` Stanislav Fomichev
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Maciej Fijalkowski @ 2025-08-06 15:41 UTC (permalink / raw)
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, aleksander.lobakin, Maciej Fijalkowski,
Eryk Kubanski
Eryk reported an issue that I have put under the Closes: tag, related to
umem addrs being prematurely produced onto the pool's completion queue.
Let us make the skb's destructor responsible for producing all addrs that
a given skb used.

Introduce struct xsk_addrs, which carries the descriptor count together
with an array of addresses taken from the processed descriptors, and is
passed via skb_shared_info::destructor_arg. This way we can refer to it
within xsk_destruct_skb(). In order to mitigate the overhead coming from
the memory allocations, introduce a kmem_cache of xsk_addrs onto
xdp_sock. Utilize the existing struct hole in xdp_sock for that.

The commit from the Fixes: tag introduced the buggy behavior; it was not
broken from day 1, but rather when xsk multi-buffer support got
introduced.
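In short, the resulting flow looks roughly like this (sketch only, the
details are in the diff below):

	/* __xsk_generic_xmit(): only reserve space in cq, the addr is
	 * not written yet
	 */
	err = xsk_cq_reserve_locked(xs->pool);

	/* xsk_build_skb(): remember each addr that the skb consumed */
	addrs->addrs[addrs->num_descs++] = desc->addr;

	/* xsk_destruct_skb(): write and submit all addrs only once the
	 * skb is freed
	 */
	xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);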
Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
v1:
https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
v2:
https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
v1->v2:
* store addrs in an array carried via destructor_arg instead of having
  them stored in skb headroom; cleaner and less hacky approach;
v2->v3:
* use kmem_cache for xsk_addrs allocation (Stan/Olek)
* set err when xsk_addrs allocation fails (Dan)
* change xsk_addrs layout to avoid holes
* free xsk_addrs on error path
* rebase
---
include/net/xdp_sock.h | 1 +
net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
net/xdp/xsk_queue.h | 12 ++++++
3 files changed, 89 insertions(+), 18 deletions(-)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce587a225661..5ba9ad4c110f 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -61,6 +61,7 @@ struct xdp_sock {
XSK_BOUND,
XSK_UNBOUND,
} state;
+ struct kmem_cache *xsk_addrs_cache;
struct xsk_queue *tx ____cacheline_aligned_in_smp;
struct list_head tx_list;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..d77cde0131be 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,11 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
+struct xsk_addrs {
+ u64 addrs[MAX_SKB_FRAGS + 1];
+ u32 num_descs;
+};
+
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
+ struct sk_buff *skb)
{
+ struct xsk_buff_pool *pool = xs->pool;
+ struct xsk_addrs *xsk_addrs;
unsigned long flags;
+ u32 num_desc, i;
+ u32 idx;
+
+ xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ num_desc = xsk_addrs->num_descs;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ idx = xskq_get_prod(pool->cq);
+
+ for (i = 0; i < num_desc; i++, idx++)
+ xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
+ xskq_prod_submit_n(pool->cq, num_desc);
+
spin_unlock_irqrestore(&pool->cq_lock, flags);
+ kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
}
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
@@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
-}
-
static void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
- if (compl->tx_timestamp) {
+ if (compl->tx_timestamp)
/* sw completion timestamp, not a real one */
*compl->tx_timestamp = ktime_get_tai_fast_ns();
- }
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+ struct xsk_addrs *addrs;
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ return addrs->num_descs;
+}
+
+static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+ skb_shinfo(skb)->destructor_arg = (void *)addrs;
+}
+
+static void xsk_inc_skb_descs(struct sk_buff *skb)
+{
+ struct xsk_addrs *addrs;
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->num_descs++;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ kmem_cache_free(xs->xsk_addrs_cache,
+ (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
skb->destructor = sock_wfree;
xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
/* Free skb without triggering the perf drop trace */
@@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+
+ addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ xsk_set_destructor_arg(skb, addrs);
}
addr = desc->addr;
@@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
{
struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
+ struct xsk_addrs *addrs = NULL;
struct sk_buff *skb = xs->skb;
bool first_frag = false;
int err;
@@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
+ if (!addrs) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+
+ xsk_set_destructor_arg(skb, addrs);
+
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct page *page;
@@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+
+ addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ addrs->addrs[addrs->num_descs++] = desc->addr;
return skb;
@@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
+ xsk_inc_skb_descs(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ err = xsk_cq_reserve_locked(xs->pool);
if (err) {
err = -EAGAIN;
goto out;
@@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
xskq_destroy(xs->tx);
xskq_destroy(xs->fq_tmp);
xskq_destroy(xs->cq_tmp);
+ kmem_cache_destroy(xs->xsk_addrs_cache);
sock_orphan(sk);
sock->sk = NULL;
@@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sock_prot_inuse_add(net, &xsk_proto, 1);
+ xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addrs), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+
+ if (!xs->xsk_addrs_cache) {
+ sk_free(sk);
+ return -ENOMEM;
+ }
+
return 0;
}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{
--
2.34.1
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-06 15:41 [PATCH v3 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
@ 2025-08-06 16:43 ` Stanislav Fomichev
2025-08-06 20:42 ` Maciej Fijalkowski
2025-08-07 8:06 ` [syzbot ci] " syzbot ci
2025-08-08 9:48 ` [PATCH v3 bpf] " Magnus Karlsson
2 siblings, 1 reply; 8+ messages in thread
From: Stanislav Fomichev @ 2025-08-06 16:43 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On 08/06, Maciej Fijalkowski wrote:
> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
>
> Introduce struct xsk_addrs which will carry descriptor count with array
> of addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb(). In order to mitigate the overhead that will be
> coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
>
> Commit from fixes tag introduced the buggy behavior, it was not broken
> from day 1, but rather when xsk multi-buffer got introduced.
>
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> v2:
> https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
>
> v1->v2:
> * store addrs in array carried via destructor_arg instead having them
> stored in skb headroom; cleaner and less hacky approach;
> v2->v3:
> * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> * set err when xsk_addrs allocation fails (Dan)
> * change xsk_addrs layout to avoid holes
> * free xsk_addrs on error path
> * rebase
> ---
> include/net/xdp_sock.h | 1 +
> net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> net/xdp/xsk_queue.h | 12 ++++++
> 3 files changed, 89 insertions(+), 18 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index ce587a225661..5ba9ad4c110f 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -61,6 +61,7 @@ struct xdp_sock {
> XSK_BOUND,
> XSK_UNBOUND,
> } state;
> + struct kmem_cache *xsk_addrs_cache;
>
> struct xsk_queue *tx ____cacheline_aligned_in_smp;
> struct list_head tx_list;
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 9c3acecc14b1..d77cde0131be 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,6 +36,11 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> +struct xsk_addrs {
> + u64 addrs[MAX_SKB_FRAGS + 1];
> + u32 num_descs;
> +};
> +
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> }
>
> -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> {
> unsigned long flags;
> int ret;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - ret = xskq_prod_reserve_addr(pool->cq, addr);
> + ret = xskq_prod_reserve(pool->cq);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
>
> return ret;
> }
>
> -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> + struct sk_buff *skb)
> {
> + struct xsk_buff_pool *pool = xs->pool;
> + struct xsk_addrs *xsk_addrs;
> unsigned long flags;
> + u32 num_desc, i;
> + u32 idx;
> +
> + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + num_desc = xsk_addrs->num_descs;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - xskq_prod_submit_n(pool->cq, n);
> + idx = xskq_get_prod(pool->cq);
> +
> + for (i = 0; i < num_desc; i++, idx++)
> + xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
optional nit: maybe do xskq_prod_write_addr(, idx + i, ) instead of 'idx++'
in the loop? I got a bit confused here until I spotted the idx++.
But up to you, feel free to ignore, maybe it's just me.
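Something along these lines is what I mean (untested, same semantics,
just indexing with idx + i):

	for (i = 0; i < num_desc; i++)
		xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);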
> + xskq_prod_submit_n(pool->cq, num_desc);
> +
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> + kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
> }
>
> static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> @@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
>
[..]
> - if (compl->tx_timestamp) {
> + if (compl->tx_timestamp)
> /* sw completion timestamp, not a real one */
> *compl->tx_timestamp = ktime_get_tai_fast_ns();
> - }
Seems to be unrelated, can probably drop if you happen to respin?
> - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> sock_wfree(skb);
> }
>
> -static void xsk_set_destructor_arg(struct sk_buff *skb)
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + return addrs->num_descs;
> +}
> +
> +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> {
> - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> +}
> +
> +static void xsk_inc_skb_descs(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
>
> - skb_shinfo(skb)->destructor_arg = (void *)num;
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->num_descs++;
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
>
> + kmem_cache_free(xs->xsk_addrs_cache,
> + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> skb->destructor = sock_wfree;
> xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> /* Free skb without triggering the perf drop trace */
> @@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
>
> skb_reserve(skb, hr);
> +
> + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> + if (!addrs)
> + return ERR_PTR(-ENOMEM);
> +
> + xsk_set_destructor_arg(skb, addrs);
> }
>
> addr = desc->addr;
> @@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> {
> struct xsk_tx_metadata *meta = NULL;
> struct net_device *dev = xs->dev;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> bool first_frag = false;
> int err;
> @@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> err = skb_store_bits(skb, 0, buffer, len);
> if (unlikely(err))
> goto free_err;
> +
> + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> + if (!addrs) {
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> +
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> struct page *page;
> @@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> skb->destructor = xsk_destruct_skb;
> xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> - xsk_set_destructor_arg(skb);
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->addrs[addrs->num_descs++] = desc->addr;
>
> return skb;
>
> @@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_set_destructor_arg(xs->skb);
> + xsk_inc_skb_descs(xs->skb);
> xsk_drop_skb(xs->skb);
> xskq_cons_release(xs->tx);
> } else {
> @@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> + err = xsk_cq_reserve_locked(xs->pool);
> if (err) {
> err = -EAGAIN;
> goto out;
> @@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
> xskq_destroy(xs->tx);
> xskq_destroy(xs->fq_tmp);
> xskq_destroy(xs->cq_tmp);
> + kmem_cache_destroy(xs->xsk_addrs_cache);
>
> sock_orphan(sk);
> sock->sk = NULL;
> @@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
>
> sock_prot_inuse_add(net, &xsk_proto, 1);
>
[..]
> + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> + sizeof(struct xsk_addrs), 0,
> + SLAB_HWCACHE_ALIGN, NULL);
> +
> + if (!xs->xsk_addrs_cache) {
> + sk_free(sk);
> + return -ENOMEM;
> + }
Should we move this up to happen before sk_add_node_rcu? Otherwise we
also have to do sk_del_node_init_rcu on !xs->xsk_addrs_cache here?
Btw, alternatively, why not make this happen at bind time when we know
whether the socket is gonna be copy or zc? And do it only for the copy
mode?
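Something like this is what I'm thinking of, completely untested and
hand-wavy (assuming a spot in xsk_bind() after the copy/zc decision is
known; the exact flags check and error label are approximate):

	/* in xsk_bind(), once it is known that zerocopy is not used */
	if (!(flags & XDP_ZEROCOPY)) {
		xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
							sizeof(struct xsk_addrs),
							0, SLAB_HWCACHE_ALIGN,
							NULL);
		if (!xs->xsk_addrs_cache) {
			err = -ENOMEM;
			goto out_unlock; /* whatever the unwind label is there */
		}
	}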
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-06 16:43 ` Stanislav Fomichev
@ 2025-08-06 20:42 ` Maciej Fijalkowski
2025-08-07 12:01 ` Maciej Fijalkowski
0 siblings, 1 reply; 8+ messages in thread
From: Maciej Fijalkowski @ 2025-08-06 20:42 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On Wed, Aug 06, 2025 at 09:43:53AM -0700, Stanislav Fomichev wrote:
> On 08/06, Maciej Fijalkowski wrote:
> > Eryk reported an issue that I have put under Closes: tag, related to
> > umem addrs being prematurely produced onto pool's completion queue.
> > Let us make the skb's destructor responsible for producing all addrs
> > that given skb used.
> >
> > Introduce struct xsk_addrs which will carry descriptor count with array
> > of addresses taken from processed descriptors that will be carried via
> > skb_shared_info::destructor_arg. This way we can refer to it within
> > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> > onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
> >
> > Commit from fixes tag introduced the buggy behavior, it was not broken
> > from day 1, but rather when xsk multi-buffer got introduced.
> >
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > v2:
> > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> >
> > v1->v2:
> > * store addrs in array carried via destructor_arg instead having them
> > stored in skb headroom; cleaner and less hacky approach;
> > v2->v3:
> > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > * set err when xsk_addrs allocation fails (Dan)
> > * change xsk_addrs layout to avoid holes
> > * free xsk_addrs on error path
> > * rebase
> > ---
> > include/net/xdp_sock.h | 1 +
> > net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> > net/xdp/xsk_queue.h | 12 ++++++
> > 3 files changed, 89 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > index ce587a225661..5ba9ad4c110f 100644
> > --- a/include/net/xdp_sock.h
> > +++ b/include/net/xdp_sock.h
> > @@ -61,6 +61,7 @@ struct xdp_sock {
> > XSK_BOUND,
> > XSK_UNBOUND,
> > } state;
> > + struct kmem_cache *xsk_addrs_cache;
> >
> > struct xsk_queue *tx ____cacheline_aligned_in_smp;
> > struct list_head tx_list;
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 9c3acecc14b1..d77cde0131be 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -36,6 +36,11 @@
> > #define TX_BATCH_SIZE 32
> > #define MAX_PER_SOCKET_BUDGET 32
> >
> > +struct xsk_addrs {
> > + u64 addrs[MAX_SKB_FRAGS + 1];
> > + u32 num_descs;
> > +};
> > +
> > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > {
> > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > @@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > }
> >
> > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > {
> > unsigned long flags;
> > int ret;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > + ret = xskq_prod_reserve(pool->cq);
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> >
> > return ret;
> > }
> >
> > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > + struct sk_buff *skb)
> > {
> > + struct xsk_buff_pool *pool = xs->pool;
> > + struct xsk_addrs *xsk_addrs;
> > unsigned long flags;
> > + u32 num_desc, i;
> > + u32 idx;
> > +
> > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + num_desc = xsk_addrs->num_descs;
> >
> > spin_lock_irqsave(&pool->cq_lock, flags);
> > - xskq_prod_submit_n(pool->cq, n);
> > + idx = xskq_get_prod(pool->cq);
> > +
> > + for (i = 0; i < num_desc; i++, idx++)
> > + xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
>
> optional nit: maybe do xskq_prod_write_addr(, idx+i, ) instead of 'idx++'
> in the loop? I got a bit confused here until I spotted that idx++..
> But up to you, feel free to ignore, maybe it's just me.
>
> > + xskq_prod_submit_n(pool->cq, num_desc);
> > +
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > + kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
> > }
> >
> > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > @@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > }
> >
> > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > -{
> > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > -}
> > -
> > static void xsk_destruct_skb(struct sk_buff *skb)
> > {
> > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> >
>
> [..]
>
> > - if (compl->tx_timestamp) {
> > + if (compl->tx_timestamp)
> > /* sw completion timestamp, not a real one */
> > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > - }
>
> Seems to be unrelated, can probably drop if you happen to respin?
>
> > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > sock_wfree(skb);
> > }
> >
> > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > +{
> > + struct xsk_addrs *addrs;
> > +
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + return addrs->num_descs;
> > +}
> > +
> > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > {
> > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > +}
> > +
> > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > +{
> > + struct xsk_addrs *addrs;
> >
> > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + addrs->num_descs++;
> > }
> >
> > static void xsk_consume_skb(struct sk_buff *skb)
> > {
> > struct xdp_sock *xs = xdp_sk(skb->sk);
> >
> > + kmem_cache_free(xs->xsk_addrs_cache,
> > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> > skb->destructor = sock_wfree;
> > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > /* Free skb without triggering the perf drop trace */
> > @@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > {
> > struct xsk_buff_pool *pool = xs->pool;
> > u32 hr, len, ts, offset, copy, copied;
> > + struct xsk_addrs *addrs = NULL;
> > struct sk_buff *skb = xs->skb;
> > struct page *page;
> > void *buffer;
> > @@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > return ERR_PTR(err);
> >
> > skb_reserve(skb, hr);
> > +
> > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > + if (!addrs)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + xsk_set_destructor_arg(skb, addrs);
> > }
> >
> > addr = desc->addr;
> > @@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > {
> > struct xsk_tx_metadata *meta = NULL;
> > struct net_device *dev = xs->dev;
> > + struct xsk_addrs *addrs = NULL;
> > struct sk_buff *skb = xs->skb;
> > bool first_frag = false;
> > int err;
> > @@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > err = skb_store_bits(skb, 0, buffer, len);
> > if (unlikely(err))
> > goto free_err;
> > +
> > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > + if (!addrs) {
> > + err = -ENOMEM;
> > + goto free_err;
> > + }
> > +
> > + xsk_set_destructor_arg(skb, addrs);
> > +
> > } else {
> > int nr_frags = skb_shinfo(skb)->nr_frags;
> > struct page *page;
> > @@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > skb->destructor = xsk_destruct_skb;
> > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > - xsk_set_destructor_arg(skb);
> > +
> > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > + addrs->addrs[addrs->num_descs++] = desc->addr;
> >
> > return skb;
> >
> > @@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >
> > if (err == -EOVERFLOW) {
> > /* Drop the packet */
> > - xsk_set_destructor_arg(xs->skb);
> > + xsk_inc_skb_descs(xs->skb);
> > xsk_drop_skb(xs->skb);
> > xskq_cons_release(xs->tx);
> > } else {
> > @@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > * if there is space in it. This avoids having to implement
> > * any buffering in the Tx path.
> > */
> > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > + err = xsk_cq_reserve_locked(xs->pool);
> > if (err) {
> > err = -EAGAIN;
> > goto out;
> > @@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
> > xskq_destroy(xs->tx);
> > xskq_destroy(xs->fq_tmp);
> > xskq_destroy(xs->cq_tmp);
> > + kmem_cache_destroy(xs->xsk_addrs_cache);
> >
> > sock_orphan(sk);
> > sock->sk = NULL;
> > @@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
> >
> > sock_prot_inuse_add(net, &xsk_proto, 1);
> >
>
> [..]
>
> > + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > + sizeof(struct xsk_addrs), 0,
> > + SLAB_HWCACHE_ALIGN, NULL);
> > +
> > + if (!xs->xsk_addrs_cache) {
> > + sk_free(sk);
> > + return -ENOMEM;
> > + }
>
> Should we move this up to happen before sk_add_node_rcu? Otherwise we
> also have to do sk_del_node_init_rcu on !xs->xsk_addrs_cache here?
>
> Btw, alternatively, why not make this happen at bind time when we know
> whether the socket is gonna be copy or zc? And do it only for the copy
> mode?
Thanks for the quick review, Stan. Makes sense to do it for copy mode
only. I'll send the next revision tomorrow.
Maciej
* [syzbot ci] Re: xsk: fix immature cq descriptor production
2025-08-06 15:41 [PATCH v3 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-06 16:43 ` Stanislav Fomichev
@ 2025-08-07 8:06 ` syzbot ci
2025-08-08 9:48 ` [PATCH v3 bpf] " Magnus Karlsson
2 siblings, 0 replies; 8+ messages in thread
From: syzbot ci @ 2025-08-07 8:06 UTC (permalink / raw)
To: aleksander.lobakin, andrii, ast, bpf, daniel, e.kubanski,
maciej.fijalkowski, magnus.karlsson, netdev
Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v3] xsk: fix immature cq descriptor production
https://lore.kernel.org/all/20250806154127.2161434-1-maciej.fijalkowski@intel.com
* [PATCH v3 bpf] xsk: fix immature cq descriptor production
and found the following issue:
WARNING in xsk_create
Full report is available here:
https://ci.syzbot.org/series/ed9b41fb-c772-4c8d-ab6b-07919dac7f3f
***
WARNING in xsk_create
tree: bpf
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf.git
base: e8d780dcd957d80725ad5dd00bab53b856429bc0
arch: amd64
compiler: Debian clang version 20.1.7 (++20250616065708+6146a88f6049-1~exp1~20250616065826.132), Debian LLD 20.1.7
config: https://ci.syzbot.org/builds/ac640846-151f-4c3e-8a63-10a1d56881e1/config
syz repro: https://ci.syzbot.org/findings/34ebabe4-f302-4e4b-9951-0a44d704970a/syz_repro
------------[ cut here ]------------
kmem_cache of name 'xsk_generic_xmit_cache' already exists
WARNING: CPU: 1 PID: 6031 at mm/slab_common.c:110 kmem_cache_sanity_check mm/slab_common.c:109 [inline]
WARNING: CPU: 1 PID: 6031 at mm/slab_common.c:110 __kmem_cache_create_args+0xa3/0x320 mm/slab_common.c:307
Modules linked in:
CPU: 1 UID: 0 PID: 6031 Comm: syz.2.21 Not tainted 6.16.0-syzkaller-06699-ge8d780dcd957-dirty #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:kmem_cache_sanity_check mm/slab_common.c:109 [inline]
RIP: 0010:__kmem_cache_create_args+0xa3/0x320 mm/slab_common.c:307
Code: 81 fc 58 a5 22 8e 74 26 49 8b 7c 24 f8 48 89 de e8 32 81 67 09 85 c0 75 e2 90 48 c7 c7 f2 e1 98 8d 48 89 de e8 5e 00 7f ff 90 <0f> 0b 90 90 48 89 df be 20 00 00 00 e8 cc 82 67 09 48 85 c0 0f 85
RSP: 0018:ffffc90002dffcc8 EFLAGS: 00010246
RAX: 2d59588130194a00 RBX: ffffffff8cb69260 RCX: ffff888105d20000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000002
RBP: 0000000000000010 R08: ffffc90002dff9e7 R09: 1ffff920005bff3c
R10: dffffc0000000000 R11: fffff520005bff3d R12: ffff88801fde6928
R13: 0000607e5bfbe4c0 R14: ffffc90002dffd60 R15: 0000000000000098
FS: 00007f455d4c26c0(0000) GS:ffff8881a3c7e000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f455c7b7dac CR3: 0000000106b38000 CR4: 00000000000006f0
Call Trace:
<TASK>
__kmem_cache_create include/linux/slab.h:353 [inline]
xsk_create+0x67e/0x8d0 net/xdp/xsk.c:1817
__sock_create+0x4b3/0x9f0 net/socket.c:1589
sock_create net/socket.c:1647 [inline]
__sys_socket_create net/socket.c:1684 [inline]
__sys_socket+0xd7/0x1b0 net/socket.c:1731
__do_sys_socket net/socket.c:1745 [inline]
__se_sys_socket net/socket.c:1743 [inline]
__x64_sys_socket+0x7a/0x90 net/socket.c:1743
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f455c58ebe9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f455d4c2038 EFLAGS: 00000246 ORIG_RAX: 0000000000000029
RAX: ffffffffffffffda RBX: 00007f455c7b5fa0 RCX: 00007f455c58ebe9
RDX: 0000000000000000 RSI: 0000000000000003 RDI: 000000000000002c
RBP: 00007f455c611e19 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f455c7b6038 R14: 00007f455c7b5fa0 R15: 00007ffd678e28c8
</TASK>
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-06 20:42 ` Maciej Fijalkowski
@ 2025-08-07 12:01 ` Maciej Fijalkowski
2025-08-07 16:37 ` Stanislav Fomichev
0 siblings, 1 reply; 8+ messages in thread
From: Maciej Fijalkowski @ 2025-08-07 12:01 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On Wed, Aug 06, 2025 at 10:42:58PM +0200, Maciej Fijalkowski wrote:
> On Wed, Aug 06, 2025 at 09:43:53AM -0700, Stanislav Fomichev wrote:
> > On 08/06, Maciej Fijalkowski wrote:
> > > Eryk reported an issue that I have put under Closes: tag, related to
> > > umem addrs being prematurely produced onto pool's completion queue.
> > > Let us make the skb's destructor responsible for producing all addrs
> > > that given skb used.
> > >
> > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > of addresses taken from processed descriptors that will be carried via
> > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> > > onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
> > >
> > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > from day 1, but rather when xsk multi-buffer got introduced.
> > >
> > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > ---
> > > v1:
> > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > v2:
> > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > >
> > > v1->v2:
> > > * store addrs in array carried via destructor_arg instead having them
> > > stored in skb headroom; cleaner and less hacky approach;
> > > v2->v3:
> > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > * set err when xsk_addrs allocation fails (Dan)
> > > * change xsk_addrs layout to avoid holes
> > > * free xsk_addrs on error path
> > > * rebase
> > > ---
> > > include/net/xdp_sock.h | 1 +
> > > net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> > > net/xdp/xsk_queue.h | 12 ++++++
> > > 3 files changed, 89 insertions(+), 18 deletions(-)
> > >
> > > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > > index ce587a225661..5ba9ad4c110f 100644
> > > --- a/include/net/xdp_sock.h
> > > +++ b/include/net/xdp_sock.h
> > > @@ -61,6 +61,7 @@ struct xdp_sock {
> > > XSK_BOUND,
> > > XSK_UNBOUND,
> > > } state;
> > > + struct kmem_cache *xsk_addrs_cache;
> > >
> > > struct xsk_queue *tx ____cacheline_aligned_in_smp;
> > > struct list_head tx_list;
> > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > index 9c3acecc14b1..d77cde0131be 100644
> > > --- a/net/xdp/xsk.c
> > > +++ b/net/xdp/xsk.c
> > > @@ -36,6 +36,11 @@
> > > #define TX_BATCH_SIZE 32
> > > #define MAX_PER_SOCKET_BUDGET 32
> > >
> > > +struct xsk_addrs {
> > > + u64 addrs[MAX_SKB_FRAGS + 1];
> > > + u32 num_descs;
> > > +};
> > > +
> > > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > > {
> > > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > > @@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > > }
> > >
> > > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > > {
> > > unsigned long flags;
> > > int ret;
> > >
> > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > > + ret = xskq_prod_reserve(pool->cq);
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > >
> > > return ret;
> > > }
> > >
> > > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > > + struct sk_buff *skb)
> > > {
> > > + struct xsk_buff_pool *pool = xs->pool;
> > > + struct xsk_addrs *xsk_addrs;
> > > unsigned long flags;
> > > + u32 num_desc, i;
> > > + u32 idx;
> > > +
> > > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + num_desc = xsk_addrs->num_descs;
> > >
> > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > - xskq_prod_submit_n(pool->cq, n);
> > > + idx = xskq_get_prod(pool->cq);
> > > +
> > > + for (i = 0; i < num_desc; i++, idx++)
> > > + xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
> >
> > optional nit: maybe do xskq_prod_write_addr(, idx+i, ) instead of 'idx++'
> > in the loop? I got a bit confused here until I spotted that idx++..
> > But up to you, feel free to ignore, maybe it's just me.
Ugh, I missed these comments. Sure, I can do that.
> >
> > > + xskq_prod_submit_n(pool->cq, num_desc);
> > > +
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > + kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
> > > }
> > >
> > > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > @@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > }
> > >
> > > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > -{
> > > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > > -}
> > > -
> > > static void xsk_destruct_skb(struct sk_buff *skb)
> > > {
> > > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > >
> >
> > [..]
> >
> > > - if (compl->tx_timestamp) {
> > > + if (compl->tx_timestamp)
> > > /* sw completion timestamp, not a real one */
> > > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > > - }
> >
> > Seems to be unrelated, can probably drop if you happen to respin?
Yes, I'll pull out this sophisticated change to a separate commit :P
> >
> > > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > > sock_wfree(skb);
> > > }
> > >
> > > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > +{
> > > + struct xsk_addrs *addrs;
> > > +
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + return addrs->num_descs;
> > > +}
> > > +
> > > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > > {
> > > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > > +}
> > > +
> > > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > > +{
> > > + struct xsk_addrs *addrs;
> > >
> > > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + addrs->num_descs++;
> > > }
> > >
> > > static void xsk_consume_skb(struct sk_buff *skb)
> > > {
> > > struct xdp_sock *xs = xdp_sk(skb->sk);
> > >
> > > + kmem_cache_free(xs->xsk_addrs_cache,
> > > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> > > skb->destructor = sock_wfree;
> > > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > > /* Free skb without triggering the perf drop trace */
> > > @@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > {
> > > struct xsk_buff_pool *pool = xs->pool;
> > > u32 hr, len, ts, offset, copy, copied;
> > > + struct xsk_addrs *addrs = NULL;
> > > struct sk_buff *skb = xs->skb;
> > > struct page *page;
> > > void *buffer;
> > > @@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > return ERR_PTR(err);
> > >
> > > skb_reserve(skb, hr);
> > > +
> > > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > > + if (!addrs)
> > > + return ERR_PTR(-ENOMEM);
> > > +
> > > + xsk_set_destructor_arg(skb, addrs);
> > > }
> > >
> > > addr = desc->addr;
> > > @@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > {
> > > struct xsk_tx_metadata *meta = NULL;
> > > struct net_device *dev = xs->dev;
> > > + struct xsk_addrs *addrs = NULL;
> > > struct sk_buff *skb = xs->skb;
> > > bool first_frag = false;
> > > int err;
> > > @@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > err = skb_store_bits(skb, 0, buffer, len);
> > > if (unlikely(err))
> > > goto free_err;
> > > +
> > > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > > + if (!addrs) {
> > > + err = -ENOMEM;
> > > + goto free_err;
> > > + }
> > > +
> > > + xsk_set_destructor_arg(skb, addrs);
> > > +
> > > } else {
> > > int nr_frags = skb_shinfo(skb)->nr_frags;
> > > struct page *page;
> > > @@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > > skb->destructor = xsk_destruct_skb;
> > > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > > - xsk_set_destructor_arg(skb);
> > > +
> > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > + addrs->addrs[addrs->num_descs++] = desc->addr;
> > >
> > > return skb;
> > >
> > > @@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > >
> > > if (err == -EOVERFLOW) {
> > > /* Drop the packet */
> > > - xsk_set_destructor_arg(xs->skb);
> > > + xsk_inc_skb_descs(xs->skb);
> > > xsk_drop_skb(xs->skb);
> > > xskq_cons_release(xs->tx);
> > > } else {
> > > @@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > > * if there is space in it. This avoids having to implement
> > > * any buffering in the Tx path.
> > > */
> > > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > > + err = xsk_cq_reserve_locked(xs->pool);
> > > if (err) {
> > > err = -EAGAIN;
> > > goto out;
> > > @@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
> > > xskq_destroy(xs->tx);
> > > xskq_destroy(xs->fq_tmp);
> > > xskq_destroy(xs->cq_tmp);
> > > + kmem_cache_destroy(xs->xsk_addrs_cache);
> > >
> > > sock_orphan(sk);
> > > sock->sk = NULL;
> > > @@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
> > >
> > > sock_prot_inuse_add(net, &xsk_proto, 1);
> > >
> >
> > [..]
> >
> > > + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > > + sizeof(struct xsk_addrs), 0,
> > > + SLAB_HWCACHE_ALIGN, NULL);
> > > +
> > > + if (!xs->xsk_addrs_cache) {
> > > + sk_free(sk);
> > > + return -ENOMEM;
> > > + }
> >
> > Should we move this up to happen before sk_add_node_rcu? Otherwise we
> > also have to do sk_del_node_init_rcu on !xs->xsk_addrs_cache here?
> >
> > Btw, alternatively, why not make this happen at bind time when we know
> > whether the socket is gonna be copy or zc? And do it only for the copy
> > mode?
>
> thanks for quick review Stan. makes sense to do it for copy mode only.
> i'll send next revision tomorrow.
FWIW syzbot reported an issue that "xsk_generic_xmit_cache" already
exists, so we should probably include the queue id within the name so
that each socket gets its own cache with a unique name.
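e.g. something along these lines, assuming this moves to bind time where
the queue id is known (untested sketch):

	char name[32];

	snprintf(name, sizeof(name), "xsk_xmit_cache_%u", qid);
	xs->xsk_addrs_cache = kmem_cache_create(name, sizeof(struct xsk_addrs),
						0, SLAB_HWCACHE_ALIGN, NULL);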
>
> Maciej
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-07 12:01 ` Maciej Fijalkowski
@ 2025-08-07 16:37 ` Stanislav Fomichev
2025-08-13 11:47 ` Maciej Fijalkowski
0 siblings, 1 reply; 8+ messages in thread
From: Stanislav Fomichev @ 2025-08-07 16:37 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On 08/07, Maciej Fijalkowski wrote:
> On Wed, Aug 06, 2025 at 10:42:58PM +0200, Maciej Fijalkowski wrote:
> > On Wed, Aug 06, 2025 at 09:43:53AM -0700, Stanislav Fomichev wrote:
> > > On 08/06, Maciej Fijalkowski wrote:
> > > > Eryk reported an issue that I have put under Closes: tag, related to
> > > > umem addrs being prematurely produced onto pool's completion queue.
> > > > Let us make the skb's destructor responsible for producing all addrs
> > > > that given skb used.
> > > >
> > > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > > of addresses taken from processed descriptors that will be carried via
> > > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > > coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> > > > onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
> > > >
> > > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > > from day 1, but rather when xsk multi-buffer got introduced.
> > > >
> > > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > > ---
> > > > v1:
> > > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > > v2:
> > > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > > >
> > > > v1->v2:
> > > > * store addrs in array carried via destructor_arg instead having them
> > > > stored in skb headroom; cleaner and less hacky approach;
> > > > v2->v3:
> > > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > > * set err when xsk_addrs allocation fails (Dan)
> > > > * change xsk_addrs layout to avoid holes
> > > > * free xsk_addrs on error path
> > > > * rebase
> > > > ---
> > > > include/net/xdp_sock.h | 1 +
> > > > net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> > > > net/xdp/xsk_queue.h | 12 ++++++
> > > > 3 files changed, 89 insertions(+), 18 deletions(-)
> > > >
> > > > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> > > > index ce587a225661..5ba9ad4c110f 100644
> > > > --- a/include/net/xdp_sock.h
> > > > +++ b/include/net/xdp_sock.h
> > > > @@ -61,6 +61,7 @@ struct xdp_sock {
> > > > XSK_BOUND,
> > > > XSK_UNBOUND,
> > > > } state;
> > > > + struct kmem_cache *xsk_addrs_cache;
> > > >
> > > > struct xsk_queue *tx ____cacheline_aligned_in_smp;
> > > > struct list_head tx_list;
> > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > index 9c3acecc14b1..d77cde0131be 100644
> > > > --- a/net/xdp/xsk.c
> > > > +++ b/net/xdp/xsk.c
> > > > @@ -36,6 +36,11 @@
> > > > #define TX_BATCH_SIZE 32
> > > > #define MAX_PER_SOCKET_BUDGET 32
> > > >
> > > > +struct xsk_addrs {
> > > > + u64 addrs[MAX_SKB_FRAGS + 1];
> > > > + u32 num_descs;
> > > > +};
> > > > +
> > > > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> > > > {
> > > > if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > > > @@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> > > > return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> > > > }
> > > >
> > > > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > > > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> > > > {
> > > > unsigned long flags;
> > > > int ret;
> > > >
> > > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > > - ret = xskq_prod_reserve_addr(pool->cq, addr);
> > > > + ret = xskq_prod_reserve(pool->cq);
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > >
> > > > return ret;
> > > > }
> > > >
> > > > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > > > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > > > + struct sk_buff *skb)
> > > > {
> > > > + struct xsk_buff_pool *pool = xs->pool;
> > > > + struct xsk_addrs *xsk_addrs;
> > > > unsigned long flags;
> > > > + u32 num_desc, i;
> > > > + u32 idx;
> > > > +
> > > > + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + num_desc = xsk_addrs->num_descs;
> > > >
> > > > spin_lock_irqsave(&pool->cq_lock, flags);
> > > > - xskq_prod_submit_n(pool->cq, n);
> > > > + idx = xskq_get_prod(pool->cq);
> > > > +
> > > > + for (i = 0; i < num_desc; i++, idx++)
> > > > + xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
> > >
> > > optional nit: maybe do xskq_prod_write_addr(, idx+i, ) instead of 'idx++'
> > > in the loop? I got a bit confused here until I spotted that idx++..
> > > But up to you, feel free to ignore, maybe it's just me.
>
> ugh i missed these comments. sure i can do that.
>
> > >
> > > > + xskq_prod_submit_n(pool->cq, num_desc);
> > > > +
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > > + kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
> > > > }
> > > >
> > > > static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > > @@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > > > spin_unlock_irqrestore(&pool->cq_lock, flags);
> > > > }
> > > >
> > > > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > > -{
> > > > - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > > > -}
> > > > -
> > > > static void xsk_destruct_skb(struct sk_buff *skb)
> > > > {
> > > > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > > >
> > >
> > > [..]
> > >
> > > > - if (compl->tx_timestamp) {
> > > > + if (compl->tx_timestamp)
> > > > /* sw completion timestamp, not a real one */
> > > > *compl->tx_timestamp = ktime_get_tai_fast_ns();
> > > > - }
> > >
> > > Seems to be unrelated, can probably drop if you happen to respin?
>
> yes, i'll pull out this sophisticated change to separate commit:P
>
> > >
> > > > - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > > > + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> > > > sock_wfree(skb);
> > > > }
> > > >
> > > > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > > > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > > +{
> > > > + struct xsk_addrs *addrs;
> > > > +
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + return addrs->num_descs;
> > > > +}
> > > > +
> > > > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > > > {
> > > > - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > > > + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > > > +}
> > > > +
> > > > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > > > +{
> > > > + struct xsk_addrs *addrs;
> > > >
> > > > - skb_shinfo(skb)->destructor_arg = (void *)num;
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + addrs->num_descs++;
> > > > }
> > > >
> > > > static void xsk_consume_skb(struct sk_buff *skb)
> > > > {
> > > > struct xdp_sock *xs = xdp_sk(skb->sk);
> > > >
> > > > + kmem_cache_free(xs->xsk_addrs_cache,
> > > > + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> > > > skb->destructor = sock_wfree;
> > > > xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> > > > /* Free skb without triggering the perf drop trace */
> > > > @@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > {
> > > > struct xsk_buff_pool *pool = xs->pool;
> > > > u32 hr, len, ts, offset, copy, copied;
> > > > + struct xsk_addrs *addrs = NULL;
> > > > struct sk_buff *skb = xs->skb;
> > > > struct page *page;
> > > > void *buffer;
> > > > @@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > return ERR_PTR(err);
> > > >
> > > > skb_reserve(skb, hr);
> > > > +
> > > > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > > > + if (!addrs)
> > > > + return ERR_PTR(-ENOMEM);
> > > > +
> > > > + xsk_set_destructor_arg(skb, addrs);
> > > > }
> > > >
> > > > addr = desc->addr;
> > > > @@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > {
> > > > struct xsk_tx_metadata *meta = NULL;
> > > > struct net_device *dev = xs->dev;
> > > > + struct xsk_addrs *addrs = NULL;
> > > > struct sk_buff *skb = xs->skb;
> > > > bool first_frag = false;
> > > > int err;
> > > > @@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > err = skb_store_bits(skb, 0, buffer, len);
> > > > if (unlikely(err))
> > > > goto free_err;
> > > > +
> > > > + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> > > > + if (!addrs) {
> > > > + err = -ENOMEM;
> > > > + goto free_err;
> > > > + }
> > > > +
> > > > + xsk_set_destructor_arg(skb, addrs);
> > > > +
> > > > } else {
> > > > int nr_frags = skb_shinfo(skb)->nr_frags;
> > > > struct page *page;
> > > > @@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > skb->mark = READ_ONCE(xs->sk.sk_mark);
> > > > skb->destructor = xsk_destruct_skb;
> > > > xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> > > > - xsk_set_destructor_arg(skb);
> > > > +
> > > > + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > > > + addrs->addrs[addrs->num_descs++] = desc->addr;
> > > >
> > > > return skb;
> > > >
> > > > @@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > >
> > > > if (err == -EOVERFLOW) {
> > > > /* Drop the packet */
> > > > - xsk_set_destructor_arg(xs->skb);
> > > > + xsk_inc_skb_descs(xs->skb);
> > > > xsk_drop_skb(xs->skb);
> > > > xskq_cons_release(xs->tx);
> > > > } else {
> > > > @@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> > > > * if there is space in it. This avoids having to implement
> > > > * any buffering in the Tx path.
> > > > */
> > > > - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> > > > + err = xsk_cq_reserve_locked(xs->pool);
> > > > if (err) {
> > > > err = -EAGAIN;
> > > > goto out;
> > > > @@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
> > > > xskq_destroy(xs->tx);
> > > > xskq_destroy(xs->fq_tmp);
> > > > xskq_destroy(xs->cq_tmp);
> > > > + kmem_cache_destroy(xs->xsk_addrs_cache);
> > > >
> > > > sock_orphan(sk);
> > > > sock->sk = NULL;
> > > > @@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
> > > >
> > > > sock_prot_inuse_add(net, &xsk_proto, 1);
> > > >
> > >
> > > [..]
> > >
> > > > + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > > > + sizeof(struct xsk_addrs), 0,
> > > > + SLAB_HWCACHE_ALIGN, NULL);
> > > > +
> > > > + if (!xs->xsk_addrs_cache) {
> > > > + sk_free(sk);
> > > > + return -ENOMEM;
> > > > + }
> > >
> > > Should we move this up to happen before sk_add_node_rcu? Otherwise we
> > > also have to do sk_del_node_init_rcu on !xs->xsk_addrs_cache here?
> > >
> > > Btw, alternatively, why not make this happen at bind time when we know
> > > whether the socket is gonna be copy or zc? And do it only for the copy
> > > mode?
> >
> > thanks for quick review Stan. makes sense to do it for copy mode only.
> > i'll send next revision tomorrow.
>
> FWIW syzbot reported an issue that a cache named "xsk_generic_xmit_cache"
> already exists, so probably we should include the queue id within the name
> so that each socket gets its own cache with a unique name.
Interesting. I was wondering whether it's gonna be confusing to see
multiple "xsk_generic_xmit_cache" entries in /proc/slabinfo, but looks
like it's not allowed :-)
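For reference, a rough sketch of the "create it at bind time, for copy mode
only" idea raised above (illustrative only; it assumes a spot in xsk_bind()
after xs->zc has been settled, and the error label is hypothetical):

	if (!xs->zc) {
		xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
							sizeof(struct xsk_addrs), 0,
							SLAB_HWCACHE_ALIGN, NULL);
		if (!xs->xsk_addrs_cache) {
			err = -ENOMEM;
			goto out_unlock; /* hypothetical cleanup label */
		}
	}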
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-06 15:41 [PATCH v3 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-06 16:43 ` Stanislav Fomichev
2025-08-07 8:06 ` [syzbot ci] " syzbot ci
@ 2025-08-08 9:48 ` Magnus Karlsson
2 siblings, 0 replies; 8+ messages in thread
From: Magnus Karlsson @ 2025-08-08 9:48 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On Wed, 6 Aug 2025 at 17:41, Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> Eryk reported an issue that I have put under Closes: tag, related to
> umem addrs being prematurely produced onto pool's completion queue.
> Let us make the skb's destructor responsible for producing all addrs
> that given skb used.
>
> Introduce struct xsk_addrs which will carry descriptor count with array
> of addresses taken from processed descriptors that will be carried via
> skb_shared_info::destructor_arg. This way we can refer to it within
> xsk_destruct_skb(). In order to mitigate the overhead that will be
> coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
>
> Commit from fixes tag introduced the buggy behavior, it was not broken
> from day 1, but rather when xsk multi-buffer got introduced.
>
> Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> v1:
> https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> v2:
> https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
>
> v1->v2:
> * store addrs in array carried via destructor_arg instead having them
> stored in skb headroom; cleaner and less hacky approach;
> v2->v3:
> * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> * set err when xsk_addrs allocation fails (Dan)
> * change xsk_addrs layout to avoid holes
> * free xsk_addrs on error path
> * rebase
> ---
> include/net/xdp_sock.h | 1 +
> net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> net/xdp/xsk_queue.h | 12 ++++++
> 3 files changed, 89 insertions(+), 18 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index ce587a225661..5ba9ad4c110f 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -61,6 +61,7 @@ struct xdp_sock {
> XSK_BOUND,
> XSK_UNBOUND,
> } state;
> + struct kmem_cache *xsk_addrs_cache;
>
> struct xsk_queue *tx ____cacheline_aligned_in_smp;
> struct list_head tx_list;
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 9c3acecc14b1..d77cde0131be 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,6 +36,11 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> +struct xsk_addrs {
> + u64 addrs[MAX_SKB_FRAGS + 1];
> + u32 num_descs;
> +};
As you will have to produce a v4, I suggest that you put num_descs
first in this struct. The current layout will lead to 2 cache line
accesses for the case when we have only one fragment. By placing it
first, this will be reduced to one cache line access. Yes, we will
create a hole, but I think wasting 4 bytes here is worth it. What do
you think?
Apart from that, looks good.
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
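For illustration, a minimal sketch of the reordered layout being suggested
here (editor's sketch, not code from the patch; the hole comes from the
8-byte alignment of the u64 array):

struct xsk_addrs {
	u32 num_descs;
	/* 4-byte hole on 64-bit, traded for keeping num_descs and
	 * addrs[0] on the same cache line in the single-frag case
	 */
	u64 addrs[MAX_SKB_FRAGS + 1];
};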
> +
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -532,25 +537,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> }
>
> -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> {
> unsigned long flags;
> int ret;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - ret = xskq_prod_reserve_addr(pool->cq, addr);
> + ret = xskq_prod_reserve(pool->cq);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
>
> return ret;
> }
>
> -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> + struct sk_buff *skb)
> {
> + struct xsk_buff_pool *pool = xs->pool;
> + struct xsk_addrs *xsk_addrs;
> unsigned long flags;
> + u32 num_desc, i;
> + u32 idx;
> +
> + xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + num_desc = xsk_addrs->num_descs;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> - xskq_prod_submit_n(pool->cq, n);
> + idx = xskq_get_prod(pool->cq);
> +
> + for (i = 0; i < num_desc; i++, idx++)
> + xskq_prod_write_addr(pool->cq, idx, xsk_addrs->addrs[i]);
> + xskq_prod_submit_n(pool->cq, num_desc);
> +
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> + kmem_cache_free(xs->xsk_addrs_cache, xsk_addrs);
> }
>
> static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> @@ -562,35 +581,45 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
>
> - if (compl->tx_timestamp) {
> + if (compl->tx_timestamp)
> /* sw completion timestamp, not a real one */
> *compl->tx_timestamp = ktime_get_tai_fast_ns();
> - }
>
> - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> + xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> sock_wfree(skb);
> }
>
> -static void xsk_set_destructor_arg(struct sk_buff *skb)
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + return addrs->num_descs;
> +}
> +
> +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> {
> - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> + skb_shinfo(skb)->destructor_arg = (void *)addrs;
> +}
> +
> +static void xsk_inc_skb_descs(struct sk_buff *skb)
> +{
> + struct xsk_addrs *addrs;
>
> - skb_shinfo(skb)->destructor_arg = (void *)num;
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->num_descs++;
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
>
> + kmem_cache_free(xs->xsk_addrs_cache,
> + (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> skb->destructor = sock_wfree;
> xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> /* Free skb without triggering the perf drop trace */
> @@ -609,6 +638,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -623,6 +653,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
>
> skb_reserve(skb, hr);
> +
> + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> + if (!addrs)
> + return ERR_PTR(-ENOMEM);
> +
> + xsk_set_destructor_arg(skb, addrs);
> }
>
> addr = desc->addr;
> @@ -662,6 +698,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> {
> struct xsk_tx_metadata *meta = NULL;
> struct net_device *dev = xs->dev;
> + struct xsk_addrs *addrs = NULL;
> struct sk_buff *skb = xs->skb;
> bool first_frag = false;
> int err;
> @@ -694,6 +731,15 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> err = skb_store_bits(skb, 0, buffer, len);
> if (unlikely(err))
> goto free_err;
> +
> + addrs = kmem_cache_zalloc(xs->xsk_addrs_cache, GFP_KERNEL);
> + if (!addrs) {
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_set_destructor_arg(skb, addrs);
> +
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> struct page *page;
> @@ -759,7 +805,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> skb->destructor = xsk_destruct_skb;
> xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
> - xsk_set_destructor_arg(skb);
> +
> + addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + addrs->addrs[addrs->num_descs++] = desc->addr;
>
> return skb;
>
> @@ -769,7 +817,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_set_destructor_arg(xs->skb);
> + xsk_inc_skb_descs(xs->skb);
> xsk_drop_skb(xs->skb);
> xskq_cons_release(xs->tx);
> } else {
> @@ -812,7 +860,7 @@ static int __xsk_generic_xmit(struct sock *sk)
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
> + err = xsk_cq_reserve_locked(xs->pool);
> if (err) {
> err = -EAGAIN;
> goto out;
> @@ -1122,6 +1170,7 @@ static int xsk_release(struct socket *sock)
> xskq_destroy(xs->tx);
> xskq_destroy(xs->fq_tmp);
> xskq_destroy(xs->cq_tmp);
> + kmem_cache_destroy(xs->xsk_addrs_cache);
>
> sock_orphan(sk);
> sock->sk = NULL;
> @@ -1765,6 +1814,15 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
>
> sock_prot_inuse_add(net, &xsk_proto, 1);
>
> + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> + sizeof(struct xsk_addrs), 0,
> + SLAB_HWCACHE_ALIGN, NULL);
> +
> + if (!xs->xsk_addrs_cache) {
> + sk_free(sk);
> + return -ENOMEM;
> + }
> +
> return 0;
> }
>
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 46d87e961ad6..f16f390370dc 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
>
> /* Functions for producers */
>
> +static inline u32 xskq_get_prod(struct xsk_queue *q)
> +{
> + return READ_ONCE(q->ring->producer);
> +}
> +
> static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
> {
> u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
> @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
> return 0;
> }
>
> +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
> +{
> + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
> +
> + ring->desc[idx & q->ring_mask] = addr;
> +}
> +
> static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
> u32 nb_entries)
> {
> --
> 2.34.1
>
>
* Re: [PATCH v3 bpf] xsk: fix immature cq descriptor production
2025-08-07 16:37 ` Stanislav Fomichev
@ 2025-08-13 11:47 ` Maciej Fijalkowski
0 siblings, 0 replies; 8+ messages in thread
From: Maciej Fijalkowski @ 2025-08-13 11:47 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson,
aleksander.lobakin, Eryk Kubanski
On Thu, Aug 07, 2025 at 09:37:20AM -0700, Stanislav Fomichev wrote:
> On 08/07, Maciej Fijalkowski wrote:
> > On Wed, Aug 06, 2025 at 10:42:58PM +0200, Maciej Fijalkowski wrote:
> > > On Wed, Aug 06, 2025 at 09:43:53AM -0700, Stanislav Fomichev wrote:
> > > > On 08/06, Maciej Fijalkowski wrote:
> > > > > Eryk reported an issue that I have put under Closes: tag, related to
> > > > > umem addrs being prematurely produced onto pool's completion queue.
> > > > > Let us make the skb's destructor responsible for producing all addrs
> > > > > that given skb used.
> > > > >
> > > > > Introduce struct xsk_addrs which will carry descriptor count with array
> > > > > of addresses taken from processed descriptors that will be carried via
> > > > > skb_shared_info::destructor_arg. This way we can refer to it within
> > > > > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > > > > coming from memory allocations, let us introduce kmem_cache of xsk_addrs
> > > > > onto xdp_sock. Utilize the existing struct hole in xdp_sock for that.
> > > > >
> > > > > Commit from fixes tag introduced the buggy behavior, it was not broken
> > > > > from day 1, but rather when xsk multi-buffer got introduced.
> > > > >
> > > > > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > > > > Reported-by: Eryk Kubanski <e.kubanski@partner.samsung.com>
> > > > > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/
> > > > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > > > > ---
> > > > > v1:
> > > > > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@intel.com/
> > > > > v2:
> > > > > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@intel.com/
> > > > >
> > > > > v1->v2:
> > > > > * store addrs in array carried via destructor_arg instead having them
> > > > > stored in skb headroom; cleaner and less hacky approach;
> > > > > v2->v3:
> > > > > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > > > > * set err when xsk_addrs allocation fails (Dan)
> > > > > * change xsk_addrs layout to avoid holes
> > > > > * free xsk_addrs on error path
> > > > > * rebase
> > > > > ---
> > > > > include/net/xdp_sock.h | 1 +
> > > > > net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++--------
> > > > > net/xdp/xsk_queue.h | 12 ++++++
> > > > > 3 files changed, 89 insertions(+), 18 deletions(-)
> > > > >
> > > >
(...)
> > > > > + xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
> > > > > + sizeof(struct xsk_addrs), 0,
> > > > > + SLAB_HWCACHE_ALIGN, NULL);
> > > > > +
> > > > > + if (!xs->xsk_addrs_cache) {
> > > > > + sk_free(sk);
> > > > > + return -ENOMEM;
> > > > > + }
> > > >
> > > > Should we move this up to happen before sk_add_node_rcu? Otherwise we
> > > > also have to do sk_del_node_init_rcu on !xs->xsk_addrs_cache here?
> > > >
> > > > Btw, alternatively, why not make this happen at bind time when we know
> > > > whether the socket is gonna be copy or zc? And do it only for the copy
> > > > mode?
> > >
> > > thanks for quick review Stan. makes sense to do it for copy mode only.
> > > i'll send next revision tomorrow.
> >
> > FWIW syzbot reported an issue that a cache named "xsk_generic_xmit_cache"
> > already exists, so probably we should include the queue id within the name
> > so that each socket gets its own cache with a unique name.
>
> Interesting. I was wondering whether it's gonna be confusing to see
> multiple "xsk_generic_xmit_cache" entries in /proc/slabinfo, but looks
> like it's not allowed :-)
I played with this a bit more; a side note is that I have not seen these
entries in /proc/slabinfo unless I provided SLAB_POISON to
kmem_cache_create().
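For context, that experiment amounts to adding SLAB_POISON to the flags
used in the patch (sketch only); presumably the debug flag keeps SLUB from
merging the cache into a compatible existing one, which is why it then gets
its own /proc/slabinfo entry:

	xs->xsk_addrs_cache = kmem_cache_create("xsk_generic_xmit_cache",
						sizeof(struct xsk_addrs), 0,
						SLAB_HWCACHE_ALIGN | SLAB_POISON,
						NULL);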
Besides, I think a solution where each socket adds its own kmem_cache is
not really scalable. In theory, if someone had a use case that required
loading a copy mode xsk socket per queue on a NIC, and there were multiple
such NICs on the system, you would end up with a pretty massive count of
kmem_caches. I am not sure what the consequences of that would be.
I came up with the idea of having these kmem_caches as percpu vars with
embedded refcounting. xsk will index them by queue id and, at bind time, if
the kmem_cache under a certain id was already created, we just bump the
refcnt.
I'll send a v4 with this implemented and I would appreciate input on
this :)
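To make that concrete, a very rough sketch of a shared, refcounted cache
registry (purely illustrative: the names, the fixed-size table standing in
for per-cpu storage, its size and the locking are all assumptions, not v4
code; struct xsk_addrs is the one introduced by this patch):

#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>

/* Placeholder bound on queue ids, just for the sketch. */
#define XSK_ADDRS_CACHE_MAX 64

struct xsk_addrs_cache_ref {
	struct kmem_cache *cache;
	refcount_t refcnt;
};

static struct xsk_addrs_cache_ref xsk_addrs_caches[XSK_ADDRS_CACHE_MAX];
static DEFINE_MUTEX(xsk_addrs_caches_lock);

/* Bind time, copy mode only: reuse the queue's cache if it already
 * exists, otherwise create it with an initial refcount of 1. The name is
 * kept as in the patch for simplicity; a per-queue-id name could be built
 * here to keep slabinfo entries distinct.
 */
static struct kmem_cache *xsk_addrs_cache_get(u32 queue_id)
{
	struct xsk_addrs_cache_ref *ref;
	struct kmem_cache *cache;

	if (queue_id >= XSK_ADDRS_CACHE_MAX)
		return NULL;

	ref = &xsk_addrs_caches[queue_id];
	mutex_lock(&xsk_addrs_caches_lock);
	if (ref->cache) {
		refcount_inc(&ref->refcnt);
	} else {
		ref->cache = kmem_cache_create("xsk_generic_xmit_cache",
					       sizeof(struct xsk_addrs), 0,
					       SLAB_HWCACHE_ALIGN, NULL);
		if (ref->cache)
			refcount_set(&ref->refcnt, 1);
	}
	cache = ref->cache;
	mutex_unlock(&xsk_addrs_caches_lock);

	return cache;
}

/* Socket release: destroy the cache once its last user is gone. */
static void xsk_addrs_cache_put(u32 queue_id)
{
	struct xsk_addrs_cache_ref *ref = &xsk_addrs_caches[queue_id];

	mutex_lock(&xsk_addrs_caches_lock);
	if (ref->cache && refcount_dec_and_test(&ref->refcnt)) {
		kmem_cache_destroy(ref->cache);
		ref->cache = NULL;
	}
	mutex_unlock(&xsk_addrs_caches_lock);
}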
end of thread, newest: ~2025-08-13 11:47 UTC
Thread overview: 8+ messages
2025-08-06 15:41 [PATCH v3 bpf] xsk: fix immature cq descriptor production Maciej Fijalkowski
2025-08-06 16:43 ` Stanislav Fomichev
2025-08-06 20:42 ` Maciej Fijalkowski
2025-08-07 12:01 ` Maciej Fijalkowski
2025-08-07 16:37 ` Stanislav Fomichev
2025-08-13 11:47 ` Maciej Fijalkowski
2025-08-07 8:06 ` [syzbot ci] " syzbot ci
2025-08-08 9:48 ` [PATCH v3 bpf] " Magnus Karlsson