* [PATCH net v4] xsk: avoid data corruption on cq descriptor number
@ 2025-11-18 12:48 Fernando Fernandez Mancera
2025-11-18 18:26 ` Maciej Fijalkowski
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Fernando Fernandez Mancera @ 2025-11-18 12:48 UTC (permalink / raw)
To: netdev
Cc: csmate, kerneljasonxing, maciej.fijalkowski, bpf, davem, edumazet,
kuba, pabeni, horms, Fernando Fernandez Mancera
Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), the descriptor number is stored in skb control block and
xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
pool's completion queue.
skb control block shouldn't be used for this purpose as after transmit
xsk doesn't have control over it and other subsystems could use it. This
leads to the following kernel panic due to a NULL pointer dereference.
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP NOPTI
CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
RIP: 0010:xsk_destruct_skb+0xd0/0x180
[...]
Call Trace:
<IRQ>
? napi_complete_done+0x7a/0x1a0
ip_rcv_core+0x1bb/0x340
ip_rcv+0x30/0x1f0
__netif_receive_skb_one_core+0x85/0xa0
process_backlog+0x87/0x130
__napi_poll+0x28/0x180
net_rx_action+0x339/0x420
handle_softirqs+0xdc/0x320
? handle_edge_irq+0x90/0x1e0
do_softirq.part.0+0x3b/0x60
</IRQ>
<TASK>
__local_bh_enable_ip+0x60/0x70
__dev_direct_xmit+0x14e/0x1f0
__xsk_generic_xmit+0x482/0xb70
? __remove_hrtimer+0x41/0xa0
? __xsk_generic_xmit+0x51/0xb70
? _raw_spin_unlock_irqrestore+0xe/0x40
xsk_sendmsg+0xda/0x1c0
__sys_sendto+0x1ee/0x200
__x64_sys_sendto+0x24/0x30
do_syscall_64+0x84/0x2f0
? __pfx_pollwake+0x10/0x10
? __rseq_handle_notify_resume+0xad/0x4c0
? restore_fpregs_from_fpstate+0x3c/0x90
? switch_fpu_return+0x5b/0xe0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
</TASK>
[...]
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
Instead use the skb destructor_arg pointer along with pointer tagging.
As pointers are always aligned to 8B, use the bottom bit to indicate
whether this is a single address or an allocated struct containing
several addresses.
Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
---
v2: remove some leftovers on skb_build and simplify fragmented traffic
logic
v3: drop skb extension approach, instead use pointer tagging in
destructor_arg to know whether we have a single address or an allocated
struct with multiple ones. Also, move from bpf to net as requested
v4: repost after rebasing
---
net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
1 file changed, 74 insertions(+), 56 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..d7354a3e2545 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,20 +36,13 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
-struct xsk_addr_node {
- u64 addr;
- struct list_head addr_node;
-};
-
-struct xsk_addr_head {
+struct xsk_addrs {
u32 num_descs;
- struct list_head addrs_list;
+ u64 addrs[MAX_SKB_FRAGS + 1];
};
static struct kmem_cache *xsk_tx_generic_cache;
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
return ret;
}
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+{
+ return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+}
+
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+ return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (xsk_skb_destructor_is_addr(skb))
+ return 1;
+
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ return xsk_addr->num_descs;
+}
+
static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
struct sk_buff *skb)
{
- struct xsk_addr_node *pos, *tmp;
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addrs *xsk_addr;
u32 descs_processed = 0;
unsigned long flags;
- u32 idx;
+ u32 idx, i;
spin_lock_irqsave(&pool->cq_lock, flags);
idx = xskq_get_prod(pool->cq);
- xskq_prod_write_addr(pool->cq, idx,
- (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
- descs_processed++;
+ if (unlikely(num_descs > 1)) {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- if (unlikely(XSKCB(skb)->num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ for (i = 0; i < num_descs; i++) {
xskq_prod_write_addr(pool->cq, idx + descs_processed,
- pos->addr);
+ xsk_addr->addrs[i]);
descs_processed++;
- list_del(&pos->addr_node);
- kmem_cache_free(xsk_tx_generic_cache, pos);
}
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+ } else {
+ xskq_prod_write_addr(pool->cq, idx,
+ xsk_skb_destructor_get_addr(skb));
+ descs_processed++;
}
xskq_prod_submit_n(pool->cq, descs_processed);
spin_unlock_irqrestore(&pool->cq_lock, flags);
@@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
-static void xsk_inc_num_desc(struct sk_buff *skb)
-{
- XSKCB(skb)->num_descs++;
-}
-
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
- return XSKCB(skb)->num_descs;
-}
-
static void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
u64 addr)
{
- BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
- INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
- XSKCB(skb)->num_descs = 0;
skb->destructor = xsk_destruct_skb;
- skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
u32 num_descs = xsk_get_num_desc(skb);
- struct xsk_addr_node *pos, *tmp;
+ struct xsk_addrs *xsk_addr;
if (unlikely(num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
- list_del(&pos->addr_node);
- kmem_cache_free(xsk_tx_generic_cache, pos);
- }
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
}
skb->destructor = sock_wfree;
@@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
- struct xsk_addr_node *xsk_addr;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
}
} else {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
+ struct xsk_addrs *xsk_addr;
+
+ if (xsk_skb_destructor_is_addr(skb)) {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+ GFP_KERNEL);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ xsk_addr->num_descs = 1;
+ xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ } else {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ }
/* in case of -EOVERFLOW that could happen below,
* xsk_consume_skb() will release this node as whole skb
* would be dropped, which implies freeing all list elements
*/
- xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
+ xsk_addr->num_descs++;
}
len = desc->len;
@@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
- struct xsk_addr_node *xsk_addr;
+ struct xsk_addrs *xsk_addr;
struct page *page;
u8 *vaddr;
@@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
- if (!xsk_addr) {
- __free_page(page);
- err = -ENOMEM;
- goto free_err;
+ if (xsk_skb_destructor_is_addr(skb)) {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+ GFP_KERNEL);
+ if (!xsk_addr) {
+ __free_page(page);
+ err = -ENOMEM;
+ goto free_err;
+ }
+
+ xsk_addr->num_descs = 1;
+ xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ } else {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
}
vaddr = kmap_local_page(page);
@@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
- xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
+ xsk_addr->num_descs++;
}
}
- xsk_inc_num_desc(skb);
-
return skb;
free_err:
@@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_inc_num_desc(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -1904,7 +1922,7 @@ static int __init xsk_init(void)
goto out_pernet;
xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
- sizeof(struct xsk_addr_node),
+ sizeof(struct xsk_addrs),
0, SLAB_HWCACHE_ALIGN, NULL);
if (!xsk_tx_generic_cache) {
err = -ENOMEM;
--
2.51.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-18 12:48 [PATCH net v4] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
@ 2025-11-18 18:26 ` Maciej Fijalkowski
2025-11-18 23:57 ` Jason Xing
2025-11-20 3:07 ` Jason Xing
2 siblings, 0 replies; 7+ messages in thread
From: Maciej Fijalkowski @ 2025-11-18 18:26 UTC (permalink / raw)
To: Fernando Fernandez Mancera
Cc: netdev, csmate, kerneljasonxing, bpf, davem, edumazet, kuba,
pabeni, horms
On Tue, Nov 18, 2025 at 01:48:07PM +0100, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in skb control block and
> xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> pool's completion queue.
>
> skb control block shouldn't be used for this purpose as after transmit
> xsk doesn't have control over it and other subsystems could use it. This
> leads to the following kernel panic due to a NULL pointer dereference.
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: Oops: 0000 [#1] SMP NOPTI
> CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
> RIP: 0010:xsk_destruct_skb+0xd0/0x180
> [...]
> Call Trace:
> <IRQ>
> ? napi_complete_done+0x7a/0x1a0
> ip_rcv_core+0x1bb/0x340
> ip_rcv+0x30/0x1f0
> __netif_receive_skb_one_core+0x85/0xa0
> process_backlog+0x87/0x130
> __napi_poll+0x28/0x180
> net_rx_action+0x339/0x420
> handle_softirqs+0xdc/0x320
> ? handle_edge_irq+0x90/0x1e0
> do_softirq.part.0+0x3b/0x60
> </IRQ>
> <TASK>
> __local_bh_enable_ip+0x60/0x70
> __dev_direct_xmit+0x14e/0x1f0
> __xsk_generic_xmit+0x482/0xb70
> ? __remove_hrtimer+0x41/0xa0
> ? __xsk_generic_xmit+0x51/0xb70
> ? _raw_spin_unlock_irqrestore+0xe/0x40
> xsk_sendmsg+0xda/0x1c0
> __sys_sendto+0x1ee/0x200
> __x64_sys_sendto+0x24/0x30
> do_syscall_64+0x84/0x2f0
> ? __pfx_pollwake+0x10/0x10
> ? __rseq_handle_notify_resume+0xad/0x4c0
> ? restore_fpregs_from_fpstate+0x3c/0x90
> ? switch_fpu_return+0x5b/0xe0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> </TASK>
> [...]
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>
> Instead use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8B, use the bottom bit to indicate
> whether this a single address or an allocated struct containing several
> addresses.
>
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
you haven't included my ack, so:
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> v2: remove some leftovers on skb_build and simplify fragmented traffic
> logic
>
> v3: drop skb extension approach, instead use pointer tagging in
> destructor_arg to know whether we have a single address or an allocated
> struct with multiple ones. Also, move from bpf to net as requested
>
> v4: repost after rebasing
> ---
> net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
> 1 file changed, 74 insertions(+), 56 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..d7354a3e2545 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,20 +36,13 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> -struct xsk_addr_node {
> - u64 addr;
> - struct list_head addr_node;
> -};
> -
> -struct xsk_addr_head {
> +struct xsk_addrs {
> u32 num_descs;
> - struct list_head addrs_list;
> + u64 addrs[MAX_SKB_FRAGS + 1];
> };
>
> static struct kmem_cache *xsk_tx_generic_cache;
>
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> -
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> return ret;
> }
>
> +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
> +{
> + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
> +}
> +
> +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
> +{
> + return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
> +}
> +
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> + struct xsk_addrs *xsk_addr;
> +
> + if (xsk_skb_destructor_is_addr(skb))
> + return 1;
> +
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +
> + return xsk_addr->num_descs;
> +}
> +
> static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
> struct sk_buff *skb)
> {
> - struct xsk_addr_node *pos, *tmp;
> + u32 num_descs = xsk_get_num_desc(skb);
> + struct xsk_addrs *xsk_addr;
> u32 descs_processed = 0;
> unsigned long flags;
> - u32 idx;
> + u32 idx, i;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> idx = xskq_get_prod(pool->cq);
>
> - xskq_prod_write_addr(pool->cq, idx,
> - (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> - descs_processed++;
> + if (unlikely(num_descs > 1)) {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>
> - if (unlikely(XSKCB(skb)->num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> + for (i = 0; i < num_descs; i++) {
> xskq_prod_write_addr(pool->cq, idx + descs_processed,
> - pos->addr);
> + xsk_addr->addrs[i]);
> descs_processed++;
> - list_del(&pos->addr_node);
> - kmem_cache_free(xsk_tx_generic_cache, pos);
> }
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> + } else {
> + xskq_prod_write_addr(pool->cq, idx,
> + xsk_skb_destructor_get_addr(skb));
> + descs_processed++;
> }
> xskq_prod_submit_n(pool->cq, descs_processed);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> @@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static void xsk_inc_num_desc(struct sk_buff *skb)
> -{
> - XSKCB(skb)->num_descs++;
> -}
> -
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return XSKCB(skb)->num_descs;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
> u64 addr)
> {
> - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> skb->dev = xs->dev;
> skb->priority = READ_ONCE(xs->sk.sk_priority);
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> - XSKCB(skb)->num_descs = 0;
> skb->destructor = xsk_destruct_skb;
> - skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
> u32 num_descs = xsk_get_num_desc(skb);
> - struct xsk_addr_node *pos, *tmp;
> + struct xsk_addrs *xsk_addr;
>
> if (unlikely(num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> - list_del(&pos->addr_node);
> - kmem_cache_free(xsk_tx_generic_cache, pos);
> - }
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> }
>
> skb->destructor = sock_wfree;
> @@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> - struct xsk_addr_node *xsk_addr;
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
> }
> } else {
> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> - if (!xsk_addr)
> - return ERR_PTR(-ENOMEM);
> + struct xsk_addrs *xsk_addr;
> +
> + if (xsk_skb_destructor_is_addr(skb)) {
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> + GFP_KERNEL);
> + if (!xsk_addr)
> + return ERR_PTR(-ENOMEM);
> +
> + xsk_addr->num_descs = 1;
> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> + } else {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + }
>
> /* in case of -EOVERFLOW that could happen below,
> * xsk_consume_skb() will release this node as whole skb
> * would be dropped, which implies freeing all list elements
> */
> - xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> + xsk_addr->num_descs++;
> }
>
> len = desc->len;
> @@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> }
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> - struct xsk_addr_node *xsk_addr;
> + struct xsk_addrs *xsk_addr;
> struct page *page;
> u8 *vaddr;
>
> @@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> goto free_err;
> }
>
> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> - if (!xsk_addr) {
> - __free_page(page);
> - err = -ENOMEM;
> - goto free_err;
> + if (xsk_skb_destructor_is_addr(skb)) {
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> + GFP_KERNEL);
> + if (!xsk_addr) {
> + __free_page(page);
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_addr->num_descs = 1;
> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> + } else {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> }
>
> vaddr = kmap_local_page(page);
> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>
> - xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> + xsk_addr->num_descs++;
> }
> }
>
> - xsk_inc_num_desc(skb);
> -
> return skb;
>
> free_err:
> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_inc_num_desc(xs->skb);
> xsk_drop_skb(xs->skb);
> xskq_cons_release(xs->tx);
> } else {
> @@ -1904,7 +1922,7 @@ static int __init xsk_init(void)
> goto out_pernet;
>
> xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> - sizeof(struct xsk_addr_node),
> + sizeof(struct xsk_addrs),
> 0, SLAB_HWCACHE_ALIGN, NULL);
> if (!xsk_tx_generic_cache) {
> err = -ENOMEM;
> --
> 2.51.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-18 12:48 [PATCH net v4] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
2025-11-18 18:26 ` Maciej Fijalkowski
@ 2025-11-18 23:57 ` Jason Xing
2025-11-20 3:07 ` Jason Xing
2 siblings, 0 replies; 7+ messages in thread
From: Jason Xing @ 2025-11-18 23:57 UTC (permalink / raw)
To: Fernando Fernandez Mancera
Cc: netdev, csmate, maciej.fijalkowski, bpf, davem, edumazet, kuba,
pabeni, horms
On Tue, Nov 18, 2025 at 8:48 PM Fernando Fernandez Mancera
<fmancera@suse.de> wrote:
>
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in skb control block and
> xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> pool's completion queue.
>
> skb control block shouldn't be used for this purpose as after transmit
> xsk doesn't have control over it and other subsystems could use it. This
> leads to the following kernel panic due to a NULL pointer dereference.
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: Oops: 0000 [#1] SMP NOPTI
> CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
> RIP: 0010:xsk_destruct_skb+0xd0/0x180
> [...]
> Call Trace:
> <IRQ>
> ? napi_complete_done+0x7a/0x1a0
> ip_rcv_core+0x1bb/0x340
> ip_rcv+0x30/0x1f0
> __netif_receive_skb_one_core+0x85/0xa0
> process_backlog+0x87/0x130
> __napi_poll+0x28/0x180
> net_rx_action+0x339/0x420
> handle_softirqs+0xdc/0x320
> ? handle_edge_irq+0x90/0x1e0
> do_softirq.part.0+0x3b/0x60
> </IRQ>
> <TASK>
> __local_bh_enable_ip+0x60/0x70
> __dev_direct_xmit+0x14e/0x1f0
> __xsk_generic_xmit+0x482/0xb70
> ? __remove_hrtimer+0x41/0xa0
> ? __xsk_generic_xmit+0x51/0xb70
> ? _raw_spin_unlock_irqrestore+0xe/0x40
> xsk_sendmsg+0xda/0x1c0
> __sys_sendto+0x1ee/0x200
> __x64_sys_sendto+0x24/0x30
> do_syscall_64+0x84/0x2f0
> ? __pfx_pollwake+0x10/0x10
> ? __rseq_handle_notify_resume+0xad/0x4c0
> ? restore_fpregs_from_fpstate+0x3c/0x90
> ? switch_fpu_return+0x5b/0xe0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> </TASK>
> [...]
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>
> Instead use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8B, use the bottom bit to indicate
> whether this a single address or an allocated struct containing several
> addresses.
>
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Thanks for your effort and the fix!
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-18 12:48 [PATCH net v4] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
2025-11-18 18:26 ` Maciej Fijalkowski
2025-11-18 23:57 ` Jason Xing
@ 2025-11-20 3:07 ` Jason Xing
2025-11-20 9:06 ` Fernando Fernandez Mancera
2 siblings, 1 reply; 7+ messages in thread
From: Jason Xing @ 2025-11-20 3:07 UTC (permalink / raw)
To: Fernando Fernandez Mancera
Cc: netdev, csmate, maciej.fijalkowski, bpf, davem, edumazet, kuba,
pabeni, horms
On Tue, Nov 18, 2025 at 8:48 PM Fernando Fernandez Mancera
<fmancera@suse.de> wrote:
>
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in skb control block and
> xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> pool's completion queue.
>
> skb control block shouldn't be used for this purpose as after transmit
> xsk doesn't have control over it and other subsystems could use it. This
> leads to the following kernel panic due to a NULL pointer dereference.
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: Oops: 0000 [#1] SMP NOPTI
> CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
> RIP: 0010:xsk_destruct_skb+0xd0/0x180
> [...]
> Call Trace:
> <IRQ>
> ? napi_complete_done+0x7a/0x1a0
> ip_rcv_core+0x1bb/0x340
> ip_rcv+0x30/0x1f0
> __netif_receive_skb_one_core+0x85/0xa0
> process_backlog+0x87/0x130
> __napi_poll+0x28/0x180
> net_rx_action+0x339/0x420
> handle_softirqs+0xdc/0x320
> ? handle_edge_irq+0x90/0x1e0
> do_softirq.part.0+0x3b/0x60
> </IRQ>
> <TASK>
> __local_bh_enable_ip+0x60/0x70
> __dev_direct_xmit+0x14e/0x1f0
> __xsk_generic_xmit+0x482/0xb70
> ? __remove_hrtimer+0x41/0xa0
> ? __xsk_generic_xmit+0x51/0xb70
> ? _raw_spin_unlock_irqrestore+0xe/0x40
> xsk_sendmsg+0xda/0x1c0
> __sys_sendto+0x1ee/0x200
> __x64_sys_sendto+0x24/0x30
> do_syscall_64+0x84/0x2f0
> ? __pfx_pollwake+0x10/0x10
> ? __rseq_handle_notify_resume+0xad/0x4c0
> ? restore_fpregs_from_fpstate+0x3c/0x90
> ? switch_fpu_return+0x5b/0xe0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> </TASK>
> [...]
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>
> Instead use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8B, use the bottom bit to indicate
> whether this a single address or an allocated struct containing several
> addresses.
>
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
> ---
> v2: remove some leftovers on skb_build and simplify fragmented traffic
> logic
>
> v3: drop skb extension approach, instead use pointer tagging in
> destructor_arg to know whether we have a single address or an allocated
> struct with multiple ones. Also, move from bpf to net as requested
>
> v4: repost after rebasing
> ---
> net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
> 1 file changed, 74 insertions(+), 56 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..d7354a3e2545 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,20 +36,13 @@
> #define TX_BATCH_SIZE 32
> #define MAX_PER_SOCKET_BUDGET 32
>
> -struct xsk_addr_node {
> - u64 addr;
> - struct list_head addr_node;
> -};
> -
> -struct xsk_addr_head {
> +struct xsk_addrs {
> u32 num_descs;
> - struct list_head addrs_list;
> + u64 addrs[MAX_SKB_FRAGS + 1];
> };
>
> static struct kmem_cache *xsk_tx_generic_cache;
>
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> -
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> return ret;
> }
>
> +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
> +{
> + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
> +}
> +
> +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
> +{
> + return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
> +}
> +
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> + struct xsk_addrs *xsk_addr;
> +
> + if (xsk_skb_destructor_is_addr(skb))
> + return 1;
> +
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +
> + return xsk_addr->num_descs;
> +}
> +
> static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
> struct sk_buff *skb)
> {
> - struct xsk_addr_node *pos, *tmp;
> + u32 num_descs = xsk_get_num_desc(skb);
> + struct xsk_addrs *xsk_addr;
> u32 descs_processed = 0;
> unsigned long flags;
> - u32 idx;
> + u32 idx, i;
>
> spin_lock_irqsave(&pool->cq_lock, flags);
> idx = xskq_get_prod(pool->cq);
>
> - xskq_prod_write_addr(pool->cq, idx,
> - (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> - descs_processed++;
> + if (unlikely(num_descs > 1)) {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>
> - if (unlikely(XSKCB(skb)->num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> + for (i = 0; i < num_descs; i++) {
> xskq_prod_write_addr(pool->cq, idx + descs_processed,
> - pos->addr);
> + xsk_addr->addrs[i]);
> descs_processed++;
> - list_del(&pos->addr_node);
> - kmem_cache_free(xsk_tx_generic_cache, pos);
> }
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> + } else {
> + xskq_prod_write_addr(pool->cq, idx,
> + xsk_skb_destructor_get_addr(skb));
> + descs_processed++;
> }
> xskq_prod_submit_n(pool->cq, descs_processed);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> @@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
>
> -static void xsk_inc_num_desc(struct sk_buff *skb)
> -{
> - XSKCB(skb)->num_descs++;
> -}
> -
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> - return XSKCB(skb)->num_descs;
> -}
> -
> static void xsk_destruct_skb(struct sk_buff *skb)
> {
> struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
> u64 addr)
> {
> - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> skb->dev = xs->dev;
> skb->priority = READ_ONCE(xs->sk.sk_priority);
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> - XSKCB(skb)->num_descs = 0;
> skb->destructor = xsk_destruct_skb;
> - skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> {
> struct xdp_sock *xs = xdp_sk(skb->sk);
> u32 num_descs = xsk_get_num_desc(skb);
> - struct xsk_addr_node *pos, *tmp;
> + struct xsk_addrs *xsk_addr;
>
> if (unlikely(num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> - list_del(&pos->addr_node);
> - kmem_cache_free(xsk_tx_generic_cache, pos);
> - }
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> }
>
> skb->destructor = sock_wfree;
> @@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> {
> struct xsk_buff_pool *pool = xs->pool;
> u32 hr, len, ts, offset, copy, copied;
> - struct xsk_addr_node *xsk_addr;
> struct sk_buff *skb = xs->skb;
> struct page *page;
> void *buffer;
> @@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> return ERR_PTR(err);
> }
> } else {
> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> - if (!xsk_addr)
> - return ERR_PTR(-ENOMEM);
> + struct xsk_addrs *xsk_addr;
> +
> + if (xsk_skb_destructor_is_addr(skb)) {
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> + GFP_KERNEL);
> + if (!xsk_addr)
> + return ERR_PTR(-ENOMEM);
> +
> + xsk_addr->num_descs = 1;
> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> + } else {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> + }
>
> /* in case of -EOVERFLOW that could happen below,
> * xsk_consume_skb() will release this node as whole skb
> * would be dropped, which implies freeing all list elements
> */
> - xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> + xsk_addr->num_descs++;
> }
>
> len = desc->len;
> @@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> }
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> - struct xsk_addr_node *xsk_addr;
> + struct xsk_addrs *xsk_addr;
> struct page *page;
> u8 *vaddr;
>
> @@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> goto free_err;
> }
>
> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> - if (!xsk_addr) {
> - __free_page(page);
> - err = -ENOMEM;
> - goto free_err;
> + if (xsk_skb_destructor_is_addr(skb)) {
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> + GFP_KERNEL);
> + if (!xsk_addr) {
> + __free_page(page);
> + err = -ENOMEM;
> + goto free_err;
> + }
> +
> + xsk_addr->num_descs = 1;
> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> + } else {
> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> }
>
> vaddr = kmap_local_page(page);
> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>
> - xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> + xsk_addr->num_descs++;
Wait, it's too late to increment it... Please find below.
> }
> }
>
> - xsk_inc_num_desc(skb);
> -
> return skb;
>
> free_err:
> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
> if (err == -EOVERFLOW) {
> /* Drop the packet */
> - xsk_inc_num_desc(xs->skb);
Why did you remove this line? The error can occur in the above hidden
snippet[1] without IFF_TX_SKB_NO_LINEAR setting and then we will fail
to increment it by one.
[1]: https://elixir.bootlin.com/linux/v6.18-rc6/source/net/xdp/xsk.c#L821
Thanks,
Jason
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-20 3:07 ` Jason Xing
@ 2025-11-20 9:06 ` Fernando Fernandez Mancera
2025-11-20 10:56 ` Jason Xing
0 siblings, 1 reply; 7+ messages in thread
From: Fernando Fernandez Mancera @ 2025-11-20 9:06 UTC (permalink / raw)
To: Jason Xing
Cc: netdev, csmate, maciej.fijalkowski, bpf, davem, edumazet, kuba,
pabeni, horms
On 11/20/25 4:07 AM, Jason Xing wrote:
> On Tue, Nov 18, 2025 at 8:48 PM Fernando Fernandez Mancera
> <fmancera@suse.de> wrote:
[...]>> @@ -828,11 +840,20 @@ static struct sk_buff
*xsk_build_skb(struct xdp_sock *xs,
>> goto free_err;
>> }
>>
>> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
>> - if (!xsk_addr) {
>> - __free_page(page);
>> - err = -ENOMEM;
>> - goto free_err;
>> + if (xsk_skb_destructor_is_addr(skb)) {
>> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
>> + GFP_KERNEL);
>> + if (!xsk_addr) {
>> + __free_page(page);
>> + err = -ENOMEM;
>> + goto free_err;
>> + }
>> +
>> + xsk_addr->num_descs = 1;
>> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
>> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
>> + } else {
>> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>> }
>>
>> vaddr = kmap_local_page(page);
>> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>> skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
>> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>>
>> - xsk_addr->addr = desc->addr;
>> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
>> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
>> + xsk_addr->num_descs++;
>
> Wait, it's too late to increment it... Please find below.
>
>> }
>> }
>>
>> - xsk_inc_num_desc(skb);
>> -
>> return skb;
>>
>> free_err:
>> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>
>> if (err == -EOVERFLOW) {
>> /* Drop the packet */
>> - xsk_inc_num_desc(xs->skb);
>
> Why did you remove this line? The error can occur in the above hidden
> snippet[1] without IFF_TX_SKB_NO_LINEAR setting and then we will fail
> to increment it by one.
>
>
That is a good catch. Let me fix this logic. I missed that the
-EOVERFLOW is returned at different points for xsk_build_skb_zerocopy()
and xsk_build_skb(). Keeping the increment logic as it was is better.
> [1]: https://elixir.bootlin.com/linux/v6.18-rc6/source/net/xdp/xsk.c#L821
>
> Thanks,
> Jason
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-20 9:06 ` Fernando Fernandez Mancera
@ 2025-11-20 10:56 ` Jason Xing
2025-11-20 11:04 ` Fernando Fernandez Mancera
0 siblings, 1 reply; 7+ messages in thread
From: Jason Xing @ 2025-11-20 10:56 UTC (permalink / raw)
To: Fernando Fernandez Mancera
Cc: netdev, csmate, maciej.fijalkowski, bpf, davem, edumazet, kuba,
pabeni, horms
On Thu, Nov 20, 2025 at 5:06 PM Fernando Fernandez Mancera
<fmancera@suse.de> wrote:
>
>
>
> On 11/20/25 4:07 AM, Jason Xing wrote:
> > On Tue, Nov 18, 2025 at 8:48 PM Fernando Fernandez Mancera
> > <fmancera@suse.de> wrote:
> [...]>> @@ -828,11 +840,20 @@ static struct sk_buff
> *xsk_build_skb(struct xdp_sock *xs,
> >> goto free_err;
> >> }
> >>
> >> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> >> - if (!xsk_addr) {
> >> - __free_page(page);
> >> - err = -ENOMEM;
> >> - goto free_err;
> >> + if (xsk_skb_destructor_is_addr(skb)) {
> >> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> >> + GFP_KERNEL);
> >> + if (!xsk_addr) {
> >> + __free_page(page);
> >> + err = -ENOMEM;
> >> + goto free_err;
> >> + }
> >> +
> >> + xsk_addr->num_descs = 1;
> >> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> >> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> >> + } else {
> >> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> >> }
> >>
> >> vaddr = kmap_local_page(page);
> >> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >> skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
> >> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
> >>
> >> - xsk_addr->addr = desc->addr;
> >> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> >> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> >> + xsk_addr->num_descs++;
> >
> > Wait, it's too late to increment it... Please find below.
> >
> >> }
> >> }
> >>
> >> - xsk_inc_num_desc(skb);
> >> -
>
>
>
> >> return skb;
> >>
> >> free_err:
> >> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >>
> >> if (err == -EOVERFLOW) {
> >> /* Drop the packet */
> >> - xsk_inc_num_desc(xs->skb);
> >
> > Why did you remove this line? The error can occur in the above hidden
> > snippet[1] without IFF_TX_SKB_NO_LINEAR setting and then we will fail
> > to increment it by one.
> >
> >
> That is a good catch. Let me fix this logic.. I missed that the
> -EOVERFLOW is returned in different moments for xsk_build_skb_zerocopy()
> and xsk_build_skb(). Keeping the increment logic as it was it is better.
Right. Thanks!
My new solution based on net-next with your patch is ready now :)
Thanks,
Jason
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net v4] xsk: avoid data corruption on cq descriptor number
2025-11-20 10:56 ` Jason Xing
@ 2025-11-20 11:04 ` Fernando Fernandez Mancera
0 siblings, 0 replies; 7+ messages in thread
From: Fernando Fernandez Mancera @ 2025-11-20 11:04 UTC (permalink / raw)
To: Jason Xing
Cc: netdev, csmate, maciej.fijalkowski, bpf, davem, edumazet, kuba,
pabeni, horms
On 11/20/25 11:56 AM, Jason Xing wrote:
> On Thu, Nov 20, 2025 at 5:06 PM Fernando Fernandez Mancera
> <fmancera@suse.de> wrote:
>>
>>
>>
>> On 11/20/25 4:07 AM, Jason Xing wrote:
>>> On Tue, Nov 18, 2025 at 8:48 PM Fernando Fernandez Mancera
>>> <fmancera@suse.de> wrote:
>> [...]>> @@ -828,11 +840,20 @@ static struct sk_buff
>> *xsk_build_skb(struct xdp_sock *xs,
>>>> goto free_err;
>>>> }
>>>>
>>>> - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
>>>> - if (!xsk_addr) {
>>>> - __free_page(page);
>>>> - err = -ENOMEM;
>>>> - goto free_err;
>>>> + if (xsk_skb_destructor_is_addr(skb)) {
>>>> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
>>>> + GFP_KERNEL);
>>>> + if (!xsk_addr) {
>>>> + __free_page(page);
>>>> + err = -ENOMEM;
>>>> + goto free_err;
>>>> + }
>>>> +
>>>> + xsk_addr->num_descs = 1;
>>>> + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
>>>> + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
>>>> + } else {
>>>> + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>>>> }
>>>>
>>>> vaddr = kmap_local_page(page);
>>>> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>>> skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
>>>> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>>>>
>>>> - xsk_addr->addr = desc->addr;
>>>> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
>>>> + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
>>>> + xsk_addr->num_descs++;
>>>
>>> Wait, it's too late to increment it... Please find below.
>>>
>>>> }
>>>> }
>>>>
>>>> - xsk_inc_num_desc(skb);
>>>> -
>>
>>
>>
>>>> return skb;
>>>>
>>>> free_err:
>>>> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>>>
>>>> if (err == -EOVERFLOW) {
>>>> /* Drop the packet */
>>>> - xsk_inc_num_desc(xs->skb);
>>>
>>> Why did you remove this line? The error can occur in the above hidden
>>> snippet[1] without IFF_TX_SKB_NO_LINEAR setting and then we will fail
>>> to increment it by one.
>>>
>>>
>> That is a good catch. Let me fix this logic.. I missed that the
>> -EOVERFLOW is returned in different moments for xsk_build_skb_zerocopy()
>> and xsk_build_skb(). Keeping the increment logic as it was it is better.
>
> Right. Thanks!
>
> My new solution based on net-next with your patch is ready now :)
>
Awesome, thanks!
I just sent out the v5.
@Maciej I dropped your ACK since I changed the code. Feel free to add it
back, and sorry for making you take a look at it again.
> Thanks,
> Jason
>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-11-20 11:04 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-18 12:48 [PATCH net v4] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
2025-11-18 18:26 ` Maciej Fijalkowski
2025-11-18 23:57 ` Jason Xing
2025-11-20 3:07 ` Jason Xing
2025-11-20 9:06 ` Fernando Fernandez Mancera
2025-11-20 10:56 ` Jason Xing
2025-11-20 11:04 ` Fernando Fernandez Mancera
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).