* [PATCH net v2] xsk: avoid data corruption on cq descriptor number
From: Fernando Fernandez Mancera @ 2025-10-24 10:40 UTC
To: netdev
Cc: csmate, kerneljasonxing, maciej.fijalkowski, bjorn, sdf,
jonathan.lemon, bpf, davem, edumazet, kuba, pabeni, horms,
Fernando Fernandez Mancera
Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), the descriptor number is stored in the skb control block
and xsk_cq_submit_addr_locked() relies on it to put the umem addresses
onto the pool's completion queue.

The skb control block shouldn't be used for this purpose: after
transmit, xsk no longer has control over it and other subsystems may
overwrite it. This leads to the following kernel panic due to a NULL
pointer dereference.
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP NOPTI
CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
RIP: 0010:xsk_destruct_skb+0xd0/0x180
[...]
Call Trace:
<IRQ>
? napi_complete_done+0x7a/0x1a0
ip_rcv_core+0x1bb/0x340
ip_rcv+0x30/0x1f0
__netif_receive_skb_one_core+0x85/0xa0
process_backlog+0x87/0x130
__napi_poll+0x28/0x180
net_rx_action+0x339/0x420
handle_softirqs+0xdc/0x320
? handle_edge_irq+0x90/0x1e0
do_softirq.part.0+0x3b/0x60
</IRQ>
<TASK>
__local_bh_enable_ip+0x60/0x70
__dev_direct_xmit+0x14e/0x1f0
__xsk_generic_xmit+0x482/0xb70
? __remove_hrtimer+0x41/0xa0
? __xsk_generic_xmit+0x51/0xb70
? _raw_spin_unlock_irqrestore+0xe/0x40
xsk_sendmsg+0xda/0x1c0
__sys_sendto+0x1ee/0x200
__x64_sys_sendto+0x24/0x30
do_syscall_64+0x84/0x2f0
? __pfx_pollwake+0x10/0x10
? __rseq_handle_notify_resume+0xad/0x4c0
? restore_fpregs_from_fpstate+0x3c/0x90
? switch_fpu_return+0x5b/0xe0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
</TASK>
[...]
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
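Underlying the crash is the fact that xsk and the IP receive path both
treat the same 48-byte skb->cb area as their own. A minimal sketch of
the overlap (abbreviated, not the full kernel definitions):

/* xsk's view of skb->cb, i.e. the struct removed below */
struct xsk_addr_head {
	u32 num_descs;
	struct list_head addrs_list;
};
#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))

/* the IP layer's view of the very same bytes (include/net/ip.h) */
#define IPCB(skb) ((struct inet_skb_parm *)((skb)->cb))

Once the looped-back skb has passed through ip_rcv(), the IPCB() state
has overwritten the XSKCB() state, so the list pointers read later in
xsk_destruct_skb() are garbage or NULL.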
The proposed approach stores the first address in the xsk_addr_node as
well, along with the number of descriptors. The head xsk_addr_node is
referenced by skb_shinfo(skb)->destructor_arg. The remaining fragments
store their addresses on the list.

This is less efficient, as 4 bytes are wasted for each stored address.
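Concretely, all per-skb completion state now lives in slab memory owned
by xsk. The resulting node layout, as introduced by the diff below
(comments added here for illustration only):

struct xsk_addr_node {
	u32 num_descs;		/* head node only: total descriptors */
	u64 addr;		/* umem address of one descriptor */
	struct list_head addr_node;	/* head: list of fragment nodes;
					 * fragments: link into that list */
};

/* the head node replaces both the skb->cb state and the raw-address
 * destructor_arg trick */
#define XSK_TX_HEAD(skb) \
	((struct xsk_addr_node *)(skb_shinfo(skb)->destructor_arg))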
Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
---
v2: fix erroneous page handling and fix typos in the commit message.
---
net/xdp/xsk.c | 52 ++++++++++++++++++++++++++++-----------------------
1 file changed, 29 insertions(+), 23 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..965cf071b036 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -37,18 +37,14 @@
#define MAX_PER_SOCKET_BUDGET 32
struct xsk_addr_node {
+ u32 num_descs;
u64 addr;
struct list_head addr_node;
};
-struct xsk_addr_head {
- u32 num_descs;
- struct list_head addrs_list;
-};
-
static struct kmem_cache *xsk_tx_generic_cache;
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
+#define XSK_TX_HEAD(skb) ((struct xsk_addr_node *)((skb_shinfo(skb)->destructor_arg)))
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
@@ -569,12 +565,11 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
spin_lock_irqsave(&pool->cq_lock, flags);
idx = xskq_get_prod(pool->cq);
- xskq_prod_write_addr(pool->cq, idx,
- (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
+ xskq_prod_write_addr(pool->cq, idx, XSK_TX_HEAD(skb)->addr);
descs_processed++;
- if (unlikely(XSKCB(skb)->num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ if (unlikely(XSK_TX_HEAD(skb)->num_descs > 1)) {
+ list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
xskq_prod_write_addr(pool->cq, idx + descs_processed,
pos->addr);
descs_processed++;
@@ -582,6 +577,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
kmem_cache_free(xsk_tx_generic_cache, pos);
}
}
+ kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
xskq_prod_submit_n(pool->cq, descs_processed);
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
@@ -597,12 +593,12 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
static void xsk_inc_num_desc(struct sk_buff *skb)
{
- XSKCB(skb)->num_descs++;
+ XSK_TX_HEAD(skb)->num_descs++;
}
static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- return XSKCB(skb)->num_descs;
+ return XSK_TX_HEAD(skb)->num_descs;
}
static void xsk_destruct_skb(struct sk_buff *skb)
@@ -619,16 +615,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
}
static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
- u64 addr)
+ struct xsk_addr_node *head, u64 addr)
{
- BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
- INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
+ INIT_LIST_HEAD(&head->addr_node);
+ head->addr = addr;
+ head->num_descs = 0;
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
- XSKCB(skb)->num_descs = 0;
skb->destructor = xsk_destruct_skb;
- skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+ skb_shinfo(skb)->destructor_arg = (void *)head;
}
static void xsk_consume_skb(struct sk_buff *skb)
@@ -638,11 +634,12 @@ static void xsk_consume_skb(struct sk_buff *skb)
struct xsk_addr_node *pos, *tmp;
if (unlikely(num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
list_del(&pos->addr_node);
kmem_cache_free(xsk_tx_generic_cache, pos);
}
}
+ kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
skb->destructor = sock_wfree;
xsk_cq_cancel_locked(xs->pool, num_descs);
@@ -720,7 +717,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
skb_reserve(skb, hr);
- xsk_skb_init_misc(skb, xs, desc->addr);
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
if (unlikely(err))
@@ -736,7 +737,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
* would be dropped, which implies freeing all list elements
*/
xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
}
len = desc->len;
@@ -773,6 +774,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
struct net_device *dev = xs->dev;
+ struct xsk_addr_node *xsk_addr;
struct sk_buff *skb = xs->skb;
int err;
@@ -804,7 +806,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (unlikely(err))
goto free_err;
- xsk_skb_init_misc(skb, xs, desc->addr);
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+ xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc,
xs->pool, hr);
@@ -813,7 +820,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
- struct xsk_addr_node *xsk_addr;
struct page *page;
u8 *vaddr;
@@ -843,7 +849,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
}
}
--
2.51.0
* Re: [PATCH net v2] xsk: avoid data corruption on cq descriptor number
From: Maciej Fijalkowski @ 2025-10-24 17:16 UTC
To: Fernando Fernandez Mancera
Cc: netdev, csmate, kerneljasonxing, bjorn, sdf, jonathan.lemon, bpf,
davem, edumazet, kuba, pabeni, horms
On Fri, Oct 24, 2025 at 12:40:49PM +0200, Fernando Fernandez Mancera wrote:
> [...]
>
> The proposed approach stores the first address in the xsk_addr_node as
> well, along with the number of descriptors. The head xsk_addr_node is
> referenced by skb_shinfo(skb)->destructor_arg. The remaining fragments
> store their addresses on the list.
>
> This is less efficient, as 4 bytes are wasted for each stored address.
Hi Fernando,

it's not about the 4 bytes being wasted, but rather the memory
allocation that you introduce here. I tried hard to avoid hurting
non-fragmented traffic; below you can find the impact reported by Jason
for an approach similar to yours:

https://lore.kernel.org/bpf/CAL+tcoD=Gn6ZmJ+_Y48vPRyHVHmP-7irsx=fRVRnyhDrpTrEtQ@mail.gmail.com/

I assume this patch will yield a similar performance degradation...
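For reference, this is the allocation-free single-descriptor path that
the kmem_cache_zalloc() calls replace (reconstructed from the '-' lines
of your diff):

/* TX setup: the umem address itself rides in destructor_arg */
skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;

/* completion: read it straight back, no slab involvement */
xskq_prod_write_addr(pool->cq, idx,
		     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);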
> [...]
* Re: [PATCH net v2] xsk: avoid data corruption on cq descriptor number
From: Fernando Fernandez Mancera @ 2025-10-25 18:18 UTC
To: Maciej Fijalkowski
Cc: netdev, csmate, kerneljasonxing, bjorn, sdf, jonathan.lemon, bpf,
davem, edumazet, kuba, pabeni, horms
On 10/24/25 7:16 PM, Maciej Fijalkowski wrote:
> On Fri, Oct 24, 2025 at 12:40:49PM +0200, Fernando Fernandez Mancera wrote:
>> [...]
>
> Hi Fernando,
> it's not about the 4 bytes being wasted, but rather the memory
> allocation that you introduce here. I tried hard to avoid hurting
> non-fragmented traffic; below you can find the impact reported by Jason
> for an approach similar to yours:
> https://lore.kernel.org/bpf/CAL+tcoD=Gn6ZmJ+_Y48vPRyHVHmP-7irsx=fRVRnyhDrpTrEtQ@mail.gmail.com/
>
> I assume this patch will yield a similar performance degradation...
>
It does, thank you for explaining, Maciej. I have been thinking about
possible solutions and remembered skb extensions. If I am not wrong, it
shouldn't yield a performance degradation, or at least it should be a
much less severe one. However, the XDP_SOCKETS Kconfig entry would
require "select SKB_EXTENSIONS".
What do you think about this approach? I could draft a series for
net-next.. I am just looking for options other than the skb control
block, because I believe similar issues will arise in the future even
if we fix the current one in ip_rcv..
Thanks,
Fernando.
* Re: [PATCH net v2] xsk: avoid data corruption on cq descriptor number
From: Maciej Fijalkowski @ 2025-10-27 12:32 UTC
To: Fernando Fernandez Mancera
Cc: netdev, csmate, kerneljasonxing, bjorn, sdf, jonathan.lemon, bpf,
davem, edumazet, kuba, pabeni, horms
On Sat, Oct 25, 2025 at 08:18:17PM +0200, Fernando Fernandez Mancera wrote:
> > [...]
> > I assume this patch will yield a similar performance degradation...
> >
>
> It does, thank you for explaining, Maciej. I have been thinking about
> possible solutions and remembered skb extensions. If I am not wrong, it
> shouldn't yield a performance degradation, or at least it should be a
> much less severe one. However, the XDP_SOCKETS Kconfig entry would
> require "select SKB_EXTENSIONS".
SGTM. However, I have not used this feature in the past, so I'm looking
forward to your implementation.
>
> What do you think about this approach? I could draft a series for net-next..
That should still be targeted at 'bpf' as a fix.
> I am just looking for options other than the skb control block, because
> I believe similar issues will arise in the future even if we fix the
> current one in ip_rcv..
Yes, I agree. My heart says 'just fix the IP layer', but as you said,
we never know whether another layer would wipe out the cb.
>
> Thanks,
> Fernando.