netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net] xsk: avoid data corruption on cq descriptor number
@ 2025-10-21 15:06 Fernando Fernandez Mancera
  2025-10-21 16:25 ` Simon Horman
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Fernando Fernandez Mancera @ 2025-10-21 15:06 UTC (permalink / raw)
  To: netdev
  Cc: csmate, kerneljasonxing, maciej.fijalkowski, bjorn, sdf,
	jonathan.lemon, bpf, Fernando Fernandez Mancera

Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), the descriptor number is stored in the skb control block and
xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
pool's completion queue.

skb control block shouldn't be used for this purpose as after transmit
xsk doesn't have control over it and other subsystems could use it. This
leads to the following kernel panic due to a NULL pointer dereference.

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: Oops: 0000 [#1] SMP NOPTI
 CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
 RIP: 0010:xsk_destruct_skb+0xd0/0x180
 [...]
 Call Trace:
  <IRQ>
  ? napi_complete_done+0x7a/0x1a0
  ip_rcv_core+0x1bb/0x340
  ip_rcv+0x30/0x1f0
  __netif_receive_skb_one_core+0x85/0xa0
  process_backlog+0x87/0x130
  __napi_poll+0x28/0x180
  net_rx_action+0x339/0x420
  handle_softirqs+0xdc/0x320
  ? handle_edge_irq+0x90/0x1e0
  do_softirq.part.0+0x3b/0x60
  </IRQ>
  <TASK>
  __local_bh_enable_ip+0x60/0x70
  __dev_direct_xmit+0x14e/0x1f0
  __xsk_generic_xmit+0x482/0xb70
  ? __remove_hrtimer+0x41/0xa0
  ? __xsk_generic_xmit+0x51/0xb70
  ? _raw_spin_unlock_irqrestore+0xe/0x40
  xsk_sendmsg+0xda/0x1c0
  __sys_sendto+0x1ee/0x200
  __x64_sys_sendto+0x24/0x30
  do_syscall_64+0x84/0x2f0
  ? __pfx_pollwake+0x10/0x10
  ? __rseq_handle_notify_resume+0xad/0x4c0
  ? restore_fpregs_from_fpstate+0x3c/0x90
  ? switch_fpu_return+0x5b/0xe0
  ? do_syscall_64+0x204/0x2f0
  ? do_syscall_64+0x204/0x2f0
  ? do_syscall_64+0x204/0x2f0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
  </TASK>
 [...]
 Kernel panic - not syncing: Fatal exception in interrupt
 Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

The proposed approach stores the first address also in the xsk_addr_node
along with the number of descriptors. The head xsk_addr_node is
referenced by skb_shinfo(skb)->destructor_arg. The rest of the fragments
store the address on the list.

This is less efficient as the kmem_cache must be initialized even if a
single fragment is received and also 4 bytes are wasted when storing
each address.

Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
---
Note: Please notice I am not an XDP expert so I cannot tell if this
would cause a performance regression, advice is welcomed.
---
 net/xdp/xsk.c | 57 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..203934aeade6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -37,18 +37,14 @@
 #define MAX_PER_SOCKET_BUDGET 32
 
 struct xsk_addr_node {
+	u32 num_descs;
 	u64 addr;
 	struct list_head addr_node;
 };
 
-struct xsk_addr_head {
-	u32 num_descs;
-	struct list_head addrs_list;
-};
-
 static struct kmem_cache *xsk_tx_generic_cache;
 
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
+#define XSK_TX_HEAD(skb) ((struct xsk_addr_node *)((skb_shinfo(skb)->destructor_arg)))
 
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
@@ -569,12 +565,11 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 	spin_lock_irqsave(&pool->cq_lock, flags);
 	idx = xskq_get_prod(pool->cq);
 
-	xskq_prod_write_addr(pool->cq, idx,
-			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
+	xskq_prod_write_addr(pool->cq, idx, XSK_TX_HEAD(skb)->addr);
 	descs_processed++;
 
-	if (unlikely(XSKCB(skb)->num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+	if (unlikely(XSK_TX_HEAD(skb)->num_descs > 1)) {
+		list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
 			xskq_prod_write_addr(pool->cq, idx + descs_processed,
 					     pos->addr);
 			descs_processed++;
@@ -582,6 +577,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 			kmem_cache_free(xsk_tx_generic_cache, pos);
 		}
 	}
+	kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
 	xskq_prod_submit_n(pool->cq, descs_processed);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
@@ -597,12 +593,12 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 
 static void xsk_inc_num_desc(struct sk_buff *skb)
 {
-	XSKCB(skb)->num_descs++;
+	XSK_TX_HEAD(skb)->num_descs++;
 }
 
 static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	return XSKCB(skb)->num_descs;
+	return XSK_TX_HEAD(skb)->num_descs;
 }
 
 static void xsk_destruct_skb(struct sk_buff *skb)
@@ -619,16 +615,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 }
 
 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
-			      u64 addr)
+			      struct xsk_addr_node *head, u64 addr)
 {
-	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
-	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
+	INIT_LIST_HEAD(&head->addr_node);
+	head->addr = addr;
+	head->num_descs = 0;
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
-	XSKCB(skb)->num_descs = 0;
 	skb->destructor = xsk_destruct_skb;
-	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+	skb_shinfo(skb)->destructor_arg = (void *)head;
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
@@ -638,11 +634,12 @@ static void xsk_consume_skb(struct sk_buff *skb)
 	struct xsk_addr_node *pos, *tmp;
 
 	if (unlikely(num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+		list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
 			list_del(&pos->addr_node);
 			kmem_cache_free(xsk_tx_generic_cache, pos);
 		}
 	}
+	kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
 
 	skb->destructor = sock_wfree;
 	xsk_cq_cancel_locked(xs->pool, num_descs);
@@ -712,6 +709,8 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 	buffer = xsk_buff_raw_get_data(pool, addr);
 
 	if (!skb) {
+		struct xsk_addr_node *head_addr;
+
 		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
 
 		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
@@ -720,7 +719,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		skb_reserve(skb, hr);
 
-		xsk_skb_init_misc(skb, xs, desc->addr);
+		head_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+		if (!head_addr)
+			return ERR_PTR(-ENOMEM);
+
+		xsk_skb_init_misc(skb, xs, head_addr, desc->addr);
 		if (desc->options & XDP_TX_METADATA) {
 			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
 			if (unlikely(err))
@@ -736,7 +739,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 		 * would be dropped, which implies freeing all list elements
 		 */
 		xsk_addr->addr = desc->addr;
-		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+		list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
 	}
 
 	len = desc->len;
@@ -774,6 +777,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 {
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
+	struct page *page;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
@@ -791,6 +795,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		len = desc->len;
 
 		if (!skb) {
+			struct xsk_addr_node *head_addr;
+
 			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 			tr = dev->needed_tailroom;
 			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
@@ -804,7 +810,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			if (unlikely(err))
 				goto free_err;
 
-			xsk_skb_init_misc(skb, xs, desc->addr);
+			head_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+			if (!head_addr) {
+				__free_page(page);
+				err = -ENOMEM;
+				goto free_err;
+			}
+			xsk_skb_init_misc(skb, xs, head_addr, desc->addr);
 			if (desc->options & XDP_TX_METADATA) {
 				err = xsk_skb_metadata(skb, buffer, desc,
 						       xs->pool, hr);
@@ -814,7 +826,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct xsk_addr_node *xsk_addr;
-			struct page *page;
 			u8 *vaddr;
 
 			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
@@ -843,7 +854,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
 
 			xsk_addr->addr = desc->addr;
-			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+			list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
 		}
 	}
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH net] xsk: avoid data corruption on cq descriptor number
  2025-10-21 15:06 [PATCH net] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
@ 2025-10-21 16:25 ` Simon Horman
  2025-10-21 18:25   ` Fernando Fernandez Mancera
  2025-10-22  5:23 ` kernel test robot
  2025-10-22 18:24 ` Fernando Fernandez Mancera
  2 siblings, 1 reply; 5+ messages in thread
From: Simon Horman @ 2025-10-21 16:25 UTC (permalink / raw)
  To: Fernando Fernandez Mancera
  Cc: netdev, csmate, kerneljasonxing, maciej.fijalkowski, bjorn, sdf,
	jonathan.lemon, bpf

On Tue, Oct 21, 2025 at 05:06:56PM +0200, Fernando Fernandez Mancera wrote:

...

> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c

...

> @@ -774,6 +777,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  {
>  	struct net_device *dev = xs->dev;
>  	struct sk_buff *skb = xs->skb;
> +	struct page *page;
>  	int err;
>  
>  	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> @@ -791,6 +795,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  		len = desc->len;
>  
>  		if (!skb) {
> +			struct xsk_addr_node *head_addr;
> +
>  			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
>  			tr = dev->needed_tailroom;
>  			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
> @@ -804,7 +810,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			if (unlikely(err))
>  				goto free_err;
>  
> -			xsk_skb_init_misc(skb, xs, desc->addr);
> +			head_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> +			if (!head_addr) {
> +				__free_page(page);

Hi Fernando,

Perhaps the page changes to xsk_build_skb() aren't needed
because page seems to be uninitialised here.

Flagged by W=1 builds with Clang 21.1.1, and Smatch.

> +				err = -ENOMEM;
> +				goto free_err;
> +			}
> +			xsk_skb_init_misc(skb, xs, head_addr, desc->addr);
>  			if (desc->options & XDP_TX_METADATA) {
>  				err = xsk_skb_metadata(skb, buffer, desc,
>  						       xs->pool, hr);
> @@ -814,7 +826,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  		} else {
>  			int nr_frags = skb_shinfo(skb)->nr_frags;
>  			struct xsk_addr_node *xsk_addr;
> -			struct page *page;
>  			u8 *vaddr;
>  
>  			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
> @@ -843,7 +854,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>  
>  			xsk_addr->addr = desc->addr;
> -			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +			list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
>  		}
>  	}
>  
> -- 
> 2.51.0
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH net] xsk: avoid data corruption on cq descriptor number
  2025-10-21 16:25 ` Simon Horman
@ 2025-10-21 18:25   ` Fernando Fernandez Mancera
  0 siblings, 0 replies; 5+ messages in thread
From: Fernando Fernandez Mancera @ 2025-10-21 18:25 UTC (permalink / raw)
  To: Simon Horman
  Cc: netdev, csmate, kerneljasonxing, maciej.fijalkowski, bjorn, sdf,
	jonathan.lemon, bpf



On 10/21/25 6:25 PM, Simon Horman wrote:
> On Tue, Oct 21, 2025 at 05:06:56PM +0200, Fernando Fernandez Mancera wrote:
> 
> ...
> 
>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> 
> ...
> 
>> @@ -774,6 +777,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>   {
>>   	struct net_device *dev = xs->dev;
>>   	struct sk_buff *skb = xs->skb;
>> +	struct page *page;
>>   	int err;
>>   
>>   	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
>> @@ -791,6 +795,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>   		len = desc->len;
>>   
>>   		if (!skb) {
>> +			struct xsk_addr_node *head_addr;
>> +
>>   			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
>>   			tr = dev->needed_tailroom;
>>   			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
>> @@ -804,7 +810,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>   			if (unlikely(err))
>>   				goto free_err;
>>   
>> -			xsk_skb_init_misc(skb, xs, desc->addr);
>> +			head_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
>> +			if (!head_addr) {
>> +				__free_page(page);
> 
> Hi Fernando,
> 
> Perhaps the page changes to xsk_build_skb() aren't needed
> because page seems to be uninitialised here.
> 
> Flagged by W=1 builds with Clang 21.1.1, and Smatch.
> 

Ah, I don't know what I was thinking about.. but yes, that page 
handling isn't needed of course as it isn't initialized. (I even needed 
the page struct because it wasn't available in that context).

I will send a V2 patch if the proposed solution is accepted. In addition, 
the performance impact isn't as bad as I thought.. just 4 bytes per 
skb.. as the cache must be initialized anyway.

>> +				err = -ENOMEM;
>> +				goto free_err;
>> +			}
>> +			xsk_skb_init_misc(skb, xs, head_addr, desc->addr);
>>   			if (desc->options & XDP_TX_METADATA) {
>>   				err = xsk_skb_metadata(skb, buffer, desc,
>>   						       xs->pool, hr);
>> @@ -814,7 +826,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>   		} else {
>>   			int nr_frags = skb_shinfo(skb)->nr_frags;
>>   			struct xsk_addr_node *xsk_addr;
>> -			struct page *page;
>>   			u8 *vaddr;
>>   
>>   			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
>> @@ -843,7 +854,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>   			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>>   
>>   			xsk_addr->addr = desc->addr;
>> -			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
>> +			list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
>>   		}
>>   	}
>>   
>> -- 
>> 2.51.0
>>
>>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH net] xsk: avoid data corruption on cq descriptor number
  2025-10-21 15:06 [PATCH net] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
  2025-10-21 16:25 ` Simon Horman
@ 2025-10-22  5:23 ` kernel test robot
  2025-10-22 18:24 ` Fernando Fernandez Mancera
  2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2025-10-22  5:23 UTC (permalink / raw)
  To: Fernando Fernandez Mancera, netdev
  Cc: llvm, oe-kbuild-all, csmate, kerneljasonxing, maciej.fijalkowski,
	bjorn, sdf, jonathan.lemon, bpf, Fernando Fernandez Mancera

Hi Fernando,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Fernando-Fernandez-Mancera/xsk-avoid-data-corruption-on-cq-descriptor-number/20251021-231144
base:   net/main
patch link:    https://lore.kernel.org/r/20251021150656.6704-1-fmancera%40suse.de
patch subject: [PATCH net] xsk: avoid data corruption on cq descriptor number
config: x86_64-buildonly-randconfig-001-20251022 (https://download.01.org/0day-ci/archive/20251022/202510221337.KR7g9eX0-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251022/202510221337.KR7g9eX0-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510221337.KR7g9eX0-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> net/xdp/xsk.c:815:17: warning: variable 'page' is uninitialized when used here [-Wuninitialized]
     815 |                                 __free_page(page);
         |                                             ^~~~
   include/linux/gfp.h:385:41: note: expanded from macro '__free_page'
     385 | #define __free_page(page) __free_pages((page), 0)
         |                                         ^~~~
   net/xdp/xsk.c:780:19: note: initialize the variable 'page' to silence this warning
     780 |         struct page *page;
         |                          ^
         |                           = NULL
   1 warning generated.


vim +/page +815 net/xdp/xsk.c

   774	
   775	static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
   776					     struct xdp_desc *desc)
   777	{
   778		struct net_device *dev = xs->dev;
   779		struct sk_buff *skb = xs->skb;
   780		struct page *page;
   781		int err;
   782	
   783		if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
   784			skb = xsk_build_skb_zerocopy(xs, desc);
   785			if (IS_ERR(skb)) {
   786				err = PTR_ERR(skb);
   787				skb = NULL;
   788				goto free_err;
   789			}
   790		} else {
   791			u32 hr, tr, len;
   792			void *buffer;
   793	
   794			buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
   795			len = desc->len;
   796	
   797			if (!skb) {
   798				struct xsk_addr_node *head_addr;
   799	
   800				hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
   801				tr = dev->needed_tailroom;
   802				skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
   803				if (unlikely(!skb))
   804					goto free_err;
   805	
   806				skb_reserve(skb, hr);
   807				skb_put(skb, len);
   808	
   809				err = skb_store_bits(skb, 0, buffer, len);
   810				if (unlikely(err))
   811					goto free_err;
   812	
   813				head_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
   814				if (!head_addr) {
 > 815					__free_page(page);
   816					err = -ENOMEM;
   817					goto free_err;
   818				}
   819				xsk_skb_init_misc(skb, xs, head_addr, desc->addr);
   820				if (desc->options & XDP_TX_METADATA) {
   821					err = xsk_skb_metadata(skb, buffer, desc,
   822							       xs->pool, hr);
   823					if (unlikely(err))
   824						goto free_err;
   825				}
   826			} else {
   827				int nr_frags = skb_shinfo(skb)->nr_frags;
   828				struct xsk_addr_node *xsk_addr;
   829				u8 *vaddr;
   830	
   831				if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
   832					err = -EOVERFLOW;
   833					goto free_err;
   834				}
   835	
   836				page = alloc_page(xs->sk.sk_allocation);
   837				if (unlikely(!page)) {
   838					err = -EAGAIN;
   839					goto free_err;
   840				}
   841	
   842				xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
   843				if (!xsk_addr) {
   844					__free_page(page);
   845					err = -ENOMEM;
   846					goto free_err;
   847				}
   848	
   849				vaddr = kmap_local_page(page);
   850				memcpy(vaddr, buffer, len);
   851				kunmap_local(vaddr);
   852	
   853				skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
   854				refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
   855	
   856				xsk_addr->addr = desc->addr;
   857				list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
   858			}
   859		}
   860	
   861		xsk_inc_num_desc(skb);
   862	
   863		return skb;
   864	
   865	free_err:
   866		if (skb && !skb_shinfo(skb)->nr_frags)
   867			kfree_skb(skb);
   868	
   869		if (err == -EOVERFLOW) {
   870			/* Drop the packet */
   871			xsk_inc_num_desc(xs->skb);
   872			xsk_drop_skb(xs->skb);
   873			xskq_cons_release(xs->tx);
   874		} else {
   875			/* Let application retry */
   876			xsk_cq_cancel_locked(xs->pool, 1);
   877		}
   878	
   879		return ERR_PTR(err);
   880	}
   881	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH net] xsk: avoid data corruption on cq descriptor number
  2025-10-21 15:06 [PATCH net] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
  2025-10-21 16:25 ` Simon Horman
  2025-10-22  5:23 ` kernel test robot
@ 2025-10-22 18:24 ` Fernando Fernandez Mancera
  2 siblings, 0 replies; 5+ messages in thread
From: Fernando Fernandez Mancera @ 2025-10-22 18:24 UTC (permalink / raw)
  To: netdev
  Cc: csmate, kerneljasonxing, maciej.fijalkowski, bjorn, sdf,
	jonathan.lemon, bpf



On 10/21/25 5:06 PM, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is store in skb control block and
> xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> pool's completion queue.
> 
> skb control block shouldn't be used for this purpose as after transmit
> xsk doesn't have control over it and other subsystems could use it. This
> leads to the following kernel panic due to a NULL pointer dereference.
> 
>   BUG: kernel NULL pointer dereference, address: 0000000000000000
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   PGD 0 P4D 0
>   Oops: Oops: 0000 [#1] SMP NOPTI
>   CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
>   RIP: 0010:xsk_destruct_skb+0xd0/0x180
>   [...]
>   Call Trace:
>    <IRQ>
>    ? napi_complete_done+0x7a/0x1a0
>    ip_rcv_core+0x1bb/0x340
>    ip_rcv+0x30/0x1f0
>    __netif_receive_skb_one_core+0x85/0xa0
>    process_backlog+0x87/0x130
>    __napi_poll+0x28/0x180
>    net_rx_action+0x339/0x420
>    handle_softirqs+0xdc/0x320
>    ? handle_edge_irq+0x90/0x1e0
>    do_softirq.part.0+0x3b/0x60
>    </IRQ>
>    <TASK>
>    __local_bh_enable_ip+0x60/0x70
>    __dev_direct_xmit+0x14e/0x1f0
>    __xsk_generic_xmit+0x482/0xb70
>    ? __remove_hrtimer+0x41/0xa0
>    ? __xsk_generic_xmit+0x51/0xb70
>    ? _raw_spin_unlock_irqrestore+0xe/0x40
>    xsk_sendmsg+0xda/0x1c0
>    __sys_sendto+0x1ee/0x200
>    __x64_sys_sendto+0x24/0x30
>    do_syscall_64+0x84/0x2f0
>    ? __pfx_pollwake+0x10/0x10
>    ? __rseq_handle_notify_resume+0xad/0x4c0
>    ? restore_fpregs_from_fpstate+0x3c/0x90
>    ? switch_fpu_return+0x5b/0xe0
>    ? do_syscall_64+0x204/0x2f0
>    ? do_syscall_64+0x204/0x2f0
>    ? do_syscall_64+0x204/0x2f0
>    entry_SYSCALL_64_after_hwframe+0x76/0x7e
>    </TASK>
>   [...]
>   Kernel panic - not syncing: Fatal exception in interrupt
>   Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> 
> The approach proposed store the first address also in the xsk_addr_node
> along with the number of descriptors. The head xsk_addr_node is
> referenced by skb_shinfo(skb)->destructor_arg. The rest of the fragments
> store the address on the list.
> 
> This is less efficient as the kmem_cache must be initialized even if a
> single fragment is received and also 4 bytes are wasted when storing
> each address.
> 
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
> ---
> Note: Please notice I am not an XDP expert so I cannot tell if this
> would cause a performance regression, advice is welcomed.
> ---
>   net/xdp/xsk.c | 57 ++++++++++++++++++++++++++++++---------------------
>   1 file changed, 34 insertions(+), 23 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..203934aeade6 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -37,18 +37,14 @@
>   #define MAX_PER_SOCKET_BUDGET 32
>   
>   struct xsk_addr_node {
> +	u32 num_descs;
>   	u64 addr;
>   	struct list_head addr_node;
>   };
>   
> -struct xsk_addr_head {
> -	u32 num_descs;
> -	struct list_head addrs_list;
> -};
> -
>   static struct kmem_cache *xsk_tx_generic_cache;
>   

FWIW, if the 4 bytes wasted for each umem address other than the initial 
one is too much, we can add another kmem_cache for just the xsk_addr_head..

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-10-22 18:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-21 15:06 [PATCH net] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
2025-10-21 16:25 ` Simon Horman
2025-10-21 18:25   ` Fernando Fernandez Mancera
2025-10-22  5:23 ` kernel test robot
2025-10-22 18:24 ` Fernando Fernandez Mancera

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).