From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
To: Fernando Fernandez Mancera <fmancera@suse.de>
Cc: <netdev@vger.kernel.org>, <csmate@nop.hu>,
	<kerneljasonxing@gmail.com>, <bjorn@kernel.org>,
	<sdf@fomichev.me>, <jonathan.lemon@gmail.com>,
	<bpf@vger.kernel.org>, <davem@davemloft.net>,
	<edumazet@google.com>, <kuba@kernel.org>, <pabeni@redhat.com>,
	<horms@kernel.org>
Subject: Re: [PATCH net v3] xsk: avoid data corruption on cq descriptor number
Date: Mon, 17 Nov 2025 17:08:43 +0100	[thread overview]
Message-ID: <aRtIiIvfVwJCmcn1@boxer> (raw)
In-Reply-To: <20251030140355.4059-1-fmancera@suse.de>

On Thu, Oct 30, 2025 at 03:03:55PM +0100, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in the skb control block
> and xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> the pool's completion queue.
> 
> The skb control block shouldn't be used for this purpose because, after
> transmit, xsk no longer has control over it and other subsystems can
> use it. This leads to the following kernel panic due to a NULL pointer
> dereference.
> 
>  BUG: kernel NULL pointer dereference, address: 0000000000000000
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  PGD 0 P4D 0
>  Oops: Oops: 0000 [#1] SMP NOPTI
>  CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
>  RIP: 0010:xsk_destruct_skb+0xd0/0x180
>  [...]
>  Call Trace:
>   <IRQ>
>   ? napi_complete_done+0x7a/0x1a0
>   ip_rcv_core+0x1bb/0x340
>   ip_rcv+0x30/0x1f0
>   __netif_receive_skb_one_core+0x85/0xa0
>   process_backlog+0x87/0x130
>   __napi_poll+0x28/0x180
>   net_rx_action+0x339/0x420
>   handle_softirqs+0xdc/0x320
>   ? handle_edge_irq+0x90/0x1e0
>   do_softirq.part.0+0x3b/0x60
>   </IRQ>
>   <TASK>
>   __local_bh_enable_ip+0x60/0x70
>   __dev_direct_xmit+0x14e/0x1f0
>   __xsk_generic_xmit+0x482/0xb70
>   ? __remove_hrtimer+0x41/0xa0
>   ? __xsk_generic_xmit+0x51/0xb70
>   ? _raw_spin_unlock_irqrestore+0xe/0x40
>   xsk_sendmsg+0xda/0x1c0
>   __sys_sendto+0x1ee/0x200
>   __x64_sys_sendto+0x24/0x30
>   do_syscall_64+0x84/0x2f0
>   ? __pfx_pollwake+0x10/0x10
>   ? __rseq_handle_notify_resume+0xad/0x4c0
>   ? restore_fpregs_from_fpstate+0x3c/0x90
>   ? switch_fpu_return+0x5b/0xe0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>   </TASK>
>  [...]
>  Kernel panic - not syncing: Fatal exception in interrupt
>  Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> 
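
To make the "other subsystems could use it" point concrete: skb->cb is
shared scratch space, and e.g. the IP receive path visible in the trace
above aliases the very same bytes. A simplified illustration (XSKCB is
the helper this patch removes, IPCB is the kernel's own macro from
include/net/ip.h):

/* what xsk stored in skb->cb before this patch */
#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))

/* what the IP layer treats skb->cb as */
#define IPCB(skb)  ((struct inet_skb_parm *)((skb)->cb))

Once the skb leaves xsk's hands, anything written through IPCB() (or any
other *_CB helper) can corrupt the bookkeeping that xsk_destruct_skb()
later reads back.
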
> Instead, use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8B, use the bottom bit to indicate
> whether this is a single address or an allocated struct containing
> several addresses.
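
For anyone who has not seen the trick before, here is a minimal
userspace sketch of the bit-0 tagging described above. Illustration
only: the names below are made up, and the patch's real helpers are
xsk_skb_destructor_is_addr()/xsk_skb_destructor_get_addr() in the hunks
further down.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* stand-in for the patch's struct xsk_addrs; size is arbitrary here */
struct addrs {
	uint32_t num_descs;
	uint64_t addrs[18];
};

/* single descriptor: store the umem address itself, with bit 0 set as tag */
static void *tag_single(uint64_t umem_addr)
{
	return (void *)((uintptr_t)umem_addr | 0x1UL);
}

/* bit 0 set => raw address; bit 0 clear => 8B-aligned struct pointer */
static bool is_single(const void *arg)
{
	return (uintptr_t)arg & 0x1UL;
}

static uint64_t untag_single(const void *arg)
{
	return (uint64_t)((uintptr_t)arg & ~0x1UL);
}

int main(void)
{
	void *arg = tag_single(0x2000);			/* made-up umem addr */
	struct addrs *multi = malloc(sizeof(*multi));	/* heap => bit 0 clear */

	assert(is_single(arg) && untag_single(arg) == 0x2000);
	assert(!is_single(multi));
	free(multi);
	return 0;
}
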
> 
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>

Tested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>

Fernando, thanks for stepping in and providing this fix!
And thanks, Jakub, for the ptr tagging trick.

@BPF maintainers, please apply this patch.

> ---
> v2: remove some leftovers in skb_build and simplify the fragmented
> traffic logic
> 
> v3: drop the skb extension approach; instead, use pointer tagging in
> destructor_arg to tell whether we have a single address or an allocated
> struct with multiple ones. Also, move from bpf to net as requested
> 
> Note: tested with the crash reproducer and the xdpsock tool
> ---
>  net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
>  1 file changed, 74 insertions(+), 56 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..d7354a3e2545 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,20 +36,13 @@
>  #define TX_BATCH_SIZE 32
>  #define MAX_PER_SOCKET_BUDGET 32
>  
> -struct xsk_addr_node {
> -	u64 addr;
> -	struct list_head addr_node;
> -};
> -
> -struct xsk_addr_head {
> +struct xsk_addrs {
>  	u32 num_descs;
> -	struct list_head addrs_list;
> +	u64 addrs[MAX_SKB_FRAGS + 1];
>  };
>  
>  static struct kmem_cache *xsk_tx_generic_cache;
>  
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> -
>  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
>  {
>  	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
>  	return ret;
>  }
>  
> +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
> +{
> +	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
> +}
> +
> +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
> +{
> +	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
> +}
> +
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> +	struct xsk_addrs *xsk_addr;
> +
> +	if (xsk_skb_destructor_is_addr(skb))
> +		return 1;
> +
> +	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +
> +	return xsk_addr->num_descs;
> +}
> +
>  static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
>  				      struct sk_buff *skb)
>  {
> -	struct xsk_addr_node *pos, *tmp;
> +	u32 num_descs = xsk_get_num_desc(skb);
> +	struct xsk_addrs *xsk_addr;
>  	u32 descs_processed = 0;
>  	unsigned long flags;
> -	u32 idx;
> +	u32 idx, i;
>  
>  	spin_lock_irqsave(&pool->cq_lock, flags);
>  	idx = xskq_get_prod(pool->cq);
>  
> -	xskq_prod_write_addr(pool->cq, idx,
> -			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> -	descs_processed++;
> +	if (unlikely(num_descs > 1)) {
> +		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>  
> -	if (unlikely(XSKCB(skb)->num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> +		for (i = 0; i < num_descs; i++) {
>  			xskq_prod_write_addr(pool->cq, idx + descs_processed,
> -					     pos->addr);
> +					     xsk_addr->addrs[i]);
>  			descs_processed++;
> -			list_del(&pos->addr_node);
> -			kmem_cache_free(xsk_tx_generic_cache, pos);
>  		}
> +		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> +	} else {
> +		xskq_prod_write_addr(pool->cq, idx,
> +				     xsk_skb_destructor_get_addr(skb));
> +		descs_processed++;
>  	}
>  	xskq_prod_submit_n(pool->cq, descs_processed);
>  	spin_unlock_irqrestore(&pool->cq_lock, flags);
> @@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
>  	spin_unlock_irqrestore(&pool->cq_lock, flags);
>  }
>  
> -static void xsk_inc_num_desc(struct sk_buff *skb)
> -{
> -	XSKCB(skb)->num_descs++;
> -}
> -
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> -	return XSKCB(skb)->num_descs;
> -}
> -
>  static void xsk_destruct_skb(struct sk_buff *skb)
>  {
>  	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
>  			      u64 addr)
>  {
> -	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> -	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
>  	skb->dev = xs->dev;
>  	skb->priority = READ_ONCE(xs->sk.sk_priority);
>  	skb->mark = READ_ONCE(xs->sk.sk_mark);
> -	XSKCB(skb)->num_descs = 0;
>  	skb->destructor = xsk_destruct_skb;
> -	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> +	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
>  }
>  
>  static void xsk_consume_skb(struct sk_buff *skb)
>  {
>  	struct xdp_sock *xs = xdp_sk(skb->sk);
>  	u32 num_descs = xsk_get_num_desc(skb);
> -	struct xsk_addr_node *pos, *tmp;
> +	struct xsk_addrs *xsk_addr;
>  
>  	if (unlikely(num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> -			list_del(&pos->addr_node);
> -			kmem_cache_free(xsk_tx_generic_cache, pos);
> -		}
> +		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
>  	}
>  
>  	skb->destructor = sock_wfree;
> @@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  {
>  	struct xsk_buff_pool *pool = xs->pool;
>  	u32 hr, len, ts, offset, copy, copied;
> -	struct xsk_addr_node *xsk_addr;
>  	struct sk_buff *skb = xs->skb;
>  	struct page *page;
>  	void *buffer;
> @@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  				return ERR_PTR(err);
>  		}
>  	} else {
> -		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -		if (!xsk_addr)
> -			return ERR_PTR(-ENOMEM);
> +		struct xsk_addrs *xsk_addr;
> +
> +		if (xsk_skb_destructor_is_addr(skb)) {
> +			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +						     GFP_KERNEL);
> +			if (!xsk_addr)
> +				return ERR_PTR(-ENOMEM);
> +
> +			xsk_addr->num_descs = 1;
> +			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +		} else {
> +			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +		}
>  
>  		/* in case of -EOVERFLOW that could happen below,
>  		 * xsk_consume_skb() will release this node as whole skb
>  		 * would be dropped, which implies freeing all list elements
>  		 */
> -		xsk_addr->addr = desc->addr;
> -		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +		xsk_addr->num_descs++;
>  	}
>  
>  	len = desc->len;
> @@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			}
>  		} else {
>  			int nr_frags = skb_shinfo(skb)->nr_frags;
> -			struct xsk_addr_node *xsk_addr;
> +			struct xsk_addrs *xsk_addr;
>  			struct page *page;
>  			u8 *vaddr;
>  
> @@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  				goto free_err;
>  			}
>  
> -			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -			if (!xsk_addr) {
> -				__free_page(page);
> -				err = -ENOMEM;
> -				goto free_err;
> +			if (xsk_skb_destructor_is_addr(skb)) {
> +				xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +							     GFP_KERNEL);
> +				if (!xsk_addr) {
> +					__free_page(page);
> +					err = -ENOMEM;
> +					goto free_err;
> +				}
> +
> +				xsk_addr->num_descs = 1;
> +				xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +				skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +			} else {
> +				xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>  			}
>  
>  			vaddr = kmap_local_page(page);
> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
>  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>  
> -			xsk_addr->addr = desc->addr;
> -			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +			xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +			xsk_addr->num_descs++;
>  		}
>  	}
>  
> -	xsk_inc_num_desc(skb);
> -
>  	return skb;
>  
>  free_err:
> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  
>  	if (err == -EOVERFLOW) {
>  		/* Drop the packet */
> -		xsk_inc_num_desc(xs->skb);
>  		xsk_drop_skb(xs->skb);
>  		xskq_cons_release(xs->tx);
>  	} else {
> @@ -1904,7 +1922,7 @@ static int __init xsk_init(void)
>  		goto out_pernet;
>  
>  	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> -						 sizeof(struct xsk_addr_node),
> +						 sizeof(struct xsk_addrs),
>  						 0, SLAB_HWCACHE_ALIGN, NULL);
>  	if (!xsk_tx_generic_cache) {
>  		err = -ENOMEM;
> -- 
> 2.51.0
> 

Thread overview: 8+ messages
2025-10-30 14:03 [PATCH net v3] xsk: avoid data corruption on cq descriptor number Fernando Fernandez Mancera
2025-10-31  9:51 ` Jason Xing
2025-10-31 10:05   ` Fernando Fernandez Mancera
2025-10-31 10:23     ` Jason Xing
2025-10-31 10:37       ` Fernando Fernandez Mancera
2025-11-17 16:08 ` Maciej Fijalkowski [this message]
2025-11-17 16:11   ` Fernando Fernandez Mancera
2025-11-17 23:59     ` Jakub Kicinski
