From: Fernando Fernandez Mancera <fmancera@suse.de>
To: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org, magnus.karlsson@intel.com,
	maciej.fijalkowski@intel.com, sdf@fomichev.me,
	kerneljasonxing@gmail.com, fw@strlen.de,
	Fernando Fernandez Mancera <fmancera@suse.de>
Subject: [PATCH 2/2 bpf v2] xsk: avoid data corruption on cq descriptor number
Date: Tue, 28 Oct 2025 19:30:32 +0100
Message-ID: <20251028183032.5350-2-fmancera@suse.de>
In-Reply-To: <20251028183032.5350-1-fmancera@suse.de>

Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), the number of descriptors is stored in the skb control
block, and xsk_cq_submit_addr_locked() relies on it to put the umem
addresses onto the pool's completion queue.
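
For reference, this is the cb-based bookkeeping that the patch removes
(see the deleted lines in the diff below):

 struct xsk_addr_head {
 	u32 num_descs;
 	struct list_head addrs_list;
 };

 #define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))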

The skb control block shouldn't be used for this purpose: once the skb
has been transmitted, xsk no longer has control over it, and other
subsystems may reuse it. This leads to the following kernel panic due
to a NULL pointer dereference.

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: Oops: 0000 [#1] SMP NOPTI
 CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
 RIP: 0010:xsk_destruct_skb+0xd0/0x180
 [...]
 Call Trace:
  <IRQ>
  ? napi_complete_done+0x7a/0x1a0
  ip_rcv_core+0x1bb/0x340
  ip_rcv+0x30/0x1f0
  __netif_receive_skb_one_core+0x85/0xa0
  process_backlog+0x87/0x130
  __napi_poll+0x28/0x180
  net_rx_action+0x339/0x420
  handle_softirqs+0xdc/0x320
  ? handle_edge_irq+0x90/0x1e0
  do_softirq.part.0+0x3b/0x60
  </IRQ>
  <TASK>
  __local_bh_enable_ip+0x60/0x70
  __dev_direct_xmit+0x14e/0x1f0
  __xsk_generic_xmit+0x482/0xb70
  ? __remove_hrtimer+0x41/0xa0
  ? __xsk_generic_xmit+0x51/0xb70
  ? _raw_spin_unlock_irqrestore+0xe/0x40
  xsk_sendmsg+0xda/0x1c0
  __sys_sendto+0x1ee/0x200
  __x64_sys_sendto+0x24/0x30
  do_syscall_64+0x84/0x2f0
  ? __pfx_pollwake+0x10/0x10
  ? __rseq_handle_notify_resume+0xad/0x4c0
  ? restore_fpregs_from_fpstate+0x3c/0x90
  ? switch_fpu_return+0x5b/0xe0
  ? do_syscall_64+0x204/0x2f0
  ? do_syscall_64+0x204/0x2f0
  ? do_syscall_64+0x204/0x2f0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
  </TASK>
 [...]
 Kernel panic - not syncing: Fatal exception in interrupt
 Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Instead, use the skb XDP extension to store the number of cq
descriptors along with the list of umem addresses.
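
The extension itself is introduced in patch 1/2; a minimal sketch of
the layout assumed here (only the fields this patch uses, registered
under the SKB_EXT_XDP id) is:

 struct xdp_skb_ext {
 	u32 num_descs;               /* cq descriptors consumed by this skb */
 	struct list_head addrs_list; /* umem addrs beyond the first descriptor */
 };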

Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
---
v2: remove some leftover code in the skb build path and simplify the
fragmented traffic logic
---
 net/xdp/xsk.c | 70 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..7c04cadf79b9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,15 +41,8 @@ struct xsk_addr_node {
 	struct list_head addr_node;
 };
 
-struct xsk_addr_head {
-	u32 num_descs;
-	struct list_head addrs_list;
-};
-
 static struct kmem_cache *xsk_tx_generic_cache;
 
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -562,6 +555,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 				      struct sk_buff *skb)
 {
 	struct xsk_addr_node *pos, *tmp;
+	struct xdp_skb_ext *ext;
 	u32 descs_processed = 0;
 	unsigned long flags;
 	u32 idx;
@@ -573,14 +567,16 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
 	descs_processed++;
 
-	if (unlikely(XSKCB(skb)->num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+	ext = skb_ext_find(skb, SKB_EXT_XDP);
+	if (unlikely(ext)) {
+		list_for_each_entry_safe(pos, tmp, &ext->addrs_list, addr_node) {
 			xskq_prod_write_addr(pool->cq, idx + descs_processed,
 					     pos->addr);
 			descs_processed++;
 			list_del(&pos->addr_node);
 			kmem_cache_free(xsk_tx_generic_cache, pos);
 		}
+		skb_ext_del(skb, SKB_EXT_XDP);
 	}
 	xskq_prod_submit_n(pool->cq, descs_processed);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
@@ -597,12 +593,19 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 
 static void xsk_inc_num_desc(struct sk_buff *skb)
 {
-	XSKCB(skb)->num_descs++;
+	struct xdp_skb_ext *ext;
+
+	ext = skb_ext_find(skb, SKB_EXT_XDP);
+	if (ext)
+		ext->num_descs++;
 }
 
 static u32 xsk_get_num_desc(struct sk_buff *skb)
 {
-	return XSKCB(skb)->num_descs;
+	struct xdp_skb_ext *ext;
+
+	ext = skb_ext_find(skb, SKB_EXT_XDP);
+	return ext ? ext->num_descs : 1;
 }
 
 static void xsk_destruct_skb(struct sk_buff *skb)
@@ -621,12 +624,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
 			      u64 addr)
 {
-	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
-	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
-	XSKCB(skb)->num_descs = 0;
 	skb->destructor = xsk_destruct_skb;
 	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
 }
@@ -636,12 +636,15 @@ static void xsk_consume_skb(struct sk_buff *skb)
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 	u32 num_descs = xsk_get_num_desc(skb);
 	struct xsk_addr_node *pos, *tmp;
+	struct xdp_skb_ext *ext;
 
-	if (unlikely(num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+	ext = skb_ext_find(skb, SKB_EXT_XDP);
+	if (unlikely(ext)) {
+		list_for_each_entry_safe(pos, tmp, &ext->addrs_list, addr_node) {
 			list_del(&pos->addr_node);
 			kmem_cache_free(xsk_tx_generic_cache, pos);
 		}
+		skb_ext_del(skb, SKB_EXT_XDP);
 	}
 
 	skb->destructor = sock_wfree;
@@ -727,6 +730,18 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 				return ERR_PTR(err);
 		}
 	} else {
+		struct xdp_skb_ext *ext;
+
+		ext = skb_ext_find(skb, SKB_EXT_XDP);
+		if (!ext) {
+			ext = skb_ext_add(skb, SKB_EXT_XDP);
+			if (!ext)
+				return ERR_PTR(-ENOMEM);
+			memset(ext, 0, sizeof(*ext));
+			INIT_LIST_HEAD(&ext->addrs_list);
+			ext->num_descs = 1;
+		}
+
 		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
 		if (!xsk_addr)
 			return ERR_PTR(-ENOMEM);
@@ -736,7 +751,8 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 		 * would be dropped, which implies freeing all list elements
 		 */
 		xsk_addr->addr = desc->addr;
-		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+		list_add_tail(&xsk_addr->addr_node, &ext->addrs_list);
+		xsk_inc_num_desc(skb);
 	}
 
 	len = desc->len;
@@ -814,6 +830,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct xsk_addr_node *xsk_addr;
+			struct xdp_skb_ext *ext;
 			struct page *page;
 			u8 *vaddr;
 
@@ -828,6 +845,19 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				goto free_err;
 			}
 
+			ext = skb_ext_find(skb, SKB_EXT_XDP);
+			if (!ext) {
+				ext = skb_ext_add(skb, SKB_EXT_XDP);
+				if (!ext) {
+					__free_page(page);
+					err = -ENOMEM;
+					goto free_err;
+				}
+				memset(ext, 0, sizeof(*ext));
+				INIT_LIST_HEAD(&ext->addrs_list);
+				ext->num_descs = 1;
+			}
+
 			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
 			if (!xsk_addr) {
 				__free_page(page);
@@ -843,12 +873,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
 
 			xsk_addr->addr = desc->addr;
-			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+			list_add_tail(&xsk_addr->addr_node, &ext->addrs_list);
+			xsk_inc_num_desc(skb);
 		}
 	}
 
-	xsk_inc_num_desc(skb);
-
 	return skb;
 
 free_err:
@@ -857,7 +886,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 	if (err == -EOVERFLOW) {
 		/* Drop the packet */
-		xsk_inc_num_desc(xs->skb);
 		xsk_drop_skb(xs->skb);
 		xskq_cons_release(xs->tx);
 	} else {
-- 
2.51.0


Thread overview: 8+ messages

2025-10-28 18:30 [PATCH 1/2 bpf v2] xdp: add XDP extension to skb Fernando Fernandez Mancera
2025-10-28 18:30 ` Fernando Fernandez Mancera [this message]
2025-10-28 23:01   ` [PATCH 2/2 bpf v2] xsk: avoid data corruption on cq descriptor number Jakub Kicinski
2025-10-29  7:51     ` Fernando Fernandez Mancera
2025-10-29 23:22       ` Jakub Kicinski
2025-10-30  8:38         ` Fernando Fernandez Mancera
2025-10-30  1:05   ` Jason Xing
2025-10-28 22:55 ` [PATCH 1/2 bpf v2] xdp: add XDP extension to skb Jakub Kicinski
