Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net] net: airoha: fix BQL underflow and UAF in shared QDMA TX ring
From: Lorenzo Bianconi @ 2026-06-18  6:13 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Wayen Yan, linux-arm-kernel, linux-mediatek, netdev,
	Lorenzo Bianconi

When multiple netdevs share a QDMA TX ring and one device is stopped,
netdev_tx_reset_subqueue() zeroes that device's BQL counters while its
pending skbs remain in the shared HW TX ring. When NAPI later completes
those skbs via netdev_tx_completed_queue(), the already-zeroed
dql->num_queued counter underflows.
Moreover, in the airoha_remove() path, netdevs are unregistered
sequentially while skbs from previously unregistered netdevs may still
reference freed net_device memory via skb->dev, causing a use-after-free
during BQL accounting.
Fix both issues:
- Remove netdev_tx_reset_subqueue() from airoha_dev_stop() so pending
  skbs are completed naturally by NAPI with proper BQL accounting.
- Add netdev_tx_completed_queue() in airoha_qdma_cleanup_tx_queue()
  to properly account for skbs freed during queue teardown.
- Introduce airoha_qdma_tx_disable() to stop TX on all registered
  netdevs for a given QDMA instance under RTNL lock.
- Move DMA engine start/stop into probe/remove and
  airoha_qdma_cleanup(), ensuring TX queues are cleaned up while all
  netdevs are still registered and skb->dev is valid.

Fixes: 6df0488dc9dd ("net: airoha: fix BQL accounting in airoha_qdma_cleanup_tx_queue()")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c | 95 ++++++++++++++++++++++++--------
 1 file changed, 72 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index 64dde6464f3f..4d6a061cd779 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -1004,6 +1004,7 @@ static int airoha_qdma_tx_napi_poll(struct napi_struct *napi, int budget)
 
 		e = &q->entry[index];
 		skb = e->skb;
+		e->skb = NULL;
 
 		dma_unmap_single(eth->dev, e->dma_addr, e->dma_len,
 				 DMA_TO_DEVICE);
@@ -1147,6 +1148,42 @@ static int airoha_qdma_init_tx(struct airoha_qdma *qdma)
 	return 0;
 }
 
+static void airoha_qdma_tx_disable(struct airoha_qdma *qdma)
+{
+	struct airoha_eth *eth = qdma->eth;
+	int i;
+
+	/* Protect netdev->reg_state and netif_tx_disable() calls. */
+	rtnl_lock();
+
+	for (i = 0; i < ARRAY_SIZE(eth->ports); i++) {
+		struct airoha_gdm_port *port = eth->ports[i];
+		int j;
+
+		if (!port)
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(port->devs); j++) {
+			struct airoha_gdm_dev *dev = port->devs[j];
+			struct net_device *netdev;
+
+			if (!dev)
+				continue;
+
+			if (dev->qdma != qdma)
+				continue;
+
+			netdev = netdev_from_priv(dev);
+			if (netdev->reg_state != NETREG_REGISTERED)
+				continue;
+
+			netif_tx_disable(netdev);
+		}
+	}
+
+	rtnl_unlock();
+}
+
 static void airoha_qdma_cleanup_tx_queue(struct airoha_queue *q)
 {
 	struct airoha_qdma *qdma = q->qdma;
@@ -1158,13 +1195,20 @@ static void airoha_qdma_cleanup_tx_queue(struct airoha_queue *q)
 	for (i = 0; i < q->ndesc; i++) {
 		struct airoha_queue_entry *e = &q->entry[i];
 		struct airoha_qdma_desc *desc = &q->desc[i];
+		struct sk_buff *skb = e->skb;
 
 		if (!e->dma_addr)
 			continue;
 
 		dma_unmap_single(eth->dev, e->dma_addr, e->dma_len,
 				 DMA_TO_DEVICE);
-		dev_kfree_skb_any(e->skb);
+		if (skb) {
+			struct netdev_queue *txq;
+
+			txq = skb_get_tx_queue(skb->dev, skb);
+			netdev_tx_completed_queue(txq, 1, skb->len);
+			dev_kfree_skb_any(skb);
+		}
 		e->dma_addr = 0;
 		e->skb = NULL;
 		list_add_tail(&e->list, &q->tx_list);
@@ -1527,6 +1571,23 @@ static void airoha_qdma_cleanup(struct airoha_qdma *qdma)
 {
 	int i;
 
+	if (test_bit(DEV_STATE_INITIALIZED, &qdma->eth->state)) {
+		u32 status;
+
+		airoha_qdma_tx_disable(qdma);
+
+		airoha_qdma_clear(qdma, REG_QDMA_GLOBAL_CFG,
+				  GLOBAL_CFG_TX_DMA_EN_MASK |
+				  GLOBAL_CFG_RX_DMA_EN_MASK);
+		if (read_poll_timeout(airoha_qdma_rr, status,
+				      !(status & (GLOBAL_CFG_TX_DMA_BUSY_MASK |
+						  GLOBAL_CFG_RX_DMA_BUSY_MASK)),
+				      USEC_PER_MSEC, 50 * USEC_PER_MSEC, true,
+				      qdma, REG_QDMA_GLOBAL_CFG))
+			dev_warn(qdma->eth->dev,
+				 "QDMA DMA engine busy timeout\n");
+	}
+
 	for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) {
 		if (!qdma->q_rx[i].ndesc)
 			continue;
@@ -1837,9 +1898,6 @@ static int airoha_dev_open(struct net_device *netdev)
 	}
 	port->users++;
 
-	airoha_qdma_set(qdma, REG_QDMA_GLOBAL_CFG,
-			GLOBAL_CFG_TX_DMA_EN_MASK |
-			GLOBAL_CFG_RX_DMA_EN_MASK);
 	qdma->users++;
 
 	if (!airoha_is_lan_gdm_dev(dev) &&
@@ -1880,12 +1938,9 @@ static int airoha_dev_stop(struct net_device *netdev)
 	struct airoha_gdm_dev *dev = netdev_priv(netdev);
 	struct airoha_gdm_port *port = dev->port;
 	struct airoha_qdma *qdma = dev->qdma;
-	int i;
 
 	netif_tx_disable(netdev);
 	airoha_set_vip_for_gdm_port(dev, false);
-	for (i = 0; i < netdev->num_tx_queues; i++)
-		netdev_tx_reset_subqueue(netdev, i);
 
 	if (--port->users)
 		airoha_set_port_mtu(dev->eth, port);
@@ -1893,19 +1948,7 @@ static int airoha_dev_stop(struct net_device *netdev)
 		airoha_set_gdm_port_fwd_cfg(qdma->eth,
 					    REG_GDM_FWD_CFG(port->id),
 					    FE_PSE_PORT_DROP);
-
-	if (!--qdma->users) {
-		airoha_qdma_clear(qdma, REG_QDMA_GLOBAL_CFG,
-				  GLOBAL_CFG_TX_DMA_EN_MASK |
-				  GLOBAL_CFG_RX_DMA_EN_MASK);
-
-		for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) {
-			if (!qdma->q_tx[i].ndesc)
-				continue;
-
-			airoha_qdma_cleanup_tx_queue(&qdma->q_tx[i]);
-		}
-	}
+	qdma->users--;
 
 	return 0;
 }
@@ -3413,8 +3456,12 @@ static int airoha_probe(struct platform_device *pdev)
 	if (err)
 		goto error_netdev_free;
 
-	for (i = 0; i < ARRAY_SIZE(eth->qdma); i++)
+	for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) {
 		airoha_qdma_start_napi(&eth->qdma[i]);
+		airoha_qdma_set(&eth->qdma[i], REG_QDMA_GLOBAL_CFG,
+				GLOBAL_CFG_TX_DMA_EN_MASK |
+				GLOBAL_CFG_RX_DMA_EN_MASK);
+	}
 
 	for_each_child_of_node(pdev->dev.of_node, np) {
 		if (!of_device_is_compatible(np, "airoha,eth-mac"))
@@ -3440,6 +3487,8 @@ static int airoha_probe(struct platform_device *pdev)
 	for (i = 0; i < ARRAY_SIZE(eth->qdma); i++)
 		airoha_qdma_stop_napi(&eth->qdma[i]);
 
+	airoha_hw_cleanup(eth);
+
 	for (i = 0; i < ARRAY_SIZE(eth->ports); i++) {
 		struct airoha_gdm_port *port = eth->ports[i];
 		int j;
@@ -3461,7 +3510,6 @@ static int airoha_probe(struct platform_device *pdev)
 		}
 		airoha_metadata_dst_free(port);
 	}
-	airoha_hw_cleanup(eth);
 error_netdev_free:
 	free_netdev(eth->napi_dev);
 	platform_set_drvdata(pdev, NULL);
@@ -3477,6 +3525,8 @@ static void airoha_remove(struct platform_device *pdev)
 	for (i = 0; i < ARRAY_SIZE(eth->qdma); i++)
 		airoha_qdma_stop_napi(&eth->qdma[i]);
 
+	airoha_hw_cleanup(eth);
+
 	for (i = 0; i < ARRAY_SIZE(eth->ports); i++) {
 		struct airoha_gdm_port *port = eth->ports[i];
 		int j;
@@ -3497,7 +3547,6 @@ static void airoha_remove(struct platform_device *pdev)
 		}
 		airoha_metadata_dst_free(port);
 	}
-	airoha_hw_cleanup(eth);
 
 	free_netdev(eth->napi_dev);
 	platform_set_drvdata(pdev, NULL);

---
base-commit: 7d8297e26b4e20b5d1c3c3fe51fe81a1c7fbc823
change-id: 20260618-airoha-bql-fixes-f57b2d108573

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>


^ permalink raw reply related

* [PATCH net 2/2] net: airoha: fix netif_set_real_num_tx_queues for sparse QoS channels
From: Lorenzo Bianconi @ 2026-06-18  6:00 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Wayen Yan, linux-arm-kernel, linux-mediatek, netdev,
	Lorenzo Bianconi
In-Reply-To: <20260618-airoha-qos-fixes-v1-0-37192652157f@kernel.org>

airoha_tc_htb_alloc_leaf_queue() assigns queue IDs based on the channel
index (opt->qid = AIROHA_NUM_TX_RING + channel), but updates
real_num_tx_queues with a simple increment (num_tx_queues + 1). When QoS
channels are allocated sparsely (e.g., channels 0 and 3 without 1 and
2), the returned qid can exceed real_num_tx_queues, causing out-of-bounds
accesses in the networking stack.
For example, allocating channel 0 then channel 3 results in
real_num_tx_queues = 34 but qid = 35, which is out of range [0, 34).
Fix this by computing real_num_tx_queues based on the highest active
channel index rather than using a simple counter, in both the allocation
and deletion paths.

Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index aa98d1823ab6..e2652cff67c0 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -2789,7 +2789,7 @@ static int airoha_tc_htb_alloc_leaf_queue(struct net_device *netdev,
 					  struct tc_htb_qopt_offload *opt)
 {
 	u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS;
-	int err, num_tx_queues = netdev->real_num_tx_queues;
+	int err, num_tx_queues = AIROHA_NUM_TX_RING + channel + 1;
 	struct airoha_gdm_dev *dev = netdev_priv(netdev);
 	struct airoha_qdma *qdma = dev->qdma;
 
@@ -2806,7 +2806,10 @@ static int airoha_tc_htb_alloc_leaf_queue(struct net_device *netdev,
 	if (err)
 		goto error;
 
-	err = netif_set_real_num_tx_queues(netdev, num_tx_queues + 1);
+	if (num_tx_queues <= netdev->real_num_tx_queues)
+		goto set_qos_sq_bmap;
+
+	err = netif_set_real_num_tx_queues(netdev, num_tx_queues);
 	if (err) {
 		airoha_qdma_set_tx_rate_limit(netdev, channel, 0,
 					      opt->quantum);
@@ -2815,6 +2818,7 @@ static int airoha_tc_htb_alloc_leaf_queue(struct net_device *netdev,
 		goto error;
 	}
 
+set_qos_sq_bmap:
 	set_bit(channel, dev->qos_sq_bmap);
 	opt->qid = AIROHA_NUM_TX_RING + channel;
 
@@ -3003,13 +3007,18 @@ static int airoha_dev_setup_tc_block(struct net_device *dev,
 static void airoha_tc_remove_htb_queue(struct net_device *netdev, int queue)
 {
 	struct airoha_gdm_dev *dev = netdev_priv(netdev);
+	int num_tx_queues = AIROHA_NUM_TX_RING;
 	struct airoha_qdma *qdma = dev->qdma;
 
-	netif_set_real_num_tx_queues(netdev, netdev->real_num_tx_queues - 1);
 	airoha_qdma_set_tx_rate_limit(netdev, queue, 0, 0);
 
 	clear_bit(queue, qdma->qos_channel_map);
 	clear_bit(queue, dev->qos_sq_bmap);
+
+	if (!bitmap_empty(dev->qos_sq_bmap, AIROHA_NUM_QOS_CHANNELS))
+		num_tx_queues += find_last_bit(dev->qos_sq_bmap,
+					       AIROHA_NUM_QOS_CHANNELS) + 1;
+	netif_set_real_num_tx_queues(netdev, num_tx_queues);
 }
 
 static int airoha_tc_htb_delete_leaf_queue(struct net_device *netdev,

-- 
2.54.0


^ permalink raw reply related

* [PATCH net 1/2] net: airoha: Fix off-by-one in airoha_tc_remove_htb_queue()
From: Lorenzo Bianconi @ 2026-06-18  6:00 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Wayen Yan, linux-arm-kernel, linux-mediatek, netdev,
	Lorenzo Bianconi
In-Reply-To: <20260618-airoha-qos-fixes-v1-0-37192652157f@kernel.org>

airoha_tc_htb_alloc_leaf_queue() computes the HTB QoS channel index
as opt->classid % AIROHA_NUM_QOS_CHANNELS and stores it in qos_sq_bmap.
However, airoha_tc_remove_htb_queue() clears the HTB configuration
using queue + 1 as the channel index, causing an off-by-one error.
Use queue directly as the QoS channel index to match the allocation
logic.

Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index 64dde6464f3f..aa98d1823ab6 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -3006,7 +3006,7 @@ static void airoha_tc_remove_htb_queue(struct net_device *netdev, int queue)
 	struct airoha_qdma *qdma = dev->qdma;
 
 	netif_set_real_num_tx_queues(netdev, netdev->real_num_tx_queues - 1);
-	airoha_qdma_set_tx_rate_limit(netdev, queue + 1, 0, 0);
+	airoha_qdma_set_tx_rate_limit(netdev, queue, 0, 0);
 
 	clear_bit(queue, qdma->qos_channel_map);
 	clear_bit(queue, dev->qos_sq_bmap);

-- 
2.54.0


^ permalink raw reply related

* [PATCH net 0/2] airoha: fixes for sched HTB offload support
From: Lorenzo Bianconi @ 2026-06-18  6:00 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Wayen Yan, linux-arm-kernel, linux-mediatek, netdev,
	Lorenzo Bianconi


---
Lorenzo Bianconi (2):
      net: airoha: Fix off-by-one in airoha_tc_remove_htb_queue()
      net: airoha: fix netif_set_real_num_tx_queues for sparse QoS channels

 drivers/net/ethernet/airoha/airoha_eth.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)
---
base-commit: 7d8297e26b4e20b5d1c3c3fe51fe81a1c7fbc823
change-id: 20260618-airoha-qos-fixes-b6460b085680

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>


^ permalink raw reply

* Re: [PATCH bpf v2] bpf, sockmap: fix use-after-free when the stream parser resizes the skb
From: Sechang Lim @ 2026-06-18  5:58 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: John Fastabend, Jakub Sitnicki, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, netdev, bpf,
	linux-kernel
In-Reply-To: <ajMhV7O5YfYbwQzE@devvm29614.prn0.facebook.com>

On Wed, Jun 17, 2026 at 03:36:07PM -0700, Bobby Eshleman wrote:
>On Fri, Jun 12, 2026 at 12:35:51PM +0000, Sechang Lim wrote:
>> sk_psock_strp_parse() runs the BPF_PROG_TYPE_SK_SKB stream-parser program
>> to find the length of the next message. strparser assembles a message out
>> of several received skbs by chaining them onto the head's frag_list and
>> recording where to append the next one in strp->skb_nextp:
>>
>> 	*strp->skb_nextp = skb;
>> 	strp->skb_nextp = &skb->next;
>>
>> and then calls the parser on the head:
>>
>> 	len = (*strp->cb.parse_msg)(strp, head);
>>
>> The parser is only meant to inspect the skb, but the program may call
>> bpf_skb_change_tail() -- or the sibling bpf_skb_pull_data(),
>> bpf_skb_change_head(), bpf_skb_adjust_room(), all allowed for SK_SKB.
>> Once the head carries a frag_list these go
>>
>> 	... -> skb_ensure_writable -> pskb_may_pull -> __pskb_pull_tail
>>
>> and __pskb_pull_tail() frees the frag_list skbs that strparser still
>> tracks through skb_nextp:
>>
>> 	while ((list = skb_shinfo(skb)->frag_list) != insp) {
>> 		skb_shinfo(skb)->frag_list = list->next;
>> 		consume_skb(list);
>> 	}
>>
>> strp->skb_nextp now points into a freed sk_buff. The next segment of
>> the same message arrives in __strp_recv(), which links it with
>> *strp->skb_nextp = skb, an 8-byte write into the freed skb. The free
>> and the write happen in different __strp_recv() calls, so the message
>> has to span at least three segments before it triggers.
>>
>>   BUG: KASAN: slab-use-after-free in __strp_recv+0x447/0xda0
>>   Write of size 8 at addr ffff88810db86140 by task repro/349
>>
>>   Call Trace:
>>    <IRQ>
>>    __strp_recv+0x447/0xda0
>>    __tcp_read_sock+0x13d/0x590
>>    tcp_bpf_strp_read_sock+0x195/0x320
>>    strp_data_ready+0x267/0x340
>>    sk_psock_strp_data_ready+0x1ce/0x350
>>    tcp_data_queue+0x1364/0x2fd0
>>    tcp_rcv_established+0xe07/0x1640
>>    [...]
>>
>>   Allocated by task 349:
>>    skb_clone+0x17b/0x210
>>    __strp_recv+0x2c3/0xda0
>>    __tcp_read_sock+0x13d/0x590
>>    [...]
>>
>>   Freed by task 349:
>>    kmem_cache_free+0x150/0x570
>>    __pskb_pull_tail+0x57b/0xc20
>>    skb_ensure_writable+0x236/0x260
>>    __bpf_skb_change_tail+0x1d4/0x590
>>    sk_skb_change_tail+0x2a/0x40
>>    bpf_prog_1b285dcd6c41373e+0x27/0x30
>>    bpf_prog_run_pin_on_cpu+0xf3/0x260
>>    sk_psock_strp_parse+0x118/0x1e0
>>    __strp_recv+0x4f6/0xda0
>>    [...]
>>
>> The same resize also leaves the head's length inconsistent with its
>> frags, so a later __pskb_pull_tail() can instead hit the
>> BUG_ON(skb_copy_bits(...)) in net/core/skbuff.c.
>>
>> Run the parser on a private clone of the head when the message spans more
>> than one skb and the program can modify the packet
>> (prog->aux->changes_pkt_data), so a resizing helper can only touch the
>> clone and strparser's head and skb_nextp stay valid. Single-skb messages
>> have no frag_list and read-only parsers cannot resize, so both are still
>> parsed in place. If the clone cannot be allocated, return 0 so the caller
>> retries on the next read rather than failing the parser.
>>
>> Fixes: 8a31db561566 ("bpf: add access to sock fields and pkt data from sk_skb programs")
>> Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>
>> ---
>> v2:
>>  - clone only when prog->aux->changes_pkt_data (Bobby Eshleman)
>>  - return 0 on clone failure instead of -ENOMEM (Bobby Eshleman)
>>  - free the clone with consume_skb() instead of kfree_skb()
>>  - drop the unrelated guard(rcu)() change (Bobby Eshleman)
>>
>> v1:
>>  - https://lore.kernel.org/all/20260609112316.3685738-1-rhkrqnwk98@gmail.com/
>>
>>  net/core/skmsg.c | 26 +++++++++++++++++++++++---
>>  1 file changed, 23 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
>> index e1850caf1a71..97e5bc5f38c3 100644
>> --- a/net/core/skmsg.c
>> +++ b/net/core/skmsg.c
>> @@ -1149,9 +1149,29 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
>>  	rcu_read_lock();
>>  	prog = READ_ONCE(psock->progs.stream_parser);
>>  	if (likely(prog)) {
>> -		skb->sk = psock->sk;
>> -		ret = bpf_prog_run_pin_on_cpu(prog, skb);
>> -		skb->sk = NULL;
>> +		struct sk_buff *parse_skb = skb;
>> +
>> +		/*
>> +		 * strparser chains the message skbs through skb->frag_list and
>> +		 * keeps a pointer into that list in strp->skb_nextp.  The parser
>> +		 * program may call bpf_skb_change_tail() and friends, which go
>> +		 * through __pskb_pull_tail() and free the frag_list skbs that
>> +		 * strparser still tracks.  Run the program on a clone when the head
>> +		 * has a frag_list and the program can modify the packet, so it
>> +		 * cannot drop frags strparser owns.
>> +		 */
>> +		if (skb_has_frag_list(skb) && prog->aux->changes_pkt_data) {
>> +			parse_skb = skb_clone(skb, GFP_ATOMIC);
>> +			if (!parse_skb) {
>> +				rcu_read_unlock();
>> +				return 0;
>> +			}
>> +		}
>> +		parse_skb->sk = psock->sk;
>> +		ret = bpf_prog_run_pin_on_cpu(prog, parse_skb);
>> +		parse_skb->sk = NULL;
>> +		if (parse_skb != skb)
>> +			consume_skb(parse_skb);
>>  	}
>>  	rcu_read_unlock();
>>  	return ret;
>> --
>> 2.43.0
>>
>I'm still on the fence about "return 0" vs ENOMEM. I hate to flip-flop
>on you here, but now I'm not sure if it is worth the complication to
>return 0 since we're really only buying a single timer interval in which
>we need 1) suddenly more memory to alloc the clone, and 2) another data
>ready event to cause the stream parsing to pick up again. If any one
>doesn't happen, the end result is the same. Not sure it's a good
>trade-off for the complexity of basically tricking the caller with the
>zero return. Maybe let's go back to ENOMEM?
>

Per Kuniyuki's and Jiayuan's suggestion, v3 will reject a packet-modifying
stream parser at attach time instead of runtime, so the return-0 vs
ENOMEM question goes away with that code.

>BTW, based on the comm name "repro", it sounds like you have a decent
>reproducer for this. I wonder if it is possible to add something to the
>selftests to catch this?
>

I will add a selftest in v3.

Best,
Sechang

^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH net v2 2/2] ice: dpll: fix memory leak in ice_dpll_init_info error paths
From: Rinitha, SX @ 2026-06-18  5:52 UTC (permalink / raw)
  To: ZhaoJinming, Nguyen, Anthony L, Kitszel, Przemyslaw, Andrew Lunn,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20260529053733.764996-3-zhaojinming@uniontech.com>

> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of ZhaoJinming
> Sent: 29 May 2026 11:08
> To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw <przemyslaw.kitszel@intel.com>; Andrew Lunn <andrew+netdev@lunn.ch>; David S . Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>
> Cc: intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; ZhaoJinming <zhaojinming@uniontech.com>
> Subject: [Intel-wired-lan] [PATCH net v2 2/2] ice: dpll: fix memory leak in ice_dpll_init_info error paths
>
> Several error return paths in ice_dpll_init_info() directly return without freeing previously allocated resources, causing memory leaks:
>
> - When de->input_prio allocation fails, d->inputs is leaked
> - When dp->input_prio allocation fails, d->inputs and de->input_prio
>  are leaked
> - When ice_get_cgu_rclk_pin_info() fails, all previously allocated
>  inputs/outputs/input_prio are leaked
> - When ice_dpll_init_pins_info(RCLK_INPUT) fails, same resources
>  are leaked
>
> Fix this by jumping to the deinit_info label which properly calls
> ice_dpll_deinit_info() to free all allocated resources.
>
> Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu")
> Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
> ---
> drivers/net/ethernet/intel/ice/ice_dpll.c | 16 ++++++++++------
> 1 file changed, 10 insertions(+), 6 deletions(-)
>

Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)

^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH net v2 1/2] ice: dpll: set pointers to NULL after kfree in ice_dpll_deinit_info
From: Rinitha, SX @ 2026-06-18  5:52 UTC (permalink / raw)
  To: ZhaoJinming, Nguyen, Anthony L, Kitszel, Przemyslaw, Andrew Lunn,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20260529053733.764996-2-zhaojinming@uniontech.com>

> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf Of ZhaoJinming
> Sent: 29 May 2026 11:08
> To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw <przemyslaw.kitszel@intel.com>; Andrew Lunn <andrew+netdev@lunn.ch>; David S . Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>
> Cc: intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; ZhaoJinming <zhaojinming@uniontech.com>
> Subject: [Intel-wired-lan] [PATCH net v2 1/2] ice: dpll: set pointers to NULL after kfree in ice_dpll_deinit_info
>
> ice_dpll_deinit_info() calls kfree() on several pf->dplls fields (inputs, outputs, eec.input_prio, pps.input_prio) but does not set the pointers to NULL afterward. This leaves dangling pointers in the
> pf->dplls structure.
>
> While not currently exploitable through existing code paths, this is unsafe because:
>
> 1. If ice_dpll_init_info() is called again after a deinit (e.g. during
>   driver recovery), and a subsequent allocation within init fails, the
>   error path will jump to deinit_info and call ice_dpll_deinit_info()
>   again. Since some pointers still hold the old freed addresses, this
>   would result in a double-free.
>
> 2. Any future code that checks these pointers before use or after free
>   would be unprotected against use-after-free.
>
> Follow the common kernel convention of setting pointers to NULL after
> kfree() so that:
> - kfree(NULL) is a safe no-op, preventing double-free
> - NULL checks on these pointers become meaningful
>
> This is a preparatory fix for a subsequent patch that routes additional error paths in ice_dpll_init_info() to the deinit_info label.
>
> Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu")
> Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
> ---
> drivers/net/ethernet/intel/ice/ice_dpll.c | 4 ++++
> 1 file changed, 4 insertions(+)
>

Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)

^ permalink raw reply

* Re: [linux-next:master] [selftests]  a3f88d89f6: kernel-selftests-bpf.net.test_bridge_neigh_suppress.sh.arping.fail
From: Oliver Sang @ 2026-06-18  5:43 UTC (permalink / raw)
  To: Danielle Ratson
  Cc: oe-lkp@lists.linux.dev, lkp@intel.com, Jakub Kicinski,
	Nikolay Aleksandrov, netdev@vger.kernel.org, oliver.sang
In-Reply-To: <SJ2PR12MB9008502979A1D3566074A273D8E42@SJ2PR12MB9008.namprd12.prod.outlook.com>

hi, Danielle,

On Wed, Jun 17, 2026 at 07:29:48AM +0000, Danielle Ratson wrote:
> Hi Oliver,
> 
> Thank you for confirming the arping version. ARPing 2.25 by Thomas Habets has incompatible semantics for several flags that the test relies on (-D, -b, -U, -A).
> So, the failures are a tool version issue rather than a kernel regression.
> 
> This is not limited to the new commit: looking at the 56 failures in the added log, the other arping-based test cases that existed before commit a3f88d89f698 are also failing, which confirms the root cause.
> 
> The same assumption (iputils arping) is made by other net selftests as well:
> test_vxlan_nh.sh, forwarding/vxlan_asymmetric.sh, and arp_ndisc_untracked_subnets.sh all use iputils-specific flags.
> 
> There are two options to address this on your end:
> 
> 1. Install iputils-arping and ensure it takes precedence over ARPing 2.25 in PATH. Running "arping -V" should then show "iputils".

thanks a lot for the information and guidance! we will definitely install
iputils-arping.

> 2. If that is not feasible, I can send a fix that adds an iputils version check to test_bridge_neigh_suppress.sh, causing it to SKIP cleanly when iputils arping is not present:
> 
> if ! arping -V 2>&1 | grep -q "iputils"; then
>      echo "SKIP: Test requires iputils arping"
>      exit $ksft_skip
> fi
> 
> This will result in a SKIP rather than a FAIL for your environment.

no need to make a fix for our env IMHO. this kernel test robot's purpose is to
bisect regressions and report the fbc (first bad commit) to the linux kernel
community to help developers improve kernel code quality. we need to enable as
many test cases as possible for this purpose. so we will try option #1 above :)

thanks a lot!


> 
> Thanks,
> Danielle
> 
> > -----Original Message-----
> > From: Oliver Sang <oliver.sang@intel.com>
> > Sent: Monday, 15 June 2026 16:03
> > To: Danielle Ratson <danieller@nvidia.com>
> > Cc: oe-lkp@lists.linux.dev; lkp@intel.com; Jakub Kicinski <kuba@kernel.org>;
> > Nikolay Aleksandrov <razor@blackwall.org>; netdev@vger.kernel.org;
> > oliver.sang@intel.com
> > Subject: Re: [linux-next:master] [selftests] a3f88d89f6: kernel-selftests-
> > bpf.net.test_bridge_neigh_suppress.sh.arping.fail
> > 
> > hi, Danielle,
> > 
> > On Thu, Jun 11, 2026 at 11:44:39AM +0000, Danielle Ratson wrote:
> > > Hi Oliver,
> > >
> > > Thank you for the report.
> > >
> > > The failures appear to be caused by an arping tool version mismatch.
> > > The test was written assuming iputils arping semantics, but not all
> > distributions ship that version. Different arping implementations have
> > incompatible behavior for the flags used throughout
> > test_bridge_neigh_suppress.sh.
> > >
> > > Looking at the added log, the 56 failures are not limited to the
> > neigh_suppress_arp_probe section.
> > > The other arping-based test cases in the file are also affected, which is
> > consistent with a tool version issue rather than a kernel regression.
> > >
> > > To confirm the root cause on your end, please share the results for running
> > the below:
> > > $ arping -V
> > > $ ./test_bridge_neigh_suppress.sh -t neigh_suppress_arp -v
> > 
> > sorry for late.
> > 
> > our tests run in an automated framework, I had to add some code to print the
> > above information, but so far, it just generates the output below.
> > before we try further, we want to seek your advice on whether this
> > information is enough?
> > 
> > KERNEL SELFTESTS: linux_headers_dir is /usr/src/linux-headers-x86_64-rhel-
> > 9.4-bpf-a3f88d89f698743a8cd91fb43f997e2d292a168d
> > ### arping -V
> > arping: option requires an argument -- 'V'
> > ARPing 2.25, by Thomas Habets <thomas@habets.se>
> > usage: arping [ -0aAbdDeFpPqrRuUvzZ ] [ -w <sec> ] [ -W <sec> ] [ -S <host/ip>
> > ]
> >               [ -T <host/ip ] [ -s <MAC> ] [ -t <MAC> ] [ -c <count> ]
> >               [ -C <count> ] [ -i <interface> ] [ -m <type> ] [ -g <group> ]
> >               [ -V <vlan> ] [ -Q <priority> ] <host/ip/MAC | -B> For complete usage
> > info, use --help or check the manpage.
> > ### ./test_bridge_neigh_suppress.sh -t neigh_suppress_arp -v
> >                                                 <-------- seems there is no output here
> > Per-port ARP suppression - VLAN 10              <-------- seems already start the
> > tests
> > ----------------------------------
> > COMMAND: tc -n sw1-U1mYwE qdisc replace dev vx0 clsact
> > 
> > 
> > >
> > > Thanks,
> > > Danielle
> > >
> > > > -----Original Message-----
> > > > From: kernel test robot <oliver.sang@intel.com>
> > > > Sent: Thursday, 11 June 2026 10:23
> > > > To: Danielle Ratson <danieller@nvidia.com>
> > > > Cc: oe-lkp@lists.linux.dev; lkp@intel.com; Jakub Kicinski
> > > > <kuba@kernel.org>; Nikolay Aleksandrov <razor@blackwall.org>;
> > > > netdev@vger.kernel.org; oliver.sang@intel.com
> > > > Subject: [linux-next:master] [selftests] a3f88d89f6:
> > > > kernel-selftests- bpf.net.test_bridge_neigh_suppress.sh.arping.fail
> > > >
> > > >
> > > > hi, Danielle Ratson,
> > > >
> > > > for the newly added tests, we still found some failures in our tests; not
> > > > sure if we missed any dependencies? thanks
> > > >
> > > >
> > > > Hello,
> > > >
> > > > kernel test robot noticed "kernel-selftests-
> > > > bpf.net.test_bridge_neigh_suppress.sh.arping.fail" on:
> > > >
> > > > commit: a3f88d89f698743a8cd91fb43f997e2d292a168d ("selftests: net:
> > > > Add tests for ARP probe and DAD NS handling")
> > > > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git
> > > > master
> > > >
> > > > in testcase: kernel-selftests-bpf
> > > > version:
> > > > with following parameters:
> > > >
> > > > 	group: net
> > > >
> > > >
> > > > config: x86_64-rhel-9.4-bpf
> > > > compiler: gcc-14
> > > > test machine: 16 threads Intel(R) Core(TM) i7-13620H (Raptor Lake)
> > > > with 32G memory
> > > >
> > > > (please refer to attached dmesg/kmsg for entire log/backtrace)
> > > >
> > > >
> > > >
> > > > If you fix the issue in a separate patch/commit (i.e. not just a new
> > > > version of the same patch/commit), kindly add following tags
> > > > | Reported-by: kernel test robot <oliver.sang@intel.com>
> > > > | Closes:
> > > > | https://lore.kernel.org/oe-lkp/202606110955.8f29025d-lkp@intel.com
> > > >
> > > >
> > > > # timeout set to 3600
> > > > # selftests: net: test_bridge_neigh_suppress.sh #
> > > >
> > > > [...]
> > > >
> > > > #
> > > > # Per-port ARP probe suppression
> > > > # ------------------------------
> > > > # TEST: ARP probe suppression                                         [FAIL]
> > > > # TEST: "neigh_suppress" is on                                        [ OK ]
> > > > # TEST: ARP probe suppression                                         [FAIL]
> > > > # TEST: FDB and neighbor entry installation                           [ OK ]
> > > > # TEST: arping                                                        [FAIL]
> > > > # TEST: ARP probe suppression                                         [FAIL]
> > > > # TEST: neighbor removal                                              [ OK ]
> > > > # TEST: ARP probe suppression                                         [FAIL]
> > > > # TEST: "neigh_suppress" is off                                       [ OK ]
> > > > # TEST: ARP probe suppression                                         [FAIL]
> > > > #
> > > > # Per-port DAD NS suppression
> > > > # ---------------------------
> > > > # TEST: DAD NS suppression                                            [ OK ]
> > > > # TEST: "neigh_suppress" is on                                        [ OK ]
> > > > # TEST: DAD NS suppression                                            [ OK ]
> > > > # TEST: FDB and neighbor entry installation                           [ OK ]
> > > > # TEST: DAD NS suppression                                            [ OK ]
> > > > # TEST: DAD NS proxy NA reply                                         [ OK ]
> > > > # TEST: neighbor removal                                              [ OK ]
> > > > # TEST: DAD NS suppression                                            [ OK ]
> > > > # TEST: "neigh_suppress" is off                                       [ OK ]
> > > > # TEST: DAD NS suppression                                            [ OK ]
> > > > #
> > > > # Tests passed: 124
> > > > # Tests failed:  56
> > > > not ok 110 selftests: net: test_bridge_neigh_suppress.sh # exit=1
> > > >
> > > >
> > > >
> > > > The kernel config and materials to reproduce are available at:
> > > > https://download.01.org/0day-
> > > > ci/archive/20260611/202606110955.8f29025d-lkp@intel.com
> > > >
> > > >
> > > >
> > > > --
> > > > 0-DAY CI Kernel Test Service
> > > > https://github.com/intel/lkp-tests/wiki
> > >

^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH] idpf: bound interrupt-vector register fill to the allocated array
From: Loktionov, Aleksandr @ 2026-06-18  5:22 UTC (permalink / raw)
  To: Michael Bommarito, Nguyen, Anthony L, Kitszel, Przemyslaw,
	Hay, Joshua A, Pavan Kumar Linga, Andrew Lunn, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20260617215754.1117178-1-michael.bommarito@gmail.com>



> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> Of Michael Bommarito
> Sent: Wednesday, June 17, 2026 11:58 PM
> To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel,
> Przemyslaw <przemyslaw.kitszel@intel.com>; Hay, Joshua A
> <joshua.a.hay@intel.com>; Pavan Kumar Linga
> <pavan.kumar.linga@intel.com>; Andrew Lunn <andrew+netdev@lunn.ch>;
> David S . Miller <davem@davemloft.net>; Eric Dumazet
> <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> <pabeni@redhat.com>
> Cc: intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: [Intel-wired-lan] [PATCH] idpf: bound interrupt-vector
> register fill to the allocated array
> 
> idpf_get_reg_intr_vecs() fills the caller-allocated reg_vals[] array
> from the VIRTCHNL2_OP_ALLOC_VECTORS reply in adapter->req_vec_chunks,
> bounding its inner loop only by the per-chunk num_vectors. The array
> is sized
> separately: idpf_intr_reg_init() allocates kzalloc_objs(struct
> idpf_vec_regs, total_vecs) from caps.num_allocated_vectors and only
> checks the returned count after the fill. The sum of per-chunk
> num_vectors is never reconciled against total_vecs, so a reply with a
> small num_allocated_vectors but chunks summing higher writes past the
> end of reg_vals[].
> 
> Impact: a control plane (a PF or hypervisor device model) that returns
> a VIRTCHNL2_OP_ALLOC_VECTORS reply whose per-chunk num_vectors sum
> exceeds num_allocated_vectors writes struct idpf_vec_regs entries past
> the end of the reg_vals kmalloc allocation (KASAN slab-out-of-bounds
> write).
> 
> Bound the fill loop to the array capacity passed in by the callers,
> mirroring the sibling idpf_vport_get_q_reg(). The existing num_regs <
> num_vecs check then rejects an undersized reply without the out-of-
> bounds write happening first.
> 
> Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport")
> Assisted-by: Claude:claude-opus-4-7
> Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
> ---
> The reply originates from the control plane (a PF or hypervisor device
> model), which is trusted in a standard deployment, so this is a
> defense-in-depth / robustness fix: it bounds a malformed or internally
> inconsistent ALLOC_VECTORS reply. It is a genuine trust-boundary
> crossing only where the guest distrusts the control plane (a
> confidential VM or an Intel IPU posture) or the control plane is
> simply buggy. It is not remotely or unprivileged-reachable.
> 
> Reproduced with a KUnit harness that calls the unmodified
> idpf_get_reg_intr_vecs() against a crafted req_vec_chunks reply
> (num_allocated_vectors = 1, four chunks of sixteen vectors) under
> KASAN:
> stock reports a slab-out-of-bounds write 0 bytes past a 12-byte
> kmalloc-16 object and the test fails; the patched build is KASAN-
> clean; a well-formed 64-vector reply still fills 64 entries on both.
> The KUnit wiring is repro-only scaffolding, not part of this patch;
> harness on request.
> 
>  drivers/net/ethernet/intel/idpf/idpf_dev.c      | 2 +-
>  drivers/net/ethernet/intel/idpf/idpf_vf_dev.c   | 2 +-
>  drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 5 +++--
> drivers/net/ethernet/intel/idpf/idpf_virtchnl.h | 2 +-
>  4 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c
> b/drivers/net/ethernet/intel/idpf/idpf_dev.c
> index 1a0c71c95ef12..4079a787657f1 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_dev.c
> +++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c
> @@ -87,7 +87,7 @@ static int idpf_intr_reg_init(struct idpf_vport
> *vport,
>  	if (!reg_vals)
>  		return -ENOMEM;
> 
> -	num_regs = idpf_get_reg_intr_vecs(adapter, reg_vals);
> +	num_regs = idpf_get_reg_intr_vecs(adapter, reg_vals,
> total_vecs);
>  	if (num_regs < num_vecs) {
>  		err = -EINVAL;
>  		goto free_reg_vals;
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
> b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
> index a07d7e808ca9b..6726084f6cfa0 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
> +++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
> @@ -86,7 +86,7 @@ static int idpf_vf_intr_reg_init(struct idpf_vport
> *vport,
>  	if (!reg_vals)
>  		return -ENOMEM;
> 
> -	num_regs = idpf_get_reg_intr_vecs(adapter, reg_vals);
> +	num_regs = idpf_get_reg_intr_vecs(adapter, reg_vals,
> total_vecs);
>  	if (num_regs < num_vecs) {
>  		err = -EINVAL;
>  		goto free_reg_vals;
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> index be66f9b2e101c..ec7330603ff84 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> @@ -1318,11 +1318,12 @@ idpf_vport_init_queue_reg_chunks(struct
> idpf_vport_config *vport_config,
>   * idpf_get_reg_intr_vecs - Get vector queue register offset
>   * @adapter: adapter structure to get the vector chunks
>   * @reg_vals: Register offsets to store in
> + * @num_vecs: number of entries the @reg_vals array can hold
>   *
>   * Return: number of registers that got populated
>   */
>  int idpf_get_reg_intr_vecs(struct idpf_adapter *adapter,
> -			   struct idpf_vec_regs *reg_vals)
> +			   struct idpf_vec_regs *reg_vals, int num_vecs)
>  {
>  	struct virtchnl2_vector_chunks *chunks;
>  	struct idpf_vec_regs reg_val;
> @@ -1346,7 +1347,7 @@ int idpf_get_reg_intr_vecs(struct idpf_adapter
> *adapter,
>  		dynctl_reg_spacing = le32_to_cpu(chunk-
> >dynctl_reg_spacing);
>  		itrn_reg_spacing = le32_to_cpu(chunk->itrn_reg_spacing);
> 
> -		for (i = 0; i < num_vec; i++) {
> +		for (i = 0; i < num_vec && num_regs < num_vecs; i++) {
>  			reg_vals[num_regs].dyn_ctl_reg =
> reg_val.dyn_ctl_reg;
>  			reg_vals[num_regs].itrn_reg = reg_val.itrn_reg;
>  			reg_vals[num_regs].itrn_index_spacing = diff --
> git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
> b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
> index 6876e3ed9d1be..9b1c9c86f6eac 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
> +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
> @@ -104,7 +104,7 @@ int idpf_vc_core_init(struct idpf_adapter
> *adapter);  void idpf_vc_core_deinit(struct idpf_adapter *adapter);
> 
>  int idpf_get_reg_intr_vecs(struct idpf_adapter *adapter,
> -			   struct idpf_vec_regs *reg_vals);
> +			   struct idpf_vec_regs *reg_vals, int num_vecs);
>  int idpf_queue_reg_init(struct idpf_vport *vport,
>  			struct idpf_q_vec_rsrc *rsrc,
>  			struct idpf_queue_id_reg_info *chunks);
> --
> 2.53.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>


^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH net v2] ice: Fix use-after-scope in ice_sched_add_nodes_to_layer()
From: Loktionov, Aleksandr @ 2026-06-18  5:21 UTC (permalink / raw)
  To: NeKon69, Nguyen, Anthony L, Kitszel, Przemyslaw
  Cc: andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, horms@kernel.org,
	Kwapulinski, Piotr, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260617072155.1172432-1-nobodqwe@gmail.com>



> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> Of NeKon69
> Sent: Wednesday, June 17, 2026 9:22 AM
> To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel,
> Przemyslaw <przemyslaw.kitszel@intel.com>
> Cc: andrew+netdev@lunn.ch; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; horms@kernel.org; Kwapulinski,
> Piotr <piotr.kwapulinski@intel.com>; intel-wired-lan@lists.osuosl.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; NeKon69
> <nobodqwe@gmail.com>
> Subject: [Intel-wired-lan] [PATCH net v2] ice: Fix use-after-scope in
> ice_sched_add_nodes_to_layer()
> 
> Commit 7fb09a737536 ("ice: Modify recursive way of adding nodes")
> changed ice_sched_add_nodes_to_layer() from recursive control flow to
> an iterative loop.
> 
> Inside the loop, first_teid_ptr may be set to the address of a block-
> local variable:
> 
>     u32 temp;
>     ...
>     if (num_added)
>         first_teid_ptr = &temp;
> 
> On the next loop iteration, first_teid_ptr may be passed to
> ice_sched_add_nodes_to_hw_layer(), after temp from the previous
> iteration has gone out of scope.
> 
> Instead of keeping temporary storage for later calls, allow
> first_node_teid to be NULL when the caller does not need the TEID.
> 
> This was found by Clang with LifetimeSafety enabled while testing C
> language support on a Linux allmodconfig build.
> 
> Fixes: 7fb09a737536 ("ice: Modify recursive way of adding nodes")
> Link: https://github.com/llvm/llvm-project/pull/203270
> Signed-off-by: NeKon69 <nobodqwe@gmail.com>
> ---
> v2:
> - Allow first_node_teid to be NULL when callers do not need the TEID.
> - Pass NULL after the first TEID has already been returned instead of
> using
>   temporary stack storage.
> - Update kernel-doc for helpers accepting NULL.
> - Link to v1: https://lore.kernel.org/netdev/20260613101440.80190-1-
> nobodqwe@gmail.com/
> - Compile-tested with:
>   make drivers/net/ethernet/intel/ice/ice_sched.o
> 
>  drivers/net/ethernet/intel/ice/ice_sched.c | 16 +++++++---------
>  1 file changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c
> b/drivers/net/ethernet/intel/ice/ice_sched.c
> index fff0c1afdb41..89e191c839b1 100644
> --- a/drivers/net/ethernet/intel/ice/ice_sched.c
> +++ b/drivers/net/ethernet/intel/ice/ice_sched.c
> @@ -895,7 +895,8 @@ void ice_sched_cleanup_all(struct ice_hw *hw)
>   * @layer: layer number to add nodes
>   * @num_nodes: number of nodes
>   * @num_nodes_added: pointer to num nodes added
> - * @first_node_teid: if new nodes are added then return the TEID of
> first node
> + * @first_node_teid: if new nodes are added then return the TEID of
> first node,
> + *                   may be NULL
>   * @prealloc_nodes: preallocated nodes struct for software DB
>   *
>   * This function add nodes to HW as well as to SW DB for a given
> layer @@ -1000,7 +1001,7 @@ ice_sched_add_elems(struct ice_port_info
> *pi, struct ice_sched_node *tc_node,
>  		if (!pi->sib_head[tc_node->tc_num][layer])
>  			pi->sib_head[tc_node->tc_num][layer] = new_node;
> 
> -		if (i == 0)
> +		if (first_node_teid && i == 0)
>  			*first_node_teid = teid;
>  	}
> 
> @@ -1015,7 +1016,7 @@ ice_sched_add_elems(struct ice_port_info *pi,
> struct ice_sched_node *tc_node,
>   * @parent: pointer to parent node
>   * @layer: layer number to add nodes
>   * @num_nodes: number of nodes to be added
> - * @first_node_teid: pointer to the first node TEID
> + * @first_node_teid: pointer to the first node TEID, may be NULL
>   * @num_nodes_added: pointer to number of nodes added
>   *
>   * Add nodes into specific HW layer.
> @@ -1078,7 +1079,6 @@ ice_sched_add_nodes_to_layer(struct
> ice_port_info *pi,
>  	*num_nodes_added = 0;
>  	while (*num_nodes_added < num_nodes) {
>  		u16 max_child_nodes, num_added = 0;
> -		u32 temp;
> 
>  		status = ice_sched_add_nodes_to_hw_layer(pi, tc_node,
> parent,
>  							 layer,
> 	new_num_nodes,
> @@ -1109,13 +1109,11 @@ ice_sched_add_nodes_to_layer(struct
> ice_port_info *pi,
>  			 * try the next available sibling.
>  			 */
>  			parent = ice_sched_find_next_vsi_node(parent);
> -			/* Don't modify the first node TEID memory if the
> -			 * first node was added already in the above
> call.
> -			 * Instead send some temp memory for all other
> -			 * recursive calls.
> +			/* Don't modify the first node TEID memory if the
> first node
> +			 * was added already in the above call.
>  			 */
>  			if (num_added)
> -				first_teid_ptr = &temp;
> +				first_teid_ptr = NULL;
> 
>  			new_num_nodes = num_nodes - *num_nodes_added;
>  		}
> --
> 2.54.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>


^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH net v2] ice: eswitch: fix use-after-free of metadata_dst in repr release
From: Loktionov, Aleksandr @ 2026-06-18  5:20 UTC (permalink / raw)
  To: Doruk Tan Ozturk, Nguyen, Anthony L, Kitszel, Przemyslaw,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com
  Cc: michal.swiatkowski@linux.intel.com, Drewek, Wojciech,
	horms@kernel.org, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
In-Reply-To: <20260617100556.83620-1-doruk@0sec.ai>



> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> Of Doruk Tan Ozturk
> Sent: Wednesday, June 17, 2026 12:06 PM
> To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel,
> Przemyslaw <przemyslaw.kitszel@intel.com>; andrew+netdev@lunn.ch;
> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com
> Cc: michal.swiatkowski@linux.intel.com; Drewek, Wojciech
> <wojciech.drewek@intel.com>; horms@kernel.org; intel-wired-
> lan@lists.osuosl.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; Doruk Tan Ozturk <doruk@0sec.ai>;
> stable@vger.kernel.org
> Subject: [Intel-wired-lan] [PATCH net v2] ice: eswitch: fix use-after-
> free of metadata_dst in repr release
> 
> ice_eswitch_release_repr() frees the port representor metadata_dst via
> metadata_dst_free(), which directly kfree()s the object and ignores
> the dst_entry refcount. The eswitch slow-path TX routine
> ice_eswitch_port_start_xmit() takes a reference on this dst with
> dst_hold() and attaches it to the skb via skb_dst_set(). If such an
> skb is still in flight (e.g. queued in a qdisc) when the representor
> is torn down, the metadata_dst is freed while the skb still points at
> it. When the skb is later freed, dst_release() operates on already-
> freed memory.
> 
> Replace metadata_dst_free() with dst_release() so the metadata_dst is
> freed only after the last reference is dropped. The dst subsystem
> frees metadata_dst objects from dst_destroy() once the refcount
> reaches zero (DST_METADATA is set by metadata_dst_alloc()).
> 
> Same class of bug and fix as commit c32b26aaa2f9 ("netfilter:
> nft_tunnel: fix use-after-free on object destroy").
> 
> Fixes: 1a1c40df2e80 ("ice: set and release switchdev environment")
> Cc: stable@vger.kernel.org
> Signed-off-by: Doruk Tan Ozturk <doruk@0sec.ai>
> Reviewed-by: Simon Horman <horms@kernel.org>
> ---
>  v2:
>   - Correct the Fixes: tag to the commit that introduced the switchdev
>     teardown (Simon Horman); add his Reviewed-by. No functional
> change.
>  v1: https://lore.kernel.org/netdev/20260615140532.52676-1-
> doruk@0sec.ai/
> 
>  drivers/net/ethernet/intel/ice/ice_eswitch.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c
> b/drivers/net/ethernet/intel/ice/ice_eswitch.c
> index 2e4f0969035f..41b30a7ca4a9 100644
> --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c
> +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c
> @@ -95,7 +95,7 @@ ice_eswitch_release_repr(struct ice_pf *pf, struct
> ice_repr *repr)
>  		return;
> 
>  	ice_vsi_update_security(vsi, ice_vsi_ctx_set_antispoof);
> -	metadata_dst_free(repr->dst);
> +	dst_release(&repr->dst->dst);
>  	repr->dst = NULL;
>  	ice_fltr_add_mac_and_broadcast(vsi, repr->parent_mac,
>  				       ICE_FWD_TO_VSI);
> --
> 2.43.0

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>


^ permalink raw reply

* Re: [PATCH bpf v2] bpf, sockmap: fix use-after-free when the stream parser resizes the skb
From: Sechang Lim @ 2026-06-18  5:19 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: John Fastabend, Jakub Sitnicki, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Bobby Eshleman, netdev,
	bpf, linux-kernel
In-Reply-To: <04931588-e708-40d8-a1b7-3700a1a3b376@linux.dev>

On Thu, Jun 18, 2026 at 10:45:02AM +0800, Jiayuan Chen wrote:
>
>On 6/12/26 8:35 PM, Sechang Lim wrote:
>>sk_psock_strp_parse() runs the BPF_PROG_TYPE_SK_SKB stream-parser program
>>to find the length of the next message. strparser assembles a message out
>>of several received skbs by chaining them onto the head's frag_list and
>>recording where to append the next one in strp->skb_nextp:
>>
>>	*strp->skb_nextp = skb;
>>	strp->skb_nextp = &skb->next;
>>
>>and then calls the parser on the head:
>>
>>	len = (*strp->cb.parse_msg)(strp, head);
>>
>>The parser is only meant to inspect the skb, but the program may call
>>bpf_skb_change_tail() -- or the sibling bpf_skb_pull_data(),
>>bpf_skb_change_head(), bpf_skb_adjust_room(), all allowed for SK_SKB.
>>Once the head carries a frag_list these go
>>
>>	... -> skb_ensure_writable -> pskb_may_pull -> __pskb_pull_tail
>>
>>and __pskb_pull_tail() frees the frag_list skbs that strparser still
>>tracks through skb_nextp:
>>
>>	while ((list = skb_shinfo(skb)->frag_list) != insp) {
>>		skb_shinfo(skb)->frag_list = list->next;
>>		consume_skb(list);
>>	}
>>
>>strp->skb_nextp now points into a freed sk_buff. The next segment of
>>the same message arrives in __strp_recv(), which links it with
>>*strp->skb_nextp = skb, an 8-byte write into the freed skb. The free
>>and the write happen in different __strp_recv() calls, so the message
>>has to span at least three segments before it triggers.
>>
>>   BUG: KASAN: slab-use-after-free in __strp_recv+0x447/0xda0
>>   Write of size 8 at addr ffff88810db86140 by task repro/349
>>
>>   Call Trace:
>>    <IRQ>
>>    __strp_recv+0x447/0xda0
>>    __tcp_read_sock+0x13d/0x590
>>    tcp_bpf_strp_read_sock+0x195/0x320
>>    strp_data_ready+0x267/0x340
>>    sk_psock_strp_data_ready+0x1ce/0x350
>>    tcp_data_queue+0x1364/0x2fd0
>>    tcp_rcv_established+0xe07/0x1640
>>    [...]
>>
>>   Allocated by task 349:
>>    skb_clone+0x17b/0x210
>>    __strp_recv+0x2c3/0xda0
>>    __tcp_read_sock+0x13d/0x590
>>    [...]
>>
>>   Freed by task 349:
>>    kmem_cache_free+0x150/0x570
>>    __pskb_pull_tail+0x57b/0xc20
>>    skb_ensure_writable+0x236/0x260
>>    __bpf_skb_change_tail+0x1d4/0x590
>>    sk_skb_change_tail+0x2a/0x40
>>    bpf_prog_1b285dcd6c41373e+0x27/0x30
>>    bpf_prog_run_pin_on_cpu+0xf3/0x260
>>    sk_psock_strp_parse+0x118/0x1e0
>>    __strp_recv+0x4f6/0xda0
>>    [...]
>>
>>The same resize also leaves the head's length inconsistent with its
>>frags, so a later __pskb_pull_tail() can instead hit the
>>BUG_ON(skb_copy_bits(...)) in net/core/skbuff.c.
>>
>>Run the parser on a private clone of the head when the message spans more
>>than one skb and the program can modify the packet
>>(prog->aux->changes_pkt_data), so a resizing helper can only touch the
>>clone and strparser's head and skb_nextp stay valid. Single-skb messages
>>have no frag_list and read-only parsers cannot resize, so both are still
>>parsed in place. If the clone cannot be allocated, return 0 so the caller
>>retries on the next read rather than failing the parser.
>>
>>Fixes: 8a31db561566 ("bpf: add access to sock fields and pkt data from sk_skb programs")
>
>
>Please consider Kuniyuki Iwashima's suggestion.
>
>But it only covers the ATTACH path; the other two paths should be 
>covered as well:
>
>- BPF_PROG_ATTACH → sock_map_get_from_fd → sock_map_prog_update
>- BPF_LINK_CREATE → sock_map_link_create → sock_map_prog_update
>- replace prog → sock_map_link_update_prog
>
>A new helper for this check is probably needed, called from both
>
>sock_map_prog_update() and sock_map_link_update_prog().
>

Thanks, agreed. v3 will cover prog attach, link create and link update.

>
>Since this rejects the program at attach time rather than fixing a 
>runtime crash,
>
>I'm not sure a Fixes tag is appropriate here - thoughts?
>

I'd keep it. The commit named in the current Fixes tag is where
skb_change_tail became available to SK_SKB programs, so that is where the
UAF became reachable. Happy to drop it if you prefer.

Best,
Sechang

^ permalink raw reply

* Re: [PATCH bpf v2] bpf, sockmap: fix use-after-free when the stream parser resizes the skb
From: Sechang Lim @ 2026-06-18  4:57 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: bobbyeshleman, bpf, davem, edumazet, horms, jakub, john.fastabend,
	kuba, linux-kernel, netdev, pabeni
In-Reply-To: <20260618002559.1479884-1-kuniyu@google.com>

On Thu, Jun 18, 2026 at 12:25:57AM +0000, Kuniyuki Iwashima wrote:
>From: Sechang Lim <rhkrqnwk98@gmail.com>
>Date: Fri, 12 Jun 2026 12:35:51 +0000
>> sk_psock_strp_parse() runs the BPF_PROG_TYPE_SK_SKB stream-parser program
>> to find the length of the next message. strparser assembles a message out
>> of several received skbs by chaining them onto the head's frag_list and
>> recording where to append the next one in strp->skb_nextp:
>>
>> 	*strp->skb_nextp = skb;
>> 	strp->skb_nextp = &skb->next;
>>
>> and then calls the parser on the head:
>>
>> 	len = (*strp->cb.parse_msg)(strp, head);
>>
>> The parser is only meant to inspect the skb, but the program may call
>> bpf_skb_change_tail() -- or the sibling bpf_skb_pull_data(),
>> bpf_skb_change_head(), bpf_skb_adjust_room(), all allowed for SK_SKB.
>
>It's bpf prog's responsibility not to abuse them.
>
>Even setting aside that, why not simply block such BPF prog ?
>
>It cannot be done at load time, but doable at attach time.
>
>>

Thanks, this is cleaner than cloning. Will fix in v3.

Best,
Sechang

^ permalink raw reply

* RE: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level triggered.
From: Selvamani Rajagopal @ 2026-06-18  4:26 UTC (permalink / raw)
  To: Parthiban.Veerasooran@microchip.com, andrew+netdev@lunn.ch,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, robh@kernel.org, krzk+dt@kernel.org,
	conor+dt@kernel.org, Piergiorgio Beruto
  Cc: andrew@lunn.ch, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Conor.Dooley@microchip.com,
	devicetree@vger.kernel.org
In-Reply-To: <7c89df6b-32ac-46c8-8400-945879037f2e@microchip.com>

> Subject: Re: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level
> triggered.
> 
> 
> 
> Test case 2: Two LAN8651 instances on the same RPI4
> 
> Setup:
> 
> RPI4 #1 + LAN8651 (IP: 192.168.10.101) <--- RPI4 #2 + EVB-LAN8670-USB
> (IP: 192.168.10.102)
> RPI4 #1 + LAN8651 (IP: 192.168.20.101) <--- RPI4 #2 + EVB-LAN8670-USB
> (IP: 192.168.20.102)
> 
> Result:
> 

Parthiban,

It appears that we can't reproduce the crash you saw in your setup. Code has been running
all day with 5+ million "Receive buffer overflow error" messages (yes, I added a counter to see how
many times the code returns the EAGAIN error code).

One obvious reason is that our EVB has only one network interface. Just like your setup in Test case 1,
where you didn't see any issue.

AI review bot Sashiko suggested one potential issue where skb pointers aren't protected. But those 
concerns are in transmit path. This crash seems to be in receive path. If you think that might help,
I can generate a patch for that.

What do you suggest? Since you are able to see the crash, would you have time to investigate?

Sincerely
Selva

^ permalink raw reply

* Re: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level triggered.
From: Parthiban.Veerasooran @ 2026-06-18  4:26 UTC (permalink / raw)
  To: Selvamani.Rajagopal, andrew+netdev, davem, edumazet, kuba, pabeni,
	robh, krzk+dt, conor+dt, Pier.Beruto
  Cc: andrew, netdev, linux-kernel, Conor.Dooley, devicetree
In-Reply-To: <CYYPR02MB9828B41845A534BDF0B0C17083E42@CYYPR02MB9828.namprd02.prod.outlook.com>

Hi Selvamani,

On 17/06/26 10:24 am, Selvamani Rajagopal wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
>> Subject: Re: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level
>> triggered.
>>
>>
>> Hi Selvamani,
>>
>> I did a quick test by connecting Mikroe LAN8651 Click to a Raspberry Pi
>> 4 and shared the feedback below. Please let me know if you need any
>> further details.
> 
> Parthiban,
> 
> Thanks for testing this.
> 
> Though the NULL pointer dereference after skb_put is a clue, I am working with our team to see whether we can reproduce this crash in our setup.
> Will keep you updated.
Sure, thank you.

Best regards,
Parthiban V
> 
>>
>> [ 8276.691064] eth1: Receive buffer overflow error
>> [ 8281.662600] Unable to handle kernel NULL pointer dereference at
>> virtual address 0000000000000074> drm_panel_orientation_quirks backlight nfnetlink
>> [ 8281.839427] pc : skb_put+0x14/0x80
>> [ 8281.842864] lr : oa_tc6_macphy_threaded_irq+0x428/0x880 [lan865x_t1s]
> 


^ permalink raw reply

* Re: [PATCH net] ipv6: ndisc: fix NULL deref in accept_untracked_na()
From: Jiayuan Chen @ 2026-06-18  4:08 UTC (permalink / raw)
  To: Weiming Shi, David S . Miller, David Ahern, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, linux-kernel, Xiang Mei
In-Reply-To: <DJBD6SGYRIHX.1IHLCVG9YYTNJ@gmail.com>


On 6/17/26 9:38 PM, Weiming Shi wrote:
> On Wed Jun 17, 2026 at 4:32 PM CST, Jiayuan Chen wrote:
>> On 6/17/26 2:55 PM, Weiming Shi wrote:
>>> accept_untracked_na() re-fetches the inet6_dev with __in6_dev_get(dev)
>>> and dereferences idev->cnf.accept_untracked_na without a NULL check,
>>
>> Does ipv6_rpl_srh_rcv have same problem?
> Hi,
>
> Yes, ipv6_rpl_srh_rcv() has the same missing check. It reads
> idev->cnf.rpl_seg_enabled right after __in6_dev_get(skb->dev) with no
> NULL check, while seg6 and ioam6 in the same file both check it.
>
> But I tried to trigger it and couldn't. With a guard added as an instrument,
> idev never came back NULL over tens of millions of RPL packets while
> flapping the MTU, so I can't say it's actually reachable.


Could you add an mdelay to enlarge the race window and reproduce it?

I believe we need more precise traffic and timing control, instead of
aggressively ramping up traffic and load in an attempt to reproduce the 
issue.


^ permalink raw reply

* Re: [PATCH net] net: dst_metadata: fix false-positive memcpy overflow in tun_dst_unclone
From: Gustavo A. R. Silva @ 2026-06-18  4:02 UTC (permalink / raw)
  To: Ilya Maximets, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kees Cook, Gustavo A. R. Silva, Nathan Chancellor,
	Nick Desaulniers, Bill Wendling, Justin Stitt, linux-kernel,
	linux-hardening, llvm, Johan Thomsen
In-Reply-To: <a95aabe9-6294-42ed-8327-b7d74bb4a8c8@embeddedor.com>



On 6/17/26 16:59, Gustavo A. R. Silva wrote:
> 
> 
> On 6/17/26 16:01, Ilya Maximets wrote:
>> On 6/17/26 10:08 PM, Gustavo A. R. Silva wrote:
>>> Hi,
>>>
>>> On 6/16/26 04:03, Ilya Maximets wrote:
>>>> kmalloc_flex() in metadata_dst_alloc() sets __counted_by for the
>>>> structure to the options_len, which is then initialized to zero.
>>>> Later, we're initializing the structure by copying the tunnel info
>>>> together with the options, and this triggers a warning for a potential
>>>> memcpy overflow, since the compiler estimates that the options can't
>>>> fit into the structure, even though the memory for them is actually
>>>> allocated.
>>>>
>>>>    memcpy: detected buffer overflow: 104 byte write of buffer size 96
>>>>    WARNING: CPU: X PID: Y at lib/string_helpers.c:1036 __fortify_report
>>>>     skb_tunnel_info_unclone+0x179/0x190
>>>>     geneve_xmit+0x7fe/0xe00
>>>
>>> This warning has nothing to do with counted_by. See below for more
>>> comments.
>>>
>>>>
>>>> The issue is triggered when built with clang and source fortification.
>>>>
>>>> Fix that by doing the copy in two stages: first - the main data with
>>>> the options_len, then the options.  This way the correct length should
>>>> be known at the time of the copy.
>>>>
>>>> It would be better if the options_len never changed after allocation,
>>>> but the allocation code is a little separate from the initialization
>>>> and it would be awkward and potentially dangerous to return a struct
>>>> with options_len set to a non-zero value from the metadata_dst_alloc().
>>>>
>>>> Another option would be to use ip_tunnel_info_opts_set(), but it is
>>>> doing too many unnecessary operations for the use case here.
>>>>
>>>> Fixes: 69050f8d6d07 ("treewide: Replace kmalloc with kmalloc_obj for non-scalar types")
>>>> Reported-by: Johan Thomsen <write@ownrisk.dk>
>>>> Closes: https://lore.kernel.org/netdev/CAKv6aAM8_EWgXScnKmKYm_4SwGDVBK++dzfP+Y6msUXbp99QUw@mail.gmail.com/
>>>> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
>>>> ---
>>>>
>>>> Johan, if you can test this one in your setup as well, that would
>>>> be great.  Thanks.
>>>>
>>>>    include/net/dst_metadata.h | 7 +++++--
>>>>    1 file changed, 5 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
>>>> index 1fc2fb03ce3f..f45d1e3163f0 100644
>>>> --- a/include/net/dst_metadata.h
>>>> +++ b/include/net/dst_metadata.h
>>>> @@ -164,8 +164,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
>>>>        if (!new_md)
>>>>            return ERR_PTR(-ENOMEM);
>>>> -    memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
>>>> -           sizeof(struct ip_tunnel_info) + md_size);
>>>
>>> What's going on here is that, internally, fortified memcpy() retrieves
>>> the destination size via __builtin_dynamic_object_size() in mode 1.
>>>
>>> That is:
>>>
>>> __builtin_dynamic_object_size(&new_md->u.tun_info, 1)
>>>
>>> For the above case, Clang returns sizeof(new_md->u.tun_info) == 96.
>>>
>>> So the warning is reporting that 104 bytes don't fit in an object of
>>> size 96 bytes, regardless of any counted_by annotation or allocation.
>>
>> Hmm.  Does __builtin_dynamic_object_size(&new_md->u.tun_info, 1) return
>> 104 when the options_len is 8?  If so, isn't that because it is counted
>> by that field?  Asking because the fortification doesn't complain if we
>> keep the full 104-byte copy as-is, but set the options_len beforehand,
>> as tested by Johan.
> 
> I see. If that is the case, then, internally, fortified memcpy() ends up
> using mode 0 instead of mode 1. Something like this:
> 
> __builtin_dynamic_object_size(&new_md->u.tun_info, 0)
> 
> The above will effectively consider the allocation and counted_by because
> it will interpret new_md->u.tun_info as an open-ended object due to the
> flexible-array member (in struct ip_tunnel_info) whose size is determined
> by counted_by.

Indeed. The execution stops here:

fortify_memcpy_chk():
588         /*
589          * Always stop accesses beyond the struct that contains the
590          * field, when the buffer's remaining size is known.
591          * (The SIZE_MAX test is to optimize away checks where the buffer
592          * lengths are unknown.)
593          */
594         if (p_size != SIZE_MAX && p_size < size)
595                 fortify_panic(func, FORTIFY_WRITE, p_size, size, true);

with p_size = __builtin_dynamic_object_size(&new_md->u.tun_info, 0)

The code never reaches the part where p_size_field (__bdos(&new_md->u.tun_info, 1))
is checked at runtime because there is no need for that.

So yep, this patch is okay as-is.

Thanks
-Gustavo

^ permalink raw reply

* [PATCH net] octeontx2-af: npc: cn20k: Fix subbank free list indexing for search order
From: Ratheesh Kannoth @ 2026-06-18  3:59 UTC (permalink / raw)
  To: kuba, linux-kernel, netdev, rkannoth
  Cc: andrew+netdev, davem, edumazet, pabeni, sgoutham

subbank_srch_order[i] is the physical subbank at search-order slot i,
so each subbank's arr_idx must be i (its slot), not
subbank_srch_order[sb->idx].  The old logic mis-keyed xa_sb_free
and broke allocation traversal order.

Populate arr_idx and xa_sb_free in a single pass over the search
order after subbank structs are initialized.

Fixes: 7ac9d4c4075c ("octeontx2-af: npc: cn20k: add subbank search order control")
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
---
 .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 47 ++++++++++++++-----
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
index 354c4e881c6a..d38e848add93 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
@@ -3423,6 +3423,36 @@ static int npc_create_srch_order(int cnt)
 	return 0;
 }
 
+static int npc_subbanks_srch_oder_init(struct rvu *rvu)
+{
+	struct npc_subbank *sb;
+	int sb_idx;
+	int i, j;
+	int rc;
+
+	for (i = 0; i < npc_priv->num_subbanks; i++) {
+		sb_idx = subbank_srch_order[i];
+		sb = &npc_priv->sb[sb_idx];
+		sb->arr_idx = i;
+
+		dev_dbg(rvu->dev, "%s: sb->idx=%u sb->arr_idx=%u\n",
+			__func__, sb->idx, sb->arr_idx);
+
+		rc = xa_err(xa_store(&npc_priv->xa_sb_free, sb->arr_idx,
+				     xa_mk_value(sb->idx), GFP_KERNEL));
+		if (rc) {
+			dev_err(rvu->dev,
+				"%s: xa_store(xa_sb_free) failed at slot %d (sb=%d): %d\n",
+				__func__, i, sb_idx, rc);
+			for (j = 0; j < i; j++)
+				xa_erase(&npc_priv->xa_sb_free, j);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
 static void npc_subbank_init(struct rvu *rvu, struct npc_subbank *sb, int idx)
 {
 	mutex_init(&sb->lock);
@@ -3435,16 +3465,6 @@ static void npc_subbank_init(struct rvu *rvu, struct npc_subbank *sb, int idx)
 
 	sb->flags = NPC_SUBBANK_FLAG_FREE;
 	sb->idx = idx;
-	sb->arr_idx = subbank_srch_order[idx];
-
-	dev_dbg(rvu->dev, "%s: sb->idx=%u sb->arr_idx=%u\n",
-		__func__, sb->idx, sb->arr_idx);
-
-	/* Keep first and last subbank at end of free array; so that
-	 * it will be used at last
-	 */
-	xa_store(&npc_priv->xa_sb_free, sb->arr_idx,
-		 xa_mk_value(sb->idx), GFP_KERNEL);
 }
 
 static int npc_pcifunc_map_create(struct rvu *rvu)
@@ -4635,6 +4655,7 @@ static int npc_priv_init(struct rvu *rvu)
 	int num_subbanks, subbank_depth;
 	u64 npc_const1, npc_const2 = 0;
 	struct npc_subbank *sb;
+	int ret = -ENOMEM;
 	u64 cfg;
 	int i;
 
@@ -4727,6 +4748,10 @@ static int npc_priv_init(struct rvu *rvu)
 	for (i = 0, sb = npc_priv->sb; i < num_subbanks; i++, sb++)
 		npc_subbank_init(rvu, sb, i);
 
+	ret = npc_subbanks_srch_oder_init(rvu);
+	if (ret)
+		goto fail2;
+
 	/* Get number of pcifuncs in the system */
 	npc_priv->pf_cnt = npc_pcifunc_map_create(rvu);
 	npc_priv->xa_pf2idx_map = kcalloc(npc_priv->pf_cnt,
@@ -4760,7 +4785,7 @@ static int npc_priv_init(struct rvu *rvu)
 fail1:
 	kfree(npc_priv);
 	npc_priv = NULL;
-	return -ENOMEM;
+	return ret;
 }
 
 void npc_cn20k_deinit(struct rvu *rvu)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net] net: mana: Sync page pool RX frags for CPU
From: Dexuan Cui @ 2026-06-18  3:50 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, horms, ernis, dipayanroy, kees,
	jacob.e.keller, ssengar, linux-hyperv, netdev, linux-kernel,
	linux-rdma
  Cc: stable

MANA allocates RX buffers from page pool fragments when frag_count is
greater than 1. In that case the buffers remain DMA mapped by page pool
and the RX completion path does not call dma_unmap_single(). As a result,
the implicit sync-for-CPU normally performed by dma_unmap_single() is
missing before the packet data is passed to the networking stack.

This breaks RX on configurations which require explicit DMA syncing, for
example when booted with swiotlb=force.

Fix this by recording the page pool page and DMA sync offset when the RX
buffer is allocated, and syncing the received packet range for CPU access
before handing the RX buffer to the stack.

Also validate the packet length reported in the RX CQE before using it as
a DMA sync length or passing it to skb processing. The CQE is supplied
by the device and should not be blindly trusted by Confidential VMs.

Fixes: 730ff06d3f5c ("net: mana: Use page pool fragments for RX buffers instead of full pages to improve memory efficiency.")
Cc: stable@vger.kernel.org
Signed-off-by: Dexuan Cui <decui@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 61 +++++++++++++++----
 include/net/mana/mana.h                       |  8 +++
 2 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index c9b1df1ed109..d8906169666d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2044,15 +2044,19 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
 }
 
 static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
-			     dma_addr_t *da, bool *from_pool)
+			     dma_addr_t *da, bool *from_pool,
+			     struct page **pp_page, u32 *dma_sync_offset)
 {
 	struct page *page;
 	u32 offset;
 	void *va;
+
 	*from_pool = false;
+	*pp_page = NULL;
+	*dma_sync_offset = 0;
 
 	/* Don't use fragments for jumbo frames or XDP where it's 1 fragment
-	 * per page.
+	 * per page. These buffers are mapped with dma_map_single().
 	 */
 	if (rxq->frag_count == 1) {
 		/* Reuse XDP dropped page if available */
@@ -2087,31 +2091,47 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 	va  = page_to_virt(page) + offset;
 	*da = page_pool_get_dma_addr(page) + offset + rxq->headroom;
 	*from_pool = true;
+	*pp_page = page;
+	*dma_sync_offset = offset + rxq->headroom;
 
 	return va;
 }
 
 /* Allocate frag for rx buffer, and save the old buf */
 static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq,
-			       struct mana_recv_buf_oob *rxoob, void **old_buf,
-			       bool *old_fp)
+			       struct mana_recv_buf_oob *rxoob, u32 pktlen,
+			       void **old_buf, bool *old_fp)
 {
+	u32 dma_sync_offset;
+	struct page *pp_page;
 	bool from_pool;
 	dma_addr_t da;
 	void *va;
 
-	va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
+	va = mana_get_rxfrag(rxq, dev, &da, &from_pool, &pp_page,
+			     &dma_sync_offset);
 	if (!va)
 		return;
-	if (!rxoob->from_pool || rxq->frag_count == 1)
+	if (!rxoob->from_pool || rxq->frag_count == 1) {
 		dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
 				 DMA_FROM_DEVICE);
+	} else {
+		/* The page pool maps the whole page and only syncs for device
+		 * automatically (PP_FLAG_DMA_SYNC_DEV). Sync the received bytes
+		 * for the CPU before they are read: this is required if DMA
+		 * is incoherent or bounce buffers are used.
+		 */
+		page_pool_dma_sync_for_cpu(rxq->page_pool, rxoob->pp_page,
+					   rxoob->dma_sync_offset, pktlen);
+	}
 	*old_buf = rxoob->buf_va;
 	*old_fp = rxoob->from_pool;
 
 	rxoob->buf_va = va;
 	rxoob->sgl[0].address = da;
 	rxoob->from_pool = from_pool;
+	rxoob->pp_page = pp_page;
+	rxoob->dma_sync_offset = dma_sync_offset;
 }
 
 static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
@@ -2170,12 +2190,24 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 		rxbuf_oob = &rxq->rx_oobs[curr];
 		WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
 
-		mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf, &old_fp);
+		if (unlikely(pktlen > rxq->datasize)) {
+			/* Increase it even if mana_rx_skb() isn't called. */
+			rxq->rx_cq.work_done++;
 
-		/* Unsuccessful refill will have old_buf == NULL.
-		 * In this case, mana_rx_skb() will drop the packet.
-		 */
-		mana_rx_skb(old_buf, old_fp, oob, rxq, i);
+			++ndev->stats.rx_dropped;
+			netdev_warn_once(ndev,
+				"Dropped oversized RX packet: len=%u, datasize=%u\n",
+				pktlen, rxq->datasize);
+
+			/* Reuse the RX buffer since rxbuf_oob is unchanged. */
+		} else {
+			mana_refill_rx_oob(dev, rxq, rxbuf_oob, pktlen, &old_buf, &old_fp);
+
+			/* Unsuccessful refill will have old_buf == NULL.
+			 * In this case, mana_rx_skb() will drop the packet.
+			 */
+			mana_rx_skb(old_buf, old_fp, oob, rxq, i);
+		}
 
 		mana_move_wq_tail(rxq->gdma_rq,
 				  rxbuf_oob->wqe_inf.wqe_size_in_bu);
@@ -2566,6 +2598,8 @@ static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
 			    struct mana_rxq *rxq, struct device *dev)
 {
 	struct mana_port_context *mpc = netdev_priv(rxq->ndev);
+	struct page *pp_page = NULL;
+	u32 dma_sync_offset = 0;
 	bool from_pool = false;
 	dma_addr_t da;
 	void *va;
@@ -2573,13 +2607,16 @@ static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
 	if (mpc->rxbufs_pre)
 		va = mana_get_rxbuf_pre(rxq, &da);
 	else
-		va = mana_get_rxfrag(rxq, dev, &da, &from_pool);
+		va = mana_get_rxfrag(rxq, dev, &da, &from_pool, &pp_page,
+				     &dma_sync_offset);
 
 	if (!va)
 		return -ENOMEM;
 
 	rx_oob->buf_va = va;
 	rx_oob->from_pool = from_pool;
+	rx_oob->pp_page = pp_page;
+	rx_oob->dma_sync_offset = dma_sync_offset;
 
 	rx_oob->sgl[0].address = da;
 	rx_oob->sgl[0].size = rxq->datasize;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..4111b93169d2 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -305,6 +305,14 @@ struct mana_recv_buf_oob {
 
 	void *buf_va;
 	bool from_pool; /* allocated from a page pool */
+	/* head page of the page_pool fragment; valid only when
+	 * from_pool && frag_count > 1.
+	 */
+	struct page *pp_page;
+	/* Fragment offset plus rxq->headroom, passed to
+	 * page_pool_dma_sync_for_cpu().
+	 */
+	u32 dma_sync_offset;
 
 	/* SGL of the buffer going to be sent as part of the work request. */
 	u32 num_sge;
-- 
2.34.1


^ permalink raw reply related

* Re: [net-next PATCH 06/10] net: dsa: realtek: rtl8365mb: add VLAN support
From: Luiz Angelo Daros de Luca @ 2026-06-18  3:30 UTC (permalink / raw)
  To: Gabor Juhos
  Cc: Linus Walleij, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Alvin Šipraga, Yury Norov, Rasmus Villemoes, Russell King,
	netdev, linux-kernel
In-Reply-To: <232d7c53-f413-4c8d-bc70-f03a12d601f5@gmail.com>

Hi Gabor, Linus,

(Sorry for the late answer. I left it as a draft and forgot to send it)

> The driver was based on various code found in GPL sources of the TP-Link
> TL-WR2543ND and ASUS RT-N56U devices. Those sources were using the RTL8370
> specific API for these chips.

The GPL code from both the TL-WR2543ND and ASUS RT-N56U devices was a
dead end. However, the mention of "RTL8370" simply changed everything:
I had been searching for an RTL8367 API.

> It was not clear that the two models really belongs to the RTL8370 family
> or simply the vendors were using the RTL8370 specific code as a base, so
> I have used RTL8367 prefix in the swconfig driver. Probably, it would have
> been better to keep the RTL8370 prefix to avoid confusion.

Maybe. I don't know if the RTL8367 family exists. I'm adapting the
rtl8365mb driver to work with multiple switch generations, from
RTL8370/RTL8367 up to RTL8367D. Internally, I simply renamed
RTL8370/RTL8367 to RTL8367A so that I can reference it as Family-A
(and B, C, D...).

> > The main challenge with the base RTL8367 is the lack of a public API.
> > Most vendors support it via binary managers (ASUS) or proprietary
> > kernel modules (TP-Link). The only available references I’ve found are
> > the OpenWrt swconfig driver you mentioned and some U-Boot
> > initialization code. I do have the rtl8367{b,c,d} APIs. While the
> > rtl8367b seems close to the original RTL8367, it has fewer ports. Does
> > anyone happen to have access to the original RTL8367 API
> > documentation?
>
> I have no documentation, but the RTL8370 API source can be found at various
> places [2], [3].

I did have it checked out in my dev dir for years but the missing
piece was to match RTL8367R with RTL8370.

> To be honest, I don't remember all the details, but I hope that this helps.

It helped a lot.

I have a mostly working rtl8365mb version that runs on RTL8367R. I
still need more tests with FDB but I think it will be possible to
replace the swconfig rtl8367.c with DSA rtl8365mb. I am still looking
for a RTL8367B and RTL8367D device to complete the family.

Regards,

Luiz

^ permalink raw reply

* [PATCH net v3 2/2] geneve: validate inner network offset in geneve_gro_complete()
From: Xiang Mei @ 2026-06-18  3:26 UTC (permalink / raw)
  To: netdev, Paolo Abeni
  Cc: Jakub Kicinski, Eric Dumazet, Andrew Lunn, David S . Miller,
	Weiming Shi, Kyle Zeng, Xiang Mei
In-Reply-To: <20260618032622.484720-1-xmei5@asu.edu>

Even with both paths gated on gs->gro_hint, geneve_gro_complete()
re-derives the inner dispatch type and length from the packet and the
current gs->gro_hint, independently of geneve_gro_receive(). The two can
disagree if gs->gro_hint flips under a concurrent geneve_quiesce()/
geneve_unquiesce() (sk_user_data is NULL across a synchronize_net()), or if
the re-read option bytes differ from the ones receive parsed.

geneve_gro_receive() already records the inner network header position in
NAPI_GRO_CB()->inner_network_offset. Have geneve_gro_complete() compute the
offset it is about to dispatch at, adding ETH_HLEN in the ETH_P_TEB case
where eth_gro_complete() steps over the inner MAC header, and bail out if
it lands past inner_network_offset.

Use a lower bound rather than exact equality: between gh_len and the inner
L3 header, geneve_gro_receive() may also have pulled an inner VLAN tag
(vlan_gro_receive() advances the recorded offset past it), which only moves
inner_network_offset further out. A valid frame therefore always satisfies
inner_nh <= inner_network_offset, while a gh_len inflated by a hint
gro_receive() did not honour dispatches past the validated inner header,
i.e. the out-of-bounds completion. Only the latter is rejected.

Fixes: fd0dd796576e ("geneve: use GRO hint option in the RX path")
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Weiming Shi <bestswngs@gmail.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
---
 drivers/net/geneve.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 7cf7aaac8ee1..396e1a113cd4 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -962,6 +962,20 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
 	type = gh->proto_type;
 	geneve_sk_gro_hint_off(sk, gh, &type, &gh_len);

+	/* Bail out if we are about to dispatch past the inner network header
+	 * gro_receive() validated. An inner VLAN tag only pushes
+	 * inner_network_offset out, so use a lower bound.
+	 */
+	if (skb->encapsulation) {
+		unsigned int inner_nh = nhoff + gh_len;
+
+		if (type == htons(ETH_P_TEB))
+			inner_nh += ETH_HLEN;
+
+		if (unlikely(inner_nh > NAPI_GRO_CB(skb)->inner_network_offset))
+			return -EINVAL;
+	}
+
 	/* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
 	if (likely(type == htons(ETH_P_TEB)))
 		return eth_gro_complete(skb, nhoff + gh_len);
-- 
2.43.0

^ permalink raw reply related

* [PATCH net v3 1/2] geneve: gate GRO hint in geneve_gro_complete() on gs->gro_hint
From: Xiang Mei @ 2026-06-18  3:26 UTC (permalink / raw)
  To: netdev, Paolo Abeni
  Cc: Jakub Kicinski, Eric Dumazet, Andrew Lunn, David S . Miller,
	Weiming Shi, Kyle Zeng, Xiang Mei

geneve_gro_receive() reads the GRO hint through geneve_sk_gro_hint_off(),
which honours it only when the socket enabled IFLA_GENEVE_GRO_HINT
(gs->gro_hint). geneve_gro_complete() instead calls the low-level
geneve_opt_gro_hint_off() and acts on the hint unconditionally.

On a tunnel without the hint, receive aggregates the frames as plain
ETH_P_TEB while complete still honours an attacker-supplied hint option: it
inflates gh_len by gro_hint->nested_hdr_len (u8) and redirects the dispatch
type, so the inner gro_complete handler runs at nhoff + gh_len, an offset
receive never pulled nor validated, reading out of bounds of the skb head:

  BUG: KASAN: slab-out-of-bounds in ipv6_gro_complete (net/ipv6/ip6_offload.c:196)
  Read of size 1 at addr ffff88800fe91980 by task exploit/153
   ipv6_gro_complete (net/ipv6/ip6_offload.c:196)
   geneve_gro_complete (drivers/net/geneve.c:965)
   udp_gro_complete (net/ipv4/udp_offload.c:940)
   inet_gro_complete (net/ipv4/af_inet.c:1621)
   __gro_flush (net/core/gro.c:306)

Gate the complete path on gs->gro_hint too via geneve_sk_gro_hint_off(), so
both paths agree. Tunnels that enable the hint are unaffected.

Fixes: fd0dd796576e ("geneve: use GRO hint option in the RX path")
Reported-by: Weiming Shi <bestswngs@gmail.com>
Reported-by: Kyle Zeng <kylebot@openai.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
---
 drivers/net/geneve.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 9afff7bcaa0b..7cf7aaac8ee1 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -954,13 +954,13 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
 	struct genevehdr *gh;
 	struct packet_offload *ptype;
 	__be16 type;
-	int gh_len;
+	unsigned int gh_len;
 	int err = -ENOSYS;
 
 	gh = (struct genevehdr *)(skb->data + nhoff);
 	gh_len = geneve_hlen(gh);
 	type = gh->proto_type;
-	geneve_opt_gro_hint_off(gh, &type, &gh_len);
+	geneve_sk_gro_hint_off(sk, gh, &type, &gh_len);
 
 	/* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
 	if (likely(type == htons(ETH_P_TEB)))
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next v1] net: wangxun: don't advertise IFF_SUPP_NOFCS
From: mengyuanlou @ 2026-06-18  3:06 UTC (permalink / raw)
  To: Rongguang Wei; +Cc: netdev, jiawenwu, pabeni, kuba, Rongguang Wei
In-Reply-To: <20260617092854.133992-1-clementwei90@163.com>



> 2026年6月17日 17:28，Rongguang Wei <clementwei90@163.com> 写道：
> 
> From: Rongguang Wei <weirongguang@kylinos.cn>
> 
> Like commit a24162f18825("i40e: don't advertise IFF_SUPP_NOFCS"),
> ngbe and txgbe also advertises IFF_SUPP_NOFCS and allowing users
> to use the SO_NOFCS socket option. But the driver does not check
> skb->no_fcs, so this option is silently ignored.
> 
> With this change, send() fails with -EPROTONOSUPPORT when AF_PACKET
> socket is set SO_NOFCS option.

In fact, the hardware supports this function, but it seems that no one is using it at present.
To ensure that the interface does not report any errors, it can be removed.

> 
> Signed-off-by: Rongguang Wei <weirongguang@kylinos.cn>
> ---
> drivers/net/ethernet/wangxun/ngbe/ngbe_main.c   | 1 -
> drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 1 -
> 2 files changed, 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> index d8e3827a8b1f..1e4ebac8e495 100644
> --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> @@ -713,7 +713,6 @@ static int ngbe_probe(struct pci_dev *pdev,
> netdev->features |= NETIF_F_GRO;
> 
> netdev->priv_flags |= IFF_UNICAST_FLT;
> - netdev->priv_flags |= IFF_SUPP_NOFCS;
> netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
> 
> netdev->min_mtu = ETH_MIN_MTU;
> diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
> index 8b7c3753bb6a..db9262b00a66 100644
> --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
> +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
> @@ -801,7 +801,6 @@ static int txgbe_probe(struct pci_dev *pdev,
> netdev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
> 
> netdev->priv_flags |= IFF_UNICAST_FLT;
> - netdev->priv_flags |= IFF_SUPP_NOFCS;
> netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
> 
> netdev->min_mtu = ETH_MIN_MTU;
> -- 
> 2.25.1
> 
> 
> 


^ permalink raw reply

* Re: [PATCH bpf v2] bpf, sockmap: fix use-after-free when the stream parser resizes the skb
From: Jiayuan Chen @ 2026-06-18  2:45 UTC (permalink / raw)
  To: Sechang Lim, John Fastabend, Jakub Sitnicki, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Bobby Eshleman, netdev, bpf, linux-kernel
In-Reply-To: <20260612123553.2724240-1-rhkrqnwk98@gmail.com>


On 6/12/26 8:35 PM, Sechang Lim wrote:
> sk_psock_strp_parse() runs the BPF_PROG_TYPE_SK_SKB stream-parser program
> to find the length of the next message. strparser assembles a message out
> of several received skbs by chaining them onto the head's frag_list and
> recording where to append the next one in strp->skb_nextp:
>
> 	*strp->skb_nextp = skb;
> 	strp->skb_nextp = &skb->next;
>
> and then calls the parser on the head:
>
> 	len = (*strp->cb.parse_msg)(strp, head);
>
> The parser is only meant to inspect the skb, but the program may call
> bpf_skb_change_tail() -- or the sibling bpf_skb_pull_data(),
> bpf_skb_change_head(), bpf_skb_adjust_room(), all allowed for SK_SKB.
> Once the head carries a frag_list these go
>
> 	... -> skb_ensure_writable -> pskb_may_pull -> __pskb_pull_tail
>
> and __pskb_pull_tail() frees the frag_list skbs that strparser still
> tracks through skb_nextp:
>
> 	while ((list = skb_shinfo(skb)->frag_list) != insp) {
> 		skb_shinfo(skb)->frag_list = list->next;
> 		consume_skb(list);
> 	}
>
> strp->skb_nextp now points into a freed sk_buff. The next segment of
> the same message arrives in __strp_recv(), which links it with
> *strp->skb_nextp = skb, an 8-byte write into the freed skb. The free
> and the write happen in different __strp_recv() calls, so the message
> has to span at least three segments before it triggers.
>
>    BUG: KASAN: slab-use-after-free in __strp_recv+0x447/0xda0
>    Write of size 8 at addr ffff88810db86140 by task repro/349
>
>    Call Trace:
>     <IRQ>
>     __strp_recv+0x447/0xda0
>     __tcp_read_sock+0x13d/0x590
>     tcp_bpf_strp_read_sock+0x195/0x320
>     strp_data_ready+0x267/0x340
>     sk_psock_strp_data_ready+0x1ce/0x350
>     tcp_data_queue+0x1364/0x2fd0
>     tcp_rcv_established+0xe07/0x1640
>     [...]
>
>    Allocated by task 349:
>     skb_clone+0x17b/0x210
>     __strp_recv+0x2c3/0xda0
>     __tcp_read_sock+0x13d/0x590
>     [...]
>
>    Freed by task 349:
>     kmem_cache_free+0x150/0x570
>     __pskb_pull_tail+0x57b/0xc20
>     skb_ensure_writable+0x236/0x260
>     __bpf_skb_change_tail+0x1d4/0x590
>     sk_skb_change_tail+0x2a/0x40
>     bpf_prog_1b285dcd6c41373e+0x27/0x30
>     bpf_prog_run_pin_on_cpu+0xf3/0x260
>     sk_psock_strp_parse+0x118/0x1e0
>     __strp_recv+0x4f6/0xda0
>     [...]
>
> The same resize also leaves the head's length inconsistent with its
> frags, so a later __pskb_pull_tail() can instead hit the
> BUG_ON(skb_copy_bits(...)) in net/core/skbuff.c.
>
> Run the parser on a private clone of the head when the message spans more
> than one skb and the program can modify the packet
> (prog->aux->changes_pkt_data), so a resizing helper can only touch the
> clone and strparser's head and skb_nextp stay valid. Single-skb messages
> have no frag_list and read-only parsers cannot resize, so both are still
> parsed in place. If the clone cannot be allocated, return 0 so the caller
> retries on the next read rather than failing the parser.
>
> Fixes: 8a31db561566 ("bpf: add access to sock fields and pkt data from sk_skb programs")


Please consider Kuniyuki Iwashima's suggestion.

But it only covers the ATTACH path; the other two paths should be 
covered as well:

- BPF_PROG_ATTACH → sock_map_get_from_fd → sock_map_prog_update
- BPF_LINK_CREATE → sock_map_link_create → sock_map_prog_update
- replace prog → sock_map_link_update_prog

A new helper for this check is probably needed, called from both
sock_map_prog_update() and sock_map_link_update_prog().


Since this rejects the program at attach time rather than fixing a
runtime crash, I'm not sure a Fixes tag is appropriate here - thoughts?


^ permalink raw reply

* [PATCHv2] net: emac: Fix NULL pointer dereference in emac_probe
From: Rosen Penev @ 2026-06-18  2:34 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Rosen Penev, open list

Move devm_request_irq() after devm_platform_ioremap_resource() so that
dev->emacp is mapped before the interrupt handler can fire.  An early
interrupt hitting emac_irq() would dereference the NULL dev->emacp and
crash.

Also remove the redundant error message: devm_platform_ioremap_resource()
already logs an error via dev_err_probe() when the mapping fails.

Fixes: dcc34ef7c834 ("net: ibm: emac: manage emac_irq with devm")
Assisted-by: Opencode:Big-Pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 v2: remove redundant error message.
 drivers/net/ethernet/ibm/emac/core.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 80f0c8985845..62ee1b70c3e7 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -3044,6 +3044,12 @@ static int emac_probe(struct platform_device *ofdev)
 	if (err)
 		goto err_gone;

+	dev->emacp = devm_platform_ioremap_resource(ofdev, 0);
+	if (IS_ERR(dev->emacp)) {
+		err = PTR_ERR(dev->emacp);
+		goto err_gone;
+	}
+
 	/* Setup error IRQ handler */
 	dev->emac_irq = platform_get_irq(ofdev, 0);
 	if (dev->emac_irq < 0) {
@@ -3061,13 +3067,6 @@ static int emac_probe(struct platform_device *ofdev)

 	ndev->irq = dev->emac_irq;

-	dev->emacp = devm_platform_ioremap_resource(ofdev, 0);
-	if (IS_ERR(dev->emacp)) {
-		dev_err(&ofdev->dev, "can't map device registers");
-		err = PTR_ERR(dev->emacp);
-		goto err_gone;
-	}
-
 	/* Wait for dependent devices */
 	err = emac_wait_deps(dev);
 	if (err)
--
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox