Netdev List
 help / color / mirror / Atom feed
* [PATCH net v2 1/1] tcp: bound SYN-ACK timers to reqsk timeout range
From: Ren Wei @ 2026-06-30  3:49 UTC (permalink / raw)
  To: netdev
  Cc: edumazet, ncardwell, kuniyu, davem, pabeni, horms, chia-yu.chang,
	ij, bronzed_45_vested, fmancera, idosch, yuuchihsu, yuantan098,
	yifanwucs, tomapufckgml, bird, roxy520tt, n05ec

From: Zhiling Zou <roxy520tt@gmail.com>

tcp_synack_retries supplies the SYN-ACK retry limit used by request
socket timers. The same effective limit can also come from TCP_SYNCNT
through icsk_syn_retries, while TCP_DEFER_ACCEPT can keep an ACKed
request alive until rskq_defer_accept is reached.

The request socket timeout counter is incremented before it is used to
compute the next timeout. tcp_reqsk_timeout() and the Fast Open SYN-ACK
timer shift req->timeout by req->num_timeout. Excessive retry or
defer-accept limits can therefore drive these timer paths into invalid
shift counts before the request expires.

Limit tcp_synack_retries to the request socket timer range, clamp the
effective retry and defer-accept limits in the regular request socket
timer path, clamp the Fast Open retry limit, and make the request
socket timeout helper saturate before shifting.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Reported-by: Yuan Tan <yuantan098@gmail.com>
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Assisted-by: Codex:gpt-5.4
Signed-off-by: Zhiling Zou <roxy520tt@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
---
Changes in v2:
  - Keep the existing max_retries calculation in tcp_fastopen_synack_timer()
    and only add the clamp, avoiding code churn.
  - v1 Link: https://lore.kernel.org/all/02e24eb83639e9d7ecc623f000c60254bb5c40a5.1782643946.git.roxy520tt@gmail.com/

 include/net/tcp.h               | 19 +++++++++++++++----
 net/ipv4/inet_connection_sock.c |  6 +++++-
 net/ipv4/sysctl_net_ipv4.c      |  2 ++
 net/ipv4/tcp_timer.c            |  3 ++-
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d376ea4d1c0..656f1bd0fa1a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -183,6 +183,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define MAX_TCP_KEEPINTVL	32767
 #define MAX_TCP_KEEPCNT		127
 #define MAX_TCP_SYNCNT		127
+#define MAX_TCP_SYNACK_RETRIES	63
 
 /* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
  * to avoid overflows. This assumes a clock smaller than 1 Mhz.
@@ -882,12 +883,22 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
 }
 
-static inline unsigned long tcp_reqsk_timeout(struct request_sock *req)
+static inline unsigned long tcp_reqsk_timeout_sk(const struct sock *sk,
+						 struct request_sock *req)
 {
-	u64 timeout = (u64)req->timeout << req->num_timeout;
+	u64 timeout = req->timeout;
+	u32 rto_max = tcp_rto_max(sk);
+
+	if (req->num_timeout >= BITS_PER_TYPE(u64) ||
+	    timeout > U64_MAX >> req->num_timeout)
+		return rto_max;
+
+	return (unsigned long)min_t(u64, timeout << req->num_timeout, rto_max);
+}
 
-	return (unsigned long)min_t(u64, timeout,
-				    tcp_rto_max(req->rsk_listener));
+static inline unsigned long tcp_reqsk_timeout(struct request_sock *req)
+{
+	return tcp_reqsk_timeout_sk(req->rsk_listener, req);
 }
 
 u32 tcp_delack_max(const struct sock *sk);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 56902bba5483..b74212bae3dd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1056,6 +1056,8 @@ static void reqsk_timer_handler(struct timer_list *t)
 	net = sock_net(sk_listener);
 	max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
 		READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
+	max_syn_ack_retries = min_t(int, max_syn_ack_retries,
+				    MAX_TCP_SYNACK_RETRIES);
 	/* Normally all the openreqs are young and become mature
 	 * (i.e. converted to established socket) for first timeout.
 	 * If synack was not acknowledged for 1 second, it means
@@ -1086,7 +1088,9 @@ static void reqsk_timer_handler(struct timer_list *t)
 		}
 	}
 
-	syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
+	syn_ack_recalc(req, max_syn_ack_retries,
+		       min_t(u8, READ_ONCE(queue->rskq_defer_accept),
+			     MAX_TCP_SYNACK_RETRIES),
 		       &expire, &resend);
 	tcp_syn_ack_timeout(req);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ca1180dba1de..f9d233b98bbc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,7 @@ static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
 static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int tcp_synack_retries_max = MAX_TCP_SYNACK_RETRIES;
 static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
 static unsigned long ip_ping_group_range_min[] = { 0, 0 };
 static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
@@ -1034,6 +1035,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.maxlen		= sizeof(u8),
 		.mode		= 0644,
 		.proc_handler	= proc_dou8vec_minmax,
+		.extra2		= &tcp_synack_retries_max
 	},
 #ifdef CONFIG_SYN_COOKIES
 	{
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bf171b5e1eb3..bbedf2b9e1bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -467,6 +467,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	 */
 	max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
 		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
+	max_retries = min_t(int, max_retries, MAX_TCP_SYNACK_RETRIES);
 
 	if (req->num_timeout >= max_retries) {
 		tcp_write_err(sk);
@@ -488,7 +489,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	if (!tp->retrans_stamp)
 		tp->retrans_stamp = tcp_time_stamp_ts(tp);
 	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-			  req->timeout << req->num_timeout, false);
+			     tcp_reqsk_timeout_sk(sk, req), false);
 }
 
 static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
-- 
2.43.0


^ permalink raw reply related

* [PATCH 00/10] net: emac: various cleanups, fixes, and feature additions
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list

This series targets the IBM EMAC (Ethernet Media Access Controller)
driver used on PowerPC 4xx SoCs. It removes unused infrastructure,
fixes sparse warnings, replaces legacy helpers, streamlines
synchronization, fixes DMA API usage, and adds BQL support along
with ndo_get_stats64 conversion.

Rosen Penev (10):
  net: emac: remove emac_iaht_base()
  net: emac: fix sparse __iomem warnings in IAHT register access
  net: emac: use DMA-specific and SMP memory barriers
  net: emac: mal: replace of_get_property with of_property_read_u32
  net: emac: mal: replace busy-wait in mal_poll_disable with wait_event
  net: emac: batch stats, eliminate modulo, tighten barrier in RX poll
  net: emac: fix DMA API mapping and unmapping correctness
  net: emac: replace #ifdef CONFIG_PPC_DCR_NATIVE with IS_ENABLED()
  net: emac: add Byte Queue Limits (BQL) support
  net: emac: use ndo_get_stats64 instead of ndo_get_stats

 drivers/net/ethernet/ibm/emac/core.c  | 253 ++++++++++++++++----------
 drivers/net/ethernet/ibm/emac/core.h  |  17 +-
 drivers/net/ethernet/ibm/emac/mal.c   |  37 ++--
 drivers/net/ethernet/ibm/emac/mal.h   |   3 +
 drivers/net/ethernet/ibm/emac/rgmii.c |   2 +-
 drivers/net/ethernet/ibm/emac/tah.c   |   2 +-
 drivers/net/ethernet/ibm/emac/zmii.c  |   4 +-
 7 files changed, 188 insertions(+), 130 deletions(-)

-- 
2.54.0


^ permalink raw reply

* [PATCH 01/10] net: emac: remove emac_iaht_base()
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

emac_iaht_base() is unused. Its return type is also missing the __iomem
annotation.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 89fa1683ec3c..46c5512c8e00 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -420,14 +420,6 @@ static inline u32 __iomem *emac_gaht_base(struct emac_instance *dev)
 	return emac_xaht_base(dev) + EMAC_XAHT_REGS(dev);
 }
 
-static inline u32 *emac_iaht_base(struct emac_instance *dev)
-{
-	/* IAHT registers always come before an identical number of
-	 * GAHT registers.
-	 */
-	return emac_xaht_base(dev);
-}
-
 /* Ethtool get_regs complex data.
  * We want to get not just EMAC registers, but also MAL, ZMII, RGMII, TAH
  * when available.
-- 
2.54.0


^ permalink raw reply related

* [PATCH 02/10] net: emac: fix sparse __iomem warnings in IAHT register access
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Annotate iaht1/iaht2 in the EMAC4 register union with __iomem so
sparse does not warn about address-space mismatches, and simplify
emac_xaht_base() to return &p->u1.emac4sync.iaht1 (or the EMAC4
variant) directly instead of computing the offset by hand.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 46c5512c8e00..296da4bf3781 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -399,17 +399,13 @@ static inline int emac_has_feature(struct emac_instance *dev,
 static inline u32 __iomem *emac_xaht_base(struct emac_instance *dev)
 {
 	struct emac_regs __iomem *p = dev->emacp;
-	int offset;
 
 	/* The first IAHT entry always is the base of the block of
 	 * IAHT and GAHT registers.
 	 */
 	if (emac_has_feature(dev, EMAC_FTR_EMAC4SYNC))
-		offset = offsetof(struct emac_regs, u1.emac4sync.iaht1);
-	else
-		offset = offsetof(struct emac_regs, u0.emac4.iaht1);
-
-	return (u32 __iomem *)((__force ptrdiff_t)p + offset);
+		return &p->u1.emac4sync.iaht1;
+	return &p->u0.emac4.iaht1;
 }
 
 static inline u32 __iomem *emac_gaht_base(struct emac_instance *dev)
-- 
2.54.0


^ permalink raw reply related

* [PATCH 03/10] net: emac: use DMA-specific and SMP memory barriers
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Replace generic wmb()/mb() barriers with more specific variants:
- dma_wmb() for ordering descriptor field writes before hardware
  ownership bit (ctrl) hand-off in TX/RX paths
- dma_rmb() for ordering RX descriptor reads after the ownership
  (ctrl) observation in the RX poll path, including the ownership
  re-check done when RX was stopped
- smp_wmb() for CPU-to-CPU ordering (link_polling flag,
  platform_set_drvdata visibility)

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c  | 16 ++++++++--------
 drivers/net/ethernet/ibm/emac/mal.c   |  2 +-
 drivers/net/ethernet/ibm/emac/rgmii.c |  2 +-
 drivers/net/ethernet/ibm/emac/tah.c   |  2 +-
 drivers/net/ethernet/ibm/emac/zmii.c  |  4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 1d46cf6c2c12..5e7b85d28bde 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -1185,7 +1185,7 @@ __emac_prepare_rx_skb(struct sk_buff *skb, struct emac_instance *dev, int slot)
 	dev->rx_desc[slot].data_ptr =
 	    dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
 			   dev->rx_sync_size, DMA_FROM_DEVICE) + NET_IP_ALIGN;
-	wmb();
+	dma_wmb();
 	dev->rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY |
 	    (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0);
 
@@ -1263,7 +1263,7 @@ static int emac_open(struct net_device *ndev)
 			link_poll_interval = PHY_POLL_LINK_OFF;
 		}
 		dev->link_polling = 1;
-		wmb();
+		smp_wmb();
 		schedule_delayed_work(&dev->link_work, link_poll_interval);
 		emac_print_link_status(dev);
 	} else
@@ -1464,7 +1464,7 @@ static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 						     skb->data, len,
 						     DMA_TO_DEVICE);
 	dev->tx_desc[slot].data_len = (u16) len;
-	wmb();
+	dma_wmb();
 	dev->tx_desc[slot].ctrl = ctrl;
 
 	return emac_xmit_finish(dev, len);
@@ -1560,7 +1560,7 @@ emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 	/* Send the packet out */
 	if (dev->tx_slot == NUM_TX_BUFF - 1)
 		ctrl |= MAL_TX_CTRL_WRAP;
-	wmb();
+	dma_wmb();
 	dev->tx_desc[dev->tx_slot].ctrl = ctrl;
 	dev->tx_slot = (slot + 1) % NUM_TX_BUFF;
 
@@ -1671,7 +1671,7 @@ static inline void emac_recycle_rx_skb(struct emac_instance *dev, int slot,
 			       DMA_FROM_DEVICE);
 
 	dev->rx_desc[slot].data_len = 0;
-	wmb();
+	dma_wmb();
 	dev->rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY |
 	    (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0);
 }
@@ -1754,7 +1754,7 @@ static int emac_poll_rx(void *param, int budget)
 			break;
 
 		skb = dev->rx_skb[slot];
-		mb();
+		dma_rmb();
 		len = dev->rx_desc[slot].data_len;
 
 		if (unlikely(!MAL_IS_SINGLE_RX(ctrl)))
@@ -1845,7 +1845,7 @@ static int emac_poll_rx(void *param, int budget)
 	}
 
 	if (unlikely(budget && test_bit(MAL_COMMAC_RX_STOPPED, &dev->commac.flags))) {
-		mb();
+		dma_rmb();
 		if (!(dev->rx_desc[slot].ctrl & MAL_RX_CTRL_EMPTY)) {
 			DBG2(dev, "rx restart" NL);
 			received = 0;
@@ -3167,7 +3167,7 @@ static int emac_probe(struct platform_device *ofdev)
 	/* Set our drvdata last as we don't want them visible until we are
 	 * fully initialized
 	 */
-	wmb();
+	smp_wmb();
 	platform_set_drvdata(ofdev, dev);
 
 	printk(KERN_INFO "%s: EMAC-%d %pOF, MAC %pM\n",
diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index 4025bc36ae16..99615c8a6c3e 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -691,7 +691,7 @@ static int mal_probe(struct platform_device *ofdev)
 	       mal->num_tx_chans, mal->num_rx_chans);
 
 	/* Advertise this instance to the rest of the world */
-	wmb();
+	smp_wmb();
 	platform_set_drvdata(ofdev, mal);
 
 	return 0;
diff --git a/drivers/net/ethernet/ibm/emac/rgmii.c b/drivers/net/ethernet/ibm/emac/rgmii.c
index b544dd8633b7..093aa4f129e3 100644
--- a/drivers/net/ethernet/ibm/emac/rgmii.c
+++ b/drivers/net/ethernet/ibm/emac/rgmii.c
@@ -255,7 +255,7 @@ static int rgmii_probe(struct platform_device *ofdev)
 	       ofdev->dev.of_node,
 	       (dev->flags & EMAC_RGMII_FLAG_HAS_MDIO) ? "" : "out");
 
-	wmb();
+	smp_wmb();
 	platform_set_drvdata(ofdev, dev);
 
 	return 0;
diff --git a/drivers/net/ethernet/ibm/emac/tah.c b/drivers/net/ethernet/ibm/emac/tah.c
index ed07532aaf85..077da56fa449 100644
--- a/drivers/net/ethernet/ibm/emac/tah.c
+++ b/drivers/net/ethernet/ibm/emac/tah.c
@@ -112,7 +112,7 @@ static int tah_probe(struct platform_device *ofdev)
 	tah_reset(ofdev);
 
 	printk(KERN_INFO "TAH %pOF initialized\n", ofdev->dev.of_node);
-	wmb();
+	smp_wmb();
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/ibm/emac/zmii.c b/drivers/net/ethernet/ibm/emac/zmii.c
index a3839cf02ec4..5144ee94a7d2 100644
--- a/drivers/net/ethernet/ibm/emac/zmii.c
+++ b/drivers/net/ethernet/ibm/emac/zmii.c
@@ -258,8 +258,8 @@ static int zmii_probe(struct platform_device *ofdev)
 	/* Disable all inputs by default */
 	out_be32(&dev->base->fer, 0);
 
-	printk(KERN_INFO "ZMII %pOF initialized\n", ofdev->dev.of_node);
-	wmb();
+	dev_info(&ofdev->dev, "ZMII initialized\n");
+	smp_wmb();
 	platform_set_drvdata(ofdev, dev);
 
 	return 0;
-- 
2.54.0


^ permalink raw reply related

* [PATCH 04/10] net: emac: mal: replace of_get_property with of_property_read_u32
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Replace the deprecated of_get_property() calls with the typed
of_property_read_u32() helper in mal_probe(). This is both safer
and more idiomatic for modern DT API usage.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/mal.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index 99615c8a6c3e..82d502d576ee 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -519,7 +519,7 @@ static int mal_probe(struct platform_device *ofdev)
 	int err = 0, i, bd_size;
 	int index = mal_count++;
 	unsigned int dcr_base;
-	const u32 *prop;
+	u32 val;
 	u32 cfg;
 	unsigned long irqflags;
 	irq_handler_t hdlr_serr, hdlr_txde, hdlr_rxde;
@@ -535,23 +535,23 @@ static int mal_probe(struct platform_device *ofdev)
 
 	MAL_DBG(mal, "probe" NL);
 
-	prop = of_get_property(ofdev->dev.of_node, "num-tx-chans", NULL);
-	if (prop == NULL) {
+	err = of_property_read_u32(ofdev->dev.of_node, "num-tx-chans", &val);
+	if (err) {
 		printk(KERN_ERR
 		       "mal%d: can't find MAL num-tx-chans property!\n",
 		       index);
 		return -ENODEV;
 	}
-	mal->num_tx_chans = prop[0];
+	mal->num_tx_chans = val;
 
-	prop = of_get_property(ofdev->dev.of_node, "num-rx-chans", NULL);
-	if (prop == NULL) {
+	err = of_property_read_u32(ofdev->dev.of_node, "num-rx-chans", &val);
+	if (err) {
 		printk(KERN_ERR
 		       "mal%d: can't find MAL num-rx-chans property!\n",
 		       index);
 		return -ENODEV;
 	}
-	mal->num_rx_chans = prop[0];
+	mal->num_rx_chans = val;
 
 	dcr_base = dcr_resource_start(ofdev->dev.of_node, 0);
 	if (dcr_base == 0) {
-- 
2.54.0


^ permalink raw reply related

* RE: [PATCH net v2 1/2] net: ethernet: oa_tc6: Protect skb pointer used by two different kernel instances
From: Selvamani Rajagopal @ 2026-06-30  4:16 UTC (permalink / raw)
  To: Jakub Kicinski, Selvamani Rajagopal via B4 Relay
  Cc: Parthiban Veerasooran, Andrew Lunn, Piergiorgio Beruto,
	David S. Miller, Eric Dumazet, Paolo Abeni,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Andrew Lunn
In-Reply-To: <20260629191553.0a305168@kernel.org>

> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Subject: Re: [PATCH net v2 1/2] net: ethernet: oa_tc6: Protect skb pointer used by two
> different kernel instances
> 
> 
> On Fri, 26 Jun 2026 08:35:18 -0700 Selvamani Rajagopal via B4 Relay
> wrote:
> > Threaded IRQ uses waiting_tx_skb. Transmit path also uses
> > this pointer without any mutual exclusion protection. As a
> > result, it might leak skb buffer, particularly threaded IRQ
> runs in the middle of transmit path, near skb_linearize.
> 
> Can you say more ? only xmit sets waiting_tx_skb, the IRQ
> clears it. So why is IRQ racing with xmit leading to drops?

I believe the xmit path and the IRQ thread can run concurrently in different kernel contexts. Imagine the
oa_tc6_try_spi_transfer call fails in the threaded IRQ. It would set disable_irq. If the xmit function did not see
that when it checked, but disable_irq is set before the skb is placed in the waiting_tx_skb pointer (because of the
time spent in skb_linearize, for example), the skb would be stuck in waiting_tx_skb.

Also, see the Sashiko review, which gives an SMP use case. If you search for CPU0 or CPU1, you will find the use case.

https://netdev-ai.bots.linux.dev/sashiko/#/patchset/20260611-level-trigger-v5-0-4533a9e85ce2%40onsemi.com

^ permalink raw reply

* [PATCH 05/10] net: emac: mal: replace busy-wait in mal_poll_disable with wait_event
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Replace the msleep(1) polling loop in mal_poll_disable() with
a proper wait_event()/wake_up() mechanism. Add a wait_queue_head_t to
struct mal_commac, initialize it in mal_poll_add(), and wake
waiters in mal_poll_enable().

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/mal.c | 7 +++++--
 drivers/net/ethernet/ibm/emac/mal.h | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index 82d502d576ee..d12a376f69fd 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -176,6 +176,8 @@ void mal_poll_add(struct mal_instance *mal, struct mal_commac *commac)
 
 	MAL_DBG(mal, "poll_add(%p)" NL, commac);
 
+	init_waitqueue_head(&commac->poll_wait);
+
 	/* starts disabled */
 	set_bit(MAL_COMMAC_POLL_DISABLED, &commac->flags);
 
@@ -371,8 +373,8 @@ static irqreturn_t mal_int(int irq, void *dev_instance)
 void mal_poll_disable(struct mal_instance *mal, struct mal_commac *commac)
 {
 	/* Spinlock-type semantics: only one caller disable poll at a time */
-	while (test_and_set_bit(MAL_COMMAC_POLL_DISABLED, &commac->flags))
-		msleep(1);
+	wait_event(commac->poll_wait,
+		   !test_and_set_bit(MAL_COMMAC_POLL_DISABLED, &commac->flags));
 
 	/* Synchronize with the MAL NAPI poller */
 	napi_synchronize(&mal->napi);
@@ -382,6 +384,7 @@ void mal_poll_enable(struct mal_instance *mal, struct mal_commac *commac)
 {
 	smp_wmb();
 	clear_bit(MAL_COMMAC_POLL_DISABLED, &commac->flags);
+	wake_up(&commac->poll_wait);
 
 	/* Feels better to trigger a poll here to catch up with events that
 	 * may have happened on this channel while disabled. It will most
diff --git a/drivers/net/ethernet/ibm/emac/mal.h b/drivers/net/ethernet/ibm/emac/mal.h
index e0ddc41186a2..bd52bb41adee 100644
--- a/drivers/net/ethernet/ibm/emac/mal.h
+++ b/drivers/net/ethernet/ibm/emac/mal.h
@@ -19,6 +19,8 @@
 #ifndef __IBM_NEWEMAC_MAL_H
 #define __IBM_NEWEMAC_MAL_H
 
+#include <linux/wait.h>
+
 /*
  * There are some variations on the MAL, we express them in this driver as
  * MAL Version 1 and 2 though that doesn't match any IBM terminology.
@@ -172,6 +174,7 @@ struct mal_commac {
 	void			*dev;
 	struct list_head	poll_list;
 	long       		flags;
+	wait_queue_head_t	poll_wait;
 #define MAL_COMMAC_RX_STOPPED		0
 #define MAL_COMMAC_POLL_DISABLED	1
 	u32			tx_chan_mask;
-- 
2.54.0


^ permalink raw reply related

* [PATCH 06/10] net: emac: batch stats, eliminate modulo, tighten barrier in RX poll
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Three small hot-path changes in emac_poll_rx():

- Batch per-packet 64-bit stat updates into local accumulators and
  write dev->stats once after the poll loop, avoiding expensive
  load-linked/store-conditional sequences on 32-bit PPC for every
  received packet.

- Replace slot = (slot + 1) % NUM_RX_BUFF with a simple
  if (++slot == NUM_RX_BUFF) branch, avoiding a div/mul by a
  non-power-of-2 constant.

- Use dma_rmb() instead of mb() when ordering the ctrl vs. data_len
  read of the coherent RX descriptor.  The device writes the
  descriptor fields in-order and clears MAL_RX_CTRL_EMPTY last;
  a read barrier is sufficient.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 5e7b85d28bde..ced9690cddc3 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -1741,6 +1741,7 @@ static int emac_poll_rx(void *param, int budget)
 {
 	struct emac_instance *dev = param;
 	int slot = dev->rx_slot, received = 0;
+	u64 packets = 0, bytes = 0;
 
 	DBG2(dev, "poll_rx(%d)" NL, budget);
 
@@ -1797,10 +1798,11 @@ static int emac_poll_rx(void *param, int budget)
 
 		napi_gro_receive(&dev->mal->napi, skb);
 	next:
-		++dev->stats.rx_packets;
+		++packets;
 	skip:
-		dev->stats.rx_bytes += len;
-		slot = (slot + 1) % NUM_RX_BUFF;
+		bytes += len;
+		if (++slot == NUM_RX_BUFF)
+			slot = 0;
 		--budget;
 		++received;
 		continue;
@@ -1864,6 +1866,9 @@ static int emac_poll_rx(void *param, int budget)
 		emac_rx_enable(dev);
 		dev->rx_slot = 0;
 	}
+
+	dev->stats.rx_packets += packets;
+	dev->stats.rx_bytes += bytes;
 	return received;
 }
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH 07/10] net: emac: fix DMA API mapping and unmapping correctness
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Add missing dma_mapping_error() checks after every dma_map_single()
and skb_frag_dma_map() call. Without these, CONFIG_DMA_API_DEBUG
emits:

  DMA-API: emac ... device driver failed to check map error

Also fix emac_recycle_rx_skb() which called dma_map_single() but
discarded the returned DMA address -- the descriptor kept a stale
(unmapped) address after the old mapping was freed.

The RX descriptors are allocated in coherent (uncached) DMA memory.
Add a shadow rx_dma[] array in regular cached memory to store the
raw DMA address of each RX slot, avoiding a slow uncached read of
dev->rx_desc[slot].data_ptr on the per-packet hot path.  This
prevents a measurable throughput regression on non-coherent PowerPC
platforms where the original fix added such a read.

In emac_recycle_rx_skb(), use dma_sync_single_for_device() with the
actual received length (cache-line-aligned) instead of destroying
and recreating the mapping.  The mapping is long-lived; only the
bytes touched by skb_copy_from_linear_data_offset() need
synchronization.

- __emac_prepare_rx_skb: check map error, free skb on failure
- emac_resize_rx_ring: check map error, invalidate slot on failure
- emac_recycle_rx_skb: keep the existing mapping and use
  dma_sync_single_for_device() with SKB_DATA_ALIGN(len + NET_IP_ALIGN)
- emac_start_xmit: check map error, free skb on failure
- emac_start_xmit_sg: check map error on both data and frag maps,
  undo partial descriptor setup on frag failure
- emac_poll_rx: use rx_dma[] shadow array, unmap old skb before
  passing to napi_gro_receive
- emac_clean_rx_ring: use rx_dma[] shadow array for consistency

There's a small performance decrease tested with iperf3

[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec   574 MBytes   481 Mbits/sec    1            sender
[  5]   0.00-10.00  sec   572 MBytes   479 Mbits/sec                  receiver

[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec   558 MBytes   468 Mbits/sec    0            sender
[  5]   0.00-10.00  sec   556 MBytes   466 Mbits/sec                  receiver

but probably worth it. For whatever reason after this patch, ath9k
stopped throwing DMA errors with CONFIG_DMA_API_DEBUG.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 92 +++++++++++++++++++++++-----
 drivers/net/ethernet/ibm/emac/core.h |  1 +
 2 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index ced9690cddc3..aed1ad21e2ea 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -52,7 +52,15 @@
 #include "core.h"
 
 /*
- * Lack of dma_unmap_???? calls is intentional.
+ * Note on dma_unmap calls:
+ *
+ * RX buffers are properly unmapped before being remapped or passed to the
+ * network stack.  See emac_recycle_rx_skb() and emac_poll_rx().
+ *
+ * TX buffers still lack dma_unmap calls for the reasons explained in the
+ * original note below (a single skb may be split across multiple BDs on
+ * TAH-equipped EMACs, making per-fragment tracking complex).
+ * The original rationale is kept for the TX path only:
  *
  * API-correct usage requires additional support state information to be
  * maintained for every RX and TX buffer descriptor (BD). Unfortunately, due to
@@ -1058,6 +1066,7 @@ static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu)
 	/* Second pass, allocate new skbs */
 	for (i = 0; i < NUM_RX_BUFF; ++i) {
 		struct sk_buff *skb;
+		dma_addr_t dma;
 
 		skb = netdev_alloc_skb_ip_align(dev->ndev, rx_skb_size);
 		if (!skb) {
@@ -1066,12 +1075,24 @@ static int emac_resize_rx_ring(struct emac_instance *dev, int new_mtu)
 		}
 
 		BUG_ON(!dev->rx_skb[i]);
+		dma_unmap_single(&dev->ofdev->dev,
+				 dev->rx_dma[i],
+				 dev->rx_sync_size, DMA_FROM_DEVICE);
 		dev_kfree_skb(dev->rx_skb[i]);
 
-		dev->rx_desc[i].data_ptr =
-		    dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
-				   rx_sync_size, DMA_FROM_DEVICE)
-				   + NET_IP_ALIGN;
+		dma = dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
+				     rx_sync_size, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&dev->ofdev->dev, dma)) {
+			dev_kfree_skb(skb);
+			dev->rx_skb[i] = NULL;
+			dev->rx_dma[i] = 0;
+			dev->rx_desc[i].data_ptr = 0;
+			dev->rx_desc[i].ctrl = 0;
+			ret = -ENOMEM;
+			goto oom;
+		}
+		dev->rx_desc[i].data_ptr = dma + NET_IP_ALIGN;
+		dev->rx_dma[i] = dma;
 		dev->rx_skb[i] = skb;
 	}
  skip:
@@ -1150,9 +1171,13 @@ static void emac_clean_rx_ring(struct emac_instance *dev)
 
 	for (i = 0; i < NUM_RX_BUFF; ++i)
 		if (dev->rx_skb[i]) {
+			dma_unmap_single(&dev->ofdev->dev,
+					 dev->rx_dma[i],
+					 dev->rx_sync_size, DMA_FROM_DEVICE);
 			dev->rx_desc[i].ctrl = 0;
 			dev_kfree_skb(dev->rx_skb[i]);
 			dev->rx_skb[i] = NULL;
+			dev->rx_dma[i] = 0;
 			dev->rx_desc[i].data_ptr = 0;
 		}
 
@@ -1176,15 +1201,23 @@ static void emac_clear_mal_desc(struct mal_descriptor *desc, int count)
 static int
 __emac_prepare_rx_skb(struct sk_buff *skb, struct emac_instance *dev, int slot)
 {
+	dma_addr_t dma;
+
 	if (unlikely(!skb))
 		return -ENOMEM;
 
 	dev->rx_skb[slot] = skb;
 	dev->rx_desc[slot].data_len = 0;
 
-	dev->rx_desc[slot].data_ptr =
-	    dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
-			   dev->rx_sync_size, DMA_FROM_DEVICE) + NET_IP_ALIGN;
+	dma = dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
+			     dev->rx_sync_size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(&dev->ofdev->dev, dma)) {
+		dev->rx_skb[slot] = NULL;
+		dev_kfree_skb(skb);
+		return -ENOMEM;
+	}
+	dev->rx_desc[slot].data_ptr = dma + NET_IP_ALIGN;
+	dev->rx_dma[slot] = dma;
 	dma_wmb();
 	dev->rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY |
 	    (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0);
@@ -1463,6 +1496,12 @@ static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	dev->tx_desc[slot].data_ptr = dma_map_single(&dev->ofdev->dev,
 						     skb->data, len,
 						     DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->ofdev->dev,
+			      dev->tx_desc[slot].data_ptr)) {
+		dev->tx_skb[slot] = NULL;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
 	dev->tx_desc[slot].data_len = (u16) len;
 	dma_wmb();
 	dev->tx_desc[slot].ctrl = ctrl;
@@ -1530,8 +1569,12 @@ emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 	/* skb data */
 	dev->tx_skb[slot] = NULL;
 	chunk = min(len, MAL_MAX_TX_SIZE);
-	dev->tx_desc[slot].data_ptr = pd =
-	    dma_map_single(&dev->ofdev->dev, skb->data, len, DMA_TO_DEVICE);
+	pd = dma_map_single(&dev->ofdev->dev, skb->data, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->ofdev->dev, pd)) {
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+	dev->tx_desc[slot].data_ptr = pd;
 	dev->tx_desc[slot].data_len = (u16) chunk;
 	len -= chunk;
 	if (unlikely(len))
@@ -1547,6 +1590,18 @@ emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 
 		pd = skb_frag_dma_map(&dev->ofdev->dev, frag, 0, len,
 				      DMA_TO_DEVICE);
+		if (dma_mapping_error(&dev->ofdev->dev, pd)) {
+			/* Undo partial descriptor setup and drop packet */
+			while (slot != dev->tx_slot) {
+				dev->tx_desc[slot].ctrl = 0;
+				--dev->tx_cnt;
+				if (--slot < 0)
+					slot = NUM_TX_BUFF - 1;
+			}
+			++dev->estats.tx_undo;
+			dev_kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
 
 		slot = emac_xmit_split(dev, slot, pd, len, i == nr_frags - 1,
 				       ctrl);
@@ -1661,14 +1716,14 @@ static void emac_poll_tx(void *param)
 static inline void emac_recycle_rx_skb(struct emac_instance *dev, int slot,
 				       int len)
 {
-	struct sk_buff *skb = dev->rx_skb[slot];
-
 	DBG2(dev, "recycle %d %d" NL, slot, len);
 
-	if (len)
-		dma_map_single(&dev->ofdev->dev, skb->data - NET_IP_ALIGN,
-			       SKB_DATA_ALIGN(len + NET_IP_ALIGN),
-			       DMA_FROM_DEVICE);
+	if (len) {
+		dma_sync_single_for_device(&dev->ofdev->dev,
+					    dev->rx_dma[slot],
+					    SKB_DATA_ALIGN(len + NET_IP_ALIGN),
+					    DMA_FROM_DEVICE);
+	}
 
 	dev->rx_desc[slot].data_len = 0;
 	dma_wmb();
@@ -1808,12 +1863,17 @@ static int emac_poll_rx(void *param, int budget)
 		continue;
 	sg:
 		if (ctrl & MAL_RX_CTRL_FIRST) {
+			dma_addr_t old_dma = dev->rx_dma[slot];
+
 			BUG_ON(dev->rx_sg_skb);
 			if (unlikely(emac_alloc_rx_skb_napi(dev, slot))) {
 				DBG(dev, "rx OOM %d" NL, slot);
 				++dev->estats.rx_dropped_oom;
 				emac_recycle_rx_skb(dev, slot, 0);
 			} else {
+				dma_unmap_single(&dev->ofdev->dev, old_dma,
+						 dev->rx_sync_size,
+						 DMA_FROM_DEVICE);
 				dev->rx_sg_skb = skb;
 				skb_put(skb, len);
 			}
diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h
index 296da4bf3781..0719f98f3325 100644
--- a/drivers/net/ethernet/ibm/emac/core.h
+++ b/drivers/net/ethernet/ibm/emac/core.h
@@ -246,6 +246,7 @@ struct emac_instance {
 
 	struct sk_buff			*tx_skb[NUM_TX_BUFF];
 	struct sk_buff			*rx_skb[NUM_RX_BUFF];
+	dma_addr_t			rx_dma[NUM_RX_BUFF];
 
 	/* Stats
 	 */
-- 
2.54.0


^ permalink raw reply related

* [PATCH 08/10] net: emac: replace #ifdef CONFIG_PPC_DCR_NATIVE with IS_ENABLED()
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Convert compile-time #ifdef blocks to IS_ENABLED() conditionals
for better compile coverage and more idiomatic kernel code.
Affected functions: emac_rx_clk_tx, emac_rx_clk_default,
emac_reset, emac_init_phy in core.c, and mal_txeob/mal_rxeob
in mal.c.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 41 ++++++++++++----------------
 drivers/net/ethernet/ibm/emac/mal.c  | 14 ++++------
 2 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index aed1ad21e2ea..dba3cdfea340 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -139,20 +139,18 @@ static inline void emac_report_timeout_error(struct emac_instance *dev,
  */
 static inline void emac_rx_clk_tx(struct emac_instance *dev)
 {
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_440EP_PHY_CLK_FIX))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_440EP_PHY_CLK_FIX))
 		dcri_clrset(SDR0, SDR0_MFR,
 			    0, SDR0_MFR_ECS >> dev->cell_index);
-#endif
 }
 
 static inline void emac_rx_clk_default(struct emac_instance *dev)
 {
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_440EP_PHY_CLK_FIX))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_440EP_PHY_CLK_FIX))
 		dcri_clrset(SDR0, SDR0_MFR,
 			    SDR0_MFR_ECS >> dev->cell_index, 0);
-#endif
 }
 
 /* PHY polling intervals */
@@ -339,7 +337,7 @@ static int emac_reset(struct emac_instance *dev)
 {
 	struct emac_regs __iomem *p = dev->emacp;
 	int n = 20;
-	bool __maybe_unused try_internal_clock = false;
+	bool try_internal_clock = false;
 
 	DBG(dev, "reset" NL);
 
@@ -351,8 +349,6 @@ static int emac_reset(struct emac_instance *dev)
 		emac_tx_disable(dev);
 	}
 
-#ifdef CONFIG_PPC_DCR_NATIVE
-do_retry:
 	/*
 	 * PPC460EX/GT Embedded Processor Advanced User's Manual
 	 * section 28.10.1 Mode Register 0 (EMACx_MR0) states:
@@ -370,7 +366,9 @@ static int emac_reset(struct emac_instance *dev)
 	 * driver will temporarily switch to the internal clock, after
 	 * the first reset fails.
 	 */
-	if (emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
+retry:
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
 		if (try_internal_clock || (dev->phy_address == 0xffffffff &&
 					   dev->phy_map == 0xffffffff)) {
 			/* No PHY: select internal loop clock before reset */
@@ -382,19 +380,18 @@ static int emac_reset(struct emac_instance *dev)
 				    SDR0_ETH_CFG_ECS << dev->cell_index, 0);
 		}
 	}
-#endif
 
 	out_be32(&p->mr0, EMAC_MR0_SRST);
 	while ((in_be32(&p->mr0) & EMAC_MR0_SRST) && n)
 		--n;
 
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
 		if (!n && !try_internal_clock) {
 			/* first attempt has timed out. */
 			n = 20;
 			try_internal_clock = true;
-			goto do_retry;
+			goto retry;
 		}
 
 		if (try_internal_clock || (dev->phy_address == 0xffffffff &&
@@ -404,7 +401,6 @@ static int emac_reset(struct emac_instance *dev)
 				    SDR0_ETH_CFG_ECS << dev->cell_index, 0);
 		}
 	}
-#endif
 
 	if (n) {
 		dev->reset_failed = 0;
@@ -2754,18 +2750,16 @@ static int emac_init_phy(struct emac_instance *dev)
 	dev->phy.mdio_write = emac_mdio_write;
 
 	/* Enable internal clock source */
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
 		dcri_clrset(SDR0, SDR0_MFR, 0, SDR0_MFR_ECS);
-#endif
 	/* PHY clock workaround */
 	emac_rx_clk_tx(dev);
 
 	/* Enable internal clock source on 440GX*/
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
 		dcri_clrset(SDR0, SDR0_MFR, 0, SDR0_MFR_ECS);
-#endif
 	/* Configure EMAC with defaults so we can at least use MDIO
 	 * This is needed mostly for 440GX
 	 */
@@ -2825,10 +2819,9 @@ static int emac_init_phy(struct emac_instance *dev)
 		}
 
 	/* Enable external clock source */
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    emac_has_feature(dev, EMAC_FTR_440GX_PHY_CLK_FIX))
 		dcri_clrset(SDR0, SDR0_MFR, SDR0_MFR_ECS, 0);
-#endif
 	mutex_unlock(&emac_phy_map_lock);
 	if (i == 0x20) {
 		printk(KERN_WARNING "%pOF: can't find PHY!\n", np);
diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index d12a376f69fd..2adfd9d9bdb1 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -282,11 +282,10 @@ static irqreturn_t mal_txeob(int irq, void *dev_instance)
 	mal_schedule_poll(mal);
 	set_mal_dcrn(mal, MAL_TXEOBISR, r);
 
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (mal_has_feature(mal, MAL_FTR_CLEAR_ICINTSTAT))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    mal_has_feature(mal, MAL_FTR_CLEAR_ICINTSTAT))
 		mtdcri(SDR0, DCRN_SDR_ICINTSTAT,
-				(mfdcri(SDR0, DCRN_SDR_ICINTSTAT) | ICINTSTAT_ICTX));
-#endif
+			(mfdcri(SDR0, DCRN_SDR_ICINTSTAT) | ICINTSTAT_ICTX));
 
 	return IRQ_HANDLED;
 }
@@ -302,11 +301,10 @@ static irqreturn_t mal_rxeob(int irq, void *dev_instance)
 	mal_schedule_poll(mal);
 	set_mal_dcrn(mal, MAL_RXEOBISR, r);
 
-#ifdef CONFIG_PPC_DCR_NATIVE
-	if (mal_has_feature(mal, MAL_FTR_CLEAR_ICINTSTAT))
+	if (IS_ENABLED(CONFIG_PPC_DCR_NATIVE) &&
+	    mal_has_feature(mal, MAL_FTR_CLEAR_ICINTSTAT))
 		mtdcri(SDR0, DCRN_SDR_ICINTSTAT,
-				(mfdcri(SDR0, DCRN_SDR_ICINTSTAT) | ICINTSTAT_ICRX));
-#endif
+			(mfdcri(SDR0, DCRN_SDR_ICINTSTAT) | ICINTSTAT_ICRX));
 
 	return IRQ_HANDLED;
 }
-- 
2.54.0


^ permalink raw reply related

* [PATCH 09/10] net: emac: add Byte Queue Limits (BQL) support
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Add BQL to the TX path to improve tail latency under high throughput:
  - Call netdev_tx_sent_queue() before ringing the TX doorbell
  - Call netdev_tx_completed_queue() with byte/packet counts after
    TX completions
  - Call netdev_reset_queue() on close and full TX reset

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index dba3cdfea340..da5f3d436aa3 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -751,6 +751,7 @@ static void emac_full_tx_reset(struct emac_instance *dev)
 	mal_disable_tx_channel(dev->mal, dev->mal_tx_chan);
 	emac_clean_tx_ring(dev);
 	dev->tx_cnt = dev->tx_slot = dev->ack_slot = 0;
+	netdev_reset_queue(dev->ndev);
 
 	emac_configure(dev);
 
@@ -1428,6 +1429,7 @@ static int emac_close(struct net_device *ndev)
 	emac_clean_tx_ring(dev);
 	emac_clean_rx_ring(dev);
 
+	netdev_reset_queue(ndev);
 	netif_carrier_off(ndev);
 
 	return 0;
@@ -1448,6 +1450,9 @@ static inline netdev_tx_t emac_xmit_finish(struct emac_instance *dev, int len)
 {
 	struct emac_regs __iomem *p = dev->emacp;
 	struct net_device *ndev = dev->ndev;
+	struct netdev_queue *txq = netdev_get_tx_queue(ndev, 0);
+
+	netdev_tx_sent_queue(txq, len);
 
 	/* Send the packet out. If the if makes a significant perf
 	 * difference, then we can store the TMR0 value in "dev"
@@ -1666,6 +1671,7 @@ static void emac_parse_tx_error(struct emac_instance *dev, u16 ctrl)
 static void emac_poll_tx(void *param)
 {
 	struct emac_instance *dev = param;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev->ndev, 0);
 	u32 bad_mask;
 
 	DBG2(dev, "poll_tx, %d %d" NL, dev->tx_cnt, dev->ack_slot);
@@ -1679,6 +1685,7 @@ static void emac_poll_tx(void *param)
 	if (dev->tx_cnt) {
 		u16 ctrl;
 		int slot = dev->ack_slot, n = 0;
+		unsigned int bytes = 0;
 	again:
 		ctrl = dev->tx_desc[slot].ctrl;
 		if (!(ctrl & MAL_TX_CTRL_READY)) {
@@ -1686,6 +1693,7 @@ static void emac_poll_tx(void *param)
 			++n;
 
 			if (skb) {
+				bytes += skb->len;
 				dev_kfree_skb(skb);
 				dev->tx_skb[slot] = NULL;
 			}
@@ -1699,6 +1707,7 @@ static void emac_poll_tx(void *param)
 		}
 		if (n) {
 			dev->ack_slot = slot;
+			netdev_tx_completed_queue(txq, n, bytes);
 			if (netif_queue_stopped(dev->ndev) &&
 			    dev->tx_cnt < EMAC_TX_WAKEUP_THRESH)
 				netif_wake_queue(dev->ndev);
-- 
2.54.0


^ permalink raw reply related

* [PATCH 10/10] net: emac: use ndo_get_stats64 instead of ndo_get_stats
From: Rosen Penev @ 2026-06-30  4:16 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, open list
In-Reply-To: <20260630041634.284127-1-rosenp@gmail.com>

Replace the legacy emac_stats() callback with emac_get_stats64()
that fills struct rtnl_link_stats64 directly from the driver's
u64 counters, avoiding truncation of 64-bit values on 32-bit
architectures.

Assisted-by: opencode:big-pickle
Signed-off-by: Rosen Penev <rosenp@gmail.com>
---
 drivers/net/ethernet/ibm/emac/core.c | 84 ++++++++++++++--------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index da5f3d436aa3..c62abc8aa471 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -2026,57 +2026,55 @@ static irqreturn_t emac_irq(int irq, void *dev_instance)
 	return IRQ_HANDLED;
 }
 
-static struct net_device_stats *emac_stats(struct net_device *ndev)
+static void emac_get_stats64(struct net_device *ndev,
+			     struct rtnl_link_stats64 *stats)
 {
 	struct emac_instance *dev = netdev_priv(ndev);
 	struct emac_stats *st = &dev->stats;
 	struct emac_error_stats *est = &dev->estats;
-	struct net_device_stats *nst = &ndev->stats;
 	unsigned long flags;
 
 	DBG2(dev, "stats" NL);
 
-	/* Compute "legacy" statistics */
 	spin_lock_irqsave(&dev->lock, flags);
-	nst->rx_packets = (unsigned long)st->rx_packets;
-	nst->rx_bytes = (unsigned long)st->rx_bytes;
-	nst->tx_packets = (unsigned long)st->tx_packets;
-	nst->tx_bytes = (unsigned long)st->tx_bytes;
-	nst->rx_dropped = (unsigned long)(est->rx_dropped_oom +
-					  est->rx_dropped_error +
-					  est->rx_dropped_resize +
-					  est->rx_dropped_mtu);
-	nst->tx_dropped = (unsigned long)est->tx_dropped;
-
-	nst->rx_errors = (unsigned long)est->rx_bd_errors;
-	nst->rx_fifo_errors = (unsigned long)(est->rx_bd_overrun +
-					      est->rx_fifo_overrun +
-					      est->rx_overrun);
-	nst->rx_frame_errors = (unsigned long)(est->rx_bd_alignment_error +
-					       est->rx_alignment_error);
-	nst->rx_crc_errors = (unsigned long)(est->rx_bd_bad_fcs +
-					     est->rx_bad_fcs);
-	nst->rx_length_errors = (unsigned long)(est->rx_bd_runt_packet +
-						est->rx_bd_short_event +
-						est->rx_bd_packet_too_long +
-						est->rx_bd_out_of_range +
-						est->rx_bd_in_range +
-						est->rx_runt_packet +
-						est->rx_short_event +
-						est->rx_packet_too_long +
-						est->rx_out_of_range +
-						est->rx_in_range);
-
-	nst->tx_errors = (unsigned long)(est->tx_bd_errors + est->tx_errors);
-	nst->tx_fifo_errors = (unsigned long)(est->tx_bd_underrun +
-					      est->tx_underrun);
-	nst->tx_carrier_errors = (unsigned long)est->tx_bd_carrier_loss;
-	nst->collisions = (unsigned long)(est->tx_bd_excessive_deferral +
-					  est->tx_bd_excessive_collisions +
-					  est->tx_bd_late_collision +
-					  est->tx_bd_multple_collisions);
+	stats->rx_packets = st->rx_packets;
+	stats->rx_bytes = st->rx_bytes;
+	stats->tx_packets = st->tx_packets;
+	stats->tx_bytes = st->tx_bytes;
+	stats->rx_dropped = est->rx_dropped_oom +
+			    est->rx_dropped_error +
+			    est->rx_dropped_resize +
+			    est->rx_dropped_mtu;
+	stats->tx_dropped = est->tx_dropped;
+
+	stats->rx_errors = est->rx_bd_errors;
+	stats->rx_fifo_errors = est->rx_bd_overrun +
+				est->rx_fifo_overrun +
+				est->rx_overrun;
+	stats->rx_frame_errors = est->rx_bd_alignment_error +
+				 est->rx_alignment_error;
+	stats->rx_crc_errors = est->rx_bd_bad_fcs +
+			       est->rx_bad_fcs;
+	stats->rx_length_errors = est->rx_bd_runt_packet +
+				  est->rx_bd_short_event +
+				  est->rx_bd_packet_too_long +
+				  est->rx_bd_out_of_range +
+				  est->rx_bd_in_range +
+				  est->rx_runt_packet +
+				  est->rx_short_event +
+				  est->rx_packet_too_long +
+				  est->rx_out_of_range +
+				  est->rx_in_range;
+
+	stats->tx_errors = est->tx_bd_errors + est->tx_errors;
+	stats->tx_fifo_errors = est->tx_bd_underrun +
+				est->tx_underrun;
+	stats->tx_carrier_errors = est->tx_bd_carrier_loss;
+	stats->collisions = est->tx_bd_excessive_deferral +
+			    est->tx_bd_excessive_collisions +
+			    est->tx_bd_late_collision +
+			    est->tx_bd_multple_collisions;
 	spin_unlock_irqrestore(&dev->lock, flags);
-	return nst;
 }
 
 static struct mal_commac_ops emac_commac_ops = {
@@ -3040,7 +3038,7 @@ static int emac_init_config(struct emac_instance *dev)
 static const struct net_device_ops emac_netdev_ops = {
 	.ndo_open		= emac_open,
 	.ndo_stop		= emac_close,
-	.ndo_get_stats		= emac_stats,
+	.ndo_get_stats64	= emac_get_stats64,
 	.ndo_set_rx_mode	= emac_set_multicast_list,
 	.ndo_eth_ioctl		= emac_ioctl,
 	.ndo_tx_timeout		= emac_tx_timeout,
@@ -3052,7 +3050,7 @@ static const struct net_device_ops emac_netdev_ops = {
 static const struct net_device_ops emac_gige_netdev_ops = {
 	.ndo_open		= emac_open,
 	.ndo_stop		= emac_close,
-	.ndo_get_stats		= emac_stats,
+	.ndo_get_stats64	= emac_get_stats64,
 	.ndo_set_rx_mode	= emac_set_multicast_list,
 	.ndo_eth_ioctl		= emac_ioctl,
 	.ndo_tx_timeout		= emac_tx_timeout,
-- 
2.54.0


^ permalink raw reply related

* [PATCH] usb: atm: ueagle: fix use-after-free in uea_upload_pre_firmware()
From: Deepanshu Kartikey @ 2026-06-30  4:17 UTC (permalink / raw)
  To: castet.matthieu, stf_xl, 3chas3, gregkh
  Cc: linux-atm-general, netdev, linux-usb, linux-kernel,
	Deepanshu Kartikey, syzbot+3d45d763d18796f97412

uea_load_firmware() calls request_firmware_nowait() passing a raw
struct usb_device pointer as context, without holding a reference
to it.

If the USB device is disconnected before the firmware workqueue
fires, the usb_device and its usb_interface objects are freed while
uea_upload_pre_firmware() is still pending on the workqueue. When
the callback eventually runs, it accesses the freed memory causing
a slab-use-after-free:

  BUG: KASAN: slab-use-after-free in __intf_to_usbdev
  include/linux/usb.h:752 [inline]
  BUG: KASAN: slab-use-after-free in uea_upload_pre_firmware+0x8d/0x640
  drivers/usb/atm/ueagle-atm.c:598
  Read of size 8 at addr ffff88802b0710b8 by task kworker/0:2/1664

Fix by calling usb_get_dev() before queuing the firmware request to
pin the usb_device in memory for the lifetime of the async operation,
and usb_put_dev() in the callback once it is finished with the
pointer. On the error path where request_firmware_nowait() itself
fails, drop the reference immediately since the callback will never
fire.

Reported-by: syzbot+3d45d763d18796f97412@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=3d45d763d18796f97412
Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>
---
 drivers/usb/atm/ueagle-atm.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index d610cdcef7d0..686cc58fb89f 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -663,6 +663,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry,
 	uea_err(usb, "firmware is corrupted\n");
 err:
 	release_firmware(fw_entry);
+	usb_put_dev(usb);
 }
 
 /*
@@ -693,12 +694,14 @@ static int uea_load_firmware(struct usb_device *usb, unsigned int ver)
 		break;
 	}
 
+	usb_get_dev(usb);
 	ret = request_firmware_nowait(THIS_MODULE, 1, fw_name, &usb->dev,
 					GFP_KERNEL, usb,
 					uea_upload_pre_firmware);
-	if (ret)
+	if (ret) {
 		uea_err(usb, "firmware %s is not available\n", fw_name);
-	else
+		usb_put_dev(usb);
+	} else
 		uea_info(usb, "loading firmware %s\n", fw_name);
 
 	return ret;
-- 
2.43.0


^ permalink raw reply related

* RE: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level triggered.
From: Selvamani Rajagopal @ 2026-06-30  4:21 UTC (permalink / raw)
  To: Parthiban.Veerasooran@microchip.com
  Cc: andrew@lunn.ch, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Conor.Dooley@microchip.com,
	devicetree@vger.kernel.org, andrew+netdev@lunn.ch,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, robh@kernel.org, krzk+dt@kernel.org,
	conor+dt@kernel.org, Piergiorgio Beruto
In-Reply-To: <d15eaa01-3312-420f-a34a-d810710e5b12@microchip.com>


> -----Original Message-----
> From: Parthiban.Veerasooran@microchip.com <Parthiban.Veerasooran@microchip.com>
> Subject: Re: [PATCH net v5 1/4] net: ethernet: oa_tc6: Interrupt is active low, level
> triggered.
> 
> 
> Sorry for the delayed response. I see you already shared the patches for
> the fixes. Today I will test the below patch series and share the
> feedback ASAP.
> 
> 

Parthiban.
No worries. Let us hope this addresses the NULL pointer dereference (and that traffic recovers gracefully).

Sincerely
Selva


^ permalink raw reply

* Re: [PATCH net] selftests: net: bump default cmd() timeout to 20 seconds
From: Pavan Chebbi @ 2026-06-30  4:31 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, shuah,
	petrm, leitao, dw, noren, gal, linux-kselftest
In-Reply-To: <20260629233348.2145841-1-kuba@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2772 bytes --]

On Tue, Jun 30, 2026 at 5:04 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> We always used 5 sec as the default command timeout. But soon after
> it was introduced, David effectively made us ignore the timeout
> (it was passed to process.communicate() as the wrong argument).
> Gal recently fixed that, but turns out the 5 sec is not enough
> for a lot of tests and setups. The fix regressed regressions.
>
> In particular running reconfig commands (e.g. XDP attach) on mlx5
> with 32 rings and 9k MTU, on a heavily-debug-enabled kernel takes
> more than 5 sec. The XDP installation command will time out after
> 5 sec but since the sleeps in the kernel are non interruptible
> the command finishes anyway, leaving the XDP program attached,
> but with non-zero exit code. defer()ed cleanups are not installed,
> breaking the environment for subsequent tests.
>
> Since "install XDP" is a pretty normal command a "point fix"
> does not seem appropriate. 32 rings is a fairly reasonable
> config, too, so we should just increase the timeout to 20 sec.
>
> There's no real reason behind the value of 20.
>
> Fixes: 1cf270424218 ("net: selftest: add test for netdev netlink queue-get API")
> Fixes: f0bd19316663 ("selftests: net: fix timeout passed as positional argument to communicate()")
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> CC: shuah@kernel.org
> CC: petrm@nvidia.com
> CC: leitao@debian.org
> CC: dw@davidwei.uk
> CC: noren@nvidia.com
> CC: gal@nvidia.com
> CC: linux-kselftest@vger.kernel.org
> ---
>  tools/testing/selftests/net/lib/py/utils.py | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>

Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>

> diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
> index 308c91833239..9b40049e2dbb 100644
> --- a/tools/testing/selftests/net/lib/py/utils.py
> +++ b/tools/testing/selftests/net/lib/py/utils.py
> @@ -44,7 +44,7 @@ import time
>      Use bkg() instead to run a command in the background.
>      """
>      def __init__(self, comm, shell=None, fail=True, expect_fail=False, ns=None,
> -                 background=False, host=None, timeout=5, ksft_ready=None,
> +                 background=False, host=None, timeout=20, ksft_ready=None,
>                   ksft_wait=None):
>          if ns:
>              if hasattr(ns, 'user_ns_path'):
> @@ -113,7 +113,7 @@ import time
>
>          return stdout, stderr
>
> -    def process(self, terminate=True, fail=None, expect_fail=False, timeout=5):
> +    def process(self, terminate=True, fail=None, expect_fail=False, timeout=20):
>          if fail is None:
>              fail = not terminate
>
> --
> 2.54.0
>
>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5469 bytes --]

^ permalink raw reply

* Re: [PATCH net] selftests: drv-net: tso: don't touch dangerous feature bits
From: Pavan Chebbi @ 2026-06-30  4:33 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, shuah,
	daniel.zahka, linux-kselftest
In-Reply-To: <20260629233923.2151144-1-kuba@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1024 bytes --]

On Tue, Jun 30, 2026 at 5:09 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> query_nic_features() detects which offloads depend on tx-gso-partial
> by enabling everything, turning tx-gso-partial off, and seeing which
> active features drop out. Enabling all hw features is dangerous:
> we may end up enabling rx-fcs and loopback for example. For the
> ice driver we end up getting into problems with feature dependencies
> so the cleanup isn't successful either, and the test exits with
> rx-fcs and loopback enabled.
>
> Scope the feature probing just to segmentation bits.
>
> Fixes: 266b835e5e84 ("selftests: drv-net: tso: enable test cases based on hw_features")
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> CC: shuah@kernel.org
> CC: daniel.zahka@gmail.com
> CC: linux-kselftest@vger.kernel.org
> ---
>  tools/testing/selftests/drivers/net/hw/tso.py | 16 ++++++----------
>  1 file changed, 6 insertions(+), 10 deletions(-)
>

Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5469 bytes --]

^ permalink raw reply

* RE: [PATCH net v2 2/2] net: ethernet: oa_tc6: Improvement in buffer overflow handling
From: Selvamani Rajagopal @ 2026-06-30  4:41 UTC (permalink / raw)
  To: Jakub Kicinski, Selvamani Rajagopal via B4 Relay
  Cc: Parthiban Veerasooran, Andrew Lunn, Piergiorgio Beruto,
	David S. Miller, Eric Dumazet, Paolo Abeni,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Andrew Lunn
In-Reply-To: <20260629192959.445776c9@kernel.org>

> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Monday, June 29, 2026 7:30 PM
> Subject: Re: [PATCH net v2 2/2] net: ethernet: oa_tc6: Improvement in buffer overflow handling
> 
> 
> This sounds rather scary. The driver seems to have no length
> information to confirm it got all the chunks. So if it missed
> a middle chunk we will never know? At the very least this seems
> like something we should increment rx_error for, not just rx_dropped?

If a middle chunk is lost, we won't know. If we look at prcs_rx_chunk_payload, it relies on the start_valid
and end_valid bits only. start_byte_offset and end_byte_offset are always associated with the above
two bits.

When an rx buffer overflow occurs, we don't know whether to trust the data chunk or whether chunks were
lost and, if so, how many. So we drop everything and start looking for a new frame, indicated by "start_bit".
That's why the drop count is incremented.

If we dig deeper: though we increment the dropped count by one, it is possible that we may have lost
more than one frame.

Function process_spi_data_rx_buf bails out of the "for loop" over number_of_rx_chunks
after an EAGAIN error is seen. We don't go through all the rx chunks that are already received when
we start looking for the next data chunk with "start_bit".

That improvement is for another time. I didn't want to complicate the changes for this effort.

> 
> Regarding the patch itself, I'm not clear on why we need to look
> for new frame. Will we not notice the start bit immediately and
> call oa_tc6_allocate_rx_skb() (if there is indeed a start bit in the
> stream?)
> 

The chunk with "start_bit" could be next chunk or "nth" chunk. We don't know.
We keep throwing away incoming data chunks until we see a chunk with data_bit.

> So handling skb-already-exists in oa_tc6_allocate_rx_skb() seems
> like enough to start a new frame.


I don't think we can rely on this. What if we get two continuous data chunks with "start_bit" set
without end_valid bit? Later skb would overwrite the previous one. (either due to corruption or lost chunks)


> 
> Sashiko has another comment:

Let me read the Sashiko review.

> 
> https://sashiko.dev/#/patchset/20260626-fix-race-condition-and-crash-v2-1-
> b6c5c10e604f@onsemi.com
> <https://sashiko.dev/#/patchset/20260626-fix-race-condition-and-crash-v2-1-b6c5c10e604f@onsemi.com
> =sashiko.dev>
> --
> pw-bot: cr


^ permalink raw reply

* Re: [PATCH net] net: usb: net1080: validate packet_len before pad-byte access in rx_fixup
From: Xiang Mei @ 2026-06-30  4:51 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Paolo Abeni, netdev,
	linux-usb, linux-kernel, Weiming Shi
In-Reply-To: <20260629185415.56ec8b67@kernel.org>

On Mon, Jun 29, 2026 at 6:54 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sat, 27 Jun 2026 14:28:54 -0700 Xiang Mei wrote:
> >       if ((packet_len & 0x01) == 0) {
>
> just add "skb->len &&" to this condition?
>
Thanks for the review; your one-line patch is clearer.

v2 has been sent.

Xiang
> > +             if (packet_len >= skb->len) {
> > +                     dev->net->stats.rx_frame_errors++;
> > +                     netdev_dbg(dev->net, "bad packet len %d (expected %d)\n",
> > +                                skb->len, packet_len);
> > +                     nc_ensure_sync(dev);
> > +                     return 0;
> > +             }
> --
> pw-bot: cr

^ permalink raw reply

* [PATCH net v2] net: usb: net1080: validate packet_len before pad-byte access in rx_fixup
From: Xiang Mei @ 2026-06-30  4:51 UTC (permalink / raw)
  To: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-usb
  Cc: linux-kernel, Weiming Shi, Xiang Mei

For an even packet_len, net1080_rx_fixup() reads the pad byte at
skb->data[packet_len] before the skb->len != packet_len check further
down, and packet_len is only bounded against NC_MAX_PACKET. A malicious
NetChip 1080 device can send a short frame advertising a large even
packet_len (e.g. 0x4000), so the pad-byte read lands past the end of the
skb:

  BUG: KASAN: slab-out-of-bounds in net1080_rx_fixup
  Read of size 1 at addr ffff8880106c83c6 by task ksoftirqd/0/14
   ...
   net1080_rx_fixup (drivers/net/usb/net1080.c:384)
   usbnet_bh (drivers/net/usb/usbnet.c:1589)
   process_one_work (kernel/workqueue.c:3322)
   bh_worker (kernel/workqueue.c:3708)
   tasklet_action (kernel/softirq.c:965)
   handle_softirqs (kernel/softirq.c:622)
   ...

Reject the frame when packet_len >= skb->len before reading.

Fixes: 904813cd8a0b ("[PATCH] USB: usbnet (4/9) module for net1080 cables")
Reported-by: Weiming Shi <bestswngs@gmail.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
---
v2: merge two validations into one

 drivers/net/usb/net1080.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/net1080.c b/drivers/net/usb/net1080.c
index 5d4a1fd2b524..19f6e1222d93 100644
--- a/drivers/net/usb/net1080.c
+++ b/drivers/net/usb/net1080.c
@@ -381,7 +381,7 @@ static int net1080_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 	skb_trim(skb, skb->len - sizeof *trailer);
 
 	if ((packet_len & 0x01) == 0) {
-		if (skb->data [packet_len] != PAD_BYTE) {
+		if (packet_len >= skb->len || skb->data[packet_len] != PAD_BYTE) {
 			dev->net->stats.rx_frame_errors++;
 			netdev_dbg(dev->net, "bad pad\n");
 			return 0;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net 0/9] netfilter: updates for net
From: Florian Westphal @ 2026-06-30  4:52 UTC (permalink / raw)
  To: netdev
  Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netfilter-devel, pablo

Hi,

The following patchset contains Netfilter fixes for *net*.
Due to bug volume the plan is to make a second *net* pull request
this Friday.

1) Zero nf_conntrack_expect at allocation to prevent uninitialized data
leaks to userspace. Add missing exp->dir initialization.

2) Prevent out-of-bounds writes in nft_set_pipapo caused by inconsistent
clones during allocation failures.  Fail operations if the clone enters an
error state.  This was a day-0 bug.

3) Fix use-after-free race between ipset dump and array resizing. Protect
array pointer access with rcu_read_lock().  From Xiang Mei. Bug existed
since v4.20.

4) Validate skb_dst() exists before access in nf_conntrack_sip.
This prevents a crash when called from tc ingress or openvswitch.
From Pablo Neira Ayuso.  Bug added in 4.3 when ovs gained support
for conntrack helpers.

5) Cap the maximum number of expectations to NF_CT_EXPECT_MAX_CNT during
userspace helper policy updates.  Also from Pablo.

6) Prevent NULL pointer dereference in nft_fib on netdev egress hooks. Add
nft_fib_netdev_validate() to restrict fib expressions to appropriate
netdev hooks. Restrict nft_fib_validate() to IPv4, IPv6, and INET
protocols.  From Theodor Arsenij Larionov-Trichkine.
Bug was exposed in v5.16 when egress hooks got added.

7) Restrict nfnetlink_queue writes to network headers. Validate IP/IPv6
header length and disable extension headers or IP option modifications.
Disable bridge modification for now, it's unlikely anyone is using this.

8) Restrict arbitrary writes to link-layer and network headers in nftables.
Prevent link-layer modifications from spilling into network headers.
Prevent writes to IP version and length fields.

9) Restrict L3 checksum update offset to IPv4. Else csum offset can be
used to munge arbitrary header offsets, rendering the previous change moot.

These three patches are follow-ups to a 7.1 change that disabled
header rewrite ability in unprivileged network namespaces.
unprivileged netns support is not yet enabled again here.

Please, pull these changes from:
The following changes since commit 1398b1014909618f65ff6bcebcb2ee5ccd44fdc0:

  MAINTAINERS: Update Jason Wang's email address (2026-06-29 19:09:00 -0700)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-26-06-30

for you to fetch changes up to e2c4a0c805f7be21c8288e8562145a6691e11559:

  netfilter: nftables: restrict checkum update offset (2026-06-30 06:37:12 +0200)

----------------------------------------------------------------
netfilter pull request nf-26-06-30
----------------------------------------------------------------

Florian Westphal (5):
  netfilter: nf_conntrack_expect: zero at allocation time
  netfilter: nft_set_pipapo: don't leak bad clone into future transaction
  netfilter: nfnetlink_queue: restrict writes to network header
  netfilter: nftables: restrict linklayer and network header writes
  netfilter: nftables: restrict checkum update offset

Pablo Neira Ayuso (2):
  netfilter: nf_conntrack_sip: validate skb_dst() before accessing it
  netfilter: nfnetlink_cthelper: cap to maximum number of expectation per master

Theodor Arsenij Larionov-Trichkine (1):
  netfilter: nft_fib: reject fib expression on the netdev egress hook

Xiang Mei (1):
  netfilter: ipset: fix race between dump and ip_set_list resize

 net/netfilter/ipset/ip_set_core.c    |   8 +-
 net/netfilter/nf_conntrack_expect.c  |   3 +-
 net/netfilter/nf_conntrack_netlink.c |  11 +-
 net/netfilter/nf_conntrack_sip.c     |   7 +-
 net/netfilter/nfnetlink_cthelper.c   |   2 +
 net/netfilter/nfnetlink_queue.c      | 170 +++++++++++++++++
 net/netfilter/nft_fib.c              |   9 +
 net/netfilter/nft_fib_netdev.c       |  29 ++-
 net/netfilter/nft_payload.c          | 270 +++++++++++++++++++++++++++
 net/netfilter/nft_set_pipapo.c       |  34 +++-
 net/netfilter/nft_set_pipapo.h       |   8 +
 11 files changed, 531 insertions(+), 20 deletions(-)

-- 
2.53.0

^ permalink raw reply

* [PATCH net 1/9] netfilter: nf_conntrack_expect: zero at allocation time
From: Florian Westphal @ 2026-06-30  4:52 UTC (permalink / raw)
  To: netdev
  Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netfilter-devel, pablo
In-Reply-To: <20260630045243.2657-1-fw@strlen.de>

There are occasional LLM hints wrt. leaking uninitialized data to
userspace via ctnetlink.  Just zero at allocation time,
expectations are not frequently used these days.

Intentionally keeps _init as-is because we could theoretically
support re-init, so add the missing exp->dir there.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_expect.c  |  3 ++-
 net/netfilter/nf_conntrack_netlink.c | 11 +----------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 38630c5e006f..7ae68d60586a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -306,7 +306,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
 {
 	struct nf_conntrack_expect *new;
 
-	new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+	new = kmem_cache_zalloc(nf_ct_expect_cachep, GFP_ATOMIC);
 	if (!new)
 		return NULL;
 
@@ -391,6 +391,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
 #if IS_ENABLED(CONFIG_NF_NAT)
 	memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
 	memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+	exp->dir = 0;
 #endif
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_init);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4217715d42dc..31cbb1b55b9e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3549,8 +3549,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
 	if (cda[CTA_EXPECT_FLAGS]) {
 		exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
 		exp->flags &= ~NF_CT_EXPECT_USERSPACE;
-	} else {
-		exp->flags = 0;
 	}
 	if (cda[CTA_EXPECT_FN]) {
 		const char *name = nla_data(cda[CTA_EXPECT_FN]);
@@ -3562,8 +3560,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
 			goto err_out;
 		}
 		exp->expectfn = expfn->expectfn;
-	} else
-		exp->expectfn = NULL;
+	}
 
 	exp->class = class;
 	exp->master = ct;
@@ -3583,12 +3580,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
 						 exp, nf_ct_l3num(ct));
 		if (err < 0)
 			goto err_out;
-#if IS_ENABLED(CONFIG_NF_NAT)
-	} else {
-		memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
-		memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
-		exp->dir = 0;
-#endif
 	}
 	return exp;
 err_out:
-- 
2.53.0


^ permalink raw reply related

* [PATCH net 2/9] netfilter: nft_set_pipapo: don't leak bad clone into future transaction
From: Florian Westphal @ 2026-06-30  4:52 UTC (permalink / raw)
  To: netdev
  Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netfilter-devel, pablo
In-Reply-To: <20260630045243.2657-1-fw@strlen.de>

On memory allocation failure the cloned nft_pipapo_match can enter a bad
state:
 - some fields can have their lookup tables resized while others did
   not
 - bits might have been toggled
 - scratch map can be undersized which also means m->bsize_max can be
   lower than what is required

This means that the next insertion in the same batch can trigger
out-of-bounds writes.

Furthermore, a failure in the first insertion can result in the bad
clone leaking into the next transaction because the abort callback is
never executed in this case (the upper layer saw an error and no attempt
to allocate a transactional request was made).

Record a state for the nft_pipapo_match structure:
- NEW (pristine clone)
- MOD (modified clone with good state)
- ERR (potentially bogus content)

Then make it so that deletes and insertions fail when the clone
entered ERR state.

In case the very first insert attempt results in an error, free the
clone right away.

Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Cc: stable@vger.kernel.org
Reported-and-tested-by: Seesee <cjc000013@gmail.com>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_pipapo.c | 34 +++++++++++++++++++++++++++++-----
 net/netfilter/nft_set_pipapo.h |  8 ++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 706c78853f24..978bb0c01106 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -342,6 +342,8 @@
 #include "nft_set_pipapo_avx2.h"
 #include "nft_set_pipapo.h"
 
+static void nft_pipapo_abort(const struct nft_set *set);
+
 /**
  * pipapo_refill() - For each set bit, set bits from selected mapping table item
  * @map:	Bitmap to be scanned for set bits
@@ -1296,7 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 	const u8 *start_p, *end_p;
 	int i, bsize_max, err = 0;
 
-	if (!m)
+	if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
 		return -ENOMEM;
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
@@ -1367,8 +1369,10 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 		else
 			ret = pipapo_expand(f, start, end, f->groups * f->bb);
 
-		if (ret < 0)
-			return ret;
+		if (ret < 0) {
+			err = ret;
+			goto abort;
+		}
 
 		if (f->bsize > bsize_max)
 			bsize_max = f->bsize;
@@ -1384,7 +1388,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 
 		err = pipapo_realloc_scratch(m, bsize_max);
 		if (err)
-			return err;
+			goto abort;
 
 		m->bsize_max = bsize_max;
 	} else {
@@ -1396,7 +1400,26 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
 
 	pipapo_map(m, rulemap, e);
 
+	m->state = NFT_PIPAPO_CLONE_MOD;
 	return 0;
+abort:
+	DEBUG_NET_WARN_ON_ONCE(m->state == NFT_PIPAPO_CLONE_ERR);
+
+	/* Two rollback cases:
+	 * 1) no previous changes.  nft_pipapo_abort is not
+	 * guaranteed to be invoked (there might be no further
+	 * add/delete requests coming after this).
+	 *
+	 * 2) we had previous changes: there are transaction
+	 * records pointing to this set.  Leave the rollback to
+	 * the transaction handling.
+	 */
+	if (m->state == NFT_PIPAPO_CLONE_NEW)
+		nft_pipapo_abort(set); /* releases m */
+	else
+		m->state = NFT_PIPAPO_CLONE_ERR;
+
+	return err;
 }
 
 /**
@@ -1473,6 +1496,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
 		dst++;
 	}
 
+	new->state = NFT_PIPAPO_CLONE_NEW;
 	return new;
 
 out_mt:
@@ -1896,7 +1920,7 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
 	/* removal must occur on priv->clone, if we are low on memory
 	 * we have no choice and must fail the removal request.
 	 */
-	if (!m)
+	if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
 		return NULL;
 
 	e = pipapo_get(m, (const u8 *)elem->key.val.data,
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index b82abb03576e..a19e980d06ef 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -131,9 +131,16 @@ struct nft_pipapo_scratch {
 	unsigned long __map[];
 };
 
+enum nft_pipapo_clone_state {
+	NFT_PIPAPO_CLONE_NEW,
+	NFT_PIPAPO_CLONE_MOD,
+	NFT_PIPAPO_CLONE_ERR,
+};
+
 /**
  * struct nft_pipapo_match - Data used for lookup and matching
  * @field_count:	Amount of fields in set
+ * @state:		add/delete state; used from control plane
  * @bsize_max:		Maximum lookup table bucket size of all fields, in longs
  * @scratch:		Preallocated per-CPU maps for partial matching results
  * @rcu:		Matching data is swapped on commits
@@ -141,6 +148,7 @@ struct nft_pipapo_scratch {
  */
 struct nft_pipapo_match {
 	u8 field_count;
+	enum nft_pipapo_clone_state state:8;
 	unsigned int bsize_max;
 	struct nft_pipapo_scratch * __percpu *scratch;
 	struct rcu_head rcu;
-- 
2.53.0


^ permalink raw reply related

* [PATCH net 3/9] netfilter: ipset: fix race between dump and ip_set_list resize
From: Florian Westphal @ 2026-06-30  4:52 UTC (permalink / raw)
  To: netdev
  Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netfilter-devel, pablo
In-Reply-To: <20260630045243.2657-1-fw@strlen.de>

From: Xiang Mei <xmei5@asu.edu>

The release path of ip_set_dump_do() and ip_set_dump_done() read
inst->ip_set_list via ip_set_ref_netlink(), a plain rcu_dereference_raw()
of the array pointer. These run from netlink_recvmsg() without the nfnl
mutex and without an RCU read-side critical section.

A concurrent ip_set_create() can grow the array: it publishes the new
array, calls synchronize_net() and then kvfree()s the old one. Since the
dump paths read the array outside any RCU reader, synchronize_net() does
not wait for them and the old array can be freed while they still index
into it, causing a use-after-free.

The dumped set itself stays pinned via set->ref_netlink, so only the
array load needs protecting. Take rcu_read_lock() around it, matching
ip_set_get_byname() and __ip_set_put_byindex().

  BUG: KASAN: slab-use-after-free in ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
  Read of size 8 at addr ffff88800b5c4018 by task exploit/150
  Call Trace:
   ...
   kasan_report (mm/kasan/report.c:595)
   ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
   netlink_dump (net/netlink/af_netlink.c:2325)
   netlink_recvmsg (net/netlink/af_netlink.c:1976)
   sock_recvmsg (net/socket.c:1159)
   __sys_recvfrom (net/socket.c:2315)
   ...
  Oops: general protection fault, probably for non-canonical address ... KASAN NOPTI
  KASAN: maybe wild-memory-access in range [0x02d6...d0-0x02d6...d7]
  RIP: 0010:ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1698)
  Kernel panic - not syncing: Fatal exception

Fixes: 8a02bdd50b2e ("netfilter: ipset: Fix calling ip_set() macro at dumping")
Cc: stable@vger.kernel.org
Reported-by: Weiming Shi <bestswngs@gmail.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/ipset/ip_set_core.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a531b654b8d9..6cfad152d7d1 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1480,7 +1480,11 @@ ip_set_dump_done(struct netlink_callback *cb)
 		struct ip_set_net *inst =
 			(struct ip_set_net *)cb->args[IPSET_CB_NET];
 		ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
-		struct ip_set *set = ip_set_ref_netlink(inst, index);
+		struct ip_set *set;
+
+		rcu_read_lock();
+		set = ip_set_ref_netlink(inst, index);
+		rcu_read_unlock();
 
 		if (set->variant->uref)
 			set->variant->uref(set, cb, false);
@@ -1686,7 +1690,9 @@ ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[IPSET_CB_ARG0]) {
+		rcu_read_lock();
 		set = ip_set_ref_netlink(inst, index);
+		rcu_read_unlock();
 		if (set->variant->uref)
 			set->variant->uref(set, cb, false);
 		pr_debug("release set %s\n", set->name);
-- 
2.53.0


^ permalink raw reply related

* [PATCH net 4/9] netfilter: nf_conntrack_sip: validate skb_dst() before accessing it
From: Florian Westphal @ 2026-06-30  4:52 UTC (permalink / raw)
  To: netdev
  Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netfilter-devel, pablo
In-Reply-To: <20260630045243.2657-1-fw@strlen.de>

From: Pablo Neira Ayuso <pablo@netfilter.org>

tc ingress and openvswitch do not guarantee routing information to be
available. These subsystems use the conntrack helper infrastructure, and
the SIP helper relies on skb_dst() being present if
sip_external_media is set to 1 (a module parameter that is disabled by
default).

This effectively disables the sip_external_media toggle for these
subsystems without resulting in a crash.

Fixes: cae3a2627520 ("openvswitch: Allow attaching helpers to ct action")
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Cc: stable@vger.kernel.org
Reported-by: Ren Wei <n05ec@lzu.edu.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_conntrack_sip.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 5ec3a4a4bbd7..f3f90a866338 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -956,7 +956,6 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 			return NF_ACCEPT;
 		saddr = &ct->tuplehash[!dir].tuple.src.u3;
 	} else if (sip_external_media) {
-		struct net_device *dev = skb_dst(skb)->dev;
 		struct dst_entry *dst = NULL;
 		struct flowi fl;
 
@@ -978,7 +977,11 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 		 * through the same interface as the signalling peer.
 		 */
 		if (dst) {
-			bool external_media = (dst->dev == dev);
+			const struct dst_entry *this_dst = skb_dst(skb);
+			bool external_media = false;
+
+			if (this_dst && dst->dev == this_dst->dev)
+				external_media = true;
 
 			dst_release(dst);
 			if (external_media)
-- 
2.53.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox