Netdev List
 help / color / mirror / Atom feed
* [PATCH net] net: airoha: fix max receive size configuration
From: Lorenzo Bianconi @ 2026-06-25  6:49 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Lorenzo Bianconi
  Cc: linux-arm-kernel, linux-mediatek, netdev, Madhur Agrawal

Set the GDM maximum receive size to AIROHA_MAX_RX_SIZE unconditionally
during hardware initialization instead of updating it according to the
configured MTU. This avoids dropping incoming frames that exceed the
current MTU but could still be processed by the networking stack, which
is able to fragment the reply on the TX side (e.g. ICMP echo requests).
Move the per-port MTU configuration to the PPE egress path where it
belongs, and set the TX frame size by calling airoha_ppe_set_xmit_frame_size()
to dynamically track the maximum MTU across running interfaces sharing
the same PPE instance.
Fix the PPE MTU register addressing to pack two port entries per
register word and add WAN_MTU0 configuration for non-LAN GDM devices.

Fixes: 54d989d58d2a ("net: airoha: Move min/max packet len configuration in airoha_dev_open()")
Tested-by: Madhur Agrawal <madhur.agrawal@airoha.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c  | 68 ++++++++++---------------------
 drivers/net/ethernet/airoha/airoha_eth.h  |  2 +
 drivers/net/ethernet/airoha/airoha_ppe.c  | 39 +++++++++++++-----
 drivers/net/ethernet/airoha/airoha_regs.h |  9 ++--
 4 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index 932b3a3df2e5..3f451c2d4c24 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -178,10 +178,15 @@ static void airoha_fe_maccr_init(struct airoha_eth *eth)
 {
 	int p;
 
-	for (p = 1; p <= ARRAY_SIZE(eth->ports); p++)
+	for (p = 1; p <= ARRAY_SIZE(eth->ports); p++) {
 		airoha_fe_set(eth, REG_GDM_FWD_CFG(p),
 			      GDM_TCP_CKSUM_MASK | GDM_UDP_CKSUM_MASK |
 			      GDM_IP4_CKSUM_MASK | GDM_DROP_CRC_ERR_MASK);
+		airoha_fe_rmw(eth, REG_GDM_LEN_CFG(p),
+			      GDM_SHORT_LEN_MASK | GDM_LONG_LEN_MASK,
+			      FIELD_PREP(GDM_SHORT_LEN_MASK, 60) |
+			      FIELD_PREP(GDM_LONG_LEN_MASK, AIROHA_MAX_RX_SIZE));
+	}
 
 	airoha_fe_rmw(eth, REG_CDM_VLAN_CTRL(1), CDM_VLAN_MASK,
 		      FIELD_PREP(CDM_VLAN_MASK, 0x8100));
@@ -1831,13 +1836,24 @@ static void airoha_update_hw_stats(struct airoha_gdm_dev *dev)
 	spin_unlock(&port->stats_lock);
 }
 
+static void airoha_dev_set_xmit_frame_size(struct net_device *netdev)
+{
+	struct airoha_gdm_dev *dev = netdev_priv(netdev);
+
+	airoha_ppe_set_xmit_frame_size(dev);
+	if (!airoha_is_lan_gdm_dev(dev))
+		airoha_fe_rmw(dev->eth, REG_WAN_MTU0, WAN_MTU0_MASK,
+			      FIELD_PREP(WAN_MTU0_MASK,
+					 VLAN_ETH_HLEN + netdev->mtu));
+}
+
 static int airoha_dev_open(struct net_device *netdev)
 {
-	int err, len = ETH_HLEN + netdev->mtu + ETH_FCS_LEN;
 	struct airoha_gdm_dev *dev = netdev_priv(netdev);
 	struct airoha_gdm_port *port = dev->port;
-	u32 cur_len, pse_port = FE_PSE_PORT_PPE1;
 	struct airoha_qdma *qdma = dev->qdma;
+	u32 pse_port = FE_PSE_PORT_PPE1;
+	int err;
 
 	netif_tx_start_all_queues(netdev);
 	err = airoha_set_vip_for_gdm_port(dev, true);
@@ -1851,19 +1867,7 @@ static int airoha_dev_open(struct net_device *netdev)
 		airoha_fe_clear(qdma->eth, REG_GDM_INGRESS_CFG(port->id),
 				GDM_STAG_EN_MASK);
 
-	cur_len = airoha_fe_get(qdma->eth, REG_GDM_LEN_CFG(port->id),
-				GDM_LONG_LEN_MASK);
-	if (!port->users || len > cur_len) {
-		/* Opening a sibling net_device with a larger MTU updates the
-		 * MTU of already running devices. This is required to allow
-		 * multiple net_devices with different MTUs to share the same
-		 * GDM port.
-		 */
-		airoha_fe_rmw(qdma->eth, REG_GDM_LEN_CFG(port->id),
-			      GDM_SHORT_LEN_MASK | GDM_LONG_LEN_MASK,
-			      FIELD_PREP(GDM_SHORT_LEN_MASK, 60) |
-			      FIELD_PREP(GDM_LONG_LEN_MASK, len));
-	}
+	airoha_dev_set_xmit_frame_size(netdev);
 	port->users++;
 
 	if (!airoha_is_lan_gdm_dev(dev) &&
@@ -1875,30 +1879,6 @@ static int airoha_dev_open(struct net_device *netdev)
 	return 0;
 }
 
-static void airoha_set_port_mtu(struct airoha_eth *eth,
-				struct airoha_gdm_port *port)
-{
-	u32 len = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(port->devs); i++) {
-		struct airoha_gdm_dev *dev = port->devs[i];
-		struct net_device *netdev;
-
-		if (!dev)
-			continue;
-
-		netdev = netdev_from_priv(dev);
-		if (netif_running(netdev))
-			len = max_t(u32, len, netdev->mtu);
-	}
-	len += ETH_HLEN + ETH_FCS_LEN;
-
-	airoha_fe_rmw(eth, REG_GDM_LEN_CFG(port->id),
-		      GDM_LONG_LEN_MASK,
-		      FIELD_PREP(GDM_LONG_LEN_MASK, len));
-}
-
 static int airoha_dev_stop(struct net_device *netdev)
 {
 	struct airoha_gdm_dev *dev = netdev_priv(netdev);
@@ -1909,7 +1889,7 @@ static int airoha_dev_stop(struct net_device *netdev)
 	airoha_set_vip_for_gdm_port(dev, false);
 
 	if (--port->users)
-		airoha_set_port_mtu(dev->eth, port);
+		airoha_ppe_set_xmit_frame_size(dev);
 	else
 		airoha_set_gdm_port_fwd_cfg(qdma->eth,
 					    REG_GDM_FWD_CFG(port->id),
@@ -1962,10 +1942,6 @@ static int airoha_enable_gdm2_loopback(struct airoha_gdm_dev *dev)
 		      FIELD_PREP(LPBK_CHAN_MASK, chan) |
 		      LBK_GAP_MODE_MASK | LBK_LEN_MODE_MASK |
 		      LBK_CHAN_MODE_MASK | LPBK_EN_MASK);
-	airoha_fe_rmw(eth, REG_GDM_LEN_CFG(AIROHA_GDM2_IDX),
-		      GDM_SHORT_LEN_MASK | GDM_LONG_LEN_MASK,
-		      FIELD_PREP(GDM_SHORT_LEN_MASK, 60) |
-		      FIELD_PREP(GDM_LONG_LEN_MASK, AIROHA_MAX_MTU));
 	/* Forward the traffic to the proper GDM port */
 	pse_port = port->id == AIROHA_GDM3_IDX ? FE_PSE_PORT_GDM3
 					       : FE_PSE_PORT_GDM4;
@@ -2098,7 +2074,7 @@ static int airoha_dev_change_mtu(struct net_device *netdev, int mtu)
 
 	WRITE_ONCE(netdev->mtu, mtu);
 	if (port->users)
-		airoha_set_port_mtu(dev->eth, port);
+		airoha_dev_set_xmit_frame_size(netdev);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h
index d7ff8c5200e2..0c3fb6e5d7f1 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.h
+++ b/drivers/net/ethernet/airoha/airoha_eth.h
@@ -23,6 +23,7 @@
 #define AIROHA_MAX_DSA_PORTS		7
 #define AIROHA_MAX_NUM_RSTS		3
 #define AIROHA_MAX_MTU			9220
+#define AIROHA_MAX_RX_SIZE		16128
 #define AIROHA_MAX_PACKET_SIZE		2048
 #define AIROHA_NUM_QOS_CHANNELS		4
 #define AIROHA_NUM_QOS_QUEUES		8
@@ -676,6 +677,7 @@ int airoha_get_fe_port(struct airoha_gdm_dev *dev);
 bool airoha_is_valid_gdm_dev(struct airoha_eth *eth,
 			     struct airoha_gdm_dev *dev);
 
+void airoha_ppe_set_xmit_frame_size(struct airoha_gdm_dev *dev);
 void airoha_ppe_set_cpu_port(struct airoha_gdm_dev *dev, u8 ppe_id, u8 fport);
 bool airoha_ppe_is_enabled(struct airoha_eth *eth, int index);
 void airoha_ppe_check_skb(struct airoha_ppe_dev *dev, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c
index 42f4b0f21d17..e7c78293002a 100644
--- a/drivers/net/ethernet/airoha/airoha_ppe.c
+++ b/drivers/net/ethernet/airoha/airoha_ppe.c
@@ -97,6 +97,33 @@ void airoha_ppe_set_cpu_port(struct airoha_gdm_dev *dev, u8 ppe_id, u8 fport)
 		      __field_prep(DFT_CPORT_MASK(fport), fe_cpu_port));
 }
 
+void airoha_ppe_set_xmit_frame_size(struct airoha_gdm_dev *dev)
+{
+	struct airoha_gdm_port *port = dev->port;
+	struct airoha_eth *eth = dev->eth;
+	int i, ppe_id, index;
+	u32 len = 0;
+
+	for (i = 0; i < ARRAY_SIZE(port->devs); i++) {
+		struct airoha_gdm_dev *d = port->devs[i];
+		struct net_device *netdev;
+
+		if (!d)
+			continue;
+
+		netdev = netdev_from_priv(d);
+		if (netif_running(netdev))
+			len = max_t(u32, len, netdev->mtu);
+	}
+	len += VLAN_ETH_HLEN;
+
+	ppe_id = !airoha_is_lan_gdm_dev(dev) && airoha_ppe_is_enabled(eth, 1);
+	index = port->id == AIROHA_GDM4_IDX ? 7 : port->id;
+	airoha_fe_rmw(eth, REG_PPE_MTU(ppe_id, index),
+		      FP_EGRESS_MTU_MASK(index),
+		      __field_prep(FP_EGRESS_MTU_MASK(index), len));
+}
+
 static void airoha_ppe_hw_init(struct airoha_ppe *ppe)
 {
 	u32 sram_ppe_num_data_entries = PPE_SRAM_NUM_ENTRIES, sram_num_entries;
@@ -115,8 +142,6 @@ static void airoha_ppe_hw_init(struct airoha_ppe *ppe)
 		PPE_RAM_NUM_ENTRIES_SHIFT(sram_ppe_num_data_entries);
 
 	for (i = 0; i < eth->soc->num_ppe; i++) {
-		int p;
-
 		airoha_fe_wr(eth, REG_PPE_TB_BASE(i),
 			     ppe->foe_dma + sram_tb_size);
 
@@ -166,15 +191,6 @@ static void airoha_ppe_hw_init(struct airoha_ppe *ppe)
 		airoha_fe_wr(eth, REG_PPE_HASH_SEED(i), PPE_HASH_SEED);
 		airoha_fe_clear(eth, REG_PPE_PPE_FLOW_CFG(i),
 				PPE_FLOW_CFG_IP6_6RD_MASK);
-
-		for (p = 0; p < ARRAY_SIZE(eth->ports); p++)
-			airoha_fe_rmw(eth, REG_PPE_MTU(i, p),
-				      FP0_EGRESS_MTU_MASK |
-				      FP1_EGRESS_MTU_MASK,
-				      FIELD_PREP(FP0_EGRESS_MTU_MASK,
-						 AIROHA_MAX_MTU) |
-				      FIELD_PREP(FP1_EGRESS_MTU_MASK,
-						 AIROHA_MAX_MTU));
 	}
 
 	for (i = 0; i < ARRAY_SIZE(eth->ports); i++) {
@@ -196,6 +212,7 @@ static void airoha_ppe_hw_init(struct airoha_ppe *ppe)
 				 airoha_ppe_is_enabled(eth, 1);
 			fport = airoha_get_fe_port(dev);
 			airoha_ppe_set_cpu_port(dev, ppe_id, fport);
+			airoha_ppe_set_xmit_frame_size(dev);
 		}
 	}
 }
diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h
index 436f3c8779c1..6fed63d013b4 100644
--- a/drivers/net/ethernet/airoha/airoha_regs.h
+++ b/drivers/net/ethernet/airoha/airoha_regs.h
@@ -327,9 +327,8 @@
 #define PPE_SRAM_TABLE_EN_MASK			BIT(0)
 
 #define REG_PPE_MTU_BASE(_n)			(((_n) ? PPE2_BASE : PPE1_BASE) + 0x304)
-#define REG_PPE_MTU(_m, _n)			(REG_PPE_MTU_BASE(_m) + ((_n) << 2))
-#define FP1_EGRESS_MTU_MASK			GENMASK(29, 16)
-#define FP0_EGRESS_MTU_MASK			GENMASK(13, 0)
+#define REG_PPE_MTU(_m, _n)			(REG_PPE_MTU_BASE(_m) + (((_n) / 2) << 2))
+#define FP_EGRESS_MTU_MASK(_n)			GENMASK(13 + (((_n) % 2) << 4), ((_n) % 2) << 4)
 
 #define REG_PPE_RAM_CTRL(_n)			(((_n) ? PPE2_BASE : PPE1_BASE) + 0x31c)
 #define PPE_SRAM_CTRL_ACK_MASK			BIT(31)
@@ -377,6 +376,10 @@
 #define REG_SRC_PORT_FC_MAP6		0x2298
 #define FC_ID_OF_SRC_PORT_MASK(_n)	GENMASK(4 + ((_n) << 3), ((_n) << 3))
 
+#define REG_WAN_MTU0			0x2300
+#define WAN_MTU1_MASK			GENMASK(29, 16)
+#define WAN_MTU0_MASK			GENMASK(13, 0)
+
 #define REG_CDM5_RX_OQ1_DROP_CNT	0x29d4
 
 /* QDMA */

---
base-commit: fd1269e454089abda0e4f9e5e25ecd02a90ab009
change-id: 20260618-airoha-fix-rx-max-len-57654b661646

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>


^ permalink raw reply related

* [PATCH v2 net 0/3] net: udp_tunnel: fix races and use-after-free
From: Eric Dumazet @ 2026-06-25  6:59 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Yue Sun, Stanislav Fomichev, netdev, eric.dumazet,
	Eric Dumazet

Yue Sun reported a use-after-free and debugobjects warning in
udp_tunnel_nic_device_sync_work() when concurrently creating and
destroying netdevsim and geneve devices.

This series resolves the UAF and the underlying data races that
would otherwise defeat the fix.

The core issue is a workqueue re-queue race combined with data races
introduced by the lock-splitting in commit 1ead7501094c ("udp_tunnel:
remove rtnl_lock dependency"). That commit allowed the device reset
path (reset_ntf) to run without holding the RTNL lock (using only
utn->lock), while the port addition paths (add_port) still run under
RTNL without acquiring utn->lock.

This series fixes these issues in three steps:

1. Patch 1 (Jakub's fix) addresses the UAF by preventing double-queueing
   of the sync work. If work_pending is already set, we return early
   in device_sync(), blocking a second work item from entering the
   queue while the first is blocked on RTNL.

2. Patch 2 converts the state flags (need_sync, need_replay, work_pending)
   from bitfields to atomic bitops. Because these flags share a single
   byte, concurrent RMW writes from the RTNL-locked path and the RTNL-less
   reset path corrupt the byte. This corruption could clear work_pending,
   defeating the UAF fix.

3. Patch 3 fixes a similar data race on the 'missed' bitmap. Writes
   (__set_bit) happen under RTNL, while reads (should_replay) happen
   under utn->lock without RTNL. We convert this to use atomic set_bit(),
   READ_ONCE() for the fast-path read, and WRITE_ONCE() for clearing.

Reported-by: Yue Sun <samsun1006219@gmail.com>

Eric Dumazet (3):
  net: udp_tunnel: prevent double queueing in udp_tunnel_nic_device_sync
  net: udp_tunnel: convert state flags to atomic bitops
  net: udp_tunnel: use atomic bitops for missed bitmap

 net/ipv4/udp_tunnel_nic.c | 51 +++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply

* [PATCH v2 net 1/3] net: udp_tunnel: prevent double queueing in udp_tunnel_nic_device_sync
From: Eric Dumazet @ 2026-06-25  6:59 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Yue Sun, Stanislav Fomichev, netdev, eric.dumazet,
	Eric Dumazet
In-Reply-To: <20260625065938.654652-1-edumazet@google.com>

Yue Sun reported a use-after-free and debugobjects warning in
udp_tunnel_nic_device_sync_work() during concurrent device operations.

The workqueue core clears the internal pending bit before invoking the
worker. At that point, a concurrent thread can queue the work again.
When the already running worker eventually clears the work_pending flag
to 0, it mistakenly clears the flag for the newly queued instance.
udp_tunnel_nic_unregister() then observes work_pending as 0 and frees
the structure while the second work item is still active in the queue,
leading to UAF.

Fix this by returning early in udp_tunnel_nic_device_sync() if
work_pending is already set, preventing redundant work queueing.

Fixes: cc4e3835eff4 ("udp_tunnel: add central NIC RX port offload infrastructure")
Reported-by: Yue Sun <samsun1006219@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/udp_tunnel_nic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 9944ed923ddfd10f9adf6ad788c0740daeaf2adb..3b32a0afa9798d3c416d9ae570e6d529f70e6697 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -301,7 +301,7 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 static void
 udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
-	if (!utn->need_sync)
+	if (!utn->need_sync || utn->work_pending)
 		return;
 
 	queue_work(udp_tunnel_nic_workqueue, &utn->work);
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v2 net 2/3] net: udp_tunnel: convert state flags to atomic bitops
From: Eric Dumazet @ 2026-06-25  6:59 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Yue Sun, Stanislav Fomichev, netdev, eric.dumazet,
	Eric Dumazet
In-Reply-To: <20260625065938.654652-1-edumazet@google.com>

The state flags of struct udp_tunnel_nic (need_sync, need_replay,
work_pending) are currently bitfields sharing a single byte.

These flags can be modified concurrently from different contexts:
- RTNL-locked paths (like add_port/del_port) write to need_sync and
  work_pending.
- The RTNL-less reset path (reset_ntf, used by netdevsim) writes to
  need_sync and need_replay under utn->lock.

Since they share a byte, concurrent writes are compiled into non-atomic
Read-Modify-Write (RMW) operations that can corrupt each other. For
example, a write to need_replay in reset_ntf can overwrite and clear
work_pending, defeating the double-queueing prevention and causing UAF.

Fix this by converting these state flags to atomic bitops, ensuring
safe concurrent writes across RTNL-locked and RTNL-less paths.

Fixes: 1ead7501094c ("udp_tunnel: remove rtnl_lock dependency")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/udp_tunnel_nic.c | 43 ++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 3b32a0afa9798d3c416d9ae570e6d529f70e6697..840be5d79fc0ac3142049dcb9f1105a5844da9ae 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -30,9 +30,7 @@ struct udp_tunnel_nic_table_entry {
  * @work:	async work for talking to hardware from process context
  * @dev:	netdev pointer
  * @lock:	protects all fields
- * @need_sync:	at least one port start changed
- * @need_replay: space was freed, we need a replay of all ports
- * @work_pending: @work is currently scheduled
+ * @flags:	sync, replay, pending flags
  * @n_tables:	number of tables under @entries
  * @missed:	bitmap of tables which overflown
  * @entries:	table of tables of ports currently offloaded
@@ -44,9 +42,10 @@ struct udp_tunnel_nic {
 
 	struct mutex lock;
 
-	u8 need_sync:1;
-	u8 need_replay:1;
-	u8 work_pending:1;
+	unsigned long flags;
+#define UDP_TUNNEL_NIC_NEED_SYNC	0
+#define UDP_TUNNEL_NIC_NEED_REPLAY	1
+#define UDP_TUNNEL_NIC_WORK_PENDING	2
 
 	unsigned int n_tables;
 	unsigned long missed;
@@ -116,7 +115,7 @@ udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
 			   unsigned int flag)
 {
 	entry->flags |= flag;
-	utn->need_sync = 1;
+	set_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
 }
 
 static void
@@ -283,7 +282,7 @@ udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
 static void
 __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
-	if (!utn->need_sync)
+	if (!test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags))
 		return;
 
 	if (dev->udp_tunnel_nic_info->sync_table)
@@ -291,21 +290,27 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 	else
 		udp_tunnel_nic_device_sync_by_port(dev, utn);
 
-	utn->need_sync = 0;
+	clear_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
 	/* Can't replay directly here, in case we come from the tunnel driver's
 	 * notification - trying to replay may deadlock inside tunnel driver.
 	 */
-	utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
+	if (udp_tunnel_nic_should_replay(dev, utn))
+		set_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
+	else
+		clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 }
 
 static void
 udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
-	if (!utn->need_sync || utn->work_pending)
+	if (!test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags))
+		return;
+
+	if (test_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags))
 		return;
 
 	queue_work(udp_tunnel_nic_workqueue, &utn->work);
-	utn->work_pending = 1;
+	set_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags);
 }
 
 static bool
@@ -552,7 +557,7 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
 
 	mutex_lock(&utn->lock);
 
-	utn->need_sync = false;
+	clear_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
 	for (i = 0; i < utn->n_tables; i++)
 		for (j = 0; j < info->tables[i].n_entries; j++) {
 			struct udp_tunnel_nic_table_entry *entry;
@@ -696,8 +701,8 @@ udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn)
 	for (i = 0; i < utn->n_tables; i++)
 		memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
 						      sizeof(**utn->entries)));
-	WARN_ON(utn->need_sync);
-	utn->need_replay = 0;
+	WARN_ON(test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags));
+	clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 }
 
 static void
@@ -714,7 +719,7 @@ udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
 		for (j = 0; j < info->tables[i].n_entries; j++)
 			udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
 	utn->missed = 0;
-	utn->need_replay = 0;
+	clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 
 	if (!info->shared) {
 		udp_tunnel_get_rx_info(dev);
@@ -736,10 +741,10 @@ static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
 	rtnl_lock();
 	mutex_lock(&utn->lock);
 
-	utn->work_pending = 0;
+	clear_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags);
 	__udp_tunnel_nic_device_sync(utn->dev, utn);
 
-	if (utn->need_replay)
+	if (test_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags))
 		udp_tunnel_nic_replay(utn->dev, utn);
 
 	mutex_unlock(&utn->lock);
@@ -904,7 +909,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
 	/* Wait for the work to be done using the state, netdev core will
 	 * retry unregister until we give up our reference on this device.
 	 */
-	if (utn->work_pending)
+	if (test_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags))
 		return;
 
 	udp_tunnel_nic_free(utn);
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v2 net 3/3] net: udp_tunnel: use atomic bitops for missed bitmap
From: Eric Dumazet @ 2026-06-25  6:59 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Yue Sun, Stanislav Fomichev, netdev, eric.dumazet,
	Eric Dumazet
In-Reply-To: <20260625065938.654652-1-edumazet@google.com>

The 'missed' bitmap in struct udp_tunnel_nic can be accessed
concurrently:
- Writes (__set_bit) happen in the port add path (add_port), which
  holds the RTNL lock.
- Reads (checking if missed is non-zero) happen in the reset path
  (reset_ntf) via __udp_tunnel_nic_device_sync(), which holds
  utn->lock but does not hold RTNL after the blamed commit.

This setup creates a data race between concurrent writes and reads
on different CPUs. Fix this by using atomic set_bit() for writes,
READ_ONCE() for the fast-path read, and WRITE_ONCE() for clearing
the bitmap.

Fixes: 1ead7501094c ("udp_tunnel: remove rtnl_lock dependency")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/udp_tunnel_nic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 840be5d79fc0ac3142049dcb9f1105a5844da9ae..9a567a87635caaf76f5b88029a7f28a65c795efc 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -147,7 +147,7 @@ udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
 	const struct udp_tunnel_nic_table_info *table;
 	unsigned int i, j;
 
-	if (!utn->missed)
+	if (!READ_ONCE(utn->missed))
 		return false;
 
 	for (i = 0; i < utn->n_tables; i++) {
@@ -353,7 +353,7 @@ udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn,
 			if (!udp_tunnel_nic_entry_is_free(entry) &&
 			    entry->port == ti->port &&
 			    entry->type != ti->type) {
-				__set_bit(i, &utn->missed);
+				set_bit(i, &utn->missed);
 				return true;
 			}
 		}
@@ -488,7 +488,7 @@ udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn,
 		 * are no devices currently which have multiple tables accepting
 		 * the same tunnel type, and false positives are okay.
 		 */
-		__set_bit(i, &utn->missed);
+		set_bit(i, &utn->missed);
 	}
 
 	return false;
@@ -718,7 +718,7 @@ udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
 	for (i = 0; i < utn->n_tables; i++)
 		for (j = 0; j < info->tables[i].n_entries; j++)
 			udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
-	utn->missed = 0;
+	WRITE_ONCE(utn->missed, 0);
 	clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 
 	if (!info->shared) {
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v3 net] ipv6: fib6: fix NULL deref in fib6_walk_continue() on multi-batch dump
From: Pengfei Zhang @ 2026-06-25  7:05 UTC (permalink / raw)
  To: dsahern, idosch
  Cc: davem, edumazet, kuba, pabeni, horms, netdev, linux-kernel,
	chenzhangqi, baohua, Pengfei Zhang

inet6_dump_fib() saves its progress in cb->args[1] as a positional
index within the current hash chain.  Between batches, a concurrent
fib6_new_table() can insert a new table at the chain head, shifting
all existing entries.  The saved index then lands on a different
table, causing fib6_dump_table() to set w->root to the wrong table
while w->node still points into the previous one.
fib6_walk_continue() dereferences w->node->parent (NULL) and panics:

  BUG: kernel NULL pointer dereference, address: 0000000000000008
  RIP: 0010:fib6_walk_continue+0x6e/0x170
  Call Trace:
   <TASK>
   fib6_dump_table.isra.0+0xc5/0x240
   inet6_dump_fib+0xf6/0x420
   rtnl_dumpit+0x30/0xa0
   netlink_dump+0x15b/0x460
   netlink_recvmsg+0x1d6/0x2a0
   ____sys_recvmsg+0x17a/0x190

Fix by storing tb->tb6_id in cb->args[1] instead of a positional
index.  On resume, skip entries until the id matches; a concurrent
head-insert can never match the saved id, so the walker always
resumes on the correct table.

Fixes: 1b43af5480c3 ("[IPV6]: Increase number of possible routing tables to 2^32")
Signed-off-by: Pengfei Zhang <zhangfeionline@gmail.com>
---
v3:
 - Fix Author/SOB email mismatch (use gmail for both)
 - Drop "RTNL lock is released" from commit message (RTNL removed from IPv6 FIB)
 - Reorder local variables to follow reverse xmas tree
 - Move blank line after continue for readability

v2:
 - Add Fixes tag

v2: https://lore.kernel.org/netdev/20260625044101.939070-1-zhangfeionline@gmail.com/
v1: https://lore.kernel.org/netdev/20260624171156.822055-1-zhangfeionline@gmail.com/

 net/ipv6/ip6_fib.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index fc95738de..a130cdfae 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -636,12 +636,12 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	};
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
-	unsigned int e = 0, s_e;
 	struct hlist_head *head;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	unsigned int h, s_h;
 	int err = 0;
+	u32 s_id;
 
 	rcu_read_lock();
 	if (cb->strict_check) {
@@ -701,23 +701,22 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	s_h = cb->args[0];
-	s_e = cb->args[1];
+	s_id = cb->args[1];
 
-	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
-		e = 0;
+	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_id = 0) {
 		head = &net->ipv6.fib_table_hash[h];
 		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
-			if (e < s_e)
-				goto next;
+			if (s_id && tb->tb6_id != s_id)
+				continue;
+
+			s_id = 0;
+			cb->args[1] = tb->tb6_id;
 			err = fib6_dump_table(tb, skb, cb);
 			if (err != 0)
 				goto out;
-next:
-			e++;
 		}
 	}
 out:
-	cb->args[1] = e;
 	cb->args[0] = h;
 
 unlock:
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH] net: stmmac: fix missed le32_to_cpu()
From: Maxime Chevallier @ 2026-06-25  7:07 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Ben Dooks, Andrew Lunn, David S. Miller, Eric Dumazet,
	Paolo Abeni, Maxime Coquelin, Alexandre Torgue,
	Russell King (Oracle), netdev, linux-stm32, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260624192205.4485cd61@kernel.org>



On 6/25/26 04:22, Jakub Kicinski wrote:
> On Mon, 22 Jun 2026 19:51:39 +0200 Maxime Chevallier wrote:
>> Hi Ben,
>>
>> On 6/22/26 16:37, Ben Dooks wrote:
>>> The print in ndesc_display_ring() sends the des2 and des3
>>> to the pr_info() without passing them through the relevant
>>> conversion to cpu order.
>>>
>>> Fix the (prototype) sparse warnings by using le32_to_cpu():
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 6 (different base types)
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des2
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17: warning: incorrect type in argument 7 (different base types)
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    expected unsigned int
>>> drivers/net/ethernet/stmicro/stmmac/norm_desc.c:258:17:    got restricted __le32 [usertype] des3
>>>
>>> Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>  
>>
>> I agree on the principle, but this isn't a fix so this'll have to wait
>> until net-next re-opens :)
> 
> Humpf, why are we not seeing this on x86 allmodconfig ? 🤔️
> 
> $ make C=1 W=1 drivers/net/ethernet/stmicro/stmmac/norm_desc.o 
>   DESCEND objtool
>   CC [M]  drivers/net/ethernet/stmicro/stmmac/norm_desc.o
>   CHECK   drivers/net/ethernet/stmicro/stmmac/norm_desc.c
> $

Heh good point indeed !
  
>>> Fix the (prototype) sparse warnings by using le32_to_cpu():

Ben, what's this "prototype" sparse ? a custom tool of yours that
you used to find that ?

Maxime



^ permalink raw reply

* RE: [Intel-wired-lan] [TEST] Weird RSS state on ice
From: Loktionov, Aleksandr @ 2026-06-25  7:11 UTC (permalink / raw)
  To: Jakub Kicinski, Pielech, Adrian, Kitszel, Przemyslaw
  Cc: netdev@vger.kernel.org, intel-wired-lan@lists.osuosl.org
In-Reply-To: <20260624083020.131a75fe@kernel.org>



> -----Original Message-----
> From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On Behalf
> Of Jakub Kicinski
> Sent: Wednesday, June 24, 2026 5:30 PM
> To: Pielech, Adrian <adrian.pielech@intel.com>; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>
> Cc: netdev@vger.kernel.org; intel-wired-lan@lists.osuosl.org
> Subject: [Intel-wired-lan] [TEST] Weird RSS state on ice
> 
> Hi!
> 
> I noticed in the netdev CI that the ice runner fails to run the
> toeplitz tests because of the RSS config.
> 
> https://netdev-ci-results.intel.com/ice-results/net-next-hw-2026-06-
> 23--00-00/ice-E810-CQ2/toeplitz.py/stdout
> 
> I added some extra debug on the branch:
> 
> net.lib.ynl.pyynl.lib.ynl.NlError: Netlink error: hash field config is
> not symmetric 16 304: Invalid argument {'bad-attr': '.input-xfrm'}
> 
> 16, 304 means GTP flow, GTP_TEID field. So we are trying to disable
> symmetric RSS, but the field configuration contains TEID. The problem
> is this is an illegal configuration in the first place. We are
> _disabling_ symmetric RSS, but the kernel tries to make sure that both
> before and after states are correct (because the configuration
> involves multiple calls to the drivers and may fail half-way-thru). If
> the current config is illegal net/ethtool/ won't even let us restore
> it to sane state.
> 
> So the question is how we got into this state. It does not happen on
> netdev machines. And on Intel machines it happens randomly around 30%
> of the time.
> 
> I tried to look thru the driver code and I don't see how we could end
> up with such a config.
> 
> Could y'all have a look and figure out / fix this? This has been
> happening for a while back but I was waiting until the merge window to
> poke at it first.

Good day, Jakub

The patchset didn't help? 

[PATCH iwl-next v5 2/2] ice: implement symmetric RSS hash configuration

With the best regards
Alex

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: conntrack: annotate ct limit hlist traversal
From: Eelco Chaudron @ 2026-06-25  7:21 UTC (permalink / raw)
  To: Runyu Xiao
  Cc: aconole, i.maximets, davem, edumazet, kuba, pabeni, horms, netdev,
	dev, linux-kernel, jianhao.xu
In-Reply-To: <20260624150149.3510541-1-runyu.xiao@seu.edu.cn>



On 24 Jun 2026, at 17:01, Runyu Xiao wrote:

> ct_limit_set() is documented as being called with ovs_mutex held. It
> walks the ct limit hlist with hlist_for_each_entry_rcu(), but the
> iterator does not currently pass the OVS lockdep condition used
> elsewhere for RCU-protected OVS objects.
>
> Pass lockdep_ovsl_is_held() to the iterator. This matches the function's
> existing caller contract and lets CONFIG_PROVE_RCU_LIST distinguish the
> ovs_mutex-protected update path from the RCU read-side ct_limit_get()
> path.
>
> This was found by our static analysis tool and then manually reviewed
> against the current tree. In the reviewed CONFIG_PROVE_RCU_LIST triage
> run, the writer-side ct limit update produced the expected "RCU-list
> traversed in non-reader section!!" warning while ovs_mutex was held,
> with the stack matching ct_limit_set() and ovs_ct_limit_set_zone_limit().
> The change is limited to documenting the existing protection contract.
>
> This is a lockdep annotation cleanup. It does not change the conntrack
> limit list update or release behavior.
>
> Signed-off-by: Runyu Xiao <runyu.xiao@seu.edu.cn>
> ---

Hi Runyu,

I think net-next is still closed, so you might need to resend it once
it opens. But the patch itself looks good to me.

Reviewed-by: Eelco Chaudron <echaudro@redhat.com>


^ permalink raw reply

* Re: [PATCH nf] netfilter: ipset: fix race between dump and ip_set_list resize
From: Jozsef Kadlecsik @ 2026-06-25  7:21 UTC (permalink / raw)
  To: Xiang Mei
  Cc: Florian Westphal, Pablo Neira Ayuso, Jozsef Kadlecsik,
	Phil Sutter, netfilter-devel, kees, horms, Weiming Shi, coreteam,
	netdev, linux-kernel
In-Reply-To: <20260625010006.1448558-1-xmei5@asu.edu>

Hi,

On Wed, 24 Jun 2026, Xiang Mei wrote:

> The release path of ip_set_dump_do() and ip_set_dump_done() read
> inst->ip_set_list via ip_set_ref_netlink(), a plain rcu_dereference_raw()
> of the array pointer. These run from netlink_recvmsg() without the nfnl
> mutex and without an RCU read-side critical section.
>
> A concurrent ip_set_create() can grow the array: it publishes the new
> array, calls synchronize_net() and then kvfree()s the old one. Since the
> dump paths read the array outside any RCU reader, synchronize_net() does
> not wait for them and the old array can be freed while they still index
> into it, causing a use-after-free.
>
> The dumped set itself stays pinned via set->ref_netlink, so only the
> array load needs protecting. Take rcu_read_lock() around it, matching
> ip_set_get_byname() and __ip_set_put_byindex().
>
>  BUG: KASAN: slab-use-after-free in ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
>  Read of size 8 at addr ffff88800b5c4018 by task exploit/150
>  Call Trace:
>   ...
>   kasan_report (mm/kasan/report.c:595)
>   ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
>   netlink_dump (net/netlink/af_netlink.c:2325)
>   netlink_recvmsg (net/netlink/af_netlink.c:1976)
>   sock_recvmsg (net/socket.c:1159)
>   __sys_recvfrom (net/socket.c:2315)
>   ...
>  Oops: general protection fault, probably for non-canonical address ... KASAN NOPTI
>  KASAN: maybe wild-memory-access in range [0x02d6...d0-0x02d6...d7]
>  RIP: 0010:ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1698)
>  Kernel panic - not syncing: Fatal exception
>
> Fixes: 8a02bdd50b2e ("netfilter: ipset: Fix calling ip_set() macro at dumping")
> Reported-by: Weiming Shi <bestswngs@gmail.com>
> Assisted-by: Claude:claude-opus-4-8
> Signed-off-by: Xiang Mei <xmei5@asu.edu>

Thank you for the nice report and fix, good catch.

Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>

Best regards,
Jozsef
> ---
> net/netfilter/ipset/ip_set_core.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index a531b654b8d9..6cfad152d7d1 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -1480,7 +1480,11 @@ ip_set_dump_done(struct netlink_callback *cb)
> 		struct ip_set_net *inst =
> 			(struct ip_set_net *)cb->args[IPSET_CB_NET];
> 		ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
> -		struct ip_set *set = ip_set_ref_netlink(inst, index);
> +		struct ip_set *set;
> +
> +		rcu_read_lock();
> +		set = ip_set_ref_netlink(inst, index);
> +		rcu_read_unlock();
>
> 		if (set->variant->uref)
> 			set->variant->uref(set, cb, false);
> @@ -1686,7 +1690,9 @@ ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
> release_refcount:
> 	/* If there was an error or set is done, release set */
> 	if (ret || !cb->args[IPSET_CB_ARG0]) {
> +		rcu_read_lock();
> 		set = ip_set_ref_netlink(inst, index);
> +		rcu_read_unlock();
> 		if (set->variant->uref)
> 			set->variant->uref(set, cb, false);
> 		pr_debug("release set %s\n", set->name);
> -- 
> 2.43.0
>
>
>

^ permalink raw reply

* Re: [PATCH] net: pch_gbe: return errors from MIIM accesses
From: Maxime Chevallier @ 2026-06-25  7:36 UTC (permalink / raw)
  To: Pengpeng Hou, Andrew Lunn, davem, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel
In-Reply-To: <20260625030503.93588-1-pengpeng@iscas.ac.cn>



On 6/25/26 05:05, Pengpeng Hou wrote:
> pch_gbe_mac_ctrl_miim() polls for the MIIM controller to become ready,
> but returns zero on the initial ready timeout and ignores the completion
> timeout after issuing the operation. MDIO and PHY helpers can then report
> success with zero or stale data.
> 
> Make the MIIM helper return an errno and pass read data through an output
> parameter. Propagate the error through the MDIO read path, the probe-time
> PHY discovery path, and the internal PHY register helpers that already
> return an error status.
> 
> Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
> ---
>  .../net/ethernet/oki-semi/pch_gbe/pch_gbe.h   |  4 +-
>  .../ethernet/oki-semi/pch_gbe/pch_gbe_main.c  | 54 ++++++++++++++-----
>  .../ethernet/oki-semi/pch_gbe/pch_gbe_phy.c   | 22 +++++---
>  3 files changed, 57 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> index 108f312bc542..4bdf0afca462 100644
> --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
> @@ -619,6 +619,6 @@ void pch_gbe_set_ethtool_ops(struct net_device *netdev);
>  
>  /* pch_gbe_mac.c */
>  s32 pch_gbe_mac_force_mac_fc(struct pch_gbe_hw *hw);
> -u16 pch_gbe_mac_ctrl_miim(struct pch_gbe_hw *hw, u32 addr, u32 dir, u32 reg,
> -			  u16 data);
> +int pch_gbe_mac_ctrl_miim(struct pch_gbe_hw *hw, u32 addr, u32 dir, u32 reg,
> +			  u16 data, u16 *read_data);

You should look at and replicate what other MDIO accessors do, that is,
return a negative value on error and the actual value read otherwise.

Maxime

^ permalink raw reply

* [PATCH 6.18] sctp: disable BH before calling udp_tunnel_xmit_skb()
From: Alexander Martyniuk @ 2026-06-25  7:43 UTC (permalink / raw)
  To: stable, Greg Kroah-Hartman
  Cc: marcelo.leitner, lucien.xin, davem, edumazet, kuba, pabeni, horms,
	bestswngs, linux-sctp, netdev, linux-kernel, Alexander Martyniuk

From: Xin Long <lucien.xin@gmail.com>

commit 2cd7e6971fc2787408ceef17906ea152791448cf upstream.

udp_tunnel_xmit_skb() / udp_tunnel6_xmit_skb() are expected to run with
BH disabled.  After commit 6f1a9140ecda ("net: add xmit recursion limit
to tunnel xmit functions"), on the path:

  udp(6)_tunnel_xmit_skb() -> ip(6)tunnel_xmit()

dev_xmit_recursion_inc()/dec() must stay balanced on the same CPU.

Without local_bh_disable(), the context may move between CPUs, which can
break the inc/dec pairing. This may lead to incorrect recursion level
detection and cause packets to be dropped in ip(6)tunnel_xmit() or
__dev_queue_xmit().

Fix it by disabling BH around both IPv4 and IPv6 SCTP UDP xmit paths.

In my testing, after enabling SCTP over UDP:

  # ip net exec ha sysctl -w net.sctp.udp_port=9899
  # ip net exec ha sysctl -w net.sctp.encap_port=9899
  # ip net exec hb sysctl -w net.sctp.udp_port=9899
  # ip net exec hb sysctl -w net.sctp.encap_port=9899

  # ip net exec ha iperf3 -s

- without this patch:

  # ip net exec hb iperf3 -c 192.168.0.1 --sctp
  [  5]   0.00-10.00  sec  37.2 MBytes  31.2 Mbits/sec  sender
  [  5]   0.00-10.00  sec  37.1 MBytes  31.1 Mbits/sec  receiver

- with this patch:

  # ip net exec hb iperf3 -c 192.168.0.1 --sctp
  [  5]   0.00-10.00  sec  3.14 GBytes  2.69 Gbits/sec  sender
  [  5]   0.00-10.00  sec  3.14 GBytes  2.69 Gbits/sec  receiver

Fixes: 6f1a9140ecda ("net: add xmit recursion limit to tunnel xmit functions")
Fixes: 046c052b475e ("sctp: enable udp tunneling socks")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Link: https://patch.msgid.link/c874a8548221dcd56ff03c65ba75a74e6cf99119.1776017727.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexander Martyniuk <alexevgmart@gmail.com>
---
 net/sctp/ipv6.c     | 2 ++
 net/sctp/protocol.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index d725b2158758..7434309785cc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -261,9 +261,11 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
 	label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);
 
+	local_bh_disable();
 	udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
 			     tclass, ip6_dst_hoplimit(dst), label,
 			     sctp_sk(sk)->udp_port, t->encap_port, false, 0);
+	local_bh_enable();
 	return 0;
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9dbc24af749b..6ce58fc95ef5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1102,10 +1102,12 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_reset_inner_mac_header(skb);
 	skb_reset_inner_transport_header(skb);
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+	local_bh_disable();
 	udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
 			    fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
 			    sctp_sk(sk)->udp_port, t->encap_port, false, false,
 			    0);
+	local_bh_enable();
 	return 0;
 }
 
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net 1/2] net: dsa: mxl862xx: avoid unaligned 16-bit access in api_wrap
From: David Laight @ 2026-06-25  7:44 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Paolo Abeni, netdev, linux-kernel
In-Reply-To: <20260624175239.1b97aaa6@kernel.org>

On Wed, 24 Jun 2026 17:52:39 -0700
Jakub Kicinski <kuba@kernel.org> wrote:

> On Fri, 19 Jun 2026 10:01:54 +0100 David Laight wrote:
> > > The MXL862XX_API_* macros pass the address of a stack-allocated, __packed
> > > firmware-ABI struct to mxl862xx_api_wrap() as a void *. The struct has an
> > > alignment of 1, so the compiler is free to place it at an odd address.
> > > 
> > > mxl862xx_api_wrap() reinterprets that buffer as a __le16 * and accesses it
> > > with data[i], for which the compiler assumes the natural 2-byte alignment
> > > of __le16 and emits aligned 16-bit loads/stores (e.g. lhu/sh on MIPS).
> > > When the buffer lands on an odd address these fault on architectures that
> > > do not support unaligned access, such as MIPS32.    
> > 
> > Isn't the correct fix to not pack the structure?
> > (or probably any of the associated structures??)  
> 
> Agreed, this is very silly:
> 
> struct mxl862xx_register_mod {
> 	__le16 addr;
> 	__le16 data;
> 	__le16 mask;
> } __packed;
> 
> But some structs won't get aligned:
> 
> struct mxl862xx_mac_table_clear {
> 	u8 type;
> 	u8 port_id;
> } __packed;

Does that one need an aligned(2) ?

> So I guess the "just don't pack" will have some corner cases, too.

The main problem is the original 32bit arm abi which 32bit aligns
all structures.
But that is pretty much dead and would want a packed_if_arm_oabi
define.
Unlikely to be relevant for this code.

	David


^ permalink raw reply

* [PATCH net v5] net: dsa: Fix skb ownership in taggers
From: Linus Walleij @ 2026-06-25  7:47 UTC (permalink / raw)
  To: Andrew Lunn, Vladimir Oltean, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Florian Fainelli,
	Jonas Gorski, Hauke Mehrtens, Kurt Kanzenbach, Woojung Huh,
	UNGLinuxDriver, Chester A. Unal, Daniel Golle, Matthias Brugger,
	AngeloGioacchino Del Regno, Wei Fang, Clark Wang,
	Clément Léger, George McCollister, David Yang
  Cc: netdev, Sashiko AI Review, Linus Walleij

The tag_8021q.c tagger calls vlan_insert_tag() in dsa_8021q_xmit().
vlan_insert_tag() will consume the skb with kfree_skb() on failure
and return NULL.

When ->xmit() returns NULL to dsa_user_xmit() to signal an error,
dsa_user_xmit() frees the same skb again, leading to a double free.

The idea of dsa_user_xmit() and dsa_switch_rcv() dropping the skb
they held before the call to ->xmit() and ->rcv() is conceptually
wrong: the pattern elsewhere in the networking code is that consumers
drop their skbs on failure.

Modify the ->xmit() and ->rcv() call sites to not drop the SKB if
the taggers return NULL from any of these calls. Move those drops into
the taggers so every callback error path that retains ownership consumes
the skb before returning NULL.

Keep the existing helper ownership rules: VLAN insertion helpers already
free on failure (this is the case in tag_8021q.c), while deferred
transmit paths either transfer the skb reference to worker context or
hold a worker reference with skb_get() and drop the caller's reference.

For SJA1105 meta RX, transfer the buffered stampable skb under the meta
lock and return NULL while the skb is waiting for its meta frame: the
skb is not dropped in this case.

NOTICE: Backporting patches to taggers (e.g. for stable kernels) after
this point cannot be mechanical or they will introduce double
kfree_skb().

Reported-by: Sashiko AI Review <sashiko-bot@kernel.org>
Closes: https://lore.kernel.org/r/20260610153952.1685895-1-kuba@kernel.org/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Assisted-by: Codex:gpt-5-5
Acked-by: David Yang <mmyangfl@gmail.com> # yt921x
Acked-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek
Reviewed-by: Wei Fang <wei.fang@nxp.com> # netc
Signed-off-by: Linus Walleij <linusw@kernel.org>
---
Changes in v5:
- Fix another potential double-free in dsa_switch_rcv(), which called
  dsa_software_vlan_untag() and still freed the skb when that returned
  NULL. Sashiko duly noted that the only path in
  dsa_software_vlan_untag() that returns NULL is a skb_vlan_untag()
  failure, which already drops the skb.
- Link to v4: https://patch.msgid.link/20260622-dsa-fix-free-skb-v4-1-7aea01bf4036@kernel.org

Changes in v4:
- Add a kfree_skb() on the else{} path of if (likely(skb->dev)) {} after
  skb->dev = dsa_conduit_find_user(dev, 0, port);
  Doing this explicitly rather than keeping the old code is more readable.
- Tag for net now that net-next is closed.
- Link to v3: https://patch.msgid.link/20260617-dsa-fix-free-skb-v3-1-cdd4e0778a39@kernel.org

Changes in v3:
- Simplify __skb_put_padto(skb, ETH_ZLEN, false) and
  skb_put_padto(skb, ETH_ZLEN) to eth_skb_pad().
- Pick up Wei's review tag.
- Link to v2: https://patch.msgid.link/20260616-dsa-fix-free-skb-v2-1-9dbda6a19e97@kernel.org

Changes in v2:
- In some instances __skb_pad() and __skb_put_padto() followed by a
  kfree_skb() could be simplified to just call skb_pad() and
  skb_put_padto() which will free the skb on failure.
- Use a label and goto for the kfree_skb(); return NULL; in
  the netc_rcv() callback in tag_netc.c as requested.
- Collect ACKs.
- Retag for net-next.
- Link to v1: https://patch.msgid.link/20260616-dsa-fix-free-skb-v1-1-fd30b35dcf66@kernel.org
---
 net/dsa/tag.c               | 12 ++++++------
 net/dsa/tag_ar9331.c        | 10 ++++++++--
 net/dsa/tag_brcm.c          | 39 ++++++++++++++++++++++++---------------
 net/dsa/tag_dsa.c           | 15 ++++++++++++---
 net/dsa/tag_gswip.c         |  8 ++++++--
 net/dsa/tag_hellcreek.c     |  9 +++++++--
 net/dsa/tag_ksz.c           | 44 +++++++++++++++++++++++++++++++-------------
 net/dsa/tag_lan9303.c       |  2 ++
 net/dsa/tag_mtk.c           |  8 ++++++--
 net/dsa/tag_mxl-gsw1xx.c    |  3 +++
 net/dsa/tag_mxl862xx.c      |  3 +++
 net/dsa/tag_netc.c          | 18 ++++++++++--------
 net/dsa/tag_ocelot.c        |  4 +++-
 net/dsa/tag_ocelot_8021q.c  | 20 +++++++++++++-------
 net/dsa/tag_qca.c           | 14 +++++++++++---
 net/dsa/tag_rtl4_a.c        |  8 ++++++--
 net/dsa/tag_rtl8_4.c        | 24 ++++++++++++++++++------
 net/dsa/tag_rzn1_a5psw.c    |  8 ++++++--
 net/dsa/tag_sja1105.c       | 42 +++++++++++++++++++++++++++---------------
 net/dsa/tag_trailer.c       | 16 ++++++++++++----
 net/dsa/tag_vsc73xx_8021q.c |  1 +
 net/dsa/tag_xrs700x.c       | 12 +++++++++---
 net/dsa/tag_yt921x.c        |  7 ++++++-
 net/dsa/user.c              |  7 +++----
 24 files changed, 233 insertions(+), 101 deletions(-)

diff --git a/net/dsa/tag.c b/net/dsa/tag.c
index 79ad105902d9..991732d6eae2 100644
--- a/net/dsa/tag.c
+++ b/net/dsa/tag.c
@@ -79,15 +79,16 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (likely(skb->dev)) {
 			dsa_default_offload_fwd_mark(skb);
 			nskb = skb;
+		} else {
+			/* Just drop the skb if we can't find the user */
+			kfree_skb(skb);
 		}
 	} else {
 		nskb = cpu_dp->rcv(skb, dev);
 	}
 
-	if (!nskb) {
-		kfree_skb(skb);
+	if (!nskb)
 		return 0;
-	}
 
 	skb = nskb;
 	skb_push(skb, ETH_HLEN);
@@ -107,11 +108,10 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (unlikely(cpu_dp->ds->untag_bridge_pvid ||
 		     cpu_dp->ds->untag_vlan_aware_bridge_pvid)) {
+		/* dsa_software_vlan_untag() drops skb on failure */
 		nskb = dsa_software_vlan_untag(skb);
-		if (!nskb) {
-			kfree_skb(skb);
+		if (!nskb)
 			return 0;
-		}
 		skb = nskb;
 	}
 
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index cbb588ca73aa..2e2388143b02 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -51,8 +51,10 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	u8 ver, port;
 	u16 hdr;
 
-	if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb));
 
@@ -60,12 +62,14 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	if (unlikely(ver != AR9331_HDR_VERSION)) {
 		netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n",
 				 __func__, __LINE__, hdr);
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	if (unlikely(hdr & AR9331_HDR_FROM_CPU)) {
 		netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n",
 				 __func__, __LINE__, hdr);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -75,8 +79,10 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
 
 	skb->dev = dsa_conduit_find_user(ndev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cf9420439054..411e3b57d16a 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -102,9 +102,9 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_user_xmit() free the SKB
+	 * Free the SKB on error.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN))
 		return NULL;
 
 	skb_push(skb, BRCM_TAG_LEN);
@@ -151,27 +151,35 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
 	int source_port;
 	u8 *brcm_tag;
 
-	if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	brcm_tag = skb->data - offset;
 
 	/* The opcode should never be different than 0b000 */
-	if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK))
+	if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* We should never see a reserved reason code without knowing how to
 	 * handle it
 	 */
-	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD))
+	if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove Broadcom tag and update checksum */
 	skb_pull_rcsum(skb, BRCM_TAG_LEN);
@@ -228,8 +236,10 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 	__be16 *proto;
 	u8 *brcm_tag;
 
-	if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	brcm_tag = dsa_etype_header_pos_rx(skb);
 	proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN);
@@ -237,8 +247,10 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 	source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* The internal switch in BCM63XX SoCs always tags on egress on the CPU
 	 * port. We use VID 0 internally for untagged traffic, so strip the tag
@@ -273,10 +285,8 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 	 * need to make sure that packets are at least 70 bytes
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
-	 *
-	 * Let dsa_user_xmit() free the SKB
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN))
 		return NULL;
 
 	skb_push(skb, BRCM_LEG_TAG_LEN);
@@ -325,10 +335,8 @@ static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
 	 * need to make sure that packets are at least 70 bytes (including FCS
 	 * and tag) because the length verification is done after the Broadcom
 	 * tag is stripped off the ingress packet.
-	 *
-	 * Let dsa_user_xmit() free the SKB.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+	if (skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN))
 		return NULL;
 
 	fcs_len = skb->len;
@@ -351,8 +359,9 @@ static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
 	brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
 
 	/* Original FCS value */
-	if (__skb_pad(skb, ETH_FCS_LEN, false))
+	if (skb_pad(skb, ETH_FCS_LEN))
 		return NULL;
+
 	skb_put_data(skb, &fcs_val, ETH_FCS_LEN);
 
 	return skb;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 2a2c4fb61a65..d5ffee35fbb5 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -224,6 +224,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 			/* Remote management is not implemented yet,
 			 * drop.
 			 */
+			kfree_skb(skb);
 			return NULL;
 		case DSA_CODE_ARP_MIRROR:
 		case DSA_CODE_POLICY_MIRROR:
@@ -244,12 +245,14 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 			/* Reserved code, this could be anything. Drop
 			 * seems like the safest option.
 			 */
+			kfree_skb(skb);
 			return NULL;
 		}
 
 		break;
 
 	default:
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -271,8 +274,10 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 						 source_port);
 	}
 
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* When using LAG offload, skb->dev is not a DSA user interface,
 	 * so we cannot call dsa_default_offload_fwd_mark and we need to
@@ -335,8 +340,10 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return dsa_rcv_ll(skb, dev, 0);
 }
@@ -375,8 +382,10 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN);
 
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index 5fa436121087..5c407d448c9f 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -80,16 +80,20 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
 	int port;
 	u8 *gswip_tag;
 
-	if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+	if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	gswip_tag = skb->data - ETH_HLEN;
 
 	/* Get source port information */
 	port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* remove GSWIP tag */
 	skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index 544ab15685a2..dd9f328f3182 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -27,8 +27,10 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
 	 * checksums after the switch strips the tag.
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
-	    skb_checksum_help(skb))
+	    skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	tag  = skb_put(skb, HELLCREEK_TAG_LEN);
@@ -47,11 +49,14 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_warn_once(dev, "Failed to get source port: %d\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
-	if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN))
+	if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index d2475c3bbb7d..67fa89f102e0 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -88,11 +88,15 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 				      unsigned int port, unsigned int len)
 {
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - len))
+	if (pskb_trim_rcsum(skb, skb->len - len)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
@@ -123,8 +127,10 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ethhdr *hdr;
 	u8 *tag;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
@@ -141,8 +147,10 @@ static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
 {
 	u8 *tag;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
 
@@ -255,22 +263,24 @@ static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
 	xmit_work_fn = tagger_data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
 	xmit_work->skb = skb_get(skb);
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -284,8 +294,10 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 	__be16 *tag;
 	u16 val;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	ksz_xmit_timestamp(dp, skb);
@@ -310,8 +322,10 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
 	unsigned int port;
 	u8 *tag;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag decoding */
 	tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
@@ -352,8 +366,10 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
 	struct ethhdr *hdr;
 	u8 *tag;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Tag encoding */
 	ksz_xmit_timestamp(dp, skb);
@@ -418,8 +434,10 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
 	__be16 *tag;
 	u16 val;
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	ksz_xmit_timestamp(dp, skb);
 
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 258e5d7dc5ef..d1194696499a 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -85,6 +85,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) {
 		dev_warn_ratelimited(&dev->dev,
 				     "Dropping packet, cannot pull\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -102,6 +103,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index dea3eecaf093..c7dc7731675e 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -72,8 +72,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	int port;
 	__be16 *phdr;
 
-	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	phdr = dsa_etype_header_pos_rx(skb);
 	hdr = ntohs(*phdr);
@@ -87,8 +89,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c
index 60f7c445e656..4b1b6ef94196 100644
--- a/net/dsa/tag_mxl-gsw1xx.c
+++ b/net/dsa/tag_mxl-gsw1xx.c
@@ -73,6 +73,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 
 	if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -81,6 +82,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 	if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n");
 		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -90,6 +92,7 @@ static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
 		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_mxl862xx.c b/net/dsa/tag_mxl862xx.c
index 8daefeb8d49d..87b80ddf0946 100644
--- a/net/dsa/tag_mxl862xx.c
+++ b/net/dsa/tag_mxl862xx.c
@@ -64,6 +64,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 
 	if (unlikely(!pskb_may_pull(skb, MXL862_HEADER_LEN))) {
 		dev_warn_ratelimited(&dev->dev, "Cannot pull SKB, packet dropped\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -73,6 +74,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 		dev_warn_ratelimited(&dev->dev,
 				     "Invalid special tag marker, packet dropped, tag: %8ph\n",
 				     mxl862_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -83,6 +85,7 @@ static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
 		dev_warn_ratelimited(&dev->dev,
 				     "Invalid source port, packet dropped, tag: %8ph\n",
 				     mxl862_tag);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_netc.c b/net/dsa/tag_netc.c
index ccedfe3a80b6..df72a61796ad 100644
--- a/net/dsa/tag_netc.c
+++ b/net/dsa/tag_netc.c
@@ -131,14 +131,13 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	int type, subtype;
 
 	if (unlikely(!pskb_may_pull(skb, NETC_TAG_MAX_LEN)))
-		return NULL;
+		goto err_free_skb;
 
 	tag_cmn = dsa_etype_header_pos_rx(skb);
 	if (ntohs(tag_cmn->tpid) != ETH_P_NXP_NETC) {
 		dev_warn_ratelimited(&ndev->dev, "Unknown TPID 0x%04x\n",
 				     ntohs(tag_cmn->tpid));
-
-		return NULL;
+		goto err_free_skb;
 	}
 
 	if (tag_cmn->qos & NETC_TAG_QV)
@@ -149,14 +148,13 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	if (!sw_id) {
 		dev_warn_ratelimited(&ndev->dev,
 				     "VEPA switch ID is not supported yet\n");
-
-		return NULL;
+		goto err_free_skb;
 	}
 
 	port = FIELD_GET(NETC_TAG_PORT, tag_cmn->switch_port);
 	skb->dev = dsa_conduit_find_user(ndev, sw_id, port);
 	if (!skb->dev)
-		return NULL;
+		goto err_free_skb;
 
 	type = FIELD_GET(NETC_TAG_TYPE, tag_cmn->type);
 	subtype = FIELD_GET(NETC_TAG_SUBTYPE, tag_cmn->type);
@@ -165,11 +163,11 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	} else if (type == NETC_TAG_TO_HOST) {
 		/* Currently only subtype0 supported */
 		if (subtype != NETC_TAG_TH_SUBTYPE0)
-			return NULL;
+			goto err_free_skb;
 	} else {
 		dev_warn_ratelimited(&ndev->dev,
 				     "Unexpected  tag type %d\n", type);
-		return NULL;
+		goto err_free_skb;
 	}
 
 	/* Remove Switch tag from the frame */
@@ -178,6 +176,10 @@ static struct sk_buff *netc_rcv(struct sk_buff *skb,
 	dsa_strip_etype_header(skb, tag_len);
 
 	return skb;
+
+err_free_skb:
+	kfree_skb(skb);
+	return NULL;
 }
 
 static void netc_flow_dissect(const struct sk_buff *skb, __be16 *proto,
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 3405def79c2d..d208c7322cd6 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -107,14 +107,16 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	ocelot_xfh_get_rew_val(extraction, &rew_val);
 
 	skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
-	if (!skb->dev)
+	if (!skb->dev) {
 		/* The switch will reflect back some frames sent through
 		 * sockets opened on the bare DSA conduit. These will come back
 		 * with src_port equal to the index of the CPU port, for which
 		 * there is no user registered. So don't print any error
 		 * message here (ignore and drop those frames).
 		 */
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 	skb->priority = qos_class;
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index e89d9254e90a..f50f1cd83f16 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -33,30 +33,34 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 	xmit_work_fn = data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* PTP over IP packets need UDP checksumming. We may have inherited
 	 * NETIF_F_HW_CSUM from the DSA conduit, but these packets are not sent
 	 * through the DSA conduit, so calculate the checksum here.
 	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Calls felix_port_deferred_xmit in felix.c */
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
 	xmit_work->skb = skb_get(skb);
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -84,8 +88,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	dsa_8021q_rcv(skb, &src_port, &switch_id, NULL, NULL);
 
 	skb->dev = dsa_conduit_find_user(netdev, switch_id, src_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	dsa_default_offload_fwd_mark(skb);
 
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 9e3b429e8b36..510792fbfa92 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -46,16 +46,20 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	tagger_data = ds->tagger_data;
 
-	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	phdr = dsa_etype_header_pos_rx(skb);
 	hdr = ntohs(*phdr);
 
 	/* Make sure the version is correct */
 	ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
-	if (unlikely(ver != QCA_HDR_VERSION))
+	if (unlikely(ver != QCA_HDR_VERSION)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Get pk type */
 	pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
@@ -64,6 +68,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
 		if (likely(tagger_data->rw_reg_ack_handler))
 			tagger_data->rw_reg_ack_handler(ds, skb);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -71,6 +76,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
 		if (likely(tagger_data->mib_autocast_handler))
 			tagger_data->mib_autocast_handler(ds, skb);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -78,8 +84,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove QCA tag and recalculate checksum */
 	skb_pull_rcsum(skb, QCA_HDR_LEN);
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index 3cc63eacfa03..590ea3b921c9 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -41,7 +41,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
 	u16 out;
 
 	/* Pad out to at least 60 bytes */
-	if (unlikely(__skb_put_padto(skb, ETH_ZLEN, false)))
+	if (unlikely(eth_skb_pad(skb)))
 		return NULL;
 
 	netdev_dbg(dev, "add realtek tag to package to port %d\n",
@@ -75,8 +75,10 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	u8 prot;
 	u8 port;
 
-	if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN)))
+	if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = dsa_etype_header_pos_rx(skb);
 	p = (__be16 *)tag;
@@ -92,6 +94,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f;
 	if (prot != RTL4_A_PROTOCOL_RTL8366RB) {
 		netdev_err(dev, "unknown realtek protocol 0x%01x\n", prot);
+		kfree_skb(skb);
 		return NULL;
 	}
 	port = protport & 0xff;
@@ -99,6 +102,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_dbg(dev, "could not find user for port %d\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
index 852c6b88079a..4da3beebef75 100644
--- a/net/dsa/tag_rtl8_4.c
+++ b/net/dsa/tag_rtl8_4.c
@@ -143,8 +143,10 @@ static struct sk_buff *rtl8_4t_tag_xmit(struct sk_buff *skb,
 	/* Calculate the checksum here if not done yet as trailing tags will
 	 * break either software or hardware based checksum
 	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	rtl8_4_write_tag(skb, dev, skb_put(skb, RTL8_4_TAG_LEN));
 
@@ -201,11 +203,15 @@ static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
 static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb))))
+	if (unlikely(rtl8_4_read_tag(skb, dev, dsa_etype_header_pos_rx(skb)))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Remove tag and recalculate checksum */
 	skb_pull_rcsum(skb, RTL8_4_TAG_LEN);
@@ -218,14 +224,20 @@ static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
 static struct sk_buff *rtl8_4t_tag_rcv(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN)))
+	if (unlikely(rtl8_4_read_tag(skb, dev, skb_tail_pointer(skb) - RTL8_4_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN))
+	if (pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
index 10994b3470f6..734910156dc3 100644
--- a/net/dsa/tag_rzn1_a5psw.c
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -48,7 +48,7 @@ static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *de
 	 * least 60 bytes otherwise they will be discarded when they enter the
 	 * switch port logic.
 	 */
-	if (__skb_put_padto(skb, ETH_ZLEN, false))
+	if (eth_skb_pad(skb))
 		return NULL;
 
 	/* provide 'A5PSW_TAG_LEN' bytes additional space */
@@ -77,6 +77,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 	if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
 		dev_warn_ratelimited(&dev->dev,
 				     "Dropping packet, cannot pull\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -84,14 +85,17 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 
 	if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
 
 	skb->dev = dsa_conduit_find_user(dev, 0, port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb_pull_rcsum(skb, A5PSW_TAG_LEN);
 	dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index de6d4ce8668b..bfe1f746f55b 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -149,19 +149,20 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 	xmit_work_fn = tagger_data->xmit_work_fn;
 	xmit_worker = priv->xmit_worker;
 
-	if (!xmit_work_fn || !xmit_worker)
+	if (!xmit_work_fn || !xmit_worker) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC);
-	if (!xmit_work)
+	if (!xmit_work) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_user_xmit
-	 * won't really free the packet.
-	 */
 	xmit_work->dp = dp;
-	xmit_work->skb = skb_get(skb);
+	xmit_work->skb = skb;
 
 	kthread_queue_work(xmit_worker, &xmit_work->work);
 
@@ -401,10 +402,7 @@ static struct sk_buff
 			kfree_skb(priv->stampable_skb);
 		}
 
-		/* Hold a reference to avoid dsa_switch_rcv
-		 * from freeing the skb.
-		 */
-		priv->stampable_skb = skb_get(skb);
+		priv->stampable_skb = skb;
 		spin_unlock(&priv->meta_lock);
 
 		/* Tell DSA we got nothing */
@@ -436,6 +434,7 @@ static struct sk_buff
 			dev_err_ratelimited(ds->dev,
 					    "Unexpected meta frame\n");
 			spin_unlock(&priv->meta_lock);
+			kfree_skb(skb);
 			return NULL;
 		}
 
@@ -443,6 +442,7 @@ static struct sk_buff
 			dev_err_ratelimited(ds->dev,
 					    "Meta frame on wrong port\n");
 			spin_unlock(&priv->meta_lock);
+			kfree_skb(skb);
 			return NULL;
 		}
 
@@ -501,18 +501,21 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	/* Normal data plane traffic and link-local frames are tagged with
 	 * a tag_8021q VLAN which we have to strip
 	 */
-	if (sja1105_skb_has_tag_8021q(skb))
+	if (sja1105_skb_has_tag_8021q(skb)) {
 		dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
-	else if (source_port == -1 && switch_id == -1)
+	} else if (source_port == -1 && switch_id == -1) {
 		/* Packets with no source information have no chance of
 		 * getting accepted, drop them straight away.
 		 */
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
 					   vid, vbid);
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -539,12 +542,15 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	if (!ds) {
 		net_err_ratelimited("%s: cannot find switch id %d\n",
 				    conduit->name, switch_id);
+		kfree_skb(skb);
 		return NULL;
 	}
 
 	tagger_data = sja1105_tagger_data(ds);
-	if (!tagger_data->meta_tstamp_handler)
+	if (!tagger_data->meta_tstamp_handler) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	for (i = 0; i <= n_ts; i++) {
 		u8 ts_id, source_port, dir;
@@ -562,6 +568,7 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	}
 
 	/* Discard the meta frame, we've consumed the timestamps it contained */
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -572,8 +579,10 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 {
 	u16 rx_header;
 
-	if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN)))
+	if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly
 	 * what we need because the caller has checked the EtherType (which is
@@ -609,8 +618,10 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 		 * padding and trailer we need to account for the fact that
 		 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
 		 */
-		if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN))
+		if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN)) {
+			kfree_skb(skb);
 			return NULL;
+		}
 	/* Trap-to-host frame, no timestamp trailer */
 	} else {
 		*source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
@@ -653,6 +664,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 4dce24cfe6a7..49c802c10ca6 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -30,22 +30,30 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
 	u8 *trailer;
 	int source_port;
 
-	if (skb_linearize(skb))
+	if (skb_linearize(skb)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	trailer = skb_tail_pointer(skb) - 4;
 	if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
-	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)
+	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	source_port = trailer[1] & 7;
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - 4))
+	if (pskb_trim_rcsum(skb, skb->len - 4)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	return skb;
 }
diff --git a/net/dsa/tag_vsc73xx_8021q.c b/net/dsa/tag_vsc73xx_8021q.c
index af121a9aff7f..f4736a1a7a0f 100644
--- a/net/dsa/tag_vsc73xx_8021q.c
+++ b/net/dsa/tag_vsc73xx_8021q.c
@@ -44,6 +44,7 @@ vsc73xx_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (!skb->dev) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Couldn't decode source port\n");
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index a05219f702c6..bb268020ee86 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -30,15 +30,21 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = ffs((int)trailer[0]) - 1;
 
-	if (source_port < 0)
+	if (source_port < 0) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
-	if (!skb->dev)
+	if (!skb->dev) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
-	if (pskb_trim_rcsum(skb, skb->len - 1))
+	if (pskb_trim_rcsum(skb, skb->len - 1)) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	/* Frame is forwarded by hardware, don't forward in software. */
 	dsa_default_offload_fwd_mark(skb);
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
index f3ced99b1c85..294784ab6694 100644
--- a/net/dsa/tag_yt921x.c
+++ b/net/dsa/tag_yt921x.c
@@ -87,8 +87,10 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	__be16 *tag;
 	u16 rx;
 
-	if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
+	if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN))) {
+		kfree_skb(skb);
 		return NULL;
+	}
 
 	tag = dsa_etype_header_pos_rx(skb);
 
@@ -96,6 +98,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 		dev_warn_ratelimited(&netdev->dev,
 				     "Unexpected EtherType 0x%04x\n",
 				     ntohs(tag[0]));
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -104,6 +107,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Unexpected rx tag 0x%04x\n", rx);
+		kfree_skb(skb);
 		return NULL;
 	}
 
@@ -112,6 +116,7 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
 	if (unlikely(!skb->dev)) {
 		dev_warn_ratelimited(&netdev->dev,
 				     "Couldn't decode source port %u\n", port);
+		kfree_skb(skb);
 		return NULL;
 	}
 
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 8704c1a3a5b7..072fa76972cc 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -935,13 +935,12 @@ static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
 		eth_skb_pad(skb);
 
 	/* Transmit function may have to reallocate the original SKB,
-	 * in which case it must have freed it. Only free it here on error.
+	 * in which case it must have freed it. Taggers will drop the
+	 * passed skb on error.
 	 */
 	nskb = p->xmit(skb, dev);
-	if (!nskb) {
-		kfree_skb(skb);
+	if (!nskb)
 		return NETDEV_TX_OK;
-	}
 
 	return dsa_enqueue_skb(nskb, dev);
 }

---
base-commit: f34c6b3a3c3d98f34918e1d2ea846a5acccac6d1
change-id: 20260616-dsa-fix-free-skb-bb028ce90802

Best regards,
--  
Linus Walleij <linusw@kernel.org>


^ permalink raw reply related

* Re: [PATCH iwl v3] ice: retry reading NVM if admin queue returns EBUSY
From: Przemek Kitszel @ 2026-06-25  7:53 UTC (permalink / raw)
  To: Robert Malz, Simon Horman, Grzegorz Nitka
  Cc: anthony.l.nguyen, intel-wired-lan, netdev
In-Reply-To: <CADcc-bydFL4KNDQEznStE41NFXuCey9S+kyXg0usbonwyWpiAQ@mail.gmail.com>

>> This is an AI-generated review of your patch. The human sending this


>> [Severity: Low]
>> Does this check allow an extra retry execution? Since retry_cnt starts at 0,
>> using a strictly greater-than check allows the loop to retry 4 times
>> (5 total executions) instead of the 3 executions intended by
>> ICE_SQ_SEND_MAX_EXECUTE.
>>
>>> +                             break;
>>> +                     ice_debug(hw, ICE_DBG_NVM,
>>> +                               "NVM read EBUSY error, retry %d\n",
>>> +                               retry_cnt + 1);
>>> +                     last_cmd = false;
>>> +                     ice_release_nvm(hw);
>>> +                     msleep(ICE_SQ_SEND_DELAY_TIME_MS);
>>
> The review is correct that this allows one extra execution. However, I
> intentionally kept this verbatim from the OOT driver (1.15.4) to

This OOT change was made entirely by Grzegorz, for E825C. Would be great 
to acknowledge his authorship (+ SoB ofc, below).
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>

What was the platform that you have encountered the bug on?

> retain the significant test exposure it has there. The absolute worst
> case is one harmless extra retry before giving up. If you strongly
> prefer strict adherence to ICE_SQ_SEND_MAX_EXECUTE, I can update the
> condition in a v4.

Please update (to have one attempt less).

> 
>> [Severity: High]
>> Does dropping the NVM lock here violate the caller's assumption of mutual
>> exclusion?
>>
>> Callers of ice_read_flat_nvm() typically acquire the NVM lock beforehand to
>> ensure an atomic read of multiple sectors without interleaved writes. By
>> explicitly dropping the lock during the msleep() window, another thread, VF,
>> or PF could acquire the NVM lock and modify the NVM before the read resumes,
>> potentially resulting in reading torn or inconsistent NVM data.
>>
>>> +                     status = ice_acquire_nvm(hw, ICE_RES_READ);
>>> +                     if (status)
>>> +                             break;
>>
> 
> I don't think this is an issue. The NVM resource (req_res 0x0008) is
> an ownership lease, not a content snapshot. The FW grants it with a
> hold timeout (see the timeout field handling in ice_aq_req_res). It
> never guaranteed a point-in-time view of the flash.
> Very similar behavior already exists in ice_devlink_nvm_snapshot(),
> which reads the NVM in chunks via ice_read_flat_nvm() and deliberately
> acquires/releases the lock between chunks (with a comment explaining
> the FW reclaim).
> Additionally, the lock is only dropped on the failure path, where
> neither bytes_read nor offset have been advanced. The driver simply
> re-reads the failed chunk after re-acquiring. So no data that is
> returned to the caller is committed across the drop window.
> Overall the lock drop does not weaken any guarantee callers actually
> have. If we wanted to introduce a true snapshot guarantee, that would
> probably require different approach and separate patch.
> 
> 
>> [Severity: High]
>> Can this lead to an unbalanced NVM resource lock release and potential data
>> corruption?
>>
>> If ice_acquire_nvm() fails here, the loop breaks and returns an error with the
>> lock currently unheld. However, callers like ice_read_flash_module() are
>> unaware of this internal lock state change and will unconditionally call
>> ice_release_nvm() upon failure:
>>
>> drivers/net/ethernet/intel/ice/ice_nvm.c:ice_read_flash_module() {
>>          status = ice_acquire_nvm(hw, ICE_RES_READ);
>>          if (status)
>>                  return status;
>>
>>          status = ice_read_flat_nvm(hw, start + offset, &length, data, false);
>>
>>          ice_release_nvm(hw);
>>      ...
>>
>> Because firmware tracks resource locks per-PF rather than per-OS-thread, this
>> errant second release could forcefully strip the lock from another thread on
>> the same PF that successfully acquired it, exposing the NVM to concurrent
>> modification.
>>
> 
> Agreed, this might be a real bug, and the one of the three I think is
> worth investigating.
> If ice_acquire_nvm() fails after the drop, ice_read_flat_nvm() returns
> with the lock unheld while callers unconditionally call
> ice_release_nvm(), so a stray release is issued.
> 
> On probability, though, the window is very small. Reaching it requires
> sustained EBUSY across the retry budget plus a failed re-acquire
> (which itself polls up to ICE_NVM_TIMEOUT), and concurrently another
> requester taking the lock. Most reads happen during init (ice_probe,
> and reset/rebuild via ice_init_nvm), and NVM writes only happen on an
> already initialized driver. The devlink/ethtool nvm_read paths are
> also exposed, but hitting this race would require precise timing
> against a concurrent NVM owner on the device.
> 
> I'd prefer to keep the scope of this patch limited to the EBUSY retry
> path and not take on the unbalanced-release fix here. A proper fix
> should change the lock-ownership contract of ice_read_flat_nvm() (on
> error, the lock must be released by ice_read_flat_nvm(), callers
> release only on success) and update all callers. Code change should be
> simple for all callers but ice_discover_flash_size(), it intentionally
> holds one lease across a read loop and would need to re-acquire after
> each expected boundary failure.
> 
> Given how small the original window is, I'd rather not trade tested
> OOT behavior for the risk of a complex unbalanced NVM lock fix. I
> actually have a patch mostly ready that fixes the lock-ownership
> contract, but I really don't like it. It changes the design of
> ice_read_flat_nvm(), making it less intuitive for callers. More
> importantly, I just don't have the resources or test coverage right
> now to properly verify such change.
> 
> However, I can modify the failure path for ice_acquire_nvm inside
> ice_read_flat_nvm. Instead of bailing out immediately, we can just
> retry it within the existing retry budget. In this case, the
> probability of leaving ice_read_flat_nvm without holding the lock is
> reduced even further without needing a refactor.
> 
> Please let me know what you think about my thought process on this.

I think that both AI-reported issues against the lock are valid
concerns.

I think that the sleep was the actual fix, and the re-locking was merely
a necessity due to the lock expiring (as you said).

A proper fix would be to just increase lock-timeout to accommodate all
attempts (and still do the retries&sleep, but without unlocking).


> 
> 
> 
>>> +                     retry_cnt++;
>>> +             } else {
>>> +                     bytes_read += read_size;
>>> +                     offset += read_size;
>>> +                     retry_cnt = 0;
>>> +             }
>>>        } while (!last_cmd);
>>>
>>>        *length = bytes_read;
> 
> Thanks,
> Robert


^ permalink raw reply

* Re: [net] ipv6: honor per-interface proxy_ndp in forward and NA paths
From: Ido Schimmel @ 2026-06-25  7:53 UTC (permalink / raw)
  To: Chenguang Zhao
  Cc: David Ahern, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, netdev
In-Reply-To: <20260623085600.396401-1-zhaochenguang@kylinos.cn>

Subject prefix is incorrect. See:

https://docs.kernel.org/process/maintainer-netdev.html

On Tue, Jun 23, 2026 at 04:56:00PM +0800, Chenguang Zhao wrote:
> ndisc_recv_ns() has always checked both devconf_all and idev->cnf for
> proxy_ndp, but ip6_forward() and ndisc_recv_na() only looked at the
> global setting.
> 
> Honor per-interface proxy_ndp in both places to match the NS path and
> allow setups that only enable proxy_ndp on specific interfaces.
> 
> Fixes: fbea49e1e240 ("[IPV6] NDISC: Add proxy_ndp sysctl.")
> Signed-off-by: Chenguang Zhao <zhaochenguang@kylinos.cn>

Given that this never worked and that the patch changes a 20-year-old
user-visible behavior, I prefer that you target it at net-next (without
the Fixes tag) when it opens next week.

Also, did you look into why these "XXX" comments were added in the
original commit from 2006? I *assume* that it's because back then both
ndisc_recv_na() and ip6_forward() were missing an idev, unlike
ndisc_recv_ns().

> ---
>  net/ipv6/ip6_output.c | 4 ++--
>  net/ipv6/ndisc.c      | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 368e4fa3b43c..c4ca4a813479 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -579,8 +579,8 @@ int ip6_forward(struct sk_buff *skb)
>  		return -ETIMEDOUT;
>  	}
>  
> -	/* XXX: idev->cnf.proxy_ndp? */
> -	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
> +	if ((READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
> +	     (idev && READ_ONCE(idev->cnf.proxy_ndp))) &&
>  	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {

Note that idev doesn't necessarily correspond to the device with which
the neighbour lookup is performed (skb->dev). See 0857d6f8c759d.

vrf_ip6_rcv() does not modify skb->dev for neighbour discovery packets,
so this happens to be OK in this case, but you need to explain this in
the commit message.

>  		int proxied = ip6_forward_proxy_check(skb);
>  
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> index f867ec8d3d90..e03e94681738 100644
> --- a/net/ipv6/ndisc.c
> +++ b/net/ipv6/ndisc.c
> @@ -1096,9 +1096,9 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
>  		 */
>  		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
>  		    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
> -		    READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
> +		    (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
> +		     (idev && READ_ONCE(idev->cnf.proxy_ndp))) &&
>  		    pneigh_lookup(&nd_tbl, net, &msg->target, dev)) {
> -			/* XXX: idev->cnf.proxy_ndp */
>  			goto out;
>  		}
>  
> -- 
> 2.25.1
> 

^ permalink raw reply

* RE: [PATCH] ice: propagate ETH56G deskew read errors
From: Jagielski, Jedrzej @ 2026-06-25  7:55 UTC (permalink / raw)
  To: Pengpeng Hou, Nguyen, Anthony L, Kitszel, Przemyslaw
  Cc: Andrew Lunn, davem@davemloft.net, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Richard Cochran, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260625030305.85304-1-pengpeng@iscas.ac.cn>

From: Pengpeng Hou <pengpeng@iscas.ac.cn> 
Sent: Thursday, June 25, 2026 5:03 AM

>ice_ptp_calc_deskew_eth56g() returns a u32 deskew value, but it also
>returns the negative read_poll_timeout() error when the DESKEW valid bit
>never appears. That converts the negative error into a large unsigned
>deskew contribution, which can then be folded into the RX timestamp
>offset and programmed into hardware.
>
>Return the deskew value through an output parameter and propagate the
>read error from ice_phy_set_offsets_eth56g() instead of using it as
>offset data.

Hi,
this looks like a fix, so please add a Fixes: tag.

>
>Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
>---
> drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 27 +++++++++++++++------
> 1 file changed, 19 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
>index 8e5f97835954..bd2e31b816a8 100644
>--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
>+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
>@@ -1736,17 +1736,21 @@ static u32 ice_ptp_calc_bitslip_eth56g(struct ice_hw *hw, u8 port, u32 bs,
>  * @ds: deskew multiplier
>  * @rs: RS-FEC enabled
>  * @spd: link speed
>+ * @deskew: calculated deskew value
>  *
>- * Return: calculated deskew value
>+ * Return: 0 on success, negative error code otherwise

Please state clearly that @deskew is also an output.

>  */
>-static u32 ice_ptp_calc_deskew_eth56g(struct ice_hw *hw, u8 port, u32 ds,
>-				      bool rs, enum ice_eth56g_link_spd spd)
>+static int ice_ptp_calc_deskew_eth56g(struct ice_hw *hw, u8 port, u32 ds,
>+				      bool rs, enum ice_eth56g_link_spd spd,
>+				      u32 *deskew)
> {
> 	u32 deskew_i, deskew_f;
> 	int err;
> 
>-	if (!ds)
>+	if (!ds) {
>+		*deskew = 0;
> 		return 0;
>+	}
> 
> 	read_poll_timeout(ice_read_ptp_reg_eth56g, err,
> 			  FIELD_GET(PHY_REG_DESKEW_0_VALID, deskew_i), 500,
>@@ -1766,7 +1770,9 @@ static u32 ice_ptp_calc_deskew_eth56g(struct ice_hw *hw, u8 port, u32 ds,
> 	deskew_i = FIELD_PREP(ICE_ETH56G_MAC_CFG_RX_OFFSET_INT, deskew_i);
> 	/* Shift 3 fractional bits to the end of the integer part */
> 	deskew_f <<= ICE_ETH56G_MAC_CFG_FRAC_W - PHY_REG_DESKEW_0_RLEVEL_FRAC_W;
>-	return mul_u32_u32_fx_q9(deskew_i | deskew_f, ds);
>+	*deskew = mul_u32_u32_fx_q9(deskew_i | deskew_f, ds);
>+
>+	return 0;
> }
> 
> /**
>@@ -1789,6 +1795,7 @@ static int ice_phy_set_offsets_eth56g(struct ice_hw *hw, u8 port,
> {
> 	u32 rx_offset, tx_offset, bs_ds;
> 	bool onestep, sfd;
>+	int err;
> 
> 	onestep = hw->ptp.phy.eth56g.onestep_ena;
> 	sfd = hw->ptp.phy.eth56g.sfd_ena;
>@@ -1805,11 +1812,15 @@ static int ice_phy_set_offsets_eth56g(struct ice_hw *hw, u8 port,
> 	if (sfd)
> 		rx_offset = add_u32_u32_fx(rx_offset, cfg->rx_offset.sfd);
> 
>-	if (spd < ICE_ETH56G_LNK_SPD_40G)
>+	if (spd < ICE_ETH56G_LNK_SPD_40G) {
> 		bs_ds = ice_ptp_calc_bitslip_eth56g(hw, port, bs_ds, fc, rs,
> 						    spd);
>-	else
>-		bs_ds = ice_ptp_calc_deskew_eth56g(hw, port, bs_ds, rs, spd);
>+	} else {
>+		err = ice_ptp_calc_deskew_eth56g(hw, port, bs_ds, rs, spd,
>+						 &bs_ds);
>+		if (err)
>+			return err;
>+	}
> 	rx_offset = add_u32_u32_fx(rx_offset, bs_ds);
> 	rx_offset &= ICE_ETH56G_MAC_CFG_RX_OFFSET_INT |
> 		     ICE_ETH56G_MAC_CFG_RX_OFFSET_FRAC;
>-- 
>2.50.1 (Apple Git-155)



^ permalink raw reply

* [PATCH 6.12 1/2] net: ipv6: Make udp_tunnel6_xmit_skb() void
From: Alexander Martyniuk @ 2026-06-25  8:24 UTC (permalink / raw)
  To: stable, gregkh
  Cc: Alexander Martyniuk, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, David Ahern,
	Marcelo Ricardo Leitner, Xin Long, Jon Maloy, Ying Xue, netdev,
	linux-kernel, linux-sctp, tipc-discussion, Petr Machata,
	Ido Schimmel, Nikolay Aleksandrov
In-Reply-To: <20260625082442.96390-1-alexevgmart@gmail.com>

From: Petr Machata <petrm@nvidia.com>

commit 6a7d88ca15f73c5c570c372238f71d63da1fda55 upstream.

The function always returns zero, thus the return value does not carry any
signal. Just make it void.

Most callers already ignore the return value. However:

- Refold arguments of the call from sctp_v6_xmit() so that they fit into
  the 80-column limit.

- tipc_udp_xmit() initializes err from the return value, but that should
  already be always zero at that point. So there's no practical change, but
  elision of the assignment prompts a couple more tweaks to clean up the
  function.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/7facacf9d8ca3ca9391a4aee88160913671b868d.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexander Martyniuk <alexevgmart@gmail.com>
---
 include/net/udp_tunnel.h  | 14 +++++++-------
 net/ipv6/ip6_udp_tunnel.c | 15 +++++++--------
 net/sctp/ipv6.c           |  7 ++++---
 net/tipc/udp_media.c      | 10 +++++-----
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 6e2c5c77031f..8ed36ec520d7 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -152,13 +152,13 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
 			 __be16 df, __be16 src_port, __be16 dst_port,
 			 bool xnet, bool nocheck);
 
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
-			 struct sk_buff *skb,
-			 struct net_device *dev,
-			 const struct in6_addr *saddr,
-			 const struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be32 label,
-			 __be16 src_port, __be16 dst_port, bool nocheck);
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb,
+			  struct net_device *dev,
+			  const struct in6_addr *saddr,
+			  const struct in6_addr *daddr,
+			  __u8 prio, __u8 ttl, __be32 label,
+			  __be16 src_port, __be16 dst_port, bool nocheck);
 
 void udp_tunnel_sock_release(struct socket *sock);
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 2acf1bb93fc0..f22eff2ba77c 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -74,13 +74,13 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 }
 EXPORT_SYMBOL_GPL(udp_sock_create6);
 
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
-			 struct sk_buff *skb,
-			 struct net_device *dev,
-			 const struct in6_addr *saddr,
-			 const struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be32 label,
-			 __be16 src_port, __be16 dst_port, bool nocheck)
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb,
+			  struct net_device *dev,
+			  const struct in6_addr *saddr,
+			  const struct in6_addr *daddr,
+			  __u8 prio, __u8 ttl, __be32 label,
+			  __be16 src_port, __be16 dst_port, bool nocheck)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
@@ -109,7 +109,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	ip6h->saddr	  = *saddr;
 
 	ip6tunnel_xmit(sk, skb, dev);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 9f835e674c59..b4c321bad033 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -261,9 +261,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
 	label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);
 
-	return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr,
-				    &fl6->daddr, tclass, ip6_dst_hoplimit(dst),
-				    label, sctp_sk(sk)->udp_port, t->encap_port, false);
+	udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
+			     tclass, ip6_dst_hoplimit(dst), label,
+			     sctp_sk(sk)->udp_port, t->encap_port, false);
+	return 0;
 }
 
 /* Returns the dst cache entry for the given source and destination ip
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 258d6aa4f21a..1b8d6bbf8a8e 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -172,7 +172,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 			 struct udp_media_addr *dst, struct dst_cache *cache)
 {
 	struct dst_entry *ndst;
-	int ttl, err = 0;
+	int ttl, err;
 
 	local_bh_disable();
 	ndst = dst_cache_get(cache);
@@ -217,13 +217,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 			dst_cache_set_ip6(cache, ndst, &fl6.saddr);
 		}
 		ttl = ip6_dst_hoplimit(ndst);
-		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
-					   &src->ipv6, &dst->ipv6, 0, ttl, 0,
-					   src->port, dst->port, false);
+		udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
+				     &src->ipv6, &dst->ipv6, 0, ttl, 0,
+				     src->port, dst->port, false);
 #endif
 	}
 	local_bh_enable();
-	return err;
+	return 0;
 
 tx_error:
 	local_bh_enable();
-- 
2.43.0


^ permalink raw reply related

* [PATCH 6.12 2/2] sctp: disable BH before calling udp_tunnel_xmit_skb()
From: Alexander Martyniuk @ 2026-06-25  8:24 UTC (permalink / raw)
  To: stable, gregkh
  Cc: Alexander Martyniuk, Marcelo Ricardo Leitner, Xin Long,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Weiming Shi, linux-sctp, netdev, linux-kernel
In-Reply-To: <20260625082442.96390-1-alexevgmart@gmail.com>

From: Xin Long <lucien.xin@gmail.com>

commit 2cd7e6971fc2787408ceef17906ea152791448cf upstream.

udp_tunnel_xmit_skb() / udp_tunnel6_xmit_skb() are expected to run with
BH disabled.  After commit 6f1a9140ecda ("add xmit recursion limit to
tunnel xmit functions"), on the path:

  udp(6)_tunnel_xmit_skb() -> ip(6)tunnel_xmit()

dev_xmit_recursion_inc()/dec() must stay balanced on the same CPU.

Without local_bh_disable(), the context may move between CPUs, which can
break the inc/dec pairing. This may lead to incorrect recursion level
detection and cause packets to be dropped in ip(6)_tunnel_xmit() or
__dev_queue_xmit().

Fix it by disabling BH around both IPv4 and IPv6 SCTP UDP xmit paths.

In my testing, after enabling the SCTP over UDP:

  # ip net exec ha sysctl -w net.sctp.udp_port=9899
  # ip net exec ha sysctl -w net.sctp.encap_port=9899
  # ip net exec hb sysctl -w net.sctp.udp_port=9899
  # ip net exec hb sysctl -w net.sctp.encap_port=9899

  # ip net exec ha iperf3 -s

- without this patch:

  # ip net exec hb iperf3 -c 192.168.0.1 --sctp
  [  5]   0.00-10.00  sec  37.2 MBytes  31.2 Mbits/sec  sender
  [  5]   0.00-10.00  sec  37.1 MBytes  31.1 Mbits/sec  receiver

- with this patch:

  # ip net exec hb iperf3 -c 192.168.0.1 --sctp
  [  5]   0.00-10.00  sec  3.14 GBytes  2.69 Gbits/sec  sender
  [  5]   0.00-10.00  sec  3.14 GBytes  2.69 Gbits/sec  receiver

Fixes: 6f1a9140ecda ("net: add xmit recursion limit to tunnel xmit functions")
Fixes: 046c052b475e ("sctp: enable udp tunneling socks")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Link: https://patch.msgid.link/c874a8548221dcd56ff03c65ba75a74e6cf99119.1776017727.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexander Martyniuk <alexevgmart@gmail.com>
---
 net/sctp/ipv6.c     | 2 ++
 net/sctp/protocol.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index b4c321bad033..b45cc51dfc35 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -261,9 +261,11 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
 	label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);
 
+	local_bh_disable();
 	udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
 			     tclass, ip6_dst_hoplimit(dst), label,
 			     sctp_sk(sk)->udp_port, t->encap_port, false);
+	local_bh_enable();
 	return 0;
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 39ca5403d4d7..6ea15361088b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1086,9 +1086,11 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_reset_inner_mac_header(skb);
 	skb_reset_inner_transport_header(skb);
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+	local_bh_disable();
 	udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
 			    fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
 			    sctp_sk(sk)->udp_port, t->encap_port, false, false);
+	local_bh_enable();
 	return 0;
 }
 
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net] net/smc: avoid recursive sk_callback_lock in listen data_ready
From: Sidraya Jayagond @ 2026-06-25  8:32 UTC (permalink / raw)
  To: Runyu Xiao, D. Wythe, Dust Li, Wenjia Zhang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Mahanta Jambigi, Tony Lu, Wen Gu, Simon Horman, Karsten Graul,
	linux-rdma, linux-s390, netdev, linux-kernel, jianhao.xu, stable
In-Reply-To: <20260617152855.1039151-1-runyu.xiao@seu.edu.cn>



On 17/06/26 8:58 pm, Runyu Xiao wrote:
> smc_listen() installs smc_clcsock_data_ready() as the underlying TCP
> listen socket's sk_data_ready callback.  smc_clcsock_data_ready() then
> immediately takes sk_callback_lock before looking up the SMC listener and
> queuing smc_tcp_listen_work().
> 
> That is unsafe once the TCP listen socket is leaving TCP_LISTEN.  The TCP
> close/flush path can run the installed sk_data_ready callback with
> sk_callback_lock already held, so entering smc_clcsock_data_ready() again
> tries to take the same rwlock recursively in the same thread.  The nvmet
> TCP listener had to make the same state check before taking
> sk_callback_lock for this reason.
> 
> This issue was found by our static analysis tool and then manually
> reviewed against the current tree.
> 
> The grounded PoC kept the SMC listen callback installation path:
> 
>   smc_listen()
>   smc_clcsock_replace_cb()
>   sk_data_ready = smc_clcsock_data_ready()
> 
> It then modeled the close/flush carrier that invokes the installed
> sk_data_ready callback while sk_callback_lock is already held.  Lockdep
> reported the same-thread recursive acquisition:
> 
>   WARNING: possible recursive locking detected
>   smc_clcsock_data_ready+0xa/0x4d [vuln_msv]
>   smc_close_flush_work+0x1f/0x30 [vuln_msv]
>   *** DEADLOCK ***
> 
> Return before taking sk_callback_lock when the underlying TCP socket is no
> longer in TCP_LISTEN.  In that state there is no listen accept work to
> queue for SMC, and avoiding the callback lock mirrors the fix used by the
> TCP nvmet listener.
> 
> Fixes: 0558226cebee ("net/smc: Fix slab-out-of-bounds issue in fallback")
> Cc: stable@vger.kernel.org
> Signed-off-by: Runyu Xiao <runyu.xiao@seu.edu.cn>
> ---
>  net/smc/af_smc.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 6421c2e1c84d..1af4e3c333ff 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -2631,6 +2631,9 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
>  {
>  	struct smc_sock *lsmc;
>  
> +	if (READ_ONCE(listen_clcsock->sk_state) != TCP_LISTEN)
> +		return;
> +

In smc_close_active(), the TCP socket remains in TCP_LISTEN state while
holding write_lock_bh(&smc->clcsock->sk->sk_callback_lock);. The patch's
state check would pass during this window, not preventing the recursive
lock scenario.
It's unclear whether it fully prevents the recursive locking scenario
described in the commit message for the specific code path in
smc_close_active().
Could you come up with the exact deadlock scenario and how the patch
addresses it?

>  	read_lock_bh(&listen_clcsock->sk_callback_lock);
>  	lsmc = smc_clcsock_user_data(listen_clcsock);
>  	if (!lsmc)


^ permalink raw reply

* [PATCH v2 0/7] vmsplice: fix some problems in my previous vmsplice patchset
From: Askar Safin @ 2026-06-25  8:34 UTC (permalink / raw)
  To: linux-fsdevel, Christian Brauner, Alexander Viro, Jan Kara
  Cc: linux-kernel, linux-mm, linux-api, netdev, fuse-devel,
	Linus Torvalds, Matthew Wilcox, Jens Axboe, Christoph Hellwig,
	David Howells, Andrew Morton, David Hildenbrand, Pedro Falcato,
	Miklos Szeredi, Andy Lutomirski, Collin Funk, David Laight,
	Stefan Metzmacher, The 8472, Willy Tarreau, Joanne Koong,
	Val Packett, Andrei Vagin, patches

This patchset is for VFS. Of course, it depends on my previous vmsplice
patchset ( https://lore.kernel.org/all/20260531010107.1953702-1-safinaskar@gmail.com/ ).

I fix some problems in my previous patchset.

1. Fix problem with CLASS(fd, f)(fd). See first patch in this patchset
for details. This is probably not so important, but I fix it anyway.

2. Change "unsigned long" back to "int". See second patch for details.
Again, this is probably not important, but I want to fix this anyway.

3. Fix that LTP vmsplice01 bug.

4. libfuse relies on sharing vmsplice behavior. So we detect particular
combination of flags to pipe2(2) and vmsplice(2) and return -EINVAL.
This forces libfuse to fall back to the non-vmsplice code path.
I. e. we fix libfuse-related regression [1].
I ran a Debian Code Search for the regex "vmsplice.*SPLICE_F_NONBLOCK" and
I found no other packages with this particular combination of flags
except for fuse itself. (Okay, other packages are fio and stress-ng,
but these are merely testers.) So, I think this is okay to return
EINVAL here, breakage will be minimal.

5. Set FMODE_NOWAIT for named FIFOs. CRIU relies on ability to do
vmsplice(SPLICE_F_NONBLOCK) on named FIFOs. So, I fix this CRIU-related
regression [2]. But there is another CRIU-related regression, which I do not
fix [3]: CRIU behavior in splice mode becomes so slow that splice mode
becomes useless. I personally still believe that removing vmsplice is
the right thing to do. Another option is doing nothing. Yet another option
is to implement some deprecation period [3]. Let other developers
decide.

See patches for details.

Please, run that LTP vmsplice01 test again.

Notes:

- I want to repeat: I change behavior around SPLICE_F_NONBLOCK.
Previously, vmsplice ignored whether pipe itself was opened as
non-blocking file. Now it is not ignored. And in my opinion
new behavior is better.
- vmsplice(2) now is in fs/read_write.c . It is very similar to
preadv2 and pwritev2 now, so I think it belongs to fs/read_write.c now.

Please review this patchset carefully. I'm still a new contributor.
In particular, please, review that do-while loop, I'm not sure I did
everything right.

Tested in Qemu.

[1] https://lore.kernel.org/all/CAJnrk1Y9egYizkx1H9K0cqxSYuB+7vLvQbV7Tf4C5eHFqnnC-A@mail.gmail.com/
[2] https://lore.kernel.org/all/CANaxB-zK5q=Xw6UZTmeFtXsDZjUsPkFk=p485m-wtNTBnf4hgg@mail.gmail.com/
[3] https://lore.kernel.org/all/CANaxB-xUrLQYGiRJZc4Boi+KX=0TJSWymErNovANVko20fMDVA@mail.gmail.com/

v1: https://lore.kernel.org/lkml/20260606061031.3744880-1-safinaskar@gmail.com/

Changes since v1: fix fuse-related and CRIU-related regressions (see above).

Askar Safin (7):
  vmsplice: open-code do_writev and do_readv
  vmsplice: change argument type back to "int"
  splice: turn wait_for_space flags argument into bool
  pipe: move wait_for_space to fs/pipe.c and rename it
  vmsplice: make sure we don't wait after writing some data
  vmsplice: return -EINVAL for particular combination of flags
  pipe: set FMODE_NOWAIT for named FIFOs

 fs/pipe.c                 | 23 +++++++++++++
 fs/read_write.c           | 71 +++++++++++++++++++++++++++++++++++----
 fs/splice.c               | 19 +----------
 include/linux/pipe_fs_i.h |  2 ++
 include/linux/syscalls.h  |  2 +-
 5 files changed, 91 insertions(+), 26 deletions(-)


base-commit: 8d86fcfc2857d64af85f5c87c193c25655c970af
-- 
2.47.3


^ permalink raw reply

* [PATCH v2 1/7] vmsplice: open-code do_writev and do_readv
From: Askar Safin @ 2026-06-25  8:34 UTC (permalink / raw)
  To: linux-fsdevel, Christian Brauner, Alexander Viro, Jan Kara
  Cc: linux-kernel, linux-mm, linux-api, netdev, fuse-devel,
	Linus Torvalds, Matthew Wilcox, Jens Axboe, Christoph Hellwig,
	David Howells, Andrew Morton, David Hildenbrand, Pedro Falcato,
	Miklos Szeredi, Andy Lutomirski, Collin Funk, David Laight,
	Stefan Metzmacher, The 8472, Willy Tarreau, Joanne Koong,
	Val Packett, Andrei Vagin, patches
In-Reply-To: <20260625083409.3769242-1-safinaskar@gmail.com>

My previous vmsplice patch made the following mistake: I did
"CLASS(fd, f)(fd)", then did some checks on resulting "struct file",
then passed numeric (!) file descriptor to a function.

This is somewhat okay in this particular case, but I still think
this is a code smell, so I fix it by open-coding do_writev and do_readv.

Also I insert a comment to warn other developers to keep
do_writev and do_readv in sync with vmsplice(2).

Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
 fs/read_write.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 1e5444f4d..e224e7cb8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1070,6 +1070,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 			unsigned long vlen, rwf_t flags)
 {
+	/* All future changes to this function should be kept in sync with vmsplice(2). */
 	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
@@ -1093,6 +1094,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 			 unsigned long vlen, rwf_t flags)
 {
+	/* All future changes to this function should be kept in sync with vmsplice(2). */
 	CLASS(fd_pos, f)(fd);
 	ssize_t ret = -EBADF;
 
@@ -1226,14 +1228,24 @@ SYSCALL_DEFINE4(vmsplice, unsigned long, fd, const struct iovec __user *, vec,
 	if (fd_empty(f))
 		return -EBADF;
 
-	/* We do do_writev/do_readv, so it is okay to pass "false" here */
+	/* We do vfs_writev/vfs_readv, so it is okay to pass "false" here */
 	if (!get_pipe_info(fd_file(f), /* for_splice = */ false))
 		return -EBADF;
 
-	if (fd_file(f)->f_mode & FMODE_WRITE)
-		return do_writev(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
-	else
-		return do_readv(fd, vec, vlen, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+	if (fd_file(f)->f_mode & FMODE_WRITE) {
+		ssize_t ret = vfs_writev(fd_file(f), vec, vlen, NULL, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+		if (ret > 0)
+			add_wchar(current, ret);
+		inc_syscw(current);
+		return ret;
+	} else {
+		ssize_t ret = vfs_readv(fd_file(f), vec, vlen, NULL, (flags & SPLICE_F_NONBLOCK) ? RWF_NOWAIT : 0);
+
+		if (ret > 0)
+			add_rchar(current, ret);
+		inc_syscr(current);
+		return ret;
+	}
 }
 
 /*
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 2/7] vmsplice: change argument type back to "int"
From: Askar Safin @ 2026-06-25  8:34 UTC (permalink / raw)
  To: linux-fsdevel, Christian Brauner, Alexander Viro, Jan Kara
  Cc: linux-kernel, linux-mm, linux-api, netdev, fuse-devel,
	Linus Torvalds, Matthew Wilcox, Jens Axboe, Christoph Hellwig,
	David Howells, Andrew Morton, David Hildenbrand, Pedro Falcato,
	Miklos Szeredi, Andy Lutomirski, Collin Funk, David Laight,
	Stefan Metzmacher, The 8472, Willy Tarreau, Joanne Koong,
	Val Packett, Andrei Vagin, patches
In-Reply-To: <20260625083409.3769242-1-safinaskar@gmail.com>

My previous vmsplice patchset changed the type of vmsplice's "fd"
argument from "int" to "unsigned long". This may cause problems, so
let's change it back.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
 fs/read_write.c          | 2 +-
 include/linux/syscalls.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index e224e7cb8..77487b307 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1218,7 +1218,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
 /*
  * Legacy preadv2/pwritev2 wrapper.
  */
-SYSCALL_DEFINE4(vmsplice, unsigned long, fd, const struct iovec __user *, vec,
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, vec,
 		unsigned long, vlen, unsigned int, flags)
 {
 	if (unlikely(flags & ~SPLICE_F_ALL))
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a86a88207..46a3ec954 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -514,7 +514,7 @@ asmlinkage long sys_ppoll_time32(struct pollfd __user *, unsigned int,
 			  struct old_timespec32 __user *, const sigset_t __user *,
 			  size_t);
 asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
-asmlinkage long sys_vmsplice(unsigned long fd, const struct iovec __user *vec,
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *vec,
 			     unsigned long vlen, unsigned int flags);
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 3/7] splice: turn wait_for_space flags argument into bool
From: Askar Safin @ 2026-06-25  8:34 UTC (permalink / raw)
  To: linux-fsdevel, Christian Brauner, Alexander Viro, Jan Kara
  Cc: linux-kernel, linux-mm, linux-api, netdev, fuse-devel,
	Linus Torvalds, Matthew Wilcox, Jens Axboe, Christoph Hellwig,
	David Howells, Andrew Morton, David Hildenbrand, Pedro Falcato,
	Miklos Szeredi, Andy Lutomirski, Collin Funk, David Laight,
	Stefan Metzmacher, The 8472, Willy Tarreau, Joanne Koong,
	Val Packett, Andrei Vagin, patches
In-Reply-To: <20260625083409.3769242-1-safinaskar@gmail.com>

I want to do this because the next patch moves this function to fs/pipe.c.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
 fs/splice.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 6ddf7dd72..707db2c2c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1239,7 +1239,7 @@ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
 }
 EXPORT_SYMBOL(splice_file_range);
 
-static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
+static int wait_for_space(struct pipe_inode_info *pipe, bool non_block)
 {
 	for (;;) {
 		if (unlikely(!pipe->readers)) {
@@ -1248,7 +1248,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 		}
 		if (!pipe_is_full(pipe))
 			return 0;
-		if (flags & SPLICE_F_NONBLOCK)
+		if (non_block)
 			return -EAGAIN;
 		if (signal_pending(current))
 			return -ERESTARTSYS;
@@ -1268,7 +1268,7 @@ ssize_t splice_file_to_pipe(struct file *in,
 	ssize_t ret;
 
 	pipe_lock(opipe);
-	ret = wait_for_space(opipe, flags);
+	ret = wait_for_space(opipe, flags & SPLICE_F_NONBLOCK);
 	if (!ret)
 		ret = do_splice_read(in, offset, opipe, len, flags);
 	pipe_unlock(opipe);
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 4/7] pipe: move wait_for_space to fs/pipe.c and rename it
From: Askar Safin @ 2026-06-25  8:34 UTC (permalink / raw)
  To: linux-fsdevel, Christian Brauner, Alexander Viro, Jan Kara
  Cc: linux-kernel, linux-mm, linux-api, netdev, fuse-devel,
	Linus Torvalds, Matthew Wilcox, Jens Axboe, Christoph Hellwig,
	David Howells, Andrew Morton, David Hildenbrand, Pedro Falcato,
	Miklos Szeredi, Andy Lutomirski, Collin Funk, David Laight,
	Stefan Metzmacher, The 8472, Willy Tarreau, Joanne Koong,
	Val Packett, Andrei Vagin, patches
In-Reply-To: <20260625083409.3769242-1-safinaskar@gmail.com>

This is needed because I plan to use the function in fs/read_write.c.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
---
 fs/pipe.c                 | 17 +++++++++++++++++
 fs/splice.c               | 19 +------------------
 include/linux/pipe_fs_i.h |  2 ++
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 9841648c9..c0ccf21b9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1451,6 +1451,23 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 	return ret;
 }
 
+int pipe_wait_for_space(struct pipe_inode_info *pipe, bool non_block)
+{
+	for (;;) {
+		if (unlikely(!pipe->readers)) {
+			send_sig(SIGPIPE, current, 0);
+			return -EPIPE;
+		}
+		if (!pipe_is_full(pipe))
+			return 0;
+		if (non_block)
+			return -EAGAIN;
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+		pipe_wait_writable(pipe);
+	}
+}
+
 static const struct super_operations pipefs_ops = {
 	.destroy_inode = free_inode_nonrcu,
 	.statfs = simple_statfs,
diff --git a/fs/splice.c b/fs/splice.c
index 707db2c2c..d12243d19 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1239,23 +1239,6 @@ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
 }
 EXPORT_SYMBOL(splice_file_range);
 
-static int wait_for_space(struct pipe_inode_info *pipe, bool non_block)
-{
-	for (;;) {
-		if (unlikely(!pipe->readers)) {
-			send_sig(SIGPIPE, current, 0);
-			return -EPIPE;
-		}
-		if (!pipe_is_full(pipe))
-			return 0;
-		if (non_block)
-			return -EAGAIN;
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-		pipe_wait_writable(pipe);
-	}
-}
-
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       struct pipe_inode_info *opipe,
 			       size_t len, unsigned int flags);
@@ -1268,7 +1251,7 @@ ssize_t splice_file_to_pipe(struct file *in,
 	ssize_t ret;
 
 	pipe_lock(opipe);
-	ret = wait_for_space(opipe, flags & SPLICE_F_NONBLOCK);
+	ret = pipe_wait_for_space(opipe, flags & SPLICE_F_NONBLOCK);
 	if (!ret)
 		ret = do_splice_read(in, offset, opipe, len, flags);
 	pipe_unlock(opipe);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index a1eeed800..be653625d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -335,4 +335,6 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
 int create_pipe_files(struct file **, int);
 unsigned int round_pipe_size(unsigned int size);
 
+int pipe_wait_for_space(struct pipe_inode_info *pipe, bool non_block);
+
 #endif
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox