Netdev List
 help / color / mirror / Atom feed
* [PATCH v4 2/3] net: ethtool: add KSZ87xx low-loss cable PHY tunables
From: Fidelio Lawson @ 2026-04-17 12:44 UTC (permalink / raw)
  To: Woojung Huh, UNGLinuxDriver, Andrew Lunn, Vladimir Oltean,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Marek Vasut, Maxime Chevallier, Simon Horman, Heiner Kallweit,
	Russell King
  Cc: Woojung Huh, netdev, linux-kernel, Fidelio Lawson
In-Reply-To: <20260417-ksz87xx_errata_low_loss_connections-v4-0-6c7044ec4363@exotec.com>

Introduce vendor-specific PHY tunable identifiers to control the
KSZ87xx low-loss cable erratum handling through the ethtool PHY
tunable interface.

The following tunables are added:

- a boolean "short-cable" tunable, applying a documented and
  conservative preset intended for short or low-loss Ethernet cables;

- an integer LPF bandwidth tunable, allowing advanced adjustment of the
  receiver low-pass filter bandwidth;

- an integer DSP EQ initial value tunable, allowing advanced tuning of
  the PHY equalizer initialization.

The actual behavior is implemented by the corresponding PHY and switch
drivers.

Signed-off-by: Fidelio Lawson <fidelio.lawson@exotec.com>
---
 include/uapi/linux/ethtool.h | 3 +++
 net/ethtool/common.c         | 3 +++
 net/ethtool/ioctl.c          | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index b74b80508553..081d8f2191b6 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -291,6 +291,9 @@ enum phy_tunable_id {
 	ETHTOOL_PHY_DOWNSHIFT,
 	ETHTOOL_PHY_FAST_LINK_DOWN,
 	ETHTOOL_PHY_EDPD,
+	ETHTOOL_PHY_SHORT_CABLE_PRESET,
+	ETHTOOL_PHY_LPF_BW,
+	ETHTOOL_PHY_DSP_EQ_INIT_VALUE,
 	/*
 	 * Add your fresh new phy tunable attribute above and remember to update
 	 * phy_tunable_strings[] in net/ethtool/common.c
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index e252cf20c22f..9c2fe5b626d6 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -101,6 +101,9 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_PHY_DOWNSHIFT]	= "phy-downshift",
 	[ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
 	[ETHTOOL_PHY_EDPD]	= "phy-energy-detect-power-down",
+	[ETHTOOL_PHY_SHORT_CABLE_PRESET] = "phy-short-cable-preset",
+	[ETHTOOL_PHY_LPF_BW]	= "phy-lpf-bandwidth",
+	[ETHTOOL_PHY_DSP_EQ_INIT_VALUE] = "phy-dsp-eq-init-value",
 };
 
 #define __LINK_MODE_NAME(speed, type, duplex) \
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index ff4b4780d6af..5b66e4a96f67 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -3109,6 +3109,9 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
 	switch (tuna->id) {
 	case ETHTOOL_PHY_DOWNSHIFT:
 	case ETHTOOL_PHY_FAST_LINK_DOWN:
+	case ETHTOOL_PHY_SHORT_CABLE_PRESET:
+	case ETHTOOL_PHY_LPF_BW:
+	case ETHTOOL_PHY_DSP_EQ_INIT_VALUE:
 		if (tuna->len != sizeof(u8) ||
 		    tuna->type_id != ETHTOOL_TUNABLE_U8)
 			return -EINVAL;

-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 1/3] net: dsa: microchip: implement KSZ87xx Module 3 low-loss cable errata
From: Fidelio Lawson @ 2026-04-17 12:44 UTC (permalink / raw)
  To: Woojung Huh, UNGLinuxDriver, Andrew Lunn, Vladimir Oltean,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Marek Vasut, Maxime Chevallier, Simon Horman, Heiner Kallweit,
	Russell King
  Cc: Woojung Huh, netdev, linux-kernel, Fidelio Lawson
In-Reply-To: <20260417-ksz87xx_errata_low_loss_connections-v4-0-6c7044ec4363@exotec.com>

Implement the "Module 3: Equalizer fix for short cables" erratum from
Microchip document DS80000687C for KSZ87xx switches.

The issue affects short or low-loss cable links (e.g. CAT5e/CAT6),
where the PHY receiver equalizer may amplify high-amplitude signals
excessively, resulting in internal distortion and link establishment
failures.

KSZ87xx devices require a workaround for the Module 3 low-loss cable
condition, controlled through the switch TABLE_LINK_MD_V indirect
registers.

This change models the erratum handling as vendor-specific Clause 22 PHY
registers, virtualized by the KSZ8 DSA driver and accessed via
ksz8_r_phy() / ksz8_w_phy(). The following controls are provided:

- A boolean “short-cable” preset, which applies a documented and
  conservative configuration (LPF 62 MHz bandwidth and DSP EQ initial
  value 0), and is the recommended interface for typical use cases.

- Separate LPF bandwidth and DSP EQ initial value controls intended for
  advanced or experimental tuning. These are orthogonal and independent,
  and override the corresponding settings without requiring any specific
  ordering.

The preset and tunables act as simple setters with no implicit state
machine or invalid combinations, keeping the API predictable and aligned
with the KISS principle.

The erratum affects the shared PHY analog front-end and therefore applies
globally to the switch.

Signed-off-by: Fidelio Lawson <fidelio.lawson@exotec.com>
---
 drivers/net/dsa/microchip/ksz8.c       | 67 ++++++++++++++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz8.h       |  1 +
 drivers/net/dsa/microchip/ksz8_reg.h   | 21 ++++++++++-
 drivers/net/dsa/microchip/ksz_common.h |  4 ++
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/microchip/ksz8.c b/drivers/net/dsa/microchip/ksz8.c
index c354abdafc1b..0f2b8acee80f 100644
--- a/drivers/net/dsa/microchip/ksz8.c
+++ b/drivers/net/dsa/microchip/ksz8.c
@@ -1058,6 +1058,22 @@ int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 		if (ret)
 			return ret;
 
+		break;
+	case PHY_REG_KSZ87XX_SHORT_CABLE:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		data = !!(dev->lpf_bw == KSZ87XX_PHY_LPF_62MHZ &&
+				dev->eq_init == KSZ87XX_DSP_EQ_INIT_LOW_LOSS);
+		break;
+	case PHY_REG_KSZ87XX_LPF_BW:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		data = dev->lpf_bw;
+		break;
+	case PHY_REG_KSZ87XX_EQ_INIT:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		data = dev->eq_init;
 		break;
 	default:
 		processed = false;
@@ -1271,6 +1287,29 @@ int ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 		if (ret)
 			return ret;
 		break;
+	case PHY_REG_KSZ87XX_SHORT_CABLE:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		ret = ksz87xx_apply_low_loss_preset(dev, !!val);
+		if (ret)
+			return ret;
+		break;
+	case PHY_REG_KSZ87XX_LPF_BW:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		ret = ksz8_ind_write8(dev, TABLE_LINK_MD, KSZ87XX_REG_PHY_LPF, (u8)val);
+		if (ret)
+			return ret;
+		dev->lpf_bw = val;
+		break;
+	case PHY_REG_KSZ87XX_EQ_INIT:
+		if (!ksz_is_ksz87xx(dev))
+			return -EOPNOTSUPP;
+		ret = ksz8_ind_write8(dev, TABLE_LINK_MD, KSZ87XX_REG_DSP_EQ, (u8)val);
+		if (ret)
+			return ret;
+		dev->eq_init = val;
+		break;
 	default:
 		break;
 	}
@@ -2096,11 +2135,39 @@ int ksz8463_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 	return 0;
 }
 
+int ksz87xx_apply_low_loss_preset(struct ksz_device *dev, bool enable)
+{
+	/* Apply the Microchip erratum short-cable preset (LPF 62 MHz, EQ init 0) */
+	/* providing a conservative configuration for short or low-loss cables. */
+	u8 lpf_bw, eq_init;
+	int ret;
+
+	lpf_bw = KSZ87XX_PHY_LPF_62MHZ;
+	eq_init = KSZ87XX_DSP_EQ_INIT_LOW_LOSS;
+
+	if (!ksz_is_ksz87xx(dev))
+		return -EOPNOTSUPP;
+	if (!enable)
+		return 0;
+	ret = ksz8_ind_write8(dev, TABLE_LINK_MD, KSZ87XX_REG_PHY_LPF, lpf_bw);
+	if (ret)
+		return ret;
+	dev->lpf_bw = lpf_bw;
+	ret = ksz8_ind_write8(dev, TABLE_LINK_MD, KSZ87XX_REG_DSP_EQ, eq_init);
+	if (ret)
+		return ret;
+	dev->eq_init = eq_init;
+
+	return ret;
+}
+
 int ksz8_switch_init(struct ksz_device *dev)
 {
 	dev->cpu_port = fls(dev->info->cpu_ports) - 1;
 	dev->phy_port_cnt = dev->info->port_cnt - 1;
 	dev->port_mask = (BIT(dev->phy_port_cnt) - 1) | dev->info->cpu_ports;
+	dev->lpf_bw = KSZ87XX_PHY_LPF_90MHZ;
+	dev->eq_init = KSZ87XX_DSP_EQ_INIT_FACTORY;
 
 	return 0;
 }
diff --git a/drivers/net/dsa/microchip/ksz8.h b/drivers/net/dsa/microchip/ksz8.h
index 0f2cd1474b44..5cf7bd90af0f 100644
--- a/drivers/net/dsa/microchip/ksz8.h
+++ b/drivers/net/dsa/microchip/ksz8.h
@@ -66,5 +66,6 @@ int ksz8_all_queues_split(struct ksz_device *dev, int queues);
 u32 ksz8463_get_port_addr(int port, int offset);
 int ksz8463_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val);
 int ksz8463_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val);
+int ksz87xx_apply_low_loss_preset(struct ksz_device *dev, bool enable);
 
 #endif
diff --git a/drivers/net/dsa/microchip/ksz8_reg.h b/drivers/net/dsa/microchip/ksz8_reg.h
index 332408567b47..5df17c463f7c 100644
--- a/drivers/net/dsa/microchip/ksz8_reg.h
+++ b/drivers/net/dsa/microchip/ksz8_reg.h
@@ -202,6 +202,10 @@
 #define REG_PORT_3_STATUS_0		0x38
 #define REG_PORT_4_STATUS_0		0x48
 
+/* KSZ87xx LinkMD registers (TABLE_LINK_MD_V) */
+#define KSZ87XX_REG_DSP_EQ			0x08   /* DSP EQ initial value */
+#define KSZ87XX_REG_PHY_LPF			0x4C   /* RX LPF bandwidth */
+
 /* For KSZ8765. */
 #define PORT_REMOTE_ASYM_PAUSE		BIT(5)
 #define PORT_REMOTE_SYM_PAUSE		BIT(4)
@@ -342,7 +346,7 @@
 #define TABLE_EEE			(TABLE_EEE_V << TABLE_EXT_SELECT_S)
 #define TABLE_ACL			(TABLE_ACL_V << TABLE_EXT_SELECT_S)
 #define TABLE_PME			(TABLE_PME_V << TABLE_EXT_SELECT_S)
-#define TABLE_LINK_MD			(TABLE_LINK_MD << TABLE_EXT_SELECT_S)
+#define TABLE_LINK_MD			(TABLE_LINK_MD_V << TABLE_EXT_SELECT_S)
 #define TABLE_READ			BIT(4)
 #define TABLE_SELECT_S			2
 #define TABLE_STATIC_MAC_V		0
@@ -729,6 +733,21 @@
 #define PHY_POWER_SAVING_ENABLE		BIT(2)
 #define PHY_REMOTE_LOOPBACK		BIT(1)
 
+/* Vendor-specific Clause 22 PHY registers (virtualized) */
+#define PHY_REG_KSZ87XX_SHORT_CABLE		0x1A
+#define PHY_REG_KSZ87XX_LPF_BW			0x1B
+#define PHY_REG_KSZ87XX_EQ_INIT			0x1C
+
+/* LPF bandwidth bits [7:6]: 00 = 90MHz (default), 01 = 62MHz, 10 = 55MHz, 11 = 44MHz  */
+#define KSZ87XX_PHY_LPF_90MHZ          0x00
+#define KSZ87XX_PHY_LPF_62MHZ          0x40
+#define KSZ87XX_PHY_LPF_55MHZ          0x80
+#define KSZ87XX_PHY_LPF_44MHZ          0xC0
+
+/* Low-loss workaround DSP EQ INIT VALUE */
+#define KSZ87XX_DSP_EQ_INIT_LOW_LOSS	0x00
+#define KSZ87XX_DSP_EQ_INIT_FACTORY		0x0F
+
 /* KSZ8463 specific registers. */
 #define P1MBCR				0x4C
 #define P1MBSR				0x4E
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 929aff4c55de..482e79cf6ae6 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -219,6 +219,10 @@ struct ksz_device {
 	 * the switch’s internal PHYs, bypassing the main SPI interface.
 	 */
 	struct mii_bus *parent_mdio_bus;
+
+	/* KSZ87xx low-loss tuning state */
+	u8 lpf_bw;		/* KSZ87XX_PHY_LPF_* */
+	u8 eq_init;		/* DSP EQ initial value */
 };
 
 /* List of supported models */

-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 0/3] ksz87xx: add support for low-loss cable equalizer errata
From: Fidelio Lawson @ 2026-04-17 12:44 UTC (permalink / raw)
  To: Woojung Huh, UNGLinuxDriver, Andrew Lunn, Vladimir Oltean,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Marek Vasut, Maxime Chevallier, Simon Horman, Heiner Kallweit,
	Russell King
  Cc: Woojung Huh, netdev, linux-kernel, Fidelio Lawson

Hello,

This patch implements the “Module 3: Equalizer fix for short cables” erratum
described in Microchip document DS80000687C for KSZ87xx switches.

According to the erratum, the embedded PHY receiver in KSZ87xx switches is
tuned by default for long, high-loss Ethernet cables. When operating with
short or low-loss cables (for example CAT5e or CAT6), the PHY equalizer may
over-amplify the incoming signal, leading to internal distortion and link
establishment failures.

Microchip documents two independent mechanisms to mitigate this issue:
adjusting the receiver low‑pass filter bandwidth and reducing the DSP
equalizer initial value. These registers are located in the switch’s
internal LinkMD table and cannot be accessed directly through a
stand‑alone PHY driver.

To keep the PHY‑facing API clean, this series models the erratum handling
as vendor‑specific Clause 22 PHY registers, virtualized by the KSZ8 DSA
driver. Accesses are intercepted by ksz8_r_phy() / ksz8_w_phy() and
translated into the appropriate indirect LinkMD register writes. The
erratum affects the shared PHY analog front‑end and therefore applies
globally to the switch.

Based on review feedback, the user‑visible interface is kept deliberately
simple and predictable:

- A boolean “short‑cable” PHY tunable applies a documented and
  conservative preset (LPF bandwidth 62MHz, DSP EQ initial value 0).
  This is the recommended KISS interface for the common short‑cable
  scenario.

- Two additional integer PHY tunables allow advanced or experimental
  tuning of the LPF bandwidth and the DSP EQ initial value. These
  controls are orthogonal, have no ordering requirements, and simply
  override the corresponding setting when written.

The tunables act as simple setters with no implicit state machine or
invalid combinations, avoiding surprises for userspace and not relying
on extended error reporting or netlink ethtool support.

This series contains:

  1. Support for the KSZ87xx low‑loss cable erratum in the KSZ8 DSA driver,
     including the short‑cable preset and orthogonal tuning controls.

  2. Addition of vendor‑specific PHY tunable identifiers for the
     short‑cable preset, LPF bandwidth, and DSP EQ initial value.

  3. Exposure of these tunables through the Micrel PHY driver via
     get_tunable / set_tunable callbacks.

This version follows the design agreed upon during v3 review and
reworks the interface accordingly.

This series is based on Linux v7.0-rc1.

Signed-off-by: Fidelio Lawson <fidelio.lawson@exotec.com>
---
Changes in v4:
- Reworked the user‑visible API to a boolean short‑cable preset plus
  orthogonal advanced tunables, following the KISS principle.
- Dropped the previous mode‑selector semantics in favor of simple
  setters with no ordering requirements
- Added persistent tracking of LPF bandwidth and EQ initial value.
- Clarified defaults and preset values to match Microchip documentation.
- Link to v3: https://patch.msgid.link/20260414-ksz87xx_errata_low_loss_connections-v3-0-0e3838ca98c9@exotec.com

Changes in v3:
- Exposed all LPF bandwidth values supported by the hardware.
- Added phy tunable.
- Link to v2: https://patch.msgid.link/20260408-ksz87xx_errata_low_loss_connections-v2-1-9cfe38691713@exotec.com

Changes in v2:
- Dropped the device tree approach based on review feedback
- Modeled the errata control as a vendor-specific Clause 22 PHY register
- Added KSZ87xx-specific guards and replaced magic values with named macros
- Rebased on Linux v7.0-rc1
- Link to v1: https://patch.msgid.link/20260326-ksz87xx_errata_low_loss_connections-v1-0-79a698f43626@exotec.com

---
Fidelio Lawson (3):
      net: dsa: microchip: implement KSZ87xx Module 3 low-loss cable errata
      net: ethtool: add KSZ87xx low-loss cable PHY tunables
      net: phy: micrel: expose KSZ87xx low-loss cable tunables

 drivers/net/dsa/microchip/ksz8.c       | 67 ++++++++++++++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz8.h       |  1 +
 drivers/net/dsa/microchip/ksz8_reg.h   | 21 ++++++++++-
 drivers/net/dsa/microchip/ksz_common.h |  4 ++
 drivers/net/phy/micrel.c               | 54 +++++++++++++++++++++++++++
 include/uapi/linux/ethtool.h           |  3 ++
 net/ethtool/common.c                   |  3 ++
 net/ethtool/ioctl.c                    |  3 ++
 8 files changed, 155 insertions(+), 1 deletion(-)
---
base-commit: 2d1373e4246da3b58e1df058374ed6b101804e07
change-id: 20260323-ksz87xx_errata_low_loss_connections-b65e76e2b403

Best regards,
--  
Fidelio Lawson <fidelio.lawson@exotec.com>


^ permalink raw reply

* Re: [PATCH bpf-next v4 5/6] bpf: clear decap tunnel GSO state in skb_adjust_room
From: Hudson, Nick @ 2026-04-17 12:27 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: bpf@vger.kernel.org, netdev@vger.kernel.org, Willem de Bruijn,
	Martin KaFai Lau, Tottenham, Max, Glasgall, Anna, Daniel Borkmann,
	Alexei Starovoitov, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, linux-kernel@vger.kernel.org
In-Reply-To: <willemdebruijn.kernel.30d031033dc61@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3520 bytes --]



> On Apr 16, 2026, at 1:32 PM, Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> !-------------------------------------------------------------------|
>  This Message Is From an External Sender
>  This message came from outside your organization.
> |-------------------------------------------------------------------!
> 
> Nick Hudson wrote:
>> On shrink in bpf_skb_adjust_room(), clear tunnel-specific GSO flags
>> according to the decapsulation flags:
>> 
>> - BPF_F_ADJ_ROOM_DECAP_L4_UDP clears SKB_GSO_UDP_TUNNEL{,_CSUM}
>> - BPF_F_ADJ_ROOM_DECAP_L4_GRE clears SKB_GSO_GRE{,_CSUM}
>> - BPF_F_ADJ_ROOM_DECAP_IPXIP4 clears SKB_GSO_IPXIP4
>> - BPF_F_ADJ_ROOM_DECAP_IPXIP6 clears SKB_GSO_IPXIP6
>> 
>> When all tunnel-related GSO bits are cleared, also clear
>> skb->encapsulation.
>> 
>> Handle the ESP inside a UDP tunnel case where encapsulation should remain
>> set.
>> 
>> If UDP decap is performed, clear encap_hdr_csum and remcsum_offload.
>> 
>> Co-developed-by: Max Tottenham <mtottenh@akamai.com>
>> Signed-off-by: Max Tottenham <mtottenh@akamai.com>
>> Co-developed-by: Anna Glasgall <aglasgal@akamai.com>
>> Signed-off-by: Anna Glasgall <aglasgal@akamai.com>
>> Signed-off-by: Nick Hudson <nhudson@akamai.com>
>> ---
>> net/core/filter.c | 38 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 38 insertions(+)
>> 
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 7f8d43420afb..e113ae2f3f14 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3667,6 +3667,44 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
>> 		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
>> 			skb_increase_gso_size(shinfo, len_diff);
>> 
>> +		/* Selective GSO flag clearing based on decap type.
>> +		 * Only clear the flags for the tunnel layer being removed.
>> +		 */
>> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) &&
>> +		    (shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
>> +					 SKB_GSO_UDP_TUNNEL_CSUM)))
>> +			shinfo->gso_type &= ~(SKB_GSO_UDP_TUNNEL |
>> +					      SKB_GSO_UDP_TUNNEL_CSUM);
>> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) &&
>> +		    (shinfo->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM)))
>> +			shinfo->gso_type &= ~(SKB_GSO_GRE |
>> +					      SKB_GSO_GRE_CSUM);
>> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) &&
>> +		    (shinfo->gso_type & SKB_GSO_IPXIP4))
>> +			shinfo->gso_type &= ~SKB_GSO_IPXIP4;
>> +		if ((flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) &&
>> +		    (shinfo->gso_type & SKB_GSO_IPXIP6))
>> +			shinfo->gso_type &= ~SKB_GSO_IPXIP6;
>> +
>> +		/* Clear encapsulation flag only when no tunnel GSO flags remain */
>> +		if (flags & (BPF_F_ADJ_ROOM_DECAP_L4_MASK |
>> +			     BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK)) {
>> +			if (!(shinfo->gso_type & (SKB_GSO_UDP_TUNNEL |
>> +						  SKB_GSO_UDP_TUNNEL_CSUM |
>> +						  SKB_GSO_GRE |
>> +						  SKB_GSO_GRE_CSUM |
>> +						  SKB_GSO_IPXIP4 |
>> +						  SKB_GSO_IPXIP6 |
>> +						  SKB_GSO_ESP)))
>> +				if (skb->encapsulation)
>> +					skb->encapsulation = 0;
>> +
>> +			if (flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) {
>> +				skb->encap_hdr_csum = 0;
> 
> This field is not used with UDP_L4.
> 
> Similar to remcsum, I'd ignore it entirely in this series.

Will drop from the series. Sorry for getting confused here.

> 
>> +				skb->remcsum_offload = 0;
> 
> Why still include remote checksum handling?


Because I misunderstood your last email - will drop.

Thanks,
Nick

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 3066 bytes --]

^ permalink raw reply

* Re: Path forward for NFC in the kernel
From: David Heidelberg @ 2026-04-17 12:16 UTC (permalink / raw)
  To: Michael Walle, Krzysztof Kozlowski, Jakub Kicinski,
	Michael Thalmeier, Raymond Hackley, Bongsu Jeon, Mark Greer
  Cc: netdev
In-Reply-To: <DHVAXU7E031H.1CPZZA6ELD2DN@walle.cc>

On 17/04/2026 10:54, Michael Walle wrote:
> On Fri Apr 17, 2026 at 9:18 AM CEST, Krzysztof Kozlowski wrote:
>> Does anyone knows if the NFC stack/drivers actually works fine? Did
>> anyone test actual devices?
> 
> I was working on a product which used an NFC part from NXP. We
> started with the upstream driver and libnfc and we did some
> bugfixes, that's also probably the reason I'm in the loop here ;).
> 
> But eventually it was decided to switch to the libs provided by the
> vendor, because that at least somehow worked reliably (and you'll
> get support from the vendor).

So what we need is show vendor how wonderful Linux kernel is, thus he should 
contribute to it and then switch to it (when nVidia can understand it's 
beneficial to them, why not NXP).

David

> 
> -michael

-- 
David Heidelberg


^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH v5 net-next 0/8] dpll/ice: Add TXC DPLL type and full TX reference clock control for E825
From: Kubalewski, Arkadiusz @ 2026-04-17 12:22 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Vecera, Ivan, vadim.fedorenko@linux.dev, edumazet@google.com,
	netdev@vger.kernel.org, richardcochran@gmail.com,
	donald.hunter@gmail.com, linux-kernel@vger.kernel.org,
	davem@davemloft.net, Prathosh.Satish@microchip.com,
	andrew+netdev@lunn.ch, intel-wired-lan@lists.osuosl.org,
	horms@kernel.org, Kitszel, Przemyslaw, Nguyen, Anthony L,
	pabeni@redhat.com, jiri@resnulli.us
In-Reply-To: <20260416180447.1a3c5c87@kernel.org>

>From: Jakub Kicinski <kuba@kernel.org>
>Sent: Friday, April 17, 2026 3:05 AM
>
>On Thu, 16 Apr 2026 18:26:11 +0000 Kubalewski, Arkadiusz wrote:
>> >> This HW doesn't use EEC DPLL signal to feed MAC clock, as DPLL is
>> >> external from NIC point of view. Only 2 signals from such external
>> >> DPLL
>> >> device are used by NIC:
>> >> - synce (a single source for all those TXC per-port DPLL device)
>> >> - time_ref (a source for the TS_PLL - which drives PTP timer)
>> >
>> >No bypass? The PLL is actually in the loop? oof, this is beyond
>> >my understanding of clocks and signals :S
>>
>> TBH, I am not entirely sure what do you mean with MAC PLL into bypass
>> mode, but the HW description I have provided is still true, the MAC is
>> not fed with any DPLL provided signal here. Only port tx clocks PLLs and
>> a timer PLL can use those.
>
>The ASIC PLL IPs I managed to find had a bypass mode where the reference
>/ input frequency still goes thru the dividers but the PLL circuit is
>bypassed. I assumed that if we want to distribute a syntonized clock
>across the network we would want as few PLL circuits in the paths as
>possible and we'd use bypass (which would be relevant here since for
>the target use case we wouldn't engage the PLL of the TXC). But this
>is 100% guesswork so I'm probably speaking gibberish.
>

OK, thanks for explanation. I don't have such details about it, I have
seen only high level design drawings.

>> >> Well, 'floating' MUX type pin not connected to any dpll would require
>> >> a
>> >> lot of additional implementations, just to allow source selection, as
>> >> we
>> >> have tried it already.
>> >>
>> >> Wouldn't more generic name cause a DPLL purpose problem?
>> >
>> >The old proposal in netdev family was to to have source selection
>> >without creating a real mux. Not saying I'm dead set on that direction.
>>
>> Yes, correct, it kept the list of dpll pins valid for source selection
>> of
>> tx clock within the netdev and control over it through RT netlink.
>> That solution was rather simple but you requested to hack into dpll so
>> we
>> did here.
>>
>> IMHO this is cleanest and simplest solution we could find to keep it
>> within DPLL subsystem.
>>
>> >> We still want to make sure that given DPLL device would serve the
>> >> role
>> >> of source selection for particular port where a source pin should be
>> >> an
>> >> output either on EEC dpll or some external signal generator but
>> >> somehow
>> >> related to SyncE or similar solutions.
>> >
>> >Right, but adding a new "type" per location of the PLL (especially if
>> >we lean into covering any ASIC PLL) may not scale, and opens us up to
>> >"vendor X calls it Y" and "in design A clock is fed by pll type X and
>> >in design B by type Y".
>>
>> I was thinking that this is more like a purpose specific DPLL device, if
>> someone would want something similar we would have to review it, right?
>
>We would if it was a Ethernet MAC PLL, but if someone wanted to expose
>whether some random PLL in their ASIC locks - are we adding a new type
>for each one of those?

Yes, that was the implicit intention within those patches, if other purpose
specific PLL would have to be present for whatever HW design and user
control over it would be required, then that would be the easiest to
maintain in the long term? Multiple types and each have own function/purpose.

It would be good as long as there is one PLL for a function per board, once
there could be multiple ones for single function, we would have to add some
enumeration (labels, etc.)

>
>> >IIUC you do provide "linking" of the pins? netdev will have the MAC pin
>> >assigned. Is the pin that connects the PLLs also annotated so that user
>> >knows what's on the "other side"? Maybe the topology would be clear
>> >enough from just that, and we don't have to add a TXC type.
>> >Call the PLL "integrated" or something generic. User should be able to
>> >trace the path of the signals?
>>
>> It depends, TX clock has one of external pins connected to external
>> DPLL,
>> but second is a board-level pin with ability to provide some external
>> clock signal, the user would have to determine that purpose just based
>> on the topology of one of the pins, which seems a bit problematic?
>> I.e. if at some point there would be HW with only external non-DPLL
>> connected pins?
>
>Not sure I follow, TBH. To me the function of the "MAC PLL" is fairly
>obvious from the fact that it has a pin exposed via rtnetlink. So it's
>obviously a DPLL which can drive the Tx clock?
>

I am lost a bit now too. You mean clock recovery pin? And EEC type dpll?
In this solution the 'MAC'/EEC is external and it doesn't drive TX clocks
directly.

>It's the function / relation / linking to the EEC DPLL that may not
>be obvious. But user can see how the pins connect they can get some
>LLM to draw a diagram of a live system.. et voila :)
>

Yes, correct it would work for this particular HW, but adding a variant
without a external EEC-connected pin in the picture would be problematic
to understand 'generic' dpll purpose, pointing to the labels later.

Just to make it clear. I believe that generic type dpll could be used in
any HW and for any purpose, so after all each such usage could possibly
introduce entropy and confusion on the user side.

But if you are fine with that, then sure, we can live with generic
purpose dpll.

>> I mean 'generic' type is something we could do, but as already
>> mentioned,
>> thought that we want a DPLL types specified/designed for some particular
>> functions/tasks.
>
>I feel like we often get labels wrong the first time around, so if we
>can defer adding them until later that'd make me happy..

Sure something like it later would be required.

Thank you!
Arkadiusz

^ permalink raw reply

* Re: [PATCH net] net: dsa: mt7530: fix .get_stats64 sleeping in atomic context
From: Daniel Golle @ 2026-04-17 12:08 UTC (permalink / raw)
  To: Chester A. Unal
  Cc: Andrew Lunn, Vladimir Oltean, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Matthias Brugger,
	AngeloGioacchino Del Regno, Russell King, Christian Marangi,
	netdev, linux-kernel, linux-arm-kernel, linux-mediatek,
	Frank Wunderlich, John Crispin
In-Reply-To: <C88911FE-2012-4D29-B7F2-6BEA28122854@arinc9.com>

On Fri, Apr 17, 2026 at 07:35:46AM +0000, Chester A. Unal wrote:
> On 17 April 2026 04:55:57 WEST, Daniel Golle <daniel@makrotopia.org> wrote:
> >The .get_stats64 callback runs in atomic context, but on
> >MDIO-connected switches every register read acquires the MDIO bus
> >mutex, which can sleep:
> >[   12.645973] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:609
> >[   12.654442] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 759, name: grep
> >[   12.663377] preempt_count: 0, expected: 0
> >[   12.667410] RCU nest depth: 1, expected: 0
> >[   12.671511] INFO: lockdep is turned off.
> >[   12.675441] CPU: 0 UID: 0 PID: 759 Comm: grep Tainted: G S      W           7.0.0+ #0 PREEMPT
> >[   12.675453] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN
> >[   12.675456] Hardware name: Bananapi BPI-R64 (DT)
> >[   12.675459] Call trace:
> >[   12.675462]  show_stack+0x14/0x1c (C)
> >[   12.675477]  dump_stack_lvl+0x68/0x8c
> >[   12.675487]  dump_stack+0x14/0x1c
> >[   12.675495]  __might_resched+0x14c/0x220
> >[   12.675504]  __might_sleep+0x44/0x80
> >[   12.675511]  __mutex_lock+0x50/0xb10
> >[   12.675523]  mutex_lock_nested+0x20/0x30
> >[   12.675532]  mt7530_get_stats64+0x40/0x2ac
> >[   12.675542]  dsa_user_get_stats64+0x2c/0x40
> >[   12.675553]  dev_get_stats+0x44/0x1e0
> >[   12.675564]  dev_seq_printf_stats+0x24/0xe0
> >[   12.675575]  dev_seq_show+0x14/0x3c
> >[   12.675583]  seq_read_iter+0x37c/0x480
> >[   12.675595]  seq_read+0xd0/0xec
> >[   12.675605]  proc_reg_read+0x94/0xe4
> >[   12.675615]  vfs_read+0x98/0x29c
> >[   12.675625]  ksys_read+0x54/0xdc
> >[   12.675633]  __arm64_sys_read+0x18/0x20
> >[   12.675642]  invoke_syscall.constprop.0+0x54/0xec
> >[   12.675653]  do_el0_svc+0x3c/0xb4
> >[   12.675662]  el0_svc+0x38/0x200
> >[   12.675670]  el0t_64_sync_handler+0x98/0xdc
> >[   12.675679]  el0t_64_sync+0x158/0x15c
> >
> >For MDIO-connected switches, poll MIB counters asynchronously using a
> >delayed workqueue every second and let .get_stats64 return the cached
> >values under a per-port spinlock. A mod_delayed_work() call on each
                 ^^^^^^^^^^^^^^^^^^^
Just noticed I forgot to update the commit message when changing the
implementation to use a single shared spinlock for all ports...

I'll send v2 tomorrow fixing that, and what ever else comes up, if
anything.

> >read triggers an immediate refresh so counters stay responsive when
> >queried more frequently.
> >
> >MMIO-connected switches (MT7988, EN7581, AN7583) are not affected
> >because their regmap does not sleep, so they continue to read MIB
> >counters directly in .get_stats64.
> >
> >Fixes: 88c810f35ed5 ("net: dsa: mt7530: implement .get_stats64")
> >Signed-off-by: Daniel Golle <daniel@makrotopia.org>
> >---
> >This bug highlights a bigger problem and the actual cause:
> >Locking in the mt7530 driver deserves a cleanup, and refactoring
> >towards cleanly and directly using the regmap API.
> >I've prepared this already and am going to submit a series doing
> >most of that using Coccinelle semantic patches once net-next opens
> >again.
> 
> Acked-by: Chester A. Unal <chester.a.unal@arinc9.com>
> 
> Chester A.

^ permalink raw reply

* Re: [PATCH net] net: dsa: mt7530: fix .get_stats64 sleeping in atomic context
From: Andrew Lunn @ 2026-04-17 12:06 UTC (permalink / raw)
  To: Daniel Golle
  Cc: Chester A. Unal, Vladimir Oltean, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Matthias Brugger,
	AngeloGioacchino Del Regno, Russell King, Christian Marangi,
	netdev, linux-kernel, linux-arm-kernel, linux-mediatek,
	Frank Wunderlich, John Crispin
In-Reply-To: <79dc0ec5b6be698b14cb66339d6f63033ca2934a.1776397542.git.daniel@makrotopia.org>

On Fri, Apr 17, 2026 at 04:55:57AM +0100, Daniel Golle wrote:
> The .get_stats64 callback runs in atomic context, but on
> MDIO-connected switches every register read acquires the MDIO bus
> mutex, which can sleep:
> [   12.645973] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:609
> [   12.654442] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 759, name: grep
> [   12.663377] preempt_count: 0, expected: 0
> [   12.667410] RCU nest depth: 1, expected: 0
> [   12.671511] INFO: lockdep is turned off.
> [   12.675441] CPU: 0 UID: 0 PID: 759 Comm: grep Tainted: G S      W           7.0.0+ #0 PREEMPT
> [   12.675453] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN
> [   12.675456] Hardware name: Bananapi BPI-R64 (DT)
> [   12.675459] Call trace:
> [   12.675462]  show_stack+0x14/0x1c (C)
> [   12.675477]  dump_stack_lvl+0x68/0x8c
> [   12.675487]  dump_stack+0x14/0x1c
> [   12.675495]  __might_resched+0x14c/0x220
> [   12.675504]  __might_sleep+0x44/0x80
> [   12.675511]  __mutex_lock+0x50/0xb10
> [   12.675523]  mutex_lock_nested+0x20/0x30
> [   12.675532]  mt7530_get_stats64+0x40/0x2ac
> [   12.675542]  dsa_user_get_stats64+0x2c/0x40
> [   12.675553]  dev_get_stats+0x44/0x1e0
> [   12.675564]  dev_seq_printf_stats+0x24/0xe0
> [   12.675575]  dev_seq_show+0x14/0x3c
> [   12.675583]  seq_read_iter+0x37c/0x480
> [   12.675595]  seq_read+0xd0/0xec
> [   12.675605]  proc_reg_read+0x94/0xe4
> [   12.675615]  vfs_read+0x98/0x29c
> [   12.675625]  ksys_read+0x54/0xdc
> [   12.675633]  __arm64_sys_read+0x18/0x20
> [   12.675642]  invoke_syscall.constprop.0+0x54/0xec
> [   12.675653]  do_el0_svc+0x3c/0xb4
> [   12.675662]  el0_svc+0x38/0x200
> [   12.675670]  el0t_64_sync_handler+0x98/0xdc
> [   12.675679]  el0t_64_sync+0x158/0x15c
> 
> For MDIO-connected switches, poll MIB counters asynchronously using a
> delayed workqueue every second and let .get_stats64 return the cached
> values under a per-port spinlock. A mod_delayed_work() call on each
> read triggers an immediate refresh so counters stay responsive when
> queried more frequently.
> 
> MMIO-connected switches (MT7988, EN7581, AN7583) are not affected
> because their regmap does not sleep, so they continue to read MIB
> counters directly in .get_stats64.
> 
> Fixes: 88c810f35ed5 ("net: dsa: mt7530: implement .get_stats64")
> Signed-off-by: Daniel Golle <daniel@makrotopia.org>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: TODOs in oa_tc6
From: Parthiban.Veerasooran @ 2026-04-17 12:05 UTC (permalink / raw)
  To: wahrenst; +Cc: netdev
In-Reply-To: <74f610a7-d9e1-41eb-ba1a-3d446a232769@gmx.net>

Hi Stefan,

On 14/04/26 11:48 pm, Stefan Wahren wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know 
> the content is safe
> 
> Hello Parthiban,
> 
> thank you for upstreaming the lan865x driver.
> 
> When do you plan to fix these open TODOs in ethernet/oa_tc6?

Thank you for your review and for pointing this out.

At the moment, we unfortunately don’t have concrete plans to address the 
TODOs in ethernet/oa_tc6, as our team is currently committed to other 
projects. That said, this work is in my backlog, and I do intend to come 
back to it when bandwidth allows.

Sorry for the inconvenience, and thank you for your patience and 
understanding.

Best regards,
Parthiban V
> 
> Best regards


^ permalink raw reply

* Re: [PATCH v3 net] rose: fix OOB reads on short CLEAR REQUEST frames
From: Simon Horman @ 2026-04-17 12:02 UTC (permalink / raw)
  To: Ashutosh Desai
  Cc: netdev, linux-hams, davem, edumazet, kuba, pabeni, stable,
	linux-kernel
In-Reply-To: <20260415055756.3825584-1-ashutoshdesai993@gmail.com>

On Wed, Apr 15, 2026 at 05:57:56AM +0000, Ashutosh Desai wrote:
> rose_process_rx_frame() calls rose_decode() which reads skb->data[2]
> without any prior length check. For CLEAR REQUEST frames the state
> machines then read skb->data[3] and skb->data[4] as the cause and
> diagnostic bytes.
> 
> A crafted 3-byte ROSE CLEAR REQUEST frame passes the minimum length
> gate in rose_route_frame() and reaches rose_process_rx_frame(), where
> rose_decode() reads one byte past the header and the state machines
> read two bytes past the valid buffer. A remote peer can exploit this
> to leak kernel memory contents or trigger a kernel panic.
> 
> Add a pskb_may_pull(skb, 3) check before rose_decode() to cover its
> skb->data[2] access, and a pskb_may_pull(skb, 5) check afterwards for
> the CLEAR REQUEST path to cover the cause and diagnostic reads.
> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: stable@vger.kernel.org
> Signed-off-by: Ashutosh Desai <ashutoshdesai993@gmail.com>
> ---
> V2 -> V3: drop kfree_skb() calls to fix double-free; add end-user
>           visible symptom to commit log; use [net] subject prefix
> V1 -> V2: switch skb->len check to pskb_may_pull; add pskb_may_pull(skb, 3)
>           before rose_decode() to cover its skb->data[2] access
> 
> v2: https://lore.kernel.org/netdev/177614667427.3606651.8700070406932922261@gmail.com/
> v1: https://lore.kernel.org/netdev/20260409013246.2051746-1-ashutoshdesai993@gmail.com/


Unfortunately this conflicts with a recent commit, which I believe
addresses the same problem: commit 2835750dd647 ("net: rose: reject
truncated CLEAR_REQUEST frames in state machines")

I do, however, note that commit doesn't use pskb_may_pull.
So perhaps you could make an incremental change to add that.

Also, FTR, Sashiko has quite a few things to say about other problems
in this and adjacent code.

^ permalink raw reply

* Re: [PATCH iwl-net v2] igc: fix potential skb leak in igc_fpe_xmit_smd_frame()
From: Simon Horman @ 2026-04-17 11:51 UTC (permalink / raw)
  To: Kohei Enju
  Cc: intel-wired-lan, netdev, Tony Nguyen, Przemek Kitszel,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Faizal Rahim, kohei.enju, stable
In-Reply-To: <20260415025226.114115-1-kohei@enjuk.jp>

On Wed, Apr 15, 2026 at 02:52:18AM +0000, Kohei Enju wrote:
> When igc_fpe_init_tx_descriptor() fails, no one takes care of an
> allocated skb, leaking it. [1]
> Use dev_kfree_skb_any() on failure.
> 
> Tested on an I226 adapter with the following command, while injecting
> faults in igc_fpe_init_tx_descriptor() to trigger the error path.
>  # ethtool --set-mm $DEV verify-enabled on tx-enabled on pmac-enabled on
> 
> [1]
> unreferenced object 0xffff888113c6cdc0 (size 224):
> ...
>   backtrace (crc be3d3fda):
>     kmem_cache_alloc_node_noprof+0x3b1/0x410
>     __alloc_skb+0xde/0x830
>     igc_fpe_xmit_smd_frame.isra.0+0xad/0x1b0
>     igc_fpe_send_mpacket+0x37/0x90
>     ethtool_mmsv_verify_timer+0x15e/0x300
> 
> Cc: stable@vger.kernel.org
> Fixes: 5422570c0010 ("igc: add support for frame preemption verification")
> Signed-off-by: Kohei Enju <kohei@enjuk.jp>
> ---
> Changes:
>   v2:
>     - change to idiomatic style with goto (Simon)
>     - add Cc to stable (Alex)
>     - add reprodunction steps (Alex)
>   v1: https://lore.kernel.org/all/20260329145122.126040-1-kohei@enjuk.jp/

Thanks for the update.

Reviewed-by: Simon Horman <horms@kernel.org>

Sashiko has comments about a potential existing bug in the same code path.
I'd appreciate it if, as a follow-up, you could look over that.

Thanks!

^ permalink raw reply

* Re: [PATCH net v3 5/5] iavf: refactor virtchnl polling into single function
From: Simon Horman @ 2026-04-17 11:45 UTC (permalink / raw)
  To: jtornosm
  Cc: 'Simon Horman', netdev, intel-wired-lan, jesse.brandeburg,
	anthony.l.nguyen, davem, edumazet, kuba, pabeni,
	przemyslaw.kitszel
In-Reply-To: <20260414110006.124286-6-jtornosm@redhat.com>

From: 'Simon Horman' <horms@kernel.org>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
iavf: refactor virtchnl polling into single function

This commit consolidates the init-time and runtime virtchnl message polling
logic into a single enhanced iavf_poll_virtchnl_msg() function.

diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
[ ... ]
> +int iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event,
> +			   enum virtchnl_ops op_to_poll, unsigned int timeout_ms,
> +			   bool (*condition)(struct iavf_adapter *, const void *),
> +			   const void *cond_data)
>  {
[ ... ]
> +	do {
> +		if (timeout_ms && iavf_virtchnl_completion_done(adapter, condition,
> +								cond_data, op_to_poll))
> +			return 0;
>  
> +		status = iavf_clean_arq_element(hw, event, &pending);
> +		if (status == IAVF_SUCCESS) {
> +			received_op = (enum virtchnl_ops)le32_to_cpu(event->desc.cookie_high);
>  
> +			/* Handle reset events specially */
> +			if (received_op == VIRTCHNL_OP_EVENT) {
> +				struct virtchnl_pf_event *vpe =
> +					(struct virtchnl_pf_event *)event->msg_buf;
>  
> +				if (vpe->event != VIRTCHNL_EVENT_RESET_IMPENDING)
> +					continue;

If received_op is VIRTCHNL_OP_EVENT but not VIRTCHNL_EVENT_RESET_IMPENDING,
the loop executes continue. Does this mean that during runtime polling (when
timeout_ms > 0), other asynchronous events like VIRTCHNL_EVENT_LINK_CHANGE
bypass iavf_virtchnl_completion() entirely and get discarded?

> +
> +				dev_info(&adapter->pdev->dev,
> +					 "Reset indication received from the PF\n");
> +				if (!(adapter->flags & IAVF_FLAG_RESET_PENDING))
> +					iavf_schedule_reset(adapter,
> +							    IAVF_FLAG_RESET_PENDING);
> +
> +				return -EIO;
> +			}
> +
> +			v_retval = le32_to_cpu(event->desc.cookie_low);
> +
> +			if (!timeout_ms) {
> +				if (received_op == op_to_poll)
> +					return virtchnl_status_to_errno((enum virtchnl_status_code)
> +							v_retval);
> +			} else {
> +				iavf_virtchnl_completion(adapter, received_op,
> +							 (enum iavf_status)v_retval,
> +							 event->msg_buf, event->msg_len);
> +			}
> +
> +			if (pending)
> +				continue;

When pending is true, the loop proceeds to the next iteration before executing
the memset buffer clearing code below. Might this leave stale data in
event->msg_buf for the next iavf_clean_arq_element() call if the subsequent
message is smaller than the previous one? The original implementation zeroed
the buffer before checking pending.

> +		} else if (!timeout_ms) {
> +			return iavf_status_to_errno(status);
> +		}
>  
> +		if (timeout_ms) {
> +			memset(event->msg_buf, 0, IAVF_MAX_AQ_BUF_SIZE);

The size passed to memset is hardcoded to IAVF_MAX_AQ_BUF_SIZE. If a caller
ever provides a timeout but passes a smaller buffer in the event structure,
could this write out of bounds?

For example, other callers like iavf_get_vf_supported_rxdids() use an 8-byte
stack variable for event->msg_buf with event->buf_len set to 8. Would it be
safer to use event->buf_len here instead of the hardcoded macro?

> +			usleep_range(50, 75);
>  		}
>  
> +	} while (!timeout_ms || time_before(jiffies, timeout));

^ permalink raw reply

* Re: [PATCH v3 2/4] ynl_gen: generate Rust files from yaml files
From: Alice Ryhl @ 2026-04-17 11:32 UTC (permalink / raw)
  To: Donald Hunter
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Jakub Kicinski, David S. Miller, Eric Dumazet, Paolo Abeni,
	Simon Horman, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Christian Brauner, Carlos Llamas, linux-kernel,
	rust-for-linux, netdev
In-Reply-To: <CAD4GDZxGo6p9A07rKAW4MZNdFOYrMVzjkYNjwTqKfFcqKkzU1g@mail.gmail.com>

On Fri, Apr 17, 2026 at 11:54:05AM +0100, Donald Hunter wrote:
> On Wed, 15 Apr 2026 at 10:39, Alice Ryhl <aliceryhl@google.com> wrote:
> >
> > To generate netlink frames from Rust code easily, generate Rust
> > libraries with methods for generating different netlink messages as
> > appropriate.
> >
> > The new 'rust' type corresponds to a Rust version of the C target
> > 'kernel'. There is no Rust version of the 'uapi' target since Rust code
> > exports its uapi via C headers - choice of language is opaque to
> > userspace.
> >
> > This logic is kept in the existing ynl_gen_c.py file to reuse CodeWriter
> > and other shared pieces of logic in the existing python file. This has
> > the disadvantage that the gen_c part of the name is now wrong, as it
> > also generates Rust. One possible solution to this could be to rename
> > the file.
> 
> Hi Alice,
> 
> I started a rough refactor of ynl_gen_c a while ago, so that I could
> reuse CodeWriter and the gentypes in a code generator for strace
> decoders. I just rebased the work to net-next/main here:
> 
> https://github.com/donaldh/linux/tree/ynl-gen-strace
> 
> If that looks workable for a ynl_gen_rst then I'd be happy to help
> tidy it up into a patch series.

That sounds convenient!

Alice

^ permalink raw reply

* RE: [PATCH] tipc: crypto: require a NUL-terminated AEAD algorithm name
From: Tung Quang Nguyen @ 2026-04-17 11:25 UTC (permalink / raw)
  To: Pengpeng Hou
  Cc: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org, stable@vger.kernel.org,
	David S. Miller, Jon Maloy
In-Reply-To: <20260417075353.30662-1-pengpeng@iscas.ac.cn>

>Subject: [PATCH] tipc: crypto: require a NUL-terminated AEAD algorithm name
>
>struct tipc_aead_key carries alg_name in a fixed 32-byte field, but both the
>generic netlink validation path and the MSG_CRYPTO receive path pass that
>field straight to crypto_has_alg(), strcmp(), and
>crypto_alloc_aead() without first proving that it contains a terminating NUL.
>
This is not correct. TIPC guarantees the algorithm string is nul-terminated one.
>Reject locally supplied and received keys whose algorithm name fills the entire
>fixed-width field without a terminator.
>
>Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication")
>Cc: stable@vger.kernel.org
>
>Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
>---
> net/tipc/crypto.c | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
>diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index
>6d3b6b89b1d1..60110ea0fe7c 100644
>--- a/net/tipc/crypto.c
>+++ b/net/tipc/crypto.c
>@@ -307,6 +307,11 @@ static void tipc_crypto_work_tx(struct work_struct
>*work);  static void tipc_crypto_work_rx(struct work_struct *work);  static int
>tipc_aead_key_generate(struct tipc_aead_key *skey);
>
>+static bool tipc_aead_alg_name_valid(const char *alg_name) {
>+	return strnlen(alg_name, TIPC_AEAD_ALG_NAME) <
>TIPC_AEAD_ALG_NAME; }
>+
This is not needed because TIPC only supports one algorithm name "gcm(aes)" which is 8-byte length.
> #define is_tx(crypto) (!(crypto)->node)  #define is_rx(crypto) (!is_tx(crypto))
>
>@@ -335,6 +340,11 @@ int tipc_aead_key_validate(struct tipc_aead_key
>*ukey, struct genl_info *info)  {
> 	int keylen;
>
>+	if (unlikely(!tipc_aead_alg_name_valid(ukey->alg_name))) {
>+		GENL_SET_ERR_MSG(info, "algorithm name is not NUL-
>terminated");
>+		return -EINVAL;
>+	}
>+
This is not needed because the system guarantees that the string passed from user-space is nul-terminated one.
> 	/* Check if algorithm exists */
> 	if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) {
> 		GENL_SET_ERR_MSG(info, "unable to load the algorithm
>(module existed?)"); @@ -2298,6 +2308,10 @@ static bool
>tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
> 		pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name);
> 		goto exit;
> 	}
>+	if (unlikely(!tipc_aead_alg_name_valid(data))) {
>+		pr_debug("%s: invalid MSG_CRYPTO algorithm name\n", rx-
>>name);
>+		goto exit;
>+	}
This is not needed as explained above.
>
> 	spin_lock(&rx->lock);
> 	if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
>--
>2.50.1 (Apple Git-155)
>


^ permalink raw reply

* [PATCH net 1/1] net/rose: hold listener socket during call request handling
From: Ren Wei @ 2026-04-17 11:01 UTC (permalink / raw)
  To: linux-hams, netdev
  Cc: davem, edumazet, kuba, pabeni, horms, kees, takamitz, kuniyu,
	jiayuan.chen, mingo, stanksal, jlayton, yifanwucs, tomapufckgml,
	bird, yuantan098, tonanli66, n05ec
In-Reply-To: <cover.1776327338.git.tonanli66@gmail.com>

From: Nan Li <tonanli66@gmail.com>

The call request receive path keeps using the listener socket after the
lookup lock has been dropped. Keep the listener alive across the
remaining validation and child socket setup by taking a reference in the
lookup path and releasing it once request handling is finished.

This makes listener lifetime handling explicit and avoids races with
concurrent socket teardown.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@kernel.org
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Co-developed-by: Yuan Tan <yuantan098@gmail.com>
Signed-off-by: Yuan Tan <yuantan098@gmail.com>
Signed-off-by: Nan Li <tonanli66@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
---
 net/rose/af_rose.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index d5032840ee48..c96325e54a86 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -277,8 +277,10 @@ static struct sock *rose_find_listener(rose_address *addr, ax25_address *call)
 
 		if (!rosecmp(&rose->source_addr, addr) &&
 		    !ax25cmp(&rose->source_call, call) &&
-		    !rose->source_ndigis && s->sk_state == TCP_LISTEN)
+		    !rose->source_ndigis && s->sk_state == TCP_LISTEN) {
+			sock_hold(s);
 			goto found;
+		}
 	}
 
 	sk_for_each(s, &rose_list) {
@@ -286,8 +288,10 @@ static struct sock *rose_find_listener(rose_address *addr, ax25_address *call)
 
 		if (!rosecmp(&rose->source_addr, addr) &&
 		    !ax25cmp(&rose->source_call, &null_ax25_address) &&
-		    s->sk_state == TCP_LISTEN)
+		    s->sk_state == TCP_LISTEN) {
+			sock_hold(s);
 			goto found;
+		}
 	}
 	s = NULL;
 found:
@@ -1056,10 +1060,13 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
 	/*
 	 * We can't accept the Call Request.
 	 */
-	if (sk == NULL || sk_acceptq_is_full(sk) ||
+	if (sk == NULL)
+		goto out_clear_request;
+
+	if (sk_acceptq_is_full(sk) ||
 	    (make = rose_make_new(sk)) == NULL) {
-		rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120);
-		return 0;
+		sock_put(sk);
+		goto out_clear_request;
 	}
 
 	skb->sk     = make;
@@ -1110,7 +1117,14 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk);
 
+	sock_put(sk);
+
 	return 1;
+
+out_clear_request:
+	rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120);
+
+	return 0;
 }
 
 static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v3 2/4] ynl_gen: generate Rust files from yaml files
From: Donald Hunter @ 2026-04-17 10:54 UTC (permalink / raw)
  To: Alice Ryhl
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Jakub Kicinski, David S. Miller, Eric Dumazet, Paolo Abeni,
	Simon Horman, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Christian Brauner, Carlos Llamas, linux-kernel,
	rust-for-linux, netdev
In-Reply-To: <20260415-binder-netlink-v3-2-84be9ba63ee2@google.com>

On Wed, 15 Apr 2026 at 10:39, Alice Ryhl <aliceryhl@google.com> wrote:
>
> To generate netlink frames from Rust code easily, generate Rust
> libraries with methods for generating different netlink messages as
> appropriate.
>
> The new 'rust' type corresponds to a Rust version of the C target
> 'kernel'. There is no Rust version of the 'uapi' target since Rust code
> exports its uapi via C headers - choice of language is opaque to
> userspace.
>
> This logic is kept in the existing ynl_gen_c.py file to reuse CodeWriter
> and other shared pieces of logic in the existing python file. This has
> the disadvantage that the gen_c part of the name is now wrong, as it
> also generates Rust. One possible solution to this could be to rename
> the file.

Hi Alice,

I started a rough refactor of ynl_gen_c a while ago, so that I could
reuse CodeWriter and the gentypes in a code generator for strace
decoders. I just rebased the work to net-next/main here:

https://github.com/donaldh/linux/tree/ynl-gen-strace

If that looks workable for a ynl_gen_rst then I'd be happy to help
tidy it up into a patch series.

Thanks,
Donald

^ permalink raw reply

* Re: [PATCH bpf v3 2/2] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks
From: Jiayuan Chen @ 2026-04-17 10:45 UTC (permalink / raw)
  To: KaFai Wan, martin.lau, daniel, john.fastabend, sdf, ast, andrii,
	eddyz87, memxor, song, yonghong.song, jolsa, davem, edumazet,
	kuba, pabeni, horms, shuah, jiayuan.chen, bpf, netdev,
	linux-kernel, linux-kselftest
In-Reply-To: <20260417092035.2299913-3-kafai.wan@linux.dev>


On 4/17/26 5:20 PM, KaFai Wan wrote:
> Add a sockops selftest for the TCP_NODELAY restriction in
> BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
>
> With BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG enabled,
> bpf_setsockopt(TCP_NODELAY) returns -EOPNOTSUPP from
> BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB, avoiding
> unbounded recursion and kernel stack overflow.
>
> Other cases continue to work as before, including
> BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and user space
> setsockopt(TCP_NODELAY).
>
> Signed-off-by: KaFai Wan <kafai.wan@linux.dev>


Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>


A little nit below, no need to resend.

> ---
>   .../selftests/bpf/prog_tests/tcp_hdr_options.c    | 12 +++++++++++-
>   .../bpf/progs/test_misc_tcp_hdr_options.c         | 15 ++++++++++++++-
>   2 files changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> index 56685fc03c7e..7b9dbbb84316 100644
> --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> @@ -461,7 +461,7 @@ static void misc(void)
>   	const unsigned int nr_data = 2;
>   	struct bpf_link *link;
>   	struct sk_fds sk_fds;
> -	int i, ret;
> +	int i, ret, true_val = 1;
>   

NIT: please follow the reverse xmas tree variable ordering


>   	lport_linum_map_fd = bpf_map__fd(misc_skel->maps.lport_linum_map);
>   
> @@ -477,6 +477,10 @@ static void misc(void)
>   		return;
>   	}
>   
> +	ret = setsockopt(sk_fds.active_fd, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
> +	if (!ASSERT_OK(ret, "setsockopt(TCP_NODELAY)"))
> +		goto check_linum;
> +
>   	for (i = 0; i < nr_data; i++) {
>   		/* MSG_EOR to ensure skb will not be combined */
>   		ret = send(sk_fds.active_fd, send_msg, sizeof(send_msg),
> @@ -507,6 +511,12 @@ static void misc(void)
>   
>   	ASSERT_EQ(misc_skel->bss->nr_hwtstamp, 0, "nr_hwtstamp");
>   
> +	ASSERT_TRUE(misc_skel->data->nodelay_est_ok, "unexpected nodelay_est_ok");
> +
> +	ASSERT_TRUE(misc_skel->data->nodelay_hdr_len_err, "unexpected nodelay_hdr_len_err");
> +
> +	ASSERT_TRUE(misc_skel->data->nodelay_write_hdr_err, "unexpected nodelay_write_hdr_err");
> +

NIT: It's would be misleading if you run ./test_progs with "-v"
misc:PASS:unexpected nodelay_est_ok 0 nsec

"PASS:unexpected" ?

>   check_linum:
>   	ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum");
>   	sk_fds_close(&sk_fds);
> diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
> index d487153a839d..a02e28d9db2e 100644
> --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
> +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
> @@ -29,6 +29,10 @@ unsigned int nr_syn = 0;
>   unsigned int nr_fin = 0;
>   unsigned int nr_hwtstamp = 0;
>   
> +bool nodelay_est_ok = true;
> +bool nodelay_hdr_len_err = true;
> +bool nodelay_write_hdr_err = true;

I prefer "nodelay_hdr_len_reject"



^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH net v3 5/5] iavf: refactor virtchnl polling into single function
From: Jose Ignacio Tornos Martinez @ 2026-04-17 10:30 UTC (permalink / raw)
  To: jacob.e.keller
  Cc: aleksandr.loktionov, anthony.l.nguyen, davem, edumazet,
	intel-wired-lan, jesse.brandeburg, jtornosm, kuba, netdev, pabeni,
	przemyslaw.kitszel, horms
In-Reply-To: <6539cc33-9294-4312-aa1b-5df311dce79f@intel.com>

Hello Jacob,

> The cleanup makes sense as next material, but the other patches fix bugs
> that could (should?) still target net, right?
Yes, I think so and that is my intention.
Unless Przemek has another opinion and/or better idea and since
"iavf: add iavf_poll_virtchnl_response()" has not yet landed in any public
repository, I can include "iavf: add iavf_poll_virtchnl_response()" patch
like it is in my next version (to drop when the commented one is landed)
and in the meantime allow to progress the rest of the patches for net.
I will take into account the interesting comments from Simon and his
AI-generated review too in my next version.
If it is ok, later on I can proceed with this patch for the refactoring.

> I'll drop this version from the Intel Wired LAN patchwork then.
Ok

Thanks

Best regards
Jose Ignacio


^ permalink raw reply

* Re: [PATCH bpf v3 1/2] bpf: Reject TCP_NODELAY in TCP header option callbacks
From: Jiayuan Chen @ 2026-04-17 10:26 UTC (permalink / raw)
  To: KaFai Wan, martin.lau, daniel, john.fastabend, sdf, ast, andrii,
	eddyz87, memxor, song, yonghong.song, jolsa, davem, edumazet,
	kuba, pabeni, horms, shuah, jiayuan.chen, bpf, netdev,
	linux-kernel, linux-kselftest
  Cc: Quan Sun, Yinhao Hu, Kaiyan Mei
In-Reply-To: <20260417092035.2299913-2-kafai.wan@linux.dev>


On 4/17/26 5:20 PM, KaFai Wan wrote:
> A BPF_SOCK_OPS program can enable
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG and then call
> bpf_setsockopt(TCP_NODELAY) from BPF_SOCK_OPS_HDR_OPT_LEN_CB or
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
>
> In these callbacks, bpf_setsockopt(TCP_NODELAY) can reach
> __tcp_sock_set_nodelay(), which can call tcp_push_pending_frames().
>
>  From BPF_SOCK_OPS_HDR_OPT_LEN_CB, tcp_push_pending_frames() can call
> tcp_current_mss(), which calls tcp_established_options() and re-enters
> bpf_skops_hdr_opt_len().
>
> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>    -> bpf_setsockopt(TCP_NODELAY)
>      -> tcp_push_pending_frames()
>        -> tcp_current_mss()
>          -> tcp_established_options()
>            -> bpf_skops_hdr_opt_len()
>              -> BPF_SOCK_OPS_HDR_OPT_LEN_CB
>
>  From BPF_SOCK_OPS_WRITE_HDR_OPT_CB, tcp_push_pending_frames() can call
> tcp_write_xmit(), which calls tcp_transmit_skb().  That path recomputes
> header option length through tcp_established_options() and
> bpf_skops_hdr_opt_len() before re-entering bpf_skops_write_hdr_opt().
>
> BPF_SOCK_OPS_WRITE_HDR_OPT_CB
>    -> bpf_setsockopt(TCP_NODELAY)
>      -> tcp_push_pending_frames()
>        -> tcp_write_xmit()
>          -> tcp_transmit_skb()
>            -> tcp_established_options()
>              -> bpf_skops_hdr_opt_len()
>            -> bpf_skops_write_hdr_opt()
>              -> BPF_SOCK_OPS_WRITE_HDR_OPT_CB
>
> This leads to unbounded recursion and can overflow the kernel stack.
>
> Reject TCP_NODELAY with -EOPNOTSUPP in bpf_sock_ops_setsockopt()
> when bpf_setsockopt() is called from
> BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
>
> Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
> Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
> Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
> Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
> Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt")
> Signed-off-by: KaFai Wan <kafai.wan@linux.dev>


Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>

I think the result of AI review is false-positve.



^ permalink raw reply

* [PATCH nf,v3] netfilter: nat: use kfree_rcu to release ops
From: Pablo Neira Ayuso @ 2026-04-17 10:11 UTC (permalink / raw)
  To: netfilter-devel; +Cc: fw, netdev

Florian Westphal says:

"Historically this is not an issue, even for normal base hooks: the data
path doesn't use the original nf_hook_ops that are used to register the
callbacks.

However, in v5.14 I added the ability to dump the active netfilter
hooks from userspace.

This code will peek back into the nf_hook_ops that are available
at the tail of the pointer-array blob used by the datapath.

The nat hooks are special, because they are called indirectly from
the central nat dispatcher hook. They are currently invisible to
the nfnl hook dump subsystem though.

But once that changes the nat ops structures have to be deferred too."

Update nf_nat_register_fn() to deal with partial exposition of the hooks
from error path which can be also an issue for nfnetlink_hook.

Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v3: use kfree_rcu as suggested by AI in ip{6}tables_nat.

 net/ipv4/netfilter/iptable_nat.c  |  4 ++--
 net/ipv6/netfilter/ip6table_nat.c |  4 ++--
 net/netfilter/nf_nat_core.c       | 10 ++++++----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a5db7c67d61b..625a1ca13b1b 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -79,7 +79,7 @@ static int ipt_nat_register_lookups(struct net *net)
 			while (i)
 				nf_nat_ipv4_unregister_fn(net, &ops[--i]);
 
-			kfree(ops);
+			kfree_rcu(ops, rcu);
 			return ret;
 		}
 	}
@@ -100,7 +100,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
 		nf_nat_ipv4_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int iptable_nat_table_init(struct net *net)
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index e119d4f090cc..5be723232df8 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -81,7 +81,7 @@ static int ip6t_nat_register_lookups(struct net *net)
 			while (i)
 				nf_nat_ipv6_unregister_fn(net, &ops[--i]);
 
-			kfree(ops);
+			kfree_rcu(ops, rcu);
 			return ret;
 		}
 	}
@@ -102,7 +102,7 @@ static void ip6t_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
 		nf_nat_ipv6_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int ip6table_nat_table_init(struct net *net)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 83b2b5e9759a..74ec224ce0d6 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1222,9 +1222,11 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		ret = nf_register_net_hooks(net, nat_ops, ops_count);
 		if (ret < 0) {
 			mutex_unlock(&nf_nat_proto_mutex);
-			for (i = 0; i < ops_count; i++)
-				kfree(nat_ops[i].priv);
-			kfree(nat_ops);
+			for (i = 0; i < ops_count; i++) {
+				priv = nat_ops[i].priv;
+				kfree_rcu(priv, rcu_head);
+			}
+			kfree_rcu(nat_ops, rcu);
 			return ret;
 		}
 
@@ -1288,7 +1290,7 @@ void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		}
 
 		nat_proto_net->nat_hook_ops = NULL;
-		kfree(nat_ops);
+		kfree_rcu(nat_ops, rcu);
 	}
 unlock:
 	mutex_unlock(&nf_nat_proto_mutex);
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH bpf v3 1/2] bpf: Reject TCP_NODELAY in TCP header option callbacks
From: bot+bpf-ci @ 2026-04-17 10:10 UTC (permalink / raw)
  To: kafai.wan, martin.lau, daniel, john.fastabend, sdf, ast, andrii,
	eddyz87, memxor, song, yonghong.song, jolsa, davem, edumazet,
	kuba, pabeni, horms, shuah, jiayuan.chen, kafai.wan, bpf, netdev,
	linux-kernel, linux-kselftest
  Cc: 2022090917019, dddddd, M202472210, ast, andrii, daniel,
	martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260417092035.2299913-2-kafai.wan@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 1997 bytes --]

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 5fa9189eb772..96849f4c1fbc 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5833,6 +5833,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>  	if (!is_locked_tcp_sock_ops(bpf_sock))
>  		return -EOPNOTSUPP;
>
> +	/* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */
> +	if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB ||
> +	     bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) &&
> +	    level == SOL_TCP && optname == TCP_NODELAY)
> +		return -EOPNOTSUPP;
> +

Does this fix also need to block TCP_CORK? When bpf_setsockopt() is called
with SOL_TCP and TCP_CORK where the value is 0 (disabling cork), the call
path goes through:

bpf_sock_ops_setsockopt()
  -> _bpf_setsockopt()
    -> __bpf_setsockopt()
      -> sol_tcp_sockopt()
        -> do_tcp_setsockopt()
          -> __tcp_sock_set_cork(sk, false)

In __tcp_sock_set_cork() at net/ipv4/tcp.c, when cork is disabled, it
unconditionally calls tcp_push_pending_frames(sk):

net/ipv4/tcp.c:__tcp_sock_set_cork() {
    ...
    tcp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;

    tcp_push_pending_frames(sk);
}

This triggers the same recursion chain described in the commit message:

tcp_push_pending_frames()
  -> tcp_current_mss()
    -> tcp_established_options()
      -> bpf_skops_hdr_opt_len()
        -> BPF_CGROUP_RUN_PROG_SOCK_OPS_SK (re-enters with
           op=BPF_SOCK_OPS_HDR_OPT_LEN_CB)

causing unbounded recursion and kernel stack overflow. Note that enabling
TCP_CORK (val=1) is safe since it doesn't call tcp_push_pending_frames(),
only disabling it (val=0) does.


>  	return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
>  }


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24558395229

^ permalink raw reply

* [PATCH nf,v6 4/4] netfilter: nf_tables: add hook transactions for device deletions
From: Pablo Neira Ayuso @ 2026-04-17  9:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, fw
In-Reply-To: <20260417093942.373885-1-pablo@netfilter.org>

Restore the flag that indicates that the hook is going away, ie.
NFT_HOOK_REMOVE, but add a new transaction object to track deletion
of hooks without altering the basechain/flowtable hook_list during
the preparation phase.

The existing approach that moves the hook from the basechain/flowtable
hook_list to transaction hook_list breaks netlink dump path readers
of this RCU-protected list.

It should be possible use an array for nft_trans_hook to store the
deleted hooks to compact the representation but I am not expecting
many hook object, specially now that wildcard support for devices
is in place.

Note that the nft_trans_chain_hooks() list contains a list of struct
nft_trans_hook objects for DELCHAIN and DELFLOWTABLE commands, while
this list stores struct nft_hook objects for NEWCHAIN and NEWFLOWTABLE.
Note that new commands can be updated to use nft_trans_hook for
consistency.

This patch also adapts the event notification path to deal with the list
of hook transactions.

Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain")
Fixes: b6d9014a3335 ("netfilter: nf_tables: delete flowtable hooks via transaction list")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v6: just a rebase on top of net-next, net is still behind.

 include/net/netfilter/nf_tables.h |  13 ++
 net/netfilter/nf_tables_api.c     | 251 +++++++++++++++++++++++-------
 2 files changed, 212 insertions(+), 52 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2c0173d9309c..cff7b773e972 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1204,12 +1204,15 @@ struct nft_stats {
 	struct u64_stats_sync	syncp;
 };
 
+#define NFT_HOOK_REMOVE	(1 << 0)
+
 struct nft_hook {
 	struct list_head	list;
 	struct list_head	ops_list;
 	struct rcu_head		rcu;
 	char			ifname[IFNAMSIZ];
 	u8			ifnamelen;
+	u8			flags;
 };
 
 struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
@@ -1664,6 +1667,16 @@ struct nft_trans {
 	u8				put_net:1;
 };
 
+/**
+ * struct nft_trans_hook - nf_tables hook update in transaction
+ * @list: used internally
+ * @hook: struct nft_hook with the device hook
+ */
+struct nft_trans_hook {
+	struct list_head		list;
+	struct nft_hook			*hook;
+};
+
 /**
  * struct nft_trans_binding - nf_tables object with binding support in transaction
  * @nft_trans:    base structure, MUST be first member
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ae10116af923..d04768641ffb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -380,6 +380,29 @@ static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook)
 	nft_netdev_hook_free_rcu(hook);
 }
 
+static void nft_trans_hook_destroy(struct nft_trans_hook *trans_hook)
+{
+	list_del(&trans_hook->list);
+	kfree(trans_hook);
+}
+
+static void nft_netdev_unregister_trans_hook(struct net *net,
+					     struct list_head *hook_list)
+{
+	struct nft_trans_hook *trans_hook, *next;
+	struct nf_hook_ops *ops;
+	struct nft_hook *hook;
+
+	list_for_each_entry_safe(trans_hook, next, hook_list, list) {
+		hook = trans_hook->hook;
+		list_for_each_entry(ops, &hook->ops_list, list)
+			nf_unregister_net_hook(net, ops);
+
+		nft_netdev_hook_unlink_free_rcu(hook);
+		nft_trans_hook_destroy(trans_hook);
+	}
+}
+
 static void nft_netdev_unregister_hooks(struct net *net,
 					struct list_head *hook_list,
 					bool release_netdev)
@@ -1946,13 +1969,59 @@ static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook)
 	return nla_put_string(skb, attr, hook->ifname);
 }
 
+struct nft_hook_dump_ctx {
+	struct nft_hook *first;
+	int n;
+};
+
+static int nft_dump_basechain_hook_one(struct sk_buff *skb,
+				       struct nft_hook *hook,
+				       struct nft_hook_dump_ctx *dump_ctx)
+{
+	if (!dump_ctx->first)
+		dump_ctx->first = hook;
+
+	if (nft_nla_put_hook_dev(skb, hook))
+		return -1;
+
+	dump_ctx->n++;
+
+	return 0;
+}
+
+static int nft_dump_basechain_hook_list(struct sk_buff *skb,
+					const struct list_head *hook_list,
+					struct nft_hook_dump_ctx *dump_ctx)
+{
+	struct nft_hook *hook;
+
+	list_for_each_entry_rcu(hook, hook_list, list,
+				lockdep_commit_lock_is_held(net))
+		nft_dump_basechain_hook_one(skb, hook, dump_ctx);
+
+	return 0;
+}
+
+static int nft_dump_basechain_trans_hook_list(struct sk_buff *skb,
+					      const struct list_head *trans_hook_list,
+					      struct nft_hook_dump_ctx *dump_ctx)
+{
+	struct nft_trans_hook *trans_hook;
+
+	list_for_each_entry(trans_hook, trans_hook_list, list)
+		nft_dump_basechain_hook_one(skb, trans_hook->hook, dump_ctx);
+
+	return 0;
+}
+
 static int nft_dump_basechain_hook(struct sk_buff *skb,
 				   const struct net *net, int family,
 				   const struct nft_base_chain *basechain,
-				   const struct list_head *hook_list)
+				   const struct list_head *hook_list,
+				   const struct list_head *trans_hook_list)
 {
 	const struct nf_hook_ops *ops = &basechain->ops;
-	struct nft_hook *hook, *first = NULL;
+	struct nft_hook_dump_ctx dump_hook_ctx = {};
 	struct nlattr *nest, *nest_devs;
 	int n = 0;
 
@@ -1969,23 +2038,23 @@ static int nft_dump_basechain_hook(struct sk_buff *skb,
 		if (!nest_devs)
 			goto nla_put_failure;
 
-		if (!hook_list)
+		if (!hook_list && !trans_hook_list)
 			hook_list = &basechain->hook_list;
 
-		list_for_each_entry_rcu(hook, hook_list, list,
-					lockdep_commit_lock_is_held(net)) {
-			if (!first)
-				first = hook;
-
-			if (nft_nla_put_hook_dev(skb, hook))
-				goto nla_put_failure;
-			n++;
+		if (hook_list &&
+		    nft_dump_basechain_hook_list(skb, hook_list, &dump_hook_ctx)) {
+			goto nla_put_failure;
+		} else if (trans_hook_list &&
+			   nft_dump_basechain_trans_hook_list(skb, trans_hook_list,
+							      &dump_hook_ctx)) {
+			goto nla_put_failure;
 		}
+
 		nla_nest_end(skb, nest_devs);
 
 		if (n == 1 &&
-		    !hook_is_prefix(first) &&
-		    nla_put_string(skb, NFTA_HOOK_DEV, first->ifname))
+		    !hook_is_prefix(dump_hook_ctx.first) &&
+		    nla_put_string(skb, NFTA_HOOK_DEV, dump_hook_ctx.first->ifname))
 			goto nla_put_failure;
 	}
 	nla_nest_end(skb, nest);
@@ -1999,7 +2068,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 				     u32 portid, u32 seq, int event, u32 flags,
 				     int family, const struct nft_table *table,
 				     const struct nft_chain *chain,
-				     const struct list_head *hook_list)
+				     const struct list_head *hook_list,
+				     const struct list_head *trans_hook_list)
 {
 	struct nlmsghdr *nlh;
 
@@ -2015,7 +2085,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 			 NFTA_CHAIN_PAD))
 		goto nla_put_failure;
 
-	if (!hook_list &&
+	if (!hook_list && !trans_hook_list &&
 	    (event == NFT_MSG_DELCHAIN ||
 	     event == NFT_MSG_DESTROYCHAIN)) {
 		nlmsg_end(skb, nlh);
@@ -2026,7 +2096,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 		const struct nft_base_chain *basechain = nft_base_chain(chain);
 		struct nft_stats __percpu *stats;
 
-		if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list))
+		if (nft_dump_basechain_hook(skb, net, family, basechain,
+					    hook_list, trans_hook_list))
 			goto nla_put_failure;
 
 		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -2062,7 +2133,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 }
 
 static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
-				   const struct list_head *hook_list)
+				   const struct list_head *hook_list,
+				   const struct list_head *trans_hook_list)
 {
 	struct nftables_pernet *nft_net;
 	struct sk_buff *skb;
@@ -2082,7 +2154,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
 
 	err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
 					event, flags, ctx->family, ctx->table,
-					ctx->chain, hook_list);
+					ctx->chain, hook_list, trans_hook_list);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -2128,7 +2200,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 						      NFT_MSG_NEWCHAIN,
 						      NLM_F_MULTI,
 						      table->family, table,
-						      chain, NULL) < 0)
+						      chain, NULL, NULL) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -2182,7 +2254,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
 
 	err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
 					info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
-					0, family, table, chain, NULL);
+					0, family, table, chain, NULL, NULL);
 	if (err < 0)
 		goto err_fill_chain_info;
 
@@ -2345,8 +2417,12 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
 
 	list_for_each_entry(hook, hook_list, list) {
 		if (!strncmp(hook->ifname, this->ifname,
-			     min(hook->ifnamelen, this->ifnamelen)))
+			     min(hook->ifnamelen, this->ifnamelen))) {
+			if (hook->flags & NFT_HOOK_REMOVE)
+				continue;
+
 			return hook;
+		}
 	}
 
 	return NULL;
@@ -3105,6 +3181,32 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
 	return nf_tables_addchain(&ctx, family, policy, flags, extack);
 }
 
+static int nft_trans_delhook(struct nft_hook *hook,
+			     struct list_head *del_list)
+{
+	struct nft_trans_hook *trans_hook;
+
+	trans_hook = kmalloc_obj(*trans_hook, GFP_KERNEL);
+	if (!trans_hook)
+		return -ENOMEM;
+
+	trans_hook->hook = hook;
+	list_add_tail(&trans_hook->list, del_list);
+	hook->flags |= NFT_HOOK_REMOVE;
+
+	return 0;
+}
+
+static void nft_trans_delhook_abort(struct list_head *del_list)
+{
+	struct nft_trans_hook *trans_hook, *next;
+
+	list_for_each_entry_safe(trans_hook, next, del_list, list) {
+		trans_hook->hook->flags &= ~NFT_HOOK_REMOVE;
+		nft_trans_hook_destroy(trans_hook);
+	}
+}
+
 static int nft_delchain_hook(struct nft_ctx *ctx,
 			     struct nft_base_chain *basechain,
 			     struct netlink_ext_ack *extack)
@@ -3131,7 +3233,10 @@ static int nft_delchain_hook(struct nft_ctx *ctx,
 			err = -ENOENT;
 			goto err_chain_del_hook;
 		}
-		list_move(&hook->list, &chain_del_list);
+		if (nft_trans_delhook(hook, &chain_del_list) < 0) {
+			err = -ENOMEM;
+			goto err_chain_del_hook;
+		}
 	}
 
 	trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
@@ -3151,7 +3256,7 @@ static int nft_delchain_hook(struct nft_ctx *ctx,
 	return 0;
 
 err_chain_del_hook:
-	list_splice(&chain_del_list, &basechain->hook_list);
+	nft_trans_delhook_abort(&chain_del_list);
 	nft_chain_release_hook(&chain_hook);
 
 	return err;
@@ -8933,6 +9038,14 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 	return err;
 }
 
+static void nft_trans_delhook_commit(struct list_head *hook_list)
+{
+	struct nft_trans_hook *trans_hook, *next;
+
+	list_for_each_entry_safe(trans_hook, next, hook_list, list)
+		nft_trans_hook_destroy(trans_hook);
+}
+
 static void nft_hooks_destroy(struct list_head *hook_list)
 {
 	struct nft_hook *hook, *next;
@@ -8941,6 +9054,24 @@ static void nft_hooks_destroy(struct list_head *hook_list)
 		nft_netdev_hook_unlink_free_rcu(hook);
 }
 
+static void nft_flowtable_unregister_trans_hook(struct net *net,
+						struct nft_flowtable *flowtable,
+						struct list_head *hook_list)
+{
+	struct nft_trans_hook *trans_hook, *next;
+	struct nf_hook_ops *ops;
+	struct nft_hook *hook;
+
+	list_for_each_entry_safe(trans_hook, next, hook_list, list) {
+		hook = trans_hook->hook;
+		list_for_each_entry(ops, &hook->ops_list, list)
+			nft_unregister_flowtable_ops(net, flowtable, ops);
+
+		nft_netdev_hook_unlink_free_rcu(hook);
+		nft_trans_hook_destroy(trans_hook);
+	}
+}
+
 static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
 				struct nft_flowtable *flowtable,
 				struct netlink_ext_ack *extack)
@@ -9199,7 +9330,10 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
 			err = -ENOENT;
 			goto err_flowtable_del_hook;
 		}
-		list_move(&hook->list, &flowtable_del_list);
+		if (nft_trans_delhook(hook, &flowtable_del_list) < 0) {
+			err = -ENOMEM;
+			goto err_flowtable_del_hook;
+		}
 	}
 
 	trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
@@ -9220,7 +9354,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
 	return 0;
 
 err_flowtable_del_hook:
-	list_splice(&flowtable_del_list, &flowtable->hook_list);
+	nft_trans_delhook_abort(&flowtable_del_list);
 	nft_flowtable_hook_release(&flowtable_hook);
 
 	return err;
@@ -9285,8 +9419,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 					 u32 portid, u32 seq, int event,
 					 u32 flags, int family,
 					 struct nft_flowtable *flowtable,
-					 struct list_head *hook_list)
+					 struct list_head *hook_list,
+					 struct list_head *trans_hook_list)
 {
+	struct nft_trans_hook *trans_hook;
 	struct nlattr *nest, *nest_devs;
 	struct nft_hook *hook;
 	struct nlmsghdr *nlh;
@@ -9303,7 +9439,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 			 NFTA_FLOWTABLE_PAD))
 		goto nla_put_failure;
 
-	if (!hook_list &&
+	if (!hook_list && !trans_hook_list &&
 	    (event == NFT_MSG_DELFLOWTABLE ||
 	     event == NFT_MSG_DESTROYFLOWTABLE)) {
 		nlmsg_end(skb, nlh);
@@ -9325,13 +9461,20 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	if (!nest_devs)
 		goto nla_put_failure;
 
-	if (!hook_list)
+	if (!hook_list && !trans_hook_list)
 		hook_list = &flowtable->hook_list;
 
-	list_for_each_entry_rcu(hook, hook_list, list,
-				lockdep_commit_lock_is_held(net)) {
-		if (nft_nla_put_hook_dev(skb, hook))
-			goto nla_put_failure;
+	if (hook_list) {
+		list_for_each_entry_rcu(hook, hook_list, list,
+					lockdep_commit_lock_is_held(net)) {
+			if (nft_nla_put_hook_dev(skb, hook))
+				goto nla_put_failure;
+		}
+	} else if (trans_hook_list) {
+		list_for_each_entry(trans_hook, trans_hook_list, list) {
+			if (nft_nla_put_hook_dev(skb, trans_hook->hook))
+				goto nla_put_failure;
+		}
 	}
 	nla_nest_end(skb, nest_devs);
 	nla_nest_end(skb, nest);
@@ -9385,7 +9528,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
 							  NFT_MSG_NEWFLOWTABLE,
 							  NLM_F_MULTI | NLM_F_APPEND,
 							  table->family,
-							  flowtable, NULL) < 0)
+							  flowtable, NULL, NULL) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -9485,7 +9628,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 	err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
 					    info->nlh->nlmsg_seq,
 					    NFT_MSG_NEWFLOWTABLE, 0, family,
-					    flowtable, NULL);
+					    flowtable, NULL, NULL);
 	if (err < 0)
 		goto err_fill_flowtable_info;
 
@@ -9498,7 +9641,9 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 
 static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 				       struct nft_flowtable *flowtable,
-				       struct list_head *hook_list, int event)
+				       struct list_head *hook_list,
+				       struct list_head *trans_hook_list,
+				       int event)
 {
 	struct nftables_pernet *nft_net = nft_pernet(ctx->net);
 	struct sk_buff *skb;
@@ -9518,7 +9663,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 	err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
 					    ctx->seq, event, flags,
-					    ctx->family, flowtable, hook_list);
+					    ctx->family, flowtable,
+					    hook_list, trans_hook_list);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -10053,7 +10199,7 @@ static void nft_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DELCHAIN:
 	case NFT_MSG_DESTROYCHAIN:
 		if (nft_trans_chain_update(trans))
-			nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+			nft_trans_delhook_commit(&nft_trans_chain_hooks(trans));
 		else
 			nf_tables_chain_destroy(nft_trans_chain(trans));
 		break;
@@ -10076,7 +10222,7 @@ static void nft_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DELFLOWTABLE:
 	case NFT_MSG_DESTROYFLOWTABLE:
 		if (nft_trans_flowtable_update(trans))
-			nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+			nft_trans_delhook_commit(&nft_trans_flowtable_hooks(trans));
 		else
 			nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
 		break;
@@ -10837,31 +10983,30 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			if (nft_trans_chain_update(trans)) {
 				nft_chain_commit_update(nft_trans_container_chain(trans));
 				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
-						       &nft_trans_chain_hooks(trans));
+						       &nft_trans_chain_hooks(trans), NULL);
 				list_splice_rcu(&nft_trans_chain_hooks(trans),
 						&nft_trans_basechain(trans)->hook_list);
 				/* trans destroyed after rcu grace period */
 			} else {
 				nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
 				nft_clear(net, nft_trans_chain(trans));
-				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
+				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL, NULL);
 				nft_trans_destroy(trans);
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
 		case NFT_MSG_DESTROYCHAIN:
 			if (nft_trans_chain_update(trans)) {
-				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL,
 						       &nft_trans_chain_hooks(trans));
 				if (!(table->flags & NFT_TABLE_F_DORMANT)) {
-					nft_netdev_unregister_hooks(net,
-								    &nft_trans_chain_hooks(trans),
-								    true);
+					nft_netdev_unregister_trans_hook(net,
+									 &nft_trans_chain_hooks(trans));
 				}
 			} else {
 				nft_chain_del(nft_trans_chain(trans));
 				nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
-						       NULL);
+						       NULL, NULL);
 				nf_tables_unregister_hook(ctx.net, ctx.table,
 							  nft_trans_chain(trans));
 			}
@@ -10967,6 +11112,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
 							   &nft_trans_flowtable_hooks(trans),
+							   NULL,
 							   NFT_MSG_NEWFLOWTABLE);
 				list_splice_rcu(&nft_trans_flowtable_hooks(trans),
 						&nft_trans_flowtable(trans)->hook_list);
@@ -10975,6 +11121,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
 							   NULL,
+							   NULL,
 							   NFT_MSG_NEWFLOWTABLE);
 			}
 			nft_trans_destroy(trans);
@@ -10984,16 +11131,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			if (nft_trans_flowtable_update(trans)) {
 				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
+							   NULL,
 							   &nft_trans_flowtable_hooks(trans),
 							   trans->msg_type);
-				nft_unregister_flowtable_net_hooks(net,
-								   nft_trans_flowtable(trans),
-								   &nft_trans_flowtable_hooks(trans));
+				nft_flowtable_unregister_trans_hook(net,
+								    nft_trans_flowtable(trans),
+								    &nft_trans_flowtable_hooks(trans));
 			} else {
 				list_del_rcu(&nft_trans_flowtable(trans)->list);
 				nf_tables_flowtable_notify(&ctx,
 							   nft_trans_flowtable(trans),
 							   NULL,
+							   NULL,
 							   trans->msg_type);
 				nft_unregister_flowtable_net_hooks(net,
 						nft_trans_flowtable(trans),
@@ -11157,8 +11306,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 		case NFT_MSG_DELCHAIN:
 		case NFT_MSG_DESTROYCHAIN:
 			if (nft_trans_chain_update(trans)) {
-				list_splice(&nft_trans_chain_hooks(trans),
-					    &nft_trans_basechain(trans)->hook_list);
+				nft_trans_delhook_abort(&nft_trans_chain_hooks(trans));
 			} else {
 				nft_use_inc_restore(&table->use);
 				nft_clear(trans->net, nft_trans_chain(trans));
@@ -11272,8 +11420,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 		case NFT_MSG_DELFLOWTABLE:
 		case NFT_MSG_DESTROYFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
-				list_splice(&nft_trans_flowtable_hooks(trans),
-					    &nft_trans_flowtable(trans)->hook_list);
+				nft_trans_delhook_abort(&nft_trans_flowtable_hooks(trans));
 			} else {
 				nft_use_inc_restore(&table->use);
 				nft_clear(trans->net, nft_trans_flowtable(trans));
-- 
2.47.3


^ permalink raw reply related

* [PATCH nf,v6 3/4] netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase
From: Pablo Neira Ayuso @ 2026-04-17  9:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, fw
In-Reply-To: <20260417093942.373885-1-pablo@netfilter.org>

Publish new hooks in the list into the basechain/flowtable using
splice_list_rcu() to ensure netlink dump list traversal via rcu is safe
while concurrent ruleset update is going on.

Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable")
Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v3: just a rebase on top of net-next, net is still behind.

 net/netfilter/nf_tables_api.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 07e151245765..ae10116af923 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -10838,8 +10838,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 				nft_chain_commit_update(nft_trans_container_chain(trans));
 				nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
 						       &nft_trans_chain_hooks(trans));
-				list_splice(&nft_trans_chain_hooks(trans),
-					    &nft_trans_basechain(trans)->hook_list);
+				list_splice_rcu(&nft_trans_chain_hooks(trans),
+						&nft_trans_basechain(trans)->hook_list);
 				/* trans destroyed after rcu grace period */
 			} else {
 				nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
@@ -10968,8 +10968,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 							   nft_trans_flowtable(trans),
 							   &nft_trans_flowtable_hooks(trans),
 							   NFT_MSG_NEWFLOWTABLE);
-				list_splice(&nft_trans_flowtable_hooks(trans),
-					    &nft_trans_flowtable(trans)->hook_list);
+				list_splice_rcu(&nft_trans_flowtable_hooks(trans),
+						&nft_trans_flowtable(trans)->hook_list);
 			} else {
 				nft_clear(net, nft_trans_flowtable(trans));
 				nf_tables_flowtable_notify(&ctx,
-- 
2.47.3


^ permalink raw reply related

* [PATCH nf,v6 2/4] rculist: add list_splice_rcu() for private lists
From: Pablo Neira Ayuso @ 2026-04-17  9:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, fw
In-Reply-To: <20260417093942.373885-1-pablo@netfilter.org>

This patch adds a helper function, list_splice_rcu(), to safely splice
a private (non-RCU-protected) list into an RCU-protected list.

The function ensures that only the pointer visible to RCU readers
(prev->next) is updated using rcu_assign_pointer(), while the rest of
the list manipulations are performed with regular assignments, as the
source list is private and not visible to concurrent RCU readers.

This is useful for moving elements from a private list into a global
RCU-protected list, ensuring safe publication for RCU readers.
Subsystems with some sort of batching mechanism from userspace can
benefit from this new function.

The function __list_splice_rcu() has been added for clarity and to
follow the same pattern as in the existing list_splice*() interfaces,
where there is a check to ensure that the list to splice is not
empty. Note that __list_splice_rcu() has no documentation for this
reason.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v6: - just a rebase on top of net-next, net is still behind.

 include/linux/rculist.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 2abba7552605..e3bc44225692 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old,
 	old->prev = LIST_POISON2;
 }
 
+static inline void __list_splice_rcu(struct list_head *list,
+				     struct list_head *prev,
+				     struct list_head *next)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+
+	last->next = next;
+	first->prev = prev;
+	next->prev = last;
+	rcu_assign_pointer(list_next_rcu(prev), first);
+}
+
+/**
+ * list_splice_rcu - splice a non-RCU list into an RCU-protected list,
+ *                   designed for stacks.
+ * @list:	the non RCU-protected list to splice
+ * @head:	the place in the existing RCU-protected list to splice
+ *
+ * The list pointed to by @head can be RCU-read traversed concurrently with
+ * this function.
+ */
+static inline void list_splice_rcu(struct list_head *list,
+				   struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice_rcu(list, head, head->next);
+}
+
 /**
  * __list_splice_init_rcu - join an RCU-protected list into an existing list.
  * @list:	the RCU-protected list to splice
-- 
2.47.3


^ permalink raw reply related

* [PATCH nf,v6 1/4] netfilter: nf_tables: use list_del_rcu for netlink hooks
From: Pablo Neira Ayuso @ 2026-04-17  9:39 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, fw

From: Florian Westphal <fw@strlen.de>

nft_netdev_unregister_hooks and __nft_unregister_flowtable_net_hooks need
to use list_del_rcu(), this list can be walked by concurrent dumpers.

Add a new helper and use it consistently.

Fixes: f9a43007d3f7 ("netfilter: nf_tables: double hook unregistration in netns path")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v6: - missing fundamental patch dependency in v5
    - just a rebase on top of net-next, net is still behind.

 net/netfilter/nf_tables_api.c | 44 ++++++++++++++---------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8537b94653d3..07e151245765 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -374,6 +374,12 @@ static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
 	call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
 }
 
+static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook)
+{
+	list_del_rcu(&hook->list);
+	nft_netdev_hook_free_rcu(hook);
+}
+
 static void nft_netdev_unregister_hooks(struct net *net,
 					struct list_head *hook_list,
 					bool release_netdev)
@@ -384,10 +390,8 @@ static void nft_netdev_unregister_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nf_unregister_net_hook(net, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -2271,10 +2275,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
 
 		if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
 			list_for_each_entry_safe(hook, next,
-						 &basechain->hook_list, list) {
-				list_del_rcu(&hook->list);
-				nft_netdev_hook_free_rcu(hook);
-			}
+						 &basechain->hook_list, list)
+				nft_netdev_hook_unlink_free_rcu(hook);
 		}
 		module_put(basechain->type->owner);
 		if (rcu_access_pointer(basechain->stats)) {
@@ -2974,6 +2976,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 				list_for_each_entry(ops, &h->ops_list, list)
 					nf_unregister_net_hook(ctx->net, ops);
 			}
+			/* hook.list is on stack, no need for list_del_rcu() */
 			list_del(&h->list);
 			nft_netdev_hook_free_rcu(h);
 		}
@@ -8852,10 +8855,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list)
 			nft_unregister_flowtable_ops(net, flowtable, ops);
-		if (release_netdev) {
-			list_del(&hook->list);
-			nft_netdev_hook_free_rcu(hook);
-		}
+		if (release_netdev)
+			nft_netdev_hook_unlink_free_rcu(hook);
 	}
 }
 
@@ -8926,8 +8927,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 
 			nft_unregister_flowtable_ops(net, flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -8937,10 +8937,8 @@ static void nft_hooks_destroy(struct list_head *hook_list)
 {
 	struct nft_hook *hook, *next;
 
-	list_for_each_entry_safe(hook, next, hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	list_for_each_entry_safe(hook, next, hook_list, list)
+		nft_netdev_hook_unlink_free_rcu(hook);
 }
 
 static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
@@ -9028,8 +9026,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
 				nft_unregister_flowtable_ops(ctx->net,
 							     flowtable, ops);
 		}
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
+		nft_netdev_hook_unlink_free_rcu(hook);
 	}
 
 	return err;
@@ -9535,13 +9532,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	struct nft_hook *hook, *next;
-
 	flowtable->data.type->free(&flowtable->data);
-	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
-		list_del_rcu(&hook->list);
-		nft_netdev_hook_free_rcu(hook);
-	}
+	nft_hooks_destroy(&flowtable->hook_list);
 	kfree(flowtable->name);
 	module_put(flowtable->data.type->owner);
 	kfree(flowtable);
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox