LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v7 net-next 10/15] net: dsa: netc: introduce NXP NETC switch driver for i.MX94
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

For i.MX94 series, the NETC IP provides full 802.1Q Ethernet switch
functionality, advanced QoS with 8 traffic classes, and a full range of
TSN standards capabilities. The switch has 3 user ports and 1 CPU port;
the CPU port is connected to an internal ENETC. Since the switch and the
internal ENETC are fully integrated within the NETC IP, no back-to-back
MAC connection is required. Instead, a light-weight "pseudo MAC" is used
between the switch and the ENETC. This translates to lower power (less
logic and memory) and lower delay (as there is no serialization delay
across this link).

Introduce the initial NETC switch driver with basic probe and remove
functionality. More features will be added in subsequent patches.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 MAINTAINERS                           |  11 +
 drivers/net/dsa/Kconfig               |   2 +
 drivers/net/dsa/Makefile              |   1 +
 drivers/net/dsa/netc/Kconfig          |  15 +
 drivers/net/dsa/netc/Makefile         |   3 +
 drivers/net/dsa/netc/netc_main.c      | 600 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_platform.c  |  49 +++
 drivers/net/dsa/netc/netc_switch.h    |  92 ++++
 drivers/net/dsa/netc/netc_switch_hw.h | 133 ++++++
 9 files changed, 906 insertions(+)
 create mode 100644 drivers/net/dsa/netc/Kconfig
 create mode 100644 drivers/net/dsa/netc/Makefile
 create mode 100644 drivers/net/dsa/netc/netc_main.c
 create mode 100644 drivers/net/dsa/netc/netc_platform.c
 create mode 100644 drivers/net/dsa/netc/netc_switch.h
 create mode 100644 drivers/net/dsa/netc/netc_switch_hw.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 5bbbbde6b907..78d0a6038086 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19290,6 +19290,17 @@ F:	Documentation/devicetree/bindings/clock/*imx*
 F:	drivers/clk/imx/
 F:	include/dt-bindings/clock/*imx*
 
+NXP NETC ETHERNET SWITCH DRIVER
+M:	Wei Fang <wei.fang@nxp.com>
+R:	Clark Wang <xiaoning.wang@nxp.com>
+L:	imx@lists.linux.dev
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
+F:	drivers/net/dsa/netc/
+F:	include/linux/dsa/tag_netc.h
+F:	net/dsa/tag_netc.c
+
 NXP NETC TIMER PTP CLOCK DRIVER
 M:	Wei Fang <wei.fang@nxp.com>
 M:	Clark Wang <xiaoning.wang@nxp.com>
diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 39fb8ead16b5..4ab567c5bbaf 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -76,6 +76,8 @@ source "drivers/net/dsa/mv88e6xxx/Kconfig"
 
 source "drivers/net/dsa/mxl862xx/Kconfig"
 
+source "drivers/net/dsa/netc/Kconfig"
+
 source "drivers/net/dsa/ocelot/Kconfig"
 
 source "drivers/net/dsa/qca/Kconfig"
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index f5a463b87ec2..d2975badffc0 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -21,6 +21,7 @@ obj-y				+= lantiq/
 obj-y				+= microchip/
 obj-y				+= mv88e6xxx/
 obj-y				+= mxl862xx/
+obj-y				+= netc/
 obj-y				+= ocelot/
 obj-y				+= qca/
 obj-y				+= realtek/
diff --git a/drivers/net/dsa/netc/Kconfig b/drivers/net/dsa/netc/Kconfig
new file mode 100644
index 000000000000..0f246ac9e018
--- /dev/null
+++ b/drivers/net/dsa/netc/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NET_DSA_NETC_SWITCH
+	tristate "NXP NETC Ethernet switch support"
+	depends on ARM64 || COMPILE_TEST
+	depends on NET_DSA && PCI
+	select NET_DSA_TAG_NETC
+	select FSL_ENETC_MDIO
+	select NXP_NTMP
+	select NXP_NETC_LIB
+	help
+	  This driver supports the NXP NETC Ethernet switch, which is embedded
+	  as a PCIe function of the NXP NETC IP. Note that this driver is
+	  only available for NETC v4.3 and later versions.
+
+	  If compiled as a module (M), the module name is nxp-netc-switch.
diff --git a/drivers/net/dsa/netc/Makefile b/drivers/net/dsa/netc/Makefile
new file mode 100644
index 000000000000..4a5767562574
--- /dev/null
+++ b/drivers/net/dsa/netc/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NET_DSA_NETC_SWITCH) += nxp-netc-switch.o
+nxp-netc-switch-objs := netc_main.o netc_platform.o
diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
new file mode 100644
index 000000000000..8e3a3230226c
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC switch driver
+ * Copyright 2025-2026 NXP
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/fsl/enetc_mdio.h>
+#include <linux/if_vlan.h>
+#include <linux/of_mdio.h>
+
+#include "netc_switch.h"
+
+static enum dsa_tag_protocol
+netc_get_tag_protocol(struct dsa_switch *ds, int port,
+		      enum dsa_tag_protocol mprot)
+{
+	return DSA_TAG_PROTO_NETC;
+}
+
+static void netc_port_rmw(struct netc_port *np, u32 reg,
+			  u32 mask, u32 val)
+{
+	u32 old, new;
+
+	WARN_ON((mask | val) != mask);
+
+	old = netc_port_rd(np, reg);
+	new = (old & ~mask) | val;
+	if (new == old)
+		return;
+
+	netc_port_wr(np, reg, new);
+}
+
+static void netc_mac_port_wr(struct netc_port *np, u32 reg, u32 val)
+{
+	if (is_netc_pseudo_port(np))
+		return;
+
+	netc_port_wr(np, reg, val);
+	if (np->caps.pmac)
+		netc_port_wr(np, reg + NETC_PMAC_OFFSET, val);
+}
+
+static void netc_port_get_capability(struct netc_port *np)
+{
+	u32 val;
+
+	val = netc_port_rd(np, NETC_PMCAPR);
+	if (val & PMCAPR_HD)
+		np->caps.half_duplex = true;
+
+	if (FIELD_GET(PMCAPR_FP, val) == FP_SUPPORT)
+		np->caps.pmac = true;
+
+	val = netc_port_rd(np, NETC_PCAPR);
+	if (val & PCAPR_LINK_TYPE)
+		np->caps.pseudo_link = true;
+}
+
+static int netc_port_create_emdio_bus(struct netc_port *np,
+				      struct device_node *node)
+{
+	struct netc_switch *priv = np->switch_priv;
+	struct enetc_mdio_priv *mdio_priv;
+	struct device *dev = priv->dev;
+	struct enetc_hw *hw;
+	struct mii_bus *bus;
+	int err;
+
+	hw = enetc_hw_alloc(dev, np->iobase);
+	if (IS_ERR(hw))
+		return dev_err_probe(dev, PTR_ERR(hw),
+				     "Failed to allocate enetc_hw\n");
+
+	bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv));
+	if (!bus)
+		return -ENOMEM;
+
+	bus->name = "NXP NETC switch external MDIO Bus";
+	bus->read = enetc_mdio_read_c22;
+	bus->write = enetc_mdio_write_c22;
+	bus->read_c45 = enetc_mdio_read_c45;
+	bus->write_c45 = enetc_mdio_write_c45;
+	bus->parent = dev;
+	mdio_priv = bus->priv;
+	mdio_priv->hw = hw;
+	mdio_priv->mdio_base = NETC_EMDIO_BASE;
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-p%d-emdio",
+		 dev_name(dev), np->dp->index);
+
+	err = devm_of_mdiobus_register(dev, bus, node);
+	if (err)
+		return dev_err_probe(dev, err,
+				     "Cannot register EMDIO bus\n");
+
+	np->emdio = bus;
+
+	return 0;
+}
+
+static int netc_port_create_mdio_bus(struct netc_port *np,
+				     struct device_node *node)
+{
+	struct device_node *mdio_node;
+	int err;
+
+	mdio_node = of_get_child_by_name(node, "mdio");
+	if (mdio_node) {
+		err = netc_port_create_emdio_bus(np, mdio_node);
+		of_node_put(mdio_node);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int netc_init_switch_id(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	struct dsa_switch *ds = priv->ds;
+
+	/* The value 0 is reserved for the VEPA switch and cannot be
+	 * used. Therefore 'dsa,member' is a required property for the
+	 * NETC switch: its member value specifies the switch ID, which
+	 * must be non-zero. This keeps the hardware switch ID and the
+	 * software switch ID consistent.
+	 */
+	if (ds->index > FIELD_MAX(SWCR_SWID) || !ds->index) {
+		dev_err(priv->dev, "Switch index %d out of range\n",
+			ds->index);
+		return -ERANGE;
+	}
+
+	netc_base_wr(regs, NETC_SWCR, ds->index);
+
+	return 0;
+}
+
+static int netc_init_all_ports(struct netc_switch *priv)
+{
+	struct device *dev = priv->dev;
+	struct netc_port *np;
+	struct dsa_port *dp;
+	int err;
+
+	priv->ports = devm_kcalloc(dev, priv->info->num_ports,
+				   sizeof(struct netc_port *),
+				   GFP_KERNEL);
+	if (!priv->ports)
+		return -ENOMEM;
+
+	/* Some DSA callbacks, such as .port_disable() and
+	 * .port_stp_state_set(), may be invoked on a port even if it is
+	 * disabled. To avoid a crash caused by dereferencing a NULL port
+	 * pointer, every port gets its own allocation. The alternative,
+	 * checking for a NULL port pointer in each of these callbacks,
+	 * is difficult to cover exhaustively.
+	 */
+	for (int i = 0; i < priv->info->num_ports; i++) {
+		np = devm_kzalloc(dev, sizeof(*np), GFP_KERNEL);
+		if (!np)
+			return -ENOMEM;
+
+		np->switch_priv = priv;
+		np->iobase = priv->regs.port + PORT_IOBASE(i);
+		netc_port_get_capability(np);
+		priv->ports[i] = np;
+	}
+
+	dsa_switch_for_each_available_port(dp, priv->ds) {
+		np = priv->ports[dp->index];
+		np->dp = dp;
+
+		if (dsa_port_is_user(dp)) {
+			err = netc_port_create_mdio_bus(np, dp->dn);
+			if (err) {
+				dev_err(dev, "Failed to create MDIO bus\n");
+				return err;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void netc_init_ntmp_tbl_versions(struct netc_switch *priv)
+{
+	struct ntmp_user *ntmp = &priv->ntmp;
+
+	/* All tables default to version 0 */
+	memset(&ntmp->tbl, 0, sizeof(ntmp->tbl));
+}
+
+static int netc_init_all_cbdrs(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	struct ntmp_user *ntmp = &priv->ntmp;
+	int i, err;
+
+	ntmp->cbdr_num = NETC_CBDR_NUM;
+	ntmp->dev = priv->dev;
+	ntmp->ring = devm_kcalloc(ntmp->dev, ntmp->cbdr_num,
+				  sizeof(struct netc_cbdr),
+				  GFP_KERNEL);
+	if (!ntmp->ring)
+		return -ENOMEM;
+
+	for (i = 0; i < ntmp->cbdr_num; i++) {
+		struct netc_cbdr *cbdr = &ntmp->ring[i];
+		struct netc_cbdr_regs cbdr_regs;
+
+		cbdr_regs.pir = regs->base + NETC_CBDRPIR(i);
+		cbdr_regs.cir = regs->base + NETC_CBDRCIR(i);
+		cbdr_regs.mr = regs->base + NETC_CBDRMR(i);
+		cbdr_regs.bar0 = regs->base + NETC_CBDRBAR0(i);
+		cbdr_regs.bar1 = regs->base + NETC_CBDRBAR1(i);
+		cbdr_regs.lenr = regs->base + NETC_CBDRLENR(i);
+
+		err = ntmp_init_cbdr(cbdr, ntmp->dev, &cbdr_regs);
+		if (err)
+			goto free_cbdrs;
+	}
+
+	return 0;
+
+free_cbdrs:
+	for (i--; i >= 0; i--)
+		ntmp_free_cbdr(&ntmp->ring[i]);
+
+	return err;
+}
+
+static void netc_remove_all_cbdrs(struct netc_switch *priv)
+{
+	struct ntmp_user *ntmp = &priv->ntmp;
+
+	for (int i = 0; i < NETC_CBDR_NUM; i++)
+		ntmp_free_cbdr(&ntmp->ring[i]);
+}
+
+static int netc_init_ntmp_user(struct netc_switch *priv)
+{
+	netc_init_ntmp_tbl_versions(priv);
+
+	return netc_init_all_cbdrs(priv);
+}
+
+static void netc_free_ntmp_user(struct netc_switch *priv)
+{
+	netc_remove_all_cbdrs(priv);
+}
+
+static void netc_switch_dos_default_config(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	u32 val;
+
+	val = DOSL2CR_SAMEADDR | DOSL2CR_MSAMCC;
+	netc_base_wr(regs, NETC_DOSL2CR, val);
+
+	val = DOSL3CR_SAMEADDR | DOSL3CR_IPSAMCC;
+	netc_base_wr(regs, NETC_DOSL3CR, val);
+}
+
+static void netc_switch_vfht_default_config(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	u32 val;
+
+	val = netc_base_rd(regs, NETC_VFHTDECR2);
+
+	/* If no match is found in the VLAN Filter table, VFHTDECR2[MLO]
+	 * takes effect. VFHTDECR2[MLO] is set to "Software MAC learning
+	 * secure" by default. Note that BPCR[MLO] overrides VFHTDECR2[MLO]
+	 * if its value is not zero.
+	 */
+	val = u32_replace_bits(val, MLO_SW_SEC, VFHTDECR2_MLO);
+	val = u32_replace_bits(val, MFO_NO_MATCH_DISCARD, VFHTDECR2_MFO);
+	netc_base_wr(regs, NETC_VFHTDECR2, val);
+}
+
+static void netc_port_set_max_frame_size(struct netc_port *np,
+					 u32 max_frame_size)
+{
+	netc_mac_port_wr(np, NETC_PM_MAXFRM(0),
+			 max_frame_size & PM_MAXFRAM);
+}
+
+static void netc_switch_fixed_config(struct netc_switch *priv)
+{
+	netc_switch_dos_default_config(priv);
+	netc_switch_vfht_default_config(priv);
+}
+
+static void netc_port_set_tc_max_sdu(struct netc_port *np,
+				     int tc, u32 max_sdu)
+{
+	u32 val = FIELD_PREP(PTCTMSDUR_MAXSDU, max_sdu) |
+		  FIELD_PREP(PTCTMSDUR_SDU_TYPE, SDU_TYPE_MPDU);
+
+	netc_port_wr(np, NETC_PTCTMSDUR(tc), val);
+}
+
+static void netc_port_set_all_tc_msdu(struct netc_port *np)
+{
+	for (int tc = 0; tc < NETC_TC_NUM; tc++)
+		netc_port_set_tc_max_sdu(np, tc, NETC_MAX_FRAME_LEN);
+}
+
+static void netc_port_set_mlo(struct netc_port *np, enum netc_mlo mlo)
+{
+	netc_port_rmw(np, NETC_BPCR, BPCR_MLO, FIELD_PREP(BPCR_MLO, mlo));
+}
+
+static void netc_port_fixed_config(struct netc_port *np)
+{
+	/* Default IPV and DR setting */
+	netc_port_rmw(np, NETC_PQOSMR, PQOSMR_VS | PQOSMR_VE,
+		      PQOSMR_VS | PQOSMR_VE);
+
+	/* Enable L2 and L3 DOS */
+	netc_port_rmw(np, NETC_PCR, PCR_L2DOSE | PCR_L3DOSE,
+		      PCR_L2DOSE | PCR_L3DOSE);
+}
+
+static void netc_port_default_config(struct netc_port *np)
+{
+	netc_port_fixed_config(np);
+
+	/* Default VLAN unaware */
+	netc_port_rmw(np, NETC_BPDVR, BPDVR_RXVAM, BPDVR_RXVAM);
+
+	if (dsa_port_is_cpu(np->dp))
+		/* For CPU port, source port pruning is disabled */
+		netc_port_rmw(np, NETC_BPCR, BPCR_SRCPRND, BPCR_SRCPRND);
+	else
+		netc_port_set_mlo(np, MLO_DISABLE);
+
+	netc_port_set_max_frame_size(np, NETC_MAX_FRAME_LEN);
+	netc_port_set_all_tc_msdu(np);
+}
+
+static int netc_setup(struct dsa_switch *ds)
+{
+	struct netc_switch *priv = ds->priv;
+	struct dsa_port *dp;
+	int err;
+
+	err = netc_init_switch_id(priv);
+	if (err)
+		return err;
+
+	err = netc_init_all_ports(priv);
+	if (err)
+		return err;
+
+	err = netc_init_ntmp_user(priv);
+	if (err)
+		return err;
+
+	netc_switch_fixed_config(priv);
+
+	/* default setting for ports */
+	dsa_switch_for_each_available_port(dp, ds)
+		netc_port_default_config(priv->ports[dp->index]);
+
+	return 0;
+}
+
+static void netc_teardown(struct dsa_switch *ds)
+{
+	struct netc_switch *priv = ds->priv;
+
+	netc_free_ntmp_user(priv);
+}
+
+static bool netc_port_is_emdio_consumer(struct device_node *node)
+{
+	struct device_node *mdio_node;
+
+	/* If the port node has a phy-handle property (checked by
+	 * the caller) and does not contain an "mdio" child node,
+	 * then the port is an EMDIO consumer.
+	 */
+	mdio_node = of_get_child_by_name(node, "mdio");
+	if (!mdio_node)
+		return true;
+
+	of_node_put(mdio_node);
+
+	return false;
+}
+
+/* Currently, phylink_of_phy_connect() is called by dsa_user_create().
+ * If the switch uses an external MDIO controller (like the EMDIO
+ * function) to manage the external PHYs, the MDIO bus may not have
+ * been created when phylink_of_phy_connect() is called, so it will
+ * return an error and cause the switch driver to fail to probe.
+ * This workaround can be removed when DSA phylink_of_phy_connect()
+ * calls are moved from probe() to ndo_open().
+ */
+static int netc_switch_check_emdio_is_ready(struct device *dev)
+{
+	struct device_node *ports, *phy_node;
+	struct phy_device *phydev;
+	int err = 0;
+
+	ports = of_get_child_by_name(dev->of_node, "ethernet-ports");
+	if (!ports) {
+		dev_err(dev, "Cannot find the ethernet-ports node\n");
+		return -EINVAL;
+	}
+
+	for_each_available_child_of_node_scoped(ports, child) {
+		/* If the node does not have a phy-handle property, then
+		 * the port is not connected to a PHY, so the port is not
+		 * an EMDIO consumer.
+		 */
+		phy_node = of_parse_phandle(child, "phy-handle", 0);
+		if (!phy_node)
+			continue;
+
+		/* Note that from the hardware perspective, the switch ports
+		 * do not support sharing the MDIO bus defined under one port.
+		 * Each port can only access its own external PHY through its
+		 * port MDIO bus.
+		 */
+		if (!netc_port_is_emdio_consumer(child)) {
+			of_node_put(phy_node);
+			continue;
+		}
+
+		phydev = of_phy_find_device(phy_node);
+		of_node_put(phy_node);
+		if (!phydev) {
+			err = -EPROBE_DEFER;
+			goto out;
+		}
+
+		put_device(&phydev->mdio.dev);
+	}
+
+out:
+	of_node_put(ports);
+
+	return err;
+}
+
+static int netc_switch_pci_init(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct netc_switch_regs *regs;
+	struct netc_switch *priv;
+	void __iomem *base;
+	int err;
+
+	pcie_flr(pdev);
+	err = pcim_enable_device(pdev);
+	if (err)
+		return dev_err_probe(dev, err, "Failed to enable device\n");
+
+	err = pcim_request_all_regions(pdev, KBUILD_MODNAME);
+	if (err)
+		return dev_err_probe(dev, err, "Failed to request regions\n");
+
+	/* The command BD rings and NTMP tables need DMA. There is no need
+	 * to check the return value, because the call never fails when the
+	 * mask is DMA_BIT_MASK(64); see dma-api-howto.rst.
+	 */
+	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+
+	if (pci_resource_len(pdev, NETC_REGS_BAR) < NETC_REGS_SIZE) {
+		return dev_err_probe(dev, -EINVAL,
+				     "Invalid register space size\n");
+	}
+
+	base = pcim_iomap(pdev, NETC_REGS_BAR, 0);
+	if (!base)
+		return dev_err_probe(dev, -ENXIO, "pcim_iomap() failed\n");
+
+	pci_set_master(pdev);
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->pdev = pdev;
+	priv->dev = dev;
+
+	regs = &priv->regs;
+	regs->base = base;
+	regs->port = regs->base + NETC_REGS_PORT_BASE;
+	regs->global = regs->base + NETC_REGS_GLOBAL_BASE;
+	pci_set_drvdata(pdev, priv);
+
+	return 0;
+}
+
+static void netc_switch_get_ip_revision(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	u32 val = netc_glb_rd(regs, NETC_IPBRR0);
+
+	priv->revision = FIELD_GET(IPBRR0_IP_REV, val);
+}
+
+static const struct dsa_switch_ops netc_switch_ops = {
+	.get_tag_protocol		= netc_get_tag_protocol,
+	.setup				= netc_setup,
+	.teardown			= netc_teardown,
+};
+
+static int netc_switch_probe(struct pci_dev *pdev,
+			     const struct pci_device_id *id)
+{
+	struct device_node *node = dev_of_node(&pdev->dev);
+	struct device *dev = &pdev->dev;
+	struct netc_switch *priv;
+	struct dsa_switch *ds;
+	int err;
+
+	if (!node)
+		return dev_err_probe(dev, -ENODEV,
+				     "No DT bindings, skipping\n");
+
+	err = netc_switch_check_emdio_is_ready(dev);
+	if (err)
+		return err;
+
+	err = netc_switch_pci_init(pdev);
+	if (err)
+		return err;
+
+	priv = pci_get_drvdata(pdev);
+	netc_switch_get_ip_revision(priv);
+
+	err = netc_switch_platform_probe(priv);
+	if (err)
+		return err;
+
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
+	if (!ds)
+		return -ENOMEM;
+
+	ds->dev = dev;
+	ds->num_ports = priv->info->num_ports;
+	ds->num_tx_queues = NETC_TC_NUM;
+	ds->ops = &netc_switch_ops;
+	ds->priv = priv;
+	priv->ds = ds;
+
+	err = dsa_register_switch(ds);
+	if (err)
+		return dev_err_probe(dev, err,
+				     "Failed to register DSA switch\n");
+
+	return 0;
+}
+
+static void netc_switch_remove(struct pci_dev *pdev)
+{
+	struct netc_switch *priv = pci_get_drvdata(pdev);
+
+	if (!priv)
+		return;
+
+	dsa_unregister_switch(priv->ds);
+}
+
+static void netc_switch_shutdown(struct pci_dev *pdev)
+{
+	struct netc_switch *priv = pci_get_drvdata(pdev);
+
+	if (!priv)
+		return;
+
+	dsa_switch_shutdown(priv->ds);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static const struct pci_device_id netc_switch_ids[] = {
+	{ PCI_DEVICE(NETC_SWITCH_VENDOR_ID, NETC_SWITCH_DEVICE_ID) },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, netc_switch_ids);
+
+static struct pci_driver netc_switch_driver = {
+	.name		= KBUILD_MODNAME,
+	.id_table	= netc_switch_ids,
+	.probe		= netc_switch_probe,
+	.remove		= netc_switch_remove,
+	.shutdown	= netc_switch_shutdown,
+};
+module_pci_driver(netc_switch_driver);
+
+MODULE_DESCRIPTION("NXP NETC Switch driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/dsa/netc/netc_platform.c b/drivers/net/dsa/netc/netc_platform.c
new file mode 100644
index 000000000000..abd599ea9c8d
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_platform.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC switch driver
+ * Copyright 2025-2026 NXP
+ */
+
+#include "netc_switch.h"
+
+struct netc_switch_platform {
+	u16 revision;
+	const struct netc_switch_info *info;
+};
+
+static const struct netc_switch_info imx94_info = {
+	.num_ports = 4,
+};
+
+static const struct netc_switch_platform netc_platforms[] = {
+	{ .revision = NETC_SWITCH_REV_4_3, .info = &imx94_info, },
+	{ }
+};
+
+static const struct netc_switch_info *
+netc_switch_get_info(struct netc_switch *priv)
+{
+	int i;
+
+	/* Matching based on IP revision */
+	for (i = 0; i < ARRAY_SIZE(netc_platforms); i++) {
+		if (priv->revision == netc_platforms[i].revision)
+			return netc_platforms[i].info;
+	}
+
+	return NULL;
+}
+
+int netc_switch_platform_probe(struct netc_switch *priv)
+{
+	const struct netc_switch_info *info = netc_switch_get_info(priv);
+
+	if (!info) {
+		dev_err(priv->dev, "Cannot find switch platform info\n");
+		return -EINVAL;
+	}
+
+	priv->info = info;
+
+	return 0;
+}
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
new file mode 100644
index 000000000000..a6d36dcebc6d
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2025-2026 NXP
+ */
+
+#ifndef _NETC_SWITCH_H
+#define _NETC_SWITCH_H
+
+#include <linux/dsa/tag_netc.h>
+#include <linux/fsl/netc_global.h>
+#include <linux/fsl/ntmp.h>
+#include <linux/of_device.h>
+#include <linux/of_net.h>
+#include <linux/pci.h>
+
+#include "netc_switch_hw.h"
+
+#define NETC_REGS_BAR			0
+#define NETC_REGS_SIZE			0x80000
+#define NETC_MSIX_TBL_BAR		2
+#define NETC_REGS_PORT_BASE		0x4000
+/* register block size per port  */
+#define NETC_REGS_PORT_SIZE		0x4000
+#define PORT_IOBASE(p)			(NETC_REGS_PORT_SIZE * (p))
+#define NETC_REGS_GLOBAL_BASE		0x70000
+
+#define NETC_SWITCH_REV_4_3		0x0403
+
+#define NETC_TC_NUM			8
+#define NETC_CBDR_NUM			2
+
+#define NETC_MAX_FRAME_LEN		9600
+
+struct netc_switch;
+
+struct netc_switch_info {
+	u32 num_ports;
+};
+
+struct netc_port_caps {
+	u32 half_duplex:1; /* indicates whether the port supports half-duplex */
+	u32 pmac:1;	  /* indicates whether the port has preemption MAC */
+	u32 pseudo_link:1;
+};
+
+struct netc_port {
+	void __iomem *iobase;
+	struct netc_switch *switch_priv;
+	struct netc_port_caps caps;
+	struct dsa_port *dp;
+	struct mii_bus *emdio;
+};
+
+struct netc_switch_regs {
+	void __iomem *base;
+	void __iomem *port;
+	void __iomem *global;
+};
+
+struct netc_switch {
+	struct pci_dev *pdev;
+	struct device *dev;
+	struct dsa_switch *ds;
+	u16 revision;
+
+	const struct netc_switch_info *info;
+	struct netc_switch_regs regs;
+	struct netc_port **ports;
+
+	struct ntmp_user ntmp;
+};
+
+/* Write/Read Switch base registers */
+#define netc_base_rd(r, o)		netc_read((r)->base + (o))
+#define netc_base_wr(r, o, v)		netc_write((r)->base + (o), v)
+
+/* Write/Read registers of Switch Port (including pseudo MAC port) */
+#define netc_port_rd(p, o)		netc_read((p)->iobase + (o))
+#define netc_port_wr(p, o, v)		netc_write((p)->iobase + (o), v)
+
+/* Write/Read Switch global registers */
+#define netc_glb_rd(r, o)		netc_read((r)->global + (o))
+#define netc_glb_wr(r, o, v)		netc_write((r)->global + (o), v)
+
+static inline bool is_netc_pseudo_port(struct netc_port *np)
+{
+	return np->caps.pseudo_link;
+}
+
+int netc_switch_platform_probe(struct netc_switch *priv);
+
+#endif
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
new file mode 100644
index 000000000000..0419f7f9207e
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2025-2026 NXP
+ */
+
+#ifndef _NETC_SWITCH_HW_H
+#define _NETC_SWITCH_HW_H
+
+#include <linux/bitops.h>
+
+#define NETC_SWITCH_VENDOR_ID		0x1131
+#define NETC_SWITCH_DEVICE_ID		0xeef2
+
+/* Definition of Switch base registers */
+#define NETC_CBDRMR(a)			(0x0800 + (a) * 0x30)
+#define NETC_CBDRBAR0(a)		(0x0810 + (a) * 0x30)
+#define NETC_CBDRBAR1(a)		(0x0814 + (a) * 0x30)
+#define NETC_CBDRPIR(a)			(0x0818 + (a) * 0x30)
+#define NETC_CBDRCIR(a)			(0x081c + (a) * 0x30)
+#define NETC_CBDRLENR(a)		(0x0820 + (a) * 0x30)
+
+#define NETC_SWCR			0x1018
+#define  SWCR_SWID			GENMASK(2, 0)
+
+#define NETC_DOSL2CR			0x1220
+#define  DOSL2CR_SAMEADDR		BIT(0)
+#define  DOSL2CR_MSAMCC			BIT(1)
+
+#define NETC_DOSL3CR			0x1224
+#define  DOSL3CR_SAMEADDR		BIT(0)
+#define  DOSL3CR_IPSAMCC		BIT(1)
+
+#define NETC_VFHTDECR1			0x2014
+#define NETC_VFHTDECR2			0x2018
+#define  VFHTDECR2_ET_PORT(a)		BIT((a))
+#define  VFHTDECR2_MLO			GENMASK(26, 24)
+#define  VFHTDECR2_MFO			GENMASK(28, 27)
+
+/* Definition of Switch port registers */
+#define NETC_PCAPR			0x0000
+#define  PCAPR_LINK_TYPE		BIT(4)
+#define  PCAPR_NUM_TC			GENMASK(15, 12)
+#define  PCAPR_NUM_Q			GENMASK(19, 16)
+#define  PCAPR_NUM_CG			GENMASK(27, 24)
+#define  PCAPR_TGS			BIT(28)
+#define  PCAPR_CBS			BIT(29)
+
+#define NETC_PMCAPR			0x0004
+#define  PMCAPR_HD			BIT(8)
+#define  PMCAPR_FP			GENMASK(10, 9)
+#define   FP_SUPPORT			2
+
+#define NETC_PCR			0x0010
+#define  PCR_HDR_FMT			BIT(0)
+#define  PCR_NS_TAG_PORT		BIT(3)
+#define  PCR_L2DOSE			BIT(4)
+#define  PCR_L3DOSE			BIT(5)
+#define  PCR_TIMER_CS			BIT(8)
+#define  PCR_PSPEED			GENMASK(29, 16)
+#define   PSPEED_SET_VAL(s)		FIELD_PREP(PCR_PSPEED, ((s) / 10 - 1))
+
+#define NETC_PQOSMR			0x0054
+#define  PQOSMR_VS			BIT(0)
+#define  PQOSMR_VE			BIT(1)
+#define  PQOSMR_DDR			GENMASK(3, 2)
+#define  PQOSMR_DIPV			GENMASK(6, 4)
+#define  PQOSMR_VQMP			GENMASK(19, 16)
+#define  PQOSMR_QVMP			GENMASK(23, 20)
+
+#define NETC_PTCTMSDUR(a)		(0x208 + (a) * 0x20)
+#define  PTCTMSDUR_MAXSDU		GENMASK(15, 0)
+#define  PTCTMSDUR_SDU_TYPE		GENMASK(17, 16)
+#define   SDU_TYPE_PPDU			0
+#define   SDU_TYPE_MPDU			1
+#define   SDU_TYPE_MSDU			2
+
+#define NETC_BPCR			0x500
+#define  BPCR_DYN_LIMIT			GENMASK(15, 0)
+#define  BPCR_MLO			GENMASK(22, 20)
+#define  BPCR_UUCASTE			BIT(24)
+#define  BPCR_UMCASTE			BIT(25)
+#define  BPCR_MCASTE			BIT(26)
+#define  BPCR_BCASTE			BIT(27)
+#define  BPCR_STAMVD			BIT(28)
+#define  BPCR_SRCPRND			BIT(29)
+
+/* MAC learning options, see BPCR[MLO], VFHTDECR2[MLO] and
+ * VLAN Filter Table CFGE_DATA[MLO]
+ */
+enum netc_mlo {
+	MLO_NOT_OVERRIDE = 0,
+	MLO_DISABLE,
+	MLO_HW,
+	MLO_SW_SEC,
+	MLO_SW_UNSEC,
+	MLO_DISABLE_SMAC,
+};
+
+/* MAC forwarding options, see VFHTDECR2[MFO] and VLAN
+ * Filter Table CFGE_DATA[MFO]
+ */
+enum netc_mfo {
+	MFO_NO_FDB_LOOKUP = 1,
+	MFO_NO_MATCH_FLOOD,
+	MFO_NO_MATCH_DISCARD,
+};
+
+#define NETC_BPDVR			0x510
+#define  BPDVR_VID			GENMASK(11, 0)
+#define  BPDVR_DEI			BIT(12)
+#define  BPDVR_PCP			GENMASK(15, 13)
+#define  BPDVR_TPID			BIT(16)
+#define  BPDVR_RXTAGA			GENMASK(23, 20)
+#define  BPDVR_RXVAM			BIT(24)
+#define  BPDVR_TXTAGA			GENMASK(26, 25)
+
+/* Definition of Switch ethernet MAC port registers */
+#define NETC_PMAC_OFFSET		0x400
+#define NETC_PM_CMD_CFG(a)		(0x1008 + (a) * 0x400)
+#define  PM_CMD_CFG_TX_EN		BIT(0)
+#define  PM_CMD_CFG_RX_EN		BIT(1)
+
+#define NETC_PM_MAXFRM(a)		(0x1014 + (a) * 0x400)
+#define  PM_MAXFRAM			GENMASK(15, 0)
+
+#define NETC_PEMDIOCR			0x1c00
+#define NETC_EMDIO_BASE			NETC_PEMDIOCR
+
+/* Definition of global registers (read only) */
+#define NETC_IPBRR0			0x0bf8
+#define  IPBRR0_IP_REV			GENMASK(15, 0)
+
+#endif
-- 
2.34.1



^ permalink raw reply related

* [PATCH v7 net-next 11/15] net: dsa: netc: add phylink MAC operations
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

Different versions of NETC switches have different numbers of ports and
MAC capabilities. Add .phylink_get_caps() to struct netc_switch_info,
allowing each NETC switch version to implement its own callback for
obtaining MAC capabilities.

Implement the phylink_mac_ops callbacks: .mac_config(), .mac_link_up(),
and .mac_link_down(). Note that flow-control configuration is not yet
supported in .mac_link_up(), but will be implemented in a subsequent
patch.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
---
 drivers/net/dsa/netc/netc_main.c      | 247 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_platform.c  |  38 ++++
 drivers/net/dsa/netc/netc_switch.h    |   4 +
 drivers/net/dsa/netc/netc_switch_hw.h |  26 +++
 4 files changed, 315 insertions(+)

diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
index 8e3a3230226c..2141b3aa96b7 100644
--- a/drivers/net/dsa/netc/netc_main.c
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -43,6 +43,30 @@ static void netc_mac_port_wr(struct netc_port *np, u32 reg, u32 val)
 		netc_port_wr(np, reg + NETC_PMAC_OFFSET, val);
 }
 
+/* netc_mac_port_rmw() is used to synchronize the configurations of eMAC
+ * and pMAC to maintain consistency. This function should not be used if
+ * differentiated settings are required.
+ */
+static void netc_mac_port_rmw(struct netc_port *np, u32 reg,
+			      u32 mask, u32 val)
+{
+	u32 old, new;
+
+	if (is_netc_pseudo_port(np))
+		return;
+
+	WARN_ON((mask | val) != mask);
+
+	old = netc_port_rd(np, reg);
+	new = (old & ~mask) | val;
+	if (new == old)
+		return;
+
+	netc_port_wr(np, reg, new);
+	if (np->caps.pmac)
+		netc_port_wr(np, reg + NETC_PMAC_OFFSET, new);
+}
+
 static void netc_port_get_capability(struct netc_port *np)
 {
 	u32 val;
@@ -507,10 +531,232 @@ static void netc_switch_get_ip_revision(struct netc_switch *priv)
 	priv->revision = FIELD_GET(IPBRR0_IP_REV, val);
 }
 
+static void netc_phylink_get_caps(struct dsa_switch *ds, int port,
+				  struct phylink_config *config)
+{
+	struct netc_switch *priv = ds->priv;
+
+	priv->info->phylink_get_caps(port, config);
+}
+
+static void netc_port_set_mac_mode(struct netc_port *np,
+				   unsigned int mode,
+				   phy_interface_t phy_mode)
+{
+	u32 mask = PM_IF_MODE_IFMODE | PM_IF_MODE_REVMII;
+	u32 val = 0;
+
+	switch (phy_mode) {
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		val |= IFMODE_RGMII;
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		val |= IFMODE_RMII;
+		break;
+	case PHY_INTERFACE_MODE_REVMII:
+		val |= PM_IF_MODE_REVMII;
+		fallthrough;
+	case PHY_INTERFACE_MODE_MII:
+		val |= IFMODE_MII;
+		break;
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_2500BASEX:
+		val |= IFMODE_SGMII;
+		break;
+	default:
+		break;
+	}
+
+	netc_mac_port_rmw(np, NETC_PM_IF_MODE(0), mask, val);
+}
+
+static void netc_mac_config(struct phylink_config *config, unsigned int mode,
+			    const struct phylink_link_state *state)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+
+	netc_port_set_mac_mode(NETC_PORT(dp->ds, dp->index), mode,
+			       state->interface);
+}
+
+static void netc_port_set_speed(struct netc_port *np, int speed)
+{
+	netc_port_rmw(np, NETC_PCR, PCR_PSPEED, PSPEED_SET_VAL(speed));
+}
+
+static void netc_port_set_rgmii_mac(struct netc_port *np,
+				    int speed, int duplex)
+{
+	u32 mask, val;
+
+	mask = PM_IF_MODE_SSP | PM_IF_MODE_HD | PM_IF_MODE_M10;
+
+	switch (speed) {
+	default:
+	case SPEED_1000:
+		val = FIELD_PREP(PM_IF_MODE_SSP, SSP_1G);
+		break;
+	case SPEED_100:
+		val = FIELD_PREP(PM_IF_MODE_SSP, SSP_100M);
+		break;
+	case SPEED_10:
+		val = FIELD_PREP(PM_IF_MODE_SSP, SSP_10M);
+		break;
+	}
+
+	if (duplex != DUPLEX_FULL)
+		val |= PM_IF_MODE_HD;
+
+	netc_mac_port_rmw(np, NETC_PM_IF_MODE(0), mask, val);
+}
+
+static void netc_port_set_rmii_mii_mac(struct netc_port *np,
+				       int speed, int duplex)
+{
+	u32 mask, val = 0;
+
+	mask = PM_IF_MODE_SSP | PM_IF_MODE_HD | PM_IF_MODE_M10;
+
+	if (speed == SPEED_10)
+		val |= PM_IF_MODE_M10;
+
+	if (duplex != DUPLEX_FULL)
+		val |= PM_IF_MODE_HD;
+
+	netc_mac_port_rmw(np, NETC_PM_IF_MODE(0), mask, val);
+}
+
+static void netc_port_mac_rx_enable(struct netc_port *np)
+{
+	netc_port_rmw(np, NETC_POR, POR_RXDIS, 0);
+	netc_mac_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_RX_EN,
+			  PM_CMD_CFG_RX_EN);
+}
+
+static void netc_port_wait_rx_empty(struct netc_port *np, int mac)
+{
+	u32 val;
+
+	/* PM_IEVENT_RX_EMPTY is a read-only bit. Hardware sets it when the
+	 * RX FIFO is empty and no RX packet reception is in progress, and
+	 * clears it when the RX FIFO is not empty or an RX packet is being
+	 * received.
+	 */
+	if (read_poll_timeout(netc_port_rd, val, val & PM_IEVENT_RX_EMPTY,
+			      100, 10000, false, np, NETC_PM_IEVENT(mac)))
+		dev_warn(np->switch_priv->dev,
+			 "swp%d MAC%d: RX is not idle\n", np->dp->index, mac);
+}
+
+static void netc_port_mac_rx_graceful_stop(struct netc_port *np)
+{
+	u32 val;
+
+	if (is_netc_pseudo_port(np))
+		goto rx_disable;
+
+	if (np->caps.pmac) {
+		netc_port_rmw(np, NETC_PM_CMD_CFG(1), PM_CMD_CFG_RX_EN, 0);
+		netc_port_wait_rx_empty(np, 1);
+	}
+
+	netc_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_RX_EN, 0);
+	netc_port_wait_rx_empty(np, 0);
+
+	if (read_poll_timeout(netc_port_rd, val, !(val & PSR_RX_BUSY),
+			      100, 10000, false, np, NETC_PSR))
+		dev_warn(np->switch_priv->dev, "swp%d RX is busy\n",
+			 np->dp->index);
+
+rx_disable:
+	netc_port_rmw(np, NETC_POR, POR_RXDIS, POR_RXDIS);
+}
+
+static void netc_port_mac_tx_enable(struct netc_port *np)
+{
+	netc_mac_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_TX_EN,
+			  PM_CMD_CFG_TX_EN);
+	netc_port_rmw(np, NETC_POR, POR_TXDIS, 0);
+}
+
+static void netc_port_wait_tx_empty(struct netc_port *np, int mac)
+{
+	u32 val;
+
+	/* PM_IEVENT_TX_EMPTY is a read-only bit. Hardware sets it when the
+	 * TX FIFO is empty and clears it automatically when the TX FIFO is
+	 * not empty.
+	 */
+	if (read_poll_timeout(netc_port_rd, val, val & PM_IEVENT_TX_EMPTY,
+			      100, 10000, false, np, NETC_PM_IEVENT(mac)))
+		dev_warn(np->switch_priv->dev,
+			 "swp%d MAC%d: TX FIFO is not empty\n",
+			 np->dp->index, mac);
+}
+
+static void netc_port_mac_tx_graceful_stop(struct netc_port *np)
+{
+	netc_port_rmw(np, NETC_POR, POR_TXDIS, POR_TXDIS);
+
+	if (is_netc_pseudo_port(np))
+		return;
+
+	netc_port_wait_tx_empty(np, 0);
+	if (np->caps.pmac)
+		netc_port_wait_tx_empty(np, 1);
+
+	netc_mac_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_TX_EN, 0);
+}
+
+static void netc_mac_link_up(struct phylink_config *config,
+			     struct phy_device *phy, unsigned int mode,
+			     phy_interface_t interface, int speed,
+			     int duplex, bool tx_pause, bool rx_pause)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+	struct netc_port *np;
+
+	np = NETC_PORT(dp->ds, dp->index);
+	netc_port_set_speed(np, speed);
+
+	if (phy_interface_mode_is_rgmii(interface))
+		netc_port_set_rgmii_mac(np, speed, duplex);
+
+	if (interface == PHY_INTERFACE_MODE_RMII ||
+	    interface == PHY_INTERFACE_MODE_REVMII ||
+	    interface == PHY_INTERFACE_MODE_MII)
+		netc_port_set_rmii_mii_mac(np, speed, duplex);
+
+	netc_port_mac_tx_enable(np);
+	netc_port_mac_rx_enable(np);
+}
+
+static void netc_mac_link_down(struct phylink_config *config,
+			       unsigned int mode,
+			       phy_interface_t interface)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+	struct netc_port *np;
+
+	np = NETC_PORT(dp->ds, dp->index);
+	netc_port_mac_rx_graceful_stop(np);
+	netc_port_mac_tx_graceful_stop(np);
+}
+
+static const struct phylink_mac_ops netc_phylink_mac_ops = {
+	.mac_config		= netc_mac_config,
+	.mac_link_up		= netc_mac_link_up,
+	.mac_link_down		= netc_mac_link_down,
+};
+
 static const struct dsa_switch_ops netc_switch_ops = {
 	.get_tag_protocol		= netc_get_tag_protocol,
 	.setup				= netc_setup,
 	.teardown			= netc_teardown,
+	.phylink_get_caps		= netc_phylink_get_caps,
 };
 
 static int netc_switch_probe(struct pci_dev *pdev,
@@ -549,6 +795,7 @@ static int netc_switch_probe(struct pci_dev *pdev,
 	ds->num_ports = priv->info->num_ports;
 	ds->num_tx_queues = NETC_TC_NUM;
 	ds->ops = &netc_switch_ops;
+	ds->phylink_mac_ops = &netc_phylink_mac_ops;
 	ds->priv = priv;
 	priv->ds = ds;
 
diff --git a/drivers/net/dsa/netc/netc_platform.c b/drivers/net/dsa/netc/netc_platform.c
index abd599ea9c8d..bb4f92d238cb 100644
--- a/drivers/net/dsa/netc/netc_platform.c
+++ b/drivers/net/dsa/netc/netc_platform.c
@@ -11,8 +11,46 @@ struct netc_switch_platform {
 	const struct netc_switch_info *info;
 };
 
+static void imx94_switch_phylink_get_caps(int port,
+					  struct phylink_config *config)
+{
+	config->mac_capabilities = MAC_1000FD;
+
+	switch (port) {
+	case 0 ... 1:
+		__set_bit(PHY_INTERFACE_MODE_SGMII,
+			  config->supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_2500BASEX,
+			  config->supported_interfaces);
+		config->mac_capabilities |= MAC_2500FD;
+		fallthrough;
+	case 2:
+		config->mac_capabilities |= MAC_10 | MAC_100;
+		__set_bit(PHY_INTERFACE_MODE_MII,
+			  config->supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_RMII,
+			  config->supported_interfaces);
+		/* Port 0 and 1 do not support REVMII */
+		if (port == 2)
+			__set_bit(PHY_INTERFACE_MODE_REVMII,
+				  config->supported_interfaces);
+
+		phy_interface_set_rgmii(config->supported_interfaces);
+		break;
+	case 3: /* CPU port */
+		__set_bit(PHY_INTERFACE_MODE_INTERNAL,
+			  config->supported_interfaces);
+		config->mac_capabilities |= MAC_10FD | MAC_100FD |
+					    MAC_2500FD;
+		break;
+	default:
+		break;
+	}
+}
+
 static const struct netc_switch_info imx94_info = {
 	.num_ports = 4,
+	.phylink_get_caps = imx94_switch_phylink_get_caps,
 };
 
 static const struct netc_switch_platform netc_platforms[] = {
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
index a6d36dcebc6d..ac9743da2a1e 100644
--- a/drivers/net/dsa/netc/netc_switch.h
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -35,6 +35,7 @@ struct netc_switch;
 
 struct netc_switch_info {
 	u32 num_ports;
+	void (*phylink_get_caps)(int port, struct phylink_config *config);
 };
 
 struct netc_port_caps {
@@ -70,6 +71,9 @@ struct netc_switch {
 	struct ntmp_user ntmp;
 };
 
+#define NETC_PRIV(ds)			((struct netc_switch *)((ds)->priv))
+#define NETC_PORT(ds, port_id)		(NETC_PRIV(ds)->ports[(port_id)])
+
 /* Write/Read Switch base registers */
 #define netc_base_rd(r, o)		netc_read((r)->base + (o))
 #define netc_base_wr(r, o, v)		netc_write((r)->base + (o), v)
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
index 0419f7f9207e..7d9afb493053 100644
--- a/drivers/net/dsa/netc/netc_switch_hw.h
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -67,6 +67,14 @@
 #define  PQOSMR_VQMP			GENMASK(19, 16)
 #define  PQOSMR_QVMP			GENMASK(23, 20)
 
+#define NETC_POR			0x100
+#define  POR_TXDIS			BIT(0)
+#define  POR_RXDIS			BIT(1)
+
+#define NETC_PSR			0x104
+#define  PSR_TX_BUSY			BIT(0)
+#define  PSR_RX_BUSY			BIT(1)
+
 #define NETC_PTCTMSDUR(a)		(0x208 + (a) * 0x20)
 #define  PTCTMSDUR_MAXSDU		GENMASK(15, 0)
 #define  PTCTMSDUR_SDU_TYPE		GENMASK(17, 16)
@@ -123,6 +131,24 @@ enum netc_mfo {
 #define NETC_PM_MAXFRM(a)		(0x1014 + (a) * 0x400)
 #define  PM_MAXFRAM			GENMASK(15, 0)
 
+#define NETC_PM_IEVENT(a)		(0x1040 + (a) * 0x400)
+#define  PM_IEVENT_TX_EMPTY		BIT(5)
+#define  PM_IEVENT_RX_EMPTY		BIT(6)
+
+#define NETC_PM_IF_MODE(a)		(0x1300 + (a) * 0x400)
+#define  PM_IF_MODE_IFMODE		GENMASK(2, 0)
+#define   IFMODE_MII			1
+#define   IFMODE_RMII			3
+#define   IFMODE_RGMII			4
+#define   IFMODE_SGMII			5
+#define  PM_IF_MODE_REVMII		BIT(3)
+#define  PM_IF_MODE_M10			BIT(4)
+#define  PM_IF_MODE_HD			BIT(6)
+#define  PM_IF_MODE_SSP			GENMASK(14, 13)
+#define   SSP_100M			0
+#define   SSP_10M			1
+#define   SSP_1G			2
+
 #define NETC_PEMDIOCR			0x1c00
 #define NETC_EMDIO_BASE			NETC_PEMDIOCR
 
-- 
2.34.1



^ permalink raw reply related

* [PATCH v7 net-next 12/15] net: dsa: netc: add FDB, STP, MTU, port setup and host flooding support
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

Expand the NETC switch driver with several foundational features:
- FDB and MDB management
- STP state handling
- MTU configuration
- Port setup/teardown
- Host flooding support

At this stage, the driver operates only in standalone port mode. Each
port uses VLAN 0 as its PVID, meaning ingress frames are internally
assigned VID 0 regardless of whether they arrive tagged or untagged.
Note that this does not inject a VLAN 0 header into the frame; the VID
is used purely for subsequent VLAN processing within the switch.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/dsa/netc/netc_main.c      | 601 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_switch.h    |  37 ++
 drivers/net/dsa/netc/netc_switch_hw.h |  14 +
 3 files changed, 652 insertions(+)

diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
index 2141b3aa96b7..34b5e655d1c9 100644
--- a/drivers/net/dsa/netc/netc_main.c
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -4,13 +4,39 @@
  * Copyright 2025-2026 NXP
  */
 
+#include <linux/clk.h>
 #include <linux/etherdevice.h>
 #include <linux/fsl/enetc_mdio.h>
+#include <linux/if_bridge.h>
 #include <linux/if_vlan.h>
 #include <linux/of_mdio.h>
 
 #include "netc_switch.h"
 
+static struct netc_fdb_entry *
+netc_lookup_fdb_entry(struct netc_switch *priv,
+		      const unsigned char *addr,
+		      u16 vid)
+{
+	struct netc_fdb_entry *entry;
+
+	hlist_for_each_entry(entry, &priv->fdb_list, node)
+		if (ether_addr_equal(entry->keye.mac_addr, addr) &&
+		    le16_to_cpu(entry->keye.fid) == vid)
+			return entry;
+
+	return NULL;
+}
+
+static void netc_destroy_fdb_list(struct netc_switch *priv)
+{
+	struct netc_fdb_entry *entry;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(entry, tmp, &priv->fdb_list, node)
+		netc_del_fdb_entry(entry);
+}
+
 static enum dsa_tag_protocol
 netc_get_tag_protocol(struct dsa_switch *ds, int port,
 		      enum dsa_tag_protocol mprot)
@@ -83,6 +109,22 @@ static void netc_port_get_capability(struct netc_port *np)
 		np->caps.pseudo_link = true;
 }
 
+static int netc_port_get_info_from_dt(struct netc_port *np,
+				      struct device_node *node,
+				      struct device *dev)
+{
+	if (of_find_property(node, "clock-names", NULL)) {
+		np->ref_clk = devm_get_clk_from_child(dev, node, "ref");
+		if (IS_ERR(np->ref_clk)) {
+			dev_err(dev, "Port %d cannot get reference clock\n",
+				np->dp->index);
+			return PTR_ERR(np->ref_clk);
+		}
+	}
+
+	return 0;
+}
+
 static int netc_port_create_emdio_bus(struct netc_port *np,
 				      struct device_node *node)
 {
@@ -163,6 +205,15 @@ static int netc_init_switch_id(struct netc_switch *priv)
 	return 0;
 }
 
+static void netc_get_switch_capabilities(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	u32 val;
+
+	val = netc_base_rd(regs, NETC_FDBHTCAPR);
+	priv->num_fdb_gmac = FIELD_GET(FDBHTCAPR_NUM_GMAC, val);
+}
+
 static int netc_init_all_ports(struct netc_switch *priv)
 {
 	struct device *dev = priv->dev;
@@ -198,6 +249,10 @@ static int netc_init_all_ports(struct netc_switch *priv)
 		np = priv->ports[dp->index];
 		np->dp = dp;
 
+		err = netc_port_get_info_from_dt(np, dp->dn, dev);
+		if (err)
+			return err;
+
 		if (dsa_port_is_user(dp)) {
 			err = netc_port_create_mdio_bus(np, dp->dn);
 			if (err) {
@@ -367,6 +422,220 @@ static void netc_port_default_config(struct netc_port *np)
 	netc_port_set_all_tc_msdu(np);
 }
 
+static u32 netc_available_port_bitmap(struct netc_switch *priv)
+{
+	struct dsa_port *dp;
+	u32 bitmap = 0;
+
+	dsa_switch_for_each_available_port(dp, priv->ds)
+		bitmap |= BIT(dp->index);
+
+	return bitmap;
+}
+
+static int netc_add_standalone_vlan_entry(struct netc_switch *priv)
+{
+	u32 bitmap_stg = VFT_STG_ID(0) | netc_available_port_bitmap(priv);
+	struct vft_cfge_data *cfge;
+	u16 cfg;
+	int err;
+
+	cfge = kzalloc_obj(*cfge);
+	if (!cfge)
+		return -ENOMEM;
+
+	cfge->bitmap_stg = cpu_to_le32(bitmap_stg);
+	cfge->et_eid = cpu_to_le32(NTMP_NULL_ENTRY_ID);
+	cfge->fid = cpu_to_le16(NETC_STANDALONE_PVID);
+
+	/* For standalone ports, MAC learning must be disabled so that frames
+	 * from other user ports are not forwarded to the standalone ports,
+	 * because there are no FDB entries on the standalone ports. Also,
+	 * frames received by the standalone ports must not be flooded to
+	 * other ports, so the MAC forwarding option is set to
+	 * MFO_NO_MATCH_DISCARD, which makes such frames be discarded rather
+	 * than flooded to other ports.
+	 */
+	cfg = FIELD_PREP(VFT_MLO, MLO_DISABLE) |
+	      FIELD_PREP(VFT_MFO, MFO_NO_MATCH_DISCARD);
+	cfge->cfg = cpu_to_le16(cfg);
+
+	err = ntmp_vft_add_entry(&priv->ntmp, NETC_STANDALONE_PVID, cfge);
+	if (err)
+		dev_err(priv->dev,
+			"Failed to add standalone VLAN entry\n");
+
+	kfree(cfge);
+
+	return err;
+}
+
+static int netc_port_add_fdb_entry(struct netc_port *np,
+				   const unsigned char *addr, u16 vid)
+{
+	struct netc_switch *priv = np->switch_priv;
+	struct netc_fdb_entry *entry;
+	struct fdbt_keye_data *keye;
+	struct fdbt_cfge_data *cfge;
+	int port = np->dp->index;
+	u32 cfg = 0;
+	int err;
+
+	entry = kzalloc_obj(*entry);
+	if (!entry)
+		return -ENOMEM;
+
+	keye = &entry->keye;
+	cfge = &entry->cfge;
+	ether_addr_copy(keye->mac_addr, addr);
+	keye->fid = cpu_to_le16(vid);
+
+	cfge->port_bitmap = cpu_to_le32(BIT(port));
+	cfge->cfg = cpu_to_le32(cfg);
+	cfge->et_eid = cpu_to_le32(NTMP_NULL_ENTRY_ID);
+
+	err = ntmp_fdbt_add_entry(&priv->ntmp, &entry->entry_id, keye, cfge);
+	if (err) {
+		kfree(entry);
+
+		return err;
+	}
+
+	netc_add_fdb_entry(priv, entry);
+
+	return 0;
+}
+
+static int netc_port_set_fdb_entry(struct netc_port *np,
+				   const unsigned char *addr, u16 vid)
+{
+	struct netc_switch *priv = np->switch_priv;
+	struct netc_fdb_entry *entry;
+	struct fdbt_cfge_data *cfge;
+	int port = np->dp->index;
+	__le32 old_port_bitmap;
+	int err = 0;
+
+	mutex_lock(&priv->fdbt_lock);
+
+	entry = netc_lookup_fdb_entry(priv, addr, vid);
+	if (!entry) {
+		err = netc_port_add_fdb_entry(np, addr, vid);
+		if (err)
+			dev_err(priv->dev,
+				"Failed to add FDB entry on port %d\n",
+				port);
+
+		goto unlock_fdbt;
+	}
+
+	cfge = &entry->cfge;
+	/* If the entry already exists on the port, return 0 directly */
+	if (unlikely(cfge->port_bitmap & cpu_to_le32(BIT(port))))
+		goto unlock_fdbt;
+
+	/* If the entry already exists, but not on this port, we need to
+	 * update the port bitmap. In general, this should only happen
+	 * for multicast or broadcast addresses.
+	 */
+	old_port_bitmap = cfge->port_bitmap;
+	if (is_multicast_ether_addr(addr))
+		cfge->port_bitmap |= cpu_to_le32(BIT(port));
+	else
+		cfge->port_bitmap = cpu_to_le32(BIT(port));
+
+	err = ntmp_fdbt_update_entry(&priv->ntmp, entry->entry_id, cfge);
+	if (err) {
+		cfge->port_bitmap = old_port_bitmap;
+		dev_err(priv->dev, "Failed to set FDB entry on port %d\n",
+			port);
+	}
+
+unlock_fdbt:
+	mutex_unlock(&priv->fdbt_lock);
+
+	return err;
+}
+
+static int netc_port_del_fdb_entry(struct netc_port *np,
+				   const unsigned char *addr, u16 vid)
+{
+	struct netc_switch *priv = np->switch_priv;
+	struct ntmp_user *ntmp = &priv->ntmp;
+	struct netc_fdb_entry *entry;
+	struct fdbt_cfge_data *cfge;
+	int port = np->dp->index;
+	int err = 0;
+
+	mutex_lock(&priv->fdbt_lock);
+
+	entry = netc_lookup_fdb_entry(priv, addr, vid);
+	if (unlikely(!entry))
+		/* Currently only standalone port mode is supported and MAC
+		 * learning is disabled, so there are no dynamically learned
+		 * FDB entries. Deleting dynamically learned FDB entries must
+		 * be supported once bridge mode is added.
+		 */
+		goto unlock_fdbt;
+
+	cfge = &entry->cfge;
+	if (unlikely(!(cfge->port_bitmap & cpu_to_le32(BIT(port)))))
+		goto unlock_fdbt;
+
+	if (cfge->port_bitmap != cpu_to_le32(BIT(port))) {
+		/* If the entry also exists on other ports, we need to
+		 * update the entry in the FDB table.
+		 */
+		cfge->port_bitmap &= cpu_to_le32(~BIT(port));
+		err = ntmp_fdbt_update_entry(ntmp, entry->entry_id, cfge);
+		if (err) {
+			cfge->port_bitmap |= cpu_to_le32(BIT(port));
+			goto unlock_fdbt;
+		}
+	} else {
+		/* If the entry only exists on this port, just delete
+		 * it from the FDB table.
+		 */
+		err = ntmp_fdbt_delete_entry(ntmp, entry->entry_id);
+		if (err)
+			goto unlock_fdbt;
+
+		netc_del_fdb_entry(entry);
+	}
+
+unlock_fdbt:
+	mutex_unlock(&priv->fdbt_lock);
+
+	return err;
+}
+
+static int netc_add_standalone_fdb_bcast_entry(struct netc_switch *priv)
+{
+	const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+	struct dsa_port *dp, *cpu_dp = NULL;
+
+	dsa_switch_for_each_cpu_port(dp, priv->ds) {
+		/* The switch has only one CPU port, so take the first CPU
+		 * port found and break out of the loop.
+		 */
+		cpu_dp = dp;
+		break;
+	}
+
+	if (!cpu_dp)
+		return -ENODEV;
+
+	/* If the user port acts as a standalone port, then its PVID is 0,
+	 * MLO is set to "disable MAC learning" and MFO is set to "discard
+	 * frames if no matching entry found in FDB table". Therefore, we
+	 * need to add a broadcast FDB entry on the CPU port so that the
+	 * broadcast frames received on the user port can be forwarded to
+	 * the CPU port.
+	 */
+	return netc_port_set_fdb_entry(NETC_PORT(priv->ds, cpu_dp->index),
+				       bcast, NETC_STANDALONE_PVID);
+}
+
 static int netc_setup(struct dsa_switch *ds)
 {
 	struct netc_switch *priv = ds->priv;
@@ -377,6 +646,8 @@ static int netc_setup(struct dsa_switch *ds)
 	if (err)
 		return err;
 
+	netc_get_switch_capabilities(priv);
+
 	err = netc_init_all_ports(priv);
 	if (err)
 		return err;
@@ -385,19 +656,65 @@ static int netc_setup(struct dsa_switch *ds)
 	if (err)
 		return err;
 
+	INIT_HLIST_HEAD(&priv->fdb_list);
+	mutex_init(&priv->fdbt_lock);
+
 	netc_switch_fixed_config(priv);
 
 	/* default setting for ports */
 	dsa_switch_for_each_available_port(dp, ds)
 		netc_port_default_config(priv->ports[dp->index]);
 
+	err = netc_add_standalone_vlan_entry(priv);
+	if (err)
+		goto free_lock_and_ntmp_user;
+
+	err = netc_add_standalone_fdb_bcast_entry(priv);
+	if (err)
+		goto free_lock_and_ntmp_user;
+
 	return 0;
+
+free_lock_and_ntmp_user:
+	/* No need to clear the hardware state: netc_setup() is only called
+	 * when the driver is bound, and an FLR will be performed to reset
+	 * the hardware state.
+	 */
+	mutex_destroy(&priv->fdbt_lock);
+	netc_free_ntmp_user(priv);
+
+	return err;
+}
+
+static void netc_destroy_all_lists(struct netc_switch *priv)
+{
+	netc_destroy_fdb_list(priv);
+	mutex_destroy(&priv->fdbt_lock);
+}
+
+static void netc_free_host_flood_rules(struct netc_switch *priv)
+{
+	struct dsa_port *dp;
+
+	dsa_switch_for_each_user_port(dp, priv->ds) {
+		struct netc_port *np = priv->ports[dp->index];
+
+		/* No need to clear the hardware IPFT entry: a PCIe FLR is
+		 * performed when the switch is re-registered, which resets
+		 * the hardware state. Only the memory needs to be freed to
+		 * avoid a memory leak.
+		 */
+		kfree(np->host_flood);
+		np->host_flood = NULL;
+	}
 }
 
 static void netc_teardown(struct dsa_switch *ds)
 {
 	struct netc_switch *priv = ds->priv;
 
+	netc_destroy_all_lists(priv);
+	netc_free_host_flood_rules(priv);
 	netc_free_ntmp_user(priv);
 }
 
@@ -531,6 +848,278 @@ static void netc_switch_get_ip_revision(struct netc_switch *priv)
 	priv->revision = FIELD_GET(IPBRR0_IP_REV, val);
 }
 
+static int netc_port_enable(struct dsa_switch *ds, int port,
+			    struct phy_device *phy)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	int err;
+
+	if (np->enable)
+		return 0;
+
+	err = clk_prepare_enable(np->ref_clk);
+	if (err) {
+		dev_err(ds->dev,
+			"Failed to enable enet_ref_clk of port %d\n", port);
+		return err;
+	}
+
+	np->enable = true;
+
+	return 0;
+}
+
+static void netc_port_disable(struct dsa_switch *ds, int port)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+
+	/* .port_disable() may be called without a preceding .port_enable().
+	 * In that case, both the prepare_count and the enable_count of the
+	 * clock are 0, and calling clk_disable_unprepare() would trigger
+	 * warnings.
+	 */
+	if (!np->enable)
+		return;
+
+	clk_disable_unprepare(np->ref_clk);
+	np->enable = false;
+}
+
+static void netc_port_stp_state_set(struct dsa_switch *ds,
+				    int port, u8 state)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	u32 val;
+
+	switch (state) {
+	case BR_STATE_DISABLED:
+	case BR_STATE_LISTENING:
+	case BR_STATE_BLOCKING:
+		val = NETC_STG_STATE_DISABLED;
+		break;
+	case BR_STATE_LEARNING:
+		val = NETC_STG_STATE_LEARNING;
+		break;
+	case BR_STATE_FORWARDING:
+		val = NETC_STG_STATE_FORWARDING;
+		break;
+	default:
+		return;
+	}
+
+	netc_port_wr(np, NETC_BPSTGSR, val);
+}
+
+static int netc_port_change_mtu(struct dsa_switch *ds,
+				int port, int mtu)
+{
+	u32 max_frame_size = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
+
+	netc_port_set_max_frame_size(NETC_PORT(ds, port), max_frame_size);
+
+	return 0;
+}
+
+static int netc_port_max_mtu(struct dsa_switch *ds, int port)
+{
+	return NETC_MAX_FRAME_LEN - VLAN_ETH_HLEN - ETH_FCS_LEN;
+}
+
+static int netc_port_fdb_add(struct dsa_switch *ds, int port,
+			     const unsigned char *addr, u16 vid,
+			     struct dsa_db db)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+
+	/* Currently only standalone port mode is supported, so only
+	 * NETC_STANDALONE_PVID (= 0) is accepted here.
+	 */
+	if (vid != NETC_STANDALONE_PVID)
+		return -EOPNOTSUPP;
+
+	return netc_port_set_fdb_entry(np, addr, vid);
+}
+
+static int netc_port_fdb_del(struct dsa_switch *ds, int port,
+			     const unsigned char *addr, u16 vid,
+			     struct dsa_db db)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+
+	if (vid != NETC_STANDALONE_PVID)
+		return -EOPNOTSUPP;
+
+	return netc_port_del_fdb_entry(np, addr, vid);
+}
+
+static int netc_port_fdb_dump(struct dsa_switch *ds, int port,
+			      dsa_fdb_dump_cb_t *cb, void *data)
+{
+	struct netc_switch *priv = ds->priv;
+	u32 resume_eid = NTMP_NULL_ENTRY_ID;
+	struct fdbt_entry_data *entry;
+	struct fdbt_keye_data *keye;
+	struct fdbt_cfge_data *cfge;
+	u32 cfg, cnt = 0;
+	bool is_static;
+	int err;
+	u16 vid;
+
+	entry = kmalloc_obj(*entry);
+	if (!entry)
+		return -ENOMEM;
+
+	keye = &entry->keye;
+	cfge = &entry->cfge;
+	mutex_lock(&priv->fdbt_lock);
+
+	do {
+		memset(entry, 0, sizeof(*entry));
+		err = ntmp_fdbt_search_port_entry(&priv->ntmp, port,
+						  &resume_eid, entry);
+		if (err || entry->entry_id == NTMP_NULL_ENTRY_ID)
+			break;
+
+		cfg = le32_to_cpu(cfge->cfg);
+		is_static = (cfg & FDBT_DYNAMIC) ? false : true;
+		vid = le16_to_cpu(keye->fid);
+
+		err = cb(keye->mac_addr, vid, is_static, data);
+		if (err)
+			break;
+
+		/* Bound the iteration count so that a hardware
+		 * malfunction cannot cause an infinite loop.
+		 */
+		if (++cnt >= priv->num_fdb_gmac)
+			break;
+	} while (resume_eid != NTMP_NULL_ENTRY_ID);
+
+	mutex_unlock(&priv->fdbt_lock);
+	kfree(entry);
+
+	return err;
+}
+
+static int netc_port_mdb_add(struct dsa_switch *ds, int port,
+			     const struct switchdev_obj_port_mdb *mdb,
+			     struct dsa_db db)
+{
+	return netc_port_fdb_add(ds, port, mdb->addr, mdb->vid, db);
+}
+
+static int netc_port_mdb_del(struct dsa_switch *ds, int port,
+			     const struct switchdev_obj_port_mdb *mdb,
+			     struct dsa_db db)
+{
+	return netc_port_fdb_del(ds, port, mdb->addr, mdb->vid, db);
+}
+
+static int netc_port_add_host_flood_rule(struct netc_port *np,
+					 bool uc, bool mc)
+{
+	const u8 dmac_mask[ETH_ALEN] = {0x1, 0, 0, 0, 0, 0};
+	struct netc_switch *priv = np->switch_priv;
+	struct ipft_entry_data *host_flood;
+	struct ipft_keye_data *keye;
+	struct ipft_cfge_data *cfge;
+	u16 src_port;
+	u32 cfg;
+	int err;
+
+	if (!uc && !mc) {
+		/* Disable ingress port filter table lookup */
+		netc_port_wr(np, NETC_PIPFCR, 0);
+		np->uc = false;
+		np->mc = false;
+
+		return 0;
+	}
+
+	host_flood = kzalloc_obj(*host_flood);
+	if (!host_flood)
+		return -ENOMEM;
+
+	keye = &host_flood->keye;
+	cfge = &host_flood->cfge;
+
+	src_port = FIELD_PREP(IPFT_SRC_PORT, np->dp->index);
+	src_port |= IPFT_SRC_PORT_MASK;
+	keye->src_port = cpu_to_le16(src_port);
+
+	/* If only unicast or only multicast needs to be flooded to the
+	 * host, set the mask that tests the least significant bit of the
+	 * first MAC DA octet. The key value for that bit is 0 to match
+	 * unicast or 1 to match multicast. If both unicast and multicast
+	 * have to be flooded, leave the key mask empty so that it matches
+	 * everything.
+	 */
+	if (uc && !mc)
+		ether_addr_copy(keye->dmac_mask, dmac_mask);
+
+	if (!uc && mc) {
+		ether_addr_copy(keye->dmac, dmac_mask);
+		ether_addr_copy(keye->dmac_mask, dmac_mask);
+	}
+
+	cfg = FIELD_PREP(IPFT_FLTFA, IPFT_FLTFA_REDIRECT);
+	cfg |= FIELD_PREP(IPFT_HR, NETC_HR_HOST_FLOOD);
+	cfge->cfg = cpu_to_le32(cfg);
+
+	err = ntmp_ipft_add_entry(&priv->ntmp, host_flood);
+	if (err) {
+		kfree(host_flood);
+		return err;
+	}
+
+	np->uc = uc;
+	np->mc = mc;
+	np->host_flood = host_flood;
+	/* Enable ingress port filter table lookup */
+	netc_port_wr(np, NETC_PIPFCR, PIPFCR_EN);
+
+	return 0;
+}
+
+static void netc_port_remove_host_flood(struct netc_port *np,
+					struct ipft_entry_data *host_flood)
+{
+	struct netc_switch *priv = np->switch_priv;
+
+	if (!host_flood)
+		return;
+
+	ntmp_ipft_delete_entry(&priv->ntmp, host_flood->entry_id);
+	kfree(host_flood);
+}
+
+static void netc_port_set_host_flood(struct dsa_switch *ds, int port,
+				     bool uc, bool mc)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	struct ipft_entry_data *old_host_flood;
+
+	if (np->uc == uc && np->mc == mc)
+		return;
+
+	/* IPFT does not support in-place updates to the KEYE element,
+	 * we need to add a new entry and then delete the old one. So
+	 * save the old entry first.
+	 */
+	old_host_flood = np->host_flood;
+	np->host_flood = NULL;
+
+	if (netc_port_add_host_flood_rule(np, uc, mc)) {
+		np->host_flood = old_host_flood;
+		dev_err(ds->dev, "Failed to add host flood rule on port %d\n",
+			port);
+		return;
+	}
+
+	/* Remove the old host flood entry */
+	netc_port_remove_host_flood(np, old_host_flood);
+}
+
 static void netc_phylink_get_caps(struct dsa_switch *ds, int port,
 				  struct phylink_config *config)
 {
@@ -757,6 +1346,17 @@ static const struct dsa_switch_ops netc_switch_ops = {
 	.setup				= netc_setup,
 	.teardown			= netc_teardown,
 	.phylink_get_caps		= netc_phylink_get_caps,
+	.port_enable			= netc_port_enable,
+	.port_disable			= netc_port_disable,
+	.port_stp_state_set		= netc_port_stp_state_set,
+	.port_change_mtu		= netc_port_change_mtu,
+	.port_max_mtu			= netc_port_max_mtu,
+	.port_fdb_add			= netc_port_fdb_add,
+	.port_fdb_del			= netc_port_fdb_del,
+	.port_fdb_dump			= netc_port_fdb_dump,
+	.port_mdb_add			= netc_port_mdb_add,
+	.port_mdb_del			= netc_port_mdb_del,
+	.port_set_host_flood		= netc_port_set_host_flood,
 };
 
 static int netc_switch_probe(struct pci_dev *pdev,
@@ -796,6 +1396,7 @@ static int netc_switch_probe(struct pci_dev *pdev,
 	ds->num_tx_queues = NETC_TC_NUM;
 	ds->ops = &netc_switch_ops;
 	ds->phylink_mac_ops = &netc_phylink_mac_ops;
+	ds->fdb_isolation = true;
 	ds->priv = priv;
 	priv->ds = ds;
 
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
index ac9743da2a1e..cc278a862623 100644
--- a/drivers/net/dsa/netc/netc_switch.h
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -31,6 +31,8 @@
 
 #define NETC_MAX_FRAME_LEN		9600
 
+#define NETC_STANDALONE_PVID		0
+
 struct netc_switch;
 
 struct netc_switch_info {
@@ -44,12 +46,23 @@ struct netc_port_caps {
 	u32 pseudo_link:1;
 };
 
+enum netc_host_reason {
+	/* Software defined host reasons */
+	NETC_HR_HOST_FLOOD = 8,
+};
+
 struct netc_port {
 	void __iomem *iobase;
 	struct netc_switch *switch_priv;
 	struct netc_port_caps caps;
 	struct dsa_port *dp;
+	struct clk *ref_clk; /* RGMII/RMII reference clock */
 	struct mii_bus *emdio;
+
+	u16 enable:1;
+	u16 uc:1;
+	u16 mc:1;
+	struct ipft_entry_data *host_flood;
 };
 
 struct netc_switch_regs {
@@ -58,6 +71,13 @@ struct netc_switch_regs {
 	void __iomem *global;
 };
 
+struct netc_fdb_entry {
+	u32 entry_id;
+	struct fdbt_cfge_data cfge;
+	struct fdbt_keye_data keye;
+	struct hlist_node node;
+};
+
 struct netc_switch {
 	struct pci_dev *pdev;
 	struct device *dev;
@@ -69,6 +89,11 @@ struct netc_switch {
 	struct netc_port **ports;
 
 	struct ntmp_user ntmp;
+	struct hlist_head fdb_list;
+	struct mutex fdbt_lock; /* FDB table lock */
+
+	/* Switch hardware capabilities */
+	u32 num_fdb_gmac;
 };
 
 #define NETC_PRIV(ds)			((struct netc_switch *)((ds)->priv))
@@ -91,6 +116,18 @@ static inline bool is_netc_pseudo_port(struct netc_port *np)
 	return np->caps.pseudo_link;
 }
 
+static inline void netc_add_fdb_entry(struct netc_switch *priv,
+				      struct netc_fdb_entry *entry)
+{
+	hlist_add_head(&entry->node, &priv->fdb_list);
+}
+
+static inline void netc_del_fdb_entry(struct netc_fdb_entry *entry)
+{
+	hlist_del(&entry->node);
+	kfree(entry);
+}
+
 int netc_switch_platform_probe(struct netc_switch *priv);
 
 #endif
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
index 7d9afb493053..b04e9866d72a 100644
--- a/drivers/net/dsa/netc/netc_switch_hw.h
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -36,6 +36,9 @@
 #define  VFHTDECR2_MLO			GENMASK(26, 24)
 #define  VFHTDECR2_MFO			GENMASK(28, 27)
 
+#define NETC_FDBHTCAPR			0x2020
+#define  FDBHTCAPR_NUM_GMAC		GENMASK(8, 0)
+
 /* Definition of Switch port registers */
 #define NETC_PCAPR			0x0000
 #define  PCAPR_LINK_TYPE		BIT(4)
@@ -67,6 +70,9 @@
 #define  PQOSMR_VQMP			GENMASK(19, 16)
 #define  PQOSMR_QVMP			GENMASK(23, 20)
 
+#define NETC_PIPFCR			0x0084
+#define  PIPFCR_EN			BIT(0)
+
 #define NETC_POR			0x100
 #define  POR_TXDIS			BIT(0)
 #define  POR_RXDIS			BIT(1)
@@ -122,6 +128,14 @@ enum netc_mfo {
 #define  BPDVR_RXVAM			BIT(24)
 #define  BPDVR_TXTAGA			GENMASK(26, 25)
 
+#define NETC_BPSTGSR			0x520
+
+enum netc_stg_stage {
+	NETC_STG_STATE_DISABLED = 0,
+	NETC_STG_STATE_LEARNING,
+	NETC_STG_STATE_FORWARDING,
+};
+
 /* Definition of Switch ethernet MAC port registers */
 #define NETC_PMAC_OFFSET		0x400
 #define NETC_PM_CMD_CFG(a)		(0x1008 + (a) * 0x400)
-- 
2.34.1



^ permalink raw reply related

* [PATCH v7 net-next 13/15] net: dsa: netc: initialize buffer pool table and implement flow-control
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

The buffer pool is a quantity of memory available for buffering a group
of flows (e.g. frames having the same priority, frames received from the
same port), while waiting to be transmitted on a port. The buffer pool
tracks internal memory consumption with upper bound limits and optionally
a non-shared portion when associated with a shared buffer pool. Currently
the shared buffer pool is not supported; it will be added in the future.

For i.MX94, the switch has 4 ports and 8 buffer pools, so each port is
allocated two buffer pools. Frames with priorities 0 to 3 are mapped to
the first buffer pool, and frames with priorities 4 to 7 are mapped to
the second buffer pool. Each buffer pool has a flow-control-on threshold
and a flow-control-off threshold. Add flow control support to each port
by setting these thresholds.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/dsa/netc/netc_main.c      | 130 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_platform.c  |   2 +-
 drivers/net/dsa/netc/netc_switch.h    |  19 ++++
 drivers/net/dsa/netc/netc_switch_hw.h |  10 ++
 4 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
index 34b5e655d1c9..7a4064849693 100644
--- a/drivers/net/dsa/netc/netc_main.c
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -212,6 +212,9 @@ static void netc_get_switch_capabilities(struct netc_switch *priv)
 
 	val = netc_base_rd(regs, NETC_FDBHTCAPR);
 	priv->num_fdb_gmac = FIELD_GET(FDBHTCAPR_NUM_GMAC, val);
+
+	val = netc_base_rd(regs, NETC_BPCAPR);
+	priv->num_bp = FIELD_GET(BPCAPR_NUM_BP, val);
 }
 
 static int netc_init_all_ports(struct netc_switch *priv)
@@ -403,6 +406,15 @@ static void netc_port_fixed_config(struct netc_port *np)
 	/* Enable L2 and L3 DOS */
 	netc_port_rmw(np, NETC_PCR, PCR_L2DOSE | PCR_L3DOSE,
 		      PCR_L2DOSE | PCR_L3DOSE);
+
+	/* Set the quanta value of TX PAUSE frame */
+	netc_mac_port_wr(np, NETC_PM_PAUSE_QUANTA(0), NETC_PAUSE_QUANTA);
+
+	/* When a quanta timer counts down and reaches this value,
+	 * the MAC sends a refresh PAUSE frame with the programmed
+	 * full quanta value if a pause condition still exists.
+	 */
+	netc_mac_port_wr(np, NETC_PM_PAUSE_THRESH(0), NETC_PAUSE_THRESH);
 }
 
 static void netc_port_default_config(struct netc_port *np)
@@ -636,6 +648,79 @@ static int netc_add_standalone_fdb_bcast_entry(struct netc_switch *priv)
 				       bcast, NETC_STANDALONE_PVID);
 }
 
+static void netc_port_set_pbpmcr(struct netc_port *np, u64 mapping)
+{
+	u32 pbpmcr0 = lower_32_bits(mapping);
+	u32 pbpmcr1 = upper_32_bits(mapping);
+
+	netc_port_wr(np, NETC_PBPMCR0, pbpmcr0);
+	netc_port_wr(np, NETC_PBPMCR1, pbpmcr1);
+}
+
+static void netc_ipv_to_buffer_pool_mapping(struct netc_switch *priv)
+{
+	int bp_per_port = priv->num_bp / priv->info->num_ports;
+	int q = NETC_IPV_NUM / bp_per_port;
+	int r = NETC_IPV_NUM % bp_per_port;
+	int num = q + r;
+
+	/* IPV-to-buffer-pool mapping per port:
+	 * Each port is allocated 'bp_per_port' buffer pools and supports 8
+	 * IPVs, where a higher IPV indicates a higher frame priority. Each
+	 * IPV can be mapped to only one buffer pool. By hardware design,
+	 * 'bp_per_port' will not be greater than 8, so 'q' will never be
+	 * 0.
+	 *
+	 * The mapping rule is as follows:
+	 * - The first 'num' IPVs share the port's first buffer pool (index
+	 * 'base_id').
+	 * - After that, every 'q' IPVs share one buffer pool, with pool
+	 * indices increasing sequentially.
+	 */
+	for (int i = 0; i < priv->info->num_ports; i++) {
+		u32 base_id = i * bp_per_port;
+		u32 bp_id = base_id;
+		u64 mapping = 0;
+
+		for (int ipv = 0; ipv < NETC_IPV_NUM; ipv++) {
+			/* Update the buffer pool index */
+			if (ipv >= num)
+				bp_id = base_id + ((ipv - num) / q) + 1;
+
+			mapping |= (u64)bp_id << (ipv * 8);
+		}
+
+		netc_port_set_pbpmcr(priv->ports[i], mapping);
+	}
+}
+
+static int netc_switch_bpt_default_config(struct netc_switch *priv)
+{
+	if (priv->num_bp < priv->info->num_ports)
+		return -EINVAL;
+
+	priv->bpt_list = devm_kcalloc(priv->dev, priv->num_bp,
+				      sizeof(struct bpt_cfge_data),
+				      GFP_KERNEL);
+	if (!priv->bpt_list)
+		return -ENOMEM;
+
+	/* Initialize the maximum threshold of each buffer pool entry */
+	for (int i = 0; i < priv->num_bp; i++) {
+		struct bpt_cfge_data *cfge = &priv->bpt_list[i];
+		int err;
+
+		cfge->max_thresh = cpu_to_le16(NETC_BP_THRESH);
+		err = ntmp_bpt_update_entry(&priv->ntmp, i, cfge);
+		if (err)
+			return err;
+	}
+
+	netc_ipv_to_buffer_pool_mapping(priv);
+
+	return 0;
+}
+
 static int netc_setup(struct dsa_switch *ds)
 {
 	struct netc_switch *priv = ds->priv;
@@ -665,6 +750,10 @@ static int netc_setup(struct dsa_switch *ds)
 	dsa_switch_for_each_available_port(dp, ds)
 		netc_port_default_config(priv->ports[dp->index]);
 
+	err = netc_switch_bpt_default_config(priv);
+	if (err)
+		goto free_lock_and_ntmp_user;
+
 	err = netc_add_standalone_vlan_entry(priv);
 	if (err)
 		goto free_lock_and_ntmp_user;
@@ -1218,6 +1307,45 @@ static void netc_port_set_rmii_mii_mac(struct netc_port *np,
 	netc_mac_port_rmw(np, NETC_PM_IF_MODE(0), mask, val);
 }
 
+static void netc_port_set_tx_pause(struct netc_port *np, bool tx_pause)
+{
+	struct netc_switch *priv = np->switch_priv;
+	int port = np->dp->index;
+	int i, j, num_bp;
+
+	num_bp = priv->num_bp / priv->info->num_ports;
+	for (i = 0, j = port * num_bp; i < num_bp; i++, j++) {
+		struct bpt_cfge_data *cfge = &priv->bpt_list[j];
+		struct bpt_cfge_data old_cfge = *cfge;
+
+		if (tx_pause) {
+			cfge->fc_on_thresh = cpu_to_le16(NETC_FC_THRESH_ON);
+			cfge->fc_off_thresh = cpu_to_le16(NETC_FC_THRESH_OFF);
+			cfge->fccfg_sbpen = FIELD_PREP(BPT_FC_CFG,
+						       BPT_FC_CFG_EN_BPFC);
+			cfge->fc_ports = cpu_to_le32(BIT(port));
+		} else {
+			cfge->fc_on_thresh = cpu_to_le16(0);
+			cfge->fc_off_thresh = cpu_to_le16(0);
+			cfge->fccfg_sbpen = 0;
+			cfge->fc_ports = cpu_to_le32(0);
+		}
+
+		if (ntmp_bpt_update_entry(&priv->ntmp, j, cfge)) {
+			*cfge = old_cfge;
+			dev_warn(priv->dev,
+				 "Failed to %s TX pause of buffer pool %d (swp%d)\n",
+				 tx_pause ? "enable" : "disable", j, port);
+		}
+	}
+}
+
+static void netc_port_set_rx_pause(struct netc_port *np, bool rx_pause)
+{
+	netc_mac_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_PAUSE_IGN,
+			  rx_pause ? 0 : PM_CMD_CFG_PAUSE_IGN);
+}
+
 static void netc_port_mac_rx_enable(struct netc_port *np)
 {
 	netc_port_rmw(np, NETC_POR, POR_RXDIS, 0);
@@ -1319,6 +1447,8 @@ static void netc_mac_link_up(struct phylink_config *config,
 	    interface == PHY_INTERFACE_MODE_MII)
 		netc_port_set_rmii_mii_mac(np, speed, duplex);
 
+	netc_port_set_tx_pause(np, tx_pause);
+	netc_port_set_rx_pause(np, rx_pause);
 	netc_port_mac_tx_enable(np);
 	netc_port_mac_rx_enable(np);
 }
diff --git a/drivers/net/dsa/netc/netc_platform.c b/drivers/net/dsa/netc/netc_platform.c
index bb4f92d238cb..34aeb6fceb3c 100644
--- a/drivers/net/dsa/netc/netc_platform.c
+++ b/drivers/net/dsa/netc/netc_platform.c
@@ -14,7 +14,7 @@ struct netc_switch_platform {
 static void imx94_switch_phylink_get_caps(int port,
 					  struct phylink_config *config)
 {
-	config->mac_capabilities = MAC_1000FD;
+	config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD;
 
 	switch (port) {
 	case 0 ... 1:
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
index cc278a862623..4a9bf69907e9 100644
--- a/drivers/net/dsa/netc/netc_switch.h
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -28,11 +28,27 @@
 
 #define NETC_TC_NUM			8
 #define NETC_CBDR_NUM			2
+#define NETC_IPV_NUM			8
 
 #define NETC_MAX_FRAME_LEN		9600
 
 #define NETC_STANDALONE_PVID		0
 
+/* Threshold format: MANT (bits 11:4) * 2^EXP (bits 3:0)
+ * Unit: Memory words (average of 20 bytes each)
+ * NETC_BP_THRESH = 0x334, MANT = 0x33, EXP = 4. Threshold: 816 words
+ * NETC_FC_THRESH_ON = 0x533, MANT = 0x53, EXP = 3. Threshold: 664 words
+ * NETC_FC_THRESH_OFF = 0x3c3, MANT = 0x3c, EXP = 3. Threshold: 480 words
+ */
+#define NETC_BP_THRESH			0x334
+#define NETC_FC_THRESH_ON		0x533
+#define NETC_FC_THRESH_OFF		0x3c3
+
+/* PAUSE quanta: 0xFFFF = 65535 quanta (each quanta = 512 bit times) */
+#define NETC_PAUSE_QUANTA		0xFFFF
+/* PAUSE refresh threshold: send refresh when timer reaches this value */
+#define NETC_PAUSE_THRESH		0xFF00
+
 struct netc_switch;
 
 struct netc_switch_info {
@@ -94,6 +110,9 @@ struct netc_switch {
 
 	/* Switch hardware capabilities */
 	u32 num_fdb_gmac;
+	u32 num_bp;
+
+	struct bpt_cfge_data *bpt_list;
 };
 
 #define NETC_PRIV(ds)			((struct netc_switch *)((ds)->priv))
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
index b04e9866d72a..caf5977c5100 100644
--- a/drivers/net/dsa/netc/netc_switch_hw.h
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -12,6 +12,12 @@
 #define NETC_SWITCH_DEVICE_ID		0xeef2
 
 /* Definition of Switch base registers */
+#define NETC_BPCAPR			0x0008
+#define  BPCAPR_NUM_BP			GENMASK(7, 0)
+
+#define NETC_PBPMCR0			0x0400
+#define NETC_PBPMCR1			0x0404
+
 #define NETC_CBDRMR(a)			(0x0800 + (a) * 0x30)
 #define NETC_CBDRBAR0(a)		(0x0810 + (a) * 0x30)
 #define NETC_CBDRBAR1(a)		(0x0814 + (a) * 0x30)
@@ -141,6 +147,7 @@ enum netc_stg_stage {
 #define NETC_PM_CMD_CFG(a)		(0x1008 + (a) * 0x400)
 #define  PM_CMD_CFG_TX_EN		BIT(0)
 #define  PM_CMD_CFG_RX_EN		BIT(1)
+#define  PM_CMD_CFG_PAUSE_IGN		BIT(8)
 
 #define NETC_PM_MAXFRM(a)		(0x1014 + (a) * 0x400)
 #define  PM_MAXFRAM			GENMASK(15, 0)
@@ -149,6 +156,9 @@ enum netc_stg_stage {
 #define  PM_IEVENT_TX_EMPTY		BIT(5)
 #define  PM_IEVENT_RX_EMPTY		BIT(6)
 
+#define NETC_PM_PAUSE_QUANTA(a)		(0x1054 + (a) * 0x400)
+#define NETC_PM_PAUSE_THRESH(a)		(0x1064 + (a) * 0x400)
+
 #define NETC_PM_IF_MODE(a)		(0x1300 + (a) * 0x400)
 #define  PM_IF_MODE_IFMODE		GENMASK(2, 0)
 #define   IFMODE_MII			1
-- 
2.34.1



^ permalink raw reply related

* [PATCH v7 net-next 14/15] net: dsa: netc: add support for the standardized counters
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

Each user port of the NETC switch supports the 802.3 basic and mandatory
managed objects statistics counters, the IETF Management Information
Base (MIB) package (RFC 2665) counters, and the Remote Network Monitoring
(RMON) counters. All of these counters are 64-bit registers. In addition,
some user ports support preemption, so these ports have two MACs: MAC 0
is the express MAC (eMAC) and MAC 1 is the preemptible MAC (pMAC). For
ports that support preemption, the statistics are the sum of the pMAC
and eMAC statistics.

Note that the current switch driver does not support preemption, so all
frames are sent and received via the eMAC by default. The statistics
read from the pMAC should be zero.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/dsa/netc/Makefile         |   2 +-
 drivers/net/dsa/netc/netc_ethtool.c   | 190 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_main.c      |   4 +
 drivers/net/dsa/netc/netc_switch.h    |  12 ++
 drivers/net/dsa/netc/netc_switch_hw.h | 120 ++++++++++++++++
 include/linux/fsl/netc_global.h       |   6 +
 6 files changed, 333 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/dsa/netc/netc_ethtool.c

diff --git a/drivers/net/dsa/netc/Makefile b/drivers/net/dsa/netc/Makefile
index 4a5767562574..f40b13c702e0 100644
--- a/drivers/net/dsa/netc/Makefile
+++ b/drivers/net/dsa/netc/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_NET_DSA_NETC_SWITCH) += nxp-netc-switch.o
-nxp-netc-switch-objs := netc_main.o netc_platform.o
+nxp-netc-switch-objs := netc_main.o netc_platform.o netc_ethtool.o
diff --git a/drivers/net/dsa/netc/netc_ethtool.c b/drivers/net/dsa/netc/netc_ethtool.c
new file mode 100644
index 000000000000..ac8940b5a85c
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_ethtool.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC switch driver
+ * Copyright 2025-2026 NXP
+ */
+
+#include <linux/ethtool_netlink.h>
+
+#include "netc_switch.h"
+
+static const struct ethtool_rmon_hist_range netc_rmon_ranges[] = {
+	{   64,   64 },
+	{   65,  127 },
+	{  128,  255 },
+	{  256,  511 },
+	{  512, 1023 },
+	{ 1024, 1522 },
+	{ 1523, NETC_MAX_FRAME_LEN },
+	{ }
+};
+
+static void netc_port_pause_stats(struct netc_port *np, int mac,
+				  struct ethtool_pause_stats *stats)
+{
+	if (mac && !np->caps.pmac)
+		return;
+
+	stats->tx_pause_frames = netc_port_rd64(np, NETC_PM_TXPF(mac));
+	stats->rx_pause_frames = netc_port_rd64(np, NETC_PM_RXPF(mac));
+}
+
+void netc_port_get_pause_stats(struct dsa_switch *ds, int port,
+			       struct ethtool_pause_stats *pause_stats)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	struct net_device *ndev;
+
+	switch (pause_stats->src) {
+	case ETHTOOL_MAC_STATS_SRC_EMAC:
+		netc_port_pause_stats(np, 0, pause_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_PMAC:
+		netc_port_pause_stats(np, 1, pause_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_AGGREGATE:
+		ndev = dsa_to_port(ds, port)->user;
+		ethtool_aggregate_pause_stats(ndev, pause_stats);
+		break;
+	}
+}
+
+static void netc_port_rmon_stats(struct netc_port *np, int mac,
+				 struct ethtool_rmon_stats *stats)
+{
+	if (mac && !np->caps.pmac)
+		return;
+
+	stats->undersize_pkts = netc_port_rd64(np, NETC_PM_RUND(mac));
+	stats->oversize_pkts = netc_port_rd64(np, NETC_PM_ROVR(mac));
+	stats->fragments = netc_port_rd64(np, NETC_PM_RFRG(mac));
+	stats->jabbers = netc_port_rd64(np, NETC_PM_RJBR(mac));
+
+	stats->hist[0] = netc_port_rd64(np, NETC_PM_R64(mac));
+	stats->hist[1] = netc_port_rd64(np, NETC_PM_R127(mac));
+	stats->hist[2] = netc_port_rd64(np, NETC_PM_R255(mac));
+	stats->hist[3] = netc_port_rd64(np, NETC_PM_R511(mac));
+	stats->hist[4] = netc_port_rd64(np, NETC_PM_R1023(mac));
+	stats->hist[5] = netc_port_rd64(np, NETC_PM_R1522(mac));
+	stats->hist[6] = netc_port_rd64(np, NETC_PM_R1523X(mac));
+
+	stats->hist_tx[0] = netc_port_rd64(np, NETC_PM_T64(mac));
+	stats->hist_tx[1] = netc_port_rd64(np, NETC_PM_T127(mac));
+	stats->hist_tx[2] = netc_port_rd64(np, NETC_PM_T255(mac));
+	stats->hist_tx[3] = netc_port_rd64(np, NETC_PM_T511(mac));
+	stats->hist_tx[4] = netc_port_rd64(np, NETC_PM_T1023(mac));
+	stats->hist_tx[5] = netc_port_rd64(np, NETC_PM_T1522(mac));
+	stats->hist_tx[6] = netc_port_rd64(np, NETC_PM_T1523X(mac));
+}
+
+void netc_port_get_rmon_stats(struct dsa_switch *ds, int port,
+			      struct ethtool_rmon_stats *rmon_stats,
+			      const struct ethtool_rmon_hist_range **ranges)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	struct net_device *ndev;
+
+	*ranges = netc_rmon_ranges;
+
+	switch (rmon_stats->src) {
+	case ETHTOOL_MAC_STATS_SRC_EMAC:
+		netc_port_rmon_stats(np, 0, rmon_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_PMAC:
+		netc_port_rmon_stats(np, 1, rmon_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_AGGREGATE:
+		ndev = dsa_to_port(ds, port)->user;
+		ethtool_aggregate_rmon_stats(ndev, rmon_stats);
+		break;
+	}
+}
+
+static void netc_port_ctrl_stats(struct netc_port *np, int mac,
+				 struct ethtool_eth_ctrl_stats *stats)
+{
+	if (mac && !np->caps.pmac)
+		return;
+
+	stats->MACControlFramesTransmitted =
+		netc_port_rd64(np, NETC_PM_TCNP(mac));
+	stats->MACControlFramesReceived =
+		netc_port_rd64(np, NETC_PM_RCNP(mac));
+}
+
+void netc_port_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
+				  struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	struct net_device *ndev;
+
+	switch (ctrl_stats->src) {
+	case ETHTOOL_MAC_STATS_SRC_EMAC:
+		netc_port_ctrl_stats(np, 0, ctrl_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_PMAC:
+		netc_port_ctrl_stats(np, 1, ctrl_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_AGGREGATE:
+		ndev = dsa_to_port(ds, port)->user;
+		ethtool_aggregate_ctrl_stats(ndev, ctrl_stats);
+		break;
+	}
+}
+
+static void netc_port_mac_stats(struct netc_port *np, int mac,
+				struct ethtool_eth_mac_stats *stats)
+{
+	if (mac && !np->caps.pmac)
+		return;
+
+	stats->FramesTransmittedOK = netc_port_rd64(np, NETC_PM_TFRM(mac));
+	stats->SingleCollisionFrames = netc_port_rd64(np, NETC_PM_TSCOL(mac));
+	stats->MultipleCollisionFrames =
+		netc_port_rd64(np, NETC_PM_TMCOL(mac));
+	stats->FramesReceivedOK = netc_port_rd64(np, NETC_PM_RFRM(mac));
+	stats->FrameCheckSequenceErrors =
+		netc_port_rd64(np, NETC_PM_RFCS(mac));
+	stats->AlignmentErrors = netc_port_rd64(np, NETC_PM_RALN(mac));
+	stats->OctetsTransmittedOK = netc_port_rd64(np, NETC_PM_TEOCT(mac));
+	stats->FramesWithDeferredXmissions =
+		netc_port_rd64(np, NETC_PM_TDFR(mac));
+	stats->LateCollisions = netc_port_rd64(np, NETC_PM_TLCOL(mac));
+	stats->FramesAbortedDueToXSColls =
+		netc_port_rd64(np, NETC_PM_TECOL(mac));
+	stats->FramesLostDueToIntMACXmitError =
+		netc_port_rd64(np, NETC_PM_TERR(mac));
+	stats->OctetsReceivedOK = netc_port_rd64(np, NETC_PM_REOCT(mac));
+	stats->FramesLostDueToIntMACRcvError =
+		netc_port_rd64(np, NETC_PM_RDRNTP(mac));
+	stats->MulticastFramesXmittedOK =
+		netc_port_rd64(np, NETC_PM_TMCA(mac));
+	stats->BroadcastFramesXmittedOK =
+		netc_port_rd64(np, NETC_PM_TBCA(mac));
+	stats->MulticastFramesReceivedOK =
+		netc_port_rd64(np, NETC_PM_RMCA(mac));
+	stats->BroadcastFramesReceivedOK =
+		netc_port_rd64(np, NETC_PM_RBCA(mac));
+	stats->FramesWithExcessiveDeferral =
+		netc_port_rd64(np, NETC_PM_TEDFR(mac));
+}
+
+void netc_port_get_eth_mac_stats(struct dsa_switch *ds, int port,
+				 struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	struct net_device *ndev;
+
+	switch (mac_stats->src) {
+	case ETHTOOL_MAC_STATS_SRC_EMAC:
+		netc_port_mac_stats(np, 0, mac_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_PMAC:
+		netc_port_mac_stats(np, 1, mac_stats);
+		break;
+	case ETHTOOL_MAC_STATS_SRC_AGGREGATE:
+		ndev = dsa_to_port(ds, port)->user;
+		ethtool_aggregate_mac_stats(ndev, mac_stats);
+		break;
+	}
+}
diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
index 7a4064849693..d79bd18b7e6e 100644
--- a/drivers/net/dsa/netc/netc_main.c
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -1487,6 +1487,10 @@ static const struct dsa_switch_ops netc_switch_ops = {
 	.port_mdb_add			= netc_port_mdb_add,
 	.port_mdb_del			= netc_port_mdb_del,
 	.port_set_host_flood		= netc_port_set_host_flood,
+	.get_pause_stats		= netc_port_get_pause_stats,
+	.get_rmon_stats			= netc_port_get_rmon_stats,
+	.get_eth_ctrl_stats		= netc_port_get_eth_ctrl_stats,
+	.get_eth_mac_stats		= netc_port_get_eth_mac_stats,
 };
 
 static int netc_switch_probe(struct pci_dev *pdev,
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
index 4a9bf69907e9..40e54af0c356 100644
--- a/drivers/net/dsa/netc/netc_switch.h
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -124,6 +124,7 @@ struct netc_switch {
 
 /* Write/Read registers of Switch Port (including pseudo MAC port) */
 #define netc_port_rd(p, o)		netc_read((p)->iobase + (o))
+#define netc_port_rd64(p, o)		netc_read64((p)->iobase + (o))
 #define netc_port_wr(p, o, v)		netc_write((p)->iobase + (o), v)
 
 /* Write/Read Switch global registers */
@@ -149,4 +150,15 @@ static inline void netc_del_fdb_entry(struct netc_fdb_entry *entry)
 
 int netc_switch_platform_probe(struct netc_switch *priv);
 
+/* ethtool APIs */
+void netc_port_get_pause_stats(struct dsa_switch *ds, int port,
+			       struct ethtool_pause_stats *pause_stats);
+void netc_port_get_rmon_stats(struct dsa_switch *ds, int port,
+			      struct ethtool_rmon_stats *rmon_stats,
+			      const struct ethtool_rmon_hist_range **ranges);
+void netc_port_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
+				  struct ethtool_eth_ctrl_stats *ctrl_stats);
+void netc_port_get_eth_mac_stats(struct dsa_switch *ds, int port,
+				 struct ethtool_eth_mac_stats *mac_stats);
+
 #endif
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
index caf5977c5100..f8d436ad9623 100644
--- a/drivers/net/dsa/netc/netc_switch_hw.h
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -173,6 +173,126 @@ enum netc_stg_stage {
 #define   SSP_10M			1
 #define   SSP_1G			2
 
+/* Port MAC 0/1 Receive Ethernet Octets Counter */
+#define NETC_PM_REOCT(a)		(0x1100 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Alignment Error Counter Register */
+#define NETC_PM_RALN(a)			(0x1110 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Valid Pause Frame Counter */
+#define NETC_PM_RXPF(a)			(0x1118 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Frame Counter */
+#define NETC_PM_RFRM(a)			(0x1120 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Frame Check Sequence Error Counter */
+#define NETC_PM_RFCS(a)			(0x1128 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Multicast Frame Counter */
+#define NETC_PM_RMCA(a)			(0x1148 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Broadcast Frame Counter */
+#define NETC_PM_RBCA(a)			(0x1150 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Undersized Packet Counter */
+#define NETC_PM_RUND(a)			(0x1168 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 64-Octet Packet Counter */
+#define NETC_PM_R64(a)			(0x1170 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 65 to 127-Octet Packet Counter */
+#define NETC_PM_R127(a)			(0x1178 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 128 to 255-Octet Packet Counter */
+#define NETC_PM_R255(a)			(0x1180 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 256 to 511-Octet Packet Counter */
+#define NETC_PM_R511(a)			(0x1188 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 512 to 1023-Octet Packet Counter */
+#define NETC_PM_R1023(a)		(0x1190 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 1024 to 1522-Octet Packet Counter */
+#define NETC_PM_R1522(a)		(0x1198 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive 1523 to Max-Octet Packet Counter */
+#define NETC_PM_R1523X(a)		(0x11a0 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Oversized Packet Counter */
+#define NETC_PM_ROVR(a)			(0x11a8 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Jabber Packet Counter */
+#define NETC_PM_RJBR(a)			(0x11b0 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Fragment Packet Counter */
+#define NETC_PM_RFRG(a)			(0x11b8 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Control Packet Counter */
+#define NETC_PM_RCNP(a)			(0x11c0 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Dropped Not Truncated Packets Counter */
+#define NETC_PM_RDRNTP(a)		(0x11c8 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Ethernet Octets Counter */
+#define NETC_PM_TEOCT(a)		(0x1200 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Excessive Deferral Packet Counter */
+#define NETC_PM_TEDFR(a)		(0x1210 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Valid Pause Frame Counter */
+#define NETC_PM_TXPF(a)			(0x1218 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Frame Counter */
+#define NETC_PM_TFRM(a)			(0x1220 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Frame Error Counter */
+#define NETC_PM_TERR(a)			(0x1238 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Multicast Frame Counter */
+#define NETC_PM_TMCA(a)			(0x1248 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Broadcast Frame Counter */
+#define NETC_PM_TBCA(a)			(0x1250 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 64-Octet Packet Counter */
+#define NETC_PM_T64(a)			(0x1270 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 65 to 127-Octet Packet Counter */
+#define NETC_PM_T127(a)			(0x1278 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 128 to 255-Octet Packet Counter */
+#define NETC_PM_T255(a)			(0x1280 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 256 to 511-Octet Packet Counter */
+#define NETC_PM_T511(a)			(0x1288 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 512 to 1023-Octet Packet Counter */
+#define NETC_PM_T1023(a)		(0x1290 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 1024 to 1522-Octet Packet Counter */
+#define NETC_PM_T1522(a)		(0x1298 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit 1523 to TX_MTU-Octet Packet Counter */
+#define NETC_PM_T1523X(a)		(0x12a0 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Control Packet Counter */
+#define NETC_PM_TCNP(a)			(0x12c0 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Deferred Packet Counter */
+#define NETC_PM_TDFR(a)			(0x12d0 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Multiple Collisions Counter */
+#define NETC_PM_TMCOL(a)		(0x12d8 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Single Collision */
+#define NETC_PM_TSCOL(a)		(0x12e0 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Late Collision Counter */
+#define NETC_PM_TLCOL(a)		(0x12e8 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Excessive Collisions Counter */
+#define NETC_PM_TECOL(a)		(0x12f0 + (a) * 0x400)
+
 #define NETC_PEMDIOCR			0x1c00
 #define NETC_EMDIO_BASE			NETC_PEMDIOCR
 
diff --git a/include/linux/fsl/netc_global.h b/include/linux/fsl/netc_global.h
index fdecca8c90f0..5b8ff528d369 100644
--- a/include/linux/fsl/netc_global.h
+++ b/include/linux/fsl/netc_global.h
@@ -5,6 +5,7 @@
 #define __NETC_GLOBAL_H
 
 #include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 
 static inline u32 netc_read(void __iomem *reg)
 {
@@ -16,4 +17,9 @@ static inline void netc_write(void __iomem *reg, u32 val)
 	iowrite32(val, reg);
 }
 
+static inline u64 netc_read64(void __iomem *reg)
+{
+	return ioread64(reg);
+}
+
 #endif
-- 
2.34.1



^ permalink raw reply related

* [PATCH v7 net-next 15/15] net: dsa: netc: add support for ethtool private statistics
From: Wei Fang @ 2026-05-13  3:04 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
	f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
	andrew, olteanv
  Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
	imx
In-Reply-To: <20260513030454.1666570-1-wei.fang@nxp.com>

Implement the ethtool private statistics interface to expose additional
port-level and MAC-level counters that are not covered by the standard
IEEE 802.3 statistics. The pMAC counters are only reported when the port
supports Frame Preemption (802.1Qbu/802.3br).

Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/dsa/netc/netc_ethtool.c   | 107 ++++++++++++++++++++++++++
 drivers/net/dsa/netc/netc_main.c      |   3 +
 drivers/net/dsa/netc/netc_switch.h    |   9 +++
 drivers/net/dsa/netc/netc_switch_hw.h |  58 ++++++++++++++
 4 files changed, 177 insertions(+)

diff --git a/drivers/net/dsa/netc/netc_ethtool.c b/drivers/net/dsa/netc/netc_ethtool.c
index ac8940b5a85c..8d04db534347 100644
--- a/drivers/net/dsa/netc/netc_ethtool.c
+++ b/drivers/net/dsa/netc/netc_ethtool.c
@@ -19,6 +19,56 @@ static const struct ethtool_rmon_hist_range netc_rmon_ranges[] = {
 	{ }
 };
 
+static const struct netc_port_stat netc_port_counters[] = {
+	{ NETC_PTGSLACR,	"port gate late arrival frames" },
+	{ NETC_PSDFTCR,	"port SDF transmit frames" },
+	{ NETC_PSDFDDCR,	"port SDF drop duplicate frames" },
+	{ NETC_PRXDCR,		"port rx discard frames" },
+	{ NETC_PRXDCRRR,	"port rx discard read-reset" },
+	{ NETC_PRXDCRR0,	"port rx discard reason 0" },
+	{ NETC_PRXDCRR1,	"port rx discard reason 1" },
+	{ NETC_PTXDCR,		"port tx discard frames" },
+	{ NETC_PTXDCRRR,	"port tx discard read-reset" },
+	{ NETC_PTXDCRR0,	"port tx discard reason 0" },
+	{ NETC_PTXDCRR1,	"port tx discard reason 1" },
+	{ NETC_BPDCR,		"bridge port discard frames" },
+	{ NETC_BPDCRRR,	"bridge port discard read-reset" },
+	{ NETC_BPDCRR0,	"bridge port discard reason 0" },
+	{ NETC_BPDCRR1,	"bridge port discard reason 1" },
+};
+
+static const struct netc_port_stat netc_emac_counters[] = {
+	{ NETC_PM_ROCT(0),	"eMAC rx octets" },
+	{ NETC_PM_RVLAN(0),	"eMAC rx VLAN frames" },
+	{ NETC_PM_RERR(0),	"eMAC rx frame errors" },
+	{ NETC_PM_RUCA(0),	"eMAC rx unicast frames" },
+	{ NETC_PM_RDRP(0),	"eMAC rx dropped packets" },
+	{ NETC_PM_RPKT(0),	"eMAC rx packets" },
+	{ NETC_PM_TOCT(0),	"eMAC tx octets" },
+	{ NETC_PM_TVLAN(0),	"eMAC tx VLAN frames" },
+	{ NETC_PM_TFCS(0),	"eMAC tx FCS errors" },
+	{ NETC_PM_TUCA(0),	"eMAC tx unicast frames" },
+	{ NETC_PM_TPKT(0),	"eMAC tx packets" },
+	{ NETC_PM_TUND(0),	"eMAC tx undersized packets" },
+	{ NETC_PM_TIOCT(0),	"eMAC tx invalid octets" },
+};
+
+static const struct netc_port_stat netc_pmac_counters[] = {
+	{ NETC_PM_ROCT(1),	"pMAC rx octets" },
+	{ NETC_PM_RVLAN(1),	"pMAC rx VLAN frames" },
+	{ NETC_PM_RERR(1),	"pMAC rx frame errors" },
+	{ NETC_PM_RUCA(1),	"pMAC rx unicast frames" },
+	{ NETC_PM_RDRP(1),	"pMAC rx dropped packets" },
+	{ NETC_PM_RPKT(1),	"pMAC rx packets" },
+	{ NETC_PM_TOCT(1),	"pMAC tx octets" },
+	{ NETC_PM_TVLAN(1),	"pMAC tx VLAN frames" },
+	{ NETC_PM_TFCS(1),	"pMAC tx FCS errors" },
+	{ NETC_PM_TUCA(1),	"pMAC tx unicast frames" },
+	{ NETC_PM_TPKT(1),	"pMAC tx packets" },
+	{ NETC_PM_TUND(1),	"pMAC tx undersized packets" },
+	{ NETC_PM_TIOCT(1),	"pMAC tx invalid octets" },
+};
+
 static void netc_port_pause_stats(struct netc_port *np, int mac,
 				  struct ethtool_pause_stats *stats)
 {
@@ -188,3 +238,60 @@ void netc_port_get_eth_mac_stats(struct dsa_switch *ds, int port,
 		break;
 	}
 }
+
+int netc_port_get_sset_count(struct dsa_switch *ds, int port, int sset)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	int size;
+
+	if (sset != ETH_SS_STATS)
+		return -EOPNOTSUPP;
+
+	size = ARRAY_SIZE(netc_port_counters) +
+	       ARRAY_SIZE(netc_emac_counters);
+
+	if (np->caps.pmac)
+		size += ARRAY_SIZE(netc_pmac_counters);
+
+	return size;
+}
+
+void netc_port_get_strings(struct dsa_switch *ds, int port,
+			   u32 sset, u8 *data)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	int i;
+
+	if (sset != ETH_SS_STATS)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(netc_port_counters); i++)
+		ethtool_cpy(&data, netc_port_counters[i].name);
+
+	for (i = 0; i < ARRAY_SIZE(netc_emac_counters); i++)
+		ethtool_cpy(&data, netc_emac_counters[i].name);
+
+	if (!np->caps.pmac)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(netc_pmac_counters); i++)
+		ethtool_cpy(&data, netc_pmac_counters[i].name);
+}
+
+void netc_port_get_ethtool_stats(struct dsa_switch *ds, int port, u64 *data)
+{
+	struct netc_port *np = NETC_PORT(ds, port);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(netc_port_counters); i++)
+		*data++ = netc_port_rd(np, netc_port_counters[i].reg);
+
+	for (i = 0; i < ARRAY_SIZE(netc_emac_counters); i++)
+		*data++ = netc_port_rd64(np, netc_emac_counters[i].reg);
+
+	if (!np->caps.pmac)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(netc_pmac_counters); i++)
+		*data++ = netc_port_rd64(np, netc_pmac_counters[i].reg);
+}
diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
index d79bd18b7e6e..fed2b1dcb856 100644
--- a/drivers/net/dsa/netc/netc_main.c
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -1491,6 +1491,9 @@ static const struct dsa_switch_ops netc_switch_ops = {
 	.get_rmon_stats			= netc_port_get_rmon_stats,
 	.get_eth_ctrl_stats		= netc_port_get_eth_ctrl_stats,
 	.get_eth_mac_stats		= netc_port_get_eth_mac_stats,
+	.get_sset_count			= netc_port_get_sset_count,
+	.get_strings			= netc_port_get_strings,
+	.get_ethtool_stats		= netc_port_get_ethtool_stats,
 };
 
 static int netc_switch_probe(struct pci_dev *pdev,
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
index 40e54af0c356..740e1f307c45 100644
--- a/drivers/net/dsa/netc/netc_switch.h
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -94,6 +94,11 @@ struct netc_fdb_entry {
 	struct hlist_node node;
 };
 
+struct netc_port_stat {
+	int reg;
+	char name[ETH_GSTRING_LEN] __nonstring;
+};
+
 struct netc_switch {
 	struct pci_dev *pdev;
 	struct device *dev;
@@ -160,5 +165,9 @@ void netc_port_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
 				  struct ethtool_eth_ctrl_stats *ctrl_stats);
 void netc_port_get_eth_mac_stats(struct dsa_switch *ds, int port,
 				 struct ethtool_eth_mac_stats *mac_stats);
+int netc_port_get_sset_count(struct dsa_switch *ds, int port, int sset);
+void netc_port_get_strings(struct dsa_switch *ds, int port,
+			   u32 sset, u8 *data);
+void netc_port_get_ethtool_stats(struct dsa_switch *ds, int port, u64 *data);
 
 #endif
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
index f8d436ad9623..1b016e7dd03e 100644
--- a/drivers/net/dsa/netc/netc_switch_hw.h
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -87,6 +87,17 @@
 #define  PSR_TX_BUSY			BIT(0)
 #define  PSR_RX_BUSY			BIT(1)
 
+#define NETC_PTGSLACR			0x130
+
+#define NETC_PRXDCR			0x1c0
+#define NETC_PRXDCRRR			0x1c4
+#define NETC_PRXDCRR0			0x1c8
+#define NETC_PRXDCRR1			0x1cc
+#define NETC_PTXDCR			0x1e0
+#define NETC_PTXDCRRR			0x1e4
+#define NETC_PTXDCRR0			0x1e8
+#define NETC_PTXDCRR1			0x1ec
+
 #define NETC_PTCTMSDUR(a)		(0x208 + (a) * 0x20)
 #define  PTCTMSDUR_MAXSDU		GENMASK(15, 0)
 #define  PTCTMSDUR_SDU_TYPE		GENMASK(17, 16)
@@ -94,6 +105,9 @@
 #define   SDU_TYPE_MPDU			1
 #define   SDU_TYPE_MSDU			2
 
+#define NETC_PSDFTCR			0x4c4
+#define NETC_PSDFDDCR			0x4c8
+
 #define NETC_BPCR			0x500
 #define  BPCR_DYN_LIMIT			GENMASK(15, 0)
 #define  BPCR_MLO			GENMASK(22, 20)
@@ -142,6 +156,11 @@ enum netc_stg_stage {
 	NETC_STG_STATE_FORWARDING,
 };
 
+#define NETC_BPDCR			0x580
+#define NETC_BPDCRRR			0x584
+#define NETC_BPDCRR0			0x588
+#define NETC_BPDCRR1			0x58c
+
 /* Definition of Switch ethernet MAC port registers */
 #define NETC_PMAC_OFFSET		0x400
 #define NETC_PM_CMD_CFG(a)		(0x1008 + (a) * 0x400)
@@ -176,6 +195,9 @@ enum netc_stg_stage {
 /* Port MAC 0/1 Receive Ethernet Octets Counter */
 #define NETC_PM_REOCT(a)		(0x1100 + (a) * 0x400)
 
+/* Port MAC 0/1 Receive Octets Counter */
+#define NETC_PM_ROCT(a)			(0x1108 + (a) * 0x400)
+
 /* Port MAC 0/1 Receive Alignment Error Counter Register */
 #define NETC_PM_RALN(a)			(0x1110 + (a) * 0x400)
 
@@ -188,12 +210,27 @@ enum netc_stg_stage {
 /* Port MAC 0/1 Receive Frame Check Sequence Error Counter */
 #define NETC_PM_RFCS(a)			(0x1128 + (a) * 0x400)
 
+/* Port MAC 0/1 Receive VLAN Frame Counter */
+#define NETC_PM_RVLAN(a)		(0x1130 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Frame Error Counter */
+#define NETC_PM_RERR(a)			(0x1138 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Unicast Frame Counter */
+#define NETC_PM_RUCA(a)			(0x1140 + (a) * 0x400)
+
 /* Port MAC 0/1 Receive Multicast Frame Counter */
 #define NETC_PM_RMCA(a)			(0x1148 + (a) * 0x400)
 
 /* Port MAC 0/1 Receive Broadcast Frame Counter */
 #define NETC_PM_RBCA(a)			(0x1150 + (a) * 0x400)
 
+/* Port MAC 0/1 Receive Dropped Packets Counter */
+#define NETC_PM_RDRP(a)			(0x1158 + (a) * 0x400)
+
+/* Port MAC 0/1 Receive Packets Counter */
+#define NETC_PM_RPKT(a)			(0x1160 + (a) * 0x400)
+
 /* Port MAC 0/1 Receive Undersized Packet Counter */
 #define NETC_PM_RUND(a)			(0x1168 + (a) * 0x400)
 
@@ -236,6 +273,9 @@ enum netc_stg_stage {
 /* Port MAC 0/1 Transmit Ethernet Octets Counter */
 #define NETC_PM_TEOCT(a)		(0x1200 + (a) * 0x400)
 
+/* Port MAC 0/1 Transmit Octets Counter */
+#define NETC_PM_TOCT(a)			(0x1208 + (a) * 0x400)
+
 /* Port MAC 0/1 Transmit Excessive Deferral Packet Counter */
 #define NETC_PM_TEDFR(a)		(0x1210 + (a) * 0x400)
 
@@ -245,15 +285,30 @@ enum netc_stg_stage {
 /* Port MAC 0/1 Transmit Frame Counter */
 #define NETC_PM_TFRM(a)			(0x1220 + (a) * 0x400)
 
+/* Port MAC 0/1 Transmit Frame Check Sequence Error Counter */
+#define NETC_PM_TFCS(a)			(0x1228 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit VLAN Frame Counter */
+#define NETC_PM_TVLAN(a)		(0x1230 + (a) * 0x400)
+
 /* Port MAC 0/1 Transmit Frame Error Counter */
 #define NETC_PM_TERR(a)			(0x1238 + (a) * 0x400)
 
+/* Port MAC 0/1 Transmit Unicast Frame Counter */
+#define NETC_PM_TUCA(a)			(0x1240 + (a) * 0x400)
+
 /* Port MAC 0/1 Transmit Multicast Frame Counter */
 #define NETC_PM_TMCA(a)			(0x1248 + (a) * 0x400)
 
 /* Port MAC 0/1 Transmit Broadcast Frame Counter */
 #define NETC_PM_TBCA(a)			(0x1250 + (a) * 0x400)
 
+/* Port MAC 0/1 Transmit Packets Counter */
+#define NETC_PM_TPKT(a)			(0x1260 + (a) * 0x400)
+
+/* Port MAC 0/1 Transmit Undersized Packet Counter */
+#define NETC_PM_TUND(a)			(0x1268 + (a) * 0x400)
+
 /* Port MAC 0/1 Transmit 64-Octet Packet Counter */
 #define NETC_PM_T64(a)			(0x1270 + (a) * 0x400)
 
@@ -293,6 +348,9 @@ enum netc_stg_stage {
 /* Port MAC 0/1 Transmit Excessive Collisions Counter */
 #define NETC_PM_TECOL(a)		(0x12f0 + (a) * 0x400)
 
+/* Port MAC 0/1 Transmit Invalid Octets Counter */
+#define NETC_PM_TIOCT(a)		(0x12f8 + (a) * 0x400)
+
 #define NETC_PEMDIOCR			0x1c00
 #define NETC_EMDIO_BASE			NETC_PEMDIOCR
 
-- 
2.34.1



^ permalink raw reply related

* Re: [PATCH v6 00/14] selftests/mm: fix failures and robustness improvements
From: Andrew Morton @ 2026-05-13  3:10 UTC (permalink / raw)
  To: Sayali Patil
  Cc: Shuah Khan, linux-mm, linux-kernel, linux-kselftest,
	Ritesh Harjani, David Hildenbrand, Zi Yan, Michal Hocko,
	Oscar Salvador, Lorenzo Stoakes, Dev Jain, Liam.Howlett,
	linuxppc-dev, Miaohe Lin, Venkat Rao Bagalkote
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

On Mon,  4 May 2026 13:54:37 +0530 Sayali Patil <sayalip@linux.ibm.com> wrote:

> Powerpc systems with a 64K base page size exposed several issues while
> running mm selftests. Some tests assume specific hugetlb configurations,
> use incorrect interfaces, or fail instead of skipping when the required
> kernel features are not available.
> 
> This series fixes these issues and improves test robustness.

Thanks, I'll add this to mm.git for testing exposure.

Did you see the Sashiko results?  It all looks fairly nitpicky to me
but perhaps there's something you'd prefer to change?

	https://sashiko.dev/#/patchset/cover.1777877814.git.sayalip@linux.ibm.com


^ permalink raw reply

* Re: [PATCH v6 00/14] selftests/mm: fix failures and robustness improvements
From: Andrew Morton @ 2026-05-13  3:24 UTC (permalink / raw)
  To: Sayali Patil
  Cc: Shuah Khan, linux-mm, linux-kernel, linux-kselftest,
	Ritesh Harjani, David Hildenbrand, Zi Yan, Michal Hocko,
	Oscar Salvador, Lorenzo Stoakes, Dev Jain, Liam.Howlett,
	linuxppc-dev, Miaohe Lin, Venkat Rao Bagalkote
In-Reply-To: <cover.1777877814.git.sayalip@linux.ibm.com>

On Mon,  4 May 2026 13:54:37 +0530 Sayali Patil <sayalip@linux.ibm.com> wrote:

> Powerpc systems with a 64K base page size exposed several issues while
> running mm selftests. Some tests assume specific hugetlb configurations,
> use incorrect interfaces, or fail instead of skipping when the required
> kernel features are not available.
> 
> This series fixes these issues and improves test robustness.

Sorry, bitrot.  Mike's monster series "make MM selftests more CI
friendly" has moved a lot of this code around then altered it.  Can you
please redo against mm.git's mm-new branch?

Thanks.



^ permalink raw reply

* Re: [PATCH 8/8] powerpc/mm: remove CONFIG_HAVE_BOOTMEM_INFO_NODE
From: Ritesh Harjani @ 2026-05-13  3:25 UTC (permalink / raw)
  To: David Hildenbrand (Arm), David S. Miller, Andreas Larsson,
	Mike Rapoport, Andrew Morton, Alexander Gordeev, Gerald Schaefer,
	Heiko Carstens, Vasily Gorbik, Christian Borntraeger,
	Sven Schnelle, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Suren Baghdasaryan,
	Michal Hocko
  Cc: sparclinux, linux-kernel, linux-mm, linux-s390, linuxppc-dev,
	David Hildenbrand (Arm)
In-Reply-To: <20260511-bootmem_info_prep-v1-8-3fb0be6fc688@kernel.org>

"David Hildenbrand (Arm)" <david@kernel.org> writes:

> register_page_bootmem_info_node() essentially only calls
> register_page_bootmem_memmap(). However, on powerpc that function is a
> nop. So there is no benefit in using CONFIG_HAVE_BOOTMEM_INFO_NODE
> anymore, let's just drop it.
>
> We can stop including bootmem_info.h.

Yup, the following patch [1] removed the usage of bootmem more than a
decade ago. Thanks for cleaning up the remaining unused pieces.

[1]: https://lore.kernel.org/all/1410933504-28564-1-git-send-email-anton@samba.org/

LGTM. Please feel free to add:

Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>



^ permalink raw reply

* Re: [PATCH v3]  KVM: PPC: Book3S HV: Add missing mappings for tracing exits
From: Ritesh Harjani @ 2026-05-13  3:52 UTC (permalink / raw)
  To: Gautam Menghani, maddy, npiggin, mpe, chleroy
  Cc: linuxppc-dev, kvm, linux-kernel, Gautam Menghani,
	Harsh Prateek Bora, Amit Machhiwal, Vaibhav Jain (IBM)
In-Reply-To: <20260512115724.59299-1-gautam@linux.ibm.com>

Gautam Menghani <gautam@linux.ibm.com> writes:

> The macro kvm_trace_symbol_exit is used for providing the mappings
> for the trap vectors and their names. Add mappings for H_FAC_UNAVAIL and
> RETURN_TO_HOST so that trap reasons are displayed as string instead of
> vector numbers when using the kvm_guest_exit tracepoint.
>

LGTM. Please feel free to add:
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>

Minor nit: It's generally simpler for everyone to keep the related
patches belonging to a common maintainer tree in one series. For
example, we could have grouped patch [1] with the current patch into one
series to keep them together.

[1]: https://lore.kernel.org/linuxppc-dev/20260511080412.50722-1-Gautam.Menghani@ibm.com/

-ritesh


^ permalink raw reply

* Re: [PATCH] powerpc/64s: Fix the vector number in comments for h_facility_unavailable
From: Ritesh Harjani @ 2026-05-13  4:00 UTC (permalink / raw)
  To: Gautam Menghani, maddy, mpe, npiggin, chleroy
  Cc: Gautam Menghani, linuxppc-dev, linux-kernel
In-Reply-To: <20260511080412.50722-1-Gautam.Menghani@ibm.com>

Gautam Menghani <Gautam.Menghani@ibm.com> writes:

> From: Gautam Menghani <gautam@linux.ibm.com>
>
> The comments explaining the h_facility_unavailable interrupt have mentioned
> the vector number as 0xf60 instead of 0xf80. Fix this typo.
>

Yup, looks like it got copied over from the above definition of Facility
Unavailable Interrupt. Nice catch!

LGTM. Please feel free to add:

Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>


^ permalink raw reply

* Re: [PATCH] perf kvm stat: Update the exit reason mappings
From: Ritesh Harjani @ 2026-05-13  4:03 UTC (permalink / raw)
  To: Gautam Menghani, Ian Rogers
  Cc: peterz, mingo, acme, namhyung, mark.rutland, alexander.shishkin,
	jolsa, adrian.hunter, james.clark, linux-perf-users, linux-kernel,
	linuxppc-dev, maddy
In-Reply-To: <agNS2p8vXm-Pkkfv@Gautams-MacBook-Pro.local>


++ linuxppc-dev

Gautam Menghani <gautam@linux.ibm.com> writes:

> On Tue, May 12, 2026 at 08:25:08AM -0700, Ian Rogers wrote:
>> On Tue, May 12, 2026 at 5:04 AM Gautam Menghani <gautam@linux.ibm.com> wrote:
>> >
>> > Sync the exit reason mappings with the mappings in trace_book3s.h
>> 
>> I see:
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kvm/trace_book3s.h
>> Would it make sense to have a copy in perf and use the check headers
>> code to keep them in sync?
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/check-headers.sh
>
> I'll take a look at this, thanks
>
>> 
>> Could you add the commits that add the H_VIRT and H_FAC_UNAVAIL
>> definitions? I don't see them in Linus' tree yet.
>
> I posted that patch earlier today - https://lore.kernel.org/linuxppc-dev/20260512115724.59299-1-gautam@linux.ibm.com/
> should've pasted the link in the patch
>

For patches not yet merged and having such a dependency, this could cause
confusion. What I generally tend to do in such case is, group this
patch (changes in tools/perf/util/kvm-stat-arch/book3s_hv_exits.h) into
the same series which adds H_FAC_UNAVAIL to trace_book3s.h [1].
This way it is easier for everyone to keep track of the dependencies.

[1]: https://lore.kernel.org/linuxppc-dev/20260512115724.59299-1-gautam@linux.ibm.com/

Note, that we should still cc the relevant mailing lists, reviewers and
maintainers to get an Acked-by. Since the changes in this patch are
largely powerpc specific, so IMO, it should be ok even if it goes via
powerpc tree via a common series, as long as everyone agrees.

-ritesh



^ permalink raw reply

* Re: [PATCH v3] powerpc/audit: Convert powerpc to AUDIT_ARCH_COMPAT_GENERIC
From: Harsh Prateek Bora @ 2026-05-13  4:35 UTC (permalink / raw)
  To: Paul Moore, Christophe Leroy (CS GROUP), Venkat Rao Bagalkote
  Cc: Michael Ellerman, Nicholas Piggin, Madhavan Srinivasan,
	Eric Paris, Christophe Leroy, linux-kernel, linuxppc-dev, audit,
	Thomas Weissschuh, Cédric Le Goater, ritesh.list
In-Reply-To: <CAHC9VhTyOuuPs1uovodM9M_8zZeJnVXs9xrWSdPhZcjXUb=mvA@mail.gmail.com>

+ Ritesh

Hi Venkat,

On 11/03/26 12:49 am, Paul Moore wrote:
> On Tue, Mar 10, 2026 at 11:08 AM Christophe Leroy (CS GROUP)
> <chleroy@kernel.org> wrote:
>>
>> From: Christophe Leroy <christophe.leroy@csgroup.eu>
>>
>> Commit e65e1fc2d24b ("[PATCH] syscall class hookup for all normal
>> targets") added generic support for AUDIT but that didn't include
>> support for bi-arch like powerpc.
>>
>> Commit 4b58841149dc ("audit: Add generic compat syscall support")
>> added generic support for bi-arch.
>>
>> Convert powerpc to that bi-arch generic audit support.
>>
>> With this change generated text is similar.
>>
>> Thomas has confirmed that the previously failing filter_exclude/test
>> is now successful both without and with this patch, see [1]
>>
>> [1] https://lore.kernel.org/all/20260306115350-ef265661-6d6b-4043-9bd0-8e6b437d0d67@linutronix.de/
>>
>> Link: https://github.com/linuxppc/issues/issues/412
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> Reviewed-by: Cédric Le Goater <clg@kaod.org>
>> ---
>> Venkat, a test result with https://github.com/linux-audit/audit-testsuite would be appreciated.
> 
> Yes, I'd like to see confirmation that the audit test suite runs clean
> on ppc systems with this patch applied, and unfortunately without a
> ppc system I have no way to test this myself.

Is it possible to include this test suite in the IBM CI ?

Thanks
Harsh
> 
>> v3: Rebased on v7.0-rc1
>>
>> v2: https://lore.kernel.org/all/a4b3951d1191d4183d92a07a6097566bde60d00a.1629812058.git.christophe.leroy@csgroup.eu/
>> ---
>>   arch/powerpc/Kconfig                |  5 +-
>>   arch/powerpc/include/asm/unistd32.h |  7 +++
>>   arch/powerpc/kernel/Makefile        |  3 -
>>   arch/powerpc/kernel/audit.c         | 87 -----------------------------
>>   arch/powerpc/kernel/compat_audit.c  | 49 ----------------
>>   5 files changed, 8 insertions(+), 143 deletions(-)
>>   create mode 100644 arch/powerpc/include/asm/unistd32.h
>>   delete mode 100644 arch/powerpc/kernel/audit.c
>>   delete mode 100644 arch/powerpc/kernel/compat_audit.c
> 



^ permalink raw reply

* Re: [PATCH 1/3] powerpc/time: remove preempt_disable/enable from arch_irq_work_raise()
From: Ritesh Harjani @ 2026-05-13  4:30 UTC (permalink / raw)
  To: Sayali Patil, linuxppc-dev, maddy
  Cc: linux-kernel, Mahesh Salgaonkar, sshegde, chleroy
In-Reply-To: <a64fa7d86da51f78743bee26e16ae155c43016c7.1778057685.git.sayalip@linux.ibm.com>

Sayali Patil <sayalip@linux.ibm.com> writes:

> A kernel panic is observed when handling machine check exceptions from
> real mode.
>
>   BUG: Unable to handle kernel data access on read at 0xc00000006be21300
>   Oops: Kernel access of bad area, sig: 11 [#1]
>   NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70
>   LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150

[14626.841925] MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 88222248  XER: 00000005
[14626.841939] CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0


Let's also add the above MSR state along with the call stack showing
MSR[EE] was 0 when this triggered. This also shows the DAR as 0xc....
while MSR[IR|DR] = 0. 

>   Call Trace:
>   [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150
>   [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0
>
> The crash occurs because arch_irq_work_raise() calls preempt_disable()
> from machine check exception (MCE) handlers running in real mode. In
> this context, accessing the preempt_count can fault, leading to the panic.
>
> The preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
> was originally added by commit 0fe1ac48bef0 ("powerpc/perf_event: Fix
> oops due to perf_event_do_pending call") to avoid races while raising
> irq work from exception context.
>
> Later, commit 471ba0e686cb ("irq_work: Do not raise an IPI when
> queueing work on the local CPU") added preemption protection in
> irq_work_queue() path, while commit 20b876918c06 ("irq_work: Use per
> cpu atomics instead of regular atomics") added equivalent
> protection in irq_work_queue_on() before reaching arch_irq_work_raise():
>
>   irq_work_queue() / irq_work_queue_on()
>     -> preempt_disable()
>       -> __irq_work_queue_local()
>         -> irq_work_raise()
>           -> arch_irq_work_raise()
>
> As a result, callers other than mce_irq_work_raise() already execute
> with preemption disabled, making the additional
> preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
> redundant.
>
> Remove it to avoid accessing preempt_count from real mode context.
>
> Fixes: cc15ff327569 ("powerpc/mce: Avoid using irq_work_queue() in realmode")

Agree with the Fixes tag. This patch actually moved mce to use
arch_irq_work_raise(). It was ok as long as CONFIG_PREEMPTION was
disabled on powerpc, since macros like preempt_enable|disable() were
mostly a no-op. However, after lazy preemption got enabled, access to
preempt_count while in real mode can cause the issue you described.


One more thing which we should add to the commit msg is:
The arch_irq_work_raise() function executes in NMI context when called
from MCE handler, hence we won't be preempted or scheduled out since we
are in NMI context with MSR[EE]=0, hence it is safe to remove
preempt_disable|enable() call from here.

And let's change the commit subject to:
    powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise()


BTW, thanks for adding a nice commit msg with the sequence of events.
With the above changes - please feel free to add:

Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>


> Suggested-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
> ---
>  arch/powerpc/kernel/time.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 4bbeb8644d3d..a99eb43f6ce9 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -471,10 +471,8 @@ void arch_irq_work_raise(void)
>  	 * which could get tangled up if we're messing with the same state
>  	 * here.
>  	 */
> -	preempt_disable();
>  	set_irq_work_pending_flag();
>  	set_dec(1);
> -	preempt_enable();
>  }
>  
>  static void set_dec_or_work(u64 val)
> -- 
> 2.52.0


^ permalink raw reply

* Re: [PATCH 1/3] powerpc/time: remove preempt_disable/enable from arch_irq_work_raise()
From: Shrikanth Hegde @ 2026-05-13  5:35 UTC (permalink / raw)
  To: Ritesh Harjani (IBM), Sayali Patil, linuxppc-dev, maddy
  Cc: linux-kernel, Mahesh Salgaonkar, chleroy
In-Reply-To: <pl30q6bq.ritesh.list@gmail.com>



On 5/13/26 10:00 AM, Ritesh Harjani (IBM) wrote:
> Sayali Patil <sayalip@linux.ibm.com> writes:
> 
>> A kernel panic is observed when handling machine check exceptions from
>> real mode.
>>
>>    BUG: Unable to handle kernel data access on read at 0xc00000006be21300
>>    Oops: Kernel access of bad area, sig: 11 [#1]
>>    NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70
>>    LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150
> 
> [14626.841925] MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 88222248  XER: 00000005
> [14626.841939] CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0
> 
> 
> Let's also add the above MSR state along with the call stack showing
> MSR[EE] was 0 when this triggered. This also shows the DAR as 0xc....
> while MSR[IR|DR] = 0.
> 
>>    Call Trace:
>>    [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150
>>    [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0
>>
>> The crash occurs because arch_irq_work_raise() calls preempt_disable()
>> from machine check exception (MCE) handlers running in real mode. In
>> this context, accessing the preempt_count can fault, leading to the panic.
>>
>> The preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
>> was originally added by commit 0fe1ac48bef0 ("powerpc/perf_event: Fix
>> oops due to perf_event_do_pending call") to avoid races while raising
>> irq work from exception context.
>>
>> Later, commit 471ba0e686cb ("irq_work: Do not raise an IPI when
>> queueing work on the local CPU") added preemption protection in
>> irq_work_queue() path, while commit 20b876918c06 ("irq_work: Use per
>> cpu atomics instead of regular atomics") added equivalent
>> protection in irq_work_queue_on() before reaching arch_irq_work_raise():
>>
>>    irq_work_queue() / irq_work_queue_on()
>>      -> preempt_disable()
>>        -> __irq_work_queue_local()
>>          -> irq_work_raise()
>>            -> arch_irq_work_raise()
>>
>> As a result, callers other than mce_irq_work_raise() already execute
>> with preemption disabled, making the additional
>> preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
>> redundant.
>>
>> Remove it to avoid accessing preempt_count from real mode context.
>>
>> Fixes: cc15ff327569 ("powerpc/mce: Avoid using irq_work_queue() in realmode")
> 
> Agree with the Fixes tag. This patch actually moved mce to use
> arch_irq_work_raise(). It was ok until the CONFIG_PREEMPTION was
> disabled on powerpc since macros like preempt_enable|disable() were
> mostly a no-op. However, after lazy preemption got enabled, access to

This applies to both full and lazy preemption. Upstream now only allows
choosing full or lazy, which led to this issue being discovered.

> preempt_count while in real mode can cause the issue you described.
> 
> 
> One more thing which we should add to the commit msg is:
> The arch_irq_work_raise() function executes in NMI context when called
> from MCE handler, hence we won't be preempted or scheduled out since we
> are in NMI context with MSR[EE]=0, hence it is safe to remove
> preempt_disable|enable() call from here.
> 
> And let's change the commit subject to:
>      powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise()
> 
> 
> BTW, thanks for adding a nice commit msg with the sequence of events.
> With the above changes - please feel free to add:
> 
> Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> 
> 
>> Suggested-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
>> Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
>> ---
>>   arch/powerpc/kernel/time.c | 2 --
>>   1 file changed, 2 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 4bbeb8644d3d..a99eb43f6ce9 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -471,10 +471,8 @@ void arch_irq_work_raise(void)
>>   	 * which could get tangled up if we're messing with the same state
>>   	 * here.
>>   	 */
>> -	preempt_disable();
>>   	set_irq_work_pending_flag();
>>   	set_dec(1);
>> -	preempt_enable();
>>   }
>>   
>>   static void set_dec_or_work(u64 val)
>> -- 
>> 2.52.0



^ permalink raw reply

* Re: [PATCH v3] powerpc/audit: Convert powerpc to AUDIT_ARCH_COMPAT_GENERIC
From: Madhavan Srinivasan @ 2026-05-13  5:41 UTC (permalink / raw)
  To: Harsh Prateek Bora, Paul Moore, Christophe Leroy (CS GROUP),
	Venkat Rao Bagalkote
  Cc: Michael Ellerman, Nicholas Piggin, Eric Paris, Christophe Leroy,
	linux-kernel, linuxppc-dev, audit, Thomas Weissschuh,
	Cédric Le Goater, ritesh.list
In-Reply-To: <fce0a701-ee58-4cb2-b34b-8aec26aaab5d@linux.ibm.com>


On 5/13/26 10:05 AM, Harsh Prateek Bora wrote:
> + Ritesh
>
> Hi Venkat,
>
> On 11/03/26 12:49 am, Paul Moore wrote:
>> On Tue, Mar 10, 2026 at 11:08 AM Christophe Leroy (CS GROUP)
>> <chleroy@kernel.org> wrote:
>>>
>>> From: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>
>>> Commit e65e1fc2d24b ("[PATCH] syscall class hookup for all normal
>>> targets") added generic support for AUDIT but that didn't include
>>> support for bi-arch like powerpc.
>>>
>>> Commit 4b58841149dc ("audit: Add generic compat syscall support")
>>> added generic support for bi-arch.
>>>
>>> Convert powerpc to that bi-arch generic audit support.
>>>
>>> With this change generated text is similar.
>>>
>>> Thomas has confirmed that the previously failing filter_exclude/test
>>> is now successful both without and with this patch, see [1]
>>>
>>> [1] 
>>> https://lore.kernel.org/all/20260306115350-ef265661-6d6b-4043-9bd0-8e6b437d0d67@linutronix.de/
>>>
>>> Link: https://github.com/linuxppc/issues/issues/412
>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> Reviewed-by: Cédric Le Goater <clg@kaod.org>
>>> ---
>>> Venkat, a test result with 
>>> https://github.com/linux-audit/audit-testsuite would be appreciated.
>>
>> Yes, I'd like to see confirmation that the audit test suite runs clean
>> on ppc systems with this patch applied, and unfortunately without a
>> ppc system I have no way to test this myself.
>
My bad, this is a miss from my end.
Venkat is already on this and will update the results here.

Maddy


> Is it possible to include this test suite in the IBM CI ?
>
> Thanks
> Harsh
>>
>>> v3: Rebased on v7.0-rc1
>>>
>>> v2: 
>>> https://lore.kernel.org/all/a4b3951d1191d4183d92a07a6097566bde60d00a.1629812058.git.christophe.leroy@csgroup.eu/
>>> ---
>>>   arch/powerpc/Kconfig                |  5 +-
>>>   arch/powerpc/include/asm/unistd32.h |  7 +++
>>>   arch/powerpc/kernel/Makefile        |  3 -
>>>   arch/powerpc/kernel/audit.c         | 87 
>>> -----------------------------
>>>   arch/powerpc/kernel/compat_audit.c  | 49 ----------------
>>>   5 files changed, 8 insertions(+), 143 deletions(-)
>>>   create mode 100644 arch/powerpc/include/asm/unistd32.h
>>>   delete mode 100644 arch/powerpc/kernel/audit.c
>>>   delete mode 100644 arch/powerpc/kernel/compat_audit.c
>>
>
>


^ permalink raw reply

* Re: [PATCH 01/19] btrfs: require at least 4 devices for RAID 6
From: Christoph Hellwig @ 2026-05-13  5:47 UTC (permalink / raw)
  To: David Sterba
  Cc: Christoph Hellwig, Andrew Morton, Catalin Marinas, Will Deacon,
	Ard Biesheuvel, Huacai Chen, WANG Xuerui, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Paul Walmsley, Palmer Dabbelt, Albert Ou, Alexandre Ghiti,
	Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Herbert Xu, Dan Williams, Chris Mason, David Sterba,
	Arnd Bergmann, Song Liu, Yu Kuai, Li Nan, linux-kernel,
	linux-arm-kernel, loongarch, linuxppc-dev, linux-riscv,
	linux-s390, linux-crypto, linux-btrfs, linux-arch, linux-raid
In-Reply-To: <20260512114231.GG2558453@suse.cz>

On Tue, May 12, 2026 at 01:42:31PM +0200, David Sterba wrote:
> On Tue, May 12, 2026 at 07:20:41AM +0200, Christoph Hellwig wrote:
> > While the RAID6 algorithm could in theory support 3 devices by just
> > copying the data disk to the two parity disks, this version is not only
> > useless because it is a suboptimal version of 3-way mirroring, but also
> > broken with various crashes and incorrect parity generation in various
> > architecture-optimized implementations.  Disallow it similar to mdraid
> > which requires at least 4 devices for RAID 6.
> > 
> > Fixes: 53b381b3abeb ("Btrfs: RAID5 and RAID6")
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> This patch should have been sent separately as it has user visible
> impact and can potentially break some setups.

It _is_ sent out separately.

> The degenerate modes of
> raid0, 5, or 6 are explicit as a possible middle step when converting
> profiles.  We can use a fallback implementation for this case if the
> accelerated implementations cannot do it.

This is not about a degenerated mode.  For a degenerated RAID 6, parity
generation uses the RAID 5 XOR routines as the second parity will be
missing.  This is about generating two parities for a single data disk,
which must be explicitly selected.


^ permalink raw reply

* Re: [PATCH v6 00/14] selftests/mm: fix failures and robustness improvements
From: Sayali Patil @ 2026-05-13  6:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Shuah Khan, linux-mm, linux-kernel, linux-kselftest,
	Ritesh Harjani, David Hildenbrand, Zi Yan, Michal Hocko,
	Oscar Salvador, Lorenzo Stoakes, Dev Jain, Liam.Howlett,
	linuxppc-dev, Miaohe Lin, Venkat Rao Bagalkote
In-Reply-To: <20260512202439.e71f233281428ff596dab0b6@linux-foundation.org>

Hi Andrew,

Sure, I will rebase the series on top of the current mm-new branch and 
resend an updated version. It appears that Mike’s changes have modified 
many selftests, so I will adjust the series accordingly and ensure it 
applies cleanly.

Thanks,
Sayali

On 13/05/26 08:54, Andrew Morton wrote:
> On Mon,  4 May 2026 13:54:37 +0530 Sayali Patil <sayalip@linux.ibm.com> wrote:
> 
>> Powerpc systems with a 64K base page size exposed several issues while
>> running mm selftests. Some tests assume specific hugetlb configurations,
>> use incorrect interfaces, or fail instead of skipping when the required
>> kernel features are not available.
>>
>> This series fixes these issues and improves test robustness.
> 
> Sorry, bitrot.  Mike's monster series "make MM selftests more CI
> friendly" has moved a lot of this code around then altered it.  Can you
> please redo against mm.git's mm-new branch?
> 
> Thanks.
> 
> 



^ permalink raw reply

* Re: [PATCH v2] drivers/base/memory: make memory block get/put explicit
From: Richard Cheng @ 2026-05-13  5:54 UTC (permalink / raw)
  To: Muchun Song
  Cc: Muchun Song, Andrew Morton, David Hildenbrand, Greg Kroah-Hartman,
	linux-mm, driver-core, Oscar Salvador, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Danilo Krummrich,
	Rafael J . Wysocki, linux-kernel, linux-cxl, linuxppc-dev,
	linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy, Heiko Carstens, Vasily Gorbik,
	Alexander Gordeev, Christian Borntraeger, Sven Schnelle,
	Sumanth Korikkar, Kees Cook, Douglas Anderson, Donet Tom
In-Reply-To: <3EE08C93-F28D-4363-AC1C-C9B99F8ABB7C@linux.dev>

On Wed, May 13, 2026 at 09:41:23AM +0800, Muchun Song wrote:
> 
> 
> > On May 12, 2026, at 20:03, Richard Cheng <icheng@nvidia.com> wrote:
> > 
> > On Tue, May 12, 2026 at 03:26:35PM +0800, Muchun Song wrote:
> >> Rename the memory block lookup helper to make the acquired reference
> >> explicit, add memory_block_put() to wrap put_device(), remove
> >> find_memory_block(), and use memory_block_get() as the single block-id
> >> based lookup interface.
> >> 
> >> This makes it clearer to callers that a successful lookup holds a
> >> reference that must be dropped, reducing the chance of forgetting the
> >> matching put and leaking the memory block device reference.
> >> 
> >> Link: https://lore.kernel.org/linux-mm/7887915D-E598-42B3-9AFE-BFFBACE8DE2D@linux.dev/#t
> >> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> >> Acked-by: Oscar Salvador <osalvador@suse.de>
> >> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> >> Acked-by: Michal Hocko <mhocko@suse.com>
> >> Tested-by: Donet Tom <donettom@linux.ibm.com>
> >> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> >> ---
> >> Changes in v2:
> >> - mention the removal of find_memory_block() in the commit message
> >> - drop the redundant extern from the memory_block_get() declaration
> >> ---
> >> .../platforms/pseries/hotplug-memory.c        | 14 ++-----
> >> drivers/base/memory.c                         | 38 +++++++------------
> >> drivers/base/node.c                           |  4 +-
> >> drivers/s390/char/sclp_mem.c                  | 17 ++++-----
> >> include/linux/memory.h                        |  7 +++-
> >> mm/memory_hotplug.c                           |  5 +--
> >> 6 files changed, 35 insertions(+), 50 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> index 75f85a5da981..94f3b57054b6 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> @@ -164,13 +164,7 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
> >> 
> >> static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
> >> {
> >> - unsigned long section_nr;
> >> - struct memory_block *mem_block;
> >> -
> >> - section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
> >> -
> >> - mem_block = find_memory_block(section_nr);
> >> - return mem_block;
> >> + return memory_block_get(phys_to_block_id(lmb->base_addr));
> >> }
> >> 
> >> static int get_lmb_range(u32 drc_index, int n_lmbs,
> >> @@ -220,7 +214,7 @@ static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online)
> >> else
> >> rc = 0;
> >> 
> >> - put_device(&mem_block->dev);
> >> + memory_block_put(mem_block);
> >> 
> >> return rc;
> >> }
> >> @@ -319,12 +313,12 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> >> 
> >> rc = dlpar_offline_lmb(lmb);
> >> if (rc) {
> >> - put_device(&mem_block->dev);
> >> + memory_block_put(mem_block);
> >> return rc;
> >> }
> >> 
> >> __remove_memory(lmb->base_addr, memory_block_size);
> >> - put_device(&mem_block->dev);
> >> + memory_block_put(mem_block);
> >> 
> >> /* Update memory regions for memory remove */
> >> memblock_remove(lmb->base_addr, memory_block_size);
> >> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> >> index 11d57cfa8d72..5b5d41089e81 100644
> >> --- a/drivers/base/memory.c
> >> +++ b/drivers/base/memory.c
> >> @@ -649,7 +649,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
> >>  *
> >>  * Called under device_hotplug_lock.
> >>  */
> >> -struct memory_block *find_memory_block_by_id(unsigned long block_id)
> >> +struct memory_block *memory_block_get(unsigned long block_id)
> >> {
> >> struct memory_block *mem;
> >> 
> >> @@ -659,16 +659,6 @@ struct memory_block *find_memory_block_by_id(unsigned long block_id)
> >> return mem;
> >> }
> >> 
> >> -/*
> >> - * Called under device_hotplug_lock.
> >> - */
> >> -struct memory_block *find_memory_block(unsigned long section_nr)
> >> -{
> >> - unsigned long block_id = memory_block_id(section_nr);
> >> -
> >> - return find_memory_block_by_id(block_id);
> >> -}
> >> -
> >> static struct attribute *memory_memblk_attrs[] = {
> >> &dev_attr_phys_index.attr,
> >> &dev_attr_state.attr,
> >> @@ -701,7 +691,7 @@ static int __add_memory_block(struct memory_block *memory)
> >> 
> >> ret = device_register(&memory->dev);
> >> if (ret) {
> >> - put_device(&memory->dev);
> >> + memory_block_put(memory);
> >> return ret;
> >> }
> >> ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
> >> @@ -795,9 +785,9 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state
> >> struct memory_block *mem;
> >> int ret = 0;
> >> 
> >> - mem = find_memory_block_by_id(block_id);
> >> + mem = memory_block_get(block_id);
> >> if (mem) {
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> return -EEXIST;
> >> }
> >> mem = kzalloc_obj(*mem);
> >> @@ -845,8 +835,8 @@ static void remove_memory_block(struct memory_block *memory)
> >> memory->group = NULL;
> >> }
> >> 
> >> - /* drop the ref. we got via find_memory_block() */
> >> - put_device(&memory->dev);
> >> + /* drop the ref. we got via memory_block_get() */
> >> + memory_block_put(memory);
> >> device_unregister(&memory->dev);
> >> }
> >> 
> >> @@ -880,7 +870,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
> >> end_block_id = block_id;
> >> for (block_id = start_block_id; block_id != end_block_id;
> >>      block_id++) {
> >> - mem = find_memory_block_by_id(block_id);
> >> + mem = memory_block_get(block_id);
> >> if (WARN_ON_ONCE(!mem))
> >> continue;
> >> remove_memory_block(mem);
> >> @@ -908,7 +898,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
> >> return;
> >> 
> >> for (block_id = start_block_id; block_id != end_block_id; block_id++) {
> >> - mem = find_memory_block_by_id(block_id);
> >> + mem = memory_block_get(block_id);
> >> if (WARN_ON_ONCE(!mem))
> >> continue;
> >> num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
> >> @@ -1015,12 +1005,12 @@ int walk_memory_blocks(unsigned long start, unsigned long size,
> >> return 0;
> >> 
> >> for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
> >> - mem = find_memory_block_by_id(block_id);
> >> + mem = memory_block_get(block_id);
> >> if (!mem)
> >> continue;
> >> 
> >> ret = func(mem, arg);
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> if (ret)
> >> break;
> >> }
> >> @@ -1228,22 +1218,22 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
> >> void memblk_nr_poison_inc(unsigned long pfn)
> >> {
> >> const unsigned long block_id = pfn_to_block_id(pfn);
> >> - struct memory_block *mem = find_memory_block_by_id(block_id);
> >> + struct memory_block *mem = memory_block_get(block_id);
> >> 
> >> if (mem) {
> >> atomic_long_inc(&mem->nr_hwpoison);
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> }
> >> }
> >> 
> >> void memblk_nr_poison_sub(unsigned long pfn, long i)
> >> {
> >> const unsigned long block_id = pfn_to_block_id(pfn);
> >> - struct memory_block *mem = find_memory_block_by_id(block_id);
> >> + struct memory_block *mem = memory_block_get(block_id);
> >> 
> >> if (mem) {
> >> atomic_long_sub(i, &mem->nr_hwpoison);
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> }
> >> }
> >> 
> >> diff --git a/drivers/base/node.c b/drivers/base/node.c
> >> index 126f66aa2c3e..b3333ca92090 100644
> >> --- a/drivers/base/node.c
> >> +++ b/drivers/base/node.c
> >> @@ -847,13 +847,13 @@ static void register_memory_blocks_under_nodes(void)
> >> for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
> >> struct memory_block *mem;
> >> 
> >> - mem = find_memory_block_by_id(block_id);
> >> + mem = memory_block_get(block_id);
> >> if (!mem)
> >> continue;
> >> 
> >> memory_block_add_nid_early(mem, nid);
> >> do_register_memory_block_under_node(nid, mem);
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> }
> >> 
> >> }
> >> diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c
> >> index 78c054e26d17..6df1926d4c62 100644
> >> --- a/drivers/s390/char/sclp_mem.c
> >> +++ b/drivers/s390/char/sclp_mem.c
> >> @@ -204,7 +204,7 @@ static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute
> >> addr = sclp_mem->id * block_size;
> >> /*
> >>  * Hold device_hotplug_lock when adding/removing memory blocks.
> >> -  * Additionally, also protect calls to find_memory_block() and
> >> +  * Additionally, also protect calls to memory_block_get() and
> >>  * sclp_attach_storage().
> >>  */
> >> rc = lock_device_hotplug_sysfs();
> >> @@ -231,20 +231,19 @@ static ssize_t sclp_config_mem_store(struct kobject *kobj, struct kobj_attribute
> >> sclp_mem_change_state(addr, block_size, 0);
> >> goto out_unlock;
> >> }
> >> - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> >> - put_device(&mem->dev);
> >> + mem = memory_block_get(phys_to_block_id(addr));
> >> + memory_block_put(mem);
> >> WRITE_ONCE(sclp_mem->config, 1);
> >> } else {
> >> if (!sclp_mem->config)
> >> goto out_unlock;
> >> - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> >> + mem = memory_block_get(phys_to_block_id(addr));
> >> if (mem->state != MEM_OFFLINE) {
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> rc = -EBUSY;
> >> goto out_unlock;
> >> }
> >> - /* drop the ref just got via find_memory_block() */
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> sclp_mem_change_state(addr, block_size, 0);
> >> __remove_memory(addr, block_size);
> >> #ifdef CONFIG_KASAN
> >> @@ -294,11 +293,11 @@ static ssize_t sclp_memmap_on_memory_store(struct kobject *kobj, struct kobj_att
> >> return rc;
> >> block_size = memory_block_size_bytes();
> >> sclp_mem = container_of(kobj, struct sclp_mem, kobj);
> >> - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(sclp_mem->id * block_size)));
> >> + mem = memory_block_get(phys_to_block_id(sclp_mem->id * block_size));
> >> if (!mem) {
> >> WRITE_ONCE(sclp_mem->memmap_on_memory, value);
> >> } else {
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> rc = -EBUSY;
> >> }
> >> unlock_device_hotplug();
> >> diff --git a/include/linux/memory.h b/include/linux/memory.h
> >> index 5bb5599c6b2b..463dc02f6cff 100644
> >> --- a/include/linux/memory.h
> >> +++ b/include/linux/memory.h
> >> @@ -158,7 +158,11 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
> >> void remove_memory_block_devices(unsigned long start, unsigned long size);
> >> extern void memory_dev_init(void);
> >> extern int memory_notify(enum memory_block_state state, void *v);
> >> -extern struct memory_block *find_memory_block(unsigned long section_nr);
> >> +struct memory_block *memory_block_get(unsigned long block_id);
> >> +static inline void memory_block_put(struct memory_block *mem)
> >> +{
> >> + put_device(&mem->dev);
> >> +}
> > 
> > Hi Muchun,
> 
> Hi,
> 
> > 
> > Thanks for the work, I have a small suggestion if that fits your thought.
> > 
> > I think we should at least add a comment above memory_block_put() to remind the caller to check
> > for the availability of "mem" before calling this function.
> > We perform the check in memory_block_get() inside the function body, and I see different usage patterns
> > across the callers when they're dealing with "mem == NULL" and avoiding the call to memory_block_put().
> > I can understand that we should leave the check to the caller, not do it inside memory_block_put().
> > But just in case the next caller might forget to do the check or think the behavior is symmetric
> > between memory_block_get() and memory_block_put(), a comment above the function would be nice.
> 
> Thanks for the suggestion!
> 
> Regarding the additional comment, I feel they might not be strictly necessary.
> If a user passes a NULL pointer, the issue would be exposed immediately
> before memory_block_put() is even called. It's unlikely a user would obtain
> mem and not perform any read/write operations; any such attempt would trigger
> a NULL pointer dereference right away.
>

Makes sense.
 
> As for adding comments to memory_block_get(), I’m wondering if it’s truly
> essential. The function is quite straightforward—anyone looking at the
> definition would see the implementation alongside the comments. It’s very
> clear from the code that mem must be non-NULL to be "gotten."
> 
> Overall, I’m concerned the extra comments might not add much value. What do
> you think?
> 

Agreed, let's leave it as it is then.

Best regards,
Richard Cheng.

> Thanks,
> Muchun
> > 
> > Best regards,
> > Richard Cheng.
> > 
> >> typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
> >> extern int walk_memory_blocks(unsigned long start, unsigned long size,
> >>       void *arg, walk_memory_blocks_func_t func);
> >> @@ -171,7 +175,6 @@ struct memory_group *memory_group_find_by_id(int mgid);
> >> typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
> >> int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
> >>        struct memory_group *excluded, void *arg);
> >> -struct memory_block *find_memory_block_by_id(unsigned long block_id);
> >> #define hotplug_memory_notifier(fn, pri) ({ \
> >> static __meminitdata struct notifier_block fn##_mem_nb =\
> >> { .notifier_call = fn, .priority = pri };\
> >> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> >> index 462d8dcd636d..890c6453e887 100644
> >> --- a/mm/memory_hotplug.c
> >> +++ b/mm/memory_hotplug.c
> >> @@ -1417,14 +1417,13 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
> >> struct vmem_altmap *altmap = NULL;
> >> struct memory_block *mem;
> >> 
> >> - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
> >> + mem = memory_block_get(phys_to_block_id(cur_start));
> >> if (WARN_ON_ONCE(!mem))
> >> continue;
> >> 
> >> altmap = mem->altmap;
> >> mem->altmap = NULL;
> >> - /* drop the ref. we got via find_memory_block() */
> >> - put_device(&mem->dev);
> >> + memory_block_put(mem);
> >> 
> >> remove_memory_block_devices(cur_start, memblock_size);
> >> 
> >> 
> >> base-commit: e98d21c170b01ddef366f023bbfcf6b31509fa83
> >> -- 
> >> 2.54.0
> 
> 


^ permalink raw reply

* Re: [PATCH 1/1] powerpc/pseries/pci: quirks: Add pseries TG3 D3hot delay quirk for EEH stability
From: Harsh Prateek Bora @ 2026-05-13  6:49 UTC (permalink / raw)
  To: Narayana Murty N, linuxppc-dev, maddy, mpe, npiggin
  Cc: christophe.leroy, linux-kernel, mahesh, vaibhav, sbhat
In-Reply-To: <20260428093529.7618-1-nnmlinux@linux.ibm.com>

Hi Narayana,

Few comments inline below ..

On 28/04/26 3:05 pm, Narayana Murty N wrote:
> On pseries platforms, binding Broadcom PCIe NIC devices to the vfio-pci
> driver frequently triggers an unintended EEH (Extended Error Handling)
> isolation event.
> 

Commit log title can be rephrased to say "Broadcom TG3" instead of 
"pseries TG3". Also, need to mention TG3 in commit log as well.

> This occurs because the device firmware violates PCIe specification
> recovery timings when transitioning from the D3hot to D0 power state
> during the binding process. The strict pseries PHB catches the
> resulting Unsupported Request during the subsequent configuration
> space read, assuming a device failure.
> 
> Add a pseries-specific PCI fixup quirk for Broadcom devices to

Not all broadcom devices, only TG3 specific, right ?

> explicitly extend the `d3hot_delay` to 200ms. This forces the PCI
> core to sleep long enough for the firmware to safely complete the
> D0 transition before attempting config reads, ensuring a clean VFIO
> passthrough initialization.

Do we need a "Fixes:" tag here to mention specific TG3 device support 
being tested/fixed ?

> 
> Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
> ---
>   arch/powerpc/platforms/pseries/pci.c | 21 +++++++++++++++++++++
>   1 file changed, 21 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
> index 84e4ffe957a8..5f3cf9a7bdd3 100644
> --- a/arch/powerpc/platforms/pseries/pci.c
> +++ b/arch/powerpc/platforms/pseries/pci.c
> @@ -291,3 +291,24 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
>   	bus->cur_bus_speed = prop_to_pci_speed(pcie_link_speed_stats[1]);
>   	return 0;
>   }
> +
> +/*
> + * Workaround for sluggish PCIe device firmware.

Mention Broadcom TG3 (specific device details) here as well, until the 
routine broadens its scope.

Also, which specific Broadcom device IDs exhibit this issue?
Are all TG3 devices affected, or only specific models? Please document.

> + *
> + * The device violates the PCIe spec recovery timing when transitioning
> + * from D3hot to D0. On standard architectures this is often ignored, but
> + * the strict PowerPC pseries PHB catches the Unsupported Request during
> + * the subsequent config read and triggers an EEH.
> + *
> + * We inject a longer delay to ensure the device is ready before the PCI
> + * core attempts to access configuration space.
> + */
> +static void quirk_pseries_d0_wake_delay(struct pci_dev *dev)
> +{
> +	dev->d3hot_delay = 200;

No justification provided for using this value. Is this minimum or
maximum from test observations? Please share test details in comments
and/or commit log. Also, I see the Marvell sky2 driver using 300ms, not sure
if we may need to increase. Preferably, use a macro to define this
constant? Should it be a tg3 driver level fix during probe or be placed
in drivers/pci/quirks.c ? Also, is there an impact on system boot time?

> +	pci_info(dev, "pseries  Quirk:D3hot->D0 delay %d ms to prevent EEH\n",

Double space after "pseries", missing space after "Quirk:"
Also pci_dbg() may be more appropriate?

> +		 dev->d3hot_delay);
> +}
> +/* Blanket application to ALL Broadcom PCI devices */
> +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
> +			PCI_ANY_ID, quirk_pseries_d0_wake_delay);

Replace PCI_ANY_ID with specific device IDs that are known to have this 
issue, or provide strong justification for the broad scope as it may 
impact system boot time.



^ permalink raw reply

* Re: [PATCH 0/3] MCE robustness fixes and LKDTM powerpc enhancements
From: Sayali Patil @ 2026-05-13  7:08 UTC (permalink / raw)
  To: linuxppc-dev, maddy, Ritesh Harjani (IBM), sshegde
  Cc: linux-kernel, Mahesh Salgaonkar, chleroy
In-Reply-To: <cover.1778057685.git.sayalip@linux.ibm.com>



On 06/05/26 14:36, Sayali Patil wrote:
> Hi all,
> 
> This series fixes a panic observed while handling machine check exceptions
> (MCEs) from real mode. It also improves the reliability of the
> PPC_SLB_MULTIHIT test by adding isync instructions after slbmte operations,
> and introduces a new LKDTM trigger (PPC_RADIX_TLBIEL) to validate MCE
> behavior on radix MMU.
> 
> Please review the patches and provide any feedback or suggestions
> for improvement.
> 
> Thanks,
> Sayali
> 
> Sayali Patil (3):
>    powerpc/time: remove preempt_disable/enable from arch_irq_work_raise()
>    lkdtm/powerpc: add isync after slbmte to enforce SLB update ordering
>    lkdtm/powerpc: add PPC_RADIX_TLBIEL test for radix MCE validation
> 
>   arch/powerpc/kernel/time.c              |  2 --
>   drivers/misc/lkdtm/Makefile             |  2 +-
>   drivers/misc/lkdtm/powerpc.c            | 44 +++++++++++++++++++++++++
>   tools/testing/selftests/lkdtm/tests.txt |  1 +
>   4 files changed, 46 insertions(+), 3 deletions(-)
> 


Sashiko has provided some comments 
(https://sashiko.dev/#/patchset/cover.1778057685.git.sayalip%40linux.ibm.com), 
so I am planning to split this into two patch series. I will send v2 
part 1 today with the proposed change for "powerpc/time: Remove 
preempt_disable/enable from arch_irq_work_raise()", and v2 part 2 with 
the remaining two patches later.

Thanks for the review, Ritesh and Shrikanth.

Regards,
Sayali


^ permalink raw reply

* Re: [PATCH v2] powerpc/pseries/iommu: export DMA window data to user space
From: Vaibhav Jain @ 2026-05-13  7:10 UTC (permalink / raw)
  To: Gaurav Batra; +Cc: linuxppc-dev, sbhat, ritesh.list, Brian King, maddy
In-Reply-To: <20260507180646.40356-1-gbatra@linux.ibm.com>

Gaurav Batra <gbatra@linux.ibm.com> writes:

Thanks for the v2 patch. My review comments below:

General comment: I see some issues in the patch that checkpatch would
have flagged. Can you please also ensure that there are no
checkpatch-related warnings before you send the patch?

Optional comment:
Please split the patch into two, moving the documentation changes into a
separate patch.

> Export PowerPC DMA window information (both default 2GB and Dynamic
> larger window) to user space via sysfs. Each of these DMA windows has
> attributes like size of the window, page size backing the window, mode,
> etc. Each of these attributes is exported for user space consumption as a
> file.
>
> PowerPC Host Bridge (PHB) can have multiple devices/functions sharing
> the same DMA window. For each PHB, iommu registration creates an iommu
> device under "/sys/devices/virtual/iommu".
>
> These devices will have 2 groups created to export Default and DDW
> attributes.
>
> Reviewed-by: Brian King <brking@linux.ibm.com>
> Reviewed-by: Vaibhav Jain <vaibhav@linux.ibm.com>

Thanks for incorporating my review comments from the previous
iteration. However, I don't remember reviewing the v2 of this patch
before. Can you please avoid presumptively adding my R-b until I have a
chance to review the patch?

> Reviewed-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> Signed-off-by: Gaurav Batra <gbatra@linux.ibm.com>
> ---
> V1 -> V2 change log:
>
> 1. Shiva: "weight" the it_map for the bitmap. This avoids using an extra
>    counter in the table. Please look into how iommu_debugfs_weight_get()
>    does this
>
>    Response: Incorporated changes
>
> 2. Vaibhav: If the DMA window is not available, show function should just
>    return ENOENT so that userspace know the error instantly instead of
>    having to parse the sysfs contents.
>
>    Response: Incorporated changes, returning ENODATA
>
> 3. Vaibhav: All the show functions have similar template. Please convert
>    them to macros expansion to reduce code volume.
>
>    Response: Incorporated changes
>
> 4. Vaibhav: These new attributes are PSeries specific but they are being
>    setup in ppc generic iommu code at arch/powerpc/kernel/iommu.c. Can
>    you move these attributes to arch/powerpc/platforms/pseries/iommu.c
>
>    Response: I have split the attributes and moved them to pseries specific
>    files. The original group "spapr-tce-iommu", is moved to PowerNV code
>    base to retain the legacy functionality.
>
>    I tested the changes both on Pseries and PowerNV.
>
> 5. Vaibhav: It would be better to use function iommu_table_inuse_tces() as
>    a callback in iommu_table_ops which can be implemented by pseries and
>    powernv code differently.
>
>    Response: the function is no longer needed after changes in #1
>
> 6. Vaibhav: Since sysfs is ABI can you propose appropriate entries under
>    Documentation/ABI/testing
>
>    Response: Added documentation
>
>  ...sfs-devices-virtual-iommu-dma_window_attrs |  21 ++
>  .../arch/powerpc/dma_window_attributes.rst    |  65 +++++
>  arch/powerpc/include/asm/pci-bridge.h         |   4 +
>  arch/powerpc/kernel/iommu.c                   |  16 +-
>  arch/powerpc/platforms/powernv/pci-ioda.c     |  16 ++
>  arch/powerpc/platforms/pseries/iommu.c        | 261 ++++++++++++++++++
>  arch/powerpc/platforms/pseries/pci_dlpar.c    |   2 +
>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
>  arch/powerpc/platforms/pseries/setup.c        |   2 +
>  9 files changed, 373 insertions(+), 15 deletions(-)
>  create mode 100644 Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
>  create mode 100644 Documentation/arch/powerpc/dma_window_attributes.rst
>
> diff --git a/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
> new file mode 100644
> index 000000000000..18ba63874276
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
> @@ -0,0 +1,21 @@
> +What:
> /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-ddw/*
Suggested:
s/iommu-isolation/iommu-group/
> +Date:       Oct 2025
> +Contact:    linuxppc-dev@lists.ozlabs.org
> +Description:    read only
> +    For each IOMMU isolation unit spapr-tce-ddw sub-directory provides
> +    attributes to query information related to the bigger Dynamic DMA
> +    window (DDW) in the PowerPC virtualized platforms.
> +
> +    See Documentation/arch/powerpc/dma_window_attributes.rst for more
> +    information.
> +
> +What:       /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-dma/*
> +Date:       Oct 2025
> +Contact:    linuxppc-dev@lists.ozlabs.org
> +Description:    read only
> +    For each IOMMU isolation unit spapr-tce-dma sub-directory provides
> +    attributes to query information related to the default 2GB DMA
> +    window in the PowerPC virtualized platforms.
> +
> +    See Documentation/arch/powerpc/dma_window_attributes.rst for more
> +    information.
sysfs ABI documentation typically describes all the attribute files rather
than the directory. Please add details of the individual attributes that you
are adding here.

> diff --git a/Documentation/arch/powerpc/dma_window_attributes.rst b/Documentation/arch/powerpc/dma_window_attributes.rst
> new file mode 100644
> index 000000000000..8bd9aec8539d
> --- /dev/null
> +++ b/Documentation/arch/powerpc/dma_window_attributes.rst
> @@ -0,0 +1,65 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================
> +DMA Window Attributes
> +=====================
> +
> +In PowerPC architecture there are 2 types of DMA windows -
> +
This is only true for PPC64-PSeries not for PPC64-PowerNV

> +1. Default 2GB DMA window which is backed by 4K page size
> +2. A bigger Dynamic DMA Window (DDW) which is backed by larger page size
> +   (64K or 2MB)
> +
> +A dedicated device will have both the DMA windows instantiated but an SR-IOV
> +device will only have the bigger Dynamic DMA Window.
In the context of PSeries, please give some context about 'dedicated device'.

> +
> +The attributes of these 2 DMA windows are exported to user space via sysfs.
> +Each IOMMU isolation unit will have its directory created under
> +/sys/devices/virtual/iommu.
> +
> +As an example, iommu-phb0001
> +
> +Under each IOMMU isolation unit, there will be a group of attributes for
> +"Default 2GB DMA Window" and "Dynamic DMA Window" - spapr-tce-dma and
> +spapr-tce-ddw respectively.
> +
> +Attributes under each group
> +
> +spapr-tce-ddw:
> +direct_address  dynamic_address       dynamic_size  window_type
> +direct_size     dynamic_pages_mapped  page_size
> +
> +spapr-tce-dma:
> +dynamic_address  dynamic_pages_mapped  dynamic_size  page_size
> +
> +
> +The bigger Dynamic DMA Window is configured into pre-mapped and/or dynamically
> +allocated TCEs. If the DDW is in "Hybrid" mode, then both the Direct
> +(pre-mapped) and Dynamic part of the DMA window will have valid values. Hybrid
> +mode is valid only for SR-IOV devices.
> +
> +DMA Window properties:
> +
> +direct_address              Starting address of the pre-mapped DMA window
> +direct_size                 Size of the pre-mapped DMA Window
> +dynamic_address             Starting address of the dynamic allocations
> +dynamic_size                Size of the dynamic allocation window
> +dynamic_pages_mapped        Pages mapped for DMA by dynamic allocations
> +page_size                   Page size backing the DMA window
> +window_type                 Type of the DMA Window (Direct/Dynamic/Hybrid)
> +

These attributes should also be documented in the sysfs/ABI documentation.

> +
> +An example of DDW attributes for an SR-IOV device::
> +
> +    $ cd /sys/devices/virtual/iommu/iommu-phb0001/spapr-tce-ddw
> +
> +    $ grep . *
> +
> +    direct_address:0x800000000000000   <-- Starting addr of pre-mapped Window
> +    direct_size:137438953472           <-- Size of pre-mapped Window (128GB)
> +    dynamic_address:0x800002000000000  <-- Starting addr of Dynamic allocations
> +    dynamic_size:412316860416          <-- Size of dynamic allocation window (384GB)
> +    dynamic_pages_mapped:270           <-- Pages mapped by dynamic allocations
> +    page_size:2097152                  <-- DMA window page size (2MB)
> +    window_type:Hybrid                 <-- window has both pre-mapped and
> +                                           dynamic sections
Suggested:
This documentation can be improved by moving details on sysfs attrs and
adding details on how 2 different types of DMA windows are allocated and managed.

> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 1dae53130782..9b09178aca5e 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -124,6 +124,10 @@ struct pci_controller {
>  	resource_size_t dma_window_base_cur;
>  	resource_size_t dma_window_size;
>  
> +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
> +	const struct attribute_group **iommu_groups;
> +#endif
Ideally addition of new members to a struct should be done at the end
to preserve KABI.
Naming issue: s/iommu_groups/iommu_group_attrs/

> +
>  #ifdef CONFIG_PPC64
>  	unsigned long buid;
>  	struct pci_dn *pci_data;
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0ce71310b7d9..d6242e3f77da 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1269,24 +1269,10 @@ static const struct iommu_ops spapr_tce_iommu_ops = {
>  	.device_group = spapr_tce_iommu_device_group,
>  };
>


> -static struct attribute *spapr_tce_iommu_attrs[] = {
> -	NULL,
> -};
> -
> -static struct attribute_group spapr_tce_iommu_group = {
> -	.name = "spapr-tce-iommu",
> -	.attrs = spapr_tce_iommu_attrs,
> -};
> -
> -static const struct attribute_group *spapr_tce_iommu_groups[] = {
> -	&spapr_tce_iommu_group,
> -	NULL,
> -};
> -
>  void ppc_iommu_register_device(struct pci_controller *phb)
>  {
>  	iommu_device_sysfs_add(&phb->iommu, phb->parent,
> -				spapr_tce_iommu_groups, "iommu-phb%04x",
> +				phb->iommu_groups, "iommu-phb%04x",
>  				phb->global_number);
>  	iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops,
>  				phb->parent);
Since you are changing this code, can you check for a NULL
phb->iommu_groups and also check for errors returned from these two
functions?
In case phb->iommu_groups == NULL you can skip registering sysfs. That
will take care of the POWERNV case.

> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 1c78fdfb7b03..0887f154955e 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2493,6 +2493,20 @@ static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
>  	.shutdown		= pnv_pci_ioda_shutdown,
>  };
>  
> +static struct attribute *pnv_tce_iommu_attrs[] = {
> +	NULL,
> +};
> +
> +static struct attribute_group pnv_tce_iommu_group = {
> +	.name = "spapr-tce-iommu",
> +	.attrs = pnv_tce_iommu_attrs,
> +};
> +
> +static const struct attribute_group *pnv_tce_iommu_groups[] = {
> +	&pnv_tce_iommu_group,
> +	NULL,
> +};
> +
>  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  					 u64 hub_id, int ioda_type)
>  {
> @@ -2697,6 +2711,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  		hose->controller_ops = pnv_pci_ioda_controller_ops;
>  	}
>  
> +	hose->iommu_groups = pnv_tce_iommu_groups;
> +
See the previous comment for optimization. This proposed hunk can be
removed.

>  	ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
>  
>  #ifdef CONFIG_PCI_IOV
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 5497b130e026..28be7a45761d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -56,6 +56,20 @@ enum {
>  	DDW_EXT_LIMITED_ADDR_MODE = 3
>  };
>  
> +/* used by sysfs when querying Dynamic/Default DMA Window data */
> +struct dma_win_data {
> +	u32     page_size;
> +	u64     direct_address;
> +	u64     direct_size;
> +	u64     dynamic_address;
> +	u64     dynamic_size;
> +	u32     dynamic_pages_mapped;
> +	char    window_type[15];
Why do you need to hold a string representation of the window_type? Can
this be replaced by an enum, which takes much less space?

> +};
> +
> +#define SPAPR_SUCCESS		0
> +#define SPAPR_ERROR			-1

Returning 0 or -1 is a common and well-known convention for kernel
functions, so you need not create separate macros for them.

Also, the indentation looks strange.
> +
>  static struct iommu_table *iommu_pseries_alloc_table(int node)
>  {
>  	struct iommu_table *tbl;
> @@ -837,6 +851,253 @@ static struct device_node *pci_dma_find(struct device_node *dn,
>  	return rdn;
>  }
>  
> +/* Get DDW information for the device */
> +static int gather_ddw_info(struct device *dev, struct dma_win_data *data)
> +{
> +	struct iommu_device *iommu;
> +	struct pci_controller *phb;
> +	struct device_node *dn;
> +	struct pci_dn *pci;
> +	const __be32 *prop = NULL;
> +	bool ddw_direct = false;
> +	bool found = false;
> +	struct iommu_table *tbl;
> +	u32 pgshift;
> +	struct dynamic_dma_window_prop *p;
> +
> +	memset(data, 0, sizeof(*data));
> +
> +	iommu = dev_get_drvdata(dev);
> +	phb = container_of(iommu, struct pci_controller, iommu);
> +	dn = phb->dn;
> +
> +	if (!dn)
> +		return SPAPR_ERROR;
> +
> +	pci = PCI_DN(dn);
> +	if (!pci || !pci->table_group)
> +		return SPAPR_ERROR;
> +
> +	/* Find DDW */
> +	prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> +	if (prop) {
> +		ddw_direct = true;
> +		found = true;
> +	} else {
> +		prop = of_get_property(dn, DMA64_PROPNAME, NULL);
> +		if (prop)
> +			found = true;
> +	}
> +
> +	/* NO DDW */
> +	if (!found)
> +		return SPAPR_ERROR;
> +
> +	p = (struct dynamic_dma_window_prop *)prop;
> +
> +	pgshift = be32_to_cpu(p->tce_shift);
> +	if (pgshift != 0xc && pgshift != 0x10 && pgshift != 0x15)
> +		data->page_size = 0;
> +	else
> +		data->page_size = 1 << pgshift;
> +
> +	/* Check if DDW has table associated with it. Having a table associated with
> +	 * DDW is indicative that is has some dynamic TCE allocations. In this case the
> +	 * DDW can be fully Dynamic or in Hybrid mode. For SR-IOV DDW is on index 0,
> +	 * for dedicated adapter on index 1.
> +	 */
> +	found = false;
> +	for (int i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
Variable naming: avoid 'i'. Also, please hoist the loop variable.
> +		tbl = pci->table_group->tables[i];
> +
> +		if (tbl && tbl->it_index == be32_to_cpu(p->liobn)) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	/* set the parameters depnding on the DDW type */
> +	if (ddw_direct && found) {          /* Hybrid */
> +		data->direct_address = be64_to_cpu(p->dma_base);
> +		data->dynamic_size = (u64)(tbl->it_size << tbl->it_page_shift);
May want to check for possible overflow

> +
> +		data->dynamic_address = data->direct_address
> +								+ (u64)(1UL << be32_to_cpu(p->window_shift))
> +								- data->dynamic_size;
May want to check for possible overflow


> +
> +		data->direct_size = data->dynamic_address - data->direct_address;
> +		data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> +		sprintf(data->window_type, "%s", "Hybrid");
> +	} else if (ddw_direct && !found) {    /* Direct */
> +		data->direct_address = be64_to_cpu(p->dma_base);
> +		data->direct_size = (u64)(1UL << be32_to_cpu(p->window_shift));
> +
> +		sprintf(data->window_type, "%s", "Direct");
> +	} else {                              /* Dynamic */
> +		data->dynamic_address = be64_to_cpu(p->dma_base);
> +		data->dynamic_size = (u64)(1UL << be32_to_cpu(p->window_shift));
> +		data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> +		sprintf(data->window_type, "%s", "Dynamic");
> +	}
> +
> +	return SPAPR_SUCCESS;
> +}
> +
> +/* Get DDW information for the device */
> +static int gather_dma_info(struct device *dev, struct dma_win_data *data)
> +{
> +	struct iommu_device *iommu;
> +	struct pci_controller *phb;
> +	struct device_node *dn;
> +	struct pci_dn *pci;
> +	const __be32 *prop = NULL;
> +	struct iommu_table *tbl;
> +	unsigned long offset, size, liobn;
> +
> +	memset(data, 0, sizeof(*data));
> +
> +	iommu = dev_get_drvdata(dev);
> +	phb = container_of(iommu, struct pci_controller, iommu);
> +	dn = phb->dn;
> +
> +	if (!dn)
> +		return SPAPR_ERROR;
> +
> +	pci = PCI_DN(dn);
> +	if (!pci || !pci->table_group)
> +		return SPAPR_ERROR;
> +
> +	/* search for default DMA window */
> +	prop = of_get_property(dn, "ibm,dma-window", NULL);
> +
> +	if (!prop)
> +		return SPAPR_ERROR;
> +
> +	/* default DMA Window is always at index 0 */
> +	tbl = pci->table_group->tables[0];
> +	if (!tbl)
> +		return SPAPR_ERROR;
> +
> +	of_parse_dma_window(dn, prop, &liobn, &offset, &size);
> +
> +	data->dynamic_address = offset;
> +	data->dynamic_size = size;
> +	data->page_size = 1ULL << IOMMU_PAGE_SHIFT_4K;
> +	data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> +	return SPAPR_SUCCESS;
> +}
> +
> +#define DEVICE_SHOW_DDW(_name, _fmt)							\
> +ssize_t ddw_##_name##_show(struct device *dev,					\
> +								  struct device_attribute *attr,\
> +								  char *buf)					\
> +{																\
> +	int rc = 0;													\
> +	struct dma_win_data data;									\
> +																\
> +	rc = gather_ddw_info(dev, &data);							\
> +																\
> +	if (rc == SPAPR_SUCCESS)									\
> +		return sysfs_emit(buf, _fmt, data._name);				\
> +	else														\
> +		return -ENODATA;										\
> +}																\
All the device tree data that gather_{ddw,dma}_info() collects, except
bitmap_weight, is static in nature and need not be refreshed at each call
to xx_show(). This can be optimized.

> +
> +#define DEVICE_SHOW_DMA(_name, _fmt)							\
> +ssize_t dma_##_name##_show(struct device *dev,					\
> +								  struct device_attribute *attr,\
> +								  char *buf)					\
> +{																\
> +	int rc = 0;													\
> +	struct dma_win_data data;									\
> +																\
> +	rc = gather_dma_info(dev, &data);							\
> +																\
> +	if (rc == SPAPR_SUCCESS)									\
> +		return sysfs_emit(buf, _fmt, data._name);				\
> +	else														\
> +		return -ENODATA;										\
> +}																\
> +

Indentation looks strange.
Also, can you just return the 'rc' from gather_{ddw,dma}_info() back from
xx_show() rather than ENODATA?

> +static DEVICE_SHOW_DDW(direct_address, "%#llx\n");
> +static DEVICE_SHOW_DDW(direct_size, "%lld\n");
> +static DEVICE_SHOW_DDW(page_size, "%d\n");
> +static DEVICE_SHOW_DDW(window_type, "%s\n");
> +static DEVICE_SHOW_DDW(dynamic_address, "%#llx\n");
> +static DEVICE_SHOW_DDW(dynamic_size, "%lld\n");
> +static DEVICE_SHOW_DDW(dynamic_pages_mapped, "%d\n");
> +static DEVICE_SHOW_DMA(dynamic_address, "%#llx\n");
> +static DEVICE_SHOW_DMA(dynamic_size, "%lld\n");
> +static DEVICE_SHOW_DMA(page_size, "%d\n");
> +static DEVICE_SHOW_DMA(dynamic_pages_mapped, "%d\n");
Avoid putting '\n's at the end of strings. Makes parsing contents
tricky.

> +
> +#define DEVICE_ATTR_DDW(_name)                              \
> +		struct device_attribute dev_attr_ddw_##_name =      \
> +			__ATTR(_name, 0444, ddw_##_name##_show, NULL)
> +#define DEVICE_ATTR_DMA(_name)                              \
> +		struct device_attribute dev_attr_dma_##_name =      \
> +		__ATTR(_name, 0444, dma_##_name##_show, NULL)
> +
> +static DEVICE_ATTR_DDW(direct_address);
> +static DEVICE_ATTR_DDW(direct_size);
> +static DEVICE_ATTR_DDW(page_size);
> +static DEVICE_ATTR_DDW(window_type);
> +static DEVICE_ATTR_DDW(dynamic_address);
> +static DEVICE_ATTR_DDW(dynamic_size);
> +static DEVICE_ATTR_DDW(dynamic_pages_mapped);
> +static DEVICE_ATTR_DMA(dynamic_address);
> +static DEVICE_ATTR_DMA(dynamic_size);
> +static DEVICE_ATTR_DMA(page_size);
> +static DEVICE_ATTR_DMA(dynamic_pages_mapped);
> +
> +static struct attribute *spapr_tce_ddw_attrs[] = {
> +	&dev_attr_ddw_direct_address.attr,
> +	&dev_attr_ddw_direct_size.attr,
> +	&dev_attr_ddw_page_size.attr,
> +	&dev_attr_ddw_window_type.attr,
> +	&dev_attr_ddw_dynamic_address.attr,
> +	&dev_attr_ddw_dynamic_size.attr,
> +	&dev_attr_ddw_dynamic_pages_mapped.attr,
> +	NULL,
> +};
> +
> +static struct attribute *spapr_tce_dma_attrs[] = {
> +	&dev_attr_dma_dynamic_address.attr,
> +	&dev_attr_dma_dynamic_size.attr,
> +	&dev_attr_dma_page_size.attr,
> +	&dev_attr_dma_dynamic_pages_mapped.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group spapr_tce_ddw_group = {
> +	.name = "spapr-tce-ddw",
> +	.attrs = spapr_tce_ddw_attrs,
> +};
> +
> +static struct attribute_group spapr_tce_dma_group = {
> +	.name = "spapr-tce-dma",
> +	.attrs = spapr_tce_dma_attrs,
> +};
> +
> +static struct attribute *spapr_tce_iommu_attrs[] = {
> +	NULL,
> +};
> +
> +static struct attribute_group spapr_tce_iommu_group = {
> +	.name = "spapr-tce-iommu",
> +	.attrs = spapr_tce_iommu_attrs,
> +};
> +
> +const struct attribute_group *spapr_tce_iommu_groups[] = {
> +	&spapr_tce_iommu_group,
> +	&spapr_tce_ddw_group,
> +	&spapr_tce_dma_group,
> +	NULL,
> +};
> +
>  static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  {
>  	struct iommu_table *tbl;
> diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
> index 8c77ec7980de..b457451a2814 100644
> --- a/arch/powerpc/platforms/pseries/pci_dlpar.c
> +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
> @@ -45,6 +45,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
>  	pci_process_bridge_OF_ranges(phb, dn, 0);
>  	phb->controller_ops = pseries_pci_controller_ops;
>  
> +	phb->iommu_groups = spapr_tce_iommu_groups;
> +
>  	pci_devs_phb_init_dynamic(phb);
>  
>  	pseries_msi_allocate_domains(phb);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 3968a6970fa8..4cf0b7a4e96a 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -128,4 +128,5 @@ struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose,
>  					     struct pci_dev *pdev);
>  #endif
>  
> +extern const struct attribute_group *spapr_tce_iommu_groups[];
>  #endif /* _PSERIES_PSERIES_H */
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 50b26ed8432d..4d877aae0560 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -512,6 +512,8 @@ static void __init pSeries_discover_phbs(void)
>  		isa_bridge_find_early(phb);
>  		phb->controller_ops = pseries_pci_controller_ops;
>  
> +		phb->iommu_groups = spapr_tce_iommu_groups;
> +
>  		/* create pci_dn's for DT nodes under this PHB */
>  		pci_devs_phb_init_dynamic(phb);
>  
> base-commit: 192c0159402e6bfbe13de6f8379546943297783d
> -- 
> 2.39.3
>

-- 
Cheers
~ Vaibhav


^ permalink raw reply

* [PATCH v2] powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise() - part1
From: Sayali Patil @ 2026-05-13  8:14 UTC (permalink / raw)
  To: linuxppc-dev, maddy
  Cc: linux-kernel, Ritesh Harjani, Mahesh Salgaonkar, sshegde, chleroy

A kernel panic is observed when handling machine check exceptions from
real mode.

  BUG: Unable to handle kernel data access on read at 0xc00000006be21300
  Oops: Kernel access of bad area, sig: 11 [#1]
  MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 88222248  XER: 00000005
  CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0
  NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70
  LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150
  Call Trace:
  [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150
  [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0

The crash occurs because arch_irq_work_raise() calls preempt_disable()
from machine check exception (MCE) handlers running in real mode. In
this context, accessing the preempt_count can fault, leading to the panic.

The preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
was originally added by commit 0fe1ac48bef0 ("powerpc/perf_event: Fix
oops due to perf_event_do_pending call") to avoid races while raising
irq work from exception context.

Later, commit 471ba0e686cb ("irq_work: Do not raise an IPI when
queueing work on the local CPU") added preemption protection in
irq_work_queue() path, while commit 20b876918c06 ("irq_work: Use per
cpu atomics instead of regular atomics") added equivalent
protection in irq_work_queue_on() before reaching arch_irq_work_raise():

  irq_work_queue() / irq_work_queue_on()
    -> preempt_disable()
      -> __irq_work_queue_local()
        -> irq_work_raise()
          -> arch_irq_work_raise()

As a result, callers other than mce_irq_work_raise() already execute
with preemption disabled, making the additional
preempt_disable()/preempt_enable() pair in arch_irq_work_raise()
redundant.

The arch_irq_work_raise() function executes in NMI context when called
from MCE handler. Hence we will not be preempted or scheduled out since
we are in NMI context with MSR[EE]=0. Therefore, it is safe to remove
the preempt_disable()/preempt_enable() calls from here.

Remove it to avoid accessing preempt_count from real mode context.

Fixes: cc15ff327569 ("powerpc/mce: Avoid using irq_work_queue() in realmode")
Suggested-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Acked-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>

---

v1->v2
 - Split the patch series into two parts; part 2, containing the remaining
   two patches, will be sent later due to changes required in
   "lkdtm/powerpc: add PPC_RADIX_TLBIEL test for radix MCE validation".
 - Updated commit message as per review feedback.
 - Added comment for arch_irq_work_raise() to indicate it must be called
   with preemption disabled, as per review feedback.
 - Added Acked-By from Shrikanth and Reviewed-by tag from Ritesh.

v1: https://lore.kernel.org/all/cover.1778057685.git.sayalip@linux.ibm.com/
---
 arch/powerpc/kernel/time.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 4bbeb8644d3d..b4472288e0d4 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -458,6 +458,10 @@ DEFINE_PER_CPU(u8, irq_work_pending);
 
 #endif /* 32 vs 64 bit */
 
+/*
+ * Must be called with preemption disabled since it updates
+ * per-CPU irq_work state and programs the local CPU decrementer.
+ */
 void arch_irq_work_raise(void)
 {
 	/*
@@ -471,10 +475,8 @@ void arch_irq_work_raise(void)
 	 * which could get tangled up if we're messing with the same state
 	 * here.
 	 */
-	preempt_disable();
 	set_irq_work_pending_flag();
 	set_dec(1);
-	preempt_enable();
 }
 
 static void set_dec_or_work(u64 val)
-- 
2.52.0



^ permalink raw reply related

* Re: [PATCH 1/8] sparc/mm: remove register_page_bootmem_info()
From: Mike Rapoport @ 2026-05-13  8:25 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: David S. Miller, Andreas Larsson, Andrew Morton,
	Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, sparclinux, linux-kernel,
	linux-mm, linux-s390, linuxppc-dev
In-Reply-To: <20260511-bootmem_info_prep-v1-1-3fb0be6fc688@kernel.org>

On Mon, May 11, 2026 at 04:05:29PM +0200, David Hildenbrand (Arm) wrote:
> sparc does not select CONFIG_HAVE_BOOTMEM_INFO_NODE, therefore,
> register_page_bootmem_info_node() is a nop.
> 
> Let's just get rid of register_page_bootmem_info().
> 
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>

Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

> ---
>  arch/sparc/mm/init_64.c | 20 --------------------
>  1 file changed, 20 deletions(-)

-- 
Sincerely yours,
Mike.


^ permalink raw reply

* Re: [PATCH 2/8] mm/bootmem_info: drop initialization of page->lru
From: Mike Rapoport @ 2026-05-13  8:27 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: David S. Miller, Andreas Larsson, Andrew Morton,
	Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka,
	Suren Baghdasaryan, Michal Hocko, sparclinux, linux-kernel,
	linux-mm, linux-s390, linuxppc-dev
In-Reply-To: <20260511-bootmem_info_prep-v1-2-3fb0be6fc688@kernel.org>

On Mon, May 11, 2026 at 04:05:30PM +0200, David Hildenbrand (Arm) wrote:
> In the past, we used to store the type in page->lru.next, introduced by
> commit 5f24ce5fd34c ("thp: remove PG_buddy"). The location changed over
> the years; ever since commit 0386aaa6e9c8 ("bootmem: stop using
> page->index"), we store it alongside the info in page->private.
> 
> Consequently, there is no need to reset page->lru anymore.
> 
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>

Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

> ---
>  mm/bootmem_info.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
> index 3d7675a3ae04..a0a1ecdec8d0 100644
> --- a/mm/bootmem_info.c
> +++ b/mm/bootmem_info.c
> @@ -34,7 +34,6 @@ void put_page_bootmem(struct page *page)
>  	if (page_ref_dec_return(page) == 1) {
>  		ClearPagePrivate(page);
>  		set_page_private(page, 0);
> -		INIT_LIST_HEAD(&page->lru);
>  		kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
>  		free_reserved_page(page);
>  	}
> 
> -- 
> 2.43.0
> 

-- 
Sincerely yours,
Mike.


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox