* [PATCH v6 net-next 10/15] net: dsa: netc: introduce NXP NETC switch driver for i.MX94
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
On the i.MX94 series, the NETC IP provides full 802.1Q Ethernet switch
functionality, advanced QoS with 8 traffic classes, and a full range of
TSN standards capabilities. The switch has 3 user ports and 1 CPU port;
the CPU port is connected to an internal ENETC. Since the switch and the
internal ENETC are fully integrated within the NETC IP, no back-to-back
MAC connection is required. Instead, a light-weight "pseudo MAC" is used
between the switch and the ENETC. This translates to lower power (less
logic and memory) and lower delay (as there is no serialization delay
across this link).
Introduce the initial NETC switch driver with basic probe and remove
functionality. More features will be added in subsequent patches.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
MAINTAINERS | 11 +
drivers/net/dsa/Kconfig | 2 +
drivers/net/dsa/Makefile | 1 +
drivers/net/dsa/netc/Kconfig | 14 +
drivers/net/dsa/netc/Makefile | 3 +
drivers/net/dsa/netc/netc_main.c | 600 ++++++++++++++++++++++++++
drivers/net/dsa/netc/netc_platform.c | 49 +++
drivers/net/dsa/netc/netc_switch.h | 92 ++++
drivers/net/dsa/netc/netc_switch_hw.h | 133 ++++++
9 files changed, 905 insertions(+)
create mode 100644 drivers/net/dsa/netc/Kconfig
create mode 100644 drivers/net/dsa/netc/Makefile
create mode 100644 drivers/net/dsa/netc/netc_main.c
create mode 100644 drivers/net/dsa/netc/netc_platform.c
create mode 100644 drivers/net/dsa/netc/netc_switch.h
create mode 100644 drivers/net/dsa/netc/netc_switch_hw.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 5bbbbde6b907..78d0a6038086 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19290,6 +19290,17 @@ F: Documentation/devicetree/bindings/clock/*imx*
F: drivers/clk/imx/
F: include/dt-bindings/clock/*imx*
+NXP NETC ETHERNET SWITCH DRIVER
+M: Wei Fang <wei.fang@nxp.com>
+R: Clark Wang <xiaoning.wang@nxp.com>
+L: imx@lists.linux.dev
+L: netdev@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
+F: drivers/net/dsa/netc/
+F: include/linux/dsa/tag_netc.h
+F: net/dsa/tag_netc.c
+
NXP NETC TIMER PTP CLOCK DRIVER
M: Wei Fang <wei.fang@nxp.com>
M: Clark Wang <xiaoning.wang@nxp.com>
diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 39fb8ead16b5..4ab567c5bbaf 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -76,6 +76,8 @@ source "drivers/net/dsa/mv88e6xxx/Kconfig"
source "drivers/net/dsa/mxl862xx/Kconfig"
+source "drivers/net/dsa/netc/Kconfig"
+
source "drivers/net/dsa/ocelot/Kconfig"
source "drivers/net/dsa/qca/Kconfig"
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index f5a463b87ec2..d2975badffc0 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -21,6 +21,7 @@ obj-y += lantiq/
obj-y += microchip/
obj-y += mv88e6xxx/
obj-y += mxl862xx/
+obj-y += netc/
obj-y += ocelot/
obj-y += qca/
obj-y += realtek/
diff --git a/drivers/net/dsa/netc/Kconfig b/drivers/net/dsa/netc/Kconfig
new file mode 100644
index 000000000000..8824d30ed3ea
--- /dev/null
+++ b/drivers/net/dsa/netc/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NET_DSA_NETC_SWITCH
+ tristate "NXP NETC Ethernet switch support"
+ depends on NET_DSA && PCI
+ select NET_DSA_TAG_NETC
+ select FSL_ENETC_MDIO
+ select NXP_NTMP
+ select NXP_NETC_LIB
+ help
+ This driver supports the NXP NETC Ethernet switch, which is embedded
+ as a PCIe function of the NXP NETC IP. Note that this driver only
+ supports NETC switch versions 4.3 and later.
+
+ If compiled as a module (M), the module name is nxp-netc-switch.
diff --git a/drivers/net/dsa/netc/Makefile b/drivers/net/dsa/netc/Makefile
new file mode 100644
index 000000000000..4a5767562574
--- /dev/null
+++ b/drivers/net/dsa/netc/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NET_DSA_NETC_SWITCH) += nxp-netc-switch.o
+nxp-netc-switch-objs := netc_main.o netc_platform.o
diff --git a/drivers/net/dsa/netc/netc_main.c b/drivers/net/dsa/netc/netc_main.c
new file mode 100644
index 000000000000..8e3a3230226c
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_main.c
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC switch driver
+ * Copyright 2025-2026 NXP
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/fsl/enetc_mdio.h>
+#include <linux/if_vlan.h>
+#include <linux/of_mdio.h>
+
+#include "netc_switch.h"
+
+/* DSA .get_tag_protocol() callback: the NETC tag protocol is reported
+ * regardless of @port and @mprot.
+ */
+static enum dsa_tag_protocol netc_get_tag_protocol(struct dsa_switch *ds,
+						   int port,
+						   enum dsa_tag_protocol mprot)
+{
+	const enum dsa_tag_protocol proto = DSA_TAG_PROTO_NETC;
+
+	return proto;
+}
+
+/* Read-modify-write helper for a port register: the bits selected by @mask
+ * are replaced with @val. A warning is raised when @val has bits set outside
+ * of @mask. The register is only written back when its value changes.
+ */
+static void netc_port_rmw(struct netc_port *np, u32 reg, u32 mask, u32 val)
+{
+	u32 cur, updated;
+
+	WARN_ON(val & ~mask);
+
+	cur = netc_port_rd(np, reg);
+	updated = (cur & ~mask) | val;
+	if (updated != cur)
+		netc_port_wr(np, reg, updated);
+}
+
+/* Write @val to the MAC register @reg of a port. Nothing is written for a
+ * pseudo port. When the port has a preemption MAC, the same value is also
+ * written at @reg + NETC_PMAC_OFFSET.
+ */
+static void netc_mac_port_wr(struct netc_port *np, u32 reg, u32 val)
+{
+	if (!is_netc_pseudo_port(np)) {
+		netc_port_wr(np, reg, val);
+		if (np->caps.pmac)
+			netc_port_wr(np, reg + NETC_PMAC_OFFSET, val);
+	}
+}
+
+/* Read the capability registers of a port and record the result in np->caps:
+ * half-duplex support and presence of a preemption MAC (from PMCAPR), and
+ * whether the port uses a pseudo link (from PCAPR). Flags are only ever set
+ * here, never cleared.
+ */
+static void netc_port_get_capability(struct netc_port *np)
+{
+	struct netc_port_caps *caps = &np->caps;
+	u32 reg;
+
+	reg = netc_port_rd(np, NETC_PMCAPR);
+	if (reg & PMCAPR_HD)
+		caps->half_duplex = true;
+	if (FIELD_GET(PMCAPR_FP, reg) == FP_SUPPORT)
+		caps->pmac = true;
+
+	reg = netc_port_rd(np, NETC_PCAPR);
+	if (reg & PCAPR_LINK_TYPE)
+		caps->pseudo_link = true;
+}
+
+/* Allocate and register the external MDIO bus of a port, described by the
+ * device tree node @node. The enetc_hw object, the bus and its registration
+ * are all obtained through device-managed helpers. On success the bus is
+ * stored in np->emdio.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int netc_port_create_emdio_bus(struct netc_port *np,
+				      struct device_node *node)
+{
+	struct device *dev = np->switch_priv->dev;
+	struct enetc_mdio_priv *mdio_priv;
+	struct mii_bus *bus;
+	struct enetc_hw *hw;
+	int ret;
+
+	hw = enetc_hw_alloc(dev, np->iobase);
+	if (IS_ERR(hw))
+		return dev_err_probe(dev, PTR_ERR(hw),
+				     "Failed to allocate enetc_hw\n");
+
+	bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv));
+	if (!bus)
+		return -ENOMEM;
+
+	/* Private data used by the enetc_mdio_*() accessors below */
+	mdio_priv = bus->priv;
+	mdio_priv->hw = hw;
+	mdio_priv->mdio_base = NETC_EMDIO_BASE;
+
+	bus->parent = dev;
+	bus->name = "NXP NETC switch external MDIO Bus";
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-p%d-emdio",
+		 dev_name(dev), np->dp->index);
+	bus->read = enetc_mdio_read_c22;
+	bus->write = enetc_mdio_write_c22;
+	bus->read_c45 = enetc_mdio_read_c45;
+	bus->write_c45 = enetc_mdio_write_c45;
+
+	ret = devm_of_mdiobus_register(dev, bus, node);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Cannot register EMDIO bus\n");
+
+	np->emdio = bus;
+
+	return 0;
+}
+
+/* Create the external MDIO bus of a port if its device tree node @node has
+ * an "mdio" child node; a port without such a child is not an error.
+ *
+ * Return: 0 on success or when there is nothing to do, otherwise the error
+ * returned by netc_port_create_emdio_bus().
+ */
+static int netc_port_create_mdio_bus(struct netc_port *np,
+				     struct device_node *node)
+{
+	struct device_node *mdio_node = of_get_child_by_name(node, "mdio");
+	int ret;
+
+	if (!mdio_node)
+		return 0;
+
+	ret = netc_port_create_emdio_bus(np, mdio_node);
+	of_node_put(mdio_node);
+
+	return ret;
+}
+
+/* Program the hardware switch ID (SWCR[SWID]) from the DSA switch index.
+ *
+ * Return: 0 on success, -ERANGE when the index is zero or does not fit in
+ * the SWID field.
+ */
+static int netc_init_switch_id(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	struct dsa_switch *ds = priv->ds;
+
+	/* The value 0 is reserved for the VEPA switch and cannot be used.
+	 * 'dsa,member' is therefore a required property for the NETC switch:
+	 * the member specifies the switch ID, which must not be zero. This
+	 * keeps the hardware switch ID and the software switch ID consistent.
+	 */
+	if (!ds->index || ds->index > FIELD_MAX(SWCR_SWID)) {
+		/* ds->index is unsigned, so print it with %u */
+		dev_err(priv->dev, "Switch index %u out of range\n",
+			ds->index);
+		return -ERANGE;
+	}
+
+	/* Place the ID through the field mask used for the range check */
+	netc_base_wr(regs, NETC_SWCR, FIELD_PREP(SWCR_SWID, ds->index));
+
+	return 0;
+}
+
+/* Allocate the per-port state for every port of the switch, then attach the
+ * DSA port to each available port and create the MDIO bus of the user ports.
+ *
+ * Return: 0 on success, a negative error code otherwise. All allocations are
+ * device-managed.
+ */
+static int netc_init_all_ports(struct netc_switch *priv)
+{
+	u32 num_ports = priv->info->num_ports;
+	struct device *dev = priv->dev;
+	struct dsa_port *dp;
+	int err;
+
+	priv->ports = devm_kcalloc(dev, num_ports, sizeof(*priv->ports),
+				   GFP_KERNEL);
+	if (!priv->ports)
+		return -ENOMEM;
+
+	/* Some DSA callbacks, such as .port_disable() and
+	 * .port_stp_state_set(), may be invoked for a port even if it is
+	 * disabled. Allocating the state of every port up front avoids a
+	 * NULL pointer dereference there, without having to add a NULL check
+	 * to each of these callbacks, which would be hard to do exhaustively.
+	 */
+	for (int i = 0; i < num_ports; i++) {
+		struct netc_port *port;
+
+		port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
+		if (!port)
+			return -ENOMEM;
+
+		port->switch_priv = priv;
+		port->iobase = priv->regs.port + PORT_IOBASE(i);
+		netc_port_get_capability(port);
+		priv->ports[i] = port;
+	}
+
+	dsa_switch_for_each_available_port(dp, priv->ds) {
+		struct netc_port *port = priv->ports[dp->index];
+
+		port->dp = dp;
+		if (!dsa_port_is_user(dp))
+			continue;
+
+		err = netc_port_create_mdio_bus(port, dp->dn);
+		if (err) {
+			dev_err(dev, "Failed to create MDIO bus\n");
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/* Initialize the NTMP table versions of the switch */
+static void netc_init_ntmp_tbl_versions(struct netc_switch *priv)
+{
+	/* Version 0 is the default for every table */
+	memset(&priv->ntmp.tbl, 0, sizeof(priv->ntmp.tbl));
+}
+
+/* Allocate the array of NETC_CBDR_NUM command BD rings and initialize each
+ * ring with the addresses of its registers. If one ring fails to initialize,
+ * the rings initialized before it are freed again.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int netc_init_all_cbdrs(struct netc_switch *priv)
+{
+	void __iomem *base = priv->regs.base;
+	struct ntmp_user *ntmp = &priv->ntmp;
+	int idx, ret;
+
+	ntmp->dev = priv->dev;
+	ntmp->cbdr_num = NETC_CBDR_NUM;
+	ntmp->ring = devm_kcalloc(ntmp->dev, ntmp->cbdr_num,
+				  sizeof(*ntmp->ring), GFP_KERNEL);
+	if (!ntmp->ring)
+		return -ENOMEM;
+
+	for (idx = 0; idx < ntmp->cbdr_num; idx++) {
+		struct netc_cbdr_regs cbdr_regs;
+
+		cbdr_regs.mr = base + NETC_CBDRMR(idx);
+		cbdr_regs.bar0 = base + NETC_CBDRBAR0(idx);
+		cbdr_regs.bar1 = base + NETC_CBDRBAR1(idx);
+		cbdr_regs.pir = base + NETC_CBDRPIR(idx);
+		cbdr_regs.cir = base + NETC_CBDRCIR(idx);
+		cbdr_regs.lenr = base + NETC_CBDRLENR(idx);
+
+		ret = ntmp_init_cbdr(&ntmp->ring[idx], ntmp->dev, &cbdr_regs);
+		if (ret)
+			goto unwind;
+	}
+
+	return 0;
+
+unwind:
+	/* Rings [0, idx) were initialized; release them in reverse order */
+	while (idx-- > 0)
+		ntmp_free_cbdr(&ntmp->ring[idx]);
+
+	return ret;
+}
+
+/* Free every command BD ring of the switch.
+ *
+ * The number of rings is taken from ntmp->cbdr_num, the field that sized and
+ * bounded the ring array in netc_init_all_cbdrs(), so that the init and free
+ * paths cannot get out of sync.
+ */
+static void netc_remove_all_cbdrs(struct netc_switch *priv)
+{
+	struct ntmp_user *ntmp = &priv->ntmp;
+
+	for (int i = 0; i < ntmp->cbdr_num; i++)
+		ntmp_free_cbdr(&ntmp->ring[i]);
+}
+
+/* Prepare priv->ntmp: set the table versions, then bring up the command BD
+ * rings.
+ *
+ * Return: the result of netc_init_all_cbdrs().
+ */
+static int netc_init_ntmp_user(struct netc_switch *priv)
+{
+	int err;
+
+	netc_init_ntmp_tbl_versions(priv);
+	err = netc_init_all_cbdrs(priv);
+
+	return err;
+}
+
+/* Counterpart of netc_init_ntmp_user(): frees all command BD rings */
+static void netc_free_ntmp_user(struct netc_switch *priv)
+{
+	netc_remove_all_cbdrs(priv);
+}
+
+/* Write the default values of the DOSL2CR and DOSL3CR switch registers */
+static void netc_switch_dos_default_config(struct netc_switch *priv)
+{
+	const u32 l2_cfg = DOSL2CR_SAMEADDR | DOSL2CR_MSAMCC;
+	const u32 l3_cfg = DOSL3CR_SAMEADDR | DOSL3CR_IPSAMCC;
+	struct netc_switch_regs *regs = &priv->regs;
+
+	netc_base_wr(regs, NETC_DOSL2CR, l2_cfg);
+	netc_base_wr(regs, NETC_DOSL3CR, l3_cfg);
+}
+
+/* Set the default MLO and MFO fields of VFHTDECR2, leaving the other bits of
+ * the register untouched.
+ */
+static void netc_switch_vfht_default_config(struct netc_switch *priv)
+{
+	struct netc_switch_regs *regs = &priv->regs;
+	u32 decr2 = netc_base_rd(regs, NETC_VFHTDECR2);
+
+	/* VFHTDECR2[MLO] takes effect when no match is found in the VLAN
+	 * Filter table; it is set to "Software MAC learning secure" here.
+	 * Note that a non-zero BPCR[MLO] overrides VFHTDECR2[MLO].
+	 */
+	decr2 = u32_replace_bits(decr2, MFO_NO_MATCH_DISCARD, VFHTDECR2_MFO);
+	decr2 = u32_replace_bits(decr2, MLO_SW_SEC, VFHTDECR2_MLO);
+
+	netc_base_wr(regs, NETC_VFHTDECR2, decr2);
+}
+
+/* Program the maximum frame size of the port through netc_mac_port_wr().
+ * @max_frame_size is masked with PM_MAXFRAM before being written.
+ */
+static void netc_port_set_max_frame_size(struct netc_port *np,
+					 u32 max_frame_size)
+{
+	u32 maxfrm = max_frame_size & PM_MAXFRAM;
+
+	netc_mac_port_wr(np, NETC_PM_MAXFRM(0), maxfrm);
+}
+
+/* Apply the switch-wide settings that are fixed by the driver: the DOS
+ * registers first, then the VFHTDECR2 defaults.
+ */
+static void netc_switch_fixed_config(struct netc_switch *priv)
+{
+	netc_switch_dos_default_config(priv);
+
+	netc_switch_vfht_default_config(priv);
+}
+
+/* Write the PTCTMSDUR register of traffic class @tc: the MAXSDU field is set
+ * to @max_sdu and the SDU type to SDU_TYPE_MPDU.
+ */
+static void netc_port_set_tc_max_sdu(struct netc_port *np,
+				     int tc, u32 max_sdu)
+{
+	u32 val;
+
+	val = FIELD_PREP(PTCTMSDUR_SDU_TYPE, SDU_TYPE_MPDU);
+	val |= FIELD_PREP(PTCTMSDUR_MAXSDU, max_sdu);
+
+	netc_port_wr(np, NETC_PTCTMSDUR(tc), val);
+}
+
+/* Set the maximum SDU of all NETC_TC_NUM traffic classes of the port to
+ * NETC_MAX_FRAME_LEN.
+ */
+static void netc_port_set_all_tc_msdu(struct netc_port *np)
+{
+	int tc;
+
+	for (tc = 0; tc < NETC_TC_NUM; tc++)
+		netc_port_set_tc_max_sdu(np, tc, NETC_MAX_FRAME_LEN);
+}
+
+/* Update the MAC learning option field (BPCR[MLO]) of the port */
+static void netc_port_set_mlo(struct netc_port *np, enum netc_mlo mlo)
+{
+	u32 val = FIELD_PREP(BPCR_MLO, mlo);
+
+	netc_port_rmw(np, NETC_BPCR, BPCR_MLO, val);
+}
+
+/* Apply the per-port settings that are fixed by the driver */
+static void netc_port_fixed_config(struct netc_port *np)
+{
+	const u32 qos_bits = PQOSMR_VS | PQOSMR_VE;
+	const u32 dos_bits = PCR_L2DOSE | PCR_L3DOSE;
+
+	/* Default IPV and DR setting */
+	netc_port_rmw(np, NETC_PQOSMR, qos_bits, qos_bits);
+
+	/* Enable L2 and L3 DOS */
+	netc_port_rmw(np, NETC_PCR, dos_bits, dos_bits);
+}
+
+/* Bring a port to its default state: fixed settings, VLAN-unaware mode,
+ * role-specific bridge port configuration, maximum frame size and per-TC
+ * maximum SDU.
+ */
+static void netc_port_default_config(struct netc_port *np)
+{
+	netc_port_fixed_config(np);
+
+	/* Default VLAN unaware */
+	netc_port_rmw(np, NETC_BPDVR, BPDVR_RXVAM, BPDVR_RXVAM);
+
+	if (!dsa_port_is_cpu(np->dp)) {
+		netc_port_set_mlo(np, MLO_DISABLE);
+	} else {
+		/* For CPU port, source port pruning is disabled */
+		netc_port_rmw(np, NETC_BPCR, BPCR_SRCPRND, BPCR_SRCPRND);
+	}
+
+	netc_port_set_max_frame_size(np, NETC_MAX_FRAME_LEN);
+	netc_port_set_all_tc_msdu(np);
+}
+
+/* DSA .setup() callback: program the switch ID, create the per-port state,
+ * initialize the NTMP user, then apply the switch-wide and per-port default
+ * configuration.
+ *
+ * Return: 0 on success, or the error of the first initialization step that
+ * failed.
+ */
+static int netc_setup(struct dsa_switch *ds)
+{
+	struct netc_switch *priv = ds->priv;
+	struct dsa_port *dp;
+	int err;
+
+	/* Each step only runs if all the previous ones succeeded */
+	err = netc_init_switch_id(priv);
+	if (!err)
+		err = netc_init_all_ports(priv);
+	if (!err)
+		err = netc_init_ntmp_user(priv);
+	if (err)
+		return err;
+
+	netc_switch_fixed_config(priv);
+
+	/* default setting for ports */
+	dsa_switch_for_each_available_port(dp, ds) {
+		struct netc_port *np = priv->ports[dp->index];
+
+		netc_port_default_config(np);
+	}
+
+	return 0;
+}
+
+/* DSA .teardown() callback: frees the NTMP user set up by netc_setup() */
+static void netc_teardown(struct dsa_switch *ds)
+{
+	netc_free_ntmp_user(ds->priv);
+}
+
+/* Tell whether a port node, already known to have a phy-handle property,
+ * is an EMDIO consumer. That is the case when the node has no "mdio" child
+ * node.
+ */
+static bool netc_port_is_emdio_consumer(struct device_node *node)
+{
+	struct device_node *mdio_node = of_get_child_by_name(node, "mdio");
+	bool has_own_mdio = mdio_node != NULL;
+
+	if (has_own_mdio)
+		of_node_put(mdio_node);
+
+	return !has_own_mdio;
+}
+
+/* phylink_of_phy_connect() is currently called by dsa_user_create(). If the
+ * switch relies on an external MDIO controller (like the EMDIO function) to
+ * manage its external PHYs, the MDIO bus may not have been created yet when
+ * phylink_of_phy_connect() is called; it then returns an error and the
+ * switch driver fails to probe. To avoid that, the probe is deferred until
+ * the PHY of every EMDIO consumer port can be found.
+ * This workaround can be removed when DSA phylink_of_phy_connect()
+ * calls are moved from probe() to ndo_open().
+ *
+ * Return: 0 when all PHYs are available, -EPROBE_DEFER when one is missing,
+ * -EINVAL when the "ethernet-ports" node does not exist.
+ */
+static int netc_switch_check_emdio_is_ready(struct device *dev)
+{
+	struct device_node *ports;
+	int ret = 0;
+
+	ports = of_get_child_by_name(dev->of_node, "ethernet-ports");
+	if (!ports) {
+		dev_err(dev, "Cannot find the ethernet-ports node\n");
+		return -EINVAL;
+	}
+
+	for_each_available_child_of_node_scoped(ports, child) {
+		struct device_node *phy_node;
+		struct phy_device *phydev;
+
+		/* A port without phy-handle property is not connected to a
+		 * PHY, hence it is not an EMDIO consumer.
+		 */
+		phy_node = of_parse_phandle(child, "phy-handle", 0);
+		if (!phy_node)
+			continue;
+
+		/* From the hardware perspective, the MDIO bus defined under
+		 * one port cannot be shared with the other switch ports: a
+		 * port only reaches its own external PHY through its own
+		 * port MDIO bus.
+		 */
+		if (!netc_port_is_emdio_consumer(child)) {
+			of_node_put(phy_node);
+			continue;
+		}
+
+		phydev = of_phy_find_device(phy_node);
+		of_node_put(phy_node);
+		if (!phydev) {
+			ret = -EPROBE_DEFER;
+			break;
+		}
+
+		/* Only the presence matters, drop the reference right away */
+		put_device(&phydev->mdio.dev);
+	}
+
+	of_node_put(ports);
+
+	return ret;
+}
+
+/* Bring up the PCI function of the switch: reset it, enable it, claim and
+ * map the register BAR, enable bus mastering, and allocate the driver
+ * private data, which is stored as the PCI driver data.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int netc_switch_pci_init(struct pci_dev *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct netc_switch *priv;
+	void __iomem *bar;
+	int ret;
+
+	pcie_flr(pdev);
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to enable device\n");
+
+	ret = pcim_request_all_regions(pdev, KBUILD_MODNAME);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to request regions\n");
+
+	/* DMA is needed by the command BD rings and the NTMP tables. The
+	 * return value is not checked, because setting the mask never fails
+	 * with DMA_BIT_MASK(64), see dma-api-howto.rst.
+	 */
+	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+
+	if (pci_resource_len(pdev, NETC_REGS_BAR) < NETC_REGS_SIZE)
+		return dev_err_probe(dev, -EINVAL,
+				     "Invalid register space size\n");
+
+	bar = pcim_iomap(pdev, NETC_REGS_BAR, 0);
+	if (!bar)
+		return dev_err_probe(dev, -ENXIO, "pcim_iomap() failed\n");
+
+	pci_set_master(pdev);
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->dev = dev;
+	priv->pdev = pdev;
+
+	/* All register areas live in the same BAR */
+	priv->regs.base = bar;
+	priv->regs.port = bar + NETC_REGS_PORT_BASE;
+	priv->regs.global = bar + NETC_REGS_GLOBAL_BASE;
+
+	pci_set_drvdata(pdev, priv);
+
+	return 0;
+}
+
+/* Read the IP revision from the IPBRR0 global register into priv->revision */
+static void netc_switch_get_ip_revision(struct netc_switch *priv)
+{
+	u32 ipbrr0;
+
+	ipbrr0 = netc_glb_rd(&priv->regs, NETC_IPBRR0);
+	priv->revision = FIELD_GET(IPBRR0_IP_REV, ipbrr0);
+}
+
+/* DSA callbacks implemented by the NETC switch driver */
+static const struct dsa_switch_ops netc_switch_ops = {
+	.setup			= netc_setup,
+	.teardown		= netc_teardown,
+	.get_tag_protocol	= netc_get_tag_protocol,
+};
+
+/* PCI probe: check that the PHYs of the EMDIO consumer ports are available,
+ * initialize the PCI function, select the platform data from the IP
+ * revision, then allocate and register the DSA switch.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int netc_switch_probe(struct pci_dev *pdev,
+			     const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct netc_switch *priv;
+	struct dsa_switch *ds;
+	int ret;
+
+	if (!dev_of_node(dev))
+		return dev_err_probe(dev, -ENODEV,
+				     "No DT bindings, skipping\n");
+
+	ret = netc_switch_check_emdio_is_ready(dev);
+	if (ret)
+		return ret;
+
+	ret = netc_switch_pci_init(pdev);
+	if (ret)
+		return ret;
+
+	/* The private data was allocated by netc_switch_pci_init() */
+	priv = pci_get_drvdata(pdev);
+	netc_switch_get_ip_revision(priv);
+
+	ret = netc_switch_platform_probe(priv);
+	if (ret)
+		return ret;
+
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
+	if (!ds)
+		return -ENOMEM;
+
+	ds->priv = priv;
+	ds->dev = dev;
+	ds->ops = &netc_switch_ops;
+	ds->num_ports = priv->info->num_ports;
+	ds->num_tx_queues = NETC_TC_NUM;
+	priv->ds = ds;
+
+	ret = dsa_register_switch(ds);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to register DSA switch\n");
+
+	return 0;
+}
+
+/* PCI remove: unregister the DSA switch. Nothing is done when the driver
+ * data is NULL, which is what netc_switch_shutdown() leaves behind.
+ */
+static void netc_switch_remove(struct pci_dev *pdev)
+{
+	struct netc_switch *priv = pci_get_drvdata(pdev);
+
+	if (priv)
+		dsa_unregister_switch(priv->ds);
+}
+
+/* PCI shutdown: shut the DSA switch down and clear the driver data, so that
+ * a later netc_switch_remove() or netc_switch_shutdown() call is a no-op.
+ */
+static void netc_switch_shutdown(struct pci_dev *pdev)
+{
+	struct netc_switch *priv = pci_get_drvdata(pdev);
+
+	if (priv) {
+		dsa_switch_shutdown(priv->ds);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+/* PCI IDs of the switch function bound by this driver */
+static const struct pci_device_id netc_switch_ids[] = {
+	{
+		PCI_DEVICE(NETC_SWITCH_VENDOR_ID, NETC_SWITCH_DEVICE_ID),
+	},
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(pci, netc_switch_ids);
+
+/* PCI driver glue and module information */
+static struct pci_driver netc_switch_driver = {
+	.name		= KBUILD_MODNAME,
+	.probe		= netc_switch_probe,
+	.remove		= netc_switch_remove,
+	.shutdown	= netc_switch_shutdown,
+	.id_table	= netc_switch_ids,
+};
+module_pci_driver(netc_switch_driver);
+
+MODULE_DESCRIPTION("NXP NETC Switch driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/dsa/netc/netc_platform.c b/drivers/net/dsa/netc/netc_platform.c
new file mode 100644
index 000000000000..abd599ea9c8d
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_platform.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * NXP NETC switch driver
+ * Copyright 2025-2026 NXP
+ */
+
+#include "netc_switch.h"
+
+struct netc_switch_platform {
+ u16 revision; /* IP revision, compared against netc_switch::revision */
+ const struct netc_switch_info *info; /* platform data used on a match */
+};
+
+static const struct netc_switch_info imx94_info = {
+ .num_ports = 4, /* 3 user ports and 1 CPU port */
+};
+
+static const struct netc_switch_platform netc_platforms[] = {
+ { .revision = NETC_SWITCH_REV_4_3, .info = &imx94_info, },
+ { } /* empty entry: revision 0 with a NULL info pointer */
+};
+
+/* Look up the platform data matching the IP revision stored in
+ * priv->revision.
+ *
+ * Return: the info pointer of the first netc_platforms[] entry with the same
+ * revision (NULL for the empty entry), or NULL when no entry matches.
+ */
+static const struct netc_switch_info *
+netc_switch_get_info(struct netc_switch *priv)
+{
+	const struct netc_switch_platform *plat = netc_platforms;
+	const struct netc_switch_platform *end;
+
+	/* Matching based on IP revision */
+	end = plat + ARRAY_SIZE(netc_platforms);
+	for (; plat < end; plat++) {
+		if (plat->revision == priv->revision)
+			return plat->info;
+	}
+
+	return NULL;
+}
+
+/* Select the platform data for the switch from its IP revision and store it
+ * in priv->info. priv->revision must have been read beforehand.
+ *
+ * Return: 0 on success, -EINVAL when no platform data is found, in which
+ * case priv->info is left untouched.
+ */
+int netc_switch_platform_probe(struct netc_switch *priv)
+{
+	const struct netc_switch_info *info;
+
+	info = netc_switch_get_info(priv);
+	if (info) {
+		priv->info = info;
+		return 0;
+	}
+
+	dev_err(priv->dev, "Cannot find switch platform info\n");
+
+	return -EINVAL;
+}
diff --git a/drivers/net/dsa/netc/netc_switch.h b/drivers/net/dsa/netc/netc_switch.h
new file mode 100644
index 000000000000..a6d36dcebc6d
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_switch.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2025-2026 NXP
+ */
+
+#ifndef _NETC_SWITCH_H
+#define _NETC_SWITCH_H
+
+#include <linux/dsa/tag_netc.h>
+#include <linux/fsl/netc_global.h>
+#include <linux/fsl/ntmp.h>
+#include <linux/of_device.h>
+#include <linux/of_net.h>
+#include <linux/pci.h>
+
+#include "netc_switch_hw.h"
+
+#define NETC_REGS_BAR 0 /* PCI BAR mapped for the switch registers */
+#define NETC_REGS_SIZE 0x80000 /* minimum length required of NETC_REGS_BAR */
+#define NETC_MSIX_TBL_BAR 2
+#define NETC_REGS_PORT_BASE 0x4000 /* offset of the port registers in the BAR */
+/* register block size per port */
+#define NETC_REGS_PORT_SIZE 0x4000
+#define PORT_IOBASE(p) (NETC_REGS_PORT_SIZE * (p)) /* offset of port p in the port registers */
+#define NETC_REGS_GLOBAL_BASE 0x70000 /* offset of the global registers in the BAR */
+
+#define NETC_SWITCH_REV_4_3 0x0403
+
+#define NETC_TC_NUM 8 /* traffic classes per port */
+#define NETC_CBDR_NUM 2 /* command BD rings */
+
+#define NETC_MAX_FRAME_LEN 9600
+
+struct netc_switch;
+
+struct netc_switch_info {
+ u32 num_ports; /* number of switch ports; used as the DSA port count */
+};
+
+struct netc_port_caps {
+ u32 half_duplex:1; /* indicates whether the port supports half-duplex */
+ u32 pmac:1; /* indicates whether the port has preemption MAC */
+ u32 pseudo_link:1; /* indicates whether the port is a pseudo MAC port */
+};
+
+struct netc_port {
+ void __iomem *iobase; /* base address of the port's register block */
+ struct netc_switch *switch_priv; /* switch the port belongs to */
+ struct netc_port_caps caps; /* filled by netc_port_get_capability() */
+ struct dsa_port *dp; /* only set for available DSA ports */
+ struct mii_bus *emdio; /* external MDIO bus, set once registered */
+};
+
+struct netc_switch_regs {
+ void __iomem *base; /* start of the mapped register BAR */
+ void __iomem *port; /* base + NETC_REGS_PORT_BASE */
+ void __iomem *global; /* base + NETC_REGS_GLOBAL_BASE */
+};
+
+struct netc_switch {
+ struct pci_dev *pdev;
+ struct device *dev; /* &pdev->dev */
+ struct dsa_switch *ds;
+ u16 revision; /* IP revision read from IPBRR0[IP_REV] */
+
+ const struct netc_switch_info *info; /* selected from the IP revision */
+ struct netc_switch_regs regs;
+ struct netc_port **ports; /* info->num_ports entries, allocated at setup */
+
+ struct ntmp_user ntmp; /* table versions and command BD rings */
+};
+
+/* Register accessors. In all of them @o is the register offset and @v the
+ * value to write. Every macro argument is parenthesized in the expansion so
+ * that any expression can safely be passed.
+ */
+
+/* Write/Read Switch base registers, @r is a struct netc_switch_regs pointer */
+#define netc_base_rd(r, o)		netc_read((r)->base + (o))
+#define netc_base_wr(r, o, v)		netc_write((r)->base + (o), (v))
+
+/* Write/Read registers of Switch Port (including pseudo MAC port), @p is a
+ * struct netc_port pointer
+ */
+#define netc_port_rd(p, o)		netc_read((p)->iobase + (o))
+#define netc_port_wr(p, o, v)		netc_write((p)->iobase + (o), (v))
+
+/* Write/Read Switch global registers, @r is a struct netc_switch_regs pointer */
+#define netc_glb_rd(r, o)		netc_read((r)->global + (o))
+#define netc_glb_wr(r, o, v)		netc_write((r)->global + (o), (v))
+
+/* A pseudo port is a port whose pseudo_link capability is set */
+static inline bool is_netc_pseudo_port(struct netc_port *np)
+{
+	const struct netc_port_caps *caps = &np->caps;
+
+	return caps->pseudo_link;
+}
+
+int netc_switch_platform_probe(struct netc_switch *priv);
+
+#endif
diff --git a/drivers/net/dsa/netc/netc_switch_hw.h b/drivers/net/dsa/netc/netc_switch_hw.h
new file mode 100644
index 000000000000..0419f7f9207e
--- /dev/null
+++ b/drivers/net/dsa/netc/netc_switch_hw.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2025-2026 NXP
+ */
+
+#ifndef _NETC_SWITCH_HW_H
+#define _NETC_SWITCH_HW_H
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+
+#define NETC_SWITCH_VENDOR_ID 0x1131
+#define NETC_SWITCH_DEVICE_ID 0xeef2
+
+/* Definition of Switch base registers: offsets used with netc_base_rd() and
+ * netc_base_wr(). For the NETC_CBDR* registers, (a) is the index of the
+ * command BD ring.
+ */
+#define NETC_CBDRMR(a) (0x0800 + (a) * 0x30)
+#define NETC_CBDRBAR0(a) (0x0810 + (a) * 0x30)
+#define NETC_CBDRBAR1(a) (0x0814 + (a) * 0x30)
+#define NETC_CBDRPIR(a) (0x0818 + (a) * 0x30)
+#define NETC_CBDRCIR(a) (0x081c + (a) * 0x30)
+#define NETC_CBDRLENR(a) (0x0820 + (a) * 0x30)
+
+#define NETC_SWCR 0x1018
+#define SWCR_SWID GENMASK(2, 0) /* switch ID, 0 is reserved for the VEPA switch */
+
+#define NETC_DOSL2CR 0x1220
+#define DOSL2CR_SAMEADDR BIT(0)
+#define DOSL2CR_MSAMCC BIT(1)
+
+#define NETC_DOSL3CR 0x1224
+#define DOSL3CR_SAMEADDR BIT(0)
+#define DOSL3CR_IPSAMCC BIT(1)
+
+#define NETC_VFHTDECR1 0x2014
+#define NETC_VFHTDECR2 0x2018
+#define VFHTDECR2_ET_PORT(a) BIT((a))
+#define VFHTDECR2_MLO GENMASK(26, 24) /* takes a value of enum netc_mlo */
+#define VFHTDECR2_MFO GENMASK(28, 27) /* takes a value of enum netc_mfo */
+
+/* Definition of Switch port registers: offsets from the port's iobase, used
+ * with netc_port_rd() and netc_port_wr()
+ */
+#define NETC_PCAPR 0x0000
+#define PCAPR_LINK_TYPE BIT(4) /* set when the port uses a pseudo link */
+#define PCAPR_NUM_TC GENMASK(15, 12)
+#define PCAPR_NUM_Q GENMASK(19, 16)
+#define PCAPR_NUM_CG GENMASK(27, 24)
+#define PCAPR_TGS BIT(28)
+#define PCAPR_CBS BIT(29)
+
+#define NETC_PMCAPR 0x0004
+#define PMCAPR_HD BIT(8) /* set when the port supports half-duplex */
+#define PMCAPR_FP GENMASK(10, 9)
+#define FP_SUPPORT 2 /* PMCAPR[FP] value of a port with a preemption MAC */
+
+#define NETC_PCR 0x0010
+#define PCR_HDR_FMT BIT(0)
+#define PCR_NS_TAG_PORT BIT(3)
+#define PCR_L2DOSE BIT(4) /* L2 DOS enable */
+#define PCR_L3DOSE BIT(5) /* L3 DOS enable */
+#define PCR_TIMER_CS BIT(8)
+#define PCR_PSPEED GENMASK(29, 16)
+#define PSPEED_SET_VAL(s) FIELD_PREP(PCR_PSPEED, ((s) / 10 - 1)) /* unit of (s) presumably Mbps - TODO confirm */
+
+#define NETC_PQOSMR 0x0054
+#define PQOSMR_VS BIT(0)
+#define PQOSMR_VE BIT(1)
+#define PQOSMR_DDR GENMASK(3, 2)
+#define PQOSMR_DIPV GENMASK(6, 4)
+#define PQOSMR_VQMP GENMASK(19, 16)
+#define PQOSMR_QVMP GENMASK(23, 20)
+
+#define NETC_PTCTMSDUR(a) (0x208 + (a) * 0x20) /* (a) is the traffic class */
+#define PTCTMSDUR_MAXSDU GENMASK(15, 0)
+#define PTCTMSDUR_SDU_TYPE GENMASK(17, 16)
+#define SDU_TYPE_PPDU 0
+#define SDU_TYPE_MPDU 1
+#define SDU_TYPE_MSDU 2
+
+#define NETC_BPCR 0x500
+#define BPCR_DYN_LIMIT GENMASK(15, 0)
+#define BPCR_MLO GENMASK(22, 20) /* takes a value of enum netc_mlo */
+#define BPCR_UUCASTE BIT(24)
+#define BPCR_UMCASTE BIT(25)
+#define BPCR_MCASTE BIT(26)
+#define BPCR_BCASTE BIT(27)
+#define BPCR_STAMVD BIT(28)
+#define BPCR_SRCPRND BIT(29) /* source port pruning disable */
+
+/* MAC learning options, see BPCR[MLO], VFHTDECR2[MLO] and
+ * VLAN Filter Table CFGE_DATA[MLO]. A non-zero BPCR[MLO] overrides
+ * VFHTDECR2[MLO].
+ */
+enum netc_mlo {
+ MLO_NOT_OVERRIDE = 0,
+ MLO_DISABLE,
+ MLO_HW,
+ MLO_SW_SEC, /* software MAC learning secure */
+ MLO_SW_UNSEC,
+ MLO_DISABLE_SMAC,
+};
+
+/* MAC forwarding options, see VFHTDECR2[MFO] and VLAN
+ * Filter Table CFGE_DATA[MFO]. The enumeration starts at 1; no name is
+ * defined for the value 0.
+ */
+enum netc_mfo {
+ MFO_NO_FDB_LOOKUP = 1,
+ MFO_NO_MATCH_FLOOD,
+ MFO_NO_MATCH_DISCARD,
+};
+
+#define NETC_BPDVR 0x510
+#define BPDVR_VID GENMASK(11, 0)
+#define BPDVR_DEI BIT(12)
+#define BPDVR_PCP GENMASK(15, 13)
+#define BPDVR_TPID BIT(16)
+#define BPDVR_RXTAGA GENMASK(23, 20)
+#define BPDVR_RXVAM BIT(24) /* set for the default VLAN-unaware mode */
+#define BPDVR_TXTAGA GENMASK(26, 25)
+
+/* Definition of Switch ethernet MAC port registers. These are written through
+ * netc_mac_port_wr(), which skips pseudo ports and repeats the write at
+ * NETC_PMAC_OFFSET when the port has a preemption MAC.
+ */
+#define NETC_PMAC_OFFSET 0x400
+#define NETC_PM_CMD_CFG(a) (0x1008 + (a) * 0x400)
+#define PM_CMD_CFG_TX_EN BIT(0)
+#define PM_CMD_CFG_RX_EN BIT(1)
+
+#define NETC_PM_MAXFRM(a) (0x1014 + (a) * 0x400)
+#define PM_MAXFRAM GENMASK(15, 0)
+
+#define NETC_PEMDIOCR 0x1c00
+#define NETC_EMDIO_BASE NETC_PEMDIOCR /* mdio_base of the port's external MDIO bus */
+
+/* Definition of global registers (read only): offsets used with netc_glb_rd() */
+#define NETC_IPBRR0 0x0bf8
+#define IPBRR0_IP_REV GENMASK(15, 0) /* IP revision, e.g. NETC_SWITCH_REV_4_3 */
+
+#endif
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 09/15] net: dsa: add NETC switch tag support
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The NXP NETC switch tag is a proprietary header added to frames after the
source MAC address. The switch tag has 3 types, and each type has 1 to 4
subtypes. The details are as follows.
Forward NXP switch tag (Type=0): Represents forwarded frames.
- SubType = 0 - Normal frame processing.
To_Port NXP switch tag (Type=1): Represents frames that are to be sent
to a specific switch port.
- SubType = 0. No request to perform timestamping.
- SubType = 1. Request to perform one-step timestamping.
- SubType = 2. Request to perform two-step timestamping.
- SubType = 3. Request to perform both one-step timestamping and
two-step timestamping.
To_Host NXP switch tag (Type=2): Represents frames redirected or copied
to the switch management port.
- SubType = 0. Received frames redirected or copied to the switch
management port.
- SubType = 1. Received frames redirected or copied to the switch
management port with captured timestamp at the switch port where
the frame was received.
- SubType = 2. Transmit timestamp response (two-step timestamping).
In addition, the length of the switch tag depends on its type and subtype:
the minimum length is 6 bytes and the maximum length is 14 bytes. Currently,
the Forward tag, SubType 0 of the To_Port tag and SubType 0 of the To_Host
tag are supported. More tags will be supported in the future.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
include/linux/dsa/tag_netc.h | 14 +++
include/net/dsa.h | 2 +
include/uapi/linux/if_ether.h | 1 +
net/dsa/Kconfig | 10 ++
net/dsa/Makefile | 1 +
net/dsa/tag_netc.c | 193 ++++++++++++++++++++++++++++++++++
6 files changed, 221 insertions(+)
create mode 100644 include/linux/dsa/tag_netc.h
create mode 100644 net/dsa/tag_netc.c
diff --git a/include/linux/dsa/tag_netc.h b/include/linux/dsa/tag_netc.h
new file mode 100644
index 000000000000..fe964722e5b0
--- /dev/null
+++ b/include/linux/dsa/tag_netc.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2025-2026 NXP
+ */
+
+#ifndef __NET_DSA_TAG_NETC_H
+#define __NET_DSA_TAG_NETC_H
+
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#define NETC_TAG_MAX_LEN 14 /* maximum length of a NETC switch tag, in bytes */
+
+#endif
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4cc67469cf2e..8c16ef23cc10 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -58,6 +58,7 @@ struct tc_action;
#define DSA_TAG_PROTO_YT921X_VALUE 30
#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31
#define DSA_TAG_PROTO_MXL862_VALUE 32
+#define DSA_TAG_PROTO_NETC_VALUE 33
enum dsa_tag_protocol {
DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE,
@@ -93,6 +94,7 @@ enum dsa_tag_protocol {
DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE,
DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE,
DSA_TAG_PROTO_MXL862 = DSA_TAG_PROTO_MXL862_VALUE,
+ DSA_TAG_PROTO_NETC = DSA_TAG_PROTO_NETC_VALUE,
};
struct dsa_switch;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index df9d44a11540..fb5efc8e06cc 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -123,6 +123,7 @@
#define ETH_P_DSA_A5PSW 0xE001 /* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */
#define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_NXP_NETC 0xFD3A /* NXP NETC DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value
* then the frame is Ethernet II. Else it is 802.3 */
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 5ed8c704636d..d5e725b90d78 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -125,6 +125,16 @@ config NET_DSA_TAG_KSZ
Say Y if you want to enable support for tagging frames for the
Microchip 8795/937x/9477/9893 families of switches.
+config NET_DSA_TAG_NETC
+ tristate "Tag driver for NXP NETC switches"
+ help
+ Say Y or M if you want to enable support for the NXP Switch Tag (NST),
+ as implemented by NXP NETC switches having version 4.3 or later. The
+ switch tag is a proprietary header added to frames after the source
+ MAC address. It has 3 types and each type has different subtypes, so
+ its length depends on the type and subtype of the tag; the maximum
+ length is 14 bytes.
+
config NET_DSA_TAG_OCELOT
tristate "Tag driver for Ocelot family of switches, using NPI port"
select PACKING
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index bf7247759a64..b8c2667cd14a 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
obj-$(CONFIG_NET_DSA_TAG_MXL_862XX) += tag_mxl862xx.o
obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o
+obj-$(CONFIG_NET_DSA_TAG_NETC) += tag_netc.o
obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
diff --git a/net/dsa/tag_netc.c b/net/dsa/tag_netc.c
new file mode 100644
index 000000000000..b29aa54b1988
--- /dev/null
+++ b/net/dsa/tag_netc.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2025-2026 NXP
+ */
+
+#include <linux/dsa/tag_netc.h>
+
+#include "tag.h"
+
+#define NETC_NAME "nxp_netc"
+
+/* Forward NXP switch tag */
+#define NETC_TAG_FORWARD 0
+
+/* To_Port NXP switch tag */
+#define NETC_TAG_TO_PORT 1
+/* SubType0: No request to perform timestamping */
+#define NETC_TAG_TP_SUBTYPE0 0
+
+/* To_Host NXP switch tag */
+#define NETC_TAG_TO_HOST 2
+/* SubType0: frames redirected or copied to CPU port */
+#define NETC_TAG_TH_SUBTYPE0 0
+/* SubType1: frames redirected or copied to CPU port with timestamp */
+#define NETC_TAG_TH_SUBTYPE1 1
+/* SubType2: Transmit timestamp response (two-step timestamping) */
+#define NETC_TAG_TH_SUBTYPE2 2
+
+/* NETC switch tag lengths */
+#define NETC_TAG_FORWARD_LEN 6
+#define NETC_TAG_TP_SUBTYPE0_LEN 6
+#define NETC_TAG_TH_SUBTYPE0_LEN 6
+#define NETC_TAG_TH_SUBTYPE1_LEN 14
+#define NETC_TAG_TH_SUBTYPE2_LEN 14
+#define NETC_TAG_CMN_LEN 5
+
+#define NETC_TAG_SUBTYPE GENMASK(3, 0)
+#define NETC_TAG_TYPE GENMASK(7, 4)
+#define NETC_TAG_QV BIT(0)
+#define NETC_TAG_IPV GENMASK(4, 2)
+#define NETC_TAG_SWITCH GENMASK(2, 0)
+#define NETC_TAG_PORT GENMASK(7, 3)
+
+/* Leading NETC_TAG_CMN_LEN bytes of the switch tag as built and parsed
+ * by this tagger; any remaining bytes of the tag follow this header.
+ */
+struct netc_tag_cmn {
+ __be16 tpid; /* set to ETH_P_NXP_NETC on xmit, checked on rcv */
+ u8 type; /* NETC_TAG_TYPE (bits 7:4) | NETC_TAG_SUBTYPE (bits 3:0) */
+ u8 qos; /* NETC_TAG_QV (bit 0) | NETC_TAG_IPV (bits 4:2) */
+ u8 switch_port; /* NETC_TAG_SWITCH (bits 2:0) | NETC_TAG_PORT (bits 7:3) */
+} __packed;
+
+/* Populate the common tag header at @tag: the NETC TPID, the packed
+ * type/subtype byte, the QoS byte (IPV field from @ipv, with the QV bit
+ * always set) and the packed switch ID/port byte.
+ */
+static void netc_fill_common_tag(struct netc_tag_cmn *tag, u8 type,
+				 u8 subtype, u8 sw_id, u8 port, u8 ipv)
+{
+	u8 type_byte = FIELD_PREP(NETC_TAG_TYPE, type);
+	u8 qos_byte = FIELD_PREP(NETC_TAG_IPV, ipv);
+	u8 sw_port_byte = FIELD_PREP(NETC_TAG_PORT, port);
+
+	type_byte |= FIELD_PREP(NETC_TAG_SUBTYPE, subtype);
+	qos_byte |= NETC_TAG_QV;
+	sw_port_byte |= FIELD_PREP(NETC_TAG_SWITCH, sw_id);
+
+	tag->tpid = htons(ETH_P_NXP_NETC);
+	tag->type = type_byte;
+	tag->qos = qos_byte;
+	tag->switch_port = sw_port_byte;
+}
+
+/* Make room for a To_Port tag of @tag_len bytes in @skb and fill in its
+ * common header. The IPV is the traffic class mapped to the skb's TX
+ * queue, or 0 when that lookup returns a negative value. The bytes after
+ * the common header are zeroed.
+ *
+ * Return: pointer to the start of the inserted tag
+ */
+static void *netc_fill_common_tp_tag(struct sk_buff *skb,
+ struct net_device *ndev,
+ u8 subtype, int tag_len)
+{
+ struct dsa_port *dp = dsa_user_to_port(ndev);
+ u16 queue = skb_get_queue_mapping(skb);
+ s8 ipv = netdev_txq_to_tc(ndev, queue);
+ void *tag;
+
+ if (unlikely(ipv < 0))
+ ipv = 0;
+
+ skb_push(skb, tag_len);
+ dsa_alloc_etype_header(skb, tag_len);
+
+ tag = dsa_etype_header_pos_tx(skb);
+ memset(tag + NETC_TAG_CMN_LEN, 0, tag_len - NETC_TAG_CMN_LEN);
+ /* 'dsa,member' is a required property for the NETC switch, and its
+ * member value is used as the switch ID, so the hardware switch ID
+ * and the software switch ID are consistent. That value is
+ * non-zero, so dp->ds->index will not be 0 here.
+ */
+ netc_fill_common_tag(tag, NETC_TAG_TO_PORT, subtype,
+ dp->ds->index, dp->index, ipv);
+
+ return tag;
+}
+
+/* Insert a To_Port subtype 0 tag (no timestamping request) into @skb.
+ * The tag pointer returned by the common helper is not used here.
+ */
+static void netc_fill_tp_tag_subtype0(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ netc_fill_common_tp_tag(skb, ndev, NETC_TAG_TP_SUBTYPE0,
+ NETC_TAG_TP_SUBTYPE0_LEN);
+}
+
+/* Tagger .xmit hook: insert a To_Port subtype 0 tag into @skb and return
+ * the same skb. This is currently the only tag form supported on
+ * transmit.
+ */
+static struct sk_buff *netc_xmit(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ netc_fill_tp_tag_subtype0(skb, ndev);
+
+ return skb;
+}
+
+/* Return the length in bytes of a received switch tag, derived from its
+ * type/subtype byte @rx_type. Only To_Host tags are looked at by subtype;
+ * every other tag type is reported with the Forward tag length.
+ */
+static int netc_get_rx_tag_len(int rx_type)
+{
+	if (FIELD_GET(NETC_TAG_TYPE, rx_type) != NETC_TAG_TO_HOST)
+		return NETC_TAG_FORWARD_LEN;
+
+	switch (rx_type & NETC_TAG_SUBTYPE) {
+	case NETC_TAG_TH_SUBTYPE1:
+		return NETC_TAG_TH_SUBTYPE1_LEN;
+	case NETC_TAG_TH_SUBTYPE2:
+		return NETC_TAG_TH_SUBTYPE2_LEN;
+	default:
+		return NETC_TAG_TH_SUBTYPE0_LEN;
+	}
+}
+
+/* Tagger .rcv hook: check the switch tag of a frame received on the
+ * conduit @ndev, point skb->dev at the user port named in the tag and
+ * strip the tag from the frame.
+ *
+ * Return: @skb on success, or NULL when the frame is too short, the TPID
+ * is not ETH_P_NXP_NETC, the tag reports switch ID 0, or no user device
+ * is found for the switch ID/port pair.
+ */
+static struct sk_buff *netc_rcv(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ struct netc_tag_cmn *tag_cmn;
+ int tag_len, sw_id, port;
+
+ /* The tag length is not known yet, so ask for the maximum */
+ if (unlikely(!pskb_may_pull(skb, NETC_TAG_MAX_LEN)))
+ return NULL;
+
+ tag_cmn = dsa_etype_header_pos_rx(skb);
+ tag_len = netc_get_rx_tag_len(tag_cmn->type);
+
+ if (ntohs(tag_cmn->tpid) != ETH_P_NXP_NETC) {
+ dev_warn_ratelimited(&ndev->dev, "Unknown TPID 0x%04x\n",
+ ntohs(tag_cmn->tpid));
+
+ return NULL;
+ }
+
+ /* Take the priority from the tag only when the QV bit is set */
+ if (tag_cmn->qos & NETC_TAG_QV)
+ skb->priority = FIELD_GET(NETC_TAG_IPV, tag_cmn->qos);
+
+ sw_id = FIELD_GET(NETC_TAG_SWITCH, tag_cmn->switch_port);
+ /* ENETC VEPA switch ID (0) is not supported yet */
+ if (!sw_id) {
+ dev_warn_ratelimited(&ndev->dev,
+ "VEPA switch ID is not supported yet\n");
+
+ return NULL;
+ }
+
+ port = FIELD_GET(NETC_TAG_PORT, tag_cmn->switch_port);
+ skb->dev = dsa_conduit_find_user(ndev, sw_id, port);
+ if (!skb->dev)
+ return NULL;
+
+ /* Only frames carrying a Forward tag get the offload forward mark */
+ if (FIELD_GET(NETC_TAG_TYPE, tag_cmn->type) == NETC_TAG_FORWARD)
+ dsa_default_offload_fwd_mark(skb);
+
+ /* Remove Switch tag from the frame */
+ skb_pull_rcsum(skb, tag_len);
+ dsa_strip_etype_header(skb, tag_len);
+
+ return skb;
+}
+
+/* Flow dissector hook: report the switch tag length as the offset to
+ * skip, and return as protocol the 16-bit word that ends that span.
+ */
+static void netc_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+			      int *offset)
+{
+	const __be16 *words = (const __be16 *)skb->data;
+	struct netc_tag_cmn *hdr;
+	int len;
+
+	/* The common tag header is read from 2 bytes before skb->data */
+	hdr = (struct netc_tag_cmn *)(skb->data - 2);
+	len = netc_get_rx_tag_len(hdr->type);
+
+	*offset = len;
+	*proto = words[len / 2 - 1];
+}
+
+/* Tagger operations for DSA_TAG_PROTO_NETC. The headroom request uses
+ * NETC_TAG_MAX_LEN, although netc_xmit() currently inserts only a tag
+ * of NETC_TAG_TP_SUBTYPE0_LEN bytes.
+ */
+static const struct dsa_device_ops netc_netdev_ops = {
+ .name = NETC_NAME,
+ .proto = DSA_TAG_PROTO_NETC,
+ .xmit = netc_xmit,
+ .rcv = netc_rcv,
+ .needed_headroom = NETC_TAG_MAX_LEN,
+ .flow_dissect = netc_flow_dissect,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for NXP NETC switch family");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NETC, NETC_NAME);
+module_dsa_tag_driver(netc_netdev_ops);
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 08/15] net: enetc: add multiple command BD rings support
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
All the tables of the NETC switch are managed through command BD rings.
Unlike ENETC, the switch has two command BD rings: if the current ring
is busy, the switch driver can switch to the other ring to manage the
table. Currently, the NTMP driver does not support multiple rings.
Therefore, update ntmp_select_and_lock_cbdr() to select an appropriate
ring to execute the command for the switch.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
drivers/net/ethernet/freescale/enetc/ntmp.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index c62c6a9d7bfa..c491046fe80f 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -146,11 +146,16 @@ static void ntmp_clean_cbdr(struct netc_cbdr *cbdr)
static void ntmp_select_and_lock_cbdr(struct ntmp_user *user,
struct netc_cbdr **cbdr)
{
- /* Currently only ENETC is supported, and it has only one command
- * BD ring.
- */
- *cbdr = &user->ring[0];
+ /* Return with the first ring whose lock can be taken right away */
+ for (int i = 0; i < user->cbdr_num; i++) {
+ *cbdr = &user->ring[i];
+ if (mutex_trylock(&(*cbdr)->ring_lock))
+ return;
+ }
+ /* If all command BD rings are locked, we need to select one of
+ * them and wait for it. The ring index is the current CPU number
+ * modulo the number of rings.
+ */
+ *cbdr = &user->ring[raw_smp_processor_id() % user->cbdr_num];
mutex_lock(&(*cbdr)->ring_lock);
}
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 07/15] net: enetc: add support for "Add" and "Delete" operations to IPFT
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The ingress port filter table (IPFT) contains a set of filters, each
capable of classifying incoming traffic using a mix of L2, L3, and L4
parsed and arbitrary field data. As a result of a filter match, several
actions can be specified, such as whether to deny or allow a frame,
overriding internal QoS attributes associated with the frame and setting
parameters for the subsequent frame processing functions, such as stream
identification, policing, ingress mirroring. Each entry corresponds to a
filter. The ingress port filter entries are added using a precedence
value. If a frame matches multiple entries, the entry with the higher
precedence is used. Currently, this patch only adds "Add" and "Delete"
operations to the ingress port filter table. These two interfaces will
be used by both ENETC driver and NETC switch driver.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
drivers/net/ethernet/freescale/enetc/ntmp.c | 76 +++++++++++++++
.../ethernet/freescale/enetc/ntmp_private.h | 36 +++++++
include/linux/fsl/ntmp.h | 93 +++++++++++++++++++
3 files changed, 205 insertions(+)
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index ba80f5e08d80..c62c6a9d7bfa 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -21,6 +21,7 @@
/* Define NTMP Table ID */
#define NTMP_MAFT_ID 1
#define NTMP_RSST_ID 3
+#define NTMP_IPFT_ID 13
#define NTMP_FDBT_ID 15
#define NTMP_VFT_ID 18
#define NTMP_BPT_ID 41
@@ -271,6 +272,8 @@ static const char *ntmp_table_name(int tbl_id)
return "MAC Address Filter Table";
case NTMP_RSST_ID:
return "RSS Table";
+ case NTMP_IPFT_ID:
+ return "Ingress Port Filter Table";
case NTMP_FDBT_ID:
return "FDB Table";
case NTMP_VFT_ID:
@@ -513,6 +516,79 @@ int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count)
}
EXPORT_SYMBOL_GPL(ntmp_rsst_query_entry);
+/**
+ * ntmp_ipft_add_entry - add an entry into the ingress port filter table
+ * @user: target ntmp_user struct
+ * @entry: the entry data. entry->keye (key element data) and entry->cfge
+ * (configuration element data) are inputs. entry->entry_id is an output:
+ * the entry ID is assigned by the hardware and returned here so that the
+ * driver can later update, delete or query the entry by that ID.
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_ipft_add_entry(struct ntmp_user *user,
+			struct ipft_entry_data *entry)
+{
+	struct netc_swcbd swcbd;
+	struct ipft_req_ua *req;
+	struct netc_cbdr *ring;
+	union netc_cbd cbd;
+	u32 len;
+	int ret;
+
+	/* The response is read back from the request buffer, so the
+	 * buffer is allocated with the size of the response layout.
+	 */
+	swcbd.size = sizeof(struct ipft_resp_query);
+	ret = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+	if (ret)
+		return ret;
+
+	/* Request data */
+	ntmp_fill_crd(&req->crd, user->tbl.ipft_ver, NTMP_QA_ENTRY_ID,
+		      NTMP_GEN_UA_CFGEU | NTMP_GEN_UA_STSEU);
+	req->ak.keye = entry->keye;
+	req->cfge = entry->cfge;
+
+	/* Request header */
+	len = NTMP_LEN(sizeof(*req), swcbd.size);
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, len, NTMP_IPFT_ID,
+			      NTMP_CMD_AQ, NTMP_AM_TERNARY_KEY);
+
+	ntmp_select_and_lock_cbdr(user, &ring);
+	ret = netc_xmit_ntmp_cmd(ring, &cbd, &swcbd);
+	if (ret) {
+		dev_err(user->dev, "Failed to add %s entry, err: %pe\n",
+			ntmp_table_name(NTMP_IPFT_ID), ERR_PTR(ret));
+	} else {
+		struct ipft_resp_query *resp = (struct ipft_resp_query *)req;
+
+		entry->entry_id = le32_to_cpu(resp->entry_id);
+	}
+
+	ntmp_unlock_cbdr(ring);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ntmp_ipft_add_entry);
+
+/**
+ * ntmp_ipft_delete_entry - delete an ingress port filter table entry
+ * @user: target ntmp_user struct
+ * @entry_id: ID of the ingress port filter table entry to delete
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_ipft_delete_entry(struct ntmp_user *user, u32 entry_id)
+{
+	return ntmp_delete_entry_by_id(user, NTMP_IPFT_ID, user->tbl.ipft_ver,
+				       entry_id, sizeof(struct ipft_req_qd),
+				       NTMP_STATUS_RESP_LEN);
+}
+EXPORT_SYMBOL_GPL(ntmp_ipft_delete_entry);
+
/**
* ntmp_fdbt_add_entry - add an entry into the FDB table
* @user: target ntmp_user struct
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index 64df49e9a3ef..0a9b87286105 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -99,6 +99,42 @@ struct rsst_req_update {
u8 groups[];
};
+/* Ingress Port Filter Table Response Data Buffer Format of Query action */
+struct ipft_resp_query {
+ __le32 status;
+ __le32 entry_id;
+ struct ipft_keye_data keye;
+ __le64 match_count; /* STSE_DATA */
+ struct ipft_cfge_data cfge;
+} __packed;
+
+/* Entry-ID form of the ingress port filter table access key.
+ * NOTE(review): 4 + 52 * 4 = 212 bytes, which looks sized to match
+ * struct ipft_keye_data, the other member of union ipft_access_key;
+ * confirm.
+ */
+struct ipft_ak_eid {
+ __le32 entry_id;
+ __le32 resv[52];
+};
+
+/* Access key of the ingress port filter table: either an entry ID or
+ * the key element data. ntmp_ipft_add_entry() fills the keye member.
+ */
+union ipft_access_key {
+ struct ipft_ak_eid eid;
+ struct ipft_keye_data keye;
+};
+
+/* Ingress Port Filter Table Request Data Buffer Format of Update and
+ * Add actions
+ */
+struct ipft_req_ua {
+ struct ntmp_cmn_req_data crd;
+ union ipft_access_key ak;
+ struct ipft_cfge_data cfge;
+};
+
+/* Ingress Port Filter Table Request Data Buffer Format of Query and
+ * Delete actions
+ */
+struct ipft_req_qd {
+ struct ntmp_req_by_eid rbe;
+ __le32 resv[52];
+};
+
/* Access Key Format of FDB Table */
struct fdbt_ak_eid {
__le32 entry_id;
diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
index d74714a402f6..f68551045b60 100644
--- a/include/linux/fsl/ntmp.h
+++ b/include/linux/fsl/ntmp.h
@@ -7,6 +7,7 @@
#include <linux/if_ether.h>
#define NTMP_NULL_ENTRY_ID 0xffffffffU
+#define IPFT_MAX_PLD_LEN 24
struct maft_keye_data {
u8 mac_addr[ETH_ALEN];
@@ -34,6 +35,7 @@ struct netc_tbl_vers {
u8 fdbt_ver;
u8 vft_ver;
u8 bpt_ver;
+ u8 ipft_ver;
};
struct netc_swcbd {
@@ -73,6 +75,94 @@ struct maft_entry_data {
struct maft_cfge_data cfge;
};
+struct ipft_pld_byte {
+ u8 data;
+ u8 mask;
+};
+
+struct ipft_keye_data {
+ __le16 precedence;
+ __le16 resv0[3];
+ __le16 frm_attr_flags;
+#define IPFT_FAF_OVLAN BIT(2)
+#define IPFT_FAF_IVLAN BIT(3)
+#define IPFT_FAF_IP_HDR BIT(7)
+#define IPFT_FAF_IP_VER6 BIT(8)
+#define IPFT_FAF_L4_CODE GENMASK(11, 10)
+#define IPFT_FAF_TCP_HDR 1
+#define IPFT_FAF_UDP_HDR 2
+#define IPFT_FAF_SCTP_HDR 3
+#define IPFT_FAF_WOL_MAGIC BIT(12)
+ __le16 frm_attr_flags_mask;
+ __le16 dscp;
+#define IPFT_DSCP GENMASK(5, 0)
+#define IPFT_DSCP_MASK GENMASK(11, 6)
+#define IPFT_DSCP_MASK_ALL 0x3f
+ __le16 src_port; /* This field is reserved for ENETC */
+#define IPFT_SRC_PORT GENMASK(4, 0)
+#define IPFT_SRC_PORT_MASK GENMASK(9, 5)
+#define IPFT_SRC_PORT_MASK_ALL 0x1f
+ __be16 outer_vlan_tci;
+ __be16 outer_vlan_tci_mask;
+ u8 dmac[ETH_ALEN];
+ u8 dmac_mask[ETH_ALEN];
+ u8 smac[ETH_ALEN];
+ u8 smac_mask[ETH_ALEN];
+ __be16 inner_vlan_tci;
+ __be16 inner_vlan_tci_mask;
+ __be16 ethertype;
+ __be16 ethertype_mask;
+ u8 ip_protocol;
+ u8 ip_protocol_mask;
+ __le16 resv1[7];
+ __be32 ip_src[4];
+ __le32 resv2[2];
+ __be32 ip_src_mask[4];
+ __be16 l4_src_port;
+ __be16 l4_src_port_mask;
+ __le32 resv3;
+ __be32 ip_dst[4];
+ __le32 resv4[2];
+ __be32 ip_dst_mask[4];
+ __be16 l4_dst_port;
+ __be16 l4_dst_port_mask;
+ __le32 resv5;
+ struct ipft_pld_byte byte[IPFT_MAX_PLD_LEN];
+};
+
+struct ipft_cfge_data {
+ __le32 cfg;
+#define IPFT_IPV GENMASK(3, 0)
+#define IPFT_OIPV BIT(4)
+#define IPFT_DR GENMASK(6, 5)
+#define IPFT_ODR BIT(7)
+#define IPFT_FLTFA GENMASK(10, 8)
+#define IPFT_FLTFA_DISCARD 0
+#define IPFT_FLTFA_PERMIT 1
+/* Redirect is only for switch */
+#define IPFT_FLTFA_REDIRECT 2
+#define IPFT_IMIRE BIT(11)
+#define IPFT_WOLTE BIT(12)
+#define IPFT_FLTA GENMASK(14, 13)
+#define IPFT_FLTA_RP 1
+#define IPFT_FLTA_IS 2
+#define IPFT_FLTA_SI_BITMAP 3
+#define IPFT_RPR GENMASK(16, 15)
+#define IPFT_CTD BIT(17)
+#define IPFT_HR GENMASK(21, 18)
+#define IPFT_TIMECAPE BIT(22)
+#define IPFT_RRT BIT(23)
+#define IPFT_BL2F BIT(24)
+#define IPFT_EVMEID GENMASK(31, 28)
+ __le32 flta_tgt;
+};
+
+/* One ingress port filter table entry as passed to ntmp_ipft_add_entry():
+ * keye and cfge are copied into the request, and entry_id is written back
+ * from the response.
+ */
+struct ipft_entry_data {
+ u32 entry_id; /* hardware assigns entry ID */
+ struct ipft_keye_data keye;
+ struct ipft_cfge_data cfge;
+};
+
struct fdbt_keye_data {
u8 mac_addr[ETH_ALEN]; /* big-endian */
__le16 resv0;
@@ -162,6 +252,9 @@ int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
int count);
int ntmp_rsst_query_entry(struct ntmp_user *user,
u32 *table, int count);
+int ntmp_ipft_add_entry(struct ntmp_user *user,
+ struct ipft_entry_data *entry);
+int ntmp_ipft_delete_entry(struct ntmp_user *user, u32 entry_id);
int ntmp_fdbt_add_entry(struct ntmp_user *user, u32 *entry_id,
const struct fdbt_keye_data *keye,
const struct fdbt_cfge_data *cfge);
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 06/15] net: enetc: add support for the "Update" operation to buffer pool table
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The buffer pool table contains buffer pool configuration and operational
information. Each entry corresponds to a buffer pool. The Entry ID value
represents the buffer pool ID to access.
The buffer pool table is a static bounded index table; buffer pools are
always present and enabled. It only supports Update and Query operations.
This patch only adds the ntmp_bpt_update_entry() helper to support updating
the specified entry of the buffer pool table. The Query action for the
table will be added in the future.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
drivers/net/ethernet/freescale/enetc/ntmp.c | 39 +++++++++++++++++++
.../ethernet/freescale/enetc/ntmp_private.h | 6 +++
include/linux/fsl/ntmp.h | 26 +++++++++++++
3 files changed, 71 insertions(+)
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index db74a9107975..ba80f5e08d80 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -23,11 +23,15 @@
#define NTMP_RSST_ID 3
#define NTMP_FDBT_ID 15
#define NTMP_VFT_ID 18
+#define NTMP_BPT_ID 41
/* Generic Update Actions for most tables */
#define NTMP_GEN_UA_CFGEU BIT(0)
#define NTMP_GEN_UA_STSEU BIT(1)
+/* Specific Update Actions for some tables */
+#define BPT_UA_BPSEU BIT(1)
+
/* Query Action: 0: Full query. 1: Query entry ID, the fields after entry
* ID are not returned.
*/
@@ -271,6 +275,8 @@ static const char *ntmp_table_name(int tbl_id)
return "FDB Table";
case NTMP_VFT_ID:
return "VLAN Filter Table";
+ case NTMP_BPT_ID:
+ return "Buffer Pool Table";
default:
return "Unknown Table";
}
@@ -749,5 +755,38 @@ int ntmp_vft_add_entry(struct ntmp_user *user, u16 vid,
}
EXPORT_SYMBOL_GPL(ntmp_vft_add_entry);
+/**
+ * ntmp_bpt_update_entry - update an entry of the buffer pool table
+ * @user: target ntmp_user struct
+ * @entry_id: ID of the buffer pool table entry to update
+ * @cfge: configuration element data
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_bpt_update_entry(struct ntmp_user *user, u32 entry_id,
+			  const struct bpt_cfge_data *cfge)
+{
+	struct bpt_req_update *req;
+	struct netc_swcbd swcbd;
+	struct netc_cbdr *ring;
+	union netc_cbd cbd;
+	int ret;
+
+	swcbd.size = sizeof(*req);
+	ret = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+	if (ret)
+		return ret;
+
+	/* Request data: the entry is addressed by its entry ID */
+	ntmp_fill_crd_eid(&req->rbe, user->tbl.bpt_ver, 0,
+			  NTMP_GEN_UA_CFGEU | BPT_UA_BPSEU, entry_id);
+	req->cfge = *cfge;
+
+	/* Request header: no response data length is requested */
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, NTMP_LEN(swcbd.size, 0),
+			      NTMP_BPT_ID, NTMP_CMD_UPDATE, NTMP_AM_ENTRY_ID);
+
+	ntmp_select_and_lock_cbdr(user, &ring);
+	ret = netc_xmit_ntmp_cmd(ring, &cbd, &swcbd);
+	if (ret) {
+		dev_err(user->dev,
+			"Failed to update %s entry 0x%x, err: %pe\n",
+			ntmp_table_name(NTMP_BPT_ID), entry_id, ERR_PTR(ret));
+	}
+	ntmp_unlock_cbdr(ring);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ntmp_bpt_update_entry);
+
MODULE_DESCRIPTION("NXP NETC Library");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index 575ee783be47..64df49e9a3ef 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -175,4 +175,10 @@ struct vft_req_ua {
struct vft_cfge_data cfge;
};
+/* Buffer Pool Table Request Data Buffer Format of Update action */
+struct bpt_req_update {
+ struct ntmp_req_by_eid rbe;
+ struct bpt_cfge_data cfge;
+};
+
#endif
diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
index 3672e0dc7726..d74714a402f6 100644
--- a/include/linux/fsl/ntmp.h
+++ b/include/linux/fsl/ntmp.h
@@ -33,6 +33,7 @@ struct netc_tbl_vers {
u8 rsst_ver;
u8 fdbt_ver;
u8 vft_ver;
+ u8 bpt_ver;
};
struct netc_swcbd {
@@ -123,6 +124,29 @@ struct vft_cfge_data {
__le32 et_eid;
};
+struct bpt_bpse_data {
+ __le32 amount_used;
+ __le32 amount_used_hwm;
+ u8 bpd_fc_state;
+#define BPT_FC_STATE BIT(0)
+#define BPT_BPD BIT(1)
+} __packed;
+
+struct bpt_cfge_data {
+ u8 fccfg_sbpen;
+#define BPT_SBP_EN BIT(0)
+#define BPT_FC_CFG GENMASK(2, 1)
+#define BPT_FC_CFG_EN_BPFC 1
+ u8 pfc_vector;
+ __le16 max_thresh;
+ __le16 fc_on_thresh;
+ __le16 fc_off_thresh;
+ __le16 sbp_thresh;
+ __le16 resv;
+ __le32 sbp_eid;
+ __le32 fc_ports;
+};
+
#if IS_ENABLED(CONFIG_NXP_NETC_LIB)
int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs);
@@ -149,6 +173,8 @@ int ntmp_fdbt_search_port_entry(struct ntmp_user *user, int port,
struct fdbt_entry_data *entry);
int ntmp_vft_add_entry(struct ntmp_user *user, u16 vid,
const struct vft_cfge_data *cfge);
+int ntmp_bpt_update_entry(struct ntmp_user *user, u32 entry_id,
+ const struct bpt_cfge_data *cfge);
#else
static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs)
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 05/15] net: enetc: add support for the "Add" operation to VLAN filter table
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The VLAN filter table contains configuration and control information for
each VLAN configured on the switch. Each VLAN entry includes the VLAN
port membership, which FID to use in the FDB lookup, which spanning tree
group to use, the egress frame modification actions to apply to a frame
exiting from this VLAN, and various configuration and control parameters
for this VLAN.
The VLAN filter table can only be managed by the command BD ring using
table management protocol version 2.0. The table supports Add, Delete,
Update and Query operations. And the table supports 3 access methods:
Entry ID, Exact Match Key Element and Search. But currently we only add
the ntmp_vft_add_entry() helper to support the upcoming switch driver to
add an entry to the VLAN filter table. Other interfaces will be added in
the future.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
drivers/net/ethernet/freescale/enetc/ntmp.c | 50 +++++++++++++++++++
.../ethernet/freescale/enetc/ntmp_private.h | 19 +++++++
include/linux/fsl/ntmp.h | 24 +++++++++
3 files changed, 93 insertions(+)
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index 6074eeafd5a2..db74a9107975 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -22,6 +22,7 @@
#define NTMP_MAFT_ID 1
#define NTMP_RSST_ID 3
#define NTMP_FDBT_ID 15
+#define NTMP_VFT_ID 18
/* Generic Update Actions for most tables */
#define NTMP_GEN_UA_CFGEU BIT(0)
@@ -268,6 +269,8 @@ static const char *ntmp_table_name(int tbl_id)
return "RSS Table";
case NTMP_FDBT_ID:
return "FDB Table";
+ case NTMP_VFT_ID:
+ return "VLAN Filter Table";
default:
return "Unknown Table";
}
@@ -699,5 +702,52 @@ int ntmp_fdbt_search_port_entry(struct ntmp_user *user, int port,
}
EXPORT_SYMBOL_GPL(ntmp_fdbt_search_port_entry);
+/**
+ * ntmp_vft_add_entry - add an entry into the VLAN filter table
+ * @user: target ntmp_user struct
+ * @vid: VLAN ID, used as the exact-match key of the new entry
+ * @cfge: configuration element data
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_vft_add_entry(struct ntmp_user *user, u16 vid,
+		       const struct vft_cfge_data *cfge)
+{
+	struct vft_req_ua *req;
+	struct netc_swcbd swcbd;
+	struct netc_cbdr *ring;
+	union netc_cbd cbd;
+	u32 len;
+	int ret;
+
+	swcbd.size = sizeof(*req);
+	ret = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+	if (ret)
+		return ret;
+
+	/* Request data: only the configuration element is updated */
+	ntmp_fill_crd(&req->crd, user->tbl.vft_ver, 0, NTMP_GEN_UA_CFGEU);
+	req->ak.exact.vid = cpu_to_le16(vid);
+	req->cfge = *cfge;
+
+	/* Request header */
+	len = NTMP_LEN(swcbd.size, NTMP_STATUS_RESP_LEN);
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, len, NTMP_VFT_ID,
+			      NTMP_CMD_ADD, NTMP_AM_EXACT_KEY);
+
+	ntmp_select_and_lock_cbdr(user, &ring);
+	ret = netc_xmit_ntmp_cmd(ring, &cbd, &swcbd);
+	if (ret) {
+		dev_err(user->dev,
+			"Failed to add %s entry, vid: %u, err: %pe\n",
+			ntmp_table_name(NTMP_VFT_ID), vid, ERR_PTR(ret));
+	}
+	ntmp_unlock_cbdr(ring);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ntmp_vft_add_entry);
+
MODULE_DESCRIPTION("NXP NETC Library");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index b0b5805ac4f6..575ee783be47 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -156,4 +156,23 @@ struct fdbt_resp_query {
u8 resv[3];
};
+/* Access Key Format of VLAN Filter Table */
+struct vft_ak_exact {
+ __le16 vid; /* bit0~11: VLAN ID, other bits are reserved */
+ __le16 resv;
+};
+
+union vft_access_key {
+ __le32 entry_id; /* entry_id match */
+ struct vft_ak_exact exact;
+ __le32 resume_entry_id; /* search */
+};
+
+/* VLAN Filter Table Request Data Buffer Format of Update and Add actions */
+struct vft_req_ua {
+ struct ntmp_cmn_req_data crd;
+ union vft_access_key ak;
+ struct vft_cfge_data cfge;
+};
+
#endif
diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
index 4cfff835954e..3672e0dc7726 100644
--- a/include/linux/fsl/ntmp.h
+++ b/include/linux/fsl/ntmp.h
@@ -32,6 +32,7 @@ struct netc_tbl_vers {
u8 maft_ver;
u8 rsst_ver;
u8 fdbt_ver;
+ u8 vft_ver;
};
struct netc_swcbd {
@@ -101,6 +102,27 @@ struct fdbt_entry_data {
#define FDBT_ACT_FLAG BIT(7)
};
+struct vft_cfge_data {
+ __le32 bitmap_stg;
+#define VFT_PORT_MEMBERSHIP GENMASK(23, 0)
+#define VFT_STG_ID_MASK GENMASK(27, 24)
+#define VFT_STG_ID(g) FIELD_PREP(VFT_STG_ID_MASK, (g))
+ __le16 fid;
+#define VFT_FID GENMASK(11, 0)
+ __le16 cfg;
+#define VFT_MLO GENMASK(2, 0)
+#define VFT_MFO GENMASK(4, 3)
+#define VFT_IPMFE BIT(6)
+#define VFT_IPMFLE BIT(7)
+#define VFT_PGA BIT(8)
+#define VFT_SFDA BIT(10)
+#define VFT_OSFDA BIT(11)
+#define VFT_FDBAFSS BIT(12)
+ __le32 eta_port_bitmap;
+#define VFT_ETA_PORT_BITMAP GENMASK(23, 0)
+ __le32 et_eid;
+};
+
#if IS_ENABLED(CONFIG_NXP_NETC_LIB)
int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs);
@@ -125,6 +147,8 @@ int ntmp_fdbt_delete_entry(struct ntmp_user *user, u32 entry_id);
int ntmp_fdbt_search_port_entry(struct ntmp_user *user, int port,
u32 *resume_entry_id,
struct fdbt_entry_data *entry);
+int ntmp_vft_add_entry(struct ntmp_user *user, u16 vid,
+ const struct vft_cfge_data *cfge);
#else
static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs)
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 04/15] net: enetc: add basic operations to the FDB table
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The FDB table is used for MAC learning lookups and MAC forwarding lookups.
Each table entry includes information such as a FID and MAC address that
may be unicast or multicast and a forwarding destination field containing
a port bitmap identifying the associated port(s) with the MAC address.
FDB table entries can be static or dynamic. Static entries are added from
software whereby dynamic entries are added either by software or by the
hardware as MAC addresses are learned in the datapath.
The FDB table can only be managed by the command BD ring using table
management protocol version 2.0. Table management command operations Add,
Delete, Update and Query are supported. And the FDB table supports three
access methods: Entry ID, Exact Match Key Element and Search. This patch
adds the following basic support for the FDB table:
ntmp_fdbt_update_entry() - update the configuration element data of a
specified FDB entry
ntmp_fdbt_delete_entry() - delete a specified FDB entry
ntmp_fdbt_add_entry() - add an entry into the FDB table
ntmp_fdbt_search_port_entry() - Search the FDB entry on the specified
port based on RESUME_ENTRY_ID.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
drivers/net/ethernet/freescale/enetc/ntmp.c | 205 +++++++++++++++++-
.../ethernet/freescale/enetc/ntmp_private.h | 61 +++++-
include/linux/fsl/ntmp.h | 44 +++-
3 files changed, 307 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index c94a928622fd..6074eeafd5a2 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
/*
* NETC NTMP (NETC Table Management Protocol) 2.0 Library
- * Copyright 2025 NXP
+ * Copyright 2025-2026 NXP
*/
#include <linux/dma-mapping.h>
@@ -21,11 +21,17 @@
/* Define NTMP Table ID */
#define NTMP_MAFT_ID 1
#define NTMP_RSST_ID 3
+#define NTMP_FDBT_ID 15
/* Generic Update Actions for most tables */
#define NTMP_GEN_UA_CFGEU BIT(0)
#define NTMP_GEN_UA_STSEU BIT(1)
+/* Query Action: 0: Full query. 1: Query entry ID, the fields after entry
+ * ID are not returned.
+ */
+#define NTMP_QA_ENTRY_ID 1
+
#define NTMP_ENTRY_ID_SIZE 4
#define RSST_ENTRY_NUM 64
#define RSST_STSE_DATA_SIZE(n) ((n) * 8)
@@ -260,6 +266,8 @@ static const char *ntmp_table_name(int tbl_id)
return "MAC Address Filter Table";
case NTMP_RSST_ID:
return "RSS Table";
+ case NTMP_FDBT_ID:
+ return "FDB Table";
default:
return "Unknown Table";
}
@@ -496,5 +504,200 @@ int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count)
}
EXPORT_SYMBOL_GPL(ntmp_rsst_query_entry);
+/**
+ * ntmp_fdbt_add_entry - add an entry into the FDB table
+ * @user: target ntmp_user struct
+ * @entry_id: returned value, the entry ID of the new added entry
+ * @keye: key element data
+ * @cfge: configuration element data
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_fdbt_add_entry(struct ntmp_user *user, u32 *entry_id,
+ const struct fdbt_keye_data *keye,
+ const struct fdbt_cfge_data *cfge)
+{
+ struct fdbt_resp_query *resp;
+ struct fdbt_req_ua *req;
+ struct netc_swcbd swcbd;
+ struct netc_cbdr *cbdr;
+ union netc_cbd cbd;
+ u32 len;
+ int err;
+
+ swcbd.size = sizeof(*req);
+ err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+ if (err)
+ return err;
+
+ /* Request data */
+ ntmp_fill_crd(&req->crd, user->tbl.fdbt_ver, NTMP_QA_ENTRY_ID,
+ NTMP_GEN_UA_CFGEU);
+ req->ak.exact.keye = *keye;
+ req->cfge = *cfge;
+
+ len = NTMP_LEN(swcbd.size, sizeof(*resp));
+ /* The entry ID is allotted by hardware, so we need to perform
+ * a query action after the add action to get the entry ID from
+ * hardware.
+ */
+ ntmp_fill_request_hdr(&cbd, swcbd.dma, len, NTMP_FDBT_ID,
+ NTMP_CMD_AQ, NTMP_AM_EXACT_KEY);
+
+ ntmp_select_and_lock_cbdr(user, &cbdr);
+ err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
+ if (err) {
+ dev_err(user->dev, "Failed to add %s entry, err: %pe\n",
+ ntmp_table_name(NTMP_FDBT_ID), ERR_PTR(err));
+ goto unlock_cbdr;
+ }
+
+ if (entry_id) {
+ resp = (struct fdbt_resp_query *)req;
+ *entry_id = le32_to_cpu(resp->entry_id);
+ }
+
+unlock_cbdr:
+ ntmp_unlock_cbdr(cbdr);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ntmp_fdbt_add_entry);
+
+/**
+ * ntmp_fdbt_update_entry - update the configuration element data of the
+ * specified FDB entry
+ * @user: target ntmp_user struct
+ * @entry_id: the specified entry ID of the FDB table
+ * @cfge: configuration element data
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_fdbt_update_entry(struct ntmp_user *user, u32 entry_id,
+ const struct fdbt_cfge_data *cfge)
+{
+ struct fdbt_req_ua *req;
+ struct netc_swcbd swcbd;
+ struct netc_cbdr *cbdr;
+ union netc_cbd cbd;
+ u32 len;
+ int err;
+
+ swcbd.size = sizeof(*req);
+ err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+ if (err)
+ return err;
+
+ /* Request data */
+ ntmp_fill_crd(&req->crd, user->tbl.fdbt_ver, 0, NTMP_GEN_UA_CFGEU);
+ req->ak.eid.entry_id = cpu_to_le32(entry_id);
+ req->cfge = *cfge;
+
+ /* Request header */
+ len = NTMP_LEN(swcbd.size, NTMP_STATUS_RESP_LEN);
+ ntmp_fill_request_hdr(&cbd, swcbd.dma, len, NTMP_FDBT_ID,
+ NTMP_CMD_UPDATE, NTMP_AM_ENTRY_ID);
+
+ ntmp_select_and_lock_cbdr(user, &cbdr);
+ err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
+ if (err)
+ dev_err(user->dev, "Failed to update %s entry, err: %pe\n",
+ ntmp_table_name(NTMP_FDBT_ID), ERR_PTR(err));
+
+ ntmp_unlock_cbdr(cbdr);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ntmp_fdbt_update_entry);
+
+/**
+ * ntmp_fdbt_delete_entry - delete the specified FDB entry
+ * @user: target ntmp_user struct
+ * @entry_id: the specified ID of the FDB entry
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_fdbt_delete_entry(struct ntmp_user *user, u32 entry_id)
+{
+ u32 req_len = sizeof(struct fdbt_req_qd);
+
+ return ntmp_delete_entry_by_id(user, NTMP_FDBT_ID,
+ user->tbl.fdbt_ver,
+ entry_id, req_len,
+ NTMP_STATUS_RESP_LEN);
+}
+EXPORT_SYMBOL_GPL(ntmp_fdbt_delete_entry);
+
+/**
+ * ntmp_fdbt_search_port_entry - Search the FDB entry on the specified
+ * port based on RESUME_ENTRY_ID
+ * @user: target ntmp_user struct
+ * @port: the specified switch port ID
+ * @resume_entry_id: it is both an input and an output. As an input, it
+ * represents the FDB entry ID to be searched. If it is a NULL entry ID,
+ * it indicates that the first FDB entry for that port is being searched.
+ * As an output, it represents the next FDB entry ID to be searched.
+ * @entry: returned value, the response data of the searched FDB entry
+ *
+ * Return: 0 on success, otherwise a negative error code
+ */
+int ntmp_fdbt_search_port_entry(struct ntmp_user *user, int port,
+ u32 *resume_entry_id,
+ struct fdbt_entry_data *entry)
+{
+ struct fdbt_resp_query *resp;
+ struct fdbt_req_qd *req;
+ struct netc_swcbd swcbd;
+ struct netc_cbdr *cbdr;
+ union netc_cbd cbd;
+ u32 len;
+ int err;
+
+ swcbd.size = sizeof(*req);
+ err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
+ if (err)
+ return err;
+
+ /* Request data */
+ ntmp_fill_crd(&req->crd, user->tbl.fdbt_ver, 0, 0);
+ req->ak.search.resume_eid = cpu_to_le32(*resume_entry_id);
+ req->ak.search.cfge.port_bitmap = cpu_to_le32(BIT(port));
+ /* Match CFGE_DATA[PORT_BITMAP] field */
+ req->ak.search.cfge_mc = FDBT_CFGE_MC_PORT_BITMAP;
+
+ /* Request header */
+ len = NTMP_LEN(swcbd.size, sizeof(*resp));
+ ntmp_fill_request_hdr(&cbd, swcbd.dma, len, NTMP_FDBT_ID,
+ NTMP_CMD_QUERY, NTMP_AM_SEARCH);
+
+ ntmp_select_and_lock_cbdr(user, &cbdr);
+ err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
+ if (err) {
+ dev_err(user->dev,
+ "Failed to search %s entry on port %d, err: %pe\n",
+ ntmp_table_name(NTMP_FDBT_ID), port, ERR_PTR(err));
+ goto unlock_cbdr;
+ }
+
+ if (!cbd.resp_hdr.num_matched) {
+ entry->entry_id = NTMP_NULL_ENTRY_ID;
+ *resume_entry_id = NTMP_NULL_ENTRY_ID;
+ goto unlock_cbdr;
+ }
+
+ resp = (struct fdbt_resp_query *)req;
+ *resume_entry_id = le32_to_cpu(resp->status);
+ entry->entry_id = le32_to_cpu(resp->entry_id);
+ entry->keye = resp->keye;
+ entry->cfge = resp->cfge;
+ entry->acte = resp->acte;
+
+unlock_cbdr:
+ ntmp_unlock_cbdr(cbdr);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ntmp_fdbt_search_port_entry);
+
MODULE_DESCRIPTION("NXP NETC Library");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index f8dff3ba2c28..b0b5805ac4f6 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
/*
* NTMP table request and response data buffer formats
- * Copyright 2025 NXP
+ * Copyright 2025-2026 NXP
*/
#ifndef __NTMP_PRIVATE_H
@@ -11,6 +11,7 @@
#include <linux/fsl/ntmp.h>
#define NTMP_EID_REQ_LEN 8
+#define NTMP_STATUS_RESP_LEN 4
#define NETC_CBDR_BD_NUM 256
#define NETC_CBDRCIR_INDEX GENMASK(9, 0)
#define NETC_CBDRCIR_SBE BIT(31)
@@ -30,6 +31,7 @@ union netc_cbd {
#define NTMP_CMD_QUERY BIT(2)
#define NTMP_CMD_ADD BIT(3)
#define NTMP_CMD_QU (NTMP_CMD_QUERY | NTMP_CMD_UPDATE)
+#define NTMP_CMD_AQ (NTMP_CMD_ADD | NTMP_CMD_QUERY)
u8 access_method;
#define NTMP_ACCESS_METHOD GENMASK(7, 4)
#define NTMP_AM_ENTRY_ID 0
@@ -97,4 +99,61 @@ struct rsst_req_update {
u8 groups[];
};
+/* Access Key Format of FDB Table */
+struct fdbt_ak_eid {
+ __le32 entry_id;
+ __le32 resv[7];
+};
+
+struct fdbt_ak_exact {
+ struct fdbt_keye_data keye;
+ __le32 resv[5];
+};
+
+struct fdbt_ak_search {
+ __le32 resume_eid;
+ struct fdbt_keye_data keye;
+ struct fdbt_cfge_data cfge;
+ u8 acte;
+ u8 keye_mc;
+#define FDBT_KEYE_MAC GENMASK(1, 0)
+ u8 cfge_mc;
+#define FDBT_CFGE_MC GENMASK(2, 0)
+#define FDBT_CFGE_MC_ANY 0
+#define FDBT_CFGE_MC_DYNAMIC 1
+#define FDBT_CFGE_MC_PORT_BITMAP 2
+#define FDBT_CFGE_MC_DYNAMIC_AND_PORT_BITMAP 3
+ u8 acte_mc;
+#define FDBT_ACTE_MC BIT(0)
+};
+
+union fdbt_access_key {
+ struct fdbt_ak_eid eid;
+ struct fdbt_ak_exact exact;
+ struct fdbt_ak_search search;
+};
+
+/* FDB Table Request Data Buffer Format of Update and Add actions */
+struct fdbt_req_ua {
+ struct ntmp_cmn_req_data crd;
+ union fdbt_access_key ak;
+ struct fdbt_cfge_data cfge;
+};
+
+/* FDB Table Request Data Buffer Format of Query and Delete actions */
+struct fdbt_req_qd {
+ struct ntmp_cmn_req_data crd;
+ union fdbt_access_key ak;
+};
+
+/* FDB Table Response Data Buffer Format of Query action */
+struct fdbt_resp_query {
+ __le32 status;
+ __le32 entry_id;
+ struct fdbt_keye_data keye;
+ struct fdbt_cfge_data cfge;
+ u8 acte;
+ u8 resv[3];
+};
+
#endif
diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
index 83a449b4d6ec..4cfff835954e 100644
--- a/include/linux/fsl/ntmp.h
+++ b/include/linux/fsl/ntmp.h
@@ -1,11 +1,13 @@
/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
-/* Copyright 2025 NXP */
+/* Copyright 2025-2026 NXP */
#ifndef __NETC_NTMP_H
#define __NETC_NTMP_H
#include <linux/bitops.h>
#include <linux/if_ether.h>
+#define NTMP_NULL_ENTRY_ID 0xffffffffU
+
struct maft_keye_data {
u8 mac_addr[ETH_ALEN];
__le16 resv;
@@ -29,6 +31,7 @@ struct netc_cbdr_regs {
struct netc_tbl_vers {
u8 maft_ver;
u8 rsst_ver;
+ u8 fdbt_ver;
};
struct netc_swcbd {
@@ -68,6 +71,36 @@ struct maft_entry_data {
struct maft_cfge_data cfge;
};
+struct fdbt_keye_data {
+ u8 mac_addr[ETH_ALEN]; /* big-endian */
+ __le16 resv0;
+ __le16 fid;
+#define FDBT_FID GENMASK(11, 0)
+ __le16 resv1;
+};
+
+struct fdbt_cfge_data {
+ __le32 port_bitmap;
+#define FDBT_PORT_BITMAP GENMASK(23, 0)
+ __le32 cfg;
+#define FDBT_OETEID GENMASK(1, 0)
+#define FDBT_EPORT GENMASK(6, 2)
+#define FDBT_IMIRE BIT(7)
+#define FDBT_CTD GENMASK(10, 9)
+#define FDBT_DYNAMIC BIT(11)
+#define FDBT_TIMECAPE BIT(12)
+ __le32 et_eid;
+};
+
+struct fdbt_entry_data {
+ u32 entry_id;
+ struct fdbt_keye_data keye;
+ struct fdbt_cfge_data cfge;
+ u8 acte;
+#define FDBT_ACT_CNT GENMASK(6, 0)
+#define FDBT_ACT_FLAG BIT(7)
+};
+
#if IS_ENABLED(CONFIG_NXP_NETC_LIB)
int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs);
@@ -83,6 +116,15 @@ int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
int count);
int ntmp_rsst_query_entry(struct ntmp_user *user,
u32 *table, int count);
+int ntmp_fdbt_add_entry(struct ntmp_user *user, u32 *entry_id,
+ const struct fdbt_keye_data *keye,
+ const struct fdbt_cfge_data *cfge);
+int ntmp_fdbt_update_entry(struct ntmp_user *user, u32 entry_id,
+ const struct fdbt_cfge_data *cfge);
+int ntmp_fdbt_delete_entry(struct ntmp_user *user, u32 entry_id);
+int ntmp_fdbt_search_port_entry(struct ntmp_user *user, int port,
+ u32 *resume_entry_id,
+ struct fdbt_entry_data *entry);
#else
static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
const struct netc_cbdr_regs *regs)
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 03/15] net: enetc: add pre-boot initialization for i.MX94 switch
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
Before probing the NETC switch driver, some pre-initialization needs to
be done in NETCMIX and IERB to ensure that the switch can work properly.
For example, the i.MX94 NETC switch has three external ports and each port
is bound to a link. Each link needs to be configured, e.g. its I/O
variant and MII protocol, so that it can work properly.
In addition, the switch port 2 (MAC 2) and ENETC 0 (MAC 3) share the same
parallel interface, so they cannot be used at the same time due to an SoC
constraint. The MAC selection is controlled by the mac2_mac3_sel bit
of the EXT_PIN_CONTROL register. Currently, the interface is set for ENETC 0
by default unless the switch port 2 is enabled in the DT node.
Like ENETC, each external port of the NETC switch can manage its external
PHY through its port MDIO registers. The port can only access its own
external PHY by setting the PHY address in LaBCR[MDIO_PHYAD_PRTAD].
If the accessed PHY address is not equal to LaBCR[MDIO_PHYAD_PRTAD], then
the MDIO access initiated by port MDIO will be invalid.
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
.../ethernet/freescale/enetc/netc_blk_ctrl.c | 185 +++++++++++++++---
1 file changed, 163 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c b/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
index 92a0f824dae7..c7eb0234c785 100644
--- a/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
+++ b/drivers/net/ethernet/freescale/enetc/netc_blk_ctrl.c
@@ -261,40 +261,108 @@ static int imx94_link_config(struct netc_blk_ctrl *priv,
}
static int imx94_enetc_link_config(struct netc_blk_ctrl *priv,
- struct device_node *np)
+ struct device_node *np,
+ bool *enetc0_en)
{
int link_id = imx94_enetc_get_link_id(np);
if (link_id < 0)
return link_id;
+ if (link_id == IMX94_ENETC0_LINK && of_device_is_available(np))
+ *enetc0_en = true;
+
return imx94_link_config(priv, np, link_id);
}
+static int imx94_switch_link_config(struct netc_blk_ctrl *priv,
+ struct device_node *np,
+ bool *swp2_en)
+{
+ struct device_node *ports;
+ u32 port_id;
+ int err = 0;
+
+ ports = of_get_child_by_name(np, "ethernet-ports");
+ if (!ports)
+ return -ENODEV;
+
+ /* The switch may be owned by a guest OS, in this case, the switch
+ * node in the host OS will be disabled, but we still hope that the
+ * host OS could do some configurations for the switch, as the
+ * netc_blk_ctrl is owned by host OS. So of_device_is_available()
+ * is not needed here.
+ */
+ for_each_available_child_of_node_scoped(ports, child) {
+ if (of_property_read_u32(child, "reg", &port_id) < 0) {
+ err = -ENODEV;
+ goto end;
+ }
+
+ switch (port_id) {
+ case 0 ... 2: /* External ports */
+ err = imx94_link_config(priv, child, port_id);
+ if (err)
+ goto end;
+
+ if (port_id == 2)
+ *swp2_en = true;
+
+ break;
+ case 3: /* CPU port */
+ break;
+ default:
+ err = -EINVAL;
+ goto end;
+ }
+ }
+
+end:
+ of_node_put(ports);
+
+ return err;
+}
+
static int imx94_netcmix_init(struct platform_device *pdev)
{
struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
struct device_node *np = pdev->dev.of_node;
+ bool enetc0_en = false, swp2_en = false;
u32 val;
int err;
for_each_child_of_node_scoped(np, child) {
for_each_child_of_node_scoped(child, gchild) {
- if (!of_device_is_compatible(gchild, "pci1131,e101"))
- continue;
-
- err = imx94_enetc_link_config(priv, gchild);
- if (err)
- return err;
+ if (of_device_is_compatible(gchild, "pci1131,e101")) {
+ err = imx94_enetc_link_config(priv, gchild,
+ &enetc0_en);
+ if (err)
+ return err;
+ } else if (of_device_is_compatible(gchild,
+ "pci1131,eef2")) {
+ err = imx94_switch_link_config(priv, gchild,
+ &swp2_en);
+ if (err)
+ return err;
+ }
}
}
- /* ENETC 0 and switch port 2 share the same parallel interface.
- * Currently, the switch is not supported, so this interface is
- * used by ENETC 0 by default.
+ if (enetc0_en && swp2_en) {
+ dev_err(&pdev->dev,
+ "Cannot enable swp2 and enetc0 at the same time\n");
+ return -EINVAL;
+ }
+
+ /* ENETC 0 and switch port 2 share the same parallel interface, they
+ * cannot be enabled at the same time. The interface is set for the
+ * ENETC 0 by default unless the switch port 2 is enabled in the DTS.
*/
val = netc_reg_read(priv->netcmix, IMX94_EXT_PIN_CONTROL);
- val |= MAC2_MAC3_SEL;
+ if (!swp2_en)
+ val |= MAC2_MAC3_SEL;
+ else
+ val &= ~MAC2_MAC3_SEL;
netc_reg_write(priv->netcmix, IMX94_EXT_PIN_CONTROL, val);
return 0;
@@ -610,6 +678,78 @@ static int imx94_enetc_mdio_phyaddr_config(struct netc_blk_ctrl *priv,
return 0;
}
+static int imx94_ierb_enetc_init(struct netc_blk_ctrl *priv,
+ struct device_node *np,
+ u32 phy_mask)
+{
+ int err;
+
+ err = imx94_enetc_update_tid(priv, np);
+ if (err)
+ return err;
+
+ return imx94_enetc_mdio_phyaddr_config(priv, np, phy_mask);
+}
+
+static int imx94_switch_mdio_phyaddr_config(struct netc_blk_ctrl *priv,
+ struct device_node *np,
+ u32 port_id, u32 phy_mask)
+{
+ int addr;
+
+ /* The switch has 3 external ports at most */
+ if (port_id > 2)
+ return 0;
+
+ addr = netc_get_phy_addr(np);
+ if (addr < 0) {
+ if (addr == -ENODEV)
+ return 0;
+
+ return addr;
+ }
+
+ if (phy_mask & BIT(addr)) {
+ dev_err(&priv->pdev->dev,
+ "Found same PHY address in EMDIO and switch node\n");
+ return -EINVAL;
+ }
+
+ netc_reg_write(priv->ierb, IERB_LBCR(port_id),
+ LBCR_MDIO_PHYAD_PRTAD(addr));
+
+ return 0;
+}
+
+static int imx94_ierb_switch_init(struct netc_blk_ctrl *priv,
+ struct device_node *np,
+ u32 phy_mask)
+{
+ struct device_node *ports;
+ u32 port_id;
+ int err = 0;
+
+ ports = of_get_child_by_name(np, "ethernet-ports");
+ if (!ports)
+ return -ENODEV;
+
+ for_each_available_child_of_node_scoped(ports, child) {
+ err = of_property_read_u32(child, "reg", &port_id);
+ if (err)
+ goto end;
+
+ err = imx94_switch_mdio_phyaddr_config(priv, child,
+ port_id, phy_mask);
+ if (err)
+ goto end;
+ }
+
+end:
+ of_node_put(ports);
+
+ return err;
+}
+
static int imx94_ierb_init(struct platform_device *pdev)
{
struct netc_blk_ctrl *priv = platform_get_drvdata(pdev);
@@ -625,17 +765,18 @@ static int imx94_ierb_init(struct platform_device *pdev)
for_each_child_of_node_scoped(np, child) {
for_each_child_of_node_scoped(child, gchild) {
- if (!of_device_is_compatible(gchild, "pci1131,e101"))
- continue;
-
- err = imx94_enetc_update_tid(priv, gchild);
- if (err)
- return err;
-
- err = imx94_enetc_mdio_phyaddr_config(priv, gchild,
- phy_mask);
- if (err)
- return err;
+ if (of_device_is_compatible(gchild, "pci1131,e101")) {
+ err = imx94_ierb_enetc_init(priv, gchild,
+ phy_mask);
+ if (err)
+ return err;
+ } else if (of_device_is_compatible(gchild,
+ "pci1131,eef2")) {
+ err = imx94_ierb_switch_init(priv, gchild,
+ phy_mask);
+ if (err)
+ return err;
+ }
}
}
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 02/15] dt-bindings: net: dsa: add NETC switch
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
Add bindings for NETC switch. This switch is a PCIe function of NETC IP,
it supports advanced QoS with 8 traffic classes and 4 drop resilience
levels, and a full range of TSN standards capabilities. The switch CPU
port connects to an internal ENETC port, which is also a PCIe function
of NETC IP. So these two ports use a light-weight "pseudo MAC" instead
of a back-to-back MAC, because the "pseudo MAC" provides the delineation
between switch and ENETC, this translates to lower power (less logic and
memory) and lower delay (as there is no serialization delay across this
link).
Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
---
.../bindings/net/dsa/nxp,netc-switch.yaml | 127 ++++++++++++++++++
1 file changed, 127 insertions(+)
create mode 100644 Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
diff --git a/Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml b/Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
new file mode 100644
index 000000000000..988688bf4467
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/dsa/nxp,netc-switch.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NETC Switch family
+
+description: >
+ The NETC presents itself as a multi-function PCIe Root Complex Integrated
+ Endpoint (RCiEP) and provides full 802.1Q Ethernet switch functionality,
+ advanced QoS with 8 traffic classes and 4 drop resilience levels, and a
+ full range of TSN standards capabilities.
+
+ The CPU port of the switch connects to an internal ENETC. The switch and
+ the internal ENETC are fully integrated into the NETC IP, a back-to-back
+ MAC is not required. Instead, a light-weight "pseudo MAC" provides the
+ delineation between the switch and ENETC. This translates to lower power
+ (less logic and memory) and lower delay (as there is no serialization
+ delay across this link).
+
+maintainers:
+ - Wei Fang <wei.fang@nxp.com>
+
+properties:
+ compatible:
+ enum:
+ - pci1131,eef2
+
+ reg:
+ maxItems: 1
+
+ dsa,member:
+ description: >
+ The property indicates DSA cluster and switch index. For NETC switch,
+ the valid range of the switch index is 1 ~ 7, the index is reflected
+ in the switch tag as an indication of the switch ID where the frame
+ originated. The value 0 is reserved for ENETC VEPA switch, whose ID
+ is hardwired to zero.
+
+ ethernet-ports:
+ type: object
+ patternProperties:
+ "^ethernet-port@[0-9a-f]$":
+ type: object
+ $ref: dsa-port.yaml#
+
+ properties:
+ clocks:
+ items:
+ - description: MAC transmit/receive reference clock.
+
+ clock-names:
+ items:
+ - const: ref
+
+ mdio:
+ $ref: /schemas/net/mdio.yaml#
+ unevaluatedProperties: false
+ description:
+ Optional child node for switch port, otherwise use NETC EMDIO.
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - dsa,member
+ - ethernet-ports
+
+allOf:
+ - $ref: /schemas/pci/pci-device.yaml
+ - $ref: dsa.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ pcie {
+ #address-cells = <3>;
+ #size-cells = <2>;
+
+ ethernet-switch@0,2 {
+ compatible = "pci1131,eef2";
+ reg = <0x200 0 0 0 0>;
+ dsa,member = <0 1>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_switch>;
+
+ ethernet-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-port@0 {
+ reg = <0>;
+ phy-handle = <&ethphy0>;
+ phy-mode = "mii";
+ };
+
+ ethernet-port@1 {
+ reg = <1>;
+ phy-handle = <&ethphy1>;
+ phy-mode = "mii";
+ };
+
+ ethernet-port@2 {
+ reg = <2>;
+ clocks = <&scmi_clk 103>;
+ clock-names = "ref";
+ phy-handle = <&ethphy2>;
+ phy-mode = "rgmii-id";
+ };
+
+ ethernet-port@3 {
+ reg = <3>;
+ ethernet = <&enetc3>;
+ phy-mode = "internal";
+
+ fixed-link {
+ speed = <2500>;
+ full-duplex;
+ pause;
+ };
+ };
+ };
+ };
+ };
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 01/15] dt-bindings: net: dsa: update the description of 'dsa,member' property
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
In-Reply-To: <20260509102954.4116624-1-wei.fang@nxp.com>
The current description indicates that the 'dsa,member' property cannot
be set for a switch that is not part of any cluster. Vladimir thinks
that this is a case where the actual technical limitation was poorly
transposed into words when this restriction was first documented, in
commit 8c5ad1d6179d ("net: dsa: Document new binding").
The true technical limitation is that many DSA tagging protocols are
topology-unaware, and always call dsa_conduit_find_user() with a
switch_id of 0. Specifying a custom "dsa,member" property with a
non-zero switch_id would break them.
Therefore, for topology-aware switches, it is fine to specify this
property for them, even if they are not part of any cluster. Our NETC
switch is a good example which is topology-aware, the switch_id is
carried in the switch tag, but the switch_id 0 is reserved for VEPA
switch and cannot be used, so we need to use this property to assign
a non-zero switch_id for it.
Suggested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Wei Fang <wei.fang@nxp.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
---
Documentation/devicetree/bindings/net/dsa/dsa.yaml | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.yaml b/Documentation/devicetree/bindings/net/dsa/dsa.yaml
index 2abd036578d1..801e1411e5c2 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.yaml
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.yaml
@@ -28,7 +28,11 @@ properties:
A two element list indicates which DSA cluster, and position within the
cluster a switch takes. <0 0> is cluster 0, switch 0. <0 1> is cluster 0,
switch 1. <1 0> is cluster 1, switch 0. A switch not part of any cluster
- (single device hanging off a CPU port) must not specify this property
+ (single device hanging off a CPU port) does not usually need to specify
+ this property, and then it becomes cluster 0, switch 0. For a topology
+ aware switch, its switch index can be specified through this property,
+ even if it is not part of any cluster. Also, topology-unaware switches
+ must always be defined as index 0 of their cluster.
$ref: /schemas/types.yaml#/definitions/uint32-array
additionalProperties: true
--
2.34.1
^ permalink raw reply related
* [PATCH v6 net-next 00/15] Add preliminary NETC switch support for i.MX94
From: Wei Fang @ 2026-05-09 10:29 UTC (permalink / raw)
To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
davem, edumazet, kuba, pabeni, robh, krzk+dt, conor+dt,
f.fainelli, frank.li, chleroy, horms, linux, maxime.chevallier,
andrew, olteanv
Cc: netdev, linux-kernel, devicetree, linuxppc-dev, linux-arm-kernel,
imx
i.MX94 NETC (v4.3) integrates 802.1Q Ethernet switch functionality, the
switch provides advanced QoS with 8 traffic classes and a full range of
TSN standards capabilities. It has 3 user ports and 1 CPU port, and the
CPU port is connected to an internal ENETC through the pseudo link, so
instead of a back-to-back MAC, the lightweight "pseudo MAC" is used at
both ends of the pseudo link to transfer Ethernet frames. The pseudo
link provides a zero-copy interface (no serialization delay) and lower
power (less logic and memory).
Like most Ethernet switches, the NETC switch also supports a proprietary
switch tag, which is used to carry in-band metadata information about frames.
This in-band metadata information can include the source port from which
the frame was received and the reason why the frame was forwarded
to the entity, and it allows the entity to indicate the precise destination
port of a frame. The NETC switch tag is added to frames after the source
MAC address. There are three types of switch tags, and each type has 1
to 4 subtypes, more details are as follows.
Forward switch tag (Type = 0): Represents forwarded frames.
- SubType = 0 - Normal frame processing.
To_Port switch tag (Type = 1): Represents frames that are to be sent to
a specific switch port.
- SubType = 0. No request to perform timestamping.
- SubType = 1. Request to perform one-step timestamping.
- SubType = 2. Request to perform two-step timestamping.
- SubType = 3. Request to perform both one-step timestamping and
two-step timestamping.
To_Host switch tag (Type = 2): Represents frames redirected or copied to
the switch management port.
- SubType = 0. Received frames redirected or copied to the switch
management port.
- SubType = 1. Received frames redirected or copied to the switch
management port with captured timestamp at the switch port where
the frame was received.
- SubType = 2. Transmit timestamp response (two-step timestamping).
Currently, this patch set supports Forward tag, SubType 0 of To_Port tag
and SubType 0 of To_Host tag. More tags will be supported in the future.
In addition, the switch supports NETC Table Management Protocol (NTMP),
some switch functionality is controlled using control messages sent to
the hardware using BD ring interface with 32B descriptors similar to the
packet Transmit BD ring used on ENETC. This interface is referred to as
the command BD ring. This is used to configure functionality where the
underlying resources may be shared between different entities or are
too large to configure using direct registers.
For this patch set, we have supported the following tables through the
command BD ring interface.
FDB Table: It contains forwarding and/or filtering information about MAC
addresses. The FDB table is used for MAC learning lookups and MAC
forwarding lookups.
VLAN Filter Table: It contains configuration and control information for
each VLAN configured on the switch.
Buffer Pool Table: It contains buffer pool configuration and operational
information. Each entry corresponds to a buffer pool. Currently, we use
this table to implement flow control feature on each port.
Ingress Port Filter Table: It contains a set of filters each capable of
classifying incoming traffic using a mix of L2, L3, and L4 parsed and
arbitrary field data. We use this table to implement host flood support
to the switch port.
The switch also supports other tables, and we will add more advanced
features through them in the future.
---
v6:
1. Use FIELD_GET() to get switch tag type
2. Add PCI BAR length check in netc_switch_pci_init()
3. Remove CPU port check from netc_port_change_mtu()
4. Move netc_port_get_info_from_dt() from patch 10 to patch 12
5. Update the comment for NTMP_QA_ENTRY_ID
6. Collect Reviewed-by tags
v5 link: https://lore.kernel.org/imx/20260430024945.3413973-1-wei.fang@nxp.com/
v4 link: https://lore.kernel.org/imx/20260331113025.1566878-1-wei.fang@nxp.com/
v3 link: https://lore.kernel.org/imx/20260326062917.3552334-1-wei.fang@nxp.com/
v2 link: https://lore.kernel.org/imx/20260323060752.1157031-1-wei.fang@nxp.com/
v1 link: https://lore.kernel.org/imx/20260316094152.1558671-1-wei.fang@nxp.com/
---
Wei Fang (15):
dt-bindings: net: dsa: update the description of 'dsa,member' property
dt-bindings: net: dsa: add NETC switch
net: enetc: add pre-boot initialization for i.MX94 switch
net: enetc: add basic operations to the FDB table
net: enetc: add support for the "Add" operation to VLAN filter table
net: enetc: add support for the "Update" operation to buffer pool
table
net: enetc: add support for "Add" and "Delete" operations to IPFT
net: enetc: add multiple command BD rings support
net: dsa: add NETC switch tag support
net: dsa: netc: introduce NXP NETC switch driver for i.MX94
net: dsa: netc: add phylink MAC operations
net: dsa: netc: add FDB, STP, MTU, port setup and host flooding
support
net: dsa: netc: initialize buffer pool table and implement
flow-control
net: dsa: netc: add support for the standardized counters
net: dsa: netc: add support for ethtool private statistics
.../devicetree/bindings/net/dsa/dsa.yaml | 6 +-
.../bindings/net/dsa/nxp,netc-switch.yaml | 127 ++
MAINTAINERS | 11 +
drivers/net/dsa/Kconfig | 2 +
drivers/net/dsa/Makefile | 1 +
drivers/net/dsa/netc/Kconfig | 14 +
drivers/net/dsa/netc/Makefile | 3 +
drivers/net/dsa/netc/netc_ethtool.c | 297 ++++
drivers/net/dsa/netc/netc_main.c | 1561 +++++++++++++++++
drivers/net/dsa/netc/netc_platform.c | 87 +
drivers/net/dsa/netc/netc_switch.h | 173 ++
drivers/net/dsa/netc/netc_switch_hw.h | 361 ++++
.../ethernet/freescale/enetc/netc_blk_ctrl.c | 185 +-
drivers/net/ethernet/freescale/enetc/ntmp.c | 383 +++-
.../ethernet/freescale/enetc/ntmp_private.h | 122 +-
include/linux/dsa/tag_netc.h | 14 +
include/linux/fsl/netc_global.h | 6 +
include/linux/fsl/ntmp.h | 187 +-
include/net/dsa.h | 2 +
include/uapi/linux/if_ether.h | 1 +
net/dsa/Kconfig | 10 +
net/dsa/Makefile | 1 +
net/dsa/tag_netc.c | 193 ++
23 files changed, 3717 insertions(+), 30 deletions(-)
create mode 100644 Documentation/devicetree/bindings/net/dsa/nxp,netc-switch.yaml
create mode 100644 drivers/net/dsa/netc/Kconfig
create mode 100644 drivers/net/dsa/netc/Makefile
create mode 100644 drivers/net/dsa/netc/netc_ethtool.c
create mode 100644 drivers/net/dsa/netc/netc_main.c
create mode 100644 drivers/net/dsa/netc/netc_platform.c
create mode 100644 drivers/net/dsa/netc/netc_switch.h
create mode 100644 drivers/net/dsa/netc/netc_switch_hw.h
create mode 100644 include/linux/dsa/tag_netc.h
create mode 100644 net/dsa/tag_netc.c
--
2.34.1
^ permalink raw reply
* [GIT PULL] Please pull powerpc/linux.git powerpc-7.1-2 tag
From: Madhavan Srinivasan @ 2026-05-09 5:17 UTC (permalink / raw)
To: Linus Torvalds
Cc: atrajeev, chleroy, christophe.leroy, krzysztof.kozlowski,
linux-kernel, linuxppc-dev, mahesh, mpe, nathan, naveen, npiggin,
ritesh.list, shivani, sourabhjain, tzimmermann, Nicholas Piggin
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
Hi Linus,
Please pull powerpc fixes for 7.1:
The following changes since commit 7fd2df204f342fc17d1a0bfcd474b24232fb0f32:
Linux 7.1-rc2 (2026-05-03 14:21:25 -0700)
are available in the git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
tags/powerpc-7.1-2
for you to fetch changes up to f583bd5f64d40e083dde5bb22846c4d93e59d471:
powerpc/pasemi: Drop redundant res assignment (2026-05-06 07:49:19 +0530)
- ------------------------------------------------------------------
powerpc fixes for 7.1 #2
- Fix KASAN sanitization flag for core_$(BITS).o
- Fixes for handling offset values in pseries htmdump
- Fix interrupt mask in cpm1_gpiochip_add16()
- ps3/pasemi fixes to drop redundant result assignment
- Fixes in papr-hvpipe code path
- powerpc/perf: Update check for PERF_SAMPLE_DATA_SRC marked events
Thanks to:
Aboorva Devarajan, Athira Rajeev, Christophe Leroy (CS GROUP), Geert
Uytterhoeven, Haren Myneni, Krzysztof Kozlowski, Mukesh Kumar Chaurasiya
(IBM),
Nathan Chancellor, Ritesh Harjani (IBM), Shivani Nittor, Sourabh Jain,
Thomas
Zimmermann, Venkat Rao Bagalkote
- ------------------------------------------------------------------
Athira Rajeev (4):
powerpc/pseries/htmdump: Free the global buffers in htmdump
module exit
powerpc/pseries/htmdump: Fix the offset value used in processor
configuration dump
powerpc/pseries/htmdump: Fix the offset value used in htm status dump
powerpc/pseries/htmdump: Add memory configuration dump support to
htmdump module
Christophe Leroy (CS GROUP) (1):
powerpc/8xx: Fix interrupt mask in cpm1_gpiochip_add16()
Krzysztof Kozlowski (2):
powerpc/ps3: Drop redundant result assignment
powerpc/pasemi: Drop redundant res assignment
Nathan Chancellor (1):
powerpc/vdso: Drop -DCC_USING_PATCHABLE_FUNCTION_ENTRY from
32-bit flags with clang
Ritesh Harjani (IBM) (9):
pseries/papr-hvpipe: Fix race with interrupt handler
pseries/papr-hvpipe: Prevent kernel stack memory leak to userspace
pseries/papr-hvpipe: Fix null ptr deref in
papr_hvpipe_dev_create_handle()
pseries/papr-hvpipe: Fix & simplify error handling in
papr_hvpipe_init()
pseries/papr-hvpipe: Fix the usage of copy_to_user()
pseries/papr-hvpipe: Simplify spin unlock usage in
papr_hvpipe_handle_release()
pseries/papr-hvpipe: Kill task_struct pointer from struct
hvpipe_source_info
pseries/papr-hvpipe: Refactor and simplify hvpipe_rtas_recv_msg()
pseries/papr-hvpipe: Fix style and checkpatch issues in
enable_hvpipe_IRQ()
Shivani Nittor (1):
powerpc/perf: Update check for PERF_SAMPLE_DATA_SRC marked events
Sourabh Jain (2):
powerpc/kdump: fix KASAN sanitization flag for core_$(BITS).o
powerpc/vmx: avoid KASAN instrumentation in enter_vmx_ops() for kexec
Thomas Zimmermann (1):
arch/powerpc: Drop CONFIG_FIRMWARE_EDID from defconfig files
arch/powerpc/configs/amigaone_defconfig | 1 -
arch/powerpc/configs/chrp32_defconfig | 1 -
arch/powerpc/configs/g5_defconfig | 1 -
arch/powerpc/configs/pasemi_defconfig | 1 -
arch/powerpc/configs/powernv_defconfig | 1 -
arch/powerpc/configs/ppc64_defconfig | 1 -
arch/powerpc/configs/ppc64e_defconfig | 1 -
arch/powerpc/configs/skiroot_defconfig | 1 -
arch/powerpc/kernel/vdso/Makefile | 6 +
arch/powerpc/kexec/Makefile | 2 +-
arch/powerpc/lib/vmx-helper.c | 9 +-
arch/powerpc/perf/core-book3s.c | 5 +-
arch/powerpc/platforms/8xx/cpm1.c | 4 +-
arch/powerpc/platforms/pasemi/pci.c | 3 +-
arch/powerpc/platforms/ps3/device-init.c | 4 +-
arch/powerpc/platforms/pseries/htmdump.c | 133 ++++++++++++--
arch/powerpc/platforms/pseries/papr-hvpipe.c | 181 +++++++++++---------
arch/powerpc/platforms/pseries/papr-hvpipe.h | 1 -
18 files changed, 235 insertions(+), 121 deletions(-)
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEEqX2DNAOgU8sBX3pRpnEsdPSHZJQFAmn+wtYACgkQpnEsdPSH
ZJSYihAAglGB3V+ICTWx2ic7FHPJp2uiuEhbvwdjrLhloqRVpOrs00TxS3Eso1q+
ySXqrr2u5qsPUeMr8h4VqiCnc7eGsfmfJaheItVs69Klv4FnexTilJgH8BpRc5z3
qHVPAZBAv1laCTN5uRm4pXBrv3V/vGWLtHjTkqBnkGGgy82luJlV5nyUvOYfR0IR
eRYWn2BxqvkFCQn0FzdPktJJuOuTca3u/DwA1s1IwsrBqdIblDUPCdXwlzXaB2p7
YmlFBjCXkcdu2MlNC1qWx+zJQqFCEXt2l58daW5HIGt/Fw7mNYr6EzoxnSgT2SvQ
33dzLtxOELEFZrUhZ/IgvMwnv7awS34gzNWXtowkuSdI+W6gFNPu3ahv8xQluF2K
+3gamy9lVudD1EAUBkr60dJLjYcCualClXh24U9pcZIdehmBFLInRC6gVyPJxAu4
hcGNNTCYtPzejUFi/8BywnsVZ9uRCwk2iLbYF8EzaJV9m9fciZetnm6IFs48L9zw
2P+qq9B8FdAmEyd0f1RKymFr4tBA05nrFex4P/5ujEWBOr1sUVIJvbaxPxpFs1vt
FykuXyzeJERyt7elKH0rTglxikxMVVFNLZ+pJMJh5R7fMZg7RTxSMCWCaBH2ecRb
MOmDYpWUsOsbsW+oIKqYEDRnYy8wz/hH0mQMWM8Xm5ZLmg2Il70=
=v1Hy
-----END PGP SIGNATURE-----
^ permalink raw reply
* Re: [PATCH v3 0/9] pseries/papr-hvpipe: Fix deadlock, races and misc cleanups
From: Aboorva Devarajan @ 2026-05-09 4:09 UTC (permalink / raw)
To: Madhavan Srinivasan, linuxppc-dev
Cc: Ritesh Harjani (IBM), Haren Myneni, Christophe Leroy,
Venkat Rao Bagalkote, Nicholas Piggin, linux-kernel
In-Reply-To: <9dad7c263f8197000d4e56b09532d87a7f7f57e4.camel@linux.ibm.com>
On Fri, 2026-05-08 at 13:16 +0530, Aboorva Devarajan wrote:
> On Fri, 2026-05-01 at 09:41 +0530, Ritesh Harjani (IBM) wrote:
> > While going over papr-hvpipe code, there were a few fixes which were identified.
> > This patch series is an attempt to fix those along with some misc cleanups.
> > Me and Haren are trying to get these patches verified on a real HW. The tests
> > are not straightforward and we are waiting for the results.
> > Will update on the test results once we hear back from the internal test team.
> >
> > v2->v3:
> > ======
> > 1. Rearranged the patches in such a way that it is easier to backport the fixes
> > if required.
> > 2. Clubbed patch-8 and patch-10 (of v2) since they both were changing the same function.
> > 3. Handled ret>=0 case in copy_to_user patch, when the user itself may request
> > for 0 effective bytes (after the HDR_LEN).
>
>
> Since this is CCed to stable, it is currently being evaluated by RSCT.
> We can merge it once we receive an Acked-by from RSCT.
>
>
An update from RSCT: with the patch, the earlier issues observed are
now resolved, and the inband RMC connection is successfully established
with the patched kernel.
Thanks,
Aboorva
>
> >
> > [v2]: https://lore.kernel.org/linuxppc-dev/cover.1775648406.git.ritesh.list@gmail.com/
> >
> > v1->v2:
> > ========
> > 1. Fix a possible deadlock due to use of spin_lock instead of spin_lock_irqsave.
> > 2. Prevent kernel stack uninit memory leak to userspace
> > 3. Fix the race condition in null-ptr-deref case where there may be an
> > msg pending to be consumed from the hvpipe.
> > 4. Fixed error handling in init routine in patch-10
> >
> > [v1]: https://lore.kernel.org/linuxppc-dev/cover.1775569027.git.ritesh.list@gmail.com/#t
> >
> > Ritesh Harjani (IBM) (9):
> > pseries/papr-hvpipe: Fix race with interrupt handler
> > pseries/papr-hvpipe: Prevent kernel stack memory leak to userspace
> > pseries/papr-hvpipe: Fix null ptr deref in papr_hvpipe_dev_create_handle()
> > pseries/papr-hvpipe: Fix & simplify error handling in papr_hvpipe_init()
> > pseries/papr-hvpipe: Fix the usage of copy_to_user()
> > pseries/papr-hvpipe: Simplify spin unlock usage in papr_hvpipe_handle_release()
> > pseries/papr-hvpipe: Kill task_struct pointer from struct hvpipe_source_info
> > pseries/papr-hvpipe: Refactor and simplify hvpipe_rtas_recv_msg()
> > pseries/papr-hvpipe: Fix style and checkpatch issues in enable_hvpipe_IRQ()
> >
> > arch/powerpc/platforms/pseries/papr-hvpipe.c | 181 ++++++++++---------
> > arch/powerpc/platforms/pseries/papr-hvpipe.h | 1 -
> > 2 files changed, 97 insertions(+), 85 deletions(-)
> >
> > --
> > 2.39.5
^ permalink raw reply
* Re: [PATCH v3 net] net: wan: fsl_ucc_hdlc: free tx_skbuff in uhdlc_memclean
From: patchwork-bot+netdevbpf @ 2026-05-09 1:50 UTC (permalink / raw)
To: Holger Brunck
Cc: netdev, linuxppc-dev, andrew+netdev, chleroy, qiang.zhao, horms,
kuba
In-Reply-To: <20260507155332.3452319-1-holger.brunck@hitachienergy.com>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 7 May 2026 17:53:32 +0200 you wrote:
> When the device is removed all allocated resources should be freed.
> In uhdlc_memclean the netdev transmit queue was already stopped. But at
> this point we may have pending skb in the transmit queue which must be
> freed. Therefore iterate over the tx_skbuff pointers and free all
> pending skb. The issue was discovered by sashiko.
> Tested on a ls1043a board running HDLC in bus mode on kernel 6.12.
>
> [...]
Here is the summary with links:
- [v3,net] net: wan: fsl_ucc_hdlc: free tx_skbuff in uhdlc_memclean
https://git.kernel.org/netdev/net/c/496c0c4c53bb
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH v7 00/24] PCI: Convert all dynamic sysfs attributes to static
From: Bjorn Helgaas @ 2026-05-08 23:00 UTC (permalink / raw)
To: Krzysztof Wilczyński
Cc: Bjorn Helgaas, Manivannan Sadhasivam, Lorenzo Pieralisi,
Alex Williamson, Magnus Lindholm, Matt Turner, Richard Henderson,
Christophe Leroy, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Dexuan Cui, Krzysztof Hałasa, Lukas Wunner,
Oliver O'Halloran, Saurabh Singh Sengar, Shuan He,
Srivatsa Bhat, Ilpo Järvinen, linux-pci, linux-alpha,
linuxppc-dev
In-Reply-To: <20260508043543.217179-1-kwilczynski@kernel.org>
On Fri, May 08, 2026 at 04:35:19AM +0000, Krzysztof Wilczyński wrote:
> Hello,
>
> This series converts every dynamically allocated PCI sysfs attribute to
> a static const definition. After the full series, pci_sysfs_init() and
> sysfs_initialized are gone, and every sysfs file is created by the
> driver model at device_add() time.
>
> Currently, the PCI resource files (resourceN, resourceN_wc) and the
> legacy bus files (legacy_io, legacy_mem) are created dynamically
> from two unsynchronised paths:
>
> Path A: late_initcall
>
> pci_sysfs_init() (late_initcall)
> sysfs_initialized = 1
> for_each_pci_dev()
> pci_create_sysfs_dev_files()
> sysfs_create_bin_file() (resourceN, resourceN_wc)
> pci_find_next_bus()
> pci_create_legacy_files()
> sysfs_create_bin_file() (legacy_io, legacy_mem)
>
> Path B: device registration / hotplug
>
> pci_bus_add_devices()
> pci_bus_add_device()
> pci_create_sysfs_dev_files()
> if (!sysfs_initialized) <- only guard
> return
> sysfs_create_bin_file() (resourceN, resourceN_wc)
>
> On most ACPI systems this does not race because PCI enumeration
> completes at subsys_initcall time, before pci_sysfs_init() runs:
>
> subsys_initcall (level 4):
> acpi_pci_root_add()
> pci_bus_add_device()
> pci_create_sysfs_dev_files()
> if (!sysfs_initialized) <- variable not yet set
> return -EACCES
>
> late_initcall (level 7):
> pci_sysfs_init()
> sysfs_initialized = 1
> for_each_pci_dev()
> pci_create_sysfs_dev_files() <- creates the files, no race
>
> On Devicetree platforms the host controller is a platform driver that
> probes via the driver model, often on a workqueue, and overlaps with the
> late_initcall:
>
> CPU 0 (late_initcall) CPU 1 (driver probe)
> --------------------------- ----------------------------
> pci_sysfs_init()
> sysfs_initialized = 1
> for_each_pci_dev() pci_bus_add_device()
> pci_create_sysfs_dev_files() pci_create_sysfs_dev_files()
> sysfs_create_bin_file() sysfs_create_bin_file()
> -> "duplicate filename"
>
> The same happens on ACPI when probing is asynchronous (hv_pci on
> Azure, RISC-V with ACPI).
>
> The duplicate causes sysfs_create_bin_file() to fail with -EEXIST.
> pci_create_resource_files() then calls pci_remove_resource_files() in
> its error unwind, tearing down files the other thread created and
> still references through pdev->res_attr[]. This has caused kernel
> panics on i.MX6 and boot failures on other platforms.
>
> Several different fixes have been proposed over the years: reordering
> the sysfs_initialized assignment, adding locks, checking
> pci_dev_is_added(), setting pdev->res_attr[] to NULL after kfree
> (which only prevents a double-free on the teardown path, not the
> error unwind removing the other thread's files). None would address the
> root cause.
>
> This has been reported a few times:
>
> - https://lore.kernel.org/linux-pci/20250702155112.40124-1-heshuan@bytedance.com/
> - https://lore.kernel.org/linux-pci/b51519d6-ce45-4b6d-8135-c70169bd110e@h-partners.com/
> - https://lore.kernel.org/linux-pci/1702093576-30405-1-git-send-email-ssengar@linux.microsoft.com/
> - https://lore.kernel.org/linux-pci/SY0P300MB04687548090B73E40AF97D8897B82@SY0P300MB0468.AUSP300.PROD.OUTLOOK.COM/
> - https://lore.kernel.org/linux-pci/20230105174736.GA1154719@bhelgaas/
> - https://lore.kernel.org/linux-pci/m3eebg9puj.fsf@t19.piap.pl/
> - https://lore.kernel.org/linux-pci/20200716110423.xtfyb3n6tn5ixedh@pali/
> - https://lore.kernel.org/linux-pci/1366196798-15929-1-git-send-email-artem.savkov@gmail.com/
> - https://bugzilla.kernel.org/show_bug.cgi?id=215515
> - https://bugzilla.kernel.org/show_bug.cgi?id=216888
>
> With static attributes the driver model creates sysfs entries once per
> device at device_add() time, under the device lock, eliminating the
> late_initcall iteration and the race along with it.
>
> Krzysztof
>
> ---
> Changes in v7:
> https://lore.kernel.org/linux-pci/20260422161407.118748-1-kwilczynski@kernel.org/
>
> - Added Alex Williamson (author of the resource resize sysfs
> attributes) to the list of recipients for visibility.
> - Split pci_llseek_resource() into pci_llseek_resource() and
> pci_llseek_resource_legacy() since legacy attributes operate
> on a struct pci_bus where to_pci_dev() would be invalid,
> as per Bjorn Helgaas' feedback.
> - Moved each llseek variant inside its respective #ifdef guard
> during the corresponding dynamic-to-static conversion commit,
> dropping the __maybe_unused annotations.
> - Extended the WARN macro removal to also cover __legacy_mmap_fits().
> - Updated commit message of patch 18, so that it correctly mentions
> pci_stop_dev() rather than pci_stop_bus_device().
> - Updated commit message of patch 24 to clarify the indirect
> relationship between ReBAR and the HAVE_PCI_MMAP and/or
> ARCH_GENERIC_PCI_MMAP_RESOURCE guards.
>
> Changes in v6:
> https://lore.kernel.org/linux-pci/20260416180107.777065-1-kwilczynski@kernel.org/
>
> - Fixed commit message for patch 13, removing reference to
> pci_resource_flags() which was no longer changed there.
> - Added a new patch (24) to move the BAR resource resize
> (ReBAR) support behind existing PCI mmap #ifdef guard,
> so that the code is not included on architectures that
> do not support resource resizing (i.e., Alpha, etc.).
>
> Changes in v5:
> https://lore.kernel.org/linux-pci/20260411080148.471335-1-kwilczynski@kernel.org/
>
> - Added new Tested-by, Reviewed-by, and Acked-by tags.
> - Used the existing _io function names in the static macro
> definitions, deferring the rename to the conversion commit
> where it belongs, to avoid a forward reference across
> commits. This was reported by Sashiko, see:
> https://sashiko.dev/#/patchset/20260411080148.471335-1-kwilczynski%40kernel.org?part=6
> - Folded the __resource_resize_store() conversion into the
> main static attributes commit so the resize path is never
> broken between commits. This was reported by Sashiko, see:
> https://sashiko.dev/#/patchset/20260410055040.39233-1-kwilczynski%40kernel.org?part=6
> https://sashiko.dev/#/patchset/20260411080148.471335-1-kwilczynski%40kernel.org?part=7
> - Dropped the unnecessary parentheses cleanup from the Alpha
> BAR index commit, as the line is replaced two commits later
> anyway, as per Ilpo Järvinen's feedback.
> - Squashed the Alpha accessor macro and cleanup commits into
> one, using pci_resource_is_mem() directly instead of the
> intermediate pci_resource_flags() step, as per Ilpo
> Järvinen's feedback.
> - Moved the raw literal conversion in pci_create_legacy_files()
> into the macro definition commit, so the macros and their
> usage are introduced together, as per Ilpo Järvinen's
> feedback.
> - Removed unnecessary backslash line continuation from the
> ternary in pci_mmap_legacy_page_range().
> - Kept pci_resource_len() for visibility checks instead of
> resource_assigned(). The static is_visible() callback
> runs at device_add() time during the PCI enumeration,
> before the pci_assign_unassigned_bus_resources() populates
> res->parent, as such, resource_assigned() returned false
> for every BAR, hiding all resource files. This is related
> to review feedback from Ilpo Järvinen.
>
> Changes in v4:
> https://lore.kernel.org/linux-pci/20260410055040.39233-1-kwilczynski@kernel.org/
>
> - Added new Reviewed-by tags.
> - Added pci_resource_is_io() and pci_resource_is_mem() helpers
> for resource type checks, replacing the open-coded bitwise
> flag tests in pci_mmap_resource(), pci_resource_io(), and
> Alpha's pci_mmap_resource(), as per Ilpo Järvinen's
> suggestion.
> - Split the __pci_mmap_fits() cleanup into two patches. An
> overflow fix for zero-length BARs, which now includes a
> Fixes: tag referencing the original Alpha PCI sysfs commit,
> and the WARN macro removal is a separate cleanup as per Ilpo
> Järvinen's suggestion.
> - Added a missing Fixes: tag to the Alpha lockdown check,
> referencing the commit that added the check to the generic
> path but missed Alpha's implementation.
> - Added PCI_LEGACY_IO_SIZE and PCI_LEGACY_MEM_SIZE macros to
> replace the raw literals used for legacy address space sizes.
> These are used in both Alpha's pci_mmap_legacy_page_range()
> and the static legacy attribute definitions, as per Ilpo
> Järvinen's suggestion.
> - Replaced sysfs_update_groups() in the BAR resize path with
> sysfs_remove_groups() before the resize and sysfs_create_groups()
> after, restoring the original teardown before BAR resize
> ordering. This was reported by Sashiko, see:
> https://sashiko.dev/#/patchset/20260410055040.39233-1-kwilczynski%40kernel.org?part=7
> - Defined pci_dev_resource_attr_groups as a NULL macro when
> HAVE_PCI_MMAP and ARCH_GENERIC_PCI_MMAP_RESOURCE are both
> absent, so the resize path compiles unconditionally without
> #ifdef guards in the function body. This was reported by
> Sashiko, see:
> https://sashiko.dev/#/patchset/20260410055040.39233-1-kwilczynski%40kernel.org?part=7
> - Moved the pci_legacy_has_sparse() prototype into the patch
> that introduces the function, alongside the existing
> pci_adjust_legacy_attr() declaration, to fix a bisection
> issue where Alpha would warn on -Wmissing-prototypes.
> This was reported by Sashiko, see:
> https://sashiko.dev/#/patchset/20260410055040.39233-1-kwilczynski%40kernel.org?part=18
>
> Changes in v3:
> https://lore.kernel.org/linux-pci/20210910202623.2293708-1-kw@linux.com/
>
> - Updated for modern kernel releases and expanded scope. The
> v2 only covered the generic resource files. This version
> also converts Alpha's sparse/dense resource files and the
> legacy bus attributes, removing pci_sysfs_init() entirely.
> - Split the single macro definition into three distinct ones
> (per I/O, UC, and WC), to make sure that each carries only
> the callbacks its resource type needs.
> - Updated to use the new .bin_size callback, as the attributes
> are const, to replace using a->size directly, which was not
> ideal. This required changes to pci_llseek_resource(), to
> ensure that it would work for device and bus-level attributes.
> - Updated the __resource_resize_store() to include CAP_SYS_ADMIN
> capabilities check.
> - Added the security_locked_down() check to Alpha's
> pci_mmap_resource(), to align with other architectures.
>
> Changes in v2:
> https://lore.kernel.org/linux-pci/20210825212255.878043-1-kw@linux.com/
>
> - Refactored code so that the macros, helpers and internal
> functions can be used to correctly leverage the read(),
> write() and mmap() callbacks rather than to use the
> .is_bin_visible() callback to set up sysfs objects
> internals as this is not supported.
> - Refactored some if-statements to check for a resource
> flag first, and then call either arch_can_pci_mmap_io()
> or arch_can_pci_mmap_wc(), plus store result of testing
> for IORESOURCE_MEM and IORESOURCE_PREFETCH flags into
> a boolean variable, as per Bjorn Helgaas' suggestion.
> - Renamed pci_read_resource_io() and pci_write_resource_io()
> callbacks so that these are not specifically tied to I/O
> BARs read() and write() operations also as per Bjorn
> Helgaas' suggestion.
> - Updated style for code handling bitwise operations to
> match the style that is preferred as per Bjorn Helgaas'
> suggestion.
> - Updated commit messages adding more details about the
> implementation as requested by Bjorn Helgaas.
>
> Krzysztof Wilczyński (24):
> PCI/sysfs: Use PCI resource accessor macros
> PCI: Add pci_resource_is_io() and pci_resource_is_mem() helpers
> PCI/sysfs: Only allow supported resource types in I/O and MMIO helpers
> PCI/sysfs: Split pci_llseek_resource() for device and legacy
> attributes
> PCI/sysfs: Add CAP_SYS_ADMIN check to __resource_resize_store()
> PCI/sysfs: Add static PCI resource attribute macros
> PCI/sysfs: Convert PCI resource files to static attributes
> PCI/sysfs: Warn about BAR resize failure in __resource_resize_store()
> PCI/sysfs: Add stubs for pci_{create,remove}_sysfs_dev_files()
> PCI/sysfs: Limit pci_sysfs_init() late_initcall compile scope
> alpha/PCI: Add security_locked_down() check to pci_mmap_resource()
> alpha/PCI: Use BAR index in sysfs attr->private instead of resource
> pointer
> alpha/PCI: Use PCI resource accessor macros
> alpha/PCI: Fix __pci_mmap_fits() overflow for zero-length BARs
> alpha/PCI: Remove WARN from __pci_mmap_fits() and __legacy_mmap_fits()
> alpha/PCI: Add static PCI resource attribute macros
> alpha/PCI: Convert resource files to static attributes
> PCI/sysfs: Remove pci_{create,remove}_sysfs_dev_files()
> PCI: Add macros for legacy I/O and memory address space sizes
> alpha/PCI: Compute legacy size in pci_mmap_legacy_page_range()
> PCI/sysfs: Add __weak pci_legacy_has_sparse() helper
> PCI/sysfs: Convert legacy I/O and memory attributes to static
> definitions
> PCI/sysfs: Remove pci_create_legacy_files() and pci_sysfs_init()
> PCI/sysfs: Limit BAR resize attribute scope to platforms with PCI mmap
>
> arch/alpha/include/asm/pci.h | 13 +-
> arch/alpha/kernel/pci-sysfs.c | 385 +++++++++++----------
> arch/powerpc/include/asm/pci.h | 2 -
> drivers/pci/bus.c | 1 -
> drivers/pci/pci-sysfs.c | 592 +++++++++++++++++++--------------
> drivers/pci/pci.h | 16 +-
> drivers/pci/probe.c | 6 -
> drivers/pci/remove.c | 3 -
> include/linux/pci.h | 39 ++-
> 9 files changed, 589 insertions(+), 468 deletions(-)
Updated pci/sysfs to this series, thank you very much! Looks great!
^ permalink raw reply
* Re: [PATCH 4/5] ibmvfc: use async sub-queue for FPIN messages
From: Tyrel Datwyler @ 2026-05-08 17:25 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <87ecjmetcm.fsf@linux.ibm.com>
On 5/7/26 3:40 PM, Dave Marquardt wrote:
> Tyrel Datwyler <tyreld@linux.ibm.com> writes:
>
>> On 4/8/26 10:07 AM, Dave Marquardt via B4 Relay wrote:
>>> From: Dave Marquardt <davemarq@linux.ibm.com>
>>>
<snip>
>>> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
>>> index 803fc3caa14d..26e39b367022 100644
>>> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
>>> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
>>> @@ -1471,6 +1471,13 @@ static void ibmvfc_gather_partition_info(struct ibmvfc_host *vhost)
>>> of_node_put(rootdn);
>>> }.
>>>
>>> +static __be64 ibmvfc_npiv_chan_caps[] = {
>>> + cpu_to_be64(IBMVFC_CAN_USE_CHANNELS | IBMVFC_USE_ASYNC_SUBQ |
>>> + IBMVFC_YES_SCSI | IBMVFC_CAN_HANDLE_FPIN),
>>> + cpu_to_be64(IBMVFC_CAN_USE_CHANNELS),
>>> +};
>>> +#define IBMVFC_NPIV_CHAN_CAPS_SIZE (sizeof(ibmvfc_npiv_chan_caps)/sizeof(__be64))
>>> +
>>
>> I really don't understand what you are doing here? You seem to be defining
>> various sets of capabilities, but how does the driver decide which set to use?
>> As far as I can tell the index is increased and the capabilities decrease each
>> time a transport event is received. This looks like maybe it's just a testing hack.
>
> My thought was to deal with an older VIOS that doesn't support the async
> sub-queue and full FPIN. But I suppose the response should just not set the
> appropriate bits. I'll go re-read the NPIV spec and figure out if this
> is actually needed.
>
Exactly, we should blanket set everything we support, and then rely on what the
VIOS advertises as support to then setup the proper facilities on our end.
-Tyrel
^ permalink raw reply
* Re: [PATCH v2] powerpc/pseries/iommu: export DMA window data to user space
From: Harsh Prateek Bora @ 2026-05-08 17:04 UTC (permalink / raw)
To: Gaurav Batra, maddy; +Cc: linuxppc-dev, sbhat, vaibhav, ritesh.list, Brian King
In-Reply-To: <20260507180646.40356-1-gbatra@linux.ibm.com>
Hi Gaurav,
On 07/05/26 11:36 pm, Gaurav Batra wrote:
> Export PowerPC DMA window information (both default 2GB and Dynamic
> larger window) to user space via sysfs. Each of these DMA windows has
> attributes like size of the window, page size backing the window, mode,
> etc. Each of these attributes is exported for user space consumption as a
> file.
>
> PowerPC Host Bridge (PHB) can have multiple devices/functions sharing
> the same DMA window. For each PHB, iommu registration creates an iommu
> device under "/sys/devices/virtual/iommu".
>
> These devices will have 2 groups created to export Default and DDW
> attributes.
>
> Reviewed-by: Brian King <brking@linux.ibm.com>
> Reviewed-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> Reviewed-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
I do not see R-b tags provided on the list after review comments.
Not sure if I am missing the email or were these provided privately ?
Sharing some review comments inline below ..
> Signed-off-by: Gaurav Batra <gbatra@linux.ibm.com>
> ---
> V1 -> V2 change log:
>
> 1. Shiva: "weight" the it_map for the bitmap. This avoids using an extra
> counter in the table. Please look into how iommu_debugfs_weight_get()
> does this
>
> Response: Incorporated changes
>
> 2. Vaibhav: If the DMA window is not available, show function should just
> return ENOENT so that userspace knows the error instantly instead of
> having to parse the sysfs contents.
>
> Response: Incorporated changes, returning ENODATA
>
> 3. Vaibhav: All the show functions have similar template. Please convert
> them to macros expansion to reduce code volume.
>
> Response: Incorporated changes
>
> 4. Vaibhav: These new attributes are PSeries specific but they are being
> setup in ppc generic iommu code at arch/powerpc/kernel/iommu.c. Can
> you move these attributes to arch/powerpc/platforms/pseries/iommu.c
>
> Response: I have split the attributes and moved them to pseries specific
> files. The original group "spapr-tce-iommu", is moved to PowerNV code
> base to retain the legacy functionality.
>
> I tested the changes both on Pseries and PowerNV.
>
> 5. Vaibhav: It would be better to use function iommu_table_inuse_tces() as
> a callback in iommu_table_ops which can be implemented by pseries and
> powernv code differently.
>
> Response: the function is no longer needed after changes in #1
>
> 6. Vaibhav: Since sysfs is ABI can you propose appropriate entries under
> Documentation/ABI/testing
>
> Response: Added documentation
>
> ...sfs-devices-virtual-iommu-dma_window_attrs | 21 ++
> .../arch/powerpc/dma_window_attributes.rst | 65 +++++
> arch/powerpc/include/asm/pci-bridge.h | 4 +
> arch/powerpc/kernel/iommu.c | 16 +-
> arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++
> arch/powerpc/platforms/pseries/iommu.c | 261 ++++++++++++++++++
> arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +
> arch/powerpc/platforms/pseries/pseries.h | 1 +
> arch/powerpc/platforms/pseries/setup.c | 2 +
> 9 files changed, 373 insertions(+), 15 deletions(-)
> create mode 100644 Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
> create mode 100644 Documentation/arch/powerpc/dma_window_attributes.rst
>
> diff --git a/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
> new file mode 100644
> index 000000000000..18ba63874276
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-devices-virtual-iommu-dma_window_attrs
> @@ -0,0 +1,21 @@
> +What: /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-ddw/*
> +Date: Oct 2025
> +Contact: linuxppc-dev@lists.ozlabs.org
> +Description: read only
> + For each IOMMU isolation unit spapr-tce-ddw sub-directory provides
> + attributes to query information related to the bigger Dynamic DMA
> + window (DDW) in the PowerPC virtualized platforms.
> +
> + See Documentation/arch/powerpc/dma_window_attributes.rst for more
> + information.
> +
> +What: /sys/devices/virtual/iommu/<iommu-isolation>/spapr-tce-dma/*
> +Date: Oct 2025
> +Contact: linuxppc-dev@lists.ozlabs.org
> +Description: read only
> + For each IOMMU isolation unit spapr-tce-dma sub-directory provides
> + attributes to query information related to the default 2GB DMA
> + window in the PowerPC virtualized platforms.
> +
> + See Documentation/arch/powerpc/dma_window_attributes.rst for more
> + information.
> diff --git a/Documentation/arch/powerpc/dma_window_attributes.rst b/Documentation/arch/powerpc/dma_window_attributes.rst
> new file mode 100644
> index 000000000000..8bd9aec8539d
> --- /dev/null
> +++ b/Documentation/arch/powerpc/dma_window_attributes.rst
> @@ -0,0 +1,65 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================
> +DMA Window Attributes
> +=====================
> +
> +In PowerPC architecture there are 2 types of DMA windows -
> +
> +1. Default 2GB DMA window which is backed by 4K page size
> +2. A bigger Dynamic DMA Window (DDW) which is backed by larger page size
> + (64K or 2MB)
> +
> +A dedicated device will have both the DMA windows instantiated but an SR-IOV
> +device will only have the bigger Dynamic DMA Window.
> +
> +The attributes of these 2 DMA windows are exported to user space via sysfs.
> +Each IOMMU isolation unit will have its directory created under
> +/sys/devices/virtual/iommu.
> +
> +As an exapmple, iommu-phb0001
s/exapmple/example ?
> +
> +Under each IOMMU isolation unit, there will be a group of attributes for
> +"Default 2GB DMA Window" and "Dynamic DMA Window" - spapr-tce-dma and
> +spapr-tce-ddw respectively.
> +
> +Attributes under each group
> +
> +spapr-tce-ddw:
> +direct_address dynamic_address dynamic_size window_type
> +direct_size dynamic_pages_mapped page_size
> +
> +spapr-tce-dma:
> +dynamic_address dynamic_pages_mapped dynamic_size page_size
> +
> +
> +The bigger Dynamic DMA Window is configured into pre-mapped and/or dynamically
> +allocated TCEs. If the DDW is in "Hybrid" mode, then both the Direct
> +(pre-mapped) and Dynamic part of the DMA window will have valid values. Hybrid
> +mode is valid only for SR-IOV devices.
> +
> +DMA Window properties:
> +
> +direct_address Starting address of the pre-mapped DMA window
> +direct_size Size of the pre-mapped DMA Window
> +dynamic_address Starting address of the dynamic allocations
> +dynamic_size Size of the dynamic allocation window
> +dynamic_pages_mapped Pages mapped for DMA by dynamic allocations
> +page_size Page size backing the DMA window
> +window_type Type of the DMA Window (Direct/Dynamic/Hybrid)
> +
> +
> +An example of DDW attributes for an SR-IOV device::
> +
> + $ cd /sys/devices/virtual/iommu/iommu-phb0001/spapr-tce-ddw
> +
> + $ grep . *
> +
> + direct_address:0x800000000000000 <-- Starting addr of pre-mapped Window
> + direct_size:137438953472 <-- Size of pre-mapped Window (128GB)
> + dynamic_address:0x800002000000000 <-- Starting addr of Dynamic allocations
> + dynamic_size:412316860416 <-- Size of dynamic allocation window (384GB)
> + dynamic_pages_mapped:270 <-- Pages mapped by dynamic allocations
> + page_size:2097152 <-- DMA window page size (2MB)
> + window_type:Hybrid <-- window has both pre-mapped and
> + dynamic sections
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 1dae53130782..9b09178aca5e 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -124,6 +124,10 @@ struct pci_controller {
> resource_size_t dma_window_base_cur;
> resource_size_t dma_window_size;
>
> +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
> + const struct attribute_group **iommu_groups;
> +#endif
> +
> #ifdef CONFIG_PPC64
> unsigned long buid;
> struct pci_dn *pci_data;
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0ce71310b7d9..d6242e3f77da 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1269,24 +1269,10 @@ static const struct iommu_ops spapr_tce_iommu_ops = {
> .device_group = spapr_tce_iommu_device_group,
> };
>
> -static struct attribute *spapr_tce_iommu_attrs[] = {
> - NULL,
> -};
> -
> -static struct attribute_group spapr_tce_iommu_group = {
> - .name = "spapr-tce-iommu",
> - .attrs = spapr_tce_iommu_attrs,
> -};
> -
> -static const struct attribute_group *spapr_tce_iommu_groups[] = {
> - &spapr_tce_iommu_group,
> - NULL,
> -};
> -
> void ppc_iommu_register_device(struct pci_controller *phb)
> {
> iommu_device_sysfs_add(&phb->iommu, phb->parent,
> - spapr_tce_iommu_groups, "iommu-phb%04x",
> + phb->iommu_groups, "iommu-phb%04x",
> phb->global_number);
> iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops,
> phb->parent);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 1c78fdfb7b03..0887f154955e 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2493,6 +2493,20 @@ static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
> .shutdown = pnv_pci_ioda_shutdown,
> };
>
> +static struct attribute *pnv_tce_iommu_attrs[] = {
> + NULL,
> +};
> +
> +static struct attribute_group pnv_tce_iommu_group = {
> + .name = "spapr-tce-iommu",
> + .attrs = pnv_tce_iommu_attrs,
> +};
> +
> +static const struct attribute_group *pnv_tce_iommu_groups[] = {
> + &pnv_tce_iommu_group,
> + NULL,
> +};
> +
> static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> u64 hub_id, int ioda_type)
> {
> @@ -2697,6 +2711,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
> hose->controller_ops = pnv_pci_ioda_controller_ops;
> }
>
> + hose->iommu_groups = pnv_tce_iommu_groups;
> +
> ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
>
> #ifdef CONFIG_PCI_IOV
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 5497b130e026..28be7a45761d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -56,6 +56,20 @@ enum {
> DDW_EXT_LIMITED_ADDR_MODE = 3
> };
>
> +/* used by sysfs when querying Dynamic/Default DMA Window data */
> +struct dma_win_data {
> + u32 page_size;
> + u64 direct_address;
> + u64 direct_size;
> + u64 dynamic_address;
> + u64 dynamic_size;
> + u32 dynamic_pages_mapped;
> + char window_type[15];
> +};
> +
> +#define SPAPR_SUCCESS 0
> +#define SPAPR_ERROR -1
> +
> static struct iommu_table *iommu_pseries_alloc_table(int node)
> {
> struct iommu_table *tbl;
> @@ -837,6 +851,253 @@ static struct device_node *pci_dma_find(struct device_node *dn,
> return rdn;
> }
>
> +/* Get DDW information for the device */
> +static int gather_ddw_info(struct device *dev, struct dma_win_data *data)
> +{
> + struct iommu_device *iommu;
> + struct pci_controller *phb;
> + struct device_node *dn;
> + struct pci_dn *pci;
> + const __be32 *prop = NULL;
> + bool ddw_direct = false;
> + bool found = false;
> + struct iommu_table *tbl;
> + u32 pgshift;
> + struct dynamic_dma_window_prop *p;
> +
> + memset(data, 0, sizeof(*data));
> +
> + iommu = dev_get_drvdata(dev);
> + phb = container_of(iommu, struct pci_controller, iommu);
> + dn = phb->dn;
> +
> + if (!dn)
> + return SPAPR_ERROR;
> +
> + pci = PCI_DN(dn);
> + if (!pci || !pci->table_group)
> + return SPAPR_ERROR;
> +
Should we also hold a dn ref with of_node_get(dn) before proceeding with
of_get_property calls ?
> + /* Find DDW */
> + prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
> + if (prop) {
> + ddw_direct = true;
> + found = true;
> + } else {
> + prop = of_get_property(dn, DMA64_PROPNAME, NULL);
> + if (prop)
> + found = true;
> + }
> +
> + /* NO DDW */
> + if (!found)
.. then release dn ref here if not found ..
> + return SPAPR_ERROR;
> +
> + p = (struct dynamic_dma_window_prop *)prop;
> +
> + pgshift = be32_to_cpu(p->tce_shift);
> + if (pgshift != 0xc && pgshift != 0x10 && pgshift != 0x15)
Can we have macros for 0xc, 0x10 and 0x15 respectively ?
> + data->page_size = 0;
> + else
> + data->page_size = 1 << pgshift;
> +
> + /* Check if DDW has table associated with it. Having a table associated with
> + * DDW is indicative that is has some dynamic TCE allocations. In this case the
> + * DDW can be fully Dynamic or in Hybrid mode. For SR-IOV DDW is on index 0,
> + * for dedicated adapter on index 1.
> + */
> + found = false;
> + for (int i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> + tbl = pci->table_group->tables[i];
Can another thread do a kfree(table_group) via
iommu_pseries_free_group() during hotplug remove before we reach here?
> +
> + if (tbl && tbl->it_index == be32_to_cpu(p->liobn)) {
> + found = true;
> + break;
> + }
> + }
Is it possible that another thread changes the bitmap before we reach
bitmap_weight below? If the table is found, we may want to safely access
its bitmap (consider using tbl->largepool.lock?).
> +
> + /* set the parameters depnding on the DDW type */
s/depnding/depending ?
> + if (ddw_direct && found) { /* Hybrid */
> + data->direct_address = be64_to_cpu(p->dma_base);
> + data->dynamic_size = (u64)(tbl->it_size << tbl->it_page_shift);
> +
> + data->dynamic_address = data->direct_address
> + + (u64)(1UL << be32_to_cpu(p->window_shift))
> + - data->dynamic_size;
> +
> + data->direct_size = data->dynamic_address - data->direct_address;
> + data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> + sprintf(data->window_type, "%s", "Hybrid");
Preferably use snprintf for safety. I see two more instances below.
> + } else if (ddw_direct && !found) { /* Direct */
> + data->direct_address = be64_to_cpu(p->dma_base);
> + data->direct_size = (u64)(1UL << be32_to_cpu(p->window_shift));
> +
> + sprintf(data->window_type, "%s", "Direct");
> + } else { /* Dynamic */
> + data->dynamic_address = be64_to_cpu(p->dma_base);
> + data->dynamic_size = (u64)(1UL << be32_to_cpu(p->window_shift));
> + data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> + sprintf(data->window_type, "%s", "Dynamic");
> + }
> +
.. release dn ref with of_node_put() before returning.
Similarly applicable for gather_dma_info() also.
> + return SPAPR_SUCCESS;
> +}
> +
> +/* Get DDW information for the device */
> +static int gather_dma_info(struct device *dev, struct dma_win_data *data)
> +{
> + struct iommu_device *iommu;
> + struct pci_controller *phb;
> + struct device_node *dn;
> + struct pci_dn *pci;
> + const __be32 *prop = NULL;
> + struct iommu_table *tbl;
> + unsigned long offset, size, liobn;
> +
> + memset(data, 0, sizeof(*data));
> +
> + iommu = dev_get_drvdata(dev);
> + phb = container_of(iommu, struct pci_controller, iommu);
> + dn = phb->dn;
> +
> + if (!dn)
> + return SPAPR_ERROR;
> +
> + pci = PCI_DN(dn);
> + if (!pci || !pci->table_group)
> + return SPAPR_ERROR;
> +
> + /* search for default DMA window */
> + prop = of_get_property(dn, "ibm,dma-window", NULL);
> +
> + if (!prop)
> + return SPAPR_ERROR;
> +
> + /* default DMA Window is always at index 0 */
> + tbl = pci->table_group->tables[0];
> + if (!tbl)
> + return SPAPR_ERROR;
> +
> + of_parse_dma_window(dn, prop, &liobn, &offset, &size);
> +
> + data->dynamic_address = offset;
> + data->dynamic_size = size;
> + data->page_size = 1ULL << IOMMU_PAGE_SHIFT_4K;
> + data->dynamic_pages_mapped = bitmap_weight(tbl->it_map, tbl->it_size);
> +
> + return SPAPR_SUCCESS;
> +}
> +
> +#define DEVICE_SHOW_DDW(_name, _fmt) \
> +ssize_t ddw_##_name##_show(struct device *dev, \
> + struct device_attribute *attr,\
> + char *buf) \
> +{ \
> + int rc = 0; \
> + struct dma_win_data data; \
> + \
> + rc = gather_ddw_info(dev, &data); \
> + \
> + if (rc == SPAPR_SUCCESS) \
> + return sysfs_emit(buf, _fmt, data._name); \
> + else \
> + return -ENODATA; \
> +} \
> +
> +#define DEVICE_SHOW_DMA(_name, _fmt) \
> +ssize_t dma_##_name##_show(struct device *dev, \
> + struct device_attribute *attr,\
> + char *buf) \
> +{ \
> + int rc = 0; \
> + struct dma_win_data data; \
> + \
> + rc = gather_dma_info(dev, &data); \
> + \
> + if (rc == SPAPR_SUCCESS) \
> + return sysfs_emit(buf, _fmt, data._name); \
> + else \
> + return -ENODATA; \
> +} \
> +
> +static DEVICE_SHOW_DDW(direct_address, "%#llx\n");
> +static DEVICE_SHOW_DDW(direct_size, "%lld\n");
> +static DEVICE_SHOW_DDW(page_size, "%d\n");
> +static DEVICE_SHOW_DDW(window_type, "%s\n");
> +static DEVICE_SHOW_DDW(dynamic_address, "%#llx\n");
> +static DEVICE_SHOW_DDW(dynamic_size, "%lld\n");
> +static DEVICE_SHOW_DDW(dynamic_pages_mapped, "%d\n");
> +static DEVICE_SHOW_DMA(dynamic_address, "%#llx\n");
> +static DEVICE_SHOW_DMA(dynamic_size, "%lld\n");
> +static DEVICE_SHOW_DMA(page_size, "%d\n");
> +static DEVICE_SHOW_DMA(dynamic_pages_mapped, "%d\n");
> +
> +#define DEVICE_ATTR_DDW(_name) \
> + struct device_attribute dev_attr_ddw_##_name = \
> + __ATTR(_name, 0444, ddw_##_name##_show, NULL)
> +#define DEVICE_ATTR_DMA(_name) \
> + struct device_attribute dev_attr_dma_##_name = \
> + __ATTR(_name, 0444, dma_##_name##_show, NULL)
> +
> +static DEVICE_ATTR_DDW(direct_address);
> +static DEVICE_ATTR_DDW(direct_size);
> +static DEVICE_ATTR_DDW(page_size);
> +static DEVICE_ATTR_DDW(window_type);
> +static DEVICE_ATTR_DDW(dynamic_address);
> +static DEVICE_ATTR_DDW(dynamic_size);
> +static DEVICE_ATTR_DDW(dynamic_pages_mapped);
> +static DEVICE_ATTR_DMA(dynamic_address);
> +static DEVICE_ATTR_DMA(dynamic_size);
> +static DEVICE_ATTR_DMA(page_size);
> +static DEVICE_ATTR_DMA(dynamic_pages_mapped);
> +
> +static struct attribute *spapr_tce_ddw_attrs[] = {
> + &dev_attr_ddw_direct_address.attr,
> + &dev_attr_ddw_direct_size.attr,
> + &dev_attr_ddw_page_size.attr,
> + &dev_attr_ddw_window_type.attr,
> + &dev_attr_ddw_dynamic_address.attr,
> + &dev_attr_ddw_dynamic_size.attr,
> + &dev_attr_ddw_dynamic_pages_mapped.attr,
> + NULL,
> +};
> +
> +static struct attribute *spapr_tce_dma_attrs[] = {
> + &dev_attr_dma_dynamic_address.attr,
> + &dev_attr_dma_dynamic_size.attr,
> + &dev_attr_dma_page_size.attr,
> + &dev_attr_dma_dynamic_pages_mapped.attr,
> + NULL,
> +};
> +
> +static struct attribute_group spapr_tce_ddw_group = {
> + .name = "spapr-tce-ddw",
> + .attrs = spapr_tce_ddw_attrs,
> +};
> +
> +static struct attribute_group spapr_tce_dma_group = {
> + .name = "spapr-tce-dma",
> + .attrs = spapr_tce_dma_attrs,
> +};
> +
> +static struct attribute *spapr_tce_iommu_attrs[] = {
> + NULL,
> +};
> +
> +static struct attribute_group spapr_tce_iommu_group = {
> + .name = "spapr-tce-iommu",
> + .attrs = spapr_tce_iommu_attrs,
> +};
> +
> +const struct attribute_group *spapr_tce_iommu_groups[] = {
> + &spapr_tce_iommu_group,
> + &spapr_tce_ddw_group,
> + &spapr_tce_dma_group,
> + NULL,
> +};
> +
> static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> {
> struct iommu_table *tbl;
> diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
> index 8c77ec7980de..b457451a2814 100644
> --- a/arch/powerpc/platforms/pseries/pci_dlpar.c
> +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
> @@ -45,6 +45,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
> pci_process_bridge_OF_ranges(phb, dn, 0);
> phb->controller_ops = pseries_pci_controller_ops;
>
> + phb->iommu_groups = spapr_tce_iommu_groups;
> +
> pci_devs_phb_init_dynamic(phb);
>
> pseries_msi_allocate_domains(phb);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 3968a6970fa8..4cf0b7a4e96a 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -128,4 +128,5 @@ struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose,
> struct pci_dev *pdev);
> #endif
>
> +extern const struct attribute_group *spapr_tce_iommu_groups[];
> #endif /* _PSERIES_PSERIES_H */
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 50b26ed8432d..4d877aae0560 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -512,6 +512,8 @@ static void __init pSeries_discover_phbs(void)
> isa_bridge_find_early(phb);
> phb->controller_ops = pseries_pci_controller_ops;
>
> + phb->iommu_groups = spapr_tce_iommu_groups;
> +
> /* create pci_dn's for DT nodes under this PHB */
> pci_devs_phb_init_dynamic(phb);
>
> base-commit: 192c0159402e6bfbe13de6f8379546943297783d
^ permalink raw reply
* Re: [PATCH 5/5] ibmvfc: handle extended FPIN events
From: Dave Marquardt @ 2026-05-08 14:38 UTC (permalink / raw)
To: Tyrel Datwyler
Cc: James E.J. Bottomley, Martin K. Petersen, Madhavan Srinivasan,
Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
linux-kernel, linux-scsi, linuxppc-dev, Brian King, Greg Joyce,
Kyle Mahlkuch
In-Reply-To: <8ac414a6-b4e9-4fd9-b316-3738b3229664@linux.ibm.com>
Tyrel Datwyler <tyreld@linux.ibm.com> writes:
> On 4/8/26 10:07 AM, Dave Marquardt via B4 Relay wrote:
>> From: Dave Marquardt <davemarq@linux.ibm.com>
>>
>> - negotiate use of extended FPIN events with NPIV (VIOS)
>> - add code to parse and handle extended FPIN events
>> - add KUnit test to test extended FPIN event handling
>
> Same nit here as the previous 4 patches.
>
>> ---
>> drivers/scsi/ibmvscsi/ibmvfc.c | 45 ++++++++++++++---
>> drivers/scsi/ibmvscsi/ibmvfc.h | 31 ++++++++++++
>> drivers/scsi/ibmvscsi/ibmvfc_kunit.c | 97 +++++++++++++++++++++++++++++++++---
>> 3 files changed, 161 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
>> index 26e39b367022..5b2b861a34c2 100644
>> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
>> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
>> @@ -1472,6 +1472,9 @@ static void ibmvfc_gather_partition_info(struct ibmvfc_host *vhost)
>> }
>>
>> static __be64 ibmvfc_npiv_chan_caps[] = {
>> + cpu_to_be64(IBMVFC_CAN_USE_CHANNELS | IBMVFC_USE_ASYNC_SUBQ |
>> + IBMVFC_YES_SCSI | IBMVFC_CAN_HANDLE_FPIN |
>> + IBMVFC_CAN_HANDLE_FPIN_EXT),
>> cpu_to_be64(IBMVFC_CAN_USE_CHANNELS | IBMVFC_USE_ASYNC_SUBQ |
>> IBMVFC_YES_SCSI | IBMVFC_CAN_HANDLE_FPIN),
>> cpu_to_be64(IBMVFC_CAN_USE_CHANNELS),
>> @@ -3370,6 +3373,28 @@ ibmvfc_full_fpin_to_desc(struct ibmvfc_async_subq *ibmvfc_fpin)
>> cpu_to_be32(1));
>> }
>>
>> +/**
>> + * ibmvfc_ext_fpin_to_desc(): allocate and populate a struct fc_els_fpin struct
>> + * containing a descriptor.
>> + * @ibmvfc_fpin: Pointer to async subq FPIN data
>> + *
>> + * Allocate a struct fc_els_fpin containing a descriptor and populate
>> + * based on data from *ibmvfc_fpin.
>> + *
>> + * Return:
>> + * NULL - unable to allocate structure
>> + * non-NULL - pointer to populated struct fc_els_fpin
>> + */
>> +static struct fc_els_fpin *
>> +ibmvfc_ext_fpin_to_desc(struct ibmvfc_async_subq_fpin *ibmvfc_fpin)
>> +{
>> + return ibmvfc_common_fpin_to_desc(ibmvfc_fpin->fpin_status, ibmvfc_fpin->wwpn,
>> + ibmvfc_fpin->fpin_data.event_type_modifier,
>> + ibmvfc_fpin->fpin_data.event_threshold,
>> + ibmvfc_fpin->fpin_data.event_threshold,
>
> I see mention of threshold and period previously. Why in this case is it just
> the threshold value passed for both?
I'll look into this. There's no obvious period here in ibmvfc_fpin or
ibmvfc_fpin->fpin_data. It may be more appropriate to use a default
period.
-Dave
^ permalink raw reply
* [PATCH 15/15] sched/cputime: Handle dyntick-idle steal time correctly
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
The dyntick-idle steal time is currently accounted when the tick
restarts but the stolen idle time is not subtracted from the idle time
that was already accounted. This is to avoid observing the idle time
going backward as the dyntick-idle cputime accessors can't reliably know
in advance the stolen idle time.
In order to maintain a forward progressing idle cputime while
subtracting idle steal time from it, keep track of the previously
accounted idle stolen time and subtract it from _later_ idle cputime
accounting.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
include/linux/kernel_stat.h | 1 +
kernel/sched/cputime.c | 28 +++++++++++++++++++++++-----
2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 512104b0ff49..fce1392e2140 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -39,6 +39,7 @@ struct kernel_cpustat {
bool idle_elapse;
seqcount_t idle_sleeptime_seq;
u64 idle_entrytime;
+ u64 idle_stealtime[2];
#endif
u64 cpustat[NR_STATS];
};
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 94be22aa5cb6..244b57417240 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -425,19 +425,32 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now)
{
u64 *cpustat = kc->cpustat;
- u64 delta;
+ u64 delta, steal, steal_delta;
+ int iowait;
if (!kc->idle_elapse)
return;
+ iowait = nr_iowait_cpu(smp_processor_id()) > 0;
delta = now - kc->idle_entrytime;
+ steal = steal_account_process_time(delta);
+ /*
+ * Record the idle time after subtracting the steal time from
+ * previous update sequence. Don't subtract the steal time from
+ * the current update sequence to avoid readers moving backward.
+ */
write_seqcount_begin(&kc->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
+ steal_delta = min_t(u64, kc->idle_stealtime[iowait], delta);
+ delta -= steal_delta;
+ kc->idle_stealtime[iowait] -= steal_delta;
+
+ if (iowait)
cpustat[CPUTIME_IOWAIT] += delta;
else
cpustat[CPUTIME_IDLE] += delta;
+ kc->idle_stealtime[iowait] += steal;
kc->idle_entrytime = now;
kc->idle_elapse = false;
write_seqcount_end(&kc->idle_sleeptime_seq);
@@ -464,7 +477,6 @@ void kcpustat_dyntick_stop(u64 now)
kcpustat_idle_stop(kc, now);
kc->idle_dyntick = false;
vtime_dyntick_stop();
- steal_account_process_time(ULONG_MAX);
}
}
@@ -508,6 +520,7 @@ static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
bool compute_delta, u64 now)
{
struct kernel_cpustat *kc = &kcpustat_cpu(cpu);
+ int iowait = idx == CPUTIME_IOWAIT;
u64 *cpustat = kc->cpustat;
unsigned int seq;
u64 idle;
@@ -516,8 +529,13 @@ static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
seq = read_seqcount_begin(&kc->idle_sleeptime_seq);
idle = cpustat[idx];
- if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime)
- idle += (now - kc->idle_entrytime);
+
+ if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime) {
+ u64 delta = now - kc->idle_entrytime;
+
+ delta -= min_t(u64, kc->idle_stealtime[iowait], delta);
+ idle += delta;
+ }
} while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq));
return idle;
--
2.53.0
^ permalink raw reply related
* [PATCH 14/15] sched/cputime: Handle idle irqtime gracefully
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
The dyntick-idle cputime accounting always assumes that IRQ time
accounting is enabled and consequently stops elapsing the idle time
during dyntick-idle IRQs.
This doesn't mix well with disabled IRQ time accounting because then
idle IRQs become a cputime blind-spot. Also this feature is disabled
on most configurations and the overhead of pausing dyntick-idle
accounting while in idle IRQs could then be avoided.
Fix the situation by conditionally pausing dyntick-idle accounting
during idle IRQs only if either native vtime (which does IRQ time
accounting) or generic IRQ time accounting is enabled.
Also make sure that the accumulated IRQ time is not accidentally
subtracted from later accounting.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
kernel/sched/cputime.c | 20 +++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 335d2c127763..94be22aa5cb6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -47,7 +47,8 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
- irqtime->tick_delta += delta;
+ if (!kcpustat_idle_dyntick())
+ irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}
@@ -444,6 +445,10 @@ static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now)
static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now)
{
+ /* Irqtime accounting might have been enabled in the middle of the IRQ */
+ if (kc->idle_elapse)
+ return;
+
write_seqcount_begin(&kc->idle_sleeptime_seq);
kc->idle_entrytime = now;
kc->idle_elapse = true;
@@ -478,7 +483,8 @@ void kcpustat_irq_enter(u64 now)
{
struct kernel_cpustat *kc = kcpustat_this_cpu;
- if (!vtime_generic_enabled_this_cpu())
+ if (!vtime_generic_enabled_this_cpu() &&
+ (irqtime_enabled() || vtime_accounting_enabled_this_cpu()))
kcpustat_idle_stop(kc, now);
}
@@ -486,7 +492,15 @@ void kcpustat_irq_exit(u64 now)
{
struct kernel_cpustat *kc = kcpustat_this_cpu;
- if (!vtime_generic_enabled_this_cpu())
+ /*
+ * Generic vtime already does its own idle accounting.
+ * But irqtime accounting or arch vtime which also accounts IRQs
+ * need to pause nohz accounting. Resume nohz accounting as long
+ * as the irqtime config is enabled to handle case where irqtime
+ * accounting got runtime disabled in the middle of an IRQ.
+ */
+ if (!vtime_generic_enabled_this_cpu() &&
+ (IS_ENABLED(CONFIG_IRQ_TIME_ACCOUNTING) || vtime_accounting_enabled_this_cpu()))
kcpustat_idle_start(kc, now);
}
--
2.53.0
^ permalink raw reply related
* [PATCH 13/15] sched/cputime: Provide get_cpu_[idle|iowait]_time_us() off-case
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
The last reason why get_cpu_idle/iowait_time_us() may return -1 now is
if the config doesn't support nohz.
The ad-hoc replacement solution by cpufreq is to compute jiffies minus
the whole busy cputime. Although the intention is to provide a coherent
low resolution estimation of the idle and iowait time, the
implementation is buggy because jiffies don't start at 0.
Instead, just provide a real get_cpu_[idle|iowait]_time_us() off-case.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
drivers/cpufreq/cpufreq.c | 29 +----------------------------
include/linux/kernel_stat.h | 3 +++
include/linux/tick.h | 4 ----
kernel/sched/cputime.c | 12 +++++++++---
4 files changed, 13 insertions(+), 35 deletions(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44eb1b7e7fc1..dda0d34d3c02 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -130,38 +130,11 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
}
EXPORT_SYMBOL_GPL(get_governor_parent_kobj);
-static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
-{
- struct kernel_cpustat kcpustat;
- u64 cur_wall_time;
- u64 idle_time;
- u64 busy_time;
-
- cur_wall_time = jiffies64_to_nsecs(get_jiffies_64());
-
- kcpustat_cpu_fetch(&kcpustat, cpu);
-
- busy_time = kcpustat.cpustat[CPUTIME_USER];
- busy_time += kcpustat.cpustat[CPUTIME_SYSTEM];
- busy_time += kcpustat.cpustat[CPUTIME_IRQ];
- busy_time += kcpustat.cpustat[CPUTIME_SOFTIRQ];
- busy_time += kcpustat.cpustat[CPUTIME_STEAL];
- busy_time += kcpustat.cpustat[CPUTIME_NICE];
-
- idle_time = cur_wall_time - busy_time;
- if (wall)
- *wall = div_u64(cur_wall_time, NSEC_PER_USEC);
-
- return div_u64(idle_time, NSEC_PER_USEC);
-}
-
u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
{
u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);
- if (idle_time == -1ULL)
- return get_cpu_idle_time_jiffy(cpu, wall);
- else if (!io_busy)
+ if (!io_busy)
idle_time += get_cpu_iowait_time_us(cpu, wall);
return idle_time;
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 3680519d7b2c..512104b0ff49 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -133,6 +133,9 @@ static inline bool kcpustat_idle_dyntick(void)
}
#endif /* CONFIG_NO_HZ_COMMON */
+extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+
/* Fetch cputime values when vtime is disabled on a CPU */
static inline u64 kcpustat_field_default(enum cpu_usage_stat usage, int cpu)
{
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 738007d6f577..1cf4651f09ad 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -139,8 +139,6 @@ extern bool tick_nohz_idle_got_tick(void);
extern ktime_t tick_nohz_get_next_hrtimer(void);
extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
-extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
-extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline bool tick_nohz_is_active(void) { return false; }
@@ -162,8 +160,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
*delta_next = TICK_NSEC;
return *delta_next;
}
-static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
-static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#endif /* !CONFIG_NO_HZ_COMMON */
/*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index c91fd67f93ea..335d2c127763 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -522,6 +522,13 @@ u64 kcpustat_field_iowait(int cpu)
nr_iowait_cpu(cpu), ktime_get());
}
EXPORT_SYMBOL_GPL(kcpustat_field_iowait);
+#else
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, ktime_t now)
+{
+ return kcpustat_cpu(cpu).cpustat[idx];
+}
+#endif /* CONFIG_NO_HZ_COMMON */
static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
bool compute_delta, u64 *last_update_time)
@@ -557,7 +564,7 @@ static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * Return: -1 if generic vtime is enabled, else total idle time of the @cpu
+ * Return: total idle time of the @cpu
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
@@ -581,7 +588,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * Return: -1 if generic vtime is enabled, else total iowait time of @cpu
+ * Return: total iowait time of @cpu
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
@@ -589,7 +596,6 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-#endif /* CONFIG_NO_HZ_COMMON */
/*
* Use precise platform statistics if available:
--
2.53.0
^ permalink raw reply related
* [PATCH 12/15] tick/sched: Consolidate idle time fetching APIs
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
Fetching the idle cputime is available through a variety of accessors
all over the place depending on the different accounting flavours and
needs:
- idle vtime generic accounting can be accessed by kcpustat_field(),
kcpustat_cpu_fetch(), get_idle/iowait_time() and
get_cpu_idle/iowait_time_us()
- dynticks-idle accounting can only be accessed by get_idle/iowait_time()
or get_cpu_idle/iowait_time_us()
- CONFIG_NO_HZ_COMMON=n idle accounting can be accessed by kcpustat_field(),
kcpustat_cpu_fetch(), or get_idle/iowait_time() but not by
get_cpu_idle/iowait_time_us()
Moreover get_idle/iowait_time() relies on get_cpu_idle/iowait_time_us()
with a nonsensical conversion to microseconds and back to nanoseconds
on the way.
Start consolidating the APIs by removing get_idle/iowait_time() and
making kcpustat_field() and kcpustat_cpu_fetch() work for all cases.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
fs/proc/stat.c | 40 +++---------------------
fs/proc/uptime.c | 8 ++---
include/linux/kernel_stat.h | 34 ++++++++++++++++++---
kernel/sched/cputime.c | 61 ++++++++++++++++++++++++-------------
4 files changed, 76 insertions(+), 67 deletions(-)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 8b444e862319..c00468a83f64 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,38 +22,6 @@
#define arch_irq_stat() 0
#endif
-u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle, idle_usecs = -1ULL;
-
- if (cpu_online(cpu))
- idle_usecs = get_cpu_idle_time_us(cpu, NULL);
-
- if (idle_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcs->cpustat[CPUTIME_IDLE];
- else
- idle = idle_usecs * NSEC_PER_USEC;
-
- return idle;
-}
-
-static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 iowait, iowait_usecs = -1ULL;
-
- if (cpu_online(cpu))
- iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
-
- if (iowait_usecs == -1ULL)
- /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcs->cpustat[CPUTIME_IOWAIT];
- else
- iowait = iowait_usecs * NSEC_PER_USEC;
-
- return iowait;
-}
-
static void show_irq_gap(struct seq_file *p, unsigned int gap)
{
static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
@@ -105,8 +73,8 @@ static int show_stat(struct seq_file *p, void *v)
user += cpustat[CPUTIME_USER];
nice += cpustat[CPUTIME_NICE];
system += cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(&kcpustat, i);
- iowait += get_iowait_time(&kcpustat, i);
+ idle += cpustat[CPUTIME_IDLE];
+ iowait += cpustat[CPUTIME_IOWAIT];
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
@@ -146,8 +114,8 @@ static int show_stat(struct seq_file *p, void *v)
user = cpustat[CPUTIME_USER];
nice = cpustat[CPUTIME_NICE];
system = cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(&kcpustat, i);
- iowait = get_iowait_time(&kcpustat, i);
+ idle = cpustat[CPUTIME_IDLE];
+ iowait = cpustat[CPUTIME_IOWAIT];
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index b5343d209381..433aa947cd57 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -18,12 +18,8 @@ static int uptime_proc_show(struct seq_file *m, void *v)
int i;
idle_nsec = 0;
- for_each_possible_cpu(i) {
- struct kernel_cpustat kcs;
-
- kcpustat_cpu_fetch(&kcs, i);
- idle_nsec += get_idle_time(&kcs, i);
- }
+ for_each_possible_cpu(i)
+ idle_nsec += kcpustat_field(CPUTIME_IDLE, i);
ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 9343353ac7a3..3680519d7b2c 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -110,32 +110,59 @@ extern void kcpustat_dyntick_start(u64 now);
extern void kcpustat_dyntick_stop(u64 now);
extern void kcpustat_irq_enter(u64 now);
extern void kcpustat_irq_exit(u64 now);
+extern u64 kcpustat_field_idle(int cpu);
+extern u64 kcpustat_field_iowait(int cpu);
static inline bool kcpustat_idle_dyntick(void)
{
return __this_cpu_read(kernel_cpustat.idle_dyntick);
}
#else
+static inline u64 kcpustat_field_idle(int cpu)
+{
+ return kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+}
+static inline u64 kcpustat_field_iowait(int cpu)
+{
+ return kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+}
+
static inline bool kcpustat_idle_dyntick(void)
{
return false;
}
#endif /* CONFIG_NO_HZ_COMMON */
+/* Fetch cputime values when vtime is disabled on a CPU */
+static inline u64 kcpustat_field_default(enum cpu_usage_stat usage, int cpu)
+{
+ if (usage == CPUTIME_IDLE)
+ return kcpustat_field_idle(cpu);
+ if (usage == CPUTIME_IOWAIT)
+ return kcpustat_field_iowait(cpu);
+ return kcpustat_cpu(cpu).cpustat[usage];
+}
+
+static inline void kcpustat_cpu_fetch_default(struct kernel_cpustat *dst, int cpu)
+{
+ *dst = kcpustat_cpu(cpu);
+ dst->cpustat[CPUTIME_IDLE] = kcpustat_field_idle(cpu);
+ dst->cpustat[CPUTIME_IOWAIT] = kcpustat_field_iowait(cpu);
+}
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern u64 kcpustat_field(enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
#else
static inline u64 kcpustat_field(enum cpu_usage_stat usage, int cpu)
{
- return kcpustat_cpu(cpu).cpustat[usage];
+ return kcpustat_field_default(usage, cpu);
}
static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
- *dst = kcpustat_cpu(cpu);
+ kcpustat_cpu_fetch_default(dst, cpu);
}
-
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
extern void account_user_time(struct task_struct *, u64);
@@ -145,7 +172,6 @@ extern void account_system_index_time(struct task_struct *, u64,
enum cpu_usage_stat);
extern void account_steal_time(u64);
extern void account_idle_time(u64);
-extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 4c00163b74b9..c91fd67f93ea 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -490,24 +490,14 @@ void kcpustat_irq_exit(u64 now)
kcpustat_idle_start(kc, now);
}
-static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
- bool compute_delta, u64 *last_update_time)
+static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 now)
{
struct kernel_cpustat *kc = &kcpustat_cpu(cpu);
u64 *cpustat = kc->cpustat;
unsigned int seq;
- ktime_t now;
u64 idle;
- now = ktime_get();
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
- if (vtime_generic_enabled_cpu(cpu)) {
- idle = kcpustat_field(idx, cpu);
- goto to_us;
- }
-
do {
seq = read_seqcount_begin(&kc->idle_sleeptime_seq);
@@ -516,12 +506,42 @@ static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
idle += (now - kc->idle_entrytime);
} while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq));
-to_us:
- do_div(idle, NSEC_PER_USEC);
-
return idle;
}
+u64 kcpustat_field_idle(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_idle);
+
+u64 kcpustat_field_iowait(int cpu)
+{
+ return kcpustat_field_dyntick(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), ktime_get());
+}
+EXPORT_SYMBOL_GPL(kcpustat_field_iowait);
+
+static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now = ktime_get();
+ u64 res;
+
+ if (vtime_generic_enabled_cpu(cpu))
+ res = kcpustat_field(idx, cpu);
+ else
+ res = kcpustat_field_dyntick(cpu, idx, compute_delta, now);
+
+ do_div(res, NSEC_PER_USEC);
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ return res;
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -569,7 +589,6 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-
#endif /* CONFIG_NO_HZ_COMMON */
/*
@@ -1123,8 +1142,8 @@ u64 kcpustat_field(enum cpu_usage_stat usage, int cpu)
struct rq *rq;
int err;
- if (!vtime_accounting_enabled_cpu(cpu))
- return val;
+ if (!vtime_generic_enabled_cpu(cpu))
+ return kcpustat_field_default(usage, cpu);
rq = cpu_rq(cpu);
@@ -1219,8 +1238,8 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
struct rq *rq;
int err;
- if (!vtime_accounting_enabled_cpu(cpu)) {
- *dst = *src;
+ if (!vtime_generic_enabled_cpu(cpu)) {
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}
@@ -1233,7 +1252,7 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
- *dst = *src;
+ kcpustat_cpu_fetch_default(dst, cpu);
return;
}
--
2.53.0
^ permalink raw reply related
* [PATCH 11/15] tick/sched: Account tickless idle cputime only when tick is stopped
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
There is no real point in switching to dyntick-idle cputime accounting
mode if the tick is not actually stopped. This just adds overhead,
notably fetching the GTOD, on each idle exit and each idle IRQ entry for
no reason during short idle trips.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
kernel/time/tick-sched.c | 50 +++++++++++++++++++++-------------------
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa03cf7b3cec..c1ee0b256445 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1157,8 +1157,10 @@ void tick_nohz_idle_stop_tick(void)
ts->idle_sleeps++;
ts->idle_expires = expires;
- if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED))
+ if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
+ kcpustat_dyntick_start(ts->idle_entrytime);
nohz_balance_enter_idle(cpu);
+ }
} else {
tick_nohz_retain_tick(ts);
}
@@ -1200,7 +1202,6 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(ts->timer_expires_base);
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
ts->idle_entrytime = ktime_get();
- kcpustat_dyntick_start(ts->idle_entrytime);
tick_nohz_clock_sleep(ts);
local_irq_enable();
@@ -1230,9 +1231,10 @@ void tick_nohz_irq_exit(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
- ts->idle_entrytime = ktime_get();
- kcpustat_irq_exit(ts->idle_entrytime);
tick_nohz_clock_sleep(ts);
+ ts->idle_entrytime = ktime_get();
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
+ kcpustat_irq_exit(ts->idle_entrytime);
} else {
tick_nohz_full_update_tick(ts);
}
@@ -1333,8 +1335,17 @@ void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
- tick_nohz_restart_sched_tick(ts, ktime_get());
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
+ /*
+ * Update entrytime here in case the tick restart is due to temporary
+ * polling on forced broadcast. The tick may be stopped again later within
+ * the same idle trip. The idle_entrytime was updated recently but make sure
+ * no tiny amount of idle time is accounted twice.
+ */
+ ts->idle_entrytime = ktime_get();
+ kcpustat_dyntick_stop(ts->idle_entrytime);
+ tick_nohz_restart_sched_tick(ts, ts->idle_entrytime);
+ }
}
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
@@ -1364,7 +1375,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
@@ -1373,18 +1383,13 @@ void tick_nohz_idle_exit(void)
WARN_ON_ONCE(ts->timer_expires_base);
tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
- idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
- tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
+ tick_nohz_clock_wakeup(ts);
- if (idle_active || tick_stopped)
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
now = ktime_get();
-
- if (idle_active)
- tick_nohz_clock_wakeup(ts);
-
- if (tick_stopped)
+ kcpustat_dyntick_stop(now);
tick_nohz_idle_update_tick(ts, now);
- kcpustat_dyntick_stop(now);
+ }
local_irq_enable();
}
@@ -1439,15 +1444,13 @@ static inline void tick_nohz_irq_enter(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
- if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
+ tick_nohz_clock_wakeup(ts);
+
+ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
return;
now = ktime_get();
-
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) {
- tick_nohz_clock_wakeup(ts);
- kcpustat_irq_enter(now);
- }
+ kcpustat_irq_enter(now);
/*
* If all CPUs are idle we may need to update a stale jiffies value.
@@ -1456,8 +1459,7 @@ static inline void tick_nohz_irq_enter(void)
* rare case (typically stop machine). So we must make sure we have a
* last resort.
*/
- if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
- tick_nohz_update_jiffies(now);
+ tick_nohz_update_jiffies(now);
}
#else
--
2.53.0
^ permalink raw reply related
* [PATCH 10/15] tick/sched: Remove unused fields
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
Remove the tick_sched fields left unused after the dyntick-idle cputime
migration to scheduler code.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
kernel/time/tick-sched.h | 12 ------------
kernel/time/timer_list.c | 6 +-----
scripts/gdb/linux/timerlist.py | 4 ----
3 files changed, 1 insertion(+), 21 deletions(-)
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index b4a7822f495d..79b9252047b1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -44,9 +44,7 @@ struct tick_device {
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
- * @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_waketime: Time when the idle was interrupted
- * @idle_sleeptime_seq: sequence counter for data consistency
* @idle_entrytime: Time when the idle call was entered
* @last_jiffies: Base jiffies snapshot when next event was last computed
* @timer_expires_base: Base time clock monotonic for @timer_expires
@@ -55,9 +53,6 @@ struct tick_device {
* @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_exittime: Time when the idle state was left
- * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
- * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
* @check_clocks: Notification mechanism about clocksource changes
*/
@@ -73,12 +68,10 @@ struct tick_sched {
struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
- unsigned long idle_jiffies;
ktime_t idle_waketime;
unsigned int got_idle_tick;
/* Idle entry */
- seqcount_t idle_sleeptime_seq;
ktime_t idle_entrytime;
/* Tick stop */
@@ -90,11 +83,6 @@ struct tick_sched {
unsigned long idle_calls;
unsigned long idle_sleeps;
- /* Idle exit */
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
-
/* Full dynticks handling */
atomic_t tick_dep_mask;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 427d7ddea3af..514802def1e0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -152,14 +152,10 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_flag(highres, TS_FLAG_HIGHRES);
P_ns(last_tick);
P_flag(tick_stopped, TS_FLAG_STOPPED);
- P(idle_jiffies);
P(idle_calls);
P(idle_sleeps);
P_ns(idle_entrytime);
P_ns(idle_waketime);
- P_ns(idle_exittime);
- P_ns(idle_sleeptime);
- P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_timer);
P_ns(idle_expires);
@@ -256,7 +252,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.10\n");
+ SEQ_printf(m, "Timer List Version: v0.11\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py
index 9fb3436a217c..744b032e4d38 100644
--- a/scripts/gdb/linux/timerlist.py
+++ b/scripts/gdb/linux/timerlist.py
@@ -90,14 +90,10 @@ def print_cpu(hrtimer_bases, cpu, max_clock_bases):
text += f" .{'nohz':15s}: {int(bool(ts['flags'] & TS_FLAG_NOHZ))}\n"
text += f" .{'last_tick':15s}: {ts['last_tick']}\n"
text += f" .{'tick_stopped':15s}: {int(bool(ts['flags'] & TS_FLAG_STOPPED))}\n"
- text += f" .{'idle_jiffies':15s}: {ts['idle_jiffies']}\n"
text += f" .{'idle_calls':15s}: {ts['idle_calls']}\n"
text += f" .{'idle_sleeps':15s}: {ts['idle_sleeps']}\n"
text += f" .{'idle_entrytime':15s}: {ts['idle_entrytime']} nsecs\n"
text += f" .{'idle_waketime':15s}: {ts['idle_waketime']} nsecs\n"
- text += f" .{'idle_exittime':15s}: {ts['idle_exittime']} nsecs\n"
- text += f" .{'idle_sleeptime':15s}: {ts['idle_sleeptime']} nsecs\n"
- text += f" .{'iowait_sleeptime':15s}: {ts['iowait_sleeptime']} nsecs\n"
text += f" .{'last_jiffies':15s}: {ts['last_jiffies']}\n"
text += f" .{'next_timer':15s}: {ts['next_timer']}\n"
text += f" .{'idle_expires':15s}: {ts['idle_expires']} nsecs\n"
--
2.53.0
^ permalink raw reply related
* [PATCH 09/15] tick/sched: Move dyntick-idle cputime accounting to cputime code
From: Frederic Weisbecker @ 2026-05-08 13:16 UTC (permalink / raw)
To: LKML
Cc: Frederic Weisbecker, Christophe Leroy (CS GROUP),
Rafael J. Wysocki, Alexander Gordeev, Anna-Maria Behnsen,
Ben Segall, Boqun Feng, Christian Borntraeger, Dietmar Eggemann,
Heiko Carstens, Ingo Molnar, Ingo Molnar, Jan Kiszka,
Joel Fernandes, Juri Lelli, Kieran Bingham, Madhavan Srinivasan,
Mel Gorman, Michael Ellerman, Neeraj Upadhyay, Nicholas Piggin,
Paul E . McKenney, Peter Zijlstra, Sashiko, Shrikanth Hegde,
Steven Rostedt, Sven Schnelle, Thomas Gleixner, Uladzislau Rezki,
Valentin Schneider, Vasily Gorbik, Vincent Guittot, Viresh Kumar,
Xin Zhao, linux-pm, linux-s390, linuxppc-dev
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>
Although the dynticks-idle cputime accounting is necessarily tied to
the tick subsystem, the actual related accounting code has no business
residing there and should be part of the scheduler cputime code.
Move the relevant pieces and the state machine to where they belong.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---
include/linux/kernel_stat.h | 14 +++-
kernel/sched/core.c | 6 +-
kernel/sched/cputime.c | 148 ++++++++++++++++++++++++++++++--
kernel/time/tick-sched.c | 163 +++++++-----------------------------
4 files changed, 188 insertions(+), 143 deletions(-)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index ba65aad308a1..9343353ac7a3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -35,9 +35,12 @@ enum cpu_usage_stat {
struct kernel_cpustat {
#ifdef CONFIG_NO_HZ_COMMON
- int idle_dyntick;
+ bool idle_dyntick;
+ bool idle_elapse;
+ seqcount_t idle_sleeptime_seq;
+ u64 idle_entrytime;
#endif
- u64 cpustat[NR_STATS];
+ u64 cpustat[NR_STATS];
};
struct kernel_stat {
@@ -103,8 +106,11 @@ static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
-extern void kcpustat_dyntick_start(void);
-extern void kcpustat_dyntick_stop(void);
+extern void kcpustat_dyntick_start(u64 now);
+extern void kcpustat_dyntick_stop(u64 now);
+extern void kcpustat_irq_enter(u64 now);
+extern void kcpustat_irq_exit(u64 now);
+
static inline bool kcpustat_idle_dyntick(void)
{
return __this_cpu_read(kernel_cpustat.idle_dyntick);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da20fb6ea25a..0cfc027a955d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5517,7 +5517,11 @@ void sched_exec(void)
}
DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat) = {
+#ifdef CONFIG_NO_HZ_COMMON
+ .idle_sleeptime_seq = SEQCNT_ZERO(kernel_cpustat.idle_sleeptime_seq)
+#endif
+};
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5733789e0bd..4c00163b74b9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,7 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include "sched.h"
@@ -420,22 +421,155 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_NO_HZ_COMMON
-void kcpustat_dyntick_start(void)
+static void kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now)
{
- if (!vtime_generic_enabled_this_cpu()) {
- vtime_dyntick_start();
- __this_cpu_write(kernel_cpustat.idle_dyntick, 1);
- }
+ u64 *cpustat = kc->cpustat;
+ u64 delta;
+
+ if (!kc->idle_elapse)
+ return;
+
+ delta = now - kc->idle_entrytime;
+
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ cpustat[CPUTIME_IOWAIT] += delta;
+ else
+ cpustat[CPUTIME_IDLE] += delta;
+
+ kc->idle_entrytime = now;
+ kc->idle_elapse = false;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
}
-void kcpustat_dyntick_stop(void)
+static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now)
{
+ write_seqcount_begin(&kc->idle_sleeptime_seq);
+ kc->idle_entrytime = now;
+ kc->idle_elapse = true;
+ write_seqcount_end(&kc->idle_sleeptime_seq);
+}
+
+void kcpustat_dyntick_stop(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
if (!vtime_generic_enabled_this_cpu()) {
- __this_cpu_write(kernel_cpustat.idle_dyntick, 0);
+ WARN_ON_ONCE(!kc->idle_dyntick);
+ kcpustat_idle_stop(kc, now);
+ kc->idle_dyntick = false;
vtime_dyntick_stop();
steal_account_process_time(ULONG_MAX);
}
}
+
+void kcpustat_dyntick_start(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu()) {
+ vtime_dyntick_start();
+ kc->idle_dyntick = true;
+ kcpustat_idle_start(kc, now);
+ }
+}
+
+void kcpustat_irq_enter(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu())
+ kcpustat_idle_stop(kc, now);
+}
+
+void kcpustat_irq_exit(u64 now)
+{
+ struct kernel_cpustat *kc = kcpustat_this_cpu;
+
+ if (!vtime_generic_enabled_this_cpu())
+ kcpustat_idle_start(kc, now);
+}
+
+static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
+ bool compute_delta, u64 *last_update_time)
+{
+ struct kernel_cpustat *kc = &kcpustat_cpu(cpu);
+ u64 *cpustat = kc->cpustat;
+ unsigned int seq;
+ ktime_t now;
+ u64 idle;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ if (vtime_generic_enabled_cpu(cpu)) {
+ idle = kcpustat_field(idx, cpu);
+ goto to_us;
+ }
+
+ do {
+ seq = read_seqcount_begin(&kc->idle_sleeptime_seq);
+
+ idle = cpustat[idx];
+ if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime)
+ idle += (now - kc->idle_entrytime);
+ } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq));
+
+to_us:
+ do_div(idle, NSEC_PER_USEC);
+
+ return idle;
+}
+
+/**
+ * get_cpu_idle_time_us - get the total idle time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. Note that this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: -1 if generic vtime is enabled, else total idle time of the @cpu
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE,
+ !nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * Return: -1 if generic vtime is enabled, else total iowait time of @cpu
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT,
+ nr_iowait_cpu(cpu), last_update_time);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
#endif /* CONFIG_NO_HZ_COMMON */
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb235ec7d2d6..fa03cf7b3cec 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -749,126 +749,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- ktime_t delta;
-
- if (vtime_generic_enabled_this_cpu())
- return;
-
- if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
- return;
-
- delta = ktime_sub(now, ts->idle_entrytime);
-
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- if (nr_iowait_cpu(smp_processor_id()) > 0)
- cpustat[CPUTIME_IOWAIT] = ktime_add(cpustat[CPUTIME_IOWAIT], delta);
- else
- cpustat[CPUTIME_IDLE] = ktime_add(cpustat[CPUTIME_IDLE], delta);
-
- ts->idle_entrytime = now;
- tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
-
- sched_clock_idle_wakeup_event();
-}
-
-static void tick_nohz_start_idle(struct tick_sched *ts)
-{
- if (vtime_generic_enabled_this_cpu())
- return;
-
- write_seqcount_begin(&ts->idle_sleeptime_seq);
- ts->idle_entrytime = ktime_get();
- tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
- write_seqcount_end(&ts->idle_sleeptime_seq);
- sched_clock_idle_sleep_event();
-}
-
-static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
- bool compute_delta, u64 *last_update_time)
-{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
- ktime_t now, idle;
- unsigned int seq;
-
- now = ktime_get();
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
-
- if (vtime_generic_enabled_cpu(cpu)) {
- idle = kcpustat_field(idx, cpu);
- return ktime_to_us(idle);
- }
-
- do {
- ktime_t delta = 0;
-
- seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
-
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
- if (now > ts->idle_entrytime)
- delta = ktime_sub(now, ts->idle_entrytime);
- }
-
- idle = ktime_add(cpustat[idx], delta);
- } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
-
- return ktime_to_us(idle);
-
-}
-
-/**
- * get_cpu_idle_time_us - get the total idle time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note that this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if generic vtime is enabled, else total idle time of the @cpu
- */
-u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
-{
- return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE,
- !nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
-
-/**
- * get_cpu_iowait_time_us - get the total iowait time of a CPU
- * @cpu: CPU number to query
- * @last_update_time: variable to store update time in. Do not update
- * counters if NULL.
- *
- * Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
- * the counter of iowait tasks that can be remotely updated without
- * any synchronization. Therefore it is possible to observe backward
- * values within two consecutive reads.
- *
- * This time is measured via accounting rather than sampling,
- * and is as accurate as ktime_get() is.
- *
- * Return: -1 if generic vtime is enabled, else total iowait time of @cpu
- */
-u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
-{
- return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT,
- nr_iowait_cpu(cpu), last_update_time);
-}
-EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-
/* Simplified variant of hrtimer_forward_now() */
static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
{
@@ -1289,6 +1169,20 @@ void tick_nohz_idle_retain_tick(void)
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}
+static void tick_nohz_clock_sleep(struct tick_sched *ts)
+{
+ tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_sleep_event();
+}
+
+static void tick_nohz_clock_wakeup(struct tick_sched *ts)
+{
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) {
+ tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
+ sched_clock_idle_wakeup_event();
+ }
+}
+
/**
* tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
@@ -1303,12 +1197,11 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
-
WARN_ON_ONCE(ts->timer_expires_base);
-
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
- kcpustat_dyntick_start();
- tick_nohz_start_idle(ts);
+ ts->idle_entrytime = ktime_get();
+ kcpustat_dyntick_start(ts->idle_entrytime);
+ tick_nohz_clock_sleep(ts);
local_irq_enable();
}
@@ -1336,10 +1229,13 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
- tick_nohz_start_idle(ts);
- else
+ if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
+ ts->idle_entrytime = ktime_get();
+ kcpustat_irq_exit(ts->idle_entrytime);
+ tick_nohz_clock_sleep(ts);
+ } else {
tick_nohz_full_update_tick(ts);
+ }
}
/**
@@ -1484,11 +1380,11 @@ void tick_nohz_idle_exit(void)
now = ktime_get();
if (idle_active)
- tick_nohz_stop_idle(ts, now);
+ tick_nohz_clock_wakeup(ts);
if (tick_stopped)
tick_nohz_idle_update_tick(ts, now);
- kcpustat_dyntick_stop();
+ kcpustat_dyntick_stop(now);
local_irq_enable();
}
@@ -1545,9 +1441,14 @@ static inline void tick_nohz_irq_enter(void)
if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
return;
+
now = ktime_get();
- if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
- tick_nohz_stop_idle(ts, now);
+
+ if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) {
+ tick_nohz_clock_wakeup(ts);
+ kcpustat_irq_enter(now);
+ }
+
/*
* If all CPUs are idle we may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
--
2.53.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox