Linux-ARM-Kernel Archive on lore.kernel.org

Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v13 2/3] of: Factor arguments passed to of_map_id() into a struct
From: Vijayanand Jitta @ 2026-04-08 10:03 UTC (permalink / raw)
  To: Nipun Gupta, Nikhil Agarwal, Joerg Roedel, Will Deacon,
	Robin Murphy, Marc Zyngier, Lorenzo Pieralisi, Thomas Gleixner,
	Saravana Kannan, Richard Zhu, Lucas Stach,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Bjorn Helgaas,
	Frank Li, Sascha Hauer, Pengutronix Kernel Team, Fabio Estevam,
	Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Dmitry Baryshkov, Konrad Dybcio, Bjorn Andersson, Rob Herring,
	Conor Dooley, Krzysztof Kozlowski, Prakash Gupta, Vikash Garodia
  Cc: linux-kernel, iommu, linux-arm-kernel, devicetree, linux-pci, imx,
	xen-devel, linux-arm-msm, Vijayanand Jitta, Charan Teja Kalla
In-Reply-To: <20260408-parse_iommu_cells-v13-0-fa921e92661b@oss.qualcomm.com>

From: Charan Teja Kalla <charan.kalla@oss.qualcomm.com>

Change of_map_id() to take a pointer to struct of_phandle_args
instead of passing target device node and translated IDs separately.
Update all callers accordingly.

Add an explicit filter_np parameter to of_map_id() and of_map_msi_id()
to separate the filter input from the output. Previously, the target
parameter served dual purpose: as an input filter (if non-NULL, only
match entries targeting that node) and as an output (receiving the
matched node with a reference held). Now filter_np is the explicit
input filter and arg->np is the pure output.

Previously, of_map_id() would call of_node_put() on the matched node
when a filter was provided, making reference ownership inconsistent.
Remove this internal of_node_put() call so that of_map_id() now always
transfers ownership of the matched node reference to the caller via
arg->np. Callers are now consistently responsible for releasing this
reference with of_node_put(arg->np) when done.

Acked-by: Frank Li <Frank.Li@nxp.com>
Suggested-by: Rob Herring (Arm) <robh@kernel.org>
Suggested-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Signed-off-by: Charan Teja Kalla <charan.kalla@oss.qualcomm.com>
Signed-off-by: Vijayanand Jitta <vijayanand.jitta@oss.qualcomm.com>
---
 drivers/cdx/cdx_msi.c                    |  7 ++--
 drivers/iommu/of_iommu.c                 |  4 +-
 drivers/irqchip/irq-gic-its-msi-parent.c | 11 ++++--
 drivers/of/base.c                        | 68 +++++++++++++++++---------------
 drivers/of/irq.c                         | 10 ++++-
 drivers/pci/controller/dwc/pci-imx6.c    | 32 +++++++--------
 drivers/pci/controller/pcie-apple.c      |  5 ++-
 drivers/xen/grant-dma-ops.c              |  4 +-
 include/linux/of.h                       | 14 ++++---
 9 files changed, 89 insertions(+), 66 deletions(-)

diff --git a/drivers/cdx/cdx_msi.c b/drivers/cdx/cdx_msi.c
index 63b3544ec997..6924e07c7528 100644
--- a/drivers/cdx/cdx_msi.c
+++ b/drivers/cdx/cdx_msi.c
@@ -121,22 +121,23 @@ static int cdx_msi_prepare(struct irq_domain *msi_domain,
 			   struct device *dev,
 			   int nvec, msi_alloc_info_t *info)
 {
+	struct of_phandle_args msi_spec = {};
 	struct cdx_device *cdx_dev = to_cdx_device(dev);
 	struct device *parent = cdx_dev->cdx->dev;
 	struct msi_domain_info *msi_info;
-	u32 dev_id;
 	int ret;
 
 	/* Retrieve device ID from requestor ID using parent device */
-	ret = of_map_msi_id(parent->of_node, cdx_dev->msi_dev_id, NULL, &dev_id);
+	ret = of_map_msi_id(parent->of_node, cdx_dev->msi_dev_id, NULL, &msi_spec);
 	if (ret) {
 		dev_err(dev, "of_map_id failed for MSI: %d\n", ret);
 		return ret;
 	}
+	of_node_put(msi_spec.np);
 
 #ifdef GENERIC_MSI_DOMAIN_OPS
 	/* Set the device Id to be passed to the GIC-ITS */
-	info->scratchpad[0].ul = dev_id;
+	info->scratchpad[0].ul = msi_spec.args[0];
 #endif
 
 	msi_info = msi_get_domain_info(msi_domain->parent);
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index a511ecf21fcd..a18bb60f6f3d 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -45,10 +45,10 @@ static int of_iommu_configure_dev_id(struct device_node *master_np,
 				     struct device *dev,
 				     const u32 *id)
 {
-	struct of_phandle_args iommu_spec = { .args_count = 1 };
+	struct of_phandle_args iommu_spec = {};
 	int err;
 
-	err = of_map_iommu_id(master_np, *id, &iommu_spec.np, iommu_spec.args);
+	err = of_map_iommu_id(master_np, *id, &iommu_spec);
 	if (err)
 		return err;
 
diff --git a/drivers/irqchip/irq-gic-its-msi-parent.c b/drivers/irqchip/irq-gic-its-msi-parent.c
index b63343a227a9..dd5f84b6470a 100644
--- a/drivers/irqchip/irq-gic-its-msi-parent.c
+++ b/drivers/irqchip/irq-gic-its-msi-parent.c
@@ -152,6 +152,8 @@ static int its_v5_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
 static int of_pmsi_get_msi_info(struct irq_domain *domain, struct device *dev, u32 *dev_id,
 				phys_addr_t *pa)
 {
+	struct device_node *msi_ctrl __free(device_node) = NULL;
+	struct of_phandle_args msi_spec = {};
 	struct of_phandle_iterator it;
 	int ret;
 
@@ -178,9 +180,12 @@ static int of_pmsi_get_msi_info(struct irq_domain *domain, struct device *dev, u
 		}
 	}
 
-	struct device_node *msi_ctrl __free(device_node) = NULL;
-
-	return of_map_msi_id(dev->of_node, dev->id, &msi_ctrl, dev_id);
+	ret = of_map_msi_id(dev->of_node, dev->id, NULL, &msi_spec);
+	if (!ret) {
+		msi_ctrl = msi_spec.np;
+		*dev_id = msi_spec.args[0];
+	}
+	return ret;
 }
 
 static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev,
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ae04487bd614..b3d002015192 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2102,36 +2102,37 @@ int of_find_last_cache_level(unsigned int cpu)
  * @id: device ID to map.
  * @map_name: property name of the map to use.
  * @map_mask_name: optional property name of the mask to use.
- * @target: optional pointer to a target device node.
- * @id_out: optional pointer to receive the translated ID.
+ * @filter_np: optional device node to filter matches by, or NULL to match any.
+ *	If non-NULL, only map entries targeting this node will be matched.
+ * @arg: pointer to a &struct of_phandle_args for the result. On success,
+ *	@arg->args[0] will contain the translated ID. If a map entry was
+ *	matched, @arg->np will be set to the target node with a reference
+ *	held that the caller must release with of_node_put().
  *
  * Given a device ID, look up the appropriate implementation-defined
  * platform ID and/or the target device which receives transactions on that
- * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or
- * @id_out may be NULL if only the other is required. If @target points to
- * a non-NULL device node pointer, only entries targeting that node will be
- * matched; if it points to a NULL value, it will receive the device node of
- * the first matching target phandle, with a reference held.
+ * ID, as per the "iommu-map" and "msi-map" bindings.
  *
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_id(const struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
-	       struct device_node **target, u32 *id_out)
+	       const struct device_node *filter_np, struct of_phandle_args *arg)
 {
 	u32 map_mask, masked_id;
 	int map_len;
 	const __be32 *map = NULL;
 
-	if (!np || !map_name || (!target && !id_out))
+	if (!np || !map_name || !arg)
 		return -EINVAL;
 
 	map = of_get_property(np, map_name, &map_len);
 	if (!map) {
-		if (target)
+		if (filter_np)
 			return -ENODEV;
 		/* Otherwise, no map implies no translation */
-		*id_out = id;
+		arg->args[0] = id;
+		arg->args_count = 1;
 		return 0;
 	}
 
@@ -2173,18 +2174,14 @@ int of_map_id(const struct device_node *np, u32 id,
 		if (!phandle_node)
 			return -ENODEV;
 
-		if (target) {
-			if (*target)
-				of_node_put(phandle_node);
-			else
-				*target = phandle_node;
-
-			if (*target != phandle_node)
-				continue;
+		if (filter_np && filter_np != phandle_node) {
+			of_node_put(phandle_node);
+			continue;
 		}
 
-		if (id_out)
-			*id_out = masked_id - id_base + out_base;
+		arg->np = phandle_node;
+		arg->args[0] = masked_id - id_base + out_base;
+		arg->args_count = 1;
 
 		pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n",
 			np, map_name, map_mask, id_base, out_base,
@@ -2193,11 +2190,11 @@ int of_map_id(const struct device_node *np, u32 id,
 	}
 
 	pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name,
-		id, target && *target ? *target : NULL);
+		id, filter_np);
 
 	/* Bypasses translation */
-	if (id_out)
-		*id_out = id;
+	arg->args[0] = id;
+	arg->args_count = 1;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(of_map_id);
@@ -2207,17 +2204,19 @@ EXPORT_SYMBOL_GPL(of_map_id);
  * @np: root complex device node.
  * @id: Requester ID of the device (e.g. PCI RID/BDF or a platform
  *      stream/device ID) used as the lookup key in the iommu-map table.
- * @target: optional pointer to a target device node.
- * @id_out: optional pointer to receive the translated ID.
+ * @arg: pointer to a &struct of_phandle_args for the result. On success,
+ *	@arg->args[0] contains the translated ID. If a map entry was matched,
+ *	@arg->np holds a reference to the target node that the caller must
+ *	release with of_node_put().
  *
  * Convenience wrapper around of_map_id() using "iommu-map" and "iommu-map-mask".
  *
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_iommu_id(const struct device_node *np, u32 id,
-		    struct device_node **target, u32 *id_out)
+		    struct of_phandle_args *arg)
 {
-	return of_map_id(np, id, "iommu-map", "iommu-map-mask", target, id_out);
+	return of_map_id(np, id, "iommu-map", "iommu-map-mask", NULL, arg);
 }
 EXPORT_SYMBOL_GPL(of_map_iommu_id);
 
@@ -2226,16 +2225,21 @@ EXPORT_SYMBOL_GPL(of_map_iommu_id);
  * @np: root complex device node.
  * @id: Requester ID of the device (e.g. PCI RID/BDF or a platform
  *      stream/device ID) used as the lookup key in the msi-map table.
- * @target: optional pointer to a target device node.
- * @id_out: optional pointer to receive the translated ID.
+ * @filter_np: optional MSI controller node to filter matches by, or NULL
+ *	to match any. If non-NULL, only map entries targeting this node will
+ *	be matched.
+ * @arg: pointer to a &struct of_phandle_args for the result. On success,
+ *	@arg->args[0] contains the translated ID. If a map entry was matched,
+ *	@arg->np holds a reference to the target node that the caller must
+ *	release with of_node_put().
  *
  * Convenience wrapper around of_map_id() using "msi-map" and "msi-map-mask".
  *
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_msi_id(const struct device_node *np, u32 id,
-		  struct device_node **target, u32 *id_out)
+		  const struct device_node *filter_np, struct of_phandle_args *arg)
 {
-	return of_map_id(np, id, "msi-map", "msi-map-mask", target, id_out);
+	return of_map_id(np, id, "msi-map", "msi-map-mask", filter_np, arg);
 }
 EXPORT_SYMBOL_GPL(of_map_msi_id);
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index e37c1b3f8736..f86a56bd81fc 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -817,8 +817,16 @@ u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in)
 	 * "msi-map" or an "msi-parent" property.
 	 */
 	for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent) {
-		if (!of_map_msi_id(parent_dev->of_node, id_in, msi_np, &id_out))
+		struct of_phandle_args msi_spec = {};
+
+		if (!of_map_msi_id(parent_dev->of_node, id_in, *msi_np, &msi_spec)) {
+			id_out = msi_spec.args[0];
+			if (!*msi_np)
+				*msi_np = msi_spec.np;
+			else
+				of_node_put(msi_spec.np);
 			break;
+		}
 		if (!of_check_msi_parent(parent_dev->of_node, msi_np))
 			break;
 	}
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index bff8289f804a..c0544d9c0921 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -1137,30 +1137,32 @@ static void imx_pcie_remove_lut(struct imx_pcie *imx_pcie, u16 rid)
 
 static int imx_pcie_add_lut_by_rid(struct imx_pcie *imx_pcie, u32 rid)
 {
+	struct of_phandle_args iommu_spec = {};
+	struct of_phandle_args msi_spec = {};
 	struct device *dev = imx_pcie->pci->dev;
-	struct device_node *target;
 	u32 sid_i, sid_m;
 	int err_i, err_m;
 	u32 sid = 0;
 
-	target = NULL;
-	err_i = of_map_iommu_id(dev->of_node, rid, &target, &sid_i);
-	if (target) {
-		of_node_put(target);
-	} else {
+	err_i = of_map_iommu_id(dev->of_node, rid, &iommu_spec);
+	if (!err_i)
+		sid_i = iommu_spec.args[0];
+	of_node_put(iommu_spec.np);
+	if (!err_i && !iommu_spec.np) {
 		/*
-		 * "target == NULL && err_i == 0" means RID out of map range.
-		 * Use 1:1 map RID to streamID. Hardware can't support this
-		 * because the streamID is only 6 bits
+		 * "iommu_spec.np == NULL && err_i == 0" means RID out of map
+		 * range. Use 1:1 map RID to streamID. Hardware can't support
+		 * this because the streamID is only 6 bits.
 		 */
 		err_i = -EINVAL;
 	}
 
-	target = NULL;
-	err_m = of_map_msi_id(dev->of_node, rid, &target, &sid_m);
-
+	err_m = of_map_msi_id(dev->of_node, rid, NULL, &msi_spec);
+	if (!err_m)
+		sid_m = msi_spec.args[0];
+	of_node_put(msi_spec.np);
 	/*
-	 *   err_m      target
+	 *   err_m      msi_spec.np
 	 *	0	NULL		RID out of range. Use 1:1 map RID to
 	 *				streamID, Current hardware can't
 	 *				support it, so return -EINVAL.
@@ -1168,10 +1170,8 @@ static int imx_pcie_add_lut_by_rid(struct imx_pcie *imx_pcie, u32 rid)
 	 *	0	!= NULL		Get correct streamID from RID
 	 *	!= 0	!= NULL		Invalid combination
 	 */
-	if (!err_m && !target)
+	if (!err_m && !msi_spec.np)
 		return -EINVAL;
-	else if (target)
-		of_node_put(target); /* Find streamID map entry for RID in msi-map */
 
 	/*
 	 * msi-map        iommu-map
diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c
index a0937b7b3c4d..c2cffc0659f4 100644
--- a/drivers/pci/controller/pcie-apple.c
+++ b/drivers/pci/controller/pcie-apple.c
@@ -755,6 +755,7 @@ static int apple_pcie_enable_device(struct pci_host_bridge *bridge, struct pci_d
 {
 	u32 sid, rid = pci_dev_id(pdev);
 	struct apple_pcie_port *port;
+	struct of_phandle_args iommu_spec = {};
 	int idx, err;
 
 	port = apple_pcie_get_port(pdev);
@@ -764,10 +765,12 @@ static int apple_pcie_enable_device(struct pci_host_bridge *bridge, struct pci_d
 	dev_dbg(&pdev->dev, "added to bus %s, index %d\n",
 		pci_name(pdev->bus->self), port->idx);
 
-	err = of_map_iommu_id(port->pcie->dev->of_node, rid, NULL, &sid);
+	err = of_map_iommu_id(port->pcie->dev->of_node, rid, &iommu_spec);
 	if (err)
 		return err;
 
+	of_node_put(iommu_spec.np);
+	sid = iommu_spec.args[0];
 	mutex_lock(&port->pcie->lock);
 
 	idx = bitmap_find_free_region(port->sid_map, port->sid_map_sz, 0);
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index 1b7696b2d762..2aa1a772a0ff 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -319,13 +319,13 @@ static int xen_dt_grant_init_backend_domid(struct device *dev,
 					   struct device_node *np,
 					   domid_t *backend_domid)
 {
-	struct of_phandle_args iommu_spec = { .args_count = 1 };
+	struct of_phandle_args iommu_spec = {};
 
 	if (dev_is_pci(dev)) {
 		struct pci_dev *pdev = to_pci_dev(dev);
 		u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
 
-		if (of_map_iommu_id(np, rid, &iommu_spec.np, iommu_spec.args)) {
+		if (of_map_iommu_id(np, rid, &iommu_spec)) {
 			dev_dbg(dev, "Cannot translate ID\n");
 			return -ESRCH;
 		}
diff --git a/include/linux/of.h b/include/linux/of.h
index fe841f3cc747..8548cd9eb4f1 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -463,13 +463,13 @@ bool of_console_check(const struct device_node *dn, char *name, int index);
 
 int of_map_id(const struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
-	       struct device_node **target, u32 *id_out);
+	       const struct device_node *filter_np, struct of_phandle_args *arg);
 
 int of_map_iommu_id(const struct device_node *np, u32 id,
-		    struct device_node **target, u32 *id_out);
+		    struct of_phandle_args *arg);
 
 int of_map_msi_id(const struct device_node *np, u32 id,
-		  struct device_node **target, u32 *id_out);
+		  const struct device_node *filter_np, struct of_phandle_args *arg);
 
 phys_addr_t of_dma_get_max_cpu_address(struct device_node *np);
 
@@ -935,19 +935,21 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 
 static inline int of_map_id(const struct device_node *np, u32 id,
 			     const char *map_name, const char *map_mask_name,
-			     struct device_node **target, u32 *id_out)
+			     const struct device_node *filter_np,
+			     struct of_phandle_args *arg)
 {
 	return -EINVAL;
 }
 
 static inline int of_map_iommu_id(const struct device_node *np, u32 id,
-				  struct device_node **target, u32 *id_out)
+				  struct of_phandle_args *arg)
 {
 	return -EINVAL;
 }
 
 static inline int of_map_msi_id(const struct device_node *np, u32 id,
-				struct device_node **target, u32 *id_out)
+				const struct device_node *filter_np,
+				struct of_phandle_args *arg)
 {
 	return -EINVAL;
 }

-- 
2.34.1



^ permalink raw reply related

* [PATCH v13 3/3] of: Respect #{iommu,msi}-cells in maps
From: Vijayanand Jitta @ 2026-04-08 10:03 UTC (permalink / raw)
  To: Nipun Gupta, Nikhil Agarwal, Joerg Roedel, Will Deacon,
	Robin Murphy, Marc Zyngier, Lorenzo Pieralisi, Thomas Gleixner,
	Saravana Kannan, Richard Zhu, Lucas Stach,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Bjorn Helgaas,
	Frank Li, Sascha Hauer, Pengutronix Kernel Team, Fabio Estevam,
	Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Dmitry Baryshkov, Konrad Dybcio, Bjorn Andersson, Rob Herring,
	Conor Dooley, Krzysztof Kozlowski, Prakash Gupta, Vikash Garodia
  Cc: linux-kernel, iommu, linux-arm-kernel, devicetree, linux-pci, imx,
	xen-devel, linux-arm-msm, Vijayanand Jitta, Charan Teja Kalla
In-Reply-To: <20260408-parse_iommu_cells-v13-0-fa921e92661b@oss.qualcomm.com>

From: Robin Murphy <robin.murphy@arm.com>

So far our parsing of {iommu,msi}-map properties has always blindly
assumed that the output specifiers will always have exactly 1 cell.
This typically does happen to be the case, but is not actually enforced
(and the PCI msi-map binding even explicitly states support for 0 or 1
cells) - as a result we've now ended up with dodgy DTs out in the field
which depend on this behaviour to map a 1-cell specifier for a 2-cell
provider, despite that being bogus per the bindings themselves.

Since there is some potential use in being able to map at least single
input IDs to multi-cell output specifiers (and properly support 0-cell
outputs as well), add support for properly parsing and using the target
nodes' #cells values, albeit with the unfortunate complication of still
having to work around expectations of the old behaviour too.

Since there are multi-cell output specifiers, the callers of of_map_id()
may need to get the exact cell output value for further processing.
Update of_map_id() to set args_count in the output to reflect the actual
number of output specifier cells.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Charan Teja Kalla <charan.kalla@oss.qualcomm.com>
Signed-off-by: Vijayanand Jitta <vijayanand.jitta@oss.qualcomm.com>
---
 drivers/of/base.c  | 157 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/of.h |   6 +-
 2 files changed, 125 insertions(+), 38 deletions(-)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index b3d002015192..2554e4f1a181 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2096,18 +2096,48 @@ int of_find_last_cache_level(unsigned int cpu)
 	return cache_level;
 }
 
+/*
+ * Some DTs have an iommu-map targeting a 2-cell IOMMU node while
+ * specifying only 1 cell. Fortunately they all consist of value '1'
+ * as the 2nd cell entry with the same target, so check for that pattern.
+ *
+ * Example:
+ *	IOMMU node:
+ *		#iommu-cells = <2>;
+ *
+ *	Device node:
+ *		iommu-map = <0x0000 &smmu 0x0000 0x1>,
+ *			    <0x0100 &smmu 0x0100 0x1>;
+ */
+static bool of_check_bad_map(const __be32 *map, int len)
+{
+	__be32 phandle = map[1];
+
+	if (len % 4)
+		return false;
+	for (int i = 0; i < len; i += 4) {
+		if (map[i + 1] != phandle || map[i + 3] != cpu_to_be32(1))
+			return false;
+	}
+	return true;
+}
+
 /**
  * of_map_id - Translate an ID through a downstream mapping.
  * @np: root complex device node.
  * @id: device ID to map.
  * @map_name: property name of the map to use.
+ * @cells_name: property name of target specifier cells.
  * @map_mask_name: optional property name of the mask to use.
  * @filter_np: optional device node to filter matches by, or NULL to match any.
  *	If non-NULL, only map entries targeting this node will be matched.
  * @arg: pointer to a &struct of_phandle_args for the result. On success,
- *	@arg->args[0] will contain the translated ID. If a map entry was
- *	matched, @arg->np will be set to the target node with a reference
- *	held that the caller must release with of_node_put().
+ *	@arg->args_count will be set to the number of output specifier cells
+ *	as defined by @cells_name in the target node, and
+ *	@arg->args[0..args_count-1] will contain the translated output
+ *	specifier values. If a map entry was matched, @arg->np will be set
+ *	to the target node with a reference held that the caller must release
+ *	with of_node_put().
  *
  * Given a device ID, look up the appropriate implementation-defined
  * platform ID and/or the target device which receives transactions on that
@@ -2116,17 +2146,19 @@ int of_find_last_cache_level(unsigned int cpu)
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_id(const struct device_node *np, u32 id,
-	       const char *map_name, const char *map_mask_name,
+	       const char *map_name, const char *cells_name,
+	       const char *map_mask_name,
 	       const struct device_node *filter_np, struct of_phandle_args *arg)
 {
 	u32 map_mask, masked_id;
-	int map_len;
+	int map_bytes, map_len, offset = 0;
+	bool bad_map = false;
 	const __be32 *map = NULL;
 
 	if (!np || !map_name || !arg)
 		return -EINVAL;
 
-	map = of_get_property(np, map_name, &map_len);
+	map = of_get_property(np, map_name, &map_bytes);
 	if (!map) {
 		if (filter_np)
 			return -ENODEV;
@@ -2136,11 +2168,9 @@ int of_map_id(const struct device_node *np, u32 id,
 		return 0;
 	}
 
-	if (!map_len || map_len % (4 * sizeof(*map))) {
-		pr_err("%pOF: Error: Bad %s length: %d\n", np,
-			map_name, map_len);
-		return -EINVAL;
-	}
+	if (map_bytes % sizeof(*map))
+		goto err_map_len;
+	map_len = map_bytes / sizeof(*map);
 
 	/* The default is to select all bits. */
 	map_mask = 0xffffffff;
@@ -2153,39 +2183,84 @@ int of_map_id(const struct device_node *np, u32 id,
 		of_property_read_u32(np, map_mask_name, &map_mask);
 
 	masked_id = map_mask & id;
-	for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
+
+	while (offset < map_len) {
 		struct device_node *phandle_node;
-		u32 id_base = be32_to_cpup(map + 0);
-		u32 phandle = be32_to_cpup(map + 1);
-		u32 out_base = be32_to_cpup(map + 2);
-		u32 id_len = be32_to_cpup(map + 3);
+		u32 id_base, phandle, id_len, id_off, cells = 0;
+		const __be32 *out_base;
+
+		if (map_len - offset < 2)
+			goto err_map_len;
+
+		id_base = be32_to_cpup(map + offset);
 
 		if (id_base & ~map_mask) {
-			pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n",
-				np, map_name, map_name,
-				map_mask, id_base);
+			pr_err("%pOF: Invalid %s translation - %s (0x%x) ignores id-base (0x%x)\n",
+			       np, map_name, map_mask_name, map_mask, id_base);
 			return -EFAULT;
 		}
 
-		if (masked_id < id_base || masked_id >= id_base + id_len)
-			continue;
-
+		phandle = be32_to_cpup(map + offset + 1);
 		phandle_node = of_find_node_by_phandle(phandle);
 		if (!phandle_node)
 			return -ENODEV;
 
+		if (bad_map) {
+			cells = 1;
+		} else if (of_property_read_u32(phandle_node, cells_name, &cells)) {
+			pr_err("%pOF: missing %s property\n", phandle_node, cells_name);
+			of_node_put(phandle_node);
+			return -EINVAL;
+		}
+
+		if (map_len - offset < 3 + cells) {
+			of_node_put(phandle_node);
+			goto err_map_len;
+		}
+
+		if (offset == 0 && cells == 2) {
+			bad_map = of_check_bad_map(map, map_len);
+			if (bad_map) {
+				pr_warn_once("%pOF: %s mismatches target %s, assuming extra cell of 0\n",
+					     np, map_name, cells_name);
+				cells = 1;
+			}
+		}
+
+		out_base = map + offset + 2;
+		offset += 3 + cells;
+
+		id_len = be32_to_cpup(map + offset - 1);
+		if (id_len > 1 && cells > 1) {
+			/*
+			 * With 1 output cell we reasonably assume its value
+			 * has a linear relationship to the input; with more,
+			 * we'd need help from the provider to know what to do.
+			 */
+			pr_err("%pOF: Unsupported %s - cannot handle %d-ID range with %d-cell output specifier\n",
+			       np, map_name, id_len, cells);
+			of_node_put(phandle_node);
+			return -EINVAL;
+		}
+		id_off = masked_id - id_base;
+		if (masked_id < id_base || id_off >= id_len) {
+			of_node_put(phandle_node);
+			continue;
+		}
+
 		if (filter_np && filter_np != phandle_node) {
 			of_node_put(phandle_node);
 			continue;
 		}
 
 		arg->np = phandle_node;
-		arg->args[0] = masked_id - id_base + out_base;
-		arg->args_count = 1;
+		for (int i = 0; i < cells; i++)
+			arg->args[i] = id_off + be32_to_cpu(out_base[i]);
+		arg->args_count = cells;
 
 		pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n",
-			np, map_name, map_mask, id_base, out_base,
-			id_len, id, masked_id - id_base + out_base);
+			np, map_name, map_mask, id_base, be32_to_cpup(out_base),
+			id_len, id, id_off + be32_to_cpup(out_base));
 		return 0;
 	}
 
@@ -2196,6 +2271,10 @@ int of_map_id(const struct device_node *np, u32 id,
 	arg->args[0] = id;
 	arg->args_count = 1;
 	return 0;
+
+err_map_len:
+	pr_err("%pOF: Error: Bad %s length: %d\n", np, map_name, map_bytes);
+	return -EINVAL;
 }
 EXPORT_SYMBOL_GPL(of_map_id);
 
@@ -2205,18 +2284,21 @@ EXPORT_SYMBOL_GPL(of_map_id);
  * @id: Requester ID of the device (e.g. PCI RID/BDF or a platform
  *      stream/device ID) used as the lookup key in the iommu-map table.
  * @arg: pointer to a &struct of_phandle_args for the result. On success,
- *	@arg->args[0] contains the translated ID. If a map entry was matched,
- *	@arg->np holds a reference to the target node that the caller must
- *	release with of_node_put().
+ *	@arg->args_count will be set to the number of output specifier cells
+ *	and @arg->args[0..args_count-1] will contain the translated output
+ *	specifier values. If a map entry was matched, @arg->np holds a
+ *	reference to the target node that the caller must release with
+ *	of_node_put().
  *
- * Convenience wrapper around of_map_id() using "iommu-map" and "iommu-map-mask".
+ * Convenience wrapper around of_map_id() using "iommu-map", "#iommu-cells",
+ * and "iommu-map-mask".
  *
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_iommu_id(const struct device_node *np, u32 id,
 		    struct of_phandle_args *arg)
 {
-	return of_map_id(np, id, "iommu-map", "iommu-map-mask", NULL, arg);
+	return of_map_id(np, id, "iommu-map", "#iommu-cells", "iommu-map-mask", NULL, arg);
 }
 EXPORT_SYMBOL_GPL(of_map_iommu_id);
 
@@ -2229,17 +2311,20 @@ EXPORT_SYMBOL_GPL(of_map_iommu_id);
  *	to match any. If non-NULL, only map entries targeting this node will
  *	be matched.
  * @arg: pointer to a &struct of_phandle_args for the result. On success,
- *	@arg->args[0] contains the translated ID. If a map entry was matched,
- *	@arg->np holds a reference to the target node that the caller must
- *	release with of_node_put().
+ *	@arg->args_count will be set to the number of output specifier cells
+ *	and @arg->args[0..args_count-1] will contain the translated output
+ *	specifier values. If a map entry was matched, @arg->np holds a
+ *	reference to the target node that the caller must release with
+ *	of_node_put().
  *
- * Convenience wrapper around of_map_id() using "msi-map" and "msi-map-mask".
+ * Convenience wrapper around of_map_id() using "msi-map", "#msi-cells",
+ * and "msi-map-mask".
  *
  * Return: 0 on success or a standard error code on failure.
  */
 int of_map_msi_id(const struct device_node *np, u32 id,
 		  const struct device_node *filter_np, struct of_phandle_args *arg)
 {
-	return of_map_id(np, id, "msi-map", "msi-map-mask", filter_np, arg);
+	return of_map_id(np, id, "msi-map", "#msi-cells", "msi-map-mask", filter_np, arg);
 }
 EXPORT_SYMBOL_GPL(of_map_msi_id);
diff --git a/include/linux/of.h b/include/linux/of.h
index 8548cd9eb4f1..51ac8539f2c3 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -462,7 +462,8 @@ const char *of_prop_next_string(const struct property *prop, const char *cur);
 bool of_console_check(const struct device_node *dn, char *name, int index);
 
 int of_map_id(const struct device_node *np, u32 id,
-	       const char *map_name, const char *map_mask_name,
+	       const char *map_name, const char *cells_name,
+	       const char *map_mask_name,
 	       const struct device_node *filter_np, struct of_phandle_args *arg);
 
 int of_map_iommu_id(const struct device_node *np, u32 id,
@@ -934,7 +935,8 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 }
 
 static inline int of_map_id(const struct device_node *np, u32 id,
-			     const char *map_name, const char *map_mask_name,
+			     const char *map_name, const char *cells_name,
+			     const char *map_mask_name,
 			     const struct device_node *filter_np,
 			     struct of_phandle_args *arg)
 {

-- 
2.34.1



^ permalink raw reply related

* Re: [PATCH v12 2/2] arm: dts: aspeed: ventura: add Meta Ventura BMC
From: P.K. Lee @ 2026-04-08 10:06 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: robh+dt, krzysztof.kozlowski+dt, conor+dt, joel, andrew,
	devicetree, linux-arm-kernel, linux-aspeed, linux-kernel,
	Jason-Hsu, p.k.lee
In-Reply-To: <435616b8-8d4c-4814-8f21-d667755473f1@lunn.ch>

> > > > +&mac3 {
> > > > +     status = "okay";
> > > > +     phy-mode = "rmii";
> > > > +     pinctrl-names = "default";
> > > > +     pinctrl-0 = <&pinctrl_rmii4_default>;
> > > > +     fixed-link {
> > > > +             speed = <100>;
> > > > +             full-duplex;
> > > > +     };
> > >
> > > What is on the other end of this fixed link?
> >
> > The other end of this fixed link is the CPU port of a Marvell 88E6393X
> > switch. We are using this switch in unmanaged mode rather than using
> > the DSA subsystem. Therefore, we use a fixed-link to force the mac3 to
> > 100Mbps full-duplex RMII to match the CPU port configuration.
>
> You are mixing up terms. The 88E6393X does not have a dedicated port
> for connecting to the host CPU. Any port can be connected to the host,
> using DSA tags. And all the ports are 1G or faster, so it seems odd to
> limit it to 100Mbps. There is something consider a CPU port, but that
> connects the internal Z80 CPU to the switch fabric.
>

I apologize for the confusing terminology. I meant the port 0 of the
88E6393X is connected to the AST2600 mac3. Regarding the 100Mbps RMII
limitation, this is a strict hardware design constraint on our
specific board.

> > > > +};
> > > > +
> > > > +&mdio0 {
> > > > +     status = "okay";
> > > > +};
> > >
> > > If there are no devices on the bus, why enable it?
> >
> > We intentionally enable it so user-space tools can access the switch
> > registers. I have added a comment in v13 to clarify this.
>
> Why would user space want to access the switch registers for an
> unmanaged switch? It sounds like you are using Marvells SDK in
> userspace to manage the switch, rather than using DSA.
>

We do have a custom user-space daemon that configures the switch
registers for our specific use case. Should I remove the &mdio0 node
if it is only enabled and has no other configuration in the upstream
device tree?

P.K. Lee


^ permalink raw reply

* [PATCH] clk: clk-imx8mm: Initialize clocks in arch_initcall
From: Paul Geurts @ 2026-04-08 10:13 UTC (permalink / raw)
  To: abelvesa, peng.fan, mturquette, sboyd, Frank.Li, s.hauer, kernel,
	festevam, shawnguo, linux-clk, imx, linux-arm-kernel,
	linux-kernel
  Cc: martijn.de.gouw, Paul Geurts

The i.MX8MM clock driver is implemented as module_platform_driver();,
which makes it initialize in device_initcall(). This means that all
drivers referencing the clock driver nodes in the device tree are
deferred by fw_devlink, which are most of the i.MX8M platform drivers.

Explicitly initialize the clock driver in arch_initcall(), to make sure
the clock driver is ready when the rest of the drivers are probed.

Fixes: af7e7ee0e428 ("clk: imx8mm: Switch to platform driver")
Signed-off-by: Paul Geurts <paul.geurts@prodrive-technologies.com>
---
 drivers/clk/imx/clk-imx8mm.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/imx/clk-imx8mm.c b/drivers/clk/imx/clk-imx8mm.c
index 319af4deec01..7b2cf867b920 100644
--- a/drivers/clk/imx/clk-imx8mm.c
+++ b/drivers/clk/imx/clk-imx8mm.c
@@ -636,7 +636,19 @@ static struct platform_driver imx8mm_clk_driver = {
 		.of_match_table = imx8mm_clk_of_match,
 	},
 };
-module_platform_driver(imx8mm_clk_driver);
+
+static int __init imx8mm_clk_init(void)
+{
+	return platform_driver_register(&imx8mm_clk_driver);
+}
+arch_initcall(imx8mm_clk_init);
+
+static void __exit imx8mm_clk_exit(void)
+{
+	platform_driver_unregister(&imx8mm_clk_driver);
+}
+module_exit(imx8mm_clk_exit);
+
 module_param(mcore_booted, bool, S_IRUGO);
 MODULE_PARM_DESC(mcore_booted, "See Cortex-M core is booted or not");
 
-- 
2.39.2



^ permalink raw reply related

* Re: [PATCH 00/10] arm64/entry:
From: Thomas Gleixner @ 2026-04-08 10:14 UTC (permalink / raw)
  To: Catalin Marinas, Mark Rutland
  Cc: vladimir.murzin, Peter Zijlstra, ruanjinjie, linux-kernel,
	Andy Lutomirski, Will Deacon, linux-arm-kernel
In-Reply-To: <adYaj8FX_PVKdkdZ@arm.com>

On Wed, Apr 08 2026 at 10:06, Catalin Marinas wrote:
> On Wed, Apr 08, 2026 at 10:02:28AM +0100, Mark Rutland wrote:
>> > Does that work for you?
>> 
>> That sounds good to me.
>> 
>> Catalin, Will, does that work for you?
>
> Yes, it does. Thanks!

Here you go:

  git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git entry-for-arm64-26-04-08

Thanks,

        tglx


^ permalink raw reply

* [PATCH] nvmem: imx-ocotp: Initialize in subsys_initcall
From: Paul Geurts @ 2026-04-08 10:19 UTC (permalink / raw)
  To: srini, Frank.Li, s.hauer, kernel, festevam, gregkh, p.zabel, imx,
	linux-arm-kernel, linux-kernel
  Cc: martijn.de.gouw, Paul Geurts

The i.MX OCOTP driver is implemented as module_platform_driver();,
which makes it initialize in device_initcall(). This means that all
drivers referencing the clock driver nodes in the device tree are
deferred by fw_devlink.

As the OCOTP driver is arch specific, but dependent on the i.MX clock
driver, which is also initialized in arch_initcall(), explicitly
initialize the driver in subsys_initcall(). This makes sure the drivers
depending on fuses defined by OCOTP, which are initialized in
device_initcall() are not deferred.

Fixes: 3edba6b47e42 ("nvmem: imx-ocotp: Add i.MX6 OCOTP driver")
Signed-off-by: Paul Geurts <paul.geurts@prodrive-technologies.com>
---
 drivers/nvmem/imx-ocotp.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
index 108d78d7f6cb..9b1e7bb14ced 100644
--- a/drivers/nvmem/imx-ocotp.c
+++ b/drivers/nvmem/imx-ocotp.c
@@ -638,7 +638,18 @@ static struct platform_driver imx_ocotp_driver = {
 		.of_match_table = imx_ocotp_dt_ids,
 	},
 };
-module_platform_driver(imx_ocotp_driver);
+
+static int __init imx_ocotp_init(void)
+{
+	return platform_driver_register(&imx_ocotp_driver);
+}
+subsys_initcall(imx_ocotp_init);
+
+static void __exit imx_ocotp_exit(void)
+{
+	platform_driver_unregister(&imx_ocotp_driver);
+}
+module_exit(imx_ocotp_exit);
 
 MODULE_AUTHOR("Philipp Zabel <p.zabel@pengutronix.de>");
 MODULE_DESCRIPTION("i.MX6/i.MX7 OCOTP fuse box driver");
-- 
2.39.2



^ permalink raw reply related

* Re: [PATCH 05/10] entry: Split preemption from irqentry_exit_to_kernel_mode()
From: Mark Rutland @ 2026-04-08 10:19 UTC (permalink / raw)
  To: Jinjie Ruan
  Cc: vladimir.murzin, Peter Zijlstra, catalin.marinas, linux-kernel,
	Thomas Gleixner, Andy Lutomirski, will, linux-arm-kernel
In-Reply-To: <2d647257-f14b-efac-0d46-ef8aa643393d@huawei.com>

On Wed, Apr 08, 2026 at 05:17:29PM +0800, Jinjie Ruan wrote:
> 
> 
> On 2026/4/7 21:16, Mark Rutland wrote:
> > Some architecture-specific work needs to be performed between the state
> > management for exception entry/exit and the "real" work to handle the
> > exception. For example, arm64 needs to manipulate a number of exception
> > masking bits, with different exceptions requiring different masking.
> > 
> > Generally this can all be hidden in the architecture code, but for arm64
> > the current structure of irqentry_exit_to_kernel_mode() makes this
> > particularly difficult to handle in a way that is correct, maintainable,
> > and efficient.
> > 
> > The gory details are described in the thread surrounding:
> > 
> >   https://lore.kernel.org/lkml/acPAzdtjK5w-rNqC@J2N7QTR9R3/
> > 
> > The summary is:
> > 
> > * Currently, irqentry_exit_to_kernel_mode() handles both involuntary
> >   preemption AND state management necessary for exception return.
> > 
> > * When scheduling (including involuntary preemption), arm64 needs to
> >   have all arm64-specific exceptions unmasked, though regular interrupts
> >   must be masked.
> > 
> > * Prior to the state management for exception return, arm64 needs to
> >   mask a number of arm64-specific exceptions, and perform some work with
> >   these exceptions masked (with RCU watching, etc).
> > 
> > While in theory it is possible to handle this with a new arch_*() hook
> > called somewhere under irqentry_exit_to_kernel_mode(), this is fragile
> > and complicated, and doesn't match the flow used for exception return to
> > user mode, which has a separate 'prepare' step (where preemption can
> > occur) prior to the state management.
> > 
> > To solve this, refactor irqentry_exit_to_kernel_mode() to match the
> > style of {irqentry,syscall}_exit_to_user_mode(), moving preemption logic
> > into a new irqentry_exit_to_kernel_mode_preempt() function, and moving
> > state management in a new irqentry_exit_to_kernel_mode_after_preempt()
> > function. The existing irqentry_exit_to_kernel_mode() is left as a
> > caller of both of these, avoiding the need to modify existing callers.
> > 
> > There should be no functional change as a result of this patch.
> > 
> > Signed-off-by: Mark Rutland <mark.rutland@arm.com>
> > Cc: Andy Lutomirski <luto@kernel.org>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Jinjie Ruan <ruanjinjie@huawei.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@kernel.org>
> > Cc: Vladimir Murzin <vladimir.murzin@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > ---
> >  include/linux/irq-entry-common.h | 26 +++++++++++++++++++++-----
> >  1 file changed, 21 insertions(+), 5 deletions(-)
> > 
> > Thomas/Peter/Andy, as mentioned on IRC, I haven't created kerneldoc
> > comments for these new functions because the existing comments don't
> > seem all that consistent (e.g. for user mode vs kernel mode), and I
> > suspect we want to rewrite them all in one go for wider consistency.
> > 
> > I'm happy to respin this, or to follow-up with that as per your
> > preference.
> > 
> > Mark.
> > 
> > diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
> > index 2206150e526d8..24830baa539c6 100644
> > --- a/include/linux/irq-entry-common.h
> > +++ b/include/linux/irq-entry-common.h
> > @@ -421,10 +421,18 @@ static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct p
> >  	return ret;
> >  }
> >  
> > -static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, irqentry_state_t state)
> > +static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs, irqentry_state_t state)
> >  {
> > -	lockdep_assert_irqs_disabled();
> > +	if (regs_irqs_disabled(regs) || state.exit_rcu)
> > +		return;
> > +
> > +	if (IS_ENABLED(CONFIG_PREEMPTION))
> > +		irqentry_exit_cond_resched();
> > +}
> >  
> > +static __always_inline void
> > +irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state)
> > +{
> >  	if (!regs_irqs_disabled(regs)) {
> >  		/*
> >  		 * If RCU was not watching on entry this needs to be done
> > @@ -443,9 +451,6 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, i
> >  		}
> >  
> >  		instrumentation_begin();
> > -		if (IS_ENABLED(CONFIG_PREEMPTION))
> > -			irqentry_exit_cond_resched();
> > -
> >  		/* Covers both tracing and lockdep */
> >  		trace_hardirqs_on();
> >  		instrumentation_end();
> > @@ -459,6 +464,17 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, i
> >  	}
> >  }
> >  
> > +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, irqentry_state_t state)
> > +{
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	instrumentation_begin();
> > +	irqentry_exit_to_kernel_mode_preempt(regs, state);
> > +	instrumentation_end();
> 
> I think the below AI's feedback makes sense. Directly calling
> irqentry_exit_to_kernel_mode_preempt() on arm64/other archs could lead
> to missing instrumentation_begin()/end() markers.
> 
> https://sashiko.dev/#/patchset/20260407131650.3813777-1-mark.rutland%40arm.com

I deliberartely made irqentry_exit_to_kernel_mode_preempt 'inline'
rather than '__always_inline' since everything it does is
instrumentable, and it's up to architecture code to handle that
appropriately.

On arm64 instrumentation_begin() and instrumentation_end() are currently
irrelevant. I didn't add those in the arm64-specific entry code as
they'd simply add pointless NOPs.

This is fine as-is.

Mark.


^ permalink raw reply

* [PATCH net] net: airoha: Fix FE_PSE_BUF_SET configuration if PPE2 is available
From: Lorenzo Bianconi @ 2026-04-08 10:20 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman
  Cc: linux-arm-kernel, linux-mediatek, netdev, Xuegang Lu,
	Lorenzo Bianconi

airoha_fe_set routine is used to set specified bits to 1 in the selected
register. In the FE_PSE_BUF_SET case this can due to a overestimation of
the required buffers for I/O queues since we can miss to set some bits
of PSE_ALLRSV_MASK subfield to 0. Fix the issue relying on airoha_fe_rmw
routine instead.

Fixes: 8e38e08f2c560 ("net: airoha: fix PSE memory configuration in airoha_fe_pse_ports_init()")
Tested-by: Xuegang Lu <xuegang.lu@airoha.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/airoha/airoha_eth.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index 91cb63a32d99..c365d693cf40 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -293,16 +293,18 @@ static void airoha_fe_pse_ports_init(struct airoha_eth *eth)
 		[FE_PSE_PORT_GDM4] = 2,
 		[FE_PSE_PORT_CDM5] = 2,
 	};
-	u32 all_rsv;
 	int q;
 
-	all_rsv = airoha_fe_get_pse_all_rsv(eth);
 	if (airoha_ppe_is_enabled(eth, 1)) {
+		u32 all_rsv;
+
 		/* hw misses PPE2 oq rsv */
+		all_rsv = airoha_fe_get_pse_all_rsv(eth);
 		all_rsv += PSE_RSV_PAGES *
 			   pse_port_num_queues[FE_PSE_PORT_PPE2];
+		airoha_fe_rmw(eth, REG_FE_PSE_BUF_SET, PSE_ALLRSV_MASK,
+			      FIELD_PREP(PSE_ALLRSV_MASK, all_rsv));
 	}
-	airoha_fe_set(eth, REG_FE_PSE_BUF_SET, all_rsv);
 
 	/* CMD1 */
 	for (q = 0; q < pse_port_num_queues[FE_PSE_PORT_CDM1]; q++)

---
base-commit: f821664dde29302e8450aa0597bf1e4c7c5b0a22
change-id: 20260408-airoha-reg_fe_pse_buf_set-39a75edb52ca

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>



^ permalink raw reply related

* Re: [PATCH] clk: clk-imx8mm: Initialize clocks in arch_initcall
From: Ahmad Fatoum @ 2026-04-08 10:21 UTC (permalink / raw)
  To: Paul Geurts, abelvesa, peng.fan, mturquette, sboyd, Frank.Li,
	s.hauer, kernel, festevam, shawnguo, linux-clk, imx,
	linux-arm-kernel, linux-kernel
  Cc: martijn.de.gouw
In-Reply-To: <20260408101313.2082125-1-paul.geurts@prodrive-technologies.com>

Hello Paul,

On 4/8/26 12:13 PM, Paul Geurts wrote:
> The i.MX8MM clock driver is implemented as module_platform_driver();,
> which makes it initialize in device_initcall(). This means that all
> drivers referencing the clock driver nodes in the device tree are
> deferred by fw_devlink, which are most of the i.MX8M platform drivers.
> 
> Explicitly initialize the clock driver in arch_initcall(), to make sure
> the clock driver is ready when the rest of the drivers are probed.
> 
> Fixes: af7e7ee0e428 ("clk: imx8mm: Switch to platform driver")

Your commit message doesn't explain why this was a problem for you.
Does it delay your boot? What makes this patch a fix?

> Signed-off-by: Paul Geurts <paul.geurts@prodrive-technologies.com>
> ---
>  drivers/clk/imx/clk-imx8mm.c | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/clk/imx/clk-imx8mm.c b/drivers/clk/imx/clk-imx8mm.c
> index 319af4deec01..7b2cf867b920 100644
> --- a/drivers/clk/imx/clk-imx8mm.c
> +++ b/drivers/clk/imx/clk-imx8mm.c
> @@ -636,7 +636,19 @@ static struct platform_driver imx8mm_clk_driver = {
>  		.of_match_table = imx8mm_clk_of_match,
>  	},
>  };
> -module_platform_driver(imx8mm_clk_driver);
> +
> +static int __init imx8mm_clk_init(void)
> +{
> +	return platform_driver_register(&imx8mm_clk_driver);
> +}
> +arch_initcall(imx8mm_clk_init);

What happens if you build the driver as module with your changes applied?

Cheers,
Ahmad

> +
> +static void __exit imx8mm_clk_exit(void)
> +{
> +	platform_driver_unregister(&imx8mm_clk_driver);
> +}
> +module_exit(imx8mm_clk_exit);
> +
>  module_param(mcore_booted, bool, S_IRUGO);
>  MODULE_PARM_DESC(mcore_booted, "See Cortex-M core is booted or not");
>  

-- 
Pengutronix e.K.                  |                             |
Steuerwalder Str. 21              | http://www.pengutronix.de/  |
31137 Hildesheim, Germany         | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686  | Fax:   +49-5121-206917-5555 |



^ permalink raw reply

* Re: [PATCH] nvmem: imx-ocotp: Initialize in subsys_initcall
From: Ahmad Fatoum @ 2026-04-08 10:23 UTC (permalink / raw)
  To: Paul Geurts, srini, Frank.Li, s.hauer, kernel, festevam, gregkh,
	p.zabel, imx, linux-arm-kernel, linux-kernel
  Cc: martijn.de.gouw
In-Reply-To: <20260408101901.2111140-1-paul.geurts@prodrive-technologies.com>

Hello Paul,

On 4/8/26 12:19 PM, Paul Geurts wrote:
> The i.MX OCOTP driver is implemented as module_platform_driver();,
> which makes it initialize in device_initcall(). This means that all
> drivers referencing the clock driver nodes in the device tree are
> deferred by fw_devlink.
> 
> As the OCOTP driver is arch specific, but dependent on the i.MX clock
> driver, which is also initialized in arch_initcall(), explicitly
> initialize the driver in subsys_initcall(). This makes sure the drivers
> depending on fuses defined by OCOTP, which are initialized in
> device_initcall() are not deferred.
> 
> Fixes: 3edba6b47e42 ("nvmem: imx-ocotp: Add i.MX6 OCOTP driver")

Same question as the imx8mm-clk patch.

Your commit message doesn't mention what problem you had the with deferral.

> -module_platform_driver(imx_ocotp_driver);
> +
> +static int __init imx_ocotp_init(void)
> +{
> +	return platform_driver_register(&imx_ocotp_driver);
> +}
> +subsys_initcall(imx_ocotp_init);

Same question as before. What happens when you build this as module.

Cheers,
Ahmad

> +
> +static void __exit imx_ocotp_exit(void)
> +{
> +	platform_driver_unregister(&imx_ocotp_driver);
> +}
> +module_exit(imx_ocotp_exit);
>  
>  MODULE_AUTHOR("Philipp Zabel <p.zabel@pengutronix.de>");
>  MODULE_DESCRIPTION("i.MX6/i.MX7 OCOTP fuse box driver");

-- 
Pengutronix e.K.                  |                             |
Steuerwalder Str. 21              | http://www.pengutronix.de/  |
31137 Hildesheim, Germany         | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686  | Fax:   +49-5121-206917-5555 |



^ permalink raw reply

* Re: [PATCH 2/3] KVM: arm64: vgic: Allow userspace to set IIDR revision 1
From: David Woodhouse @ 2026-04-08 10:32 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Oliver Upton, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
	Catalin Marinas, Will Deacon, Paolo Bonzini, Shuah Khan,
	Raghavendra Rao Ananta, Eric Auger, Kees Cook, Arnd Bergmann,
	Nathan Chancellor, linux-arm-kernel, kvmarm, linux-kernel, kvm,
	linux-kselftest
In-Reply-To: <87wlyhc2g4.wl-maz@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 985 bytes --]

On Wed, 2026-04-08 at 08:54 +0100, Marc Zyngier wrote:
> 
> > @@ -93,6 +95,9 @@ static int vgic_mmio_uaccess_write_v2_misc(struct
> > kvm_vcpu *vcpu,
> >   		 */
> >   		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
> >   		switch (reg) {
> > +		case KVM_VGIC_IMP_REV_1:
> > +			dist->implementation_rev = reg;
> > +			return 0;
> >   		case KVM_VGIC_IMP_REV_2:
> >   		case KVM_VGIC_IMP_REV_3:
> >   			vcpu->kvm-
> > >arch.vgic.v2_groups_user_writable = true;
> 
> nit: move the v1 handling down with a fallthrough in v2/v3 so that we
> don't duplicate the basic handling:

I think I actually want to rip out the v2_groups_user_writable flag
completely.

It was specifically added in order to allow the actual behaviour to be
inconsistent with the value in the IIDR.

But it doesn't actually stop the *guest* wrong writing; only userspace.

I'll rip it out in a fourth patch when I resend the series (having
addressed your other comments).


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 1/8] arm64/hugetlb: Extend batching of multiple CONT_PTE in a single PTE setup
From: Dev Jain @ 2026-04-08 10:32 UTC (permalink / raw)
  To: Barry Song (Xiaomi), linux-mm, linux-arm-kernel, catalin.marinas,
	will, akpm, urezki
  Cc: linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <20260408025115.27368-2-baohua@kernel.org>



On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
> For sizes aligned to CONT_PTE_SIZE and smaller than PMD_SIZE,
> we can batch CONT_PTE settings instead of handling them individually.
> 
> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
> ---
>  arch/arm64/mm/hugetlbpage.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index a42c05cf5640..bf31c11ebd3b 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -110,6 +110,12 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
>  		contig_ptes = CONT_PTES;
>  		break;
>  	default:
> +		if (size < CONT_PMD_SIZE && size > 0 &&
> +				IS_ALIGNED(size, CONT_PTE_SIZE)) {

Nit: Having the lower bound check before upper bound is natural to
read, so this should be size > 0 && size < CONT_PMD_SIZE (i.e written
the other way around).

Also IS_ALIGNED needs to go below size.


> +			contig_ptes = size >> PAGE_SHIFT;
> +			*pgsize = PAGE_SIZE;
> +			break;
> +		}
>  		WARN_ON(!__hugetlb_valid_size(size));
>  	}
>  
> @@ -359,6 +365,10 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
>  	case CONT_PTE_SIZE:
>  		return pte_mkcont(entry);
>  	default:
> +		if (pagesize < CONT_PMD_SIZE && pagesize > 0 &&
> +				IS_ALIGNED(pagesize, CONT_PTE_SIZE))
> +			return pte_mkcont(entry);
> +
>  		break;
>  	}
>  	pr_warn("%s: unrecognized huge page size 0x%lx\n",



^ permalink raw reply

* Re: [PATCH v13 2/7] qcom-tgu: Add TGU driver
From: Konrad Dybcio @ 2026-04-08 10:42 UTC (permalink / raw)
  To: Songwei Chai, andersson, alexander.shishkin, mike.leach,
	suzuki.poulose, james.clark, krzk+dt, conor+dt
  Cc: linux-kernel, linux-arm-kernel, linux-arm-msm, coresight,
	devicetree, gregkh
In-Reply-To: <20260402092838.341295-3-songwei.chai@oss.qualcomm.com>

On 4/2/26 11:28 AM, Songwei Chai wrote:
> Add driver to support device TGU (Trigger Generation Unit).
> TGU is a Data Engine which can be utilized to sense a plurality of
> signals and create a trigger into the CTI or generate interrupts to
> processors. Add probe/enable/disable functions for tgu.
> 
> Signed-off-by: Songwei Chai <songwei.chai@oss.qualcomm.com>
> ---

Acked-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>

Konrad


^ permalink raw reply

* Re: [PATCH] arm64: dts: imx93-9x9-qsb: Add tianma,tm050rdh03 panel
From: Frank Li @ 2026-04-08 10:50 UTC (permalink / raw)
  To: Liu Ying
  Cc: Sascha Hauer, Pengutronix Kernel Team, Fabio Estevam, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, imx, linux-arm-kernel,
	devicetree, linux-kernel
In-Reply-To: <5ce48659-2c6c-4c60-a8e8-9031bdbaa2a3@nxp.com>

On Wed, Apr 08, 2026 at 04:40:37PM +0800, Liu Ying wrote:
> On Wed, Apr 08, 2026 at 04:28:40AM -0400, Frank Li wrote:
...
> >>>>>
> >>>>> Is it possible to appply this overlay file and kd50g21-40nt-a1 overlay file
> >>>>>
> >>>>> to imx93-9x9-qsb.dtb, so needn't create dtsi.
> >>>>
> >>>> I'm sorry, I don't get your question here.
> >>>> Anyway, the DT overlays are needed, because the 40-pin EXP/PRI interface on
> >>>> the i.MX93 9x9 QSB board can not only connect to a DPI panel adapter board
> >>>> but also to an audio hat[2], and maybe more.  The newly introduced .dtsi
> >>>> file just aims to avoid duplicated code.
> >>>
> >>> My means apply two overlay files to dtb
> >>>
> >>> imx93-9x9-qsb-tianma-tm050rdh03-dtbs += imx93-9x9-qsb.dtb imx93-9x9-qsb-ontat-kd50g21-40nt-a1.dtbo imx93-9x9-qsb-tianma-tm050rdh03.dtbo
>
> This ...
>
> >>>
> >>> In imx93-9x9-qsb-tianma-tm050rdh03.dtbo, only include
> >>> &{/} {
> >>> 	panel {
> >>> 		compatible = "tianma,tm050rdh03";
> >>> 		enable-gpios = <&pcal6524 8 GPIO_ACTIVE_HIGH>;
> >>> 	};
> >>> };
> >>
> >> If an user wants to use imx93-9x9-qsb.dtb and the DT overlay blob
> >> imx93-9x9-qsb-tianma-tm050rdh03.dtbo to enable the tianma,tm050rdh03
> >> DPI panel, then it won't work unless the user also apply
> >> imx93-9x9-qsb-ontat-kd50g21-40nt-a1.dtbo, right?
> >>
> >>>
> >
> > Yes, imx93-9x9-qsb-tianma-tm050rdh03.dtb already created, which already
> > applied both overlay file.
>
> .... indicates that imx93-9x9-qsb-tianma-tm050rdh03.dtb is generated by
> applying both imx93-9x9-qsb-ontat-kd50g21-40nt-a1.dtbo and
> imx93-9x9-qsb-tianma-tm050rdh03.dtbo to imx93-9x9-qsb.dtb.
> While, imx93-9x9-qsb-tianma-tm050rdh03.dtbo(a DT overlay blob) just contains
> the panel node, which means that an user __cannot_ enable the tianma,tm050rdh03
> DPI panel by only applying it to imx93-9x9-qsb.dtb, unless the user also
> applies imx93-9x9-qsb-ontat-kd50g21-40nt-a1.dtbo.  That's why the .dtsi
> file is needed.

what's problem if we require user do that? Makefile already create finial
imx93-9x9-qsb-tianma-tm050rdh03.dtb.

Any user really apply dtso manaully without use kernel's Makefile?

>
> >
> > can the same board be use for imx91 or other evk boards?
>
> Yes, both tianma,tm050rdh03 and ontat,kd50g21-40nt-a1 DPI panels can be
> connected to i.MX91/93 11x11 EVK and 9x9 QSB boards.

Is it possible to use one overlay files for all imx91/imx93 boards?

Frank
>
> >
> > Frank
> >
> >>> Frank
> >>>>
> >>>> [2] https://www.nxp.com/design/design-center/development-boards-and-designs/mx93aud-hat-audio-board:MX93AUD-HAT
> >>>>
> >>>>>
> >>>>> Frank
> >>>>>>
> >>>>>> ---
> >>>>>> base-commit: 816f193dd0d95246f208590924dd962b192def78
> >>>>>> change-id: 20260407-tianma-tm050rdh03-imx93-9x9-qsb-6e4bbbde3d08
> >>>>>>
> >>>>>> Best regards,
> >>>>>> --
> >>>>>> Liu Ying <victor.liu@nxp.com>
> >>>>>>
> >>>>
> >>>> --
> >>>> Regards,
> >>>> Liu Ying
> >>
> >> --
> >> Regards,
> >> Liu Ying
>
> --
> Regards,
> Liu Ying


^ permalink raw reply

* Re: [RFC PATCH 0/8] mm/vmalloc: Speed up ioremap, vmalloc and vmap with contiguous memory
From: Barry Song @ 2026-04-08 10:51 UTC (permalink / raw)
  To: Dev Jain
  Cc: linux-mm, linux-arm-kernel, catalin.marinas, will, akpm, urezki,
	linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <1e7427c6-b6e5-4a3a-a600-bef9ac2bf3e0@arm.com>

On Wed, Apr 8, 2026 at 5:14 PM Dev Jain <dev.jain@arm.com> wrote:
>
>
>
> On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
> > This patchset accelerates ioremap, vmalloc, and vmap when the memory
> > is physically fully or partially contiguous. Two techniques are used:
> >
> > 1. Avoid page table zigzag when setting PTEs/PMDs for multiple memory
> >    segments
> > 2. Use batched mappings wherever possible in both vmalloc and ARM64
> >    layers
> >
> > Patches 1–2 extend ARM64 vmalloc CONT-PTE mapping to support multiple
> > CONT-PTE regions instead of just one.
> >
> > Patches 3–4 extend vmap_small_pages_range_noflush() to support page
> > shifts other than PAGE_SHIFT. This allows mapping multiple memory
> > segments for vmalloc() without zigzagging page tables.
> >
> > Patches 5–8 add huge vmap support for contiguous pages. This not only
> > improves performance but also enables PMD or CONT-PTE mapping for the
> > vmapped area, reducing TLB pressure.
> >
> > Many thanks to Xueyuan Chen for his substantial testing efforts
> > on RK3588 boards.
> >
> > On the RK3588 8-core ARM64 SoC, with tasks pinned to CPU2 and
> > the performance CPUfreq policy enabled, Xueyuan’s tests report:
> >
> > * ioremap(1 MB): 1.2× faster
> > * vmalloc(1 MB) mapping time (excluding allocation) with
> >   VM_ALLOW_HUGE_VMAP: 1.5× faster
> > * vmap(): 5.6× faster when memory includes some order-8 pages,
> >   with no regression observed for order-0 pages
> >
> > Barry Song (Xiaomi) (8):
> >   arm64/hugetlb: Extend batching of multiple CONT_PTE in a single PTE
> >     setup
> >   arm64/vmalloc: Allow arch_vmap_pte_range_map_size to batch multiple
> >     CONT_PTE
> >   mm/vmalloc: Extend vmap_small_pages_range_noflush() to support larger
> >     page_shift sizes
> >   mm/vmalloc: Eliminate page table zigzag for huge vmalloc mappings
> >   mm/vmalloc: map contiguous pages in batches for vmap() if possible
> >   mm/vmalloc: align vm_area so vmap() can batch mappings
> >   mm/vmalloc: Coalesce same page_shift mappings in vmap to avoid pgtable
> >     zigzag
> >   mm/vmalloc: Stop scanning for compound pages after encountering small
> >     pages in vmap
> >
> >  arch/arm64/include/asm/vmalloc.h |   6 +-
> >  arch/arm64/mm/hugetlbpage.c      |  10 ++
> >  mm/vmalloc.c                     | 178 +++++++++++++++++++++++++------
> >  3 files changed, 161 insertions(+), 33 deletions(-)
> >
>
> On Linux VM on Apple M3, running mm-selftests:

Dev, thanks for your report. Sorry for the silly typo—
Xueyuan’s vmalloc/vmap tests don’t trigger that case yet.

it should be fixed by:

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index bf31c11ebd3b..25b9fce1ec6a 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -110,7 +110,7 @@ static inline int num_contig_ptes(unsigned long
size, size_t *pgsize)
                contig_ptes = CONT_PTES;
                break;
        default:
-               if (size < CONT_PMD_SIZE && size > 0 &&
+               if (size < PMD_SIZE && size > 0 &&
                                IS_ALIGNED(size, CONT_PTE_SIZE)) {
                        contig_ptes = size >> PAGE_SHIFT;
                        *pgsize = PAGE_SIZE;
@@ -365,7 +365,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int
shift, vm_flags_t flags)
        case CONT_PTE_SIZE:
                return pte_mkcont(entry);
        default:
-               if (pagesize < CONT_PMD_SIZE && pagesize > 0 &&
+               if (pagesize < PMD_SIZE && pagesize > 0 &&
                                IS_ALIGNED(pagesize, CONT_PTE_SIZE))
                        return pte_mkcont(entry);

>
>  ./run_vmtests.sh -t "hugetlb"
>
> TAP version 13
> # -----------------------
> # running ./hugepage-mmap
> # -----------------------
> # TAP version 13
> # 1..1
> # # Returned address is 0xffffe7c00000
>
>
>
> [   30.884630] kernel BUG at mm/page_table_check.c:86!
> [   30.884701] Internal error: Oops - BUG: 00000000f2000800 [#1]  SMP
> [   30.886803] Modules linked in:
> [   30.887217] CPU: 3 UID: 0 PID: 1869 Comm: hugepage-mmap Not tainted 7.0.0-rc5+ #86 PREEMPT
> [   30.888218] Hardware name: linux,dummy-virt (DT)
> [   30.889413] pstate: a1400005 (NzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
> [   30.889901] pc : page_table_check_clear.part.0+0x128/0x1a0
> [   30.890337] lr : page_table_check_clear.part.0+0x7c/0x1a0
> [   30.890714] sp : ffff800084da3ad0
> [   30.890946] x29: ffff800084da3ad0 x28: 0000000000000001 x27: 0010000000000001
> [   30.891434] x26: 0040000000000040 x25: ffffa06bb8fb9000 x24: 00000000ffffffff
> [   30.891932] x23: 0000000000000001 x22: 0000000000000000 x21: ffffa06bb8997810
> [   30.892514] x20: 0000000000113e39 x19: 0000000000113e38 x18: 0000000000000000
> [   30.893007] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   30.893500] x14: ffffa06bb7013780 x13: 0000fffff7f90fff x12: 0000000000000000
> [   30.893990] x11: 1fffe0001a1282c1 x10: ffff0000d094160c x9 : ffffa06bb568a858
> [   30.894479] x8 : ffff5f95c8474000 x7 : 0000000000000000 x6 : ffff00017fffc500
> [   30.894973] x5 : ffff000191208fc0 x4 : 0000000000000000 x3 : 0000000000004000
> [   30.895449] x2 : 0000000000000000 x1 : 00000000ffffffff x0 : ffff0000c071f1b8
> [   30.895875] Call trace:
> [   30.896027]  page_table_check_clear.part.0+0x128/0x1a0 (P)
> [   30.896369]  page_table_check_clear+0xc8/0x138
> [   30.896776]  __page_table_check_ptes_set+0xe4/0x1e8
> [   30.897073]  __set_ptes_anysz+0x2e4/0x308
> [   30.897327]  set_huge_pte_at+0xec/0x210
> [   30.897561]  hugetlb_no_page+0x1ec/0x8e0
> [   30.897807]  hugetlb_fault+0x188/0x740
> [   30.898036]  handle_mm_fault+0x294/0x2c0
> [   30.898283]  do_page_fault+0x120/0x748
> [   30.898539]  do_translation_fault+0x68/0x90
> [   30.898796]  do_mem_abort+0x4c/0xa8
> [   30.899011]  el0_da+0x2c/0x90
> [   30.899205]  el0t_64_sync_handler+0xd0/0xe8
> [   30.899461]  el0t_64_sync+0x198/0x1a0
> [   30.899688] Code: 91001021 b8f80022 51000441 36fffd41 (d4210000)
> [   30.900053] ---[ end trace 0000000000000000 ]---
>
>
>
> The bug is at
>
> BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>
> My tree is mm-unstable, commit 3fa44141e0bb.
>

Thanks
Barry


^ permalink raw reply related

* Re: [RFC V1 00/16] arm64/mm: Enable 128 bit page table entries
From: Anshuman Khandual @ 2026-04-08 10:53 UTC (permalink / raw)
  To: David Hildenbrand (Arm), linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm
In-Reply-To: <a77b39a4-c6dd-4a8a-8ea4-2bdc31bd3601@kernel.org>

On 07/04/26 8:14 PM, David Hildenbrand (Arm) wrote:
> On 2/24/26 06:11, Anshuman Khandual wrote:
>> FEAT_D128 is a new arm architecture feature adding support for VMSAv9-128
>> translation system. FEAT_D128 is an optional feature from ARMV9.3 onwards.
>> So with this feature arm64 platforms could have two different translation
>> systems, VMSAv8-64 and VMSAv9-128 could selectively be enabled.
>>
>> FEAT_D128 adds 128 bit page table entries, thus supporting larger physical
>> and virtual address range while also expanding available room for more MMU
>> management feature bits both for HW and SW. 
>>
>> This series has been split into two parts. Generic MM changes followed by
>> arm64 platform changes, finally enabling D128 with a new config ARM64_D128.
>>
>> READ_ONCE() on page table entries get routed via level specific pxdp_get()
>> helpers which platforms could then override when required. These accessors
>> on arm64 platform help in ensuring page table accesses are performed in an
>> atomic manner while reading 128 bit page table entries.
>>
>> All ARM64_VA_BITS and ARM64_PA_BITS combinations for all page sizes are now
>> supported both on D64 and D128 translation regimes. Although new 56 bits VA
>> space is not yet supported. Similarly FEAT_D128 skip level is not supported
>> currently.
>>
>> Basic page table geometry has been changed with D128 as there are now fewer
>> entries per level. Please refer to the following table for leaf entry sizes
>>
>>                     D64              D128
>> ------------------------------------------------
>> | PAGE_SIZE |   PMD  |  PUD  |   PMD  |   PUD  |
>> -----------------------------|-----------------|
>> |     4K    |    2M  |  1G   |    1M  |  256M  |
>> |    16K    |   32M  | 64G   |   16M  |   16G  |
>> |    64K    |  512M  |  4T   |  256M  |    1T  |
>> ------------------------------------------------
>>
> 
> Interesting. That means user space will have it even harder to optimize
> for THP sizes.
> 
> What's the effect on cont-pte? Do they still span the same number of
> entries and there is effectively no change?

The numbers are the same for 4K base page size but will need
some changes for 16K and 64K base page sizes. Something that
git missed in this series, will fix it.

> 
>> From arm64 kernel features perspective KVM, KASAN and UNMAP_KERNEL_AT_EL0
>> are currently not supported as well.
>>
>> Open Questions:
>>
>> - Do we need to support UNMAP_KERNEL_AT_EL0 with D128
>> - Do we need to emulate traditional D64 sizes at PUD, PMD level with D128
> 
> It would certainly make user space interaction easier. But then, user
> space already has to consider various PMD sizes (and is better of
> querying /sys/kernel/mm/transparent_hugepage/hpage_pmd_size instead of
> hardcoding it). s390x, for example, also has 1M PMD size.
> > I guess with "emulating" you mean something simple like always
> allocating order-1 page tables that effectively have the same number of
> page table entries?

Yeah - thought something similar.

> 
> The would be an option, but I recall that the pte_map_* infrastructure
> currently expects that leaf page tables only ever span a single page.
> > So it wouldn't really give us a lot of easy benefit I guess.

Right. So probably need to figure all other benefits this might
add besides just the user space facing interactions as you have
mentioned earlier.


^ permalink raw reply

* Re: [RFC PATCH 0/8] mm/vmalloc: Speed up ioremap, vmalloc and vmap with contiguous memory
From: Dev Jain @ 2026-04-08 10:55 UTC (permalink / raw)
  To: Barry Song
  Cc: linux-mm, linux-arm-kernel, catalin.marinas, will, akpm, urezki,
	linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <CAGsJ_4xqxmHWBahN-yX10XcEwaHpvypCkwWDLHMY_1P_SzCeMg@mail.gmail.com>



On 08/04/26 4:21 pm, Barry Song wrote:
> On Wed, Apr 8, 2026 at 5:14 PM Dev Jain <dev.jain@arm.com> wrote:
>>
>>
>>
>> On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
>>> This patchset accelerates ioremap, vmalloc, and vmap when the memory
>>> is physically fully or partially contiguous. Two techniques are used:
>>>
>>> 1. Avoid page table zigzag when setting PTEs/PMDs for multiple memory
>>>    segments
>>> 2. Use batched mappings wherever possible in both vmalloc and ARM64
>>>    layers
>>>
>>> Patches 1–2 extend ARM64 vmalloc CONT-PTE mapping to support multiple
>>> CONT-PTE regions instead of just one.
>>>
>>> Patches 3–4 extend vmap_small_pages_range_noflush() to support page
>>> shifts other than PAGE_SHIFT. This allows mapping multiple memory
>>> segments for vmalloc() without zigzagging page tables.
>>>
>>> Patches 5–8 add huge vmap support for contiguous pages. This not only
>>> improves performance but also enables PMD or CONT-PTE mapping for the
>>> vmapped area, reducing TLB pressure.
>>>
>>> Many thanks to Xueyuan Chen for his substantial testing efforts
>>> on RK3588 boards.
>>>
>>> On the RK3588 8-core ARM64 SoC, with tasks pinned to CPU2 and
>>> the performance CPUfreq policy enabled, Xueyuan’s tests report:
>>>
>>> * ioremap(1 MB): 1.2× faster
>>> * vmalloc(1 MB) mapping time (excluding allocation) with
>>>   VM_ALLOW_HUGE_VMAP: 1.5× faster
>>> * vmap(): 5.6× faster when memory includes some order-8 pages,
>>>   with no regression observed for order-0 pages
>>>
>>> Barry Song (Xiaomi) (8):
>>>   arm64/hugetlb: Extend batching of multiple CONT_PTE in a single PTE
>>>     setup
>>>   arm64/vmalloc: Allow arch_vmap_pte_range_map_size to batch multiple
>>>     CONT_PTE
>>>   mm/vmalloc: Extend vmap_small_pages_range_noflush() to support larger
>>>     page_shift sizes
>>>   mm/vmalloc: Eliminate page table zigzag for huge vmalloc mappings
>>>   mm/vmalloc: map contiguous pages in batches for vmap() if possible
>>>   mm/vmalloc: align vm_area so vmap() can batch mappings
>>>   mm/vmalloc: Coalesce same page_shift mappings in vmap to avoid pgtable
>>>     zigzag
>>>   mm/vmalloc: Stop scanning for compound pages after encountering small
>>>     pages in vmap
>>>
>>>  arch/arm64/include/asm/vmalloc.h |   6 +-
>>>  arch/arm64/mm/hugetlbpage.c      |  10 ++
>>>  mm/vmalloc.c                     | 178 +++++++++++++++++++++++++------
>>>  3 files changed, 161 insertions(+), 33 deletions(-)
>>>
>>
>> On Linux VM on Apple M3, running mm-selftests:
> 
> Dev, thanks for your report. Sorry for the silly typo—
> Xueyuan’s vmalloc/vmap tests don’t trigger that case yet.
> 
> it should be fixed by:
> 
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index bf31c11ebd3b..25b9fce1ec6a 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -110,7 +110,7 @@ static inline int num_contig_ptes(unsigned long
> size, size_t *pgsize)
>                 contig_ptes = CONT_PTES;
>                 break;
>         default:
> -               if (size < CONT_PMD_SIZE && size > 0 &&
> +               if (size < PMD_SIZE && size > 0 &&
>                                 IS_ALIGNED(size, CONT_PTE_SIZE)) {
>                         contig_ptes = size >> PAGE_SHIFT;
>                         *pgsize = PAGE_SIZE;
> @@ -365,7 +365,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int
> shift, vm_flags_t flags)
>         case CONT_PTE_SIZE:
>                 return pte_mkcont(entry);
>         default:
> -               if (pagesize < CONT_PMD_SIZE && pagesize > 0 &&
> +               if (pagesize < PMD_SIZE && pagesize > 0 &&
>                                 IS_ALIGNED(pagesize, CONT_PTE_SIZE))
>                         return pte_mkcont(entry);

Yeah indeed the problem was that a PMD chunk was being treated as 512 ptes
rather than 1 PMD. This fixes it.

> 
>>
>>  ./run_vmtests.sh -t "hugetlb"
>>
>> TAP version 13
>> # -----------------------
>> # running ./hugepage-mmap
>> # -----------------------
>> # TAP version 13
>> # 1..1
>> # # Returned address is 0xffffe7c00000
>>
>>
>>
>> [   30.884630] kernel BUG at mm/page_table_check.c:86!
>> [   30.884701] Internal error: Oops - BUG: 00000000f2000800 [#1]  SMP
>> [   30.886803] Modules linked in:
>> [   30.887217] CPU: 3 UID: 0 PID: 1869 Comm: hugepage-mmap Not tainted 7.0.0-rc5+ #86 PREEMPT
>> [   30.888218] Hardware name: linux,dummy-virt (DT)
>> [   30.889413] pstate: a1400005 (NzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
>> [   30.889901] pc : page_table_check_clear.part.0+0x128/0x1a0
>> [   30.890337] lr : page_table_check_clear.part.0+0x7c/0x1a0
>> [   30.890714] sp : ffff800084da3ad0
>> [   30.890946] x29: ffff800084da3ad0 x28: 0000000000000001 x27: 0010000000000001
>> [   30.891434] x26: 0040000000000040 x25: ffffa06bb8fb9000 x24: 00000000ffffffff
>> [   30.891932] x23: 0000000000000001 x22: 0000000000000000 x21: ffffa06bb8997810
>> [   30.892514] x20: 0000000000113e39 x19: 0000000000113e38 x18: 0000000000000000
>> [   30.893007] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
>> [   30.893500] x14: ffffa06bb7013780 x13: 0000fffff7f90fff x12: 0000000000000000
>> [   30.893990] x11: 1fffe0001a1282c1 x10: ffff0000d094160c x9 : ffffa06bb568a858
>> [   30.894479] x8 : ffff5f95c8474000 x7 : 0000000000000000 x6 : ffff00017fffc500
>> [   30.894973] x5 : ffff000191208fc0 x4 : 0000000000000000 x3 : 0000000000004000
>> [   30.895449] x2 : 0000000000000000 x1 : 00000000ffffffff x0 : ffff0000c071f1b8
>> [   30.895875] Call trace:
>> [   30.896027]  page_table_check_clear.part.0+0x128/0x1a0 (P)
>> [   30.896369]  page_table_check_clear+0xc8/0x138
>> [   30.896776]  __page_table_check_ptes_set+0xe4/0x1e8
>> [   30.897073]  __set_ptes_anysz+0x2e4/0x308
>> [   30.897327]  set_huge_pte_at+0xec/0x210
>> [   30.897561]  hugetlb_no_page+0x1ec/0x8e0
>> [   30.897807]  hugetlb_fault+0x188/0x740
>> [   30.898036]  handle_mm_fault+0x294/0x2c0
>> [   30.898283]  do_page_fault+0x120/0x748
>> [   30.898539]  do_translation_fault+0x68/0x90
>> [   30.898796]  do_mem_abort+0x4c/0xa8
>> [   30.899011]  el0_da+0x2c/0x90
>> [   30.899205]  el0t_64_sync_handler+0xd0/0xe8
>> [   30.899461]  el0t_64_sync+0x198/0x1a0
>> [   30.899688] Code: 91001021 b8f80022 51000441 36fffd41 (d4210000)
>> [   30.900053] ---[ end trace 0000000000000000 ]---
>>
>>
>>
>> The bug is at
>>
>> BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
>>
>> My tree is mm-unstable, commit 3fa44141e0bb.
>>
> 
> Thanks
> Barry



^ permalink raw reply

* Re: [RFC PATCH 1/8] arm64/hugetlb: Extend batching of multiple CONT_PTE in a single PTE setup
From: Barry Song @ 2026-04-08 11:00 UTC (permalink / raw)
  To: Dev Jain
  Cc: linux-mm, linux-arm-kernel, catalin.marinas, will, akpm, urezki,
	linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <02b0464a-13f6-4e0c-ad69-0f494bfacbf4@arm.com>

On Wed, Apr 8, 2026 at 6:32 PM Dev Jain <dev.jain@arm.com> wrote:
>
>
>
> On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
> > For sizes aligned to CONT_PTE_SIZE and smaller than PMD_SIZE,
> > we can batch CONT_PTE settings instead of handling them individually.
> >
> > Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
> > ---
> >  arch/arm64/mm/hugetlbpage.c | 10 ++++++++++
> >  1 file changed, 10 insertions(+)
> >
> > diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> > index a42c05cf5640..bf31c11ebd3b 100644
> > --- a/arch/arm64/mm/hugetlbpage.c
> > +++ b/arch/arm64/mm/hugetlbpage.c
> > @@ -110,6 +110,12 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
> >               contig_ptes = CONT_PTES;
> >               break;
> >       default:
> > +             if (size < CONT_PMD_SIZE && size > 0 &&
> > +                             IS_ALIGNED(size, CONT_PTE_SIZE)) {
>
> Nit: Having the lower bound check before upper bound is natural to
> read, so this should be size > 0 && size < CONT_PMD_SIZE (i.e written
> the other way around).

Thanks very much for reviewing, Dev. As we discussed in patch 0/8,
this should be
PMD_SIZE, not CONT_PMD_SIZE. I will use size > 0 && size < PMD_SIZE
in the next version.

>
> Also IS_ALIGNED needs to go below size.

Sure, thanks!

>
>
> > +                     contig_ptes = size >> PAGE_SHIFT;
> > +                     *pgsize = PAGE_SIZE;
> > +                     break;
> > +             }
> >               WARN_ON(!__hugetlb_valid_size(size));
> >       }
> >
> > @@ -359,6 +365,10 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
> >       case CONT_PTE_SIZE:
> >               return pte_mkcont(entry);
> >       default:
> > +             if (pagesize < CONT_PMD_SIZE && pagesize > 0 &&
> > +                             IS_ALIGNED(pagesize, CONT_PTE_SIZE))
> > +                     return pte_mkcont(entry);

Here it should be pagesize > 0 && pagesize < PMD_SIZE as well :-)

> > +
> >               break;
> >       }
> >       pr_warn("%s: unrecognized huge page size 0x%lx\n",
>

Best Regards
Barry


^ permalink raw reply

* Re: [RFC V1 00/16] arm64/mm: Enable 128 bit page table entries
From: Ryan Roberts @ 2026-04-08 11:01 UTC (permalink / raw)
  To: Anshuman Khandual, David Hildenbrand (Arm), linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Mark Rutland, Lorenzo Stoakes,
	Andrew Morton, Mike Rapoport, Linu Cherian, linux-kernel,
	linux-mm
In-Reply-To: <8d2c9ecb-ae33-42f2-a8ed-66b3286b9286@arm.com>

On 08/04/2026 11:53, Anshuman Khandual wrote:
> On 07/04/26 8:14 PM, David Hildenbrand (Arm) wrote:
>> On 2/24/26 06:11, Anshuman Khandual wrote:
>>> FEAT_D128 is a new arm architecture feature adding support for VMSAv9-128
>>> translation system. FEAT_D128 is an optional feature from ARMV9.3 onwards.
>>> So with this feature arm64 platforms could have two different translation
>>> systems, VMSAv8-64 and VMSAv9-128 could selectively be enabled.
>>>
>>> FEAT_D128 adds 128 bit page table entries, thus supporting larger physical
>>> and virtual address range while also expanding available room for more MMU
>>> management feature bits both for HW and SW. 
>>>
>>> This series has been split into two parts. Generic MM changes followed by
>>> arm64 platform changes, finally enabling D128 with a new config ARM64_D128.
>>>
>>> READ_ONCE() on page table entries get routed via level specific pxdp_get()
>>> helpers which platforms could then override when required. These accessors
>>> on arm64 platform help in ensuring page table accesses are performed in an
>>> atomic manner while reading 128 bit page table entries.
>>>
>>> All ARM64_VA_BITS and ARM64_PA_BITS combinations for all page sizes are now
>>> supported both on D64 and D128 translation regimes. Although new 56 bits VA
>>> space is not yet supported. Similarly FEAT_D128 skip level is not supported
>>> currently.
>>>
>>> Basic page table geometry has been changed with D128 as there are now fewer
>>> entries per level. Please refer to the following table for leaf entry sizes
>>>
>>>                     D64              D128
>>> ------------------------------------------------
>>> | PAGE_SIZE |   PMD  |  PUD  |   PMD  |   PUD  |
>>> -----------------------------|-----------------|
>>> |     4K    |    2M  |  1G   |    1M  |  256M  |
>>> |    16K    |   32M  | 64G   |   16M  |   16G  |
>>> |    64K    |  512M  |  4T   |  256M  |    1T  |
>>> ------------------------------------------------
>>>
>>
>> Interesting. That means user space will have it even harder to optimize
>> for THP sizes.
>>
>> What's the effect on cont-pte? Do they still span the same number of
>> entries and there is effectively no change?
> 
> The numbers are the same for 4K base page size but will need
> some changes for 16K and 64K base page sizes. Something that
> git missed in this series, will fix it.

Really - I thought the contiguous sizes were the same for D128 as they are for
D64? What's the difference? Perhaps it's different for level 2, but for level 3,
I'm pretty sure it remains:

PAGE_SIZE	CONT_SIZE	NR_PTES		CONT_ORDER
4K		64K		16		4
16K		2M		128		7
64K		2M		32		5

Thanks,
Ryan

> 
>>
>>> From arm64 kernel features perspective KVM, KASAN and UNMAP_KERNEL_AT_EL0
>>> are currently not supported as well.
>>>
>>> Open Questions:
>>>
>>> - Do we need to support UNMAP_KERNEL_AT_EL0 with D128
>>> - Do we need to emulate traditional D64 sizes at PUD, PMD level with D128
>>
>> It would certainly make user space interaction easier. But then, user
>> space already has to consider various PMD sizes (and is better of
>> querying /sys/kernel/mm/transparent_hugepage/hpage_pmd_size instead of
>> hardcoding it). s390x, for example, also has 1M PMD size.
>>> I guess with "emulating" you mean something simple like always
>> allocating order-1 page tables that effectively have the same number of
>> page table entries?
> 
> Yeah - thought something similar.
> 
>>
>> The would be an option, but I recall that the pte_map_* infrastructure
>> currently expects that leaf page tables only ever span a single page.
>>> So it wouldn't really give us a lot of easy benefit I guess.
> 
> Right. So probably need to figure all other benefits this might
> add besides just the user space facing interactions as you have
> mentioned earlier.



^ permalink raw reply

* Re: [PATCH 1/2] coresight: etm4x: fix inconsistencies with sysfs configration
From: Yeoreum Yun @ 2026-04-08 11:02 UTC (permalink / raw)
  To: Leo Yan
  Cc: coresight, linux-arm-kernel, linux-kernel, suzuki.poulose,
	mike.leach, james.clark, alexander.shishkin
In-Reply-To: <20260407143028.GM356832@e132581.arm.com>

Hi Leo,

[...]

> As Suzuki suggested in another reply, we need to extract capabilities
> into a separate structure.  I'd also extract status related registers
> into a new structure:
>
>   struct etm4_cap {
>       int nr_ss_cmp;
>       bool pe_comparator;    // TRCSSCSRn.PC
>       bool dv_comparator;    // TRCSSCSRn.DV
>       bool da_comparator;    // TRCSSCSRn.DA
>       bool inst_comparator;  // TRCSSCSRn.INST
>
>       int ns_ex_level;
>       int nr_pe;
>       int nr_pe_cmp;
>       int nr_resource;
>       ...
>   }
>
>   struct etm4_status_reg {
>       u32 ss_status[ETM_MAX_SS_CMP];
>       u32 cntr_val[ETMv4_MAX_CNTR];
>   }

Hmm, I don't think the cntr_val doesn't need to be separated into
etm4_status_reg since they're configurable by sysfs.

BTW from etmv4_config, I think parts of capabilites are only:
  - ss_status
  - s_ex_level

I think it would be okay to include all of this information into
struct etm4_cap not dedicate etm4_status_reg structure.

BTW, Is it required to sustain TRCSSCSR<n>.PENDING in sysfs after
re-enable sysfs-session? (enable->disable->enable) while it's always
cleared in perf mode?

Thanks!

--
Sincerely,
Yeoreum Yun


^ permalink raw reply

* Re: [RFC PATCH 3/8] mm/vmalloc: Extend vmap_small_pages_range_noflush() to support larger page_shift sizes
From: Dev Jain @ 2026-04-08 11:08 UTC (permalink / raw)
  To: Barry Song (Xiaomi), linux-mm, linux-arm-kernel, catalin.marinas,
	will, akpm, urezki
  Cc: linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <20260408025115.27368-4-baohua@kernel.org>



On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
> vmap_small_pages_range_noflush() provides a clean interface by taking
> struct page **pages and mapping them via direct PTE iteration. This
> avoids the page table zigzag seen when using

"Zigzag" is ambiguous. Just say "page table rewalk". Also please
elaborate on why the rewalk is happening currently.

> vmap_range_noflush() for page_shift values other than PAGE_SHIFT.
> 
> Extend it to support larger page_shift values, and add PMD- and
> contiguous-PTE mappings as well.

So we can drop the "small" here since now it supports larger chunks
as well.

Also at this point the code you add is a no-op since you pass PAGE_SHIFT.
Let us just squash patch 4 into this. This patch looks weird retaining
the pagetable-rewalk algorithm when it literally adds functionality
to avoid that.

> 
> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
> ---
>  mm/vmalloc.c | 54 ++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 42 insertions(+), 12 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 57eae99d9909..5bf072297536 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -524,8 +524,9 @@ void vunmap_range(unsigned long addr, unsigned long end)
>  
>  static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> -		pgtbl_mod_mask *mask)
> +		pgtbl_mod_mask *mask, unsigned int shift)
>  {
> +	unsigned int steps = 1;
>  	int err = 0;
>  	pte_t *pte;
>  
> @@ -543,6 +544,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  	do {
>  		struct page *page = pages[*nr];
>  
> +		steps = 1;
>  		if (WARN_ON(!pte_none(ptep_get(pte)))) {
>  			err = -EBUSY;
>  			break;
> @@ -556,9 +558,24 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  			break;
>  		}
>  
> +#ifdef CONFIG_HUGETLB_PAGE
> +		if (shift != PAGE_SHIFT) {
> +			unsigned long pfn = page_to_pfn(page), size;
> +
> +			size = arch_vmap_pte_range_map_size(addr, end, pfn, shift);
> +			if (size != PAGE_SIZE) {
> +				steps = size >> PAGE_SHIFT;
> +				pte_t entry = pfn_pte(pfn, prot);
> +
> +				entry = arch_make_huge_pte(entry, ilog2(size), 0);
> +				set_huge_pte_at(&init_mm, addr, pte, entry, size);
> +				continue;
> +			}
> +		}
> +#endif
> +
>  		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
> -		(*nr)++;
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	} while (pte += steps, *nr += steps, addr += PAGE_SIZE * steps, addr != end);
>  
>  	lazy_mmu_mode_disable();
>  	*mask |= PGTBL_PTE_MODIFIED;
> @@ -568,7 +585,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
>  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> -		pgtbl_mod_mask *mask)
> +		pgtbl_mod_mask *mask, unsigned int shift)
>  {
>  	pmd_t *pmd;
>  	unsigned long next;
> @@ -578,7 +595,20 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
>  		return -ENOMEM;
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
> +
> +		if (shift == PMD_SHIFT) {
> +			struct page *page = pages[*nr];
> +			phys_addr_t phys_addr = page_to_phys(page);
> +
> +			if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
> +						shift)) {
> +				*mask |= PGTBL_PMD_MODIFIED;
> +				*nr += 1 << (shift - PAGE_SHIFT);
> +				continue;
> +			}
> +		}
> +
> +		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask, shift))
>  			return -ENOMEM;
>  	} while (pmd++, addr = next, addr != end);
>  	return 0;
> @@ -586,7 +616,7 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
>  
>  static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
>  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> -		pgtbl_mod_mask *mask)
> +		pgtbl_mod_mask *mask, unsigned int shift)
>  {
>  	pud_t *pud;
>  	unsigned long next;
> @@ -596,7 +626,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
>  		return -ENOMEM;
>  	do {
>  		next = pud_addr_end(addr, end);
> -		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
> +		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask, shift))
>  			return -ENOMEM;
>  	} while (pud++, addr = next, addr != end);
>  	return 0;
> @@ -604,7 +634,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
>  
>  static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
> -		pgtbl_mod_mask *mask)
> +		pgtbl_mod_mask *mask, unsigned int shift)
>  {
>  	p4d_t *p4d;
>  	unsigned long next;
> @@ -614,14 +644,14 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>  		return -ENOMEM;
>  	do {
>  		next = p4d_addr_end(addr, end);
> -		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
> +		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask, shift))
>  			return -ENOMEM;
>  	} while (p4d++, addr = next, addr != end);
>  	return 0;
>  }
>  
>  static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> -		pgprot_t prot, struct page **pages)
> +		pgprot_t prot, struct page **pages, unsigned int shift)
>  {
>  	unsigned long start = addr;
>  	pgd_t *pgd;
> @@ -636,7 +666,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
>  		next = pgd_addr_end(addr, end);
>  		if (pgd_bad(*pgd))
>  			mask |= PGTBL_PGD_MODIFIED;
> -		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
> +		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask, shift);
>  		if (err)
>  			break;
>  	} while (pgd++, addr = next, addr != end);
> @@ -665,7 +695,7 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>  
>  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
>  			page_shift == PAGE_SHIFT)
> -		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> +		return vmap_small_pages_range_noflush(addr, end, prot, pages, PAGE_SHIFT);
>  
>  	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
>  		int err;



^ permalink raw reply

* RE: [PATCH v4 net-next 12/14] net: dsa: netc: add more basic functions support
From: Wei Fang @ 2026-04-08 11:09 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Claudiu Manoil, Vladimir Oltean, Clark Wang,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, robh@kernel.org, krzk+dt@kernel.org,
	conor+dt@kernel.org, f.fainelli@gmail.com, Frank Li,
	chleroy@kernel.org, horms@kernel.org, linux@armlinux.org.uk,
	andrew@lunn.ch, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, devicetree@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org, imx@lists.linux.dev
In-Reply-To: <20260403011743.1795726-1-kuba@kernel.org>

> > +static int netc_add_standalone_vlan_entry(struct netc_switch *priv)
> > +{
> > +	u32 bitmap_stg = VFT_STG_ID(0) | netc_available_port_bitmap(priv);
> 
> Does putting all standalone ports into a single shared VLAN 0 break the DSA
> isolation requirements?
> 
> If a user manually adds a static FDB entry to a standalone port, it is
> programmed into VLAN 0. Because all standalone ports reside in this VLAN, a
> frame arriving on one standalone port with a matching destination MAC might
> be hardware-switched directly to another standalone port, bypassing the CPU.

I think the FDB entry can only be added to the CPU port if the port is a
standalone port not a bridge port. So this case will not be true.

> 
> [ ... ]
> 
> > +static int netc_port_set_fdb_entry(struct netc_port *np,
> > +				   const unsigned char *addr, u16 vid)
> > +{
> 
> [ ... ]
> 
> > +	/* If the entry already exists, but not on this port, we need to
> > +	 * update the port bitmap. In general, it should only be valid
> > +	 * for multicast or broadcast address.
> > +	 */
> > +	port_bitmap ^= BIT(port);
> > +	entry->cfge.port_bitmap = cpu_to_le32(port_bitmap);
> 
> Can this XOR operation cause multi-port floods during unicast MAC migrations?
> 
> When a unicast MAC address moves to a new port, the bridging layer issues an
> ADD for the new port. The XOR operation leaves the old port bit set alongside
> the new port bit. Does the driver need to differentiate between unicast and
> multicast addresses and explicitly overwrite the bitmap for unicast entries?

I think the old FDB entry should be removed first when a unicast MAC address
moves to a new port rather than only adding a new entry. If AI refers to adding
before deleting, then this problem will indeed occur, and it is unavoidable.

> 
> [ ... ]
> 
> > +static int netc_port_del_fdb_entry(struct netc_port *np,
> > +				   const unsigned char *addr, u16 vid)
> > +{
> > +	struct netc_switch *priv = np->switch_priv;
> > +	struct ntmp_user *ntmp = &priv->ntmp;
> > +	struct netc_fdb_entry *entry;
> > +	int port = np->dp->index;
> > +	u32 port_bitmap;
> > +	int err = 0;
> > +
> > +	mutex_lock(&priv->fdbt_lock);
> > +
> > +	entry = netc_lookup_fdb_entry(priv, addr, vid);
> > +	if (unlikely(!entry))
> > +		goto unlock_fdbt;
> 
> Will this prevent dynamically learned FDB entries from being deleted?
> 
> Since the CPU port is configured with MLO_HW, the switch hardware

Currently we only support single port mode, so only support static FDB
entry. The MLO_HW should be removed from the code, I will improve
the logic.

> dynamically
> learns MAC addresses, but they are not added to the software priv->fdb_list.
> If a user attempts to delete a dynamic entry, the software lookup will fail
> here and return 0 (success), leaving the stale dynamically learned entry
> active in hardware.
> 
> [ ... ]
> 
> > +static int netc_port_change_mtu(struct dsa_switch *ds,
> > +				int port, int mtu)
> > +{
> > +	u32 max_frame_size = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
> > +	struct netc_port *np = NETC_PORT(ds, port);
> > +
> > +	if (dsa_is_cpu_port(ds, port))
> > +		max_frame_size += NETC_TAG_MAX_LEN;
> 
> Does this double-count the DSA tag overhead for the CPU port?
> 
> The DSA core computes the CPU port's MTU by adding the tagger overhead
> before
> calling the switch's .port_change_mtu() function. If the maximum MTU of 9582
> bytes is requested, the CPU port receives an MTU of 9596. Adding
> NETC_TAG_MAX_LEN again here results in 9628 bytes, which exceeds the
> hardware limit of 9600 (NETC_MAX_FRAME_LEN) and might lead to truncation.

It seems a bug, I will fix it.

> 
> [ ... ]
> 
> > +static int netc_port_fdb_add(struct dsa_switch *ds, int port,
> > +			     const unsigned char *addr, u16 vid,
> > +			     struct dsa_db db)
> > +{
> > +	struct netc_port *np = NETC_PORT(ds, port);
> > +
> > +	/* Currently, we only support standalone port mode, so all VLANs
> > +	 * should be converted to NETC_STANDALONE_PVID.
> > +	 */
> > +	return netc_port_set_fdb_entry(np, addr, NETC_STANDALONE_PVID);
> > +}
> 
> Is it safe to silently override user-requested VLAN IDs to VLAN 0?
> 
> If a user explicitly attempts to configure a MAC address for a specific VLAN,
> the hardware is programmed for VLAN 0 and returns success. Should this
> instead check if the requested vid is not 0 and not NETC_STANDALONE_PVID,
> and return -EOPNOTSUPP to inform the networking stack?

Okay, I will add a check.

> 
> [ ... ]
> 
> > +static void netc_port_set_host_flood(struct dsa_switch *ds, int port,
> > +				     bool uc, bool mc)
> > +{
> > +	struct netc_port *np = NETC_PORT(ds, port);
> > +
> > +	if (np->uc == uc && np->mc == mc)
> > +		return;
> > +
> > +	/* IPFT does not support in-place updates to the KEYE element,
> > +	 * so we need to delete the old IPFT entry and then add a new
> > +	 * one.
> > +	 */
> > +	if (np->host_flood)
> > +		netc_port_remove_host_flood(np);
> > +
> > +	if (netc_port_add_host_flood_rule(np, uc, mc))
> > +		dev_err(ds->dev, "Failed to add host flood rule on port %d\n",
> > +			port);
> > +}
> 
> Could this permanently lose the host flood configuration if the new rule
> addition fails?
> 
> If the call to netc_port_add_host_flood_rule() fails (for example, due to
> -ENOMEM), the function logs an error and returns, leaving the port entirely
> without a host flood rule. Should the driver revert to the previous
> configuration on failure to avoid blackholing traffic meant for the CPU?

Okay, I will improve it.



^ permalink raw reply

* Re: [PATCH 0/3] arm-smmu-v3: Add PMCG child support and update PMU MMIO mapping
From: Robin Murphy @ 2026-04-08 11:15 UTC (permalink / raw)
  To: Peng Fan (OSS), Will Deacon, Joerg Roedel, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Mark Rutland
  Cc: linux-arm-kernel, iommu, devicetree, linux-kernel,
	linux-perf-users, Peng Fan
In-Reply-To: <20260408-smmu-perf-v1-0-d75dac96e828@nxp.com>

On 2026-04-08 8:51 am, Peng Fan (OSS) wrote:
> This patch series adds proper support for describing and probing the
> Arm SMMU v3 PMCG (Performance Monitor Control Group) as a child node of
> the SMMU in Devicetree, and updates the relevant drivers accordingly.
> 
> The SMMU v3 architecture allows an optional PMCG block, typically
> associated with TCUs, to be implemented within the SMMU register
> address space. For example, mmu700 PMCG is at the offset 0x2000 of the
> TCU page 0.

But what's wrong with the existing binding? Especially given that it 
even has an upstream user already:

https://git.kernel.org/torvalds/c/aef9703dcbf8

> Patch 1 updates the SMMU v3 Devicetree binding to allow PMCG child nodes,
> referencing the existing arm,smmu-v3-pmcg binding.
> 
> Patch 2 updates the arm-smmu-v3 driver to populate platform devices for
> child nodes described in DT once the SMMU probe succeeds.
> 
> Patch 3 updates the SMMUv3 PMU driver to correctly handle MMIO mapping when
> PMCG is described as a child node. The PMCG registers occupy a sub-region
> of the parent SMMU MMIO window, which is already requested by the SMMU

That has not been the case since 52f3fab0067d ("iommu/arm-smmu-v3: Don't 
reserve implementation defined register space") nearly 6 years ago, 
where the whole purpose was to support Arm's PMCG implementation 
properly. What kernel is this based on?

Thanks,
Robin.

> Signed-off-by: Peng Fan <peng.fan@nxp.com>
> ---
> Peng Fan (3):
>        dt-bindings: iommu: arm-smmu-v3: Allow PMU child nodes
>        iommu/arm-smmu-v3: Populate PMU child devices from Devicetree
>        perf/arm-smmuv3: Avoid double-requesting shared SMMU MMIO for PMCG
> 
>   .../devicetree/bindings/iommu/arm,smmu-v3.yaml        | 10 ++++++++++
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c           |  3 +++
>   drivers/perf/arm_smmuv3_pmu.c                         | 19 ++++++++++++++++---
>   3 files changed, 29 insertions(+), 3 deletions(-)
> ---
> base-commit: f3e6330d7fe42b204af05a2dbc68b379e0ad179e
> change-id: 20260408-smmu-perf-754367fe66c8
> 
> Best regards,



^ permalink raw reply

* RE: [PATCH v4 net-next 13/14] net: dsa: netc: initialize buffer bool table and implement flow-control
From: Wei Fang @ 2026-04-08 11:16 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Claudiu Manoil, Vladimir Oltean, Clark Wang,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	pabeni@redhat.com, robh@kernel.org, krzk+dt@kernel.org,
	conor+dt@kernel.org, f.fainelli@gmail.com, Frank Li,
	chleroy@kernel.org, horms@kernel.org, linux@armlinux.org.uk,
	andrew@lunn.ch, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, devicetree@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org, imx@lists.linux.dev
In-Reply-To: <20260403011745.1795759-1-kuba@kernel.org>

> > +static u32 netc_get_buffer_pool_num(struct netc_switch *priv)
> > +{
> > +	/* The BPCAPR register is a read only register, the hardware
> > +	 * guarantees that the BPCAPR_NUM_BP field will not be 0.
> > +	 */
> > +	return netc_base_rd(&priv->regs, NETC_BPCAPR) & BPCAPR_NUM_BP;
> > +}
> [ ... ]
> > +static void netc_ipv_to_buffer_pool_mapping(struct netc_switch *priv)
> > +{
> > +	int bp_per_port = priv->num_bp / priv->info->num_ports;
> > +	int q = NETC_IPV_NUM / bp_per_port;
> > +	int r = NETC_IPV_NUM % bp_per_port;
> 
> If the hardware register read in netc_get_buffer_pool_num() returns a value
> smaller than priv->info->num_ports (for instance, during a bus error or
> reset state), bp_per_port will evaluate to 0.
> 
> Can this lead to a division by zero panic here?
> 
> Should priv->num_bp be validated against the number of ports before
> performing this calculation to ensure the driver fails gracefully if the
> hardware returns an unexpected value?

If the hardware is so unstable and prone to such malfunctions, that would
be terrible, but I believe it won't. The AI review is being a bit too nitpicky.
Anyway, I will ll add this check to avoid AI review warnings.


^ permalink raw reply

* Re: [RFC PATCH 5/8] mm/vmalloc: map contiguous pages in batches for vmap() if possible
From: Dev Jain @ 2026-04-08 11:22 UTC (permalink / raw)
  To: Barry Song
  Cc: linux-mm, linux-arm-kernel, catalin.marinas, will, akpm, urezki,
	linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <CAGsJ_4xCtFe=5ofj4FW6cqu-fgR+K9BM7FPZRdAWOGP3YKtNzQ@mail.gmail.com>



On 08/04/26 10:42 am, Barry Song wrote:
> On Wed, Apr 8, 2026 at 12:20 PM Dev Jain <dev.jain@arm.com> wrote:
>>
>>
>>
>> On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
>>> In many cases, the pages passed to vmap() may include high-order
>>> pages allocated with __GFP_COMP flags. For example, the systemheap
>>> often allocates pages in descending order: order 8, then 4, then 0.
>>> Currently, vmap() iterates over every page individually—even pages
>>> inside a high-order block are handled one by one.
>>>
>>> This patch detects high-order pages and maps them as a single
>>> contiguous block whenever possible.
>>>
>>> An alternative would be to implement a new API, vmap_sg(), but that
>>> change seems to be large in scope.
>>>
>>> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
>>> ---
>>
>> Coincidentally, I was working on the same thing :)
> 
> Interesting, thanks — at least I’ve got one good reviewer :-)
> 
>>
>> We have a usecase regarding Arm TRBE and SPE aux buffers.
>>
>> I'll take a look at your patches later, but my implementation is the
> 
> Yes. Please.
> 
> 
>> following, if you have any comments. I have squashed the patches into
>> a single diff.
> 
> Thanks very much, Dev. What you’ve done is quite similar to
> patches 5/8 and 6/8, although the code differs somewhat.
> 
>>
>>
>>
>> From ccb9670a52b7f50b1f1e07b579a1316f76b84811 Mon Sep 17 00:00:00 2001
>> From: Dev Jain <dev.jain@arm.com>
>> Date: Thu, 26 Feb 2026 16:21:29 +0530
>> Subject: [PATCH] arm64/perf: map AUX buffer with large pages
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>  .../hwtracing/coresight/coresight-etm-perf.c  |  3 +-
>>  drivers/hwtracing/coresight/coresight-trbe.c  |  3 +-
>>  drivers/perf/arm_spe_pmu.c                    |  5 +-
>>  mm/vmalloc.c                                  | 86 ++++++++++++++++---
>>  4 files changed, 79 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
>> index 72017dcc3b7f1..e90a430af86bb 100644
>> --- a/drivers/hwtracing/coresight/coresight-etm-perf.c
>> +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
>> @@ -984,7 +984,8 @@ int __init etm_perf_init(void)
>>
>>         etm_pmu.capabilities            = (PERF_PMU_CAP_EXCLUSIVE |
>>                                            PERF_PMU_CAP_ITRACE |
>> -                                          PERF_PMU_CAP_AUX_PAUSE);
>> +                                          PERF_PMU_CAP_AUX_PAUSE |
>> +                                          PERF_PMU_CAP_AUX_PREFER_LARGE);
>>
>>         etm_pmu.attr_groups             = etm_pmu_attr_groups;
>>         etm_pmu.task_ctx_nr             = perf_sw_context;
>> diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
>> index 1511f8eb95afb..74e6ad891e236 100644
>> --- a/drivers/hwtracing/coresight/coresight-trbe.c
>> +++ b/drivers/hwtracing/coresight/coresight-trbe.c
>> @@ -760,7 +760,8 @@ static void *arm_trbe_alloc_buffer(struct coresight_device *csdev,
>>         for (i = 0; i < nr_pages; i++)
>>                 pglist[i] = virt_to_page(pages[i]);
>>
>> -       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
>> +       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages,
>> +                        VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
>>         if (!buf->trbe_base) {
>>                 kfree(pglist);
>>                 kfree(buf);
>> diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
>> index dbd0da1116390..90c349fd66b2c 100644
>> --- a/drivers/perf/arm_spe_pmu.c
>> +++ b/drivers/perf/arm_spe_pmu.c
>> @@ -1027,7 +1027,7 @@ static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages,
>>         for (i = 0; i < nr_pages; ++i)
>>                 pglist[i] = virt_to_page(pages[i]);
>>
>> -       buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
>> +       buf->base = vmap(pglist, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
>>         if (!buf->base)
>>                 goto out_free_pglist;
>>
>> @@ -1064,7 +1064,8 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
>>         spe_pmu->pmu = (struct pmu) {
>>                 .module = THIS_MODULE,
>>                 .parent         = &spe_pmu->pdev->dev,
>> -               .capabilities   = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
>> +               .capabilities   = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE |
>> +                                 PERF_PMU_CAP_AUX_PREFER_LARGE,
>>                 .attr_groups    = arm_spe_pmu_attr_groups,
>>                 /*
>>                  * We hitch a ride on the software context here, so that
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 61caa55a44027..8482463d41203 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -660,14 +660,14 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>                 pgprot_t prot, struct page **pages, unsigned int page_shift)
>>  {
>>         unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
>> -
>> +       unsigned long step = 1UL << (page_shift - PAGE_SHIFT);
>>         WARN_ON(page_shift < PAGE_SHIFT);
>>
>>         if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
>>                         page_shift == PAGE_SHIFT)
>>                 return vmap_small_pages_range_noflush(addr, end, prot, pages);
>>
>> -       for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
>> +       for (i = 0; i < ALIGN_DOWN(nr, step); i += step) {
>>                 int err;
>>
>>                 err = vmap_range_noflush(addr, addr + (1UL << page_shift),
>> @@ -678,8 +678,9 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>
>>                 addr += 1UL << page_shift;
>>         }
>> -
>> -       return 0;
>> +       if (IS_ALIGNED(nr, step))
>> +               return 0;
>> +       return vmap_small_pages_range_noflush(addr, end, prot, pages + i);
>>  }
>>
>>  int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>> @@ -3514,6 +3515,50 @@ void vunmap(const void *addr)
>>  }
>>  EXPORT_SYMBOL(vunmap);
>>
>> +static inline unsigned int vm_shift(pgprot_t prot, unsigned long size)
>> +{
>> +       if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
>> +               return PMD_SHIFT;
>> +
>> +       return arch_vmap_pte_supported_shift(size);
>> +}
>> +
>> +static inline int __vmap_huge(struct page **pages, pgprot_t prot,
>> +               unsigned long addr, unsigned int count)
>> +{
>> +       unsigned int i = 0;
>> +       unsigned int shift;
>> +       unsigned long nr;
>> +
>> +       while (i < count) {
>> +               nr = num_pages_contiguous(pages + i, count - i);
>> +               shift = vm_shift(prot, nr << PAGE_SHIFT);
>> +               if (vmap_pages_range(addr, addr + (nr << PAGE_SHIFT),
>> +                                    pgprot_nx(prot), pages + i, shift) < 0) {
>> +                       return 1;
>> +               }
> 
> One observation on my side is that the performance gain is somewhat
> offset by page table zigzagging caused by what you are doing here -
> iterating each mem segment by vmap_pages_range() .

I recall having observed this problem half an year back, and I wrote
code similar to what you did with patch 3 - but I didn't observe any
performance improvement. I think that was because I was testing
vmalloc - most of the cost there lies in the page allocation.

So looks like this indeed is a benefit for vmap.

> 
> In patch 3/8, I enhanced vmap_small_pages_range_noflush() to
> avoid repeated pgd → p4d → pud → pmd → pte traversals for page
> shifts other than PAGE_SHIFT. This improves performance for
> vmalloc as well as vmap(). Then, in patch 7/8, I adopt the new
> vmap_small_pages_range_noflush() and eliminate the iteration.
> 
>> +               i += nr;
>> +               addr += (nr << PAGE_SHIFT);
>> +       }
>> +       return 0;
>> +}
>> +
>> +static unsigned long max_contiguous_stride_order(struct page **pages,
>> +               pgprot_t prot, unsigned int count)
>> +{
>> +       unsigned long max_shift = PAGE_SHIFT;
>> +       unsigned int i = 0;
>> +
>> +       while (i < count) {
>> +               unsigned long nr = num_pages_contiguous(pages + i, count - i);
>> +               unsigned long shift = vm_shift(prot, nr << PAGE_SHIFT);
>> +
>> +               max_shift = max(max_shift, shift);
>> +               i += nr;
>> +       }
>> +       return max_shift;
>> +}
>> +
>>  /**
>>   * vmap - map an array of pages into virtually contiguous space
>>   * @pages: array of page pointers
>> @@ -3552,15 +3597,32 @@ void *vmap(struct page **pages, unsigned int count,
>>                 return NULL;
>>
>>         size = (unsigned long)count << PAGE_SHIFT;
>> -       area = get_vm_area_caller(size, flags, __builtin_return_address(0));
>> +       if (flags & VM_ALLOW_HUGE_VMAP) {
>> +               /* determine from page array, the max alignment */
>> +               unsigned long max_shift = max_contiguous_stride_order(pages, prot, count);
>> +
>> +               area = __get_vm_area_node(size, 1 << max_shift, max_shift, flags,
>> +                                         VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
>> +                                         GFP_KERNEL, __builtin_return_address(0));
>> +       } else {
>> +               area = get_vm_area_caller(size, flags, __builtin_return_address(0));
>> +       }
>>         if (!area)
>>                 return NULL;
>>
>>         addr = (unsigned long)area->addr;
>> -       if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
>> -                               pages, PAGE_SHIFT) < 0) {
>> -               vunmap(area->addr);
>> -               return NULL;
>> +
>> +       if (flags & VM_ALLOW_HUGE_VMAP) {
>> +               if (__vmap_huge(pages, prot, addr, count)) {
>> +                       vunmap(area->addr);
>> +                       return NULL;
>> +               }
>> +       } else {
>> +               if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
>> +                                       pages, PAGE_SHIFT) < 0) {
>> +                       vunmap(area->addr);
>> +                       return NULL;
>> +               }
>>         }
>>
>>         if (flags & VM_MAP_PUT_PAGES) {
>> @@ -4011,11 +4073,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
>>                  * their allocations due to apply_to_page_range not
>>                  * supporting them.
>>                  */
>> -
>> -               if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
>> -                       shift = PMD_SHIFT;
>> -               else
>> -                       shift = arch_vmap_pte_supported_shift(size);
>> +               shift = vm_shift(prot, size);
> 
> What I actually did is different. In patches 1/8 and 2/8, I
> extended the arm64 levels to support N * CONT_PTE, and let the
> final PTE mapping use the maximum possible batch after avoiding
> zigzag. This further improves all orders greater than CONT_PTE.
> 
> Thanks
> Barry



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox