The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit
@ 2026-06-18  9:02 Vijayanand Jitta
  2026-06-19 19:40 ` Daniel Mentz
  2026-07-03 16:12 ` Jason Gunthorpe
  0 siblings, 2 replies; 4+ messages in thread
From: Vijayanand Jitta @ 2026-06-18  9:02 UTC (permalink / raw)
  To: Joerg Roedel (AMD), Will Deacon, Robin Murphy
  Cc: linux-arm-msm, iommu, linux-kernel, linux-arm-kernel,
	Prakash Gupta, Vijayanand Jitta

From: Prakash Gupta <prakash.gupta@oss.qualcomm.com>

Add support for the contiguous hint (CONT) bit in ARM LPAE page tables.
When a set of consecutive PTEs map a naturally-aligned contiguous block
of memory, the CONT bit can be set on all entries in the group to allow
the hardware to combine them into a single TLB entry, improving TLB
utilization.

The contiguous hint sizes per granule are:

  Page Size | CONT PTE |  PMD  | CONT PMD
  ----------+----------+-------+---------
      4K    |   64K    |   2M  |   32M
     16K    |    2M    |  32M  |    1G
     64K    |    2M    | 512M  |   16G

Contiguous hint sizes are advertised in pgsize_bitmap, analogous to
how the CPU MMU advertises them via hugetlb hstates, so that IOMMU API
users (e.g. __iommu_dma_alloc_pages()) can align allocations to these
sizes and benefit from the TLB optimization automatically.

Support is gated behind CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT, which
provides a compile-time opt-out for hardware affected by SMMU errata
related to the contiguous bit.

On the mapping side, __arm_lpae_map() detects when the requested size
matches a contiguous range at the next level, sets the CONT bit on all
PTEs in the group, then recurses with the base block size and an
adjusted pgcount.

On the unmapping side, the CONT bit is cleared from all PTEs in the
affected contiguous group before any individual entry is invalidated,
following the Break-Before-Make requirement of the architecture.

Tested on QEMU (arm64/SMMUv3) with iommu_map()/iommu_unmap() of
contiguous hint sizes; verified the CONT bit is correctly set on map
and cleared on unmap via page table walk.

Co-developed-by: Vijayanand Jitta <vijayanand.jitta@oss.qualcomm.com>
Signed-off-by: Vijayanand Jitta <vijayanand.jitta@oss.qualcomm.com>
Signed-off-by: Prakash Gupta <prakash.gupta@oss.qualcomm.com>
---
 drivers/iommu/Kconfig          |  16 +++
 drivers/iommu/io-pgtable-arm.c | 216 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 226 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6e07bd69467a3..1c514361c5c9e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -50,6 +50,22 @@ config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
 
 	  If unsure, say N here.
 
+config IOMMU_IO_PGTABLE_CONTIG_HINT
+	bool "Enable contiguous hint"
+	depends on IOMMU_IO_PGTABLE_LPAE
+	default y
+	help
+	  Enable contiguous hint (CONT bit) support for the ARM LPAE page
+	  table allocator. Contiguous hint sizes are advertised in the
+	  pgsize_bitmap so that IOMMU API users can align allocations to
+	  these sizes and benefit from improved TLB utilization, analogous
+	  to how the CPU MMU advertises contiguous sizes via hugetlb.
+
+	  Disabling this option provides a compile-time opt-out for
+	  hardware affected by SMMU errata related to the contiguous bit.
+
+	  If unsure, say Y here.
+
 config IOMMU_IO_PGTABLE_ARMV7S
 	bool "ARMv7/v8 Short Descriptor Format"
 	select IOMMU_IO_PGTABLE
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 476c0e25631af..9fc60520177f1 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -86,6 +86,21 @@
 /* Software bit for solving coherency races */
 #define ARM_LPAE_PTE_SW_SYNC		(((arm_lpae_iopte)1) << 55)
 
+/* PTE Contiguous Bit */
+#define ARM_LPAE_PTE_CONT		(((arm_lpae_iopte)1) << 52)
+
+/*
+ * CONTIG HINT SUPPORT TABLE
+ *
+ *---------------------------------------------------
+ *| Page Size | CONT PTE |  PMD  | CONT PMD |  PUD  |
+ *---------------------------------------------------
+ *|     4K    |   64K    |   2M  |    32M   |   1G  |
+ *|    16K    |    2M    |  32M  |     1G   |       |
+ *|    64K    |    2M    | 512M  |    16G   |       |
+ *---------------------------------------------------
+ */
+
 /* Stage-1 PTE */
 #define ARM_LPAE_PTE_AP_UNPRIV		(((arm_lpae_iopte)1) << 6)
 #define ARM_LPAE_PTE_AP_RDONLY_BIT	7
@@ -453,6 +468,111 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
 	return old;
 }
 
+#ifdef CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT
+static inline int arm_lpae_cont_ptes(unsigned long size)
+{
+	if (size == SZ_4K)
+		return 16;
+	if (size == SZ_16K)
+		return 128;
+	if (size == SZ_64K)
+		return 32;
+	return 1;
+}
+
+static inline unsigned long arm_lpae_cont_pte_size(unsigned long size)
+{
+	return arm_lpae_cont_ptes(size) * size;
+}
+
+static inline int arm_lpae_cont_pmds(unsigned long size)
+{
+	if (size == SZ_2M)
+		return 16;
+	if (size == SZ_32M)
+		return 32;
+	if (size == SZ_512M)
+		return 32;
+	return 1;
+}
+
+static inline unsigned long arm_lpae_cont_pmd_size(unsigned long size)
+{
+	return arm_lpae_cont_pmds(size) * size;
+}
+
+static unsigned long arm_lpae_get_cont_sizes(struct io_pgtable_cfg *cfg)
+{
+	unsigned long pg_size, pmd_size;
+	int pg_shift, bits_per_level;
+
+	if (!cfg->pgsize_bitmap)
+		return 0;
+
+	pg_shift = __ffs(cfg->pgsize_bitmap);
+	bits_per_level = pg_shift - ilog2(sizeof(arm_lpae_iopte));
+	pg_size = (1UL << pg_shift);
+	pmd_size = (pg_size << bits_per_level);
+
+	return (arm_lpae_cont_pte_size(pg_size) | arm_lpae_cont_pmd_size(pmd_size));
+}
+
+static u32 arm_lpae_find_num_cont(struct arm_lpae_io_pgtable *data, int lvl)
+{
+	if (lvl == ARM_LPAE_MAX_LEVELS - 2)
+		return arm_lpae_cont_pmds(ARM_LPAE_BLOCK_SIZE(lvl, data));
+	else if (lvl == ARM_LPAE_MAX_LEVELS - 1)
+		return arm_lpae_cont_ptes(ARM_LPAE_BLOCK_SIZE(lvl, data));
+	else
+		return 1;
+}
+
+static u32 arm_lpae_check_num_cont(struct arm_lpae_io_pgtable *data, size_t size, int lvl)
+{
+	int num_cont;
+
+	num_cont = arm_lpae_find_num_cont(data, lvl);
+	if (size == num_cont * ARM_LPAE_BLOCK_SIZE(lvl, data))
+		return num_cont;
+	else
+		return 1;
+}
+
+static bool arm_lpae_pte_is_contiguous_range(struct arm_lpae_io_pgtable *data,
+					     unsigned long size,
+					     int lvl, u32 *num_cont)
+{
+	unsigned long block_size;
+
+	*num_cont = arm_lpae_find_num_cont(data, lvl);
+	block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+	return (size == ((*num_cont) * block_size));
+}
+#else
+static unsigned long arm_lpae_get_cont_sizes(struct io_pgtable_cfg *cfg)
+{
+	return 0;
+}
+
+static u32 arm_lpae_find_num_cont(struct arm_lpae_io_pgtable *data, int lvl)
+{
+	return 1;
+}
+
+static u32 arm_lpae_check_num_cont(struct arm_lpae_io_pgtable *data, size_t size, int lvl)
+{
+	return 1;
+}
+
+static bool arm_lpae_pte_is_contiguous_range(struct arm_lpae_io_pgtable *data,
+					     unsigned long size,
+					     int lvl, u32 *num_cont)
+{
+	return false;
+}
+#endif
+
 static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 			  phys_addr_t paddr, size_t size, size_t pgcount,
 			  arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep,
@@ -463,6 +583,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 	size_t tblsz = ARM_LPAE_GRANULE(data);
 	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 	int ret = 0, num_entries, max_entries, map_idx_start;
+	u32 num_cont = 1;
 
 	/* Find our entry at the current level */
 	map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
@@ -505,6 +626,24 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
 		return -EEXIST;
 	}
 
+	if (arm_lpae_pte_is_contiguous_range(data, size, lvl + 1, &num_cont)) {
+		size_t ct_size = ARM_LPAE_BLOCK_SIZE(lvl + 1, data);
+
+		/* Set cont bit */
+		prot |= ARM_LPAE_PTE_CONT;
+
+		/*
+		 * Since size here would be of CONT_PTE or CONT_PMD (e.g. SZ_64K/SZ_32M
+		 * in case of 4K PAGE_SIZE), but actual mappings are in multiples of
+		 * SZ_4K/SZ_2M, call __arm_lpae_map with ct_size and update pgcount
+		 * accordingly by num_cont * pgcount.
+		 */
+		ret = __arm_lpae_map(data, iova, paddr, ct_size,
+				     num_cont * pgcount,
+				     prot, lvl + 1, cptep, gfp, mapped);
+		return ret;
+	}
+
 	/* Rinse, repeat */
 	return __arm_lpae_map(data, iova, paddr, size, pgcount, prot, lvl + 1,
 			      cptep, gfp, mapped);
@@ -653,6 +792,48 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
 	kfree(data);
 }
 
+#ifdef CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT
+static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
+				unsigned long iova, int lvl,
+				arm_lpae_iopte *ptep, size_t num_entries)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	u32 num_cont = arm_lpae_find_num_cont(data, lvl);
+	arm_lpae_iopte *cont_ptep;
+	arm_lpae_iopte *cont_ptep_start;
+	unsigned long cont_iova;
+	int offset, itr;
+
+	cont_ptep = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+	cont_iova = round_down(iova,
+			       ARM_LPAE_BLOCK_SIZE(lvl, data) * num_cont);
+	cont_ptep += ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
+	cont_ptep_start = cont_ptep;
+
+	/*
+	 * iova may not be aligned to the contiguous group boundary; include
+	 * any leading entries so round_up() covers all overlapping groups.
+	 */
+	offset = ARM_LPAE_LVL_IDX(iova, lvl, data) -
+		 ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
+	num_entries = round_up(offset + num_entries, num_cont);
+
+	for (itr = 0; itr < num_entries; itr++) {
+		WRITE_ONCE(*cont_ptep, READ_ONCE(*cont_ptep) & ~ARM_LPAE_PTE_CONT);
+		cont_ptep++;
+	}
+
+	if (!cfg->coherent_walk)
+		__arm_lpae_sync_pte(cont_ptep_start, num_entries, cfg);
+}
+#else
+static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
+				unsigned long iova, int lvl,
+				arm_lpae_iopte *ptep, size_t num_entries)
+{
+}
+#endif
+
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			       struct iommu_iotlb_gather *gather,
 			       unsigned long iova, size_t size, size_t pgcount,
@@ -660,7 +841,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 {
 	arm_lpae_iopte pte;
 	struct io_pgtable *iop = &data->iop;
-	int i = 0, num_entries, max_entries, unmap_idx_start;
+	int i = 0, num_cont = 1, num_entries, max_entries, unmap_idx_start;
 
 	/* Something went horribly wrong and we ran out of page table */
 	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
@@ -675,9 +856,15 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 	}
 
 	/* If the size matches this level, we're in the right place */
-	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
+	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data) ||
+	    (size == arm_lpae_find_num_cont(data, lvl) *
+		     ARM_LPAE_BLOCK_SIZE(lvl, data))) {
+		size_t pte_size;
+
 		max_entries = arm_lpae_max_entries(unmap_idx_start, data);
-		num_entries = min_t(int, pgcount, max_entries);
+		num_cont = arm_lpae_check_num_cont(data, size, lvl);
+		num_entries = min_t(int, num_cont * pgcount, max_entries);
+		pte_size = size / num_cont;
 
 		/* Find and handle non-leaf entries */
 		for (i = 0; i < num_entries; i++) {
@@ -687,11 +874,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 				break;
 			}
 
+			/*
+			 * Break-Before-Make: before invalidating any leaf
+			 * entry, clear the CONT bit from every entry in the
+			 * contiguous group(s) and flush the TLB, as required
+			 * by the architecture.  arm_lpae_cont_clear() covers
+			 * the full [iova, iova + num_entries * pte_size) range
+			 * via round_up(), so subsequent entries read back
+			 * CONT=0 and skip this block.
+			 */
+			if (pte & ARM_LPAE_PTE_CONT) {
+				arm_lpae_cont_clear(data, iova, lvl, ptep, num_entries);
+				io_pgtable_tlb_flush_walk(iop, iova,
+							  num_entries * pte_size,
+							  ARM_LPAE_GRANULE(data));
+			}
+
 			if (!iopte_leaf(pte, lvl, iop->fmt)) {
 				__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
 
 				/* Also flush any partial walks */
-				io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
+				io_pgtable_tlb_flush_walk(iop, iova + i * pte_size, pte_size,
 							  ARM_LPAE_GRANULE(data));
 				__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
 			}
@@ -702,9 +905,9 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 
 		if (gather && !iommu_iotlb_gather_queued(gather))
 			for (int j = 0; j < i; j++)
-				io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
+				io_pgtable_tlb_add_page(iop, gather, iova + j * pte_size, pte_size);
 
-		return i * size;
+		return i * pte_size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
 		WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
 		return 0;
@@ -943,6 +1146,7 @@ static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
 	}
 
 	cfg->pgsize_bitmap &= page_sizes;
+	cfg->pgsize_bitmap |= arm_lpae_get_cont_sizes(cfg);
 	cfg->ias = min(cfg->ias, max_addr_bits);
 	cfg->oas = min(cfg->oas, max_addr_bits);
 }

---
base-commit: 4fa3f5fabb30bf00d7475d5a33459ea83d639bf9
change-id: 20260618-iommu_contig_hint-71ae491fbb52

Best regards,
--  
Vijayanand Jitta <vijayanand.jitta@oss.qualcomm.com>


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit
  2026-06-18  9:02 [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit Vijayanand Jitta
@ 2026-06-19 19:40 ` Daniel Mentz
  2026-06-25  5:47   ` Vijayanand Jitta
  2026-07-03 16:12 ` Jason Gunthorpe
  1 sibling, 1 reply; 4+ messages in thread
From: Daniel Mentz @ 2026-06-19 19:40 UTC (permalink / raw)
  To: Vijayanand Jitta
  Cc: Joerg Roedel (AMD), Will Deacon, Robin Murphy, linux-arm-msm,
	iommu, linux-kernel, linux-arm-kernel, Prakash Gupta

On Thu, Jun 18, 2026 at 2:06 AM Vijayanand Jitta
<vijayanand.jitta@oss.qualcomm.com> wrote:
> Support is gated behind CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT, which
> provides a compile-time opt-out for hardware affected by SMMU errata
> related to the contiguous bit.

Have you considered making this a runtime option? Compare this with
arm_smmu_device_iidr_probe() where the smmuv3 driver disables certain
features based on the identified implementation and the errata
affecting that implementation.

> On the mapping side, __arm_lpae_map() detects when the requested size
> matches a contiguous range at the next level, sets the CONT bit on all
> PTEs in the group, then recurses with the base block size and an
> adjusted pgcount.

I would perform this check at the current level not the previous
level. See comments below.

>
> On the unmapping side, the CONT bit is cleared from all PTEs in the
> affected contiguous group before any individual entry is invalidated,
> following the Break-Before-Make requirement of the architecture.

My understanding is that for unmap operations, the following rule applies:

The IOVA range targeted by an unmap operation must exactly match the
IOVA range of a previous map operation. Partial unmap operations are
not allowed.

The iopgtable code previously had a function named
arm_lpae_split_blk_unmap() which allowed a block mapping to be split
up. However, that function has since been removed, which aligns with
prohibiting partial unmaps.
The other concern I have is a potential race condition: While one
thread clears the contiguous bit, another thread could try to unmap
the same descriptor.

Consider dropping support for partial unmap and just triggering a
WARN_ON() if you detect that a contiguous group is partially unmapped.

> +static inline int arm_lpae_cont_pmds(unsigned long size)

PMD is not a term that is used in this file. I advise against
introducing this term.

> +static u32 arm_lpae_find_num_cont(struct arm_lpae_io_pgtable *data, int lvl)
> +{
> +       if (lvl == ARM_LPAE_MAX_LEVELS - 2)
> +               return arm_lpae_cont_pmds(ARM_LPAE_BLOCK_SIZE(lvl, data));
> +       else if (lvl == ARM_LPAE_MAX_LEVELS - 1)
> +               return arm_lpae_cont_ptes(ARM_LPAE_BLOCK_SIZE(lvl, data));

Consider supporting the contiguous bit at lookup level 1.

>  static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>                           phys_addr_t paddr, size_t size, size_t pgcount,
>                           arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep,
> @@ -463,6 +583,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>         size_t tblsz = ARM_LPAE_GRANULE(data);
>         struct io_pgtable_cfg *cfg = &data->iop.cfg;
>         int ret = 0, num_entries, max_entries, map_idx_start;
> +       u32 num_cont = 1;
>
>         /* Find our entry at the current level */
>         map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
> @@ -505,6 +626,24 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>                 return -EEXIST;
>         }
>
> +       if (arm_lpae_pte_is_contiguous_range(data, size, lvl + 1, &num_cont)) {

I would recommend performing this check at the actual level not at the
previous lookup level i.e. not at the (lvl - 1) level. Imagine the
following situation: The granule size is 4KB, the initial lookup level
is 2, and size is 32MB. I'm wondering if in that case, it'll just keep
recursing until it hits (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1)).

> +#ifdef CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT
> +static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
> +                               unsigned long iova, int lvl,
> +                               arm_lpae_iopte *ptep, size_t num_entries)
> +{
> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +       u32 num_cont = arm_lpae_find_num_cont(data, lvl);
> +       arm_lpae_iopte *cont_ptep;
> +       arm_lpae_iopte *cont_ptep_start;
> +       unsigned long cont_iova;
> +       int offset, itr;
> +
> +       cont_ptep = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
> +       cont_iova = round_down(iova,
> +                              ARM_LPAE_BLOCK_SIZE(lvl, data) * num_cont);

As a result of this round_down() function, you are accessing a
descriptor that describes an IOVA outside the range targeted by the
iommu_unmap call. Consequently, you might race against another thread
accessing the same descriptor.

> +       cont_ptep += ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
> +       cont_ptep_start = cont_ptep;
> +
> +       /*
> +        * iova may not be aligned to the contiguous group boundary; include
> +        * any leading entries so round_up() covers all overlapping groups.
> +        */
> +       offset = ARM_LPAE_LVL_IDX(iova, lvl, data) -
> +                ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
> +       num_entries = round_up(offset + num_entries, num_cont);
> +
> +       for (itr = 0; itr < num_entries; itr++) {
> +               WRITE_ONCE(*cont_ptep, READ_ONCE(*cont_ptep) & ~ARM_LPAE_PTE_CONT);

This read-modify-write operation is not safe due to the potential race
described above.

> +               cont_ptep++;
> +       }
> +
> +       if (!cfg->coherent_walk)
> +               __arm_lpae_sync_pte(cont_ptep_start, num_entries, cfg);
> +}
> +#else
> +static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
> +                               unsigned long iova, int lvl,
> +                               arm_lpae_iopte *ptep, size_t num_entries)
> +{
> +}
> +#endif
> +
>  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>                                struct iommu_iotlb_gather *gather,
>                                unsigned long iova, size_t size, size_t pgcount,
> @@ -660,7 +841,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  {
>         arm_lpae_iopte pte;
>         struct io_pgtable *iop = &data->iop;
> -       int i = 0, num_entries, max_entries, unmap_idx_start;
> +       int i = 0, num_cont = 1, num_entries, max_entries, unmap_idx_start;
>
>         /* Something went horribly wrong and we ran out of page table */
>         if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
> @@ -675,9 +856,15 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>         }
>
>         /* If the size matches this level, we're in the right place */
> -       if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
> +       if (size == ARM_LPAE_BLOCK_SIZE(lvl, data) ||
> +           (size == arm_lpae_find_num_cont(data, lvl) *
> +                    ARM_LPAE_BLOCK_SIZE(lvl, data))) {
> +               size_t pte_size;
> +
>                 max_entries = arm_lpae_max_entries(unmap_idx_start, data);
> -               num_entries = min_t(int, pgcount, max_entries);
> +               num_cont = arm_lpae_check_num_cont(data, size, lvl);
> +               num_entries = min_t(int, num_cont * pgcount, max_entries);
> +               pte_size = size / num_cont;
>
>                 /* Find and handle non-leaf entries */
>                 for (i = 0; i < num_entries; i++) {
> @@ -687,11 +874,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>                                 break;
>                         }
>
> +                       /*
> +                        * Break-Before-Make: before invalidating any leaf
> +                        * entry, clear the CONT bit from every entry in the
> +                        * contiguous group(s) and flush the TLB, as required
> +                        * by the architecture.  arm_lpae_cont_clear() covers
> +                        * the full [iova, iova + num_entries * pte_size) range
> +                        * via round_up(), so subsequent entries read back
> +                        * CONT=0 and skip this block.
> +                        */
> +                       if (pte & ARM_LPAE_PTE_CONT) {
> +                               arm_lpae_cont_clear(data, iova, lvl, ptep, num_entries);
> +                               io_pgtable_tlb_flush_walk(iop, iova,
> +                                                         num_entries * pte_size,
> +                                                         ARM_LPAE_GRANULE(data));

I believe this is inefficient. Consider the case where we unmap 2MB
worth of IOVA space mapped by 512 4KB page descriptors with the
contiguous bit set. If I'm not mistaken, you're running CMOs
(__arm_lpae_sync_pte) twice for every page descriptor. In addition,
io_pgtable_tlb_flush_walk() will submit an extra CMD_SYNC and wait for
its completion.

Additionally, you perform rounding in arm_lpae_cont_clear(). However,
io_pgtable_tlb_flush_walk() is called on the original, potentially
unaligned range. Can this lead to under invalidation? Again, my
preference would be to drop support for partial unmaps which would
also remove the requirement for calling io_pgtable_tlb_flush_walk()
here.

> +                       }
> +
>                         if (!iopte_leaf(pte, lvl, iop->fmt)) {
>                                 __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
>
>                                 /* Also flush any partial walks */
> -                               io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
> +                               io_pgtable_tlb_flush_walk(iop, iova + i * pte_size, pte_size,
>                                                           ARM_LPAE_GRANULE(data));
>                                 __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
>                         }

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit
  2026-06-19 19:40 ` Daniel Mentz
@ 2026-06-25  5:47   ` Vijayanand Jitta
  0 siblings, 0 replies; 4+ messages in thread
From: Vijayanand Jitta @ 2026-06-25  5:47 UTC (permalink / raw)
  To: Daniel Mentz
  Cc: Joerg Roedel (AMD), Will Deacon, Robin Murphy, linux-arm-msm,
	iommu, linux-kernel, linux-arm-kernel, Prakash Gupta



On 6/20/2026 1:10 AM, Daniel Mentz wrote:
> On Thu, Jun 18, 2026 at 2:06 AM Vijayanand Jitta
> <vijayanand.jitta@oss.qualcomm.com> wrote:
>> Support is gated behind CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT, which
>> provides a compile-time opt-out for hardware affected by SMMU errata
>> related to the contiguous bit.
> 
> Have you considered making this a runtime option? Compare this with
> arm_smmu_device_iidr_probe() where the smmuv3 driver disables certain
> features based on the identified implementation and the errata
> affecting that implementation.
> 

Thanks for the review comments.

Good point. I’ll drop the Kconfig switch and make this runtime-controlled
via an io-pgtable quirk, so SMMU drivers can disable CONT based on errata.

>> On the mapping side, __arm_lpae_map() detects when the requested size
>> matches a contiguous range at the next level, sets the CONT bit on all
>> PTEs in the group, then recurses with the base block size and an
>> adjusted pgcount.
> 
> I would perform this check at the current level not the previous
> level. See comments below.
> 

Sure, will update this check at current level.

>>
>> On the unmapping side, the CONT bit is cleared from all PTEs in the
>> affected contiguous group before any individual entry is invalidated,
>> following the Break-Before-Make requirement of the architecture.
> 
> My understanding is that for unmap operations, the following rule applies:
> 
> The IOVA range targeted by an unmap operation must exactly match the
> IOVA range of a previous map operation. Partial unmap operations are
> not allowed.
> 
> The iopgtable code previously had a function named
> arm_lpae_split_blk_unmap() which allowed a block mapping to be split
> up. However, that function has since been removed, which aligns with
> prohibiting partial unmaps.
> The other concern I have is a potential race condition: While one
> thread clears the contiguous bit, another thread could try to unmap
> the same descriptor.
> 
> Consider dropping support for partial unmap and just triggering a
> WARN_ON() if you detect that a contiguous group is partially unmapped.
> 

Sure, will drop partial unmap support and I'll update with WARN_ON()
as suggested.

>> +static inline int arm_lpae_cont_pmds(unsigned long size)
> 
> PMD is not a term that is used in this file. I advise against
> introducing this term.
> 

Agreed, I’ll avoid PMD terminology here and rename those helpers/comments
to use block-level wording.

>> +static u32 arm_lpae_find_num_cont(struct arm_lpae_io_pgtable *data, int lvl)
>> +{
>> +       if (lvl == ARM_LPAE_MAX_LEVELS - 2)
>> +               return arm_lpae_cont_pmds(ARM_LPAE_BLOCK_SIZE(lvl, data));
>> +       else if (lvl == ARM_LPAE_MAX_LEVELS - 1)
>> +               return arm_lpae_cont_ptes(ARM_LPAE_BLOCK_SIZE(lvl, data));
> 
> Consider supporting the contiguous bit at lookup level 1.
> 

Sure.

>>  static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>>                           phys_addr_t paddr, size_t size, size_t pgcount,
>>                           arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep,
>> @@ -463,6 +583,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>>         size_t tblsz = ARM_LPAE_GRANULE(data);
>>         struct io_pgtable_cfg *cfg = &data->iop.cfg;
>>         int ret = 0, num_entries, max_entries, map_idx_start;
>> +       u32 num_cont = 1;
>>
>>         /* Find our entry at the current level */
>>         map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
>> @@ -505,6 +626,24 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>>                 return -EEXIST;
>>         }
>>
>> +       if (arm_lpae_pte_is_contiguous_range(data, size, lvl + 1, &num_cont)) {
> 
> I would recommend performing this check at the actual level not at the
> previous lookup level i.e. not at the (lvl - 1) level. Imagine the
> following situation: The granule size is 4KB, the initial lookup level
> is 2, and size is 32MB. I'm wondering if in that case, it'll just keep
> recursing until it hits (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1)).
> 

Right, I see your point. The contiguous-size check should be done against the
current level, I’ll fix that in v2.

>> +#ifdef CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT
>> +static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
>> +                               unsigned long iova, int lvl,
>> +                               arm_lpae_iopte *ptep, size_t num_entries)
>> +{
>> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
>> +       u32 num_cont = arm_lpae_find_num_cont(data, lvl);
>> +       arm_lpae_iopte *cont_ptep;
>> +       arm_lpae_iopte *cont_ptep_start;
>> +       unsigned long cont_iova;
>> +       int offset, itr;
>> +
>> +       cont_ptep = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
>> +       cont_iova = round_down(iova,
>> +                              ARM_LPAE_BLOCK_SIZE(lvl, data) * num_cont);
> 
> As a result of this round_down() function, you are accessing a
> descriptor that describes an IOVA outside the range targeted by the
> iommu_unmap call. Consequently, you might race against another thread
> accessing the same descriptor.
> 

Agreed. I’m going to drop partial-unmap handling for contiguous groups,
so we will only operate on an exact aligned contiguous range and
reject partial unmaps with WARN_ON(). That also removes the need for
the current round_down()-based logic.

>> +       cont_ptep += ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
>> +       cont_ptep_start = cont_ptep;
>> +
>> +       /*
>> +        * iova may not be aligned to the contiguous group boundary; include
>> +        * any leading entries so round_up() covers all overlapping groups.
>> +        */
>> +       offset = ARM_LPAE_LVL_IDX(iova, lvl, data) -
>> +                ARM_LPAE_LVL_IDX(cont_iova, lvl, data);
>> +       num_entries = round_up(offset + num_entries, num_cont);
>> +
>> +       for (itr = 0; itr < num_entries; itr++) {
>> +               WRITE_ONCE(*cont_ptep, READ_ONCE(*cont_ptep) & ~ARM_LPAE_PTE_CONT);
> 
> This read-modify-write operation is not safe due to the potential race
> described above.
> 

With partial unmap support removed, I suppose this should be fine now.

>> +               cont_ptep++;
>> +       }
>> +
>> +       if (!cfg->coherent_walk)
>> +               __arm_lpae_sync_pte(cont_ptep_start, num_entries, cfg);
>> +}
>> +#else
>> +static void arm_lpae_cont_clear(struct arm_lpae_io_pgtable *data,
>> +                               unsigned long iova, int lvl,
>> +                               arm_lpae_iopte *ptep, size_t num_entries)
>> +{
>> +}
>> +#endif
>> +
>>  static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>                                struct iommu_iotlb_gather *gather,
>>                                unsigned long iova, size_t size, size_t pgcount,
>> @@ -660,7 +841,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>  {
>>         arm_lpae_iopte pte;
>>         struct io_pgtable *iop = &data->iop;
>> -       int i = 0, num_entries, max_entries, unmap_idx_start;
>> +       int i = 0, num_cont = 1, num_entries, max_entries, unmap_idx_start;
>>
>>         /* Something went horribly wrong and we ran out of page table */
>>         if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
>> @@ -675,9 +856,15 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>         }
>>
>>         /* If the size matches this level, we're in the right place */
>> -       if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
>> +       if (size == ARM_LPAE_BLOCK_SIZE(lvl, data) ||
>> +           (size == arm_lpae_find_num_cont(data, lvl) *
>> +                    ARM_LPAE_BLOCK_SIZE(lvl, data))) {
>> +               size_t pte_size;
>> +
>>                 max_entries = arm_lpae_max_entries(unmap_idx_start, data);
>> -               num_entries = min_t(int, pgcount, max_entries);
>> +               num_cont = arm_lpae_check_num_cont(data, size, lvl);
>> +               num_entries = min_t(int, num_cont * pgcount, max_entries);
>> +               pte_size = size / num_cont;
>>
>>                 /* Find and handle non-leaf entries */
>>                 for (i = 0; i < num_entries; i++) {
>> @@ -687,11 +874,27 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>                                 break;
>>                         }
>>
>> +                       /*
>> +                        * Break-Before-Make: before invalidating any leaf
>> +                        * entry, clear the CONT bit from every entry in the
>> +                        * contiguous group(s) and flush the TLB, as required
>> +                        * by the architecture.  arm_lpae_cont_clear() covers
>> +                        * the full [iova, iova + num_entries * pte_size) range
>> +                        * via round_up(), so subsequent entries read back
>> +                        * CONT=0 and skip this block.
>> +                        */
>> +                       if (pte & ARM_LPAE_PTE_CONT) {
>> +                               arm_lpae_cont_clear(data, iova, lvl, ptep, num_entries);
>> +                               io_pgtable_tlb_flush_walk(iop, iova,
>> +                                                         num_entries * pte_size,
>> +                                                         ARM_LPAE_GRANULE(data));
> 
> I believe this is inefficient. Consider the case where we unmap 2MB
> worth of IOVA space mapped by 512 4KB page descriptors with the
> contiguous bit set. If I'm not mistaken, you're running CMOs
> (__arm_lpae_sync_pte) twice for every page descriptor. In addition,
> io_pgtable_tlb_flush_walk() will submit an extra CMD_SYNC and wait for
> it's completion.
> 
> Additionally, you perform rounding in arm_lpae_cont_clear(). However,
> io_pgtable_tlb_flush_walk() is called on the original, potentially
> unaligned range. Can this lead to under invalidation? Again, my
> preference would be to drop support for partial unmaps which would
> also remove the requirement for calling io_pgtable_tlb_flush_walk()
> here.
> 

Agreed. The current unmap path is more complex and expensive than necessary.
Since partial unmap of contiguous groups should not be supported, I will remove
the rounding-based handling and only permit unmaps that exactly match an
aligned contiguous group. That also eliminates the need for the 
extra io_pgtable_tlb_flush_walk() here.

Thanks,
Vijay

>> +                       }
>> +
>>                         if (!iopte_leaf(pte, lvl, iop->fmt)) {
>>                                 __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
>>
>>                                 /* Also flush any partial walks */
>> -                               io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
>> +                               io_pgtable_tlb_flush_walk(iop, iova + i * pte_size, pte_size,
>>                                                           ARM_LPAE_GRANULE(data));
>>                                 __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
>>                         }


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit
  2026-06-18  9:02 [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit Vijayanand Jitta
  2026-06-19 19:40 ` Daniel Mentz
@ 2026-07-03 16:12 ` Jason Gunthorpe
  1 sibling, 0 replies; 4+ messages in thread
From: Jason Gunthorpe @ 2026-07-03 16:12 UTC (permalink / raw)
  To: Vijayanand Jitta
  Cc: Joerg Roedel (AMD), Will Deacon, Robin Murphy, linux-arm-msm,
	iommu, linux-kernel, linux-arm-kernel, Prakash Gupta

On Thu, Jun 18, 2026 at 02:32:09PM +0530, Vijayanand Jitta wrote:
> From: Prakash Gupta <prakash.gupta@oss.qualcomm.com>
> 
> Add support for the contiguous hint (CONT) bit in ARM LPAE page tables.
> When a set of consecutive PTEs map a naturally-aligned contiguous block
> of memory, the CONT bit can be set on all entries in the group to allow
> the hardware to combine them into a single TLB entry, improving TLB
> utilization.
> 
> The contiguous hint sizes per granule are:
> 
>   Page Size | CONT PTE |  PMD  | CONT PMD
>   ----------+----------+-------+---------
>       4K    |   64K    |   2M  |   32M
>      16K    |    2M    |  32M  |    1G
>      64K    |    2M    | 512M  |   16G

My series to convert smmuv3 to the iommupt takes care of this and
supports all the orders too. I'd rather we move forward with that than
try to patch this up.

> Support is gated behind CONFIG_IOMMU_IO_PGTABLE_CONTIG_HINT, which
> provides a compile-time opt-out for hardware affected by SMMU errata
> related to the contiguous bit.

I reviewed the errata and didn't find any related to contig that
required disabling contig; the driver was OK with the ones I
found. Did you find something specific?

> On the unmapping side, the CONT bit is cleared from all PTEs in the
> affected contiguous group before any individual entry is invalidated,
> following the Break-Before-Make requirement of the architecture.

BBM means you make it non-present and flush; it doesn't mean you clear
CONT. This is wrong.

Jason

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-07-03 16:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-18  9:02 [PATCH] iommu/io-pgtable-arm: Add support for contiguous hint bit Vijayanand Jitta
2026-06-19 19:40 ` Daniel Mentz
2026-06-25  5:47   ` Vijayanand Jitta
2026-07-03 16:12 ` Jason Gunthorpe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox