From: Jason Gunthorpe <jgg@nvidia.com>
To: iommu@lists.linux.dev, Joerg Roedel <joro@8bytes.org>,
Robin Murphy <robin.murphy@arm.com>,
Will Deacon <will@kernel.org>
Cc: Kevin Tian <kevin.tian@intel.com>,
patches@lists.linux.dev, Samiullah Khawaja <skhawaja@google.com>
Subject: [PATCH v3 2/2] iommupt: Avoid rewalking during map
Date: Fri, 27 Feb 2026 15:30:11 -0400 [thread overview]
Message-ID: <2-v3-a1777ea76519+370f-iommpt_map_direct_jgg@nvidia.com> (raw)
In-Reply-To: <0-v3-a1777ea76519+370f-iommpt_map_direct_jgg@nvidia.com>
Currently the core code provides a simplified interface to drivers where
it fragments a requested multi-page map into single page size steps after
doing all the calculations to figure out what page size is
appropriate. Each step rewalks the page tables from the start.
Since iommupt has a single implementation of the mapping algorithm it can
internally compute each step as it goes while retaining its current
position in the walk.
Add a new function pt_pgsz_count() which computes the same page size
fragment of a large mapping operation.
Compute the next fragment when all the leaf entries of the current
fragment have been written, then continue walking from the current
point.
The function pointer is run through pt_iommu_ops instead of
iommu_domain_ops to discourage using it outside iommupt. All drivers with
their own page tables should continue to use the simplified map_pages()
style interfaces.
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
drivers/iommu/generic_pt/iommu_pt.h | 133 ++++++++++++--------
drivers/iommu/generic_pt/kunit_generic_pt.h | 12 ++
drivers/iommu/generic_pt/pt_iter.h | 22 ++++
drivers/iommu/iommu.c | 39 ++++--
include/linux/generic_pt/iommu.h | 34 ++++-
5 files changed, 175 insertions(+), 65 deletions(-)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 62d6ae2a97ba63..d67bc5a09fccb3 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -466,6 +466,7 @@ struct pt_iommu_map_args {
pt_oaddr_t oa;
unsigned int leaf_pgsize_lg2;
unsigned int leaf_level;
+ pt_vaddr_t num_leaves;
};
/*
@@ -518,11 +519,15 @@ static int clear_contig(const struct pt_state *start_pts,
static int __map_range_leaf(struct pt_range *range, void *arg,
unsigned int level, struct pt_table_p *table)
{
+ struct pt_iommu *iommu_table = iommu_from_common(range->common);
struct pt_state pts = pt_init(range, level, table);
struct pt_iommu_map_args *map = arg;
unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
unsigned int start_index;
pt_oaddr_t oa = map->oa;
+ unsigned int num_leaves;
+ unsigned int orig_end;
+ pt_vaddr_t last_va;
unsigned int step;
bool need_contig;
int ret = 0;
@@ -536,6 +541,15 @@ static int __map_range_leaf(struct pt_range *range, void *arg,
_pt_iter_first(&pts);
start_index = pts.index;
+ orig_end = pts.end_index;
+ if (pts.index + map->num_leaves < pts.end_index) {
+ /* Need to stop in the middle of the table to change sizes */
+ pts.end_index = pts.index + map->num_leaves;
+ num_leaves = 0;
+ } else {
+ num_leaves = map->num_leaves - (pts.end_index - pts.index);
+ }
+
do {
pts.type = pt_load_entry_raw(&pts);
if (pts.type != PT_ENTRY_EMPTY || need_contig) {
@@ -561,7 +575,40 @@ static int __map_range_leaf(struct pt_range *range, void *arg,
flush_writes_range(&pts, start_index, pts.index);
map->oa = oa;
- return ret;
+ map->num_leaves = num_leaves;
+ if (ret || num_leaves)
+ return ret;
+
+ /* range->va is not valid if we reached the end of the table */
+ pts.index -= step;
+ pt_index_to_va(&pts);
+ pts.index += step;
+ last_va = range->va + log2_to_int(leaf_pgsize_lg2);
+
+ if (last_va - 1 == range->last_va) {
+ PT_WARN_ON(pts.index != orig_end);
+ return 0;
+ }
+
+ /*
+ * Reached a point where the page size changed, compute the new
+ * parameters.
+ */
+ map->leaf_pgsize_lg2 = pt_compute_best_pgsize(
+ iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa);
+ map->leaf_level =
+ pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2);
+ map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap,
+ last_va, range->last_va, oa,
+ map->leaf_pgsize_lg2);
+
+ /* Didn't finish this table level, caller will repeat it */
+ if (pts.index != orig_end) {
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ return -EAGAIN;
+ }
+ return 0;
}
static int __map_range(struct pt_range *range, void *arg, unsigned int level,
@@ -584,14 +631,9 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level,
if (pts.type != PT_ENTRY_EMPTY)
return -EADDRINUSE;
ret = pt_iommu_new_table(&pts, &map->attrs);
- if (ret) {
- /*
- * Racing with another thread installing a table
- */
- if (ret == -EAGAIN)
- continue;
+ /* EAGAIN on a race will loop again */
+ if (ret)
return ret;
- }
} else {
pts.table_lower = pt_table_ptr(&pts);
/*
@@ -615,10 +657,12 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level,
* The already present table can possibly be shared with another
* concurrent map.
*/
- if (map->leaf_level == level - 1)
- ret = pt_descend(&pts, arg, __map_range_leaf);
- else
- ret = pt_descend(&pts, arg, __map_range);
+ do {
+ if (map->leaf_level == level - 1)
+ ret = pt_descend(&pts, arg, __map_range_leaf);
+ else
+ ret = pt_descend(&pts, arg, __map_range);
+ } while (ret == -EAGAIN);
if (ret)
return ret;
@@ -626,6 +670,14 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level,
pt_index_to_va(&pts);
if (pts.index >= pts.end_index)
break;
+
+ /*
+ * This level is currently running __map_range_leaf() which is
+ * not correct if the target level has been updated to this
+ * level. Have the caller invoke __map_range_leaf.
+ */
+ if (map->leaf_level == level)
+ return -EAGAIN;
} while (true);
return 0;
}
@@ -797,12 +849,13 @@ static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
static int do_map(struct pt_range *range, struct pt_common *common,
bool single_page, struct pt_iommu_map_args *map)
{
+ int ret;
+
/*
* The __map_single_page() fast path does not support DMA_INCOHERENT
* flushing to keep its .text small.
*/
if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
- int ret;
ret = pt_walk_range(range, __map_single_page, map);
if (ret != -EAGAIN)
@@ -810,50 +863,25 @@ static int do_map(struct pt_range *range, struct pt_common *common,
/* EAGAIN falls through to the full path */
}
- if (map->leaf_level == range->top_level)
- return pt_walk_range(range, __map_range_leaf, map);
- return pt_walk_range(range, __map_range, map);
+ do {
+ if (map->leaf_level == range->top_level)
+ ret = pt_walk_range(range, __map_range_leaf, map);
+ else
+ ret = pt_walk_range(range, __map_range, map);
+ } while (ret == -EAGAIN);
+ return ret;
}
-/**
- * map_pages() - Install translation for an IOVA range
- * @domain: Domain to manipulate
- * @iova: IO virtual address to start
- * @paddr: Physical/Output address to start
- * @pgsize: Length of each page
- * @pgcount: Length of the range in pgsize units starting from @iova
- * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
- * @gfp: GFP flags for any memory allocations
- * @mapped: Total bytes successfully mapped
- *
- * The range starting at IOVA will have paddr installed into it. The caller
- * must specify a valid pgsize and pgcount to segment the range into compatible
- * blocks.
- *
- * On error the caller will probably want to invoke unmap on the range from iova
- * up to the amount indicated by @mapped to return the table back to an
- * unchanged state.
- *
- * Context: The caller must hold a write range lock that includes the whole
- * range.
- *
- * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
- * mapped are added to @mapped, @mapped is not zerod first.
- */
-int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
+static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
+ phys_addr_t paddr, dma_addr_t len, unsigned int prot,
+ gfp_t gfp, size_t *mapped)
{
- struct pt_iommu *iommu_table =
- container_of(domain, struct pt_iommu, domain);
pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
struct pt_common *common = common_from_iommu(iommu_table);
struct iommu_iotlb_gather iotlb_gather;
- pt_vaddr_t len = pgsize * pgcount;
struct pt_iommu_map_args map = {
.iotlb_gather = &iotlb_gather,
.oa = paddr,
- .leaf_pgsize_lg2 = vaffs(pgsize),
};
bool single_page = false;
struct pt_range range;
@@ -881,13 +909,13 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
return ret;
/* Calculate target page size and level for the leaves */
- if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
- pgcount == 1) {
+ if (pt_has_system_page_size(common) && len == PAGE_SIZE) {
PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
if (log2_mod(iova | paddr, PAGE_SHIFT))
return -ENXIO;
map.leaf_pgsize_lg2 = PAGE_SHIFT;
map.leaf_level = 0;
+ map.num_leaves = 1;
single_page = true;
} else {
map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
@@ -896,6 +924,9 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
return -ENXIO;
map.leaf_level =
pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+ map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va,
+ range.last_va, paddr,
+ map.leaf_pgsize_lg2);
}
ret = check_map_range(iommu_table, &range, &map);
@@ -918,7 +949,6 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
*mapped += map.oa - paddr;
return ret;
}
-EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
struct pt_unmap_args {
struct iommu_pages_list free_list;
@@ -1087,6 +1117,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table)
}
static const struct pt_iommu_ops NS(ops) = {
+ .map_range = NS(map_range),
.unmap_range = NS(unmap_range),
#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
index 68278bf15cfe07..374e475f591e15 100644
--- a/drivers/iommu/generic_pt/kunit_generic_pt.h
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -312,6 +312,17 @@ static void test_best_pgsize(struct kunit *test)
}
}
+static void test_pgsz_count(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test,
+ pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)),
+ SZ_1G / SZ_4K);
+ KUNIT_EXPECT_EQ(test,
+ pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K,
+ ilog2(SZ_4K)),
+ (SZ_2M - SZ_4K) / SZ_4K);
+}
+
/*
* Check that pt_install_table() and pt_table_pa() match
*/
@@ -770,6 +781,7 @@ static struct kunit_case generic_pt_test_cases[] = {
KUNIT_CASE_FMT(test_init),
KUNIT_CASE_FMT(test_bitops),
KUNIT_CASE_FMT(test_best_pgsize),
+ KUNIT_CASE_FMT(test_pgsz_count),
KUNIT_CASE_FMT(test_table_ptr),
KUNIT_CASE_FMT(test_max_va),
KUNIT_CASE_FMT(test_table_radix),
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
index c0d8617cce2928..3e45dbde6b8327 100644
--- a/drivers/iommu/generic_pt/pt_iter.h
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -569,6 +569,28 @@ static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
return pgsz_lg2;
}
+/*
+ * Return the number of pgsize_lg2 leaf entries that can be mapped for
+ * va to oa. This accounts for any requirement to reduce or increase the page
+ * size across the VA range.
+ */
+static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+ pt_vaddr_t last_va, pt_oaddr_t oa,
+ unsigned int pgsize_lg2)
+{
+ pt_vaddr_t len = last_va - va + 1;
+ pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1);
+
+ if (next_pgsizes) {
+ unsigned int next_pgsize_lg2 = vaffs(next_pgsizes);
+
+ if (log2_mod(va ^ oa, next_pgsize_lg2) == 0)
+ len = min(len, log2_set_mod_max(va, next_pgsize_lg2) -
+ va + 1);
+ }
+ return log2_div(len, pgsize_lg2);
+}
+
#define _PT_MAKE_CALL_LEVEL(fn) \
static __always_inline int fn(struct pt_range *range, void *arg, \
unsigned int level, \
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f68269707101a3..33cee64686e3ed 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2569,14 +2569,14 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova,
return pgsize;
}
-int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int __iommu_map_domain_pgtbl(struct iommu_domain *domain,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int prot, gfp_t gfp)
{
const struct iommu_domain_ops *ops = domain->ops;
unsigned long orig_iova = iova;
unsigned int min_pagesz;
size_t orig_size = size;
- phys_addr_t orig_paddr = paddr;
int ret = 0;
might_sleep_if(gfpflags_allow_blocking(gfp));
@@ -2633,12 +2633,9 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
/* unroll mapping in case something went wrong */
if (ret) {
iommu_unmap(domain, orig_iova, orig_size - size);
- } else {
- trace_map(orig_iova, orig_paddr, orig_size);
- iommu_debug_map(domain, orig_paddr, orig_size);
+ return ret;
}
-
- return ret;
+ return 0;
}
int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size)
@@ -2650,6 +2647,32 @@ int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size)
return ops->iotlb_sync_map(domain, iova, size);
}
+int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ struct pt_iommu *pt = iommupt_from_domain(domain);
+ int ret;
+
+ if (pt) {
+ size_t mapped = 0;
+
+ ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp,
+ &mapped);
+ if (ret) {
+ iommu_unmap(domain, iova, mapped);
+ return ret;
+ }
+ return 0;
+ }
+ ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp);
+ if (!ret)
+ return ret;
+
+ trace_map(iova, paddr, size);
+ iommu_debug_map(domain, paddr, size);
+ return 0;
+}
+
int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index f094f8f44e4e8a..43cc98c9c55f70 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -87,6 +87,33 @@ struct pt_iommu_info {
};
struct pt_iommu_ops {
+ /**
+ * @map_range: Install translation for an IOVA range
+ * @iommu_table: Table to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @len: Length of the range starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ *
+ * The range starting at IOVA will have paddr installed into it. The
+ * rage is automatically segmented into optimally sized table entries,
+ * and can have any valid alignment.
+ *
+ * On error the caller will probably want to invoke unmap on the range
+ * from iova up to the amount indicated by @mapped to return the table
+ * back to an unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA
+ * that were mapped are added to @mapped, @mapped is not zerod first.
+ */
+ int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
+ phys_addr_t paddr, dma_addr_t len, unsigned int prot,
+ gfp_t gfp, size_t *mapped);
+
/**
* @unmap_range: Make a range of IOVA empty/not present
* @iommu_table: Table to manipulate
@@ -224,10 +251,6 @@ struct pt_iommu_cfg {
#define IOMMU_PROTOTYPES(fmt) \
phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
dma_addr_t iova); \
- int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \
- unsigned long iova, phys_addr_t paddr, \
- size_t pgsize, size_t pgcount, \
- int prot, gfp_t gfp, size_t *mapped); \
int pt_iommu_##fmt##_read_and_clear_dirty( \
struct iommu_domain *domain, unsigned long iova, size_t size, \
unsigned long flags, struct iommu_dirty_bitmap *dirty); \
@@ -248,8 +271,7 @@ struct pt_iommu_cfg {
* iommu_pt
*/
#define IOMMU_PT_DOMAIN_OPS(fmt) \
- .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
- .map_pages = &pt_iommu_##fmt##_map_pages
+ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys
#define IOMMU_PT_DIRTY_OPS(fmt) \
.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
--
2.43.0
next prev parent reply other threads:[~2026-02-27 19:31 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-27 19:30 [PATCH v3 0/2] Let iommupt manage changes in page size internally Jason Gunthorpe
2026-02-27 19:30 ` [PATCH v3 1/2] iommupt: Directly call iommupt's unmap_range() Jason Gunthorpe
2026-02-27 19:30 ` Jason Gunthorpe [this message]
2026-05-09 17:41 ` [PATCH v3 2/2] iommupt: Avoid rewalking during map Josua Mayer
2026-05-09 19:40 ` Jason Gunthorpe
2026-05-09 20:25 ` Josua Mayer
2026-05-09 23:39 ` Jason Gunthorpe
2026-03-03 3:55 ` [PATCH v3 0/2] Let iommupt manage changes in page size internally Baolu Lu
2026-03-17 12:58 ` Joerg Roedel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=2-v3-a1777ea76519+370f-iommpt_map_direct_jgg@nvidia.com \
--to=jgg@nvidia.com \
--cc=iommu@lists.linux.dev \
--cc=joro@8bytes.org \
--cc=kevin.tian@intel.com \
--cc=patches@lists.linux.dev \
--cc=robin.murphy@arm.com \
--cc=skhawaja@google.com \
--cc=will@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox