* [PATCH 1/5] iova: switch to augmented rbtree for log(n) allocation
2026-05-13 2:00 [PATCH 0/5] iova augmented rbtree O(log n) alloc_iova Rik van Riel
@ 2026-05-13 2:00 ` Rik van Riel
2026-05-13 2:00 ` [PATCH 2/5] iova: drop dead cached_node / cached32_node infrastructure Rik van Riel
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Rik van Riel @ 2026-05-13 2:00 UTC (permalink / raw)
To: linux-kernel
Cc: robin.murphy, joro, will, iommu, kyle, kernel-team, Rik van Riel,
Rik van Riel
From: Rik van Riel <riel@meta.com>
The IOVA allocator's __alloc_and_insert_iova_range() walks the rbtree
backwards via rb_prev() to find a free gap, which is O(n) on the number
of allocated ranges. Under heavy fragmentation (e.g. iommu_iova caches
holding hundreds of thousands of ranges) this can take 20+ seconds and
trigger softlockups.
Switch to an augmented rbtree that tracks two new fields per iova node:
  gap_to_prev       - the free gap (in pfns) between this node and
                      its in-order predecessor.
  __subtree_max_gap - max gap_to_prev over this node's subtree.
The augmented invariant is maintained via RB_DECLARE_CALLBACKS_MAX from
rbtree_augmented.h, plus explicit propagate() calls on insertion (where
both the new node and its successor get a fresh gap_to_prev) and on
deletion (where the successor's gap_to_prev grows).
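As a worked example of the bookkeeping (hypothetical pfns): inserting a
range that starts at pfn_lo 0x200 above a predecessor ending at pfn_hi
0x0ff sets the new node's gap_to_prev to 0x200 - 0x0ff - 1 = 0x100; a
node with no predecessor uses its own pfn_lo as the gap down to pfn 0,
and the successor's gap_to_prev is recomputed against the new node's
pfn_hi.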
The new __iova_search_free_gap() walks right-first to prefer high
addresses (preserving the existing top-down allocation behaviour) and
prunes any subtree whose __subtree_max_gap is smaller than the requested
size. For the typical IOMMU workload (uniform DMA mask per domain, so
limit_pfn never binds against a candidate gap) and unaligned or
small-alignment allocations, the search is O(log n). The
alloc_iova_fast() size_aligned path can still degrade to O(n) when
alignment is large relative to the available gaps; a follow-up patch
in this series addresses that case.
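For example, a search for an 8-pfn gap skips any subtree whose
__subtree_max_gap is 7 or less without descending into it; in subtrees
that survive pruning, the right child (higher addresses) is tried before
the node's own gap and before the left child, so the first fitting gap
found is also the highest-addressed one that satisfies the constraints.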
The cached_node / cached32_node fields and their update helpers are
left in place but no longer consulted on the alloc path; a follow-up
can strip them.
The fast-fail max32_alloc_size optimization is preserved.
Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
drivers/iommu/iova.c | 206 ++++++++++++++++++++++++++-----------------
include/linux/iova.h | 2 +
2 files changed, 128 insertions(+), 80 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 021daf6528de..953188e296f0 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -13,6 +13,7 @@
#include <linux/bitops.h>
#include <linux/cpu.h>
#include <linux/workqueue.h>
+#include <linux/rbtree_augmented.h>
/* The anchor node sits above the top of the usable address space */
#define IOVA_ANCHOR ~0UL
@@ -34,6 +35,15 @@ static struct iova *to_iova(struct rb_node *node)
return rb_entry(node, struct iova, node);
}
+static inline unsigned long iova_gap_value(struct iova *iova)
+{
+ return iova->gap_to_prev;
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, iova_gap_callbacks,
+ struct iova, node, unsigned long, __subtree_max_gap,
+ iova_gap_value)
+
void
init_iova_domain(struct iova_domain *iovad, unsigned long granule,
unsigned long start_pfn)
@@ -54,20 +64,13 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
iovad->max32_alloc_size = iovad->dma_32bit_pfn;
iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
+ iovad->anchor.gap_to_prev = IOVA_ANCHOR;
+ iovad->anchor.__subtree_max_gap = IOVA_ANCHOR;
rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
}
EXPORT_SYMBOL_GPL(init_iova_domain);
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
-{
- if (limit_pfn <= iovad->dma_32bit_pfn)
- return iovad->cached32_node;
-
- return iovad->cached_node;
-}
-
static void
__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
{
@@ -96,49 +99,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
iovad->cached_node = rb_next(&free->node);
}
-static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn)
-{
- struct rb_node *node, *next;
- /*
- * Ideally what we'd like to judge here is whether limit_pfn is close
- * enough to the highest-allocated IOVA that starting the allocation
- * walk from the anchor node will be quicker than this initial work to
- * find an exact starting point (especially if that ends up being the
- * anchor node anyway). This is an incredibly crude approximation which
- * only really helps the most likely case, but is at least trivially easy.
- */
- if (limit_pfn > iovad->dma_32bit_pfn)
- return &iovad->anchor.node;
-
- node = iovad->rbroot.rb_node;
- while (to_iova(node)->pfn_hi < limit_pfn)
- node = node->rb_right;
-
-search_left:
- while (node->rb_left && to_iova(node->rb_left)->pfn_lo >= limit_pfn)
- node = node->rb_left;
-
- if (!node->rb_left)
- return node;
-
- next = node->rb_left;
- while (next->rb_right) {
- next = next->rb_right;
- if (to_iova(next)->pfn_lo >= limit_pfn) {
- node = next;
- goto search_left;
- }
- }
-
- return node;
-}
-
/* Insert the iova into domain rbtree by holding writer lock */
static void
iova_insert_rbtree(struct rb_root *root, struct iova *iova,
struct rb_node *start)
{
struct rb_node **new, *parent = NULL;
+ struct rb_node *prev_node, *next_node;
new = (start) ? &start : &(root->rb_node);
/* Figure out where to put new node */
@@ -156,62 +123,104 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
return;
}
}
- /* Add new node and rebalance tree. */
+
rb_link_node(&iova->node, parent, new);
- rb_insert_color(&iova->node, root);
+
+ prev_node = rb_prev(&iova->node);
+ if (prev_node)
+ iova->gap_to_prev = iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
+ else
+ iova->gap_to_prev = iova->pfn_lo;
+ iova->__subtree_max_gap = iova->gap_to_prev;
+
+ next_node = rb_next(&iova->node);
+ if (next_node)
+ to_iova(next_node)->gap_to_prev =
+ to_iova(next_node)->pfn_lo - iova->pfn_hi - 1;
+
+ if (parent)
+ iova_gap_callbacks.propagate(parent, NULL);
+ if (next_node)
+ iova_gap_callbacks.propagate(next_node, NULL);
+
+ rb_insert_augmented(&iova->node, root, &iova_gap_callbacks);
+}
+
+/*
+ * Search the augmented rbtree for the highest-addressed free gap of at least
+ * @size pages, with the allocation fitting below @limit_pfn and at or above
+ * @start_pfn. Returns the node whose gap_to_prev is used, or NULL.
+ */
+static struct rb_node *
+__iova_search_free_gap(struct rb_node *node, unsigned long size,
+ unsigned long limit_pfn, unsigned long start_pfn,
+ unsigned long align_mask, unsigned long *new_pfn)
+{
+ struct iova *iova;
+ struct rb_node *result;
+
+ if (!node)
+ return NULL;
+
+ iova = to_iova(node);
+ if (iova->__subtree_max_gap < size)
+ return NULL;
+
+ result = __iova_search_free_gap(node->rb_right, size, limit_pfn,
+ start_pfn, align_mask, new_pfn);
+ if (result)
+ return result;
+
+ if (iova->gap_to_prev >= size) {
+ unsigned long gap_lo = iova->pfn_lo - iova->gap_to_prev;
+ unsigned long gap_hi = iova->pfn_lo - 1;
+ unsigned long pfn;
+
+ if (gap_hi >= limit_pfn)
+ gap_hi = limit_pfn - 1;
+ if (gap_hi >= gap_lo && gap_hi - gap_lo + 1 >= size) {
+ pfn = (gap_hi - size + 1) & align_mask;
+ if (pfn >= gap_lo && pfn >= start_pfn) {
+ *new_pfn = pfn;
+ return node;
+ }
+ }
+ }
+
+ return __iova_search_free_gap(node->rb_left, size, limit_pfn,
+ start_pfn, align_mask, new_pfn);
}
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
{
- struct rb_node *curr, *prev;
- struct iova *curr_iova;
unsigned long flags;
- unsigned long new_pfn, retry_pfn;
+ unsigned long new_pfn;
unsigned long align_mask = ~0UL;
- unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
+ struct rb_node *gap_node;
if (size_aligned)
align_mask <<= fls_long(size - 1);
- /* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
if (limit_pfn <= iovad->dma_32bit_pfn &&
size >= iovad->max32_alloc_size)
goto iova32_full;
- curr = __get_cached_rbnode(iovad, limit_pfn);
- curr_iova = to_iova(curr);
- retry_pfn = curr_iova->pfn_hi;
-
-retry:
- do {
- high_pfn = min(high_pfn, curr_iova->pfn_lo);
- new_pfn = (high_pfn - size) & align_mask;
- prev = curr;
- curr = rb_prev(curr);
- curr_iova = to_iova(curr);
- } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);
-
- if (high_pfn < size || new_pfn < low_pfn) {
- if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
- high_pfn = limit_pfn;
- low_pfn = retry_pfn + 1;
- curr = iova_find_limit(iovad, limit_pfn);
- curr_iova = to_iova(curr);
- goto retry;
- }
- iovad->max32_alloc_size = size;
+ gap_node = __iova_search_free_gap(iovad->rbroot.rb_node, size,
+ limit_pfn, iovad->start_pfn,
+ align_mask, &new_pfn);
+ if (!gap_node) {
+ if (limit_pfn <= iovad->dma_32bit_pfn)
+ iovad->max32_alloc_size = size;
goto iova32_full;
}
- /* pfn_lo will point to size aligned address if size_aligned is set */
new->pfn_lo = new_pfn;
- new->pfn_hi = new->pfn_lo + size - 1;
+ new->pfn_hi = new_pfn + size - 1;
- /* If we have 'prev', it's a valid place to start the insertion. */
- iova_insert_rbtree(&iovad->rbroot, new, prev);
+ iova_insert_rbtree(&iovad->rbroot, new, gap_node);
__cached_rbnode_insert_update(iovad, new);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
@@ -295,9 +304,34 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn)
static void remove_iova(struct iova_domain *iovad, struct iova *iova)
{
+ struct rb_node *next_node;
+ struct iova *next_iova = NULL;
+
assert_spin_locked(&iovad->iova_rbtree_lock);
__cached_rbnode_delete_update(iovad, iova);
- rb_erase(&iova->node, &iovad->rbroot);
+
+ next_node = rb_next(&iova->node);
+ if (next_node) {
+ struct rb_node *prev_node = rb_prev(&iova->node);
+
+ next_iova = to_iova(next_node);
+ if (prev_node)
+ next_iova->gap_to_prev =
+ next_iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
+ else
+ next_iova->gap_to_prev = next_iova->pfn_lo;
+ /*
+ * Propagate next_iova's new augmented values to the root BEFORE
+ * the erase. Otherwise rotations inside rb_erase_augmented may
+ * copy a stale __subtree_max_gap from next_iova to other nodes,
+ * leaving ancestors in an inconsistent state that the post-erase
+ * propagate cannot fully repair (early-termination at matching
+ * intermediate values).
+ */
+ iova_gap_callbacks.propagate(&next_iova->node, NULL);
+ }
+
+ rb_erase_augmented(&iova->node, &iovad->rbroot, &iova_gap_callbacks);
}
/**
@@ -527,8 +561,20 @@ reserve_iova(struct iova_domain *iovad,
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
+ struct rb_node *p;
+ unsigned long gap;
+
iova = to_iova(node);
__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
+ p = rb_prev(node);
+ if (p)
+ gap = iova->pfn_lo - to_iova(p)->pfn_hi - 1;
+ else
+ gap = iova->pfn_lo;
+ if (iova->gap_to_prev != gap) {
+ iova->gap_to_prev = gap;
+ iova_gap_callbacks.propagate(node, NULL);
+ }
if ((pfn_lo >= iova->pfn_lo) &&
(pfn_hi <= iova->pfn_hi))
goto finish;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d2c4fd923efa..52635a72c5c5 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -19,6 +19,8 @@ struct iova {
struct rb_node node;
unsigned long pfn_hi; /* Highest allocated pfn */
unsigned long pfn_lo; /* Lowest allocated pfn */
+ unsigned long gap_to_prev; /* Gap (in pfns) to predecessor */
+ unsigned long __subtree_max_gap; /* Max gap_to_prev in subtree */
};
--
2.52.0
* [PATCH 2/5] iova: drop dead cached_node / cached32_node infrastructure
2026-05-13 2:00 [PATCH 0/5] iova augmented rbtree O(log n) alloc_iova Rik van Riel
2026-05-13 2:00 ` [PATCH 1/5] iova: switch to augmented rbtree for log(n) allocation Rik van Riel
@ 2026-05-13 2:00 ` Rik van Riel
2026-05-13 2:00 ` [PATCH 3/5] iova: limit_pfn-aware augmentation for log(n) 32-bit alloc Rik van Riel
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Rik van Riel @ 2026-05-13 2:00 UTC (permalink / raw)
To: linux-kernel
Cc: robin.murphy, joro, will, iommu, kyle, kernel-team, Rik van Riel,
Rik van Riel
From: Rik van Riel <riel@meta.com>
After the augmented-rbtree port, cached_node and cached32_node are still
maintained on every insert and delete but are never consulted on the
allocation path. Drop the fields and their helpers.
The one piece of useful work in __cached_rbnode_delete_update() was
resetting iovad->max32_alloc_size when an iova in the 32-bit range was
freed (so the next 32-bit alloc can retry). That logic is preserved by
moving it inline into remove_iova().
No external consumers reference the cached_node fields.
Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
drivers/iommu/iova.c | 35 +++--------------------------------
include/linux/iova.h | 2 --
2 files changed, 3 insertions(+), 34 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 953188e296f0..c358ce981cae 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -57,8 +57,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
- iovad->cached_node = &iovad->anchor.node;
- iovad->cached32_node = &iovad->anchor.node;
iovad->granule = granule;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
@@ -71,34 +69,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
}
EXPORT_SYMBOL_GPL(init_iova_domain);
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
-{
- if (new->pfn_hi < iovad->dma_32bit_pfn)
- iovad->cached32_node = &new->node;
- else
- iovad->cached_node = &new->node;
-}
-
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
- struct iova *cached_iova;
-
- cached_iova = to_iova(iovad->cached32_node);
- if (free == cached_iova ||
- (free->pfn_hi < iovad->dma_32bit_pfn &&
- free->pfn_lo >= cached_iova->pfn_lo))
- iovad->cached32_node = rb_next(&free->node);
-
- if (free->pfn_lo < iovad->dma_32bit_pfn)
- iovad->max32_alloc_size = iovad->dma_32bit_pfn;
-
- cached_iova = to_iova(iovad->cached_node);
- if (free->pfn_lo >= cached_iova->pfn_lo)
- iovad->cached_node = rb_next(&free->node);
-}
-
/* Insert the iova into domain rbtree by holding writer lock */
static void
iova_insert_rbtree(struct rb_root *root, struct iova *iova,
@@ -221,7 +191,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
new->pfn_hi = new_pfn + size - 1;
iova_insert_rbtree(&iovad->rbroot, new, gap_node);
- __cached_rbnode_insert_update(iovad, new);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
return 0;
@@ -308,7 +277,9 @@ static void remove_iova(struct iova_domain *iovad, struct iova *iova)
struct iova *next_iova = NULL;
assert_spin_locked(&iovad->iova_rbtree_lock);
- __cached_rbnode_delete_update(iovad, iova);
+
+ if (iova->pfn_lo < iovad->dma_32bit_pfn)
+ iovad->max32_alloc_size = iovad->dma_32bit_pfn;
next_node = rb_next(&iova->node);
if (next_node) {
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 52635a72c5c5..3c4cc81e5182 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -30,8 +30,6 @@ struct iova_rcache;
struct iova_domain {
spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached_node; /* Save last alloced node */
- struct rb_node *cached32_node; /* Save last 32-bit alloced node */
unsigned long granule; /* pfn granularity for this domain */
unsigned long start_pfn; /* Lower limit for this domain */
unsigned long dma_32bit_pfn;
--
2.52.0
* [PATCH 3/5] iova: limit_pfn-aware augmentation for log(n) 32-bit alloc
2026-05-13 2:00 [PATCH 0/5] iova augmented rbtree O(log n) alloc_iova Rik van Riel
2026-05-13 2:00 ` [PATCH 1/5] iova: switch to augmented rbtree for log(n) allocation Rik van Riel
2026-05-13 2:00 ` [PATCH 2/5] iova: drop dead cached_node / cached32_node infrastructure Rik van Riel
@ 2026-05-13 2:00 ` Rik van Riel
2026-05-13 2:00 ` [PATCH 4/5] iova: alignment-aware two-phase search for log(n) aligned alloc Rik van Riel
2026-05-13 2:00 ` [PATCH 5/5] iova: add KUnit test suite Rik van Riel
4 siblings, 0 replies; 6+ messages in thread
From: Rik van Riel @ 2026-05-13 2:00 UTC (permalink / raw)
To: linux-kernel
Cc: robin.murphy, joro, will, iommu, kyle, kernel-team, Rik van Riel,
Rik van Riel
From: Rik van Riel <riel@meta.com>
The augmented-rbtree port made __iova_search_free_gap O(log n) when
limit_pfn doesn't bind any candidate gap, but degrades to O(n) when
limit_pfn is small relative to the gaps in the tree. The classic case
is a 32-bit DMA allocation on a domain dominated by 64-bit allocations:
the pruning check against __subtree_max_gap passes everywhere (the huge
gap between the highest 64-bit allocation and the IOVA_ANCHOR dominates
every subtree max), so pruning never fires, and every node above
limit_pfn must be visited and rejected before the search falls through
to the 32-bit region.
Add a second augmented field that bounds the search by 32-bit-clamped
gap size:
  clamped_gap32(node) =
      0                                             if gap_to_prev == 0
      0                                             if gap_lo >= dma_32bit_pfn
      min(gap_hi, dma_32bit_pfn - 1) - gap_lo + 1   otherwise
  __subtree_max_gap32 = max(clamped_gap32) over subtree
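Here gap_lo and gap_hi are the bounds of the free gap immediately below
the node (pfn_lo - gap_to_prev and pfn_lo - 1 respectively). As a
worked example with made-up numbers and a 4 KiB granule (dma_32bit_pfn
= 0x100000): a gap spanning pfns [0xff000, 0x17ffff] clamps to
min(0x17ffff, 0xfffff) - 0xff000 + 1 = 0x1000 usable pfns for a 32-bit
allocation, while a gap that starts at or above pfn 0x100000 clamps
to 0.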
clamped_gap32 is precomputed and stored on each iova whenever
gap_to_prev changes (insert, remove, reserve overlap fix-up), so the
augmented callbacks remain pure functions of the node and don't need
domain context.
The compute helper iova_compute_max() updates both __subtree_max_gap
and __subtree_max_gap32 in one pass; the hand-rolled propagate / copy
/ rotate callbacks (replacing the single-field RB_DECLARE_CALLBACKS_MAX
expansion) maintain both fields per insert/erase. Pruning in
__iova_search_free_gap selects between them based on whether the
caller is doing a 32-bit allocation.
For 64-bit allocations the behaviour is unchanged. For 32-bit
allocations on mixed-DMA-mask domains the search is now O(log n).
Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
drivers/iommu/iova.c | 153 ++++++++++++++++++++++++++++++++++++-------
include/linux/iova.h | 6 +-
2 files changed, 133 insertions(+), 26 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index c358ce981cae..7787da8b2dad 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -35,14 +35,97 @@ static struct iova *to_iova(struct rb_node *node)
return rb_entry(node, struct iova, node);
}
-static inline unsigned long iova_gap_value(struct iova *iova)
+/*
+ * Portion of @iova->gap_to_prev that lies strictly below @dma_32bit_pfn,
+ * i.e. the largest contiguous sub-gap that a 32-bit-restricted allocation
+ * could possibly use. Maintained alongside gap_to_prev so the augmented
+ * callbacks can compare against it without needing per-iova access to the
+ * domain's dma_32bit_pfn.
+ */
+static unsigned long
+iova_compute_clamped_gap32(struct iova *iova, unsigned long dma_32bit_pfn)
+{
+ unsigned long gap_lo, gap_hi;
+
+ if (iova->gap_to_prev == 0)
+ return 0;
+ gap_lo = iova->pfn_lo - iova->gap_to_prev;
+ if (gap_lo >= dma_32bit_pfn)
+ return 0;
+ gap_hi = iova->pfn_lo - 1;
+ if (gap_hi >= dma_32bit_pfn)
+ gap_hi = dma_32bit_pfn - 1;
+ return gap_hi - gap_lo + 1;
+}
+
+/*
+ * Recompute @node's __subtree_max_gap and __subtree_max_gap32 from its
+ * own gap fields and its children's subtree maxes. Returns true (for
+ * propagate's early-termination) if neither value would change.
+ */
+static bool iova_compute_max(struct iova *node, bool exit)
{
- return iova->gap_to_prev;
+ unsigned long max_gap = node->gap_to_prev;
+ unsigned long max_gap32 = node->clamped_gap32;
+ struct iova *child;
+
+ if (node->node.rb_left) {
+ child = to_iova(node->node.rb_left);
+ if (child->__subtree_max_gap > max_gap)
+ max_gap = child->__subtree_max_gap;
+ if (child->__subtree_max_gap32 > max_gap32)
+ max_gap32 = child->__subtree_max_gap32;
+ }
+ if (node->node.rb_right) {
+ child = to_iova(node->node.rb_right);
+ if (child->__subtree_max_gap > max_gap)
+ max_gap = child->__subtree_max_gap;
+ if (child->__subtree_max_gap32 > max_gap32)
+ max_gap32 = child->__subtree_max_gap32;
+ }
+ if (exit && node->__subtree_max_gap == max_gap &&
+ node->__subtree_max_gap32 == max_gap32)
+ return true;
+ node->__subtree_max_gap = max_gap;
+ node->__subtree_max_gap32 = max_gap32;
+ return false;
}
-RB_DECLARE_CALLBACKS_MAX(static, iova_gap_callbacks,
- struct iova, node, unsigned long, __subtree_max_gap,
- iova_gap_value)
+static void iova_gap_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+ while (rb != stop) {
+ struct iova *node = to_iova(rb);
+
+ if (iova_compute_max(node, true))
+ break;
+ rb = rb_parent(&node->node);
+ }
+}
+
+static void iova_gap_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct iova *old = to_iova(rb_old);
+ struct iova *new = to_iova(rb_new);
+
+ new->__subtree_max_gap = old->__subtree_max_gap;
+ new->__subtree_max_gap32 = old->__subtree_max_gap32;
+}
+
+static void iova_gap_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct iova *old = to_iova(rb_old);
+ struct iova *new = to_iova(rb_new);
+
+ new->__subtree_max_gap = old->__subtree_max_gap;
+ new->__subtree_max_gap32 = old->__subtree_max_gap32;
+ iova_compute_max(old, false);
+}
+
+static const struct rb_augment_callbacks iova_gap_callbacks = {
+ .propagate = iova_gap_propagate,
+ .copy = iova_gap_copy,
+ .rotate = iova_gap_rotate,
+};
void
init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -64,6 +147,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
iovad->anchor.gap_to_prev = IOVA_ANCHOR;
iovad->anchor.__subtree_max_gap = IOVA_ANCHOR;
+ iovad->anchor.clamped_gap32 = iova_compute_clamped_gap32(&iovad->anchor,
+ iovad->dma_32bit_pfn);
+ iovad->anchor.__subtree_max_gap32 = iovad->anchor.clamped_gap32;
rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
}
@@ -71,9 +157,10 @@ EXPORT_SYMBOL_GPL(init_iova_domain);
/* Insert the iova into domain rbtree by holding writer lock */
static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+iova_insert_rbtree(struct iova_domain *iovad, struct iova *iova,
struct rb_node *start)
{
+ struct rb_root *root = &iovad->rbroot;
struct rb_node **new, *parent = NULL;
struct rb_node *prev_node, *next_node;
@@ -101,12 +188,18 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
iova->gap_to_prev = iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
else
iova->gap_to_prev = iova->pfn_lo;
+ iova->clamped_gap32 = iova_compute_clamped_gap32(iova, iovad->dma_32bit_pfn);
iova->__subtree_max_gap = iova->gap_to_prev;
+ iova->__subtree_max_gap32 = iova->clamped_gap32;
next_node = rb_next(&iova->node);
- if (next_node)
- to_iova(next_node)->gap_to_prev =
- to_iova(next_node)->pfn_lo - iova->pfn_hi - 1;
+ if (next_node) {
+ struct iova *next_iova = to_iova(next_node);
+
+ next_iova->gap_to_prev = next_iova->pfn_lo - iova->pfn_hi - 1;
+ next_iova->clamped_gap32 = iova_compute_clamped_gap32(next_iova,
+ iovad->dma_32bit_pfn);
+ }
if (parent)
iova_gap_callbacks.propagate(parent, NULL);
@@ -119,25 +212,32 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
/*
* Search the augmented rbtree for the highest-addressed free gap of at least
* @size pages, with the allocation fitting below @limit_pfn and at or above
- * @start_pfn. Returns the node whose gap_to_prev is used, or NULL.
+ * @start_pfn. When @is_32bit, prune by the 32-bit-clamped subtree max so
+ * that 32-bit-restricted allocations on a domain dominated by high-pfn
+ * allocations stay O(log n) instead of degrading to O(n). Returns the node
+ * whose gap_to_prev is used, or NULL.
*/
static struct rb_node *
__iova_search_free_gap(struct rb_node *node, unsigned long size,
unsigned long limit_pfn, unsigned long start_pfn,
- unsigned long align_mask, unsigned long *new_pfn)
+ unsigned long align_mask, bool is_32bit,
+ unsigned long *new_pfn)
{
struct iova *iova;
struct rb_node *result;
+ unsigned long subtree_max;
if (!node)
return NULL;
iova = to_iova(node);
- if (iova->__subtree_max_gap < size)
+ subtree_max = is_32bit ? iova->__subtree_max_gap32
+ : iova->__subtree_max_gap;
+ if (subtree_max < size)
return NULL;
result = __iova_search_free_gap(node->rb_right, size, limit_pfn,
- start_pfn, align_mask, new_pfn);
+ start_pfn, align_mask, is_32bit, new_pfn);
if (result)
return result;
@@ -158,7 +258,7 @@ __iova_search_free_gap(struct rb_node *node, unsigned long size,
}
return __iova_search_free_gap(node->rb_left, size, limit_pfn,
- start_pfn, align_mask, new_pfn);
+ start_pfn, align_mask, is_32bit, new_pfn);
}
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
@@ -169,20 +269,21 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long new_pfn;
unsigned long align_mask = ~0UL;
struct rb_node *gap_node;
+ bool is_32bit;
if (size_aligned)
align_mask <<= fls_long(size - 1);
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- if (limit_pfn <= iovad->dma_32bit_pfn &&
- size >= iovad->max32_alloc_size)
+ is_32bit = limit_pfn <= iovad->dma_32bit_pfn;
+ if (is_32bit && size >= iovad->max32_alloc_size)
goto iova32_full;
gap_node = __iova_search_free_gap(iovad->rbroot.rb_node, size,
limit_pfn, iovad->start_pfn,
- align_mask, &new_pfn);
+ align_mask, is_32bit, &new_pfn);
if (!gap_node) {
- if (limit_pfn <= iovad->dma_32bit_pfn)
+ if (is_32bit)
iovad->max32_alloc_size = size;
goto iova32_full;
}
@@ -190,7 +291,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
new->pfn_lo = new_pfn;
new->pfn_hi = new_pfn + size - 1;
- iova_insert_rbtree(&iovad->rbroot, new, gap_node);
+ iova_insert_rbtree(iovad, new, gap_node);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
return 0;
@@ -291,13 +392,15 @@ static void remove_iova(struct iova_domain *iovad, struct iova *iova)
next_iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
else
next_iova->gap_to_prev = next_iova->pfn_lo;
+ next_iova->clamped_gap32 = iova_compute_clamped_gap32(next_iova,
+ iovad->dma_32bit_pfn);
/*
* Propagate next_iova's new augmented values to the root BEFORE
* the erase. Otherwise rotations inside rb_erase_augmented may
- * copy a stale __subtree_max_gap from next_iova to other nodes,
- * leaving ancestors in an inconsistent state that the post-erase
- * propagate cannot fully repair (early-termination at matching
- * intermediate values).
+ * copy a stale __subtree_max_gap or __subtree_max_gap32 from
+ * next_iova to other nodes, leaving ancestors in an inconsistent
+ * state that the post-erase propagate cannot fully repair
+ * (early-termination at matching intermediate values).
*/
iova_gap_callbacks.propagate(&next_iova->node, NULL);
}
@@ -493,7 +596,7 @@ __insert_new_range(struct iova_domain *iovad,
iova = alloc_and_init_iova(pfn_lo, pfn_hi);
if (iova)
- iova_insert_rbtree(&iovad->rbroot, iova, NULL);
+ iova_insert_rbtree(iovad, iova, NULL);
return iova;
}
@@ -544,6 +647,8 @@ reserve_iova(struct iova_domain *iovad,
gap = iova->pfn_lo;
if (iova->gap_to_prev != gap) {
iova->gap_to_prev = gap;
+ iova->clamped_gap32 = iova_compute_clamped_gap32(iova,
+ iovad->dma_32bit_pfn);
iova_gap_callbacks.propagate(node, NULL);
}
if ((pfn_lo >= iova->pfn_lo) &&
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 3c4cc81e5182..d262c6d88d6c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -19,8 +19,10 @@ struct iova {
struct rb_node node;
unsigned long pfn_hi; /* Highest allocated pfn */
unsigned long pfn_lo; /* Lowest allocated pfn */
- unsigned long gap_to_prev; /* Gap (in pfns) to predecessor */
- unsigned long __subtree_max_gap; /* Max gap_to_prev in subtree */
+ unsigned long gap_to_prev; /* Gap (in pfns) to predecessor */
+ unsigned long clamped_gap32; /* gap_to_prev portion below dma_32bit_pfn */
+ unsigned long __subtree_max_gap; /* Max gap_to_prev in subtree */
+ unsigned long __subtree_max_gap32; /* Max clamped_gap32 in subtree */
};
--
2.52.0
* [PATCH 4/5] iova: alignment-aware two-phase search for log(n) aligned alloc
2026-05-13 2:00 [PATCH 0/5] iova augmented rbtree O(log n) alloc_iova Rik van Riel
` (2 preceding siblings ...)
2026-05-13 2:00 ` [PATCH 3/5] iova: limit_pfn-aware augmentation for log(n) 32-bit alloc Rik van Riel
@ 2026-05-13 2:00 ` Rik van Riel
2026-05-13 2:00 ` [PATCH 5/5] iova: add KUnit test suite Rik van Riel
4 siblings, 0 replies; 6+ messages in thread
From: Rik van Riel @ 2026-05-13 2:00 UTC (permalink / raw)
To: linux-kernel
Cc: robin.murphy, joro, will, iommu, kyle, kernel-team, Rik van Riel,
Rik van Riel
From: Rik van Riel <riel@meta.com>
After the limit_pfn-aware augmentation, __iova_search_free_gap is
O(log n) for both 64-bit and 32-bit alloc paths -- but only when
ignoring alignment. The augmented invariant prunes by gap size; it
doesn't know about the per-call align_mask. A subtree whose
__subtree_max_gap >= size can still contain only borderline-sized gaps
that fail the alignment check, forcing the search to visit every node
in the subtree.
For a request of S pages with alignment A (a power of two), a gap of
size G is GUARANTEED to contain an aligned-fitting position iff
G >= S + A - 1. (Worst case: the highest aligned position
round_down(gap_hi - S + 1, A) loses up to A-1 pfns to rounding.)
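For instance, with S = 4 and A = 4 the threshold is S + A - 1 = 7: a
7-pfn gap covering pfns [1, 7] always contains the aligned run [4, 7],
while a 6-pfn gap covering [1, 6] has no 4-aligned start that fits, so
only gaps of at least 7 pfns can be guaranteed to hold an aligned
allocation without inspecting where they sit.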
Two-phase search:
  Phase 1: prune by S + A - 1. Every visited node either prunes its
           subtree or descends into one whose subtree_max guarantees a
           fitting candidate. Strict O(log n).
  Phase 2: only if phase 1 returned NULL. Prune by S, the existing
           behaviour. Catches fortuitously-aligned borderline gaps.
           O(n) worst case, but it only triggers when no comfortable
           gap exists anywhere in the tree.
For typical IOMMU workloads (alloc_iova_fast rounds size up to a power
of two, so A == S, and most allocations are 1-16 pages) phase 1
succeeds and the search is one O(log n) descent. The borderline case
is a network/storage workload that fragments the space with small
power-of-two allocations, leaving roughly half of the marginal gaps
misaligned for size 2, 75% for size 4, and a larger share for size 16
-- exactly the case phase 2 falls back to.
Implementation: __iova_search_free_gap takes a separate prune_size
distinct from the size used for fitting and pfn computation, so phase
1 can prune more aggressively while still returning the correct
top-aligned pfn within whichever gap it finds. The non-aligned alloc
path (size_aligned == false, A == 1) collapses to a single phase since
strict_size == size.
Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
drivers/iommu/iova.c | 65 +++++++++++++++++++++++++++++++++-----------
1 file changed, 49 insertions(+), 16 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 7787da8b2dad..4f3465d8ee16 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -210,18 +210,29 @@ iova_insert_rbtree(struct iova_domain *iovad, struct iova *iova,
}
/*
- * Search the augmented rbtree for the highest-addressed free gap of at least
- * @size pages, with the allocation fitting below @limit_pfn and at or above
- * @start_pfn. When @is_32bit, prune by the 32-bit-clamped subtree max so
- * that 32-bit-restricted allocations on a domain dominated by high-pfn
+ * Search the augmented rbtree for the highest-addressed free gap that fits
+ * @size pages with @align_mask alignment, below @limit_pfn and at or above
+ * @start_pfn.
+ *
+ * @prune_size is the threshold compared against the augmented subtree max:
+ * - If @prune_size == @size, every subtree containing any size-fitting
+ * gap is descended (lax search; can visit O(n) nodes when alignment or
+ * limit_pfn rejects most candidates).
+ * - If @prune_size == @size + alignment - 1, only subtrees containing a
+ * gap big enough to GUARANTEE an aligned fit are descended (strict
+ * search; O(log n) but may miss borderline-sized fortuitously aligned
+ * gaps).
+ *
+ * When @is_32bit, prune by the 32-bit-clamped subtree max so that
+ * 32-bit-restricted allocations on a domain dominated by high-pfn
* allocations stay O(log n) instead of degrading to O(n). Returns the node
* whose gap_to_prev is used, or NULL.
*/
static struct rb_node *
__iova_search_free_gap(struct rb_node *node, unsigned long size,
- unsigned long limit_pfn, unsigned long start_pfn,
- unsigned long align_mask, bool is_32bit,
- unsigned long *new_pfn)
+ unsigned long prune_size, unsigned long limit_pfn,
+ unsigned long start_pfn, unsigned long align_mask,
+ bool is_32bit, unsigned long *new_pfn)
{
struct iova *iova;
struct rb_node *result;
@@ -233,11 +244,12 @@ __iova_search_free_gap(struct rb_node *node, unsigned long size,
iova = to_iova(node);
subtree_max = is_32bit ? iova->__subtree_max_gap32
: iova->__subtree_max_gap;
- if (subtree_max < size)
+ if (subtree_max < prune_size)
return NULL;
- result = __iova_search_free_gap(node->rb_right, size, limit_pfn,
- start_pfn, align_mask, is_32bit, new_pfn);
+ result = __iova_search_free_gap(node->rb_right, size, prune_size,
+ limit_pfn, start_pfn, align_mask,
+ is_32bit, new_pfn);
if (result)
return result;
@@ -257,8 +269,9 @@ __iova_search_free_gap(struct rb_node *node, unsigned long size,
}
}
- return __iova_search_free_gap(node->rb_left, size, limit_pfn,
- start_pfn, align_mask, is_32bit, new_pfn);
+ return __iova_search_free_gap(node->rb_left, size, prune_size,
+ limit_pfn, start_pfn, align_mask,
+ is_32bit, new_pfn);
}
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
@@ -268,11 +281,24 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long flags;
unsigned long new_pfn;
unsigned long align_mask = ~0UL;
+ unsigned long strict_size = size;
struct rb_node *gap_node;
bool is_32bit;
- if (size_aligned)
- align_mask <<= fls_long(size - 1);
+ if (size_aligned) {
+ unsigned long align_shift = fls_long(size - 1);
+
+ align_mask <<= align_shift;
+ /*
+ * For an A-aligned allocation of S pages, a gap of size G is
+ * guaranteed to contain a fitting aligned position iff
+ * G >= S + A - 1. Use that as the strict pruning threshold for
+ * a fast O(log n) first pass; if it fails, fall back to a
+ * pruning threshold of just S to catch fortuitously-aligned
+ * borderline gaps.
+ */
+ strict_size = size + (1UL << align_shift) - 1;
+ }
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
is_32bit = limit_pfn <= iovad->dma_32bit_pfn;
@@ -280,8 +306,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
goto iova32_full;
gap_node = __iova_search_free_gap(iovad->rbroot.rb_node, size,
- limit_pfn, iovad->start_pfn,
- align_mask, is_32bit, &new_pfn);
+ strict_size, limit_pfn,
+ iovad->start_pfn, align_mask,
+ is_32bit, &new_pfn);
+ if (!gap_node && strict_size > size) {
+ gap_node = __iova_search_free_gap(iovad->rbroot.rb_node, size,
+ size, limit_pfn,
+ iovad->start_pfn, align_mask,
+ is_32bit, &new_pfn);
+ }
if (!gap_node) {
if (is_32bit)
iovad->max32_alloc_size = size;
--
2.52.0
* [PATCH 5/5] iova: add KUnit test suite
2026-05-13 2:00 [PATCH 0/5] iova augmented rbtree O(log n) alloc_iova Rik van Riel
` (3 preceding siblings ...)
2026-05-13 2:00 ` [PATCH 4/5] iova: alignment-aware two-phase search for log(n) aligned alloc Rik van Riel
@ 2026-05-13 2:00 ` Rik van Riel
4 siblings, 0 replies; 6+ messages in thread
From: Rik van Riel @ 2026-05-13 2:00 UTC (permalink / raw)
To: linux-kernel
Cc: robin.murphy, joro, will, iommu, kyle, kernel-team, Rik van Riel,
Rik van Riel
From: Rik van Riel <riel@meta.com>
Add a kunit suite for the augmented-rbtree IOVA allocator, plus an
iova_domain_verify_invariants() helper (compiled only when the test
config is enabled) that walks the tree and confirms every node's
gap_to_prev, clamped_gap32, __subtree_max_gap, and __subtree_max_gap32
match what recomputation from scratch yields.
Test cases:
- test_init_destroy: domain lifecycle, no leaks.
- test_basic_alloc_free: single alloc/free roundtrip, top-down reuse.
- test_size_aligned: alignment of size_aligned allocs across orders 0..7.
- test_top_down_preference: sequential allocs decrease in pfn_lo.
- test_limit_pfn_respected: 100 allocs all stay <= limit_pfn.
- test_reserve_iova: allocs avoid the reserved range.
- test_find_iova: lookup by pfn returns the right iova.
- test_32bit_in_64bit_domain: 1000 64-bit allocs followed by a 32-bit
alloc must still find a slot below DMA_BIT_MASK(32) -- exercises
the __subtree_max_gap32 augmentation.
- test_two_phase_alignment: pack size-2 size_aligned allocs, free
every other; subsequent size-2 alloc must succeed via the phase-2
fallback search since phase-1's S+A-1 threshold prunes the size-2
gaps.
- test_pci_32bit_workaround_pattern: alternate 32-bit-first allocation
attempts with 64-bit fallback, mirroring dma-iommu.c.
- test_stress_random: 2048 random alloc/free operations with mixed
sizes, alignments, and 32/64-bit limits, with periodic invariant
checks.
Each test verifies the augmented invariants both during and after the
test run so that any sequencing bug in insert / erase / rotate /
propagate is caught at the operation that introduced it.
Tested by: building drivers/iommu/iova.o and drivers/iommu/iova-kunit.o
(no warnings); runtime execution requires booting a kernel with
CONFIG_IOMMU_IOVA_KUNIT_TEST=y under qemu-system-x86_64 (not available
on this devvm).
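For reference, and assuming the standard kunit_tool workflow (the exact
flags, plus whatever extra config is needed to get the promptless
IOMMU_IOVA symbol selected, may vary by tree), a qemu run could look
roughly like:
  ./tools/testing/kunit/kunit.py run --arch=x86_64 \
          --kconfig_add CONFIG_IOMMU_IOVA_KUNIT_TEST=y 'iova'
where 'iova' is the suite name registered by kunit_test_suite() below.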
Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
drivers/iommu/Kconfig | 14 ++
drivers/iommu/Makefile | 1 +
drivers/iommu/iova-kunit.c | 396 +++++++++++++++++++++++++++++++++++++
drivers/iommu/iova.c | 90 +++++++++
include/linux/iova.h | 3 +
5 files changed, 504 insertions(+)
create mode 100644 drivers/iommu/iova-kunit.c
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f86262b11416..61906a5664a6 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -3,6 +3,20 @@
config IOMMU_IOVA
tristate
+config IOMMU_IOVA_KUNIT_TEST
+ tristate "KUnit tests for the IOVA allocator" if !KUNIT_ALL_TESTS
+ depends on IOMMU_IOVA && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable kunit tests for the IOVA allocator. The tests exercise
+ basic allocation and free, size-aligned allocation, top-down
+ ordering, the limit_pfn-aware 32-bit augmentation, the
+ alignment-aware two-phase search, and randomly-fragmented
+ stress scenarios. Each test verifies that the augmented
+ rbtree invariants remain consistent throughout.
+
+ If unsure, say N here.
+
# IOMMU_API always gets selected by whoever wants it.
config IOMMU_API
bool
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 0275821f4ef9..6bd7da1cbebd 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
+obj-$(CONFIG_IOMMU_IOVA_KUNIT_TEST) += iova-kunit.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/iova-kunit.c b/drivers/iommu/iova-kunit.c
new file mode 100644
index 000000000000..e921c8543668
--- /dev/null
+++ b/drivers/iommu/iova-kunit.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for the IOVA allocator.
+ *
+ * Exercises the augmented-rbtree based allocator: basic alloc/free,
+ * size-aligned allocations, top-down ordering, the limit_pfn-aware
+ * 32-bit augmentation (relevant for the dma-iommu pci_32bit_workaround
+ * pattern), the alignment-aware two-phase search, and randomly
+ * fragmented stress.
+ *
+ * Each test verifies that the augmented invariants
+ * (__subtree_max_gap, __subtree_max_gap32, gap_to_prev, clamped_gap32)
+ * remain consistent after every batch of operations.
+ */
+#include <kunit/test.h>
+#include <linux/dma-mapping.h>
+#include <linux/iova.h>
+#include <linux/random.h>
+
+#define TEST_GRANULE PAGE_SIZE
+/* Highest pfn that fits in 32 bits — triggers the is_32bit alloc path. */
+#define TEST_LIMIT_32BIT (DMA_BIT_MASK(32) >> PAGE_SHIFT)
+/* A 64-bit-ish limit well above dma_32bit_pfn. */
+#define TEST_LIMIT_64BIT ((1UL << 36) >> PAGE_SHIFT)
+
+struct iova_test_ctx {
+ struct iova_domain iovad;
+ bool initialized;
+};
+
+static struct iova_test_ctx *iova_test_init_ctx(struct kunit *test)
+{
+ struct iova_test_ctx *ctx;
+ int ret;
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+
+ ret = iova_cache_get();
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ init_iova_domain(&ctx->iovad, TEST_GRANULE, 1);
+ ret = iova_domain_init_rcaches(&ctx->iovad);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ctx->initialized = true;
+ KUNIT_ASSERT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+ return ctx;
+}
+
+static void iova_test_cleanup(struct kunit *test, struct iova_test_ctx *ctx)
+{
+ if (ctx->initialized) {
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+ put_iova_domain(&ctx->iovad);
+ iova_cache_put();
+ ctx->initialized = false;
+ }
+}
+
+static void test_init_destroy(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_basic_alloc_free(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ struct iova *iova;
+ unsigned long pfn_lo;
+
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, false);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_LE(test, iova->pfn_hi, TEST_LIMIT_32BIT);
+ KUNIT_EXPECT_EQ(test, iova->pfn_hi - iova->pfn_lo + 1, 1);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+ pfn_lo = iova->pfn_lo;
+ __free_iova(&ctx->iovad, iova);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ /* Top-down policy: subsequent alloc should reuse the same pfn. */
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, false);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_EQ(test, iova->pfn_lo, pfn_lo);
+ __free_iova(&ctx->iovad, iova);
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_size_aligned(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ int order;
+
+ for (order = 0; order < 8; ++order) {
+ unsigned long size = 1UL << order;
+ struct iova *iova = alloc_iova(&ctx->iovad, size,
+ TEST_LIMIT_32BIT, true);
+
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_EQ(test, iova->pfn_lo & (size - 1), 0);
+ KUNIT_EXPECT_EQ(test, iova->pfn_hi - iova->pfn_lo + 1, size);
+ __free_iova(&ctx->iovad, iova);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+ }
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_top_down_preference(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ struct iova *iovas[16];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(iovas); ++i) {
+ iovas[i] = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, false);
+ KUNIT_ASSERT_NOT_NULL(test, iovas[i]);
+ if (i > 0)
+ KUNIT_EXPECT_LT(test, iovas[i]->pfn_lo,
+ iovas[i - 1]->pfn_lo);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ for (i = 0; i < ARRAY_SIZE(iovas); ++i)
+ __free_iova(&ctx->iovad, iovas[i]);
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_limit_pfn_respected(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ struct iova *iova;
+ int i;
+
+ for (i = 0; i < 100; ++i) {
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, false);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_LE(test, iova->pfn_hi, TEST_LIMIT_32BIT);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_reserve_iova(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ const unsigned long reserve_lo = TEST_LIMIT_32BIT / 2;
+ struct iova *r, *iova;
+ int i;
+
+ /* Reserve the entire top half through the limit_pfn, inclusive. */
+ r = reserve_iova(&ctx->iovad, reserve_lo, TEST_LIMIT_32BIT);
+ KUNIT_ASSERT_NOT_NULL(test, r);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ /* All allocs must land below the reserved range. */
+ for (i = 0; i < 100; ++i) {
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, false);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_LT(test, iova->pfn_hi, reserve_lo);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ iova_test_cleanup(test, ctx);
+}
+
+static void test_find_iova(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ struct iova *iova, *found;
+
+ iova = alloc_iova(&ctx->iovad, 4, TEST_LIMIT_32BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+
+ found = find_iova(&ctx->iovad, iova->pfn_lo);
+ KUNIT_EXPECT_PTR_EQ(test, found, iova);
+ found = find_iova(&ctx->iovad, iova->pfn_hi);
+ KUNIT_EXPECT_PTR_EQ(test, found, iova);
+ /* Outside the range should not find. */
+ if (iova->pfn_hi + 1 <= TEST_LIMIT_32BIT) {
+ found = find_iova(&ctx->iovad, iova->pfn_hi + 1);
+ KUNIT_EXPECT_PTR_NE(test, found, iova);
+ }
+
+ __free_iova(&ctx->iovad, iova);
+ iova_test_cleanup(test, ctx);
+}
+
+/*
+ * The pci_32bit_workaround scenario: every PCI device's first IOVA
+ * allocation hits the 32-bit-restricted path before falling back to
+ * 64-bit. Mix the two and verify the limit_pfn-aware augmentation
+ * keeps both correct.
+ */
+static void test_32bit_in_64bit_domain(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ struct iova *iova;
+ int i;
+
+ /* Fill the high 64-bit space. */
+ for (i = 0; i < 1000; ++i) {
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_64BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ /* A 32-bit alloc must still find a slot below DMA_BIT_MASK(32). */
+ iova = alloc_iova(&ctx->iovad, 1, TEST_LIMIT_32BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_LE(test, iova->pfn_hi, TEST_LIMIT_32BIT);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ __free_iova(&ctx->iovad, iova);
+ iova_test_cleanup(test, ctx);
+}
+
+/*
+ * Force the alignment-aware two-phase search through phase 2: pack
+ * size-2 size_aligned allocations, free every other one to leave gaps
+ * of size 2 (which the strict phase-1 threshold of size + align - 1 = 3
+ * will prune away), and verify a fresh size-2 alloc still succeeds via
+ * the phase-2 fallback.
+ */
+static void test_two_phase_alignment(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ const int N = 64;
+ struct iova **iovas;
+ struct iova *iova;
+ int i;
+ bool found_phase2_candidate = false;
+
+ iovas = kunit_kcalloc(test, N, sizeof(*iovas), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, iovas);
+
+ for (i = 0; i < N; ++i) {
+ iovas[i] = alloc_iova(&ctx->iovad, 2, TEST_LIMIT_32BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iovas[i]);
+ KUNIT_EXPECT_EQ(test, iovas[i]->pfn_lo & 1, 0);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ /* Free every other to create size-2 gaps interleaved with allocs. */
+ for (i = 0; i < N; i += 2) {
+ __free_iova(&ctx->iovad, iovas[i]);
+ iovas[i] = NULL;
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ /*
+ * Allocate size-2 — phase 1 (threshold 3) should miss the size-2
+ * gaps; phase 2 (threshold 2) should still find one. The result
+ * may also land in the unfragmented gap below the lowest packed
+ * iova; either way, alloc must succeed and the result must be
+ * 2-aligned.
+ */
+ iova = alloc_iova(&ctx->iovad, 2, TEST_LIMIT_32BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ KUNIT_EXPECT_EQ(test, iova->pfn_lo & 1, 0);
+ /* Was it placed in one of the freed slots (= phase 2 hit)? */
+ for (i = 1; i < N; i += 2) {
+ struct iova *neighbor = iovas[i];
+
+ if (!neighbor)
+ continue;
+ if (iova->pfn_hi + 1 == neighbor->pfn_lo ||
+ iova->pfn_lo == neighbor->pfn_hi + 1) {
+ found_phase2_candidate = true;
+ break;
+ }
+ }
+ kunit_info(test, "alloc landed at pfn 0x%lx, phase2-slot=%s\n",
+ iova->pfn_lo, found_phase2_candidate ? "yes" : "no");
+ __free_iova(&ctx->iovad, iova);
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ for (i = 0; i < N; ++i)
+ if (iovas[i])
+ __free_iova(&ctx->iovad, iovas[i]);
+ iova_test_cleanup(test, ctx);
+}
+
+/*
+ * Mimic dma-iommu's pci_32bit_workaround pattern: every alloc first
+ * tries the 32-bit limit; if that fails, retry with the 64-bit limit.
+ * Verifies that the dual-augmented invariant survives the rapid
+ * switching between is_32bit=true and is_32bit=false.
+ */
+static void test_pci_32bit_workaround_pattern(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ int i;
+
+ for (i = 0; i < 500; ++i) {
+ unsigned long size = (i % 4) + 1;
+ struct iova *iova = alloc_iova(&ctx->iovad, size,
+ TEST_LIMIT_32BIT, true);
+
+ if (!iova)
+ iova = alloc_iova(&ctx->iovad, size,
+ TEST_LIMIT_64BIT, true);
+ KUNIT_ASSERT_NOT_NULL(test, iova);
+ }
+ KUNIT_EXPECT_TRUE(test, iova_domain_verify_invariants(&ctx->iovad));
+
+ iova_test_cleanup(test, ctx);
+}
+
+/*
+ * Random alloc/free over many iterations, verifying invariants after
+ * every operation. Uses a deterministic PRNG so failures reproduce
+ * across boots.
+ */
+static void test_stress_random(struct kunit *test)
+{
+ struct iova_test_ctx *ctx = iova_test_init_ctx(test);
+ const int N = 512;
+ const int iters = 4 * N;
+ struct iova **iovas;
+ u32 rng = 0xDEADBEEF;
+ int i;
+
+ iovas = kunit_kcalloc(test, N, sizeof(*iovas), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, iovas);
+
+ for (i = 0; i < iters; ++i) {
+ int slot;
+ bool use_32bit;
+ unsigned long limit;
+ const char *op;
+
+ rng = rng * 1103515245 + 12345;
+ slot = (rng >> 8) % N;
+ rng = rng * 1103515245 + 12345;
+ use_32bit = rng & 1;
+ limit = use_32bit ? TEST_LIMIT_32BIT : TEST_LIMIT_64BIT;
+
+ if (iovas[slot]) {
+ op = "free";
+ __free_iova(&ctx->iovad, iovas[slot]);
+ iovas[slot] = NULL;
+ } else {
+ unsigned long size;
+ bool aligned;
+
+ rng = rng * 1103515245 + 12345;
+ size = 1UL << ((rng >> 8) % 4);
+ rng = rng * 1103515245 + 12345;
+ aligned = rng & 1;
+
+ op = "alloc";
+ iovas[slot] = alloc_iova(&ctx->iovad, size, limit,
+ aligned);
+ }
+ if (!iova_domain_verify_invariants(&ctx->iovad)) {
+ kunit_info(test, "iter %d slot %d: invariant broken after %s\n",
+ i, slot, op);
+ KUNIT_FAIL(test, "verify failed");
+ break;
+ }
+ }
+
+ for (i = 0; i < N; ++i)
+ if (iovas[i])
+ __free_iova(&ctx->iovad, iovas[i]);
+ iova_test_cleanup(test, ctx);
+}
+
+static struct kunit_case iova_test_cases[] = {
+ KUNIT_CASE(test_init_destroy),
+ KUNIT_CASE(test_basic_alloc_free),
+ KUNIT_CASE(test_size_aligned),
+ KUNIT_CASE(test_top_down_preference),
+ KUNIT_CASE(test_limit_pfn_respected),
+ KUNIT_CASE(test_reserve_iova),
+ KUNIT_CASE(test_find_iova),
+ KUNIT_CASE(test_32bit_in_64bit_domain),
+ KUNIT_CASE(test_two_phase_alignment),
+ KUNIT_CASE(test_pci_32bit_workaround_pattern),
+ KUNIT_CASE(test_stress_random),
+ {}
+};
+
+static struct kunit_suite iova_test_suite = {
+ .name = "iova",
+ .test_cases = iova_test_cases,
+};
+kunit_test_suite(iova_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for the IOVA allocator");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 4f3465d8ee16..dfde90fef1f5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -1160,6 +1160,96 @@ void iova_cache_put(void)
}
EXPORT_SYMBOL_GPL(iova_cache_put);
+#if IS_ENABLED(CONFIG_IOMMU_IOVA_KUNIT_TEST)
+/*
+ * Walk the iova rbtree and verify that every node's gap_to_prev,
+ * clamped_gap32, __subtree_max_gap, and __subtree_max_gap32 match what
+ * recomputation from scratch would yield. Returns true on success.
+ *
+ * Intended for use by kunit tests to catch invariant corruption from
+ * insertion / deletion / rotation paths.
+ */
+struct iova_subtree_maxes {
+ unsigned long max_gap;
+ unsigned long max_gap32;
+};
+
+static struct iova_subtree_maxes
+iova_walk_verify(struct rb_node *node, struct iova_domain *iovad, bool *ok)
+{
+ struct iova_subtree_maxes m = { 0, 0 };
+ struct iova_subtree_maxes left, right;
+ struct rb_node *prev_node;
+ struct iova *iova;
+ unsigned long expected_gap;
+ unsigned long expected_clamped;
+
+ if (!node)
+ return m;
+
+ left = iova_walk_verify(node->rb_left, iovad, ok);
+ right = iova_walk_verify(node->rb_right, iovad, ok);
+ iova = to_iova(node);
+
+ prev_node = rb_prev(node);
+ if (prev_node)
+ expected_gap = iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
+ else
+ expected_gap = iova->pfn_lo;
+ if (iova->gap_to_prev != expected_gap) {
+ pr_err("iova_verify: pfn_lo=0x%lx gap_to_prev=%lu expected=%lu\n",
+ iova->pfn_lo, iova->gap_to_prev, expected_gap);
+ *ok = false;
+ }
+
+ expected_clamped = iova_compute_clamped_gap32(iova, iovad->dma_32bit_pfn);
+ if (iova->clamped_gap32 != expected_clamped) {
+ pr_err("iova_verify: pfn_lo=0x%lx clamped_gap32=%lu expected=%lu\n",
+ iova->pfn_lo, iova->clamped_gap32, expected_clamped);
+ *ok = false;
+ }
+
+ m.max_gap = iova->gap_to_prev;
+ if (left.max_gap > m.max_gap)
+ m.max_gap = left.max_gap;
+ if (right.max_gap > m.max_gap)
+ m.max_gap = right.max_gap;
+
+ m.max_gap32 = iova->clamped_gap32;
+ if (left.max_gap32 > m.max_gap32)
+ m.max_gap32 = left.max_gap32;
+ if (right.max_gap32 > m.max_gap32)
+ m.max_gap32 = right.max_gap32;
+
+ if (iova->__subtree_max_gap != m.max_gap) {
+ pr_err("iova_verify: pfn_lo=0x%lx __subtree_max_gap=%lu expected=%lu (own=%lu left=%lu right=%lu)\n",
+ iova->pfn_lo, iova->__subtree_max_gap, m.max_gap,
+ iova->gap_to_prev, left.max_gap, right.max_gap);
+ *ok = false;
+ }
+ if (iova->__subtree_max_gap32 != m.max_gap32) {
+ pr_err("iova_verify: pfn_lo=0x%lx __subtree_max_gap32=%lu expected=%lu (own=%lu left=%lu right=%lu)\n",
+ iova->pfn_lo, iova->__subtree_max_gap32, m.max_gap32,
+ iova->clamped_gap32, left.max_gap32, right.max_gap32);
+ *ok = false;
+ }
+
+ return m;
+}
+
+bool iova_domain_verify_invariants(struct iova_domain *iovad)
+{
+ bool ok = true;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ iova_walk_verify(iovad->rbroot.rb_node, iovad, &ok);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return ok;
+}
+EXPORT_SYMBOL_GPL(iova_domain_verify_invariants);
+#endif /* CONFIG_IOMMU_IOVA_KUNIT_TEST */
+
MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
MODULE_DESCRIPTION("IOMMU I/O Virtual Address management");
MODULE_LICENSE("GPL");
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d262c6d88d6c..108676d8ad69 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -104,6 +104,9 @@ void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
int iova_domain_init_rcaches(struct iova_domain *iovad);
struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
void put_iova_domain(struct iova_domain *iovad);
+#if IS_ENABLED(CONFIG_IOMMU_IOVA_KUNIT_TEST)
+bool iova_domain_verify_invariants(struct iova_domain *iovad);
+#endif
#else
static inline int iova_cache_get(void)
{
--
2.52.0