From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: robin.murphy@arm.com, joro@8bytes.org, will@kernel.org,
	iommu@lists.linux.dev, kyle@mcmartin.ca, kernel-team@meta.com,
	Rik van Riel <riel@meta.com>, Rik van Riel <riel@surriel.com>
Subject: [PATCH 3/5] iova: limit_pfn-aware augmentation for log(n) 32-bit alloc
Date: Tue, 12 May 2026 22:00:20 -0400
Message-ID: <20260513020304.1528751-4-riel@surriel.com>
In-Reply-To: <20260513020304.1528751-1-riel@surriel.com>

From: Rik van Riel <riel@meta.com>

The augmented-rbtree port made __iova_search_free_gap O(log n) when
limit_pfn doesn't constrain any candidate gap, but the search degrades
to O(n) when limit_pfn is small relative to the gaps in the tree. The
classic case is a 32-bit DMA allocation on a domain dominated by 64-bit
allocations: __subtree_max_gap is huge everywhere (the gap between the
highest 64-bit allocation and the IOVA_ANCHOR dominates every subtree),
so size-based pruning never fires, and every node above limit_pfn must
be visited and rejected before the search falls through to the 32-bit
region.
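
Concretely (illustrative numbers): with dma_32bit_pfn = 0x100000 and
thousands of live allocations above it, every subtree contains the huge
anchor-adjacent gap, so the pre-patch prune

  if (iova->__subtree_max_gap < size)
          return NULL;

never takes for a limit_pfn = 0xfffff search, and the right-first
recursion rejects every high-pfn node one by one.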

Add a second augmented field that bounds the search by 32-bit-clamped
gap size:

  clamped_gap32(node) =
      0                                            if gap_to_prev == 0
      0                                            if gap_lo >= dma_32bit_pfn
      min(gap_hi, dma_32bit_pfn - 1) - gap_lo + 1  otherwise

  where gap_lo = pfn_lo - gap_to_prev and gap_hi = pfn_lo - 1 delimit
  the free gap immediately below the node, and

  __subtree_max_gap32 = max(clamped_gap32) over the node's subtree
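
A worked instance (illustrative numbers; 4 KiB granule, so
dma_32bit_pfn = 0x100000): a node at pfn_lo = 0x200000 with
gap_to_prev = 0x110000 yields

  gap_lo        = 0x200000 - 0x110000 = 0xf0000
  gap_hi        = min(0x1fffff, 0xfffff) = 0xfffff
  clamped_gap32 = 0xfffff - 0xf0000 + 1 = 0x10000

so only the 0x10000 pfns below the 32-bit boundary count toward the
32-bit search.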

clamped_gap32 is precomputed and stored on each iova whenever
gap_to_prev changes (insert, remove, reserve overlap fix-up), so the
augmented callbacks remain pure functions of the node and don't need
domain context.
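
For reference, the augment callbacks (struct rb_augment_callbacks in
include/linux/rbtree_augmented.h) only ever see rb_node pointers, so
there is nowhere to pass dma_32bit_pfn from at propagate/rotate time:

  struct rb_augment_callbacks {
          void (*propagate)(struct rb_node *node, struct rb_node *stop);
          void (*copy)(struct rb_node *old, struct rb_node *new);
          void (*rotate)(struct rb_node *old, struct rb_node *new);
  };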

The compute helper iova_compute_max() updates both __subtree_max_gap
and __subtree_max_gap32 in one pass; hand-rolled propagate / copy /
rotate callbacks (replacing the single-field RB_DECLARE_CALLBACKS_MAX
expansion) maintain both fields across insert and erase rebalancing.
Pruning in __iova_search_free_gap selects between the two based on
whether the caller is doing a 32-bit allocation.

For 64-bit allocations the behaviour is unchanged. For 32-bit
allocations on mixed-DMA-mask domains the search is now O(log n).
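
As a usage sketch (not part of this patch; iovad and nr_pages are
assumed from the caller's context), the new pruning is reached through
the existing allocator API, the same way dma-iommu issues a
32-bit-limited allocation:

  unsigned long limit = DMA_BIT_MASK(32) >> iova_shift(iovad);
  struct iova *iova = alloc_iova(iovad, nr_pages, limit, true);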

Assisted-by: Claude:claude-opus-4.7
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 drivers/iommu/iova.c | 153 ++++++++++++++++++++++++++++++++++++-------
 include/linux/iova.h |   6 +-
 2 files changed, 133 insertions(+), 26 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index c358ce981cae..7787da8b2dad 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -35,14 +35,97 @@ static struct iova *to_iova(struct rb_node *node)
 	return rb_entry(node, struct iova, node);
 }
 
-static inline unsigned long iova_gap_value(struct iova *iova)
+/*
+ * Portion of @iova->gap_to_prev that lies strictly below @dma_32bit_pfn,
+ * i.e. the largest contiguous sub-gap that a 32-bit-restricted allocation
+ * could possibly use. Maintained alongside gap_to_prev so the augmented
+ * callbacks can compare against it without needing per-iova access to the
+ * domain's dma_32bit_pfn.
+ */
+static unsigned long
+iova_compute_clamped_gap32(struct iova *iova, unsigned long dma_32bit_pfn)
+{
+	unsigned long gap_lo, gap_hi;
+
+	if (iova->gap_to_prev == 0)
+		return 0;
+	gap_lo = iova->pfn_lo - iova->gap_to_prev;
+	if (gap_lo >= dma_32bit_pfn)
+		return 0;
+	gap_hi = iova->pfn_lo - 1;
+	if (gap_hi >= dma_32bit_pfn)
+		gap_hi = dma_32bit_pfn - 1;
+	return gap_hi - gap_lo + 1;
+}
+
+/*
+ * Recompute @node's __subtree_max_gap and __subtree_max_gap32 from its
+ * own gap fields and its children's subtree maxes. Returns true (for
+ * propagate's early-termination) if neither value would change.
+ */
+static bool iova_compute_max(struct iova *node, bool exit)
 {
-	return iova->gap_to_prev;
+	unsigned long max_gap = node->gap_to_prev;
+	unsigned long max_gap32 = node->clamped_gap32;
+	struct iova *child;
+
+	if (node->node.rb_left) {
+		child = to_iova(node->node.rb_left);
+		if (child->__subtree_max_gap > max_gap)
+			max_gap = child->__subtree_max_gap;
+		if (child->__subtree_max_gap32 > max_gap32)
+			max_gap32 = child->__subtree_max_gap32;
+	}
+	if (node->node.rb_right) {
+		child = to_iova(node->node.rb_right);
+		if (child->__subtree_max_gap > max_gap)
+			max_gap = child->__subtree_max_gap;
+		if (child->__subtree_max_gap32 > max_gap32)
+			max_gap32 = child->__subtree_max_gap32;
+	}
+	if (exit && node->__subtree_max_gap == max_gap &&
+	    node->__subtree_max_gap32 == max_gap32)
+		return true;
+	node->__subtree_max_gap = max_gap;
+	node->__subtree_max_gap32 = max_gap32;
+	return false;
 }
 
-RB_DECLARE_CALLBACKS_MAX(static, iova_gap_callbacks,
-			 struct iova, node, unsigned long, __subtree_max_gap,
-			 iova_gap_value)
+static void iova_gap_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+	while (rb != stop) {
+		struct iova *node = to_iova(rb);
+
+		if (iova_compute_max(node, true))
+			break;
+		rb = rb_parent(&node->node);
+	}
+}
+
+static void iova_gap_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+	struct iova *old = to_iova(rb_old);
+	struct iova *new = to_iova(rb_new);
+
+	new->__subtree_max_gap = old->__subtree_max_gap;
+	new->__subtree_max_gap32 = old->__subtree_max_gap32;
+}
+
+static void iova_gap_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+	struct iova *old = to_iova(rb_old);
+	struct iova *new = to_iova(rb_new);
+
+	new->__subtree_max_gap = old->__subtree_max_gap;
+	new->__subtree_max_gap32 = old->__subtree_max_gap32;
+	iova_compute_max(old, false);
+}
+
+static const struct rb_augment_callbacks iova_gap_callbacks = {
+	.propagate = iova_gap_propagate,
+	.copy = iova_gap_copy,
+	.rotate = iova_gap_rotate,
+};
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -64,6 +147,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
 	iovad->anchor.gap_to_prev = IOVA_ANCHOR;
 	iovad->anchor.__subtree_max_gap = IOVA_ANCHOR;
+	iovad->anchor.clamped_gap32 = iova_compute_clamped_gap32(&iovad->anchor,
+								 iovad->dma_32bit_pfn);
+	iovad->anchor.__subtree_max_gap32 = iovad->anchor.clamped_gap32;
 	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
 	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
 }
@@ -71,9 +157,10 @@ EXPORT_SYMBOL_GPL(init_iova_domain);
 
 /* Insert the iova into domain rbtree by holding writer lock */
 static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+iova_insert_rbtree(struct iova_domain *iovad, struct iova *iova,
 		   struct rb_node *start)
 {
+	struct rb_root *root = &iovad->rbroot;
 	struct rb_node **new, *parent = NULL;
 	struct rb_node *prev_node, *next_node;
 
@@ -101,12 +188,18 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
 		iova->gap_to_prev = iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
 	else
 		iova->gap_to_prev = iova->pfn_lo;
+	iova->clamped_gap32 = iova_compute_clamped_gap32(iova, iovad->dma_32bit_pfn);
 	iova->__subtree_max_gap = iova->gap_to_prev;
+	iova->__subtree_max_gap32 = iova->clamped_gap32;
 
 	next_node = rb_next(&iova->node);
-	if (next_node)
-		to_iova(next_node)->gap_to_prev =
-			to_iova(next_node)->pfn_lo - iova->pfn_hi - 1;
+	if (next_node) {
+		struct iova *next_iova = to_iova(next_node);
+
+		next_iova->gap_to_prev = next_iova->pfn_lo - iova->pfn_hi - 1;
+		next_iova->clamped_gap32 = iova_compute_clamped_gap32(next_iova,
+								      iovad->dma_32bit_pfn);
+	}
 
 	if (parent)
 		iova_gap_callbacks.propagate(parent, NULL);
@@ -119,25 +212,32 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
 /*
  * Search the augmented rbtree for the highest-addressed free gap of at least
  * @size pages, with the allocation fitting below @limit_pfn and at or above
- * @start_pfn. Returns the node whose gap_to_prev is used, or NULL.
+ * @start_pfn. When @is_32bit, prune by the 32-bit-clamped subtree max so
+ * that 32-bit-restricted allocations on a domain dominated by high-pfn
+ * allocations stay O(log n) instead of degrading to O(n). Returns the node
+ * whose gap_to_prev is used, or NULL.
  */
 static struct rb_node *
 __iova_search_free_gap(struct rb_node *node, unsigned long size,
 		       unsigned long limit_pfn, unsigned long start_pfn,
-		       unsigned long align_mask, unsigned long *new_pfn)
+		       unsigned long align_mask, bool is_32bit,
+		       unsigned long *new_pfn)
 {
 	struct iova *iova;
 	struct rb_node *result;
+	unsigned long subtree_max;
 
 	if (!node)
 		return NULL;
 
 	iova = to_iova(node);
-	if (iova->__subtree_max_gap < size)
+	subtree_max = is_32bit ? iova->__subtree_max_gap32
+			       : iova->__subtree_max_gap;
+	if (subtree_max < size)
 		return NULL;
 
 	result = __iova_search_free_gap(node->rb_right, size, limit_pfn,
-					start_pfn, align_mask, new_pfn);
+					start_pfn, align_mask, is_32bit, new_pfn);
 	if (result)
 		return result;
 
@@ -158,7 +258,7 @@ __iova_search_free_gap(struct rb_node *node, unsigned long size,
 	}
 
 	return __iova_search_free_gap(node->rb_left, size, limit_pfn,
-				      start_pfn, align_mask, new_pfn);
+				      start_pfn, align_mask, is_32bit, new_pfn);
 }
 
 static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
@@ -169,20 +269,21 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	unsigned long new_pfn;
 	unsigned long align_mask = ~0UL;
 	struct rb_node *gap_node;
+	bool is_32bit;
 
 	if (size_aligned)
 		align_mask <<= fls_long(size - 1);
 
 	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	if (limit_pfn <= iovad->dma_32bit_pfn &&
-			size >= iovad->max32_alloc_size)
+	is_32bit = limit_pfn <= iovad->dma_32bit_pfn;
+	if (is_32bit && size >= iovad->max32_alloc_size)
 		goto iova32_full;
 
 	gap_node = __iova_search_free_gap(iovad->rbroot.rb_node, size,
 					  limit_pfn, iovad->start_pfn,
-					  align_mask, &new_pfn);
+					  align_mask, is_32bit, &new_pfn);
 	if (!gap_node) {
-		if (limit_pfn <= iovad->dma_32bit_pfn)
+		if (is_32bit)
 			iovad->max32_alloc_size = size;
 		goto iova32_full;
 	}
@@ -190,7 +291,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	new->pfn_lo = new_pfn;
 	new->pfn_hi = new_pfn + size - 1;
 
-	iova_insert_rbtree(&iovad->rbroot, new, gap_node);
+	iova_insert_rbtree(iovad, new, gap_node);
 
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 	return 0;
@@ -291,13 +392,15 @@ static void remove_iova(struct iova_domain *iovad, struct iova *iova)
 				next_iova->pfn_lo - to_iova(prev_node)->pfn_hi - 1;
 		else
 			next_iova->gap_to_prev = next_iova->pfn_lo;
+		next_iova->clamped_gap32 = iova_compute_clamped_gap32(next_iova,
+								      iovad->dma_32bit_pfn);
 		/*
 		 * Propagate next_iova's new augmented values to the root BEFORE
 		 * the erase. Otherwise rotations inside rb_erase_augmented may
-		 * copy a stale __subtree_max_gap from next_iova to other nodes,
-		 * leaving ancestors in an inconsistent state that the post-erase
-		 * propagate cannot fully repair (early-termination at matching
-		 * intermediate values).
+		 * copy a stale __subtree_max_gap or __subtree_max_gap32 from
+		 * next_iova to other nodes, leaving ancestors in an inconsistent
+		 * state that the post-erase propagate cannot fully repair
+		 * (early-termination at matching intermediate values).
 		 */
 		iova_gap_callbacks.propagate(&next_iova->node, NULL);
 	}
@@ -493,7 +596,7 @@ __insert_new_range(struct iova_domain *iovad,
 
 	iova = alloc_and_init_iova(pfn_lo, pfn_hi);
 	if (iova)
-		iova_insert_rbtree(&iovad->rbroot, iova, NULL);
+		iova_insert_rbtree(iovad, iova, NULL);
 
 	return iova;
 }
@@ -544,6 +647,8 @@ reserve_iova(struct iova_domain *iovad,
 				gap = iova->pfn_lo;
 			if (iova->gap_to_prev != gap) {
 				iova->gap_to_prev = gap;
+				iova->clamped_gap32 = iova_compute_clamped_gap32(iova,
+										 iovad->dma_32bit_pfn);
 				iova_gap_callbacks.propagate(node, NULL);
 			}
 			if ((pfn_lo >= iova->pfn_lo) &&
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 3c4cc81e5182..d262c6d88d6c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -19,8 +19,10 @@ struct iova {
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* Highest allocated pfn */
 	unsigned long	pfn_lo; /* Lowest allocated pfn */
-	unsigned long	gap_to_prev;       /* Gap (in pfns) to predecessor */
-	unsigned long	__subtree_max_gap; /* Max gap_to_prev in subtree */
+	unsigned long	gap_to_prev;         /* Gap (in pfns) to predecessor */
+	unsigned long	clamped_gap32;       /* gap_to_prev portion below dma_32bit_pfn */
+	unsigned long	__subtree_max_gap;   /* Max gap_to_prev in subtree */
+	unsigned long	__subtree_max_gap32; /* Max clamped_gap32 in subtree */
 };
 
 
-- 
2.52.0


