Linux-HyperV List
 help / color / mirror / Atom feed
* [RFC PATCH 13/20] kho: add radix tree freeze and del_key() error reporting
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

Add kho_radix_tree_freeze() to prevent further modifications to a
KHO radix tree. After freezing, kho_radix_add_key() and
kho_radix_del_key() return -EBUSY. This is used by the MSHV page
preservation code to lock the tree before serializing it for kexec.

Also change kho_radix_del_key() from void to int so it can report
-EINVAL (tree not initialized), -EBUSY (frozen) and -ENOENT (a node on
the key's path is missing).

Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     | 24 ++++++++++----
 kernel/liveupdate/kexec_handover.c | 51 +++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index c0840ecb230c..4fe2238e1e30 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -21,10 +21,10 @@
  * scheme. Each key is an unsigned long that combines a page's physical
  * address and its order.
  *
- * Client code is responsible for allocating the root node of the tree,
- * initializing the mutex lock, and managing its lifecycle. It must use the
- * tree data structures defined in the KHO ABI,
- * `include/linux/kho/abi/kexec_handover.h`.
+ * Client code must initialize the tree using kho_radix_init_tree(). Pass
+ * a pointer to the root node to restore a tree preserved across kexec, or
+ * NULL to allocate a fresh empty tree. The tree uses data structures defined
+ * in the KHO ABI, `include/linux/kho/abi/kexec_handover.h`.
  */
 
 struct kho_radix_node;
@@ -32,6 +32,7 @@ struct kho_radix_node;
 struct kho_radix_tree {
 	struct kho_radix_node *root;
 	struct mutex lock; /* protects the tree's structure and root pointer */
+	bool frozen;
 };
 
 /**
@@ -51,11 +52,12 @@ struct kho_radix_walk_cb {
 #ifdef CONFIG_KEXEC_HANDOVER
 
 int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
-void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
+int kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
 			const struct kho_radix_walk_cb *cb, void *data);
 int kho_radix_init_tree(struct kho_radix_tree *tree, struct kho_radix_node *root);
 void kho_radix_destroy_tree(struct kho_radix_tree *tree);
+int kho_radix_tree_freeze(struct kho_radix_tree *tree);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
@@ -64,8 +66,11 @@ static inline int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long k
 	return -EOPNOTSUPP;
 }
 
-static inline void kho_radix_del_key(struct kho_radix_tree *tree,
-				     unsigned long key) { }
+static inline int kho_radix_del_key(struct kho_radix_tree *tree,
+				     unsigned long key)
+{
+	return -EOPNOTSUPP;
+}
 
 static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
 				      const struct kho_radix_walk_cb *cb, void *data)
@@ -81,6 +86,11 @@ static inline int kho_radix_init_tree(struct kho_radix_tree *tree,
 
 static inline void kho_radix_destroy_tree(struct kho_radix_tree *tree) { }
 
+static inline int kho_radix_tree_freeze(struct kho_radix_tree *tree)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* #ifdef CONFIG_KEXEC_HANDOVER */
 
 #endif	/* _LINUX_KHO_RADIX_TREE_H */
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 797ec285b698..2e2b4e73f00d 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -79,9 +79,6 @@ struct kho_out {
 
 static struct kho_out kho_out = {
 	.lock = __MUTEX_INITIALIZER(kho_out.lock),
-	.radix_tree = {
-		.lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock),
-	},
 };
 
 struct kho_in {
@@ -180,6 +177,28 @@ static void __ref kho_radix_free_node(struct kho_radix_node *node)
 		memblock_free(node, PAGE_SIZE);
 }
 
+/**
+ * kho_radix_tree_freeze - Freeze the tree, preventing further modifications.
+ * @tree: The KHO radix tree to freeze.
+ *
+ * After freezing, kho_radix_add_key() and kho_radix_del_key() will return
+ * -EBUSY. The check is performed under the tree's mutex, so there is no
+ * race between a concurrent add/del and the freeze.
+ *
+ * Return: 0 on success, -EBUSY if the tree is already frozen.
+ */
+int kho_radix_tree_freeze(struct kho_radix_tree *tree)
+{
+	guard(mutex)(&tree->lock);
+
+	if (tree->frozen)
+		return -EBUSY;
+
+	tree->frozen = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_radix_tree_freeze);
+
 /**
  * kho_radix_add_key - Add a key to the radix tree.
  * @tree: The KHO radix tree.
@@ -210,6 +229,9 @@ int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
 
 	guard(mutex)(&tree->lock);
 
+	if (tree->frozen)
+		return -EBUSY;
+
 	/* Go from high levels to low levels */
 	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
 		idx = kho_radix_get_table_index(key, i);
@@ -268,20 +290,26 @@ EXPORT_SYMBOL_GPL(kho_radix_add_key);
  * This function traverses the radix tree and clears the bit corresponding to
  * the key, effectively removing it from the tree. It does not free the tree's
  * intermediate nodes, even if they become empty.
+ *
+ * Return: 0 on success, -EINVAL if the tree is uninitialized, -EBUSY if
+ *         frozen, -ENOENT if a node on the key's path is missing.
  */
-void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
+int kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 {
 	struct kho_radix_node *node = tree->root;
 	struct kho_radix_leaf *leaf;
 	unsigned int i, idx;
 
 	if (WARN_ON_ONCE(!tree->root))
-		return;
+		return -EINVAL;
 
 	might_sleep();
 
 	guard(mutex)(&tree->lock);
 
+	if (WARN_ON_ONCE(tree->frozen))
+		return -EBUSY;
+
 	/* Go from high levels to low levels */
 	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
 		idx = kho_radix_get_table_index(key, i);
@@ -291,7 +319,7 @@ void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 		 * return with a warning.
 		 */
 		if (WARN_ON(!node->table[idx]))
-			return;
+			return -ENOENT;
 
 		node = phys_to_virt(node->table[idx]);
 	}
@@ -300,6 +328,8 @@ void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 	leaf = (struct kho_radix_leaf *)node;
 	idx = kho_radix_get_bitmap_index(key);
 	__clear_bit(idx, leaf->bitmap);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kho_radix_del_key);
 
@@ -346,6 +376,7 @@ int kho_radix_init_tree(struct kho_radix_tree *tree, struct kho_radix_node *root
 
 	tree->root = root;
 	mutex_init(&tree->lock);
+	tree->frozen = false;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kho_radix_init_tree);
@@ -1746,11 +1777,9 @@ static __init int kho_init(void)
 	if (!kho_enable)
 		return 0;
 
-	tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!tree->root) {
-		err = -ENOMEM;
+	err = kho_radix_init_tree(tree, NULL);
+	if (err)
 		goto err_free_scratch;
-	}
 
 	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
 	if (IS_ERR(kho_out.fdt)) {
@@ -1807,7 +1836,7 @@ static __init int kho_init(void)
 err_free_fdt:
 	kho_unpreserve_free(kho_out.fdt);
 err_free_kho_radix_tree_root:
-	kfree(tree->root);
+	free_page((unsigned long)tree->root);
 	tree->root = NULL;
 err_free_scratch:
 	kho_out.fdt = NULL;
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 14/20] kho: Add crash-kernel-safe radix tree presence check
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

In the crash kernel, the old kernel's memory is outside the direct
map. Add a read-only radix tree variant that memremaps nodes during
init so that subsequent page presence checks can traverse the tree
with plain pointer dereferencing.

This will be used by the MSHV driver to exclude hypervisor-owned pages
from /proc/vmcore via a pfn_is_ram() callback.

Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  30 +++++++
 kernel/liveupdate/kexec_handover.c | 124 +++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 4fe2238e1e30..e906a874e612 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -49,6 +49,19 @@ struct kho_radix_walk_cb {
 	int (*table)(phys_addr_t phys, void *data);
 };
 
+/**
+ * struct kho_radix_crash_tree - Read-only radix tree for crash kernel use.
+ * @root: pointer to the remapped root node
+ *
+ * In the crash kernel, the old kernel's memory is not in the direct map.
+ * This variant uses memremap() during init to map the tree nodes and
+ * converts the physical address table entries to virtual addresses in-place,
+ * enabling efficient pointer-based traversal without per-lookup remapping.
+ */
+struct kho_radix_crash_tree {
+	struct kho_radix_node *root;
+};
+
 #ifdef CONFIG_KEXEC_HANDOVER
 
 int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
@@ -59,6 +72,11 @@ int kho_radix_init_tree(struct kho_radix_tree *tree, struct kho_radix_node *root
 void kho_radix_destroy_tree(struct kho_radix_tree *tree);
 int kho_radix_tree_freeze(struct kho_radix_tree *tree);
 
+int kho_radix_crash_init(struct kho_radix_crash_tree *tree, phys_addr_t root_pa);
+
+bool kho_radix_crash_contains_page(struct kho_radix_crash_tree *tree,
+				   unsigned long pfn, unsigned int order);
+
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
 static inline int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
@@ -91,6 +109,18 @@ static inline int kho_radix_tree_freeze(struct kho_radix_tree *tree)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_radix_crash_init(struct kho_radix_crash_tree *tree,
+				       phys_addr_t root_pa)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool kho_radix_crash_contains_page(
+					struct kho_radix_crash_tree *tree,
+					unsigned long pfn, unsigned int order)
+{
+	return false;
+}
 #endif /* #ifdef CONFIG_KEXEC_HANDOVER */
 
 #endif	/* _LINUX_KHO_RADIX_TREE_H */
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 2e2b4e73f00d..0dfdf0f9781e 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -15,6 +15,7 @@
 #include <linux/kmemleak.h>
 #include <linux/count_zeros.h>
 #include <linux/kasan.h>
+#include <linux/io.h>
 #include <linux/kexec.h>
 #include <linux/kexec_handover.h>
 #include <linux/kho_radix_tree.h>
@@ -396,6 +397,129 @@ void kho_radix_destroy_tree(struct kho_radix_tree *tree)
 }
 EXPORT_SYMBOL_GPL(kho_radix_destroy_tree);
 
+/*
+ * Convert a crash tree node's children from PA to VA in-place via memremap().
+ * On failure, already-remapped pages are not cleaned up -- the crash kernel
+ * is short-lived and will reboot after dump collection, so the leak is
+ * inconsequential.
+ */
+static int kho_radix_crash_convert_node(struct kho_radix_node *node,
+					unsigned int level)
+{
+	struct kho_radix_node *child;
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < (1 << KHO_TABLE_SIZE_LOG2); i++) {
+		if (!node->table[i])
+			continue;
+
+		/* Validate: PA must have bit 63 clear and be page-aligned */
+		if ((node->table[i] & BIT_ULL(63)) ||
+		    (node->table[i] & (PAGE_SIZE - 1)))
+			return -EINVAL;
+
+		child = memremap(node->table[i], PAGE_SIZE, MEMREMAP_WB);
+		if (!child)
+			return -ENOMEM;
+
+		/* Overwrite PA with VA in-place */
+		node->table[i] = (u64)(uintptr_t)child;
+
+		/* Recurse for intermediate levels; level 1 children are leaves */
+		if (level > 1) {
+			err = kho_radix_crash_convert_node(child, level - 1);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * kho_radix_crash_init - Initialize a crash-kernel view of a KHO radix tree.
+ * @tree: The crash tree to initialize.
+ * @root_pa: Physical address of the radix tree root from the old kernel.
+ *
+ * Maps the old kernel's radix tree into the crash kernel's address space
+ * by memremapping each node and converting table entries from physical to
+ * virtual addresses in-place. After successful initialization, the tree
+ * can be traversed with kho_radix_crash_contains_page() using direct
+ * pointer dereferencing.
+ *
+ * This function is intended for use in the crash kernel where the old
+ * kernel's memory is not in the direct map. No locking is used as the
+ * crash kernel is effectively single-threaded during dump collection.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kho_radix_crash_init(struct kho_radix_crash_tree *tree, phys_addr_t root_pa)
+{
+	struct kho_radix_node *root;
+	int err;
+
+	tree->root = NULL;
+
+	if (!root_pa || (root_pa & (PAGE_SIZE - 1)))
+		return -EINVAL;
+
+	root = memremap(root_pa, PAGE_SIZE, MEMREMAP_WB);
+	if (!root)
+		return -ENOMEM;
+
+	err = kho_radix_crash_convert_node(root, KHO_TREE_MAX_DEPTH - 1);
+	if (err)
+		return err;
+
+	tree->root = root;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_radix_crash_init);
+
+/**
+ * kho_radix_crash_contains_page - Check if a page is in a crash-kernel radix tree.
+ * @tree: The crash tree, previously initialized with kho_radix_crash_init().
+ * @pfn: The page frame number to check.
+ * @order: The order of the page.
+ *
+ * Traverses the radix tree using direct pointer dereferencing (the table
+ * entries were converted from PA to VA during init). No locking is used as the
+ * crash kernel is effectively single-threaded during dump collection.
+ *
+ * Note: This function checks specifically for the presence of the page at the
+ * given order. If a larger order page that encompasses this page is preserved,
+ * this function will return false.
+ *
+ * Return: true if the page is present in the tree, false otherwise.
+ */
+bool kho_radix_crash_contains_page(struct kho_radix_crash_tree *tree,
+				   unsigned long pfn, unsigned int order)
+{
+	unsigned long key = kho_encode_radix_key(PFN_PHYS(pfn), order);
+	struct kho_radix_node *node = tree->root;
+	struct kho_radix_leaf *leaf;
+	unsigned int i, idx;
+
+	if (!tree->root)
+		return false;
+
+	/* Traverse using VA pointers stored in table[] */
+	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
+		idx = kho_radix_get_table_index(key, i);
+
+		if (!node->table[idx])
+			return false;
+
+		node = (struct kho_radix_node *)(uintptr_t)node->table[idx];
+	}
+
+	leaf = (struct kho_radix_leaf *)node;
+	idx = kho_radix_get_bitmap_index(key);
+	return test_bit(idx, leaf->bitmap);
+}
+EXPORT_SYMBOL_GPL(kho_radix_crash_contains_page);
+
 static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
 			       const struct kho_radix_walk_cb *cb, void *data)
 {
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 12/20] mm/hugetlb: make bootmem allocation work with KHO
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

Gigantic page allocation is somewhat broken currently when KHO is used.

Firstly, it breaks KHO scratch size accounting. RSRV_KERN is used to
track how much memory is reserved for use by the kernel. Since
alloc_bootmem() calls the memblock_alloc*() APIs, the hugepages
allocated also get marked as RSRV_KERN.

Allocations marked RSRV_KERN are used by KHO to calculate how much
scratch space it should reserve to make sure the next kernel has enough
memory to boot when it is in scratch-only phase. Counting hugepages
towards that total blows up the scratch size and can cause the scratch
allocation to fail, making KHO unusable. This shows up when huge pages
make up more than 50% of the system, which is a fairly common use case.

Secondly, while not supported right now, huge pages are user memory and
can be preserved via KHO. The scratch spaces should not have any
preserved memory. Allocating hugepages from scratch (on a KHO boot) can
lead to them being un-preservable.

Introduce memblock_alloc_nid_user(). This does two things: first, it
instructs __memblock_alloc_range_nid() to not use scratch areas to
fulfill allocation. If KHO is in scratch-only mode, allocations will
only be made from extended scratch areas. Second, it removes RSRV_KERN
from the allocation to make sure it doesn't mess up scratch size
accounting.

To reduce duplication, introduce __memblock_alloc_range_nid() which does
exactly what memblock_alloc_range_nid() used to do, but takes the flags
from its caller. Then make memblock_alloc_range_nid() a wrapper to it.
This lets memblock_alloc_nid_user() re-use most of the logic without
causing churn to update all callers of memblock_alloc_range_nid() and
adding yet another argument to it.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/memblock.h |   4 ++
 mm/hugetlb.c             |  19 ++----
 mm/memblock.c            | 138 ++++++++++++++++++++++++++++++---------
 3 files changed, 116 insertions(+), 45 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 4f535ca4947a..c7056cf3f0f2 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -160,6 +160,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
 int memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t size);
+int memblock_reserved_clear_kern(phys_addr_t base, phys_addr_t size);
 int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size);
 int memblock_mark_kho_scratch_ext(phys_addr_t base, phys_addr_t size);
 int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size);
@@ -431,6 +432,9 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
 
+void *memblock_alloc_nid_user(phys_addr_t size, phys_addr_t align, int nid,
+			      bool exact_nid);
+
 static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
 {
 	return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 571212b80835..46f2b1bd5abe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3033,26 +3033,19 @@ static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
 	if (hugetlb_early_cma(h))
 		m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
 	else {
-		if (node_exact)
-			m = memblock_alloc_exact_nid_raw(huge_page_size(h),
-				huge_page_size(h), 0,
-				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
-		else {
-			m = memblock_alloc_try_nid_raw(huge_page_size(h),
-				huge_page_size(h), 0,
-				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+		m = memblock_alloc_nid_user(huge_page_size(h), huge_page_size(h),
+					    nid, node_exact);
+		if (m) {
 			/*
 			 * For pre-HVO to work correctly, pages need to be on
 			 * the list for the node they were actually allocated
 			 * from. That node may be different in the case of
-			 * fallback by memblock_alloc_try_nid_raw. So,
-			 * extract the actual node first.
+			 * fallback by memblock_alloc_nid_user(). So, extract
+			 * the actual node first.
 			 */
-			if (m)
+			if (!node_exact)
 				listnode = early_pfn_to_nid(PHYS_PFN(__pa(m)));
-		}
 
-		if (m) {
 			m->flags = 0;
 			m->cma = NULL;
 		}
diff --git a/mm/memblock.c b/mm/memblock.c
index 6f76a6bb96d6..8cd52d34ad6e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -178,11 +178,21 @@ bool __init_memblock memblock_has_mirror(void)
 	return system_has_some_mirror;
 }
 
-static enum memblock_flags __init_memblock choose_memblock_flags(void)
+static enum memblock_flags __init_memblock choose_memblock_flags(bool user)
 {
 	/* skip non-scratch memory for kho early boot allocations */
-	if (kho_scratch_only)
-		return MEMBLOCK_KHO_SCRATCH | MEMBLOCK_KHO_SCRATCH_EXT;
+	if (kho_scratch_only) {
+		enum memblock_flags flags = MEMBLOCK_KHO_SCRATCH_EXT;
+
+		/*
+		 * Scratch can only be used for kernel memory, since user memory
+		 * might be preserved and thus can not be in scratch.
+		 */
+		if (!user)
+			flags |= MEMBLOCK_KHO_SCRATCH;
+
+		return flags;
+	}
 
 	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
 }
@@ -346,7 +356,7 @@ static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t align)
 {
 	phys_addr_t ret;
-	enum memblock_flags flags = choose_memblock_flags();
+	enum memblock_flags flags = choose_memblock_flags(false);
 
 again:
 	ret = memblock_find_in_range_node(size, align, start, end,
@@ -1175,6 +1185,20 @@ int __init_memblock memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t si
 				    MEMBLOCK_RSRV_KERN);
 }
 
+/**
+ * memblock_reserved_clear_kern - Clear MEMBLOCK_RSRV_KERN flag for region
+ *
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_reserved_clear_kern(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(&memblock.reserved, base, size, 0,
+				    MEMBLOCK_RSRV_KERN);
+}
+
 /**
  * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
  * @base: the base phys addr of the region
@@ -1534,37 +1558,11 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 	return 0;
 }
 
-/**
- * memblock_alloc_range_nid - allocate boot memory block
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @start: the lower bound of the memory region to allocate (phys address)
- * @end: the upper bound of the memory region to allocate (phys address)
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- * @exact_nid: control the allocation fall back to other nodes
- *
- * The allocation is performed from memory region limited by
- * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
- *
- * If the specified node can not hold the requested memory and @exact_nid
- * is false, the allocation falls back to any node in the system.
- *
- * For systems with memory mirroring, the allocation is attempted first
- * from the regions with mirroring enabled and then retried from any
- * memory region.
- *
- * In addition, function using kmemleak_alloc_phys for allocated boot
- * memory block, it is never reported as leaks.
- *
- * Return:
- * Physical address of allocated memory block on success, %0 on failure.
- */
-phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+static phys_addr_t __init __memblock_alloc_range_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
 					phys_addr_t end, int nid,
-					bool exact_nid)
+					bool exact_nid, enum memblock_flags flags)
 {
-	enum memblock_flags flags = choose_memblock_flags();
 	phys_addr_t found;
 
 	/*
@@ -1633,6 +1631,41 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 	return found;
 }
 
+/**
+ * memblock_alloc_range_nid - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (phys address)
+ * @end: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: control the allocation fall back to other nodes
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
+ *
+ * If the specified node can not hold the requested memory and @exact_nid
+ * is false, the allocation falls back to any node in the system.
+ *
+ * For systems with memory mirroring, the allocation is attempted first
+ * from the regions with mirroring enabled and then retried from any
+ * memory region.
+ *
+ * In addition, function using kmemleak_alloc_phys for allocated boot
+ * memory block, it is never reported as leaks.
+ *
+ * Return:
+ * Physical address of allocated memory block on success, %0 on failure.
+ */
+phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+					phys_addr_t align, phys_addr_t start,
+					phys_addr_t end, int nid,
+					bool exact_nid)
+{
+	enum memblock_flags flags = choose_memblock_flags(false);
+
+	return __memblock_alloc_range_nid(size, align, start, end, nid, exact_nid, flags);
+}
+
 /**
  * memblock_phys_alloc_range - allocate a memory block inside specified range
  * @size: size of memory block to be allocated in bytes
@@ -1784,6 +1817,47 @@ void * __init memblock_alloc_try_nid_raw(
 				       false);
 }
 
+/**
+ * memblock_alloc_nid_user - allocate boot memory for use by userspace
+ * @size: size of the memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: control the allocation fall back to other nodes
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * If the specified node can not hold the requested memory and @exact_nid is
+ * false, the allocation falls back to any node in the system. The allocated
+ * memory has no restrictions on minimum or maximum address, and does not count
+ * towards %MEMBLOCK_RSRV_KERN.
+ *
+ * Return: virtual address of the allocated block on success, %NULL on failure.
+ */
+void * __init memblock_alloc_nid_user(phys_addr_t size, phys_addr_t align,
+				      int nid, bool exact_nid)
+{
+	enum memblock_flags flags = choose_memblock_flags(true);
+	phys_addr_t alloc;
+
+	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d %pS\n",
+		     __func__, (u64)size, (u64)align, nid, (void *)_RET_IP_);
+
+	alloc = __memblock_alloc_range_nid(size, align, 0, MEMBLOCK_ALLOC_ACCESSIBLE,
+					   nid, exact_nid, flags);
+	if (!alloc)
+		return NULL;
+
+	/* User memory should not be marked with RSRV_KERN. */
+	if (memblock_reserved_clear_kern(alloc, size)) {
+		memblock_phys_free(alloc, size);
+		return NULL;
+	}
+
+	return phys_to_virt(alloc);
+}
+
 /**
  * memblock_alloc_try_nid - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 11/20] kho: return virtual address of mem_map
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

There are currently 3 callers of kho_get_mem_map_phys(). Two of them,
kho_mem_retrieve() and kho_extend_scratch(), need the virtual address.
The third, kho_populate() doesn't care. Make things simpler by
directly returning the virtual address. Rename kho_get_mem_map_phys() to
kho_get_mem_map() to accurately reflect what it returns.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 kernel/liveupdate/kexec_handover.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index a006a883ee94..797ec285b698 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -610,10 +610,11 @@ static int __init kho_preserved_memory_reserve(unsigned long key, void *data)
 	return 0;
 }
 
-/* Returns physical address of the preserved memory map from FDT */
-static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
+/* Returns virtual address of the preserved memory map from FDT */
+static __init void *kho_get_mem_map(const void *fdt)
 {
 	const void *mem_ptr;
+	phys_addr_t mem_map_phys;
 	int len;
 
 	mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
@@ -622,7 +623,11 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
 		return 0;
 	}
 
-	return get_unaligned((const u64 *)mem_ptr);
+	mem_map_phys = get_unaligned((const u64 *)mem_ptr);
+	if (!mem_map_phys)
+		return NULL;
+
+	return phys_to_virt(mem_map_phys);
 }
 
 /*
@@ -917,15 +922,15 @@ void __init kho_extend_scratch(void)
 		.key = kho_ext_mark_scratch,
 	};
 	struct kho_radix_tree radix;
-	phys_addr_t prev_end = 0, mem_map_phys;
+	phys_addr_t prev_end = 0;
 	int err = 0;
 
 	if (!is_kho_boot())
 		return;
 
 	/* Make sure the KHO radix tree is initialized. */
-	mem_map_phys = kho_get_mem_map_phys(kho_get_fdt());
-	err = kho_radix_init_tree(&kho_in.radix_tree, phys_to_virt(mem_map_phys));
+	err = kho_radix_init_tree(&kho_in.radix_tree,
+				  kho_get_mem_map(kho_get_fdt()));
 	if (err)
 		goto print;
 
@@ -1609,11 +1614,9 @@ static int __init kho_mem_retrieve(const void *fdt)
 	const struct kho_radix_walk_cb cb = {
 		.key = kho_preserved_memory_reserve,
 	};
-	phys_addr_t mem_map_phys;
 	int err;
 
-	mem_map_phys = kho_get_mem_map_phys(fdt);
-	err = kho_radix_init_tree(&kho_in.radix_tree, phys_to_virt(mem_map_phys));
+	err = kho_radix_init_tree(&kho_in.radix_tree, kho_get_mem_map(fdt));
 	if (err)
 		return err;
 
@@ -1838,8 +1841,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
 {
 	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
 	struct kho_scratch *scratch = NULL;
-	phys_addr_t mem_map_phys;
-	void *fdt = NULL;
+	void *fdt = NULL, *mem_map;
 	bool populated = false;
 	int err;
 
@@ -1862,8 +1864,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
 		goto unmap_fdt;
 	}
 
-	mem_map_phys = kho_get_mem_map_phys(fdt);
-	if (!mem_map_phys)
+	mem_map = kho_get_mem_map(fdt);
+	if (!mem_map)
 		goto unmap_fdt;
 
 	scratch = early_memremap(scratch_phys, scratch_len);
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 10/20] kho: extended scratch
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

Motivation
==========

The scratch space is allocated by the first kernel in the KHO chain, and
is reused by all subsequent kernels. The size of the space is either set
via the commandline by the system administrator or by calculating the
amount of memory used by the kernel and adding a multiplier. In either
case, the scratch space is a heuristic and is liable to fill up and fail
allocation if a kernel uses more memory than expected.

In addition, gigantic huge pages (usually 1 GiB) are allocated via
memblock, and in a KHO boot that memory comes from the scratch space. In
hypervisors it is common to dedicate a major part of the system's memory
to gigantic hugepages for VM memory.

If this memory needs to come from scratch space, then scratch needs to
be greater than the memory needed for huge pages, which is impractical.
In addition, hugepages can be preserved memory. Allocating them from
scratch violates the assumption that scratch contains no preserved
memory.

Methodology
===========

Introduce extended scratch areas. These areas are discovered at boot by
walking the preserved memory radix tree and looking for free blocks of
memory. They are then marked as scratch to allow allocations from them. This
makes KHO more resilient to memory pressure and allows supporting huge
page preservation.

Since the preserved memory radix tree mixes both physical address and
order into a single key, and does not track table pages, it is difficult
to identify free areas from it directly. Walk the tree and digest it
down into another radix tree. The latter tracks blocks of
KHO_EXT_SHIFT (1 GiB as of now) granularity. Then walk the digested tree
and mark the areas between the present keys as scratch.

Performance
===========

The discovery algorithm traverses the preserved memory radix tree
exactly once. While it does use memory for the digested radix tree,
since the blocks are split by 1 GiB, a single bitmap with 4k pages can
track up to 32 TiB of memory. So there are likely to be very few radix
tree pages used in this tracking. For systems with all physical memory
below 32 TiB, this should result in a total of 6 pages being
used (KHO_TREE_MAX_DEPTH == 6).

An alternate way of achieving this would be to call kho_mem_retrieve()
earlier in boot and mark all the KHO preservations as reserved. But that
can blow up memblock.reserved with a bunch of 4K pages scattered
everywhere, which will reduce performance of subsequent allocations.
Since the free blocks are tracked in chunks of 1 GiB, this won't blow up
memblock.memory as much.

Practical evaluation
====================

The testing is done on an x86_64 QEMU VM running under KVM with 64G
memory and 12 CPUs. The machine pre-allocates 50 1G pages.

Since the performance scales with how busy the radix tree is, tests are
done with 2 preservation patterns: first with two 1M memfds, second with
two 1G memfds, both using 4k pages.

Test case 1 - 1M memfd
~~~~~~~~~~~~~~~~~~~~~~

This test case has two memfds with 1M memory each in 4k pages, plus
other preservations from LUO core and other KHO users.

This is what the radix tree stats look like:

    radix_nodes:       0x2f
    nr_preservations:  0x22d
    mem_preserved:     0xa2b000

    per order preservations:
    order  0:  0x215
    order  1:  0x9
    order  2:  0x1
    order  3:  0x2
    order  4:  0x5
    order  5:  0x1
    order  6:  0x2
    order  7:  0x2
    order  9:  0x1
    order 10:  0x1

and this is how long it takes to extend the scratch after KHO boot:

    kho_extend_scratch(): time taken: 88 us
    kho_extend_scratch(): total memory recovered: 0xf7ff7b000 (~62G)

Test case 2 - 1G memfd
~~~~~~~~~~~~~~~~~~~~~~

This test case has two memfds with 1G memory each in 4k pages, plus
other preservations from LUO core and other KHO users.

This is what the radix tree stats look like:

    radix_nodes:       0x45
    nr_preservations:  0x80832
    mem_preserved:     0x8102d000

    per order preservations:
    order  0:  0x80817
    order  1:  0x7
    order  2:  0x2
    order  3:  0x4
    order  4:  0x2
    order  5:  0x2
    order  6:  0x4
    order  7:  0x3
    order  8:  0x1
    order  9:  0x2

and this is how long it takes to extend the scratch after KHO boot:

    kho_extend_scratch(): time taken: 21769 us
    kho_extend_scratch(): total memory recovered: 0xe40000000 (57G)

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kexec_handover.h     |   1 +
 kernel/liveupdate/kexec_handover.c | 148 +++++++++++++++++++++++++----
 mm/mm_init.c                       |   1 +
 3 files changed, 133 insertions(+), 17 deletions(-)

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 8968c56d2d73..6ce46f36ed99 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -37,6 +37,7 @@ void kho_remove_subtree(void *blob);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size);
 
 void kho_memory_init(void);
+void kho_extend_scratch(void);
 
 void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys,
 		  u64 scratch_len);
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index b2d1572808eb..a006a883ee94 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -84,6 +84,23 @@ static struct kho_out kho_out = {
 	},
 };
 
+struct kho_in {
+	phys_addr_t fdt_phys;
+	phys_addr_t scratch_phys;
+	char previous_release[__NEW_UTS_LEN + 1];
+	u32 kexec_count;
+	struct kho_debugfs dbg;
+	struct kho_radix_tree radix_tree;
+};
+
+static struct kho_in kho_in = {
+};
+
+static const void *kho_get_fdt(void)
+{
+	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
 /**
  * kho_encode_radix_key - Encodes a physical address and order into a radix key.
  * @phys: The physical address of the page.
@@ -825,6 +842,120 @@ static void __init kho_reserve_scratch(void)
 	kho_enable = false;
 }
 
+#define KHO_EXT_SHIFT 30 /* 1 GiB */
+
+static int __init kho_ext_walk_key(unsigned long key, void *data)
+{
+	struct kho_radix_tree *tree = data;
+	phys_addr_t start, end;
+	unsigned int order;
+	int err;
+
+	start = kho_decode_radix_key(key, &order);
+	end = start + (1UL << (order + PAGE_SHIFT));
+
+	while (start < end) {
+		err = kho_radix_add_key(tree, start >> KHO_EXT_SHIFT);
+		if (err)
+			return err;
+
+		start += (1UL << KHO_EXT_SHIFT);
+	}
+
+	return 0;
+}
+
+static int __init kho_ext_walk_table(phys_addr_t phys, void *data)
+{
+	struct kho_radix_tree *tree = data;
+
+	return kho_radix_add_key(tree, phys >> KHO_EXT_SHIFT);
+}
+
+static int __init kho_ext_mark_scratch(unsigned long key, void *data)
+{
+	phys_addr_t *prev_end = data;
+	phys_addr_t start = key << KHO_EXT_SHIFT;
+	int err;
+
+	if (start > *prev_end) {
+		err = memblock_mark_kho_scratch_ext(*prev_end, start - *prev_end);
+		if (err)
+			return err;
+	}
+
+	*prev_end = start + (1UL << KHO_EXT_SHIFT);
+	return 0;
+}
+
+/**
+ * kho_extend_scratch - Extend the scratch regions
+ *
+ * The KHO radix tree mixes both physical address and order into a single key.
+ * This makes it hard to look for free ranges directly. This function first
+ * walks the radix tree and digests it down into another radix tree, whose keys
+ * identify blocks of KHO_EXT_SHIFT which contain preserved memory.
+ *
+ * Then it walks the digested radix tree and marks everything that doesn't have
+ * preserved memory as scratch.
+ *
+ * NOTE: This function allocates memory so it should be called when scratch has
+ * available space.
+ *
+ * NOTE: The pages of the KHO radix tree tables are not marked as preserved in
+ * the KHO tree. But they are expected to remain untouched until the tree is
+ * fully parsed. So this function also considers them to be "preserved memory"
+ * and marks their blocks as busy.
+ */
+void __init kho_extend_scratch(void)
+{
+	const struct kho_radix_walk_cb kho_cb = {
+		.key = kho_ext_walk_key,
+		.table = kho_ext_walk_table,
+	};
+	const struct kho_radix_walk_cb ext_cb = {
+		.key = kho_ext_mark_scratch,
+	};
+	struct kho_radix_tree radix;
+	phys_addr_t prev_end = 0, mem_map_phys;
+	int err = 0;
+
+	if (!is_kho_boot())
+		return;
+
+	/* Make sure the KHO radix tree is initialized. */
+	mem_map_phys = kho_get_mem_map_phys(kho_get_fdt());
+	err = kho_radix_init_tree(&kho_in.radix_tree, phys_to_virt(mem_map_phys));
+	if (err)
+		goto print;
+
+	err = kho_radix_init_tree(&radix, NULL);
+	if (err)
+		goto print;
+
+	/* Walk the KHO radix tree to find busy blocks. */
+	err = kho_radix_walk_tree(&kho_in.radix_tree, &kho_cb, &radix);
+	if (err)
+		goto out;
+
+	/* Walk the blocks and mark everything between keys as scratch. */
+	err = kho_radix_walk_tree(&radix, &ext_cb, &prev_end);
+	if (err)
+		goto out;
+
+	/* Mark everything from last busy block to end of DRAM. */
+	if (prev_end < memblock_end_of_DRAM())
+		err = memblock_mark_kho_scratch_ext(prev_end,
+						    memblock_end_of_DRAM() - prev_end);
+
+	/* fallthrough */
+out:
+	kho_radix_destroy_tree(&radix);
+print:
+	if (err)
+		pr_err("Failed to extend scratch: %pe\n", ERR_PTR(err));
+}
+
 /**
  * kho_add_subtree - record the physical address of a sub blob in KHO root tree.
  * @name: name of the sub tree.
@@ -1406,23 +1537,6 @@ void kho_restore_free(void *mem)
 }
 EXPORT_SYMBOL_GPL(kho_restore_free);
 
-struct kho_in {
-	phys_addr_t fdt_phys;
-	phys_addr_t scratch_phys;
-	char previous_release[__NEW_UTS_LEN + 1];
-	u32 kexec_count;
-	struct kho_debugfs dbg;
-	struct kho_radix_tree radix_tree;
-};
-
-static struct kho_in kho_in = {
-};
-
-static const void *kho_get_fdt(void)
-{
-	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
-}
-
 /**
  * is_kho_boot - check if current kernel was booted via KHO-enabled
  * kexec
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6de3a77eb9ae..bbca4cc9b912 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2702,6 +2702,7 @@ void __init __weak mem_init(void)
 
 void __init mm_core_init_early(void)
 {
+	kho_extend_scratch();
 	hugetlb_cma_reserve();
 	hugetlb_bootmem_alloc();
 
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 09/20] memblock: introduce MEMBLOCK_KHO_SCRATCH_EXT
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

In the upcoming commits, KHO will learn how to discover free blocks
of memory by walking the KHO radix tree. It will then mark those regions
as scratch to allow memory allocation in case scratch runs low.

To differentiate the extended scratch areas from the main scratch areas,
introduce MEMBLOCK_KHO_SCRATCH_EXT. Use it when choosing memblock flags
for allocations while kho_scratch_only is set. Teach should_skip_region()
to check for both flags before deciding if the region should be skipped.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/memblock.h | 10 ++++++++++
 mm/memblock.c            | 41 ++++++++++++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5afcd99aa8c1..4f535ca4947a 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -51,6 +51,9 @@ extern unsigned long long max_possible_pfn;
  * memory reservations yet, so we get scratch memory from the previous
  * kernel that we know is good to use. It is the only memory that
  * allocations may happen from in this phase.
+ * @MEMBLOCK_KHO_SCRATCH_EXT: same as MEMBLOCK_KHO_SCRATCH but was discovered at
+ * boot time by finding gaps in preserved memory instead of being passed from
+ * previous kernel. Does not get passed to the next kernel.
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
@@ -61,6 +64,7 @@ enum memblock_flags {
 	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* don't initialize struct pages */
 	MEMBLOCK_RSRV_KERN	= 0x20,	/* memory reserved for kernel use */
 	MEMBLOCK_KHO_SCRATCH	= 0x40,	/* scratch memory for kexec handover */
+	MEMBLOCK_KHO_SCRATCH_EXT= 0x80, /* extended scratch memory for KHO */
 };
 
 /**
@@ -157,6 +161,7 @@ int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
 int memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t size);
 int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size);
+int memblock_mark_kho_scratch_ext(phys_addr_t base, phys_addr_t size);
 int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size);
 
 void memblock_free(void *ptr, size_t size);
@@ -304,6 +309,11 @@ static inline bool memblock_is_kho_scratch(struct memblock_region *m)
 	return m->flags & MEMBLOCK_KHO_SCRATCH;
 }
 
+static inline bool memblock_is_kho_scratch_ext(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_KHO_SCRATCH_EXT;
+}
+
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
diff --git a/mm/memblock.c b/mm/memblock.c
index 6349c48154f4..6f76a6bb96d6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -182,7 +182,7 @@ static enum memblock_flags __init_memblock choose_memblock_flags(void)
 {
 	/* skip non-scratch memory for kho early boot allocations */
 	if (kho_scratch_only)
-		return MEMBLOCK_KHO_SCRATCH;
+		return MEMBLOCK_KHO_SCRATCH | MEMBLOCK_KHO_SCRATCH_EXT;
 
 	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
 }
@@ -1180,8 +1180,9 @@ int __init_memblock memblock_reserved_mark_kern(phys_addr_t base, phys_addr_t si
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
- * for allocations during early boot with kexec handover.
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH or
+ * %MEMBLOCK_KHO_SCRATCH_EXT will be considered for allocations during early
+ * boot with kexec handover.
  *
  * Return: 0 on success, -errno on failure.
  */
@@ -1205,6 +1206,23 @@ __init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
 				    MEMBLOCK_KHO_SCRATCH);
 }
 
+/**
+ * memblock_mark_kho_scratch_ext - Mark a memory region as MEMBLOCK_KHO_SCRATCH_EXT.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH or
+ * %MEMBLOCK_KHO_SCRATCH_EXT will be considered for allocations during early
+ * boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch_ext(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(&memblock.memory, base, size, 1,
+				    MEMBLOCK_KHO_SCRATCH_EXT);
+}
+
 static bool should_skip_region(struct memblock_type *type,
 			       struct memblock_region *m,
 			       int nid, int flags)
@@ -1238,10 +1256,20 @@ static bool should_skip_region(struct memblock_type *type,
 
 	/*
 	 * In early alloc during kexec handover, we can only consider
-	 * MEMBLOCK_KHO_SCRATCH regions for the allocations
+	 * MEMBLOCK_KHO_SCRATCH or MEMBLOCK_KHO_SCRATCH_EXT regions for the
+	 * allocations.
 	 */
-	if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
-		return true;
+	if (flags & (MEMBLOCK_KHO_SCRATCH | MEMBLOCK_KHO_SCRATCH_EXT)) {
+		bool skip = true;
+
+		if ((flags & MEMBLOCK_KHO_SCRATCH) && memblock_is_kho_scratch(m))
+			skip = false;
+
+		if ((flags & MEMBLOCK_KHO_SCRATCH_EXT) && memblock_is_kho_scratch_ext(m))
+			skip = false;
+
+		return skip;
+	}
 
 	return false;
 }
@@ -2801,6 +2829,7 @@ static const char * const flagname[] = {
 	[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
 	[ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
 	[ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
+	[ilog2(MEMBLOCK_KHO_SCRATCH_EXT)] = "KHO_SCRATCH_EXT",
 };
 
 static int memblock_debug_show(struct seq_file *m, void *private)
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 08/20] kho: add kho_radix_init_tree()
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

Move the initialization logic of the radix tree into
kho_radix_init_tree() instead of having users open-code it. This makes
the boundaries cleaner and reduces code duplication when a new user of
the radix tree is added in a future commit.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  7 ++++++
 kernel/liveupdate/kexec_handover.c | 37 ++++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 617395a6647a..c0840ecb230c 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -54,6 +54,7 @@ int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
 void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
 			const struct kho_radix_walk_cb *cb, void *data);
+int kho_radix_init_tree(struct kho_radix_tree *tree, struct kho_radix_node *root);
 void kho_radix_destroy_tree(struct kho_radix_tree *tree);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
@@ -72,6 +73,12 @@ static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_radix_init_tree(struct kho_radix_tree *tree,
+				      struct kho_radix_node *root)
+{
+	return 0;
+}
+
 static inline void kho_radix_destroy_tree(struct kho_radix_tree *tree) { }
 
 #endif /* #ifdef CONFIG_KEXEC_HANDOVER */
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 3f3ea71baa1a..b2d1572808eb 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -305,6 +305,34 @@ static void __kho_radix_destroy_tree(struct kho_radix_node *root,
 	kho_radix_free_node(root);
 }
 
+/**
+ * kho_radix_init_tree - initialize the radix tree.
+ * @tree:   the tree to initialize.
+ * @root:   root table of the radix tree.
+ *
+ * Initialize the radix tree with the given root node. If root is %NULL, an
+ * empty root table is allocated. If root is not %NULL, it is the caller's
+ * responsibility to make sure the root is valid and in the correct format.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int kho_radix_init_tree(struct kho_radix_tree *tree, struct kho_radix_node *root)
+{
+	/* Already initialized. */
+	if (tree->root)
+		return 0;
+
+	if (!root)
+		root = kho_radix_alloc_node();
+	if (!root)
+		return -ENOMEM;
+
+	tree->root = root;
+	mutex_init(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kho_radix_init_tree);
+
 /**
  * kho_radix_destroy_tree - Destroy the radix tree
  * @tree: The radix tree to destroy
@@ -1467,9 +1495,14 @@ static int __init kho_mem_retrieve(const void *fdt)
 	const struct kho_radix_walk_cb cb = {
 		.key = kho_preserved_memory_reserve,
 	};
+	phys_addr_t mem_map_phys;
+	int err;
+
+	mem_map_phys = kho_get_mem_map_phys(fdt);
+	err = kho_radix_init_tree(&kho_in.radix_tree, phys_to_virt(mem_map_phys));
+	if (err)
+		return err;
 
-	kho_in.radix_tree.root = phys_to_virt(kho_get_mem_map_phys(fdt));
-	mutex_init(&kho_in.radix_tree.lock);
 	return kho_radix_walk_tree(&kho_in.radix_tree, &cb, NULL);
 }
 
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 07/20] kho: allow destroying KHO radix tree
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

Add kho_radix_destroy_tree() which allows destroying the radix tree and
freeing all its pages.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  3 +++
 kernel/liveupdate/kexec_handover.c | 34 ++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 6c0f7d82716b..617395a6647a 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -54,6 +54,7 @@ int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
 void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
 			const struct kho_radix_walk_cb *cb, void *data);
+void kho_radix_destroy_tree(struct kho_radix_tree *tree);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
@@ -71,6 +72,8 @@ static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
 	return -EOPNOTSUPP;
 }
 
+static inline void kho_radix_destroy_tree(struct kho_radix_tree *tree) { }
+
 #endif /* #ifdef CONFIG_KEXEC_HANDOVER */
 
 #endif	/* _LINUX_KHO_RADIX_TREE_H */
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 5c201e605b96..3f3ea71baa1a 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -286,6 +286,40 @@ void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 }
 EXPORT_SYMBOL_GPL(kho_radix_del_key);
 
+static void __kho_radix_destroy_tree(struct kho_radix_node *root,
+				     unsigned int level)
+{
+	unsigned long i;
+
+	if (level == 0) {
+		kho_radix_free_node(root);
+		return;
+	}
+
+	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
+		if (root->table[i])
+			__kho_radix_destroy_tree(phys_to_virt(root->table[i]),
+						 level - 1);
+	}
+
+	kho_radix_free_node(root);
+}
+
+/**
+ * kho_radix_destroy_tree - Destroy the radix tree
+ * @tree: The radix tree to destroy
+ *
+ * Walk @tree and free all its nodes.
+ */
+void kho_radix_destroy_tree(struct kho_radix_tree *tree)
+{
+	if (!tree->root)
+		return;
+
+	__kho_radix_destroy_tree(tree->root, KHO_TREE_MAX_DEPTH - 1);
+}
+EXPORT_SYMBOL_GPL(kho_radix_destroy_tree);
+
 static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
 			       const struct kho_radix_walk_cb *cb, void *data)
 {
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 06/20] kho: allow early-boot usage of the KHO radix tree
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

The KHO radix tree allocates memory for table pages from the buddy
allocator using get_zeroed_page(). This is not available in early boot
when memblock is still active.

Using the radix tree in early boot is useful for KHO to track metadata
about its memory. One such example is for tracking free blocks for
memory allocation when scratch runs out of space. This feature will be
added in the following commits.

Add kho_radix_{alloc,free}_node() which allocate and free the table
pages. They use slab_is_available() to decide which allocator to use.
While slab_is_available() indicates availability of the slab allocator,
it gets initialized right before buddy so it serves the same practical
purpose.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 kernel/liveupdate/kexec_handover.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index f6de6bf63226..5c201e605b96 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -143,6 +143,26 @@ static unsigned long kho_radix_get_table_index(unsigned long key,
 	return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2);
 }
 
+static void __ref *kho_radix_alloc_node(void)
+{
+	struct kho_radix_node *node;
+
+	if (slab_is_available())
+		node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
+	else
+		node = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+
+	return node;
+}
+
+static void __ref kho_radix_free_node(struct kho_radix_node *node)
+{
+	if (slab_is_available())
+		free_page((unsigned long)node);
+	else
+		memblock_free(node, PAGE_SIZE);
+}
+
 /**
  * kho_radix_add_key - Add a key to the radix tree.
  * @tree: The KHO radix tree.
@@ -183,7 +203,7 @@ int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
 		}
 
 		/* Next node is empty, create a new node for it */
-		new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
+		new_node = kho_radix_alloc_node();
 		if (!new_node) {
 			err = -ENOMEM;
 			goto err_free_nodes;
@@ -214,7 +234,7 @@ int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
 err_free_nodes:
 	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
 		if (intermediate_nodes[i])
-			free_page((unsigned long)intermediate_nodes[i]);
+			kho_radix_free_node(intermediate_nodes[i]);
 	}
 	if (anchor_node)
 		anchor_node->table[anchor_idx] = 0;
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 05/20] kho: add data argument to radix walk callback
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

Add an opaque data pointer argument to the callbacks in struct
kho_radix_walk_cb and to kho_radix_walk_tree(), which passes it through
to them. This can be used by callers to pass extra information to the
callbacks.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  8 ++++----
 kernel/liveupdate/kexec_handover.c | 24 +++++++++++++-----------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index fe7151d89361..6c0f7d82716b 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -44,8 +44,8 @@ struct kho_radix_tree {
  * return value is directly returned to the caller.
  */
 struct kho_radix_walk_cb {
-	int (*key)(unsigned long key);
-	int (*table)(phys_addr_t phys);
+	int (*key)(unsigned long key, void *data);
+	int (*table)(phys_addr_t phys, void *data);
 };
 
 #ifdef CONFIG_KEXEC_HANDOVER
@@ -53,7 +53,7 @@ struct kho_radix_walk_cb {
 int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
 void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
-			const struct kho_radix_walk_cb *cb);
+			const struct kho_radix_walk_cb *cb, void *data);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
@@ -66,7 +66,7 @@ static inline void kho_radix_del_key(struct kho_radix_tree *tree,
 				     unsigned long key) { }
 
 static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
-				      const struct kho_radix_walk_cb *cb)
+				      const struct kho_radix_walk_cb *cb, void *data)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 0f8d058f1a27..f6de6bf63226 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -267,14 +267,14 @@ void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 EXPORT_SYMBOL_GPL(kho_radix_del_key);
 
 static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
-			       const struct kho_radix_walk_cb *cb)
+			       const struct kho_radix_walk_cb *cb, void *data)
 {
 	unsigned long *bitmap = (unsigned long *)leaf;
 	unsigned int i;
 	int err;
 
 	if (cb->table) {
-		err = cb->table(virt_to_phys(leaf));
+		err = cb->table(virt_to_phys(leaf), data);
 		if (err)
 			return err;
 	}
@@ -283,7 +283,7 @@ static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
 		return 0;
 
 	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
-		err = cb->key(key | i);
+		err = cb->key(key | i, data);
 		if (err)
 			return err;
 	}
@@ -293,7 +293,7 @@ static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
 
 static int __kho_radix_walk_tree(struct kho_radix_node *root,
 				 unsigned int level, unsigned long start,
-				 const struct kho_radix_walk_cb *cb)
+				 const struct kho_radix_walk_cb *cb, void *data)
 {
 	struct kho_radix_node *node;
 	struct kho_radix_leaf *leaf;
@@ -302,7 +302,7 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
 	int err;
 
 	if (cb->table) {
-		err = cb->table(virt_to_phys(root));
+		err = cb->table(virt_to_phys(root), data);
 		if (err)
 			return err;
 	}
@@ -323,10 +323,10 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
 			 * node is pointing to the level 0 bitmap.
 			 */
 			leaf = (struct kho_radix_leaf *)node;
-			err = kho_radix_walk_leaf(leaf, key, cb);
+			err = kho_radix_walk_leaf(leaf, key, cb, data);
 		} else {
 			err  = __kho_radix_walk_tree(node, level - 1,
-						     key, cb);
+						     key, cb, data);
 		}
 
 		if (err)
@@ -340,6 +340,7 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
  * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each key.
  * @tree: A pointer to the KHO radix tree to walk.
  * @cb:   Set of callbacks to be invoked during the tree walk.
+ * @data: Opaque data pointer passed to each callback in @cb.
  *
  * This function walks the radix tree, searching from the top level down to the
  * lowest level (level 0), invoking the appropriate callbacks.
@@ -348,14 +349,15 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
  *         value from the callback that stopped the walk.
  */
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
-			const struct kho_radix_walk_cb *cb)
+			const struct kho_radix_walk_cb *cb, void *data)
 {
 	if (WARN_ON_ONCE(!tree->root))
 		return -EINVAL;
 
 	guard(mutex)(&tree->lock);
 
-	return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb);
+	return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb,
+				     data);
 }
 EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
 
@@ -486,7 +488,7 @@ static struct page *__init kho_get_preserved_page(phys_addr_t phys,
 	return pfn_to_page(pfn);
 }
 
-static int __init kho_preserved_memory_reserve(unsigned long key)
+static int __init kho_preserved_memory_reserve(unsigned long key, void *data)
 {
 	union kho_page_info info;
 	struct page *page;
@@ -1414,7 +1416,7 @@ static int __init kho_mem_retrieve(const void *fdt)
 
 	kho_in.radix_tree.root = phys_to_virt(kho_get_mem_map_phys(fdt));
 	mutex_init(&kho_in.radix_tree.lock);
-	return kho_radix_walk_tree(&kho_in.radix_tree, &cb);
+	return kho_radix_walk_tree(&kho_in.radix_tree, &cb, NULL);
 }
 
 static __init int kho_out_fdt_setup(void)
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 04/20] kho: add callback for table pages
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

The KHO memory preservation radix tree does not mark the table pages
themselves as scratch. This is done to avoid a circular dependency where
preserving a page can lead to allocating other preserved pages. This
means any walker looking for free ranges of memory outside of scratch
areas will ignore the table pages.

Add a table callback that is invoked for each table page. The callback
is given the physical address of the table page.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  3 +++
 kernel/liveupdate/kexec_handover.c | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 030da6399d28..fe7151d89361 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -37,12 +37,15 @@ struct kho_radix_tree {
 /**
  * struct kho_radix_walk_cb - Callbacks for KHO radix tree walk.
  * @key:      Called on each present key in the radix tree.
+ * @table:    Called on each table of the radix tree itself. Receives the
+ *            physical address of the page containing the table.
  *
  * For each callback, a return value of 0 continues the walk and a non-zero
  * return value is directly returned to the caller.
  */
 struct kho_radix_walk_cb {
 	int (*key)(unsigned long key);
+	int (*table)(phys_addr_t phys);
 };
 
 #ifdef CONFIG_KEXEC_HANDOVER
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index b22b3cec251e..0f8d058f1a27 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -273,6 +273,12 @@ static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
 	unsigned int i;
 	int err;
 
+	if (cb->table) {
+		err = cb->table(virt_to_phys(leaf));
+		if (err)
+			return err;
+	}
+
 	if (!cb->key)
 		return 0;
 
@@ -295,6 +301,12 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
 	unsigned int shift;
 	int err;
 
+	if (cb->table) {
+		err = cb->table(virt_to_phys(root));
+		if (err)
+			return err;
+	}
+
 	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
 		if (!root->table[i])
 			continue;
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 03/20] kho: add a struct for radix callbacks
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

A future commit will add more callbacks for the KHO radix tree. Add a
struct for collecting the callbacks.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     | 15 ++++++++++++---
 kernel/liveupdate/kexec_handover.c | 29 ++++++++++++++++-------------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index f368f3b9f923..030da6399d28 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -34,14 +34,23 @@ struct kho_radix_tree {
 	struct mutex lock; /* protects the tree's structure and root pointer */
 };
 
-typedef int (*kho_radix_tree_walk_callback_t)(unsigned long key);
+/**
+ * struct kho_radix_walk_cb - Callbacks for KHO radix tree walk.
+ * @key:      Called on each present key in the radix tree.
+ *
+ * For each callback, a return value of 0 continues the walk and a non-zero
+ * return value is directly returned to the caller.
+ */
+struct kho_radix_walk_cb {
+	int (*key)(unsigned long key);
+};
 
 #ifdef CONFIG_KEXEC_HANDOVER
 
 int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
 void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
-			kho_radix_tree_walk_callback_t cb);
+			const struct kho_radix_walk_cb *cb);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
@@ -54,7 +63,7 @@ static inline void kho_radix_del_key(struct kho_radix_tree *tree,
 				     unsigned long key) { }
 
 static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
-				      kho_radix_tree_walk_callback_t cb)
+				      const struct kho_radix_walk_cb *cb)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index afc986845839..b22b3cec251e 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -266,16 +266,18 @@ void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 }
 EXPORT_SYMBOL_GPL(kho_radix_del_key);
 
-static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
-			       unsigned long key,
-			       kho_radix_tree_walk_callback_t cb)
+static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, unsigned long key,
+			       const struct kho_radix_walk_cb *cb)
 {
 	unsigned long *bitmap = (unsigned long *)leaf;
 	unsigned int i;
 	int err;
 
+	if (!cb->key)
+		return 0;
+
 	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
-		err = cb(key | i);
+		err = cb->key(key | i);
 		if (err)
 			return err;
 	}
@@ -285,7 +287,7 @@ static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
 
 static int __kho_radix_walk_tree(struct kho_radix_node *root,
 				 unsigned int level, unsigned long start,
-				 kho_radix_tree_walk_callback_t cb)
+				 const struct kho_radix_walk_cb *cb)
 {
 	struct kho_radix_node *node;
 	struct kho_radix_leaf *leaf;
@@ -325,18 +327,16 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
 /**
  * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each key.
  * @tree: A pointer to the KHO radix tree to walk.
- * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
- *      invoked for each key in the tree.
+ * @cb:   Set of callbacks to be invoked during the tree walk.
  *
- * This function walks the radix tree, searching from the specified top level
- * down to the lowest level (level 0). For each key found, it invokes the
- * provided callback.
+ * This function walks the radix tree, searching from the top level down to the
+ * lowest level (level 0), invoking the appropriate callbacks.
  *
  * Return: 0 if the walk completed the specified tree, or the non-zero return
  *         value from the callback that stopped the walk.
  */
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
-			kho_radix_tree_walk_callback_t cb)
+			const struct kho_radix_walk_cb *cb)
 {
 	if (WARN_ON_ONCE(!tree->root))
 		return -EINVAL;
@@ -1396,10 +1396,13 @@ EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
 
 static int __init kho_mem_retrieve(const void *fdt)
 {
+	const struct kho_radix_walk_cb cb = {
+		.key = kho_preserved_memory_reserve,
+	};
+
 	kho_in.radix_tree.root = phys_to_virt(kho_get_mem_map_phys(fdt));
 	mutex_init(&kho_in.radix_tree.lock);
-	return kho_radix_walk_tree(&kho_in.radix_tree,
-				   kho_preserved_memory_reserve);
+	return kho_radix_walk_tree(&kho_in.radix_tree, &cb);
 }
 
 static __init int kho_out_fdt_setup(void)
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 02/20] kho: store incoming radix tree in kho_in
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

This allows other functions to also use the radix tree. While at it,
also use kho_get_mem_map_phys() instead of duplicating the code to get
the radix tree root from the FDT.

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 kernel/liveupdate/kexec_handover.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 05a6eb56e176..afc986845839 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -1316,6 +1316,7 @@ struct kho_in {
 	char previous_release[__NEW_UTS_LEN + 1];
 	u32 kexec_count;
 	struct kho_debugfs dbg;
+	struct kho_radix_tree radix_tree;
 };
 
 static struct kho_in kho_in = {
@@ -1395,24 +1396,10 @@ EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
 
 static int __init kho_mem_retrieve(const void *fdt)
 {
-	struct kho_radix_tree tree;
-	const phys_addr_t *mem;
-	int len;
-
-	/* Retrieve the KHO radix tree from passed-in FDT. */
-	mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
-
-	if (!mem || len != sizeof(*mem)) {
-		pr_err("failed to get preserved KHO memory tree\n");
-		return -ENOENT;
-	}
-
-	if (!*mem)
-		return -EINVAL;
-
-	tree.root = phys_to_virt(*mem);
-	mutex_init(&tree.lock);
-	return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve);
+	kho_in.radix_tree.root = phys_to_virt(kho_get_mem_map_phys(fdt));
+	mutex_init(&kho_in.radix_tree.lock);
+	return kho_radix_walk_tree(&kho_in.radix_tree,
+				   kho_preserved_memory_reserve);
 }
 
 static __init int kho_out_fdt_setup(void)
@@ -1619,8 +1606,10 @@ void __init kho_memory_init(void)
 	if (kho_in.scratch_phys) {
 		kho_scratch = phys_to_virt(kho_in.scratch_phys);
 
-		if (kho_mem_retrieve(kho_get_fdt()))
+		if (kho_mem_retrieve(kho_get_fdt())) {
 			kho_in.fdt_phys = 0;
+			kho_in.radix_tree.root = NULL;
+		}
 	} else {
 		kho_reserve_scratch();
 	}
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 01/20] kho: generalize radix tree APIs
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser
In-Reply-To: <20260528004204.1484584-1-jloeser@linux.microsoft.com>

From: "Pratyush Yadav (Google)" <pratyush@kernel.org>

The KHO radix tree is a data structure that can track the presence or
absence of an arbitrary key, with nothing inherently tied to KHO memory
preservation tracking. This was one of the design goals of the radix
tree. This was done to enable it to be re-used by other users of KHO.

Despite that, the radix tree APIs are very closely tied to KHO memory
preservation tracking. Adding a key is done by kho_radix_add_page(),
which encodes it as a page tracking operation and takes in PFN and
order. kho_radix_del_page() does the same. These functions encode the
key internally that goes into the radix tree. kho_radix_walk_tree() does
the same by baking the PFN and order into the callback arguments.

Generalize the APIs by taking the key directly and doing the encoding at
the callers. Rename the functions to kho_radix_add_key() and
kho_radix_del_key(). In practice, this removes a line each from the
functions and moves the encoding function call to the callers.
Similarly, update kho_radix_tree_walk_callback_t to take the key
directly.

To keep the naming convention clearer, rename
kho_radix_{encode,decode}_key() to kho_{encode,decode}_radix_key().

Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jork Loeser <jloeser@linux.microsoft.com>
---
 include/linux/kho_radix_tree.h     |  18 ++---
 kernel/liveupdate/kexec_handover.c | 119 ++++++++++++++---------------
 2 files changed, 63 insertions(+), 74 deletions(-)

diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 84e918b96e53..f368f3b9f923 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -34,30 +34,24 @@ struct kho_radix_tree {
 	struct mutex lock; /* protects the tree's structure and root pointer */
 };
 
-typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys,
-					      unsigned int order);
+typedef int (*kho_radix_tree_walk_callback_t)(unsigned long key);
 
 #ifdef CONFIG_KEXEC_HANDOVER
 
-int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn,
-		       unsigned int order);
-
-void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
-			unsigned int order);
-
+int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key);
+void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key);
 int kho_radix_walk_tree(struct kho_radix_tree *tree,
 			kho_radix_tree_walk_callback_t cb);
 
 #else  /* #ifdef CONFIG_KEXEC_HANDOVER */
 
-static inline int kho_radix_add_page(struct kho_radix_tree *tree, long pfn,
-				     unsigned int order)
+static inline int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline void kho_radix_del_page(struct kho_radix_tree *tree,
-				      unsigned long pfn, unsigned int order) { }
+static inline void kho_radix_del_key(struct kho_radix_tree *tree,
+				     unsigned long key) { }
 
 static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
 				      kho_radix_tree_walk_callback_t cb)
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 4834a809985a..05a6eb56e176 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -85,7 +85,7 @@ static struct kho_out kho_out = {
 };
 
 /**
- * kho_radix_encode_key - Encodes a physical address and order into a radix key.
+ * kho_encode_radix_key - Encodes a physical address and order into a radix key.
  * @phys: The physical address of the page.
  * @order: The order of the page.
  *
@@ -95,7 +95,7 @@ static struct kho_out kho_out = {
  *
  * Return: The encoded unsigned long radix key.
  */
-static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
+static unsigned long kho_encode_radix_key(phys_addr_t phys, unsigned int order)
 {
 	/* Order bits part */
 	unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order);
@@ -106,17 +106,17 @@ static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
 }
 
 /**
- * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
+ * kho_decode_radix_key - Decodes a radix key back into a physical address and order.
  * @key: The unsigned long key to decode.
  * @order: An output parameter, a pointer to an unsigned int where the decoded
  *         page order will be stored.
  *
- * This function reverses the encoding performed by kho_radix_encode_key(),
+ * This function reverses the encoding performed by kho_encode_radix_key(),
  * extracting the original physical address and page order from a given key.
  *
  * Return: The decoded physical address.
  */
-static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order)
+static phys_addr_t kho_decode_radix_key(unsigned long key, unsigned int *order)
 {
 	unsigned int order_bit = fls64(key);
 	phys_addr_t phys;
@@ -144,24 +144,21 @@ static unsigned long kho_radix_get_table_index(unsigned long key,
 }
 
 /**
- * kho_radix_add_page - Marks a page as preserved in the radix tree.
+ * kho_radix_add_key - Add a key to the radix tree.
  * @tree: The KHO radix tree.
- * @pfn: The page frame number of the page to preserve.
- * @order: The order of the page.
+ * @key: The key to add.
  *
- * This function traverses the radix tree based on the key derived from @pfn
- * and @order. It sets the corresponding bit in the leaf bitmap to mark the
- * page for preservation. If intermediate nodes do not exist along the path,
- * they are allocated and added to the tree.
+ * This function traverses the radix tree based on the key provided. It sets the
+ * corresponding bit in the leaf bitmap to mark the key as present. If
+ * intermediate nodes do not exist along the path, they are allocated and added
+ * to the tree.
  *
  * Return: 0 on success, or a negative error code on failure.
  */
-int kho_radix_add_page(struct kho_radix_tree *tree,
-		       unsigned long pfn, unsigned int order)
+int kho_radix_add_key(struct kho_radix_tree *tree, unsigned long key)
 {
 	/* Newly allocated nodes for error cleanup */
 	struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
-	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
 	struct kho_radix_node *anchor_node = NULL;
 	struct kho_radix_node *node = tree->root;
 	struct kho_radix_node *new_node;
@@ -224,22 +221,19 @@ int kho_radix_add_page(struct kho_radix_tree *tree,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(kho_radix_add_page);
+EXPORT_SYMBOL_GPL(kho_radix_add_key);
 
 /**
- * kho_radix_del_page - Removes a page's preservation status from the radix tree.
+ * kho_radix_del_key - Removes the key from the radix tree.
  * @tree: The KHO radix tree.
- * @pfn: The page frame number of the page to unpreserve.
- * @order: The order of the page.
+ * @key: The key to remove.
  *
  * This function traverses the radix tree and clears the bit corresponding to
- * the page, effectively removing its "preserved" status. It does not free
- * the tree's intermediate nodes, even if they become empty.
+ * the key, effectively removing it from the tree. It does not free the tree's
+ * intermediate nodes, even if they become empty.
  */
-void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
-			unsigned int order)
+void kho_radix_del_key(struct kho_radix_tree *tree, unsigned long key)
 {
-	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
 	struct kho_radix_node *node = tree->root;
 	struct kho_radix_leaf *leaf;
 	unsigned int i, idx;
@@ -270,21 +264,18 @@ void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
 	idx = kho_radix_get_bitmap_index(key);
 	__clear_bit(idx, leaf->bitmap);
 }
-EXPORT_SYMBOL_GPL(kho_radix_del_page);
+EXPORT_SYMBOL_GPL(kho_radix_del_key);
 
 static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
 			       unsigned long key,
 			       kho_radix_tree_walk_callback_t cb)
 {
 	unsigned long *bitmap = (unsigned long *)leaf;
-	unsigned int order;
-	phys_addr_t phys;
 	unsigned int i;
 	int err;
 
 	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
-		phys = kho_radix_decode_key(key | i, &order);
-		err = cb(phys, order);
+		err = cb(key | i);
 		if (err)
 			return err;
 	}
@@ -332,15 +323,14 @@ static int __kho_radix_walk_tree(struct kho_radix_node *root,
 }
 
 /**
- * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
+ * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each key.
  * @tree: A pointer to the KHO radix tree to walk.
  * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
- *      invoked for each preserved page found in the tree. The callback receives
- *      the physical address and order of the preserved page.
+ *      invoked for each key in the tree.
  *
  * This function walks the radix tree, searching from the specified top level
- * down to the lowest level (level 0). For each preserved page found, it invokes
- * the provided callback, passing the page's physical address and order.
+ * down to the lowest level (level 0). For each key found, it invokes the
+ * provided callback.
  *
  * Return: 0 if the walk completed the specified tree, or the non-zero return
  *         value from the callback that stopped the walk.
@@ -484,13 +474,16 @@ static struct page *__init kho_get_preserved_page(phys_addr_t phys,
 	return pfn_to_page(pfn);
 }
 
-static int __init kho_preserved_memory_reserve(phys_addr_t phys,
-					       unsigned int order)
+static int __init kho_preserved_memory_reserve(unsigned long key)
 {
 	union kho_page_info info;
 	struct page *page;
+	unsigned int order;
+	phys_addr_t phys;
 	u64 sz;
 
+	phys = kho_decode_radix_key(key, &order);
+
 	sz = 1 << (order + PAGE_SHIFT);
 	page = kho_get_preserved_page(phys, order);
 
@@ -618,30 +611,20 @@ early_param("kho_scratch", kho_parse_scratch_size);
 
 static void __init scratch_size_update(void)
 {
-	/*
-	 * If fixed sizes are not provided via command line, calculate them
-	 * now.
-	 */
-	if (scratch_scale) {
-		phys_addr_t size;
+	phys_addr_t size;
 
-		size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
-						   NUMA_NO_NODE);
-		size = size * scratch_scale / 100;
-		scratch_size_lowmem = size;
+	if (!scratch_scale)
+		return;
 
-		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
-						   NUMA_NO_NODE);
-		size = size * scratch_scale / 100 - scratch_size_lowmem;
-		scratch_size_global = size;
-	}
+	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+					   NUMA_NO_NODE);
+	size = size * scratch_scale / 100;
+	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
 
-	/*
-	 * Scratch areas are released as MIGRATE_CMA. Round them up to the right
-	 * size.
-	 */
-	scratch_size_lowmem = round_up(scratch_size_lowmem, CMA_MIN_ALIGNMENT_BYTES);
-	scratch_size_global = round_up(scratch_size_global, CMA_MIN_ALIGNMENT_BYTES);
+	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+					   NUMA_NO_NODE);
+	size = size * scratch_scale / 100 - scratch_size_lowmem;
+	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
 }
 
 static phys_addr_t __init scratch_size_node(int nid)
@@ -859,7 +842,8 @@ int kho_preserve_folio(struct folio *folio)
 	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
 		return -EINVAL;
 
-	return kho_radix_add_page(tree, pfn, order);
+	return kho_radix_add_key(tree, kho_encode_radix_key(PFN_PHYS(pfn),
+							    order));
 }
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
@@ -877,7 +861,7 @@ void kho_unpreserve_folio(struct folio *folio)
 	const unsigned long pfn = folio_pfn(folio);
 	const unsigned int order = folio_order(folio);
 
-	kho_radix_del_page(tree, pfn, order);
+	kho_radix_del_key(tree, kho_encode_radix_key(PFN_PHYS(pfn), order));
 }
 EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
 
@@ -906,7 +890,8 @@ static void __kho_unpreserve(struct kho_radix_tree *tree,
 	while (pfn < end_pfn) {
 		order = __kho_preserve_pages_order(pfn, end_pfn);
 
-		kho_radix_del_page(tree, pfn, order);
+		kho_radix_del_key(tree, kho_encode_radix_key(PFN_PHYS(pfn),
+							     order));
 
 		pfn += 1 << order;
 	}
@@ -937,9 +922,19 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages)
 	}
 
 	while (pfn < end_pfn) {
-		unsigned int order = __kho_preserve_pages_order(pfn, end_pfn);
+		unsigned int order =
+			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+		/*
+		 * Make sure all the pages in a single preservation are in the
+		 * same NUMA node. The restore machinery can not cope with a
+		 * preservation spanning multiple NUMA nodes.
+		 */
+		while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1))
+			order--;
 
-		err = kho_radix_add_page(tree, pfn, order);
+		err = kho_radix_add_key(tree, kho_encode_radix_key(PFN_PHYS(pfn),
+								   order));
 		if (err) {
 			failed_pfn = pfn;
 			break;
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 00/20] mshv: enable kexec with Hyper-V donated pages and partitions
From: Jork Loeser @ 2026-05-28  0:41 UTC (permalink / raw)
  To: linux-hyperv, linux-mm, kexec
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Mike Rapoport, Pasha Tatashin, Pratyush Yadav, Alexander Graf,
	Jason Miu, Andrew Morton, David Hildenbrand, Muchun Song,
	Oscar Salvador, Baoquan He, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Kees Cook, Ran Xiaokai, Justinien Bouron,
	Sourabh Jain, Pingfan Liu, Rafael J. Wysocki, Mario Limonciello,
	linux-arm-kernel, x86, linux-kernel, Michael Kelley, Jork Loeser

When Linux runs as an L1 Virtual Host (L1VH) under Hyper-V, the MSHV
root partition driver deposits pages to the hypervisor and creates
partitions for guest VMs. Prior patches enabled kexec for L1VH, but
only when no partitions had been created and no memory had been donated.

This series lifts that limitation. It uses KHO (Kexec Handover) to:

 - Track all pages deposited to the hypervisor in a KHO radix tree
   and preserve them across kexec so the new kernel knows which pages
   are owned by the hypervisor.

 - Freeze running partitions before kexec, record their IDs in the
   KHO FDT, and vacuum (tear down + reclaim memory) stale partitions
   after kexec.

 - In case of a crash, exclude hypervisor-owned pages from crash
   dump collection by passing the radix tree root PA via Hyper-V
   crash MSR P2 to the crash kernel.

Dependency on Pratyush's KHO series
===================================

Patches 1-12 are cherry-picked from Pratyush Yadav's v1 series
"kho: make boot time huge page allocation work nicely with KHO" [1],
which is still under discussion. This series uses functionality from
those patches -- specifically the meta-data page enumeration via table
callbacks and the restructured radix tree API. It also extends the
KHO radix tree with:

 - A freeze mechanism to lock the tree before serializing for kexec
   (patch 13).

 - A crash-kernel-safe variant that memremaps radix nodes for use
   outside the direct map (patch 14).

Patch overview
==============

Patches 1-12:  KHO radix tree and memblock changes (from [1])
Patch 13:      Radix tree freeze and del_key() error reporting
Patch 14:      Crash-kernel-safe radix tree presence check
Patch 15:      Page tracker using KHO radix tree for deposited pages
Patch 16:      Debugfs interface for page tracker
Patches 17-18: Crash MSR reshuffling + crash dump page exclusion
Patch 19:      Export kexec_in_progress for modules
Patch 20:      Freeze and vacuum partitions across kexec

Feedback
========

This is an RFC. I am looking for feedback on the overall approach as
well as the KHO changes (patches 13-14).

[1] https://lore.kernel.org/linux-mm/20260429133928.850721-1-pratyush@kernel.org/

Based-on: linux-next/master (next-20260527)

Jork Loeser (8):
  kho: add radix tree freeze and del_key() error reporting
  kho: Add crash-kernel-safe radix tree presence check
  mshv: Use page tracker to manage MSHV-owned pages and preserve with
    KHO
  mshv: Add debugfs interface to page tracker
  hyperv: Reserve crash MSR P2 for page preservation root PA
  mshv: Exclude Hyper-V donated pages from crash dump collection
  kexec: export kexec_in_progress for modules
  mshv: freeze and vacuum partitions across kexec

Pratyush Yadav (Google) (12):
  kho: generalize radix tree APIs
  kho: store incoming radix tree in kho_in
  kho: add a struct for radix callbacks
  kho: add callback for table pages
  kho: add data argument to radix walk callback
  kho: allow early-boot usage of the KHO radix tree
  kho: allow destroying KHO radix tree
  kho: add kho_radix_init_tree()
  memblock: introduce MEMBLOCK_KHO_SCRATCH_EXT
  kho: extended scratch
  kho: return virtual address of mem_map
  mm/hugetlb: make bootmem allocation work with KHO

 arch/arm64/hyperv/hv_core.c        |   6 +-
 arch/x86/hyperv/hv_init.c          |   4 +-
 drivers/hv/Kconfig                 |   3 +
 drivers/hv/Makefile                |   2 +-
 drivers/hv/hv_common.c             |   5 +-
 drivers/hv/hv_proc.c               |  32 +-
 drivers/hv/mshv_debugfs.c          |  99 +++++
 drivers/hv/mshv_page_preserve.c    | 557 ++++++++++++++++++++++++++
 drivers/hv/mshv_page_preserve.h    |  21 +
 drivers/hv/mshv_root.h             |   5 +
 drivers/hv/mshv_root_hv_call.c     |  12 +-
 drivers/hv/mshv_root_main.c        | 341 ++++++++++++++--
 include/linux/kexec_handover.h     |   1 +
 include/linux/kho_radix_tree.h     |  90 ++++-
 include/linux/memblock.h           |  14 +
 kernel/kexec_core.c                |   1 +
 kernel/liveupdate/kexec_handover.c | 605 +++++++++++++++++++++++------
 mm/hugetlb.c                       |  19 +-
 mm/memblock.c                      | 177 +++++++--
 mm/mm_init.c                       |   1 +
 20 files changed, 1767 insertions(+), 228 deletions(-)
 create mode 100644 drivers/hv/mshv_page_preserve.c
 create mode 100644 drivers/hv/mshv_page_preserve.h

--
2.43.0


^ permalink raw reply

* Re: [PATCH v3 3/6] x86/hyperv: Skip LP/VP creation on kexec
From: Wei Liu @ 2026-05-27 23:17 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Jork Loeser, linux-hyperv@vger.kernel.org, x86@kernel.org,
	K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H . Peter Anvin, Arnd Bergmann, linux-kernel@vger.kernel.org,
	linux-arch@vger.kernel.org, Anirudh Rayabharam,
	Stanislav Kinsburskii, Mukesh Rathor
In-Reply-To: <SN6PR02MB41578A8F9A225227FB5E79E6D4312@SN6PR02MB4157.namprd02.prod.outlook.com>

On Mon, May 04, 2026 at 03:09:21PM +0000, Michael Kelley wrote:
> From: Jork Loeser <jloeser@linux.microsoft.com> Sent: Tuesday, April 7, 2026 6:37 PM
> > 
> > After a kexec the logical processors and virtual processors already
> > exist in the hypervisor because they were created by the previous
> > kernel. Attempting to add them again causes either a BUG_ON or
> > corrupted VP state leading to MCEs in the new kernel.
> > 
> > Add hv_lp_exists() to probe whether an LP is already present by
> > calling HVCALL_GET_LOGICAL_PROCESSOR_RUN_TIME. When it succeeds the
> > LP exists and we skip the add-LP and create-VP loops entirely.
> > 
> > Also add hv_call_notify_all_processors_started() which informs the
> > hypervisor that all processors are online. This is required after
> > adding LPs (fresh boot) and is a no-op on kexec since we skip that
> > path.
> 
> Adding hv_call_notify_all_processors_started() seems like it should be
> a separate patch. And this paragraph in the commit message leaves me
> with questions:  Is it really "required"?  If it is, how does the existing
> upstream code ever work? Does the change need to be backported
> to stable kernels? If it isn't *really* required, what are the implications
> of not doing it?

It is complicated. If I remember correctly, we realized this call was
absolutely needed if SEV-SNP host side support is enabled. If that
support is not enabled, then things continue to work. I think it is the
right thing to do to always make this call.

We don't need to backport this yet.

Wei

^ permalink raw reply

* Re: [PATCH v3 09/41] clocksource: hyper-v: Don't save/restore TSC offset when using HV sched_clock
From: Wei Liu @ 2026-05-27 22:50 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-10-seanjc@google.com>

On Fri, May 15, 2026 at 12:19:10PM -0700, Sean Christopherson wrote:
> Now that Hyper-V overrides the sched_clock save/restore hooks if and only
> if sched_clock itself is set to the Hyper-V reference counter, drop the
> invocation of the "old" save/restore callbacks.  When the registration of
> the PV sched_clock was done separately from overriding the save/restore
> hooks, it was possible for Hyper-V to clobber the TSC save/restore
> callbacks without actually switching to the Hyper-V refcounter.
> 
> Enabling a PV sched_clock is a one-way street, i.e. the kernel will never
> revert to using TSC for sched_clock, and so there is no need to invoke the
> TSC save/restore hooks (and if there was, it belongs in common PV code).
> 
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Acked-by: Wei Liu <wei.liu@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 08/41] clocksource: hyper-v: Drop wrappers to sched_clock save/restore helpers
From: Wei Liu @ 2026-05-27 22:50 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-9-seanjc@google.com>

On Fri, May 15, 2026 at 12:19:09PM -0700, Sean Christopherson wrote:
> Now that all of the Hyper-V reference counter sched_clock code is located
> in a single file, drop the superfluous wrappers for the save/restore flows.
> 
> No functional change intended.
> 
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Acked-by: Wei Liu <wei.liu@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 07/41] clocksource: hyper-v: Register sched_clock save/restore iff it's necessary
From: Wei Liu @ 2026-05-27 22:49 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-8-seanjc@google.com>

On Fri, May 15, 2026 at 12:19:08PM -0700, Sean Christopherson wrote:
> Register the Hyper-V reference counter (refcounter) callbacks for saving
> and restoring its PV sched_clock, if and only if the refcounter is
> actually being used for sched_clock.  Currently, Hyper-V overrides the
> save/restore hooks if the reference TSC is available, whereas the Hyper-V
> refcounter code only overrides sched_clock if the reference TSC is
> available *and* it's not invariant.  The flaw is effectively papered over
> by invoking the "old" save/restore callbacks as part of save/restore, but
> that's unnecessary and fragile.
> 
> To avoid introducing more complexity, and to allow for additional cleanups
> of the PV sched_clock code, move the save/restore hooks and logic into
> hyperv_timer.c and simply wire up the hooks when overriding sched_clock
> itself.
> 
> Note, while the Hyper-V refcounter code is intended to be architecture
> neutral, CONFIG_PARAVIRT is firmly x86-only, i.e. adding a small amount of
> x86 specific code (which will be reduced in future cleanups) doesn't
> meaningfully pollute generic code.
> 
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Michael Kelley <mhklinux@outlook.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Acked-by: Wei Liu <wei.liu@kernel.org>

^ permalink raw reply

* Re: [PATCH v3] Drivers: hv: vmbus: Improve the logic of reserving fb_mmio on Gen2 VMs
From: Wei Liu @ 2026-05-27 22:28 UTC (permalink / raw)
  To: Dexuan Cui
  Cc: kys, haiyangz, wei.liu, longli, linux-hyperv, linux-kernel,
	mhklinux, matthew.ruffell, johansen, hargar, stable,
	Krister Johansen
In-Reply-To: <20260507212838.448891-1-decui@microsoft.com>

On Thu, May 07, 2026 at 02:28:38PM -0700, Dexuan Cui wrote:
> If vmbus_reserve_fb() in the kdump/kexec kernel fails to properly reserve
> the framebuffer MMIO range (which is below 4GB) due to a Gen2 VM's
> screen.lfb_base being zero [1], there is an MMIO conflict between the
> drivers hyperv-drm and pci-hyperv: when the driver pci-hyperv's
> hv_allocate_config_window() calls vmbus_allocate_mmio() to get an
> MMIO range, typically it gets a 32-bit MMIO range that overlaps with the
> framebuffer MMIO range, and later hv_pci_enter_d0() fails with an
> error message "PCI Pass-through VSP failed D0 Entry with status" since
> the host thinks that PCI devices must not use MMIO space that the
> host has assigned to the framebuffer.
> 
> This is especially an issue if pci-hyperv is built-in and hyperv-drm is
> built as a module. Consequently, the kdump/kexec kernel fails to detect
> PCI devices via pci-hyperv, and may fail to mount the root file system,
> which may reside on an NVMe disk. The issue described here has existed
> for SR-IOV VF NICs since day one of the pci-hyperv driver, and has been
> worked around on x64 when possible. With the recent introduction of
> ARM64 VMs that boot from NVMe, there is no workaround, so we need a
> formal fix.
> 
> On Gen2 VMs, if the screen.lfb_base is 0 in the kdump/kexec kernel [1],
> fall back to the low MMIO base, which should be equal to the framebuffer
> MMIO base [2] (the statement is true according to my testing on x64
> Windows Server 2016, and on x64 and ARM64 Windows Server 2025 and on
> Azure. I checked with the Hyper-V team and they said the statement should
> continue to be true for Gen2 VMs). In the first kernel, screen.lfb_base
> is not 0; if the user specifies a very high resolution, it's not enough
> to only reserve 8MB: let's always reserve half of the space below 4GB,
> but cap the reservation to 128MB, which is the required framebuffer size
> of the highest resolution 7680*4320 supported by Hyper-V.
> 
> While at it, fix the comparison "end > VTPM_BASE_ADDRESS" by changing
> the > to >=. Here the 'end' is an inclusive end (typically, it's
> 0xFFFF_FFFF for the low MMIO range).
> 
> Note: vmbus_reserve_fb() now also reserves an MMIO range at the beginning
> of the low MMIO range on CVMs, which have no framebuffers (the
> 'screen.lfb_base' in vmbus_reserve_fb() is 0 for CVMs), just in case the
> host might treat the beginning of the low MMIO range specially [3]. BTW,
> the OpenHCL kernel is not affected by the change, because that kernel
> boots with DeviceTree rather than ACPI (so vmbus_reserve_fb() won't run
> there), and there is no framebuffer device for that kernel.
> 
> Note: normally Gen1 VMs don't have the MMIO conflict issue because the
> framebuffer MMIO range (which is hardcoded to base=4GB-128MB and
> size=64MB for Gen1 VMs by the host) is always reported via the legacy PCI
> graphics device's BAR, so the kdump/kexec kernel can reserve the 64MB
> MMIO range; however, if the VM is configured to use a very high resolution
> and the required framebuffer size exceeds 64MB (AFAIK, in practice, this
> isn't a typical configuration by users), the hyperv-drm driver may need to
> allocate an MMIO range above 4GB and change the framebuffer MMIO location
> to the allocated MMIO range -- in this case, there can still be issues [4]
> which can't be easily fixed: any possible affected Gen1 users would have
> to use a resolution whose framebuffer size is <= 64MB, or switch to Gen2
> VMs.
> 
> [1] https://lore.kernel.org/all/SA1PR21MB692176C1BC53BFC9EAE5CF8EBF51A@SA1PR21MB6921.namprd21.prod.outlook.com/
> [2] https://lore.kernel.org/all/SA1PR21MB69218F955B62DFF62E3E88D2BF222@SA1PR21MB6921.namprd21.prod.outlook.com/
> [3] https://lore.kernel.org/all/SN6PR02MB415726B17D5A6027CD1717E8D4342@SN6PR02MB4157.namprd02.prod.outlook.com/
> [4] https://lore.kernel.org/all/SA1PR21MB69213486F821CA5A2C793C81BF342@SA1PR21MB6921.namprd21.prod.outlook.com/
> 
> Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
> CC: stable@vger.kernel.org
> Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> Tested-by: Krister Johansen <kjlx@templeofstupid.com>
> Tested-by: Matthew Ruffell <matthew.ruffell@canonical.com>
> Signed-off-by: Dexuan Cui <decui@microsoft.com>

Applied. Thanks.

^ permalink raw reply

* Re: [PATCH] drivers: hv: use kmalloc_array in mshv_root_scheduler_init
From: Wei Liu @ 2026-05-27 22:27 UTC (permalink / raw)
  To: Can Peng
  Cc: kys, haiyangz, wei.liu, longli, decui, linux-kernel, linux-hyperv
In-Reply-To: <20260520071632.557990-1-pengcan@kylinos.cn>

On Wed, May 20, 2026 at 03:16:32PM +0800, Can Peng wrote:
> Replace kmalloc() with kmalloc_array() to prevent potential
> overflow, as recommended in Documentation/process/deprecated.rst.
> 
> Signed-off-by: Can Peng <pengcan@kylinos.cn>
> ---
>  drivers/hv/mshv_root_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index bd1359eb58dd..146726cc4e9b 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -2241,7 +2241,7 @@ static int mshv_root_scheduler_init(unsigned int cpu)
>  	outputarg = (void **)this_cpu_ptr(root_scheduler_output);
>  
>  	/* Allocate two consecutive pages. One for input, one for output. */
> -	p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
> +	p = kmalloc_array(2, HV_HYP_PAGE_SIZE, GFP_KERNEL);

HV_HYP_PAGE_SIZE is a constant (4096). We don't have any dynamism in code.
There is zero potential for overflow.

That being said, I'm fine with taking this patch to stay consistent with
the document.

Thanks for your contribution.

Wei


>  	if (!p)
>  		return -ENOMEM;
>  
> -- 
> 2.53.0
> 

^ permalink raw reply

* Re: [PATCH v2 1/1] mshv: Add conditional VMBus dependency
From: Wei Liu @ 2026-05-27 22:11 UTC (permalink / raw)
  To: mhklinux
  Cc: kys, haiyangz, wei.liu, decui, longli, jloeser, linux-hyperv,
	linux-kernel, arnd, hamzamahfooz
In-Reply-To: <20260526141304.3924-1-mhklkml@zohomail.com>

On Tue, May 26, 2026 at 07:13:04AM -0700, Michael Kelley wrote:
> From: Michael Kelley <mhklinux@outlook.com>
> 
> When the VMBus driver is not part of the kernel (CONFIG_HYPERV_VMBUS=n),
> the MSHV root driver fails to link:
> 
> ERROR: modpost: "hv_vmbus_exists" [drivers/hv/mshv_root.ko] undefined!
> 
> Fix this while meeting these requirements:
> * It must be possible to include the MSHV root driver without the
>   VMBus driver. In such case, the MSHV root driver can be built-in
>   to the kernel image, or it can be built as a separate module.
> * If both the MSHV root driver and the VMBus driver are present, the
>   MSHV root driver and VMBus driver can both be built-in, or they can
>   both be separate modules. Or the MSHV root driver can be a module
>   while the VMBus driver can be built-in, but the reverse is
>   disallowed. Regardless of the build choices, the VMBus driver must
>   be loaded before the MSHV driver in order for the SynIC to be
>   managed properly (see comments in the MSHV SynIC code).
> 
> The fix has two parts:
> * Add a Kconfig entry for MSHV_ROOT to depend on HYPERV_VMBUS if
>   HYPERV_VMBUS is present. The entry disallows MSHV_ROOT being
>   built-in when HYPERV_VMBUS is a module, but without requiring that
>   HYPERV_VMBUS be built.
> * Add a stub implementation of hv_vmbus_exists() for when the
>   VMBus driver is not present so that the MSHV root driver has
>   no module dependency on VMBus. When the VMBus driver *is*
>   present, the module dependency ensures that the VMBus driver
>   loads first when both are built as modules.
> 
> Existing code ensures that the VMBus driver loads first if it is
> built-in. The VMBus driver uses subsys_initcall(), which is
> initcall level 4. The MSHV root driver uses module_init(), which
> becomes device_init() when built-in, and device_init() is
> initcall level 6.
> 
> Reported-by: Arnd Bergmann <arnd@arndb.de>
> Closes: https://lore.kernel.org/all/20260520074044.923728-1-arnd@kernel.org/
> Signed-off-by: Michael Kelley <mhklinux@outlook.com>
> Acked-by: Arnd Bergmann <arnd@arndb.de>
> Reviewed-by: Jork Loeser <jloeser@linux.microsoft.com>

Applied. Thanks everyone.

^ permalink raw reply

* Re: [PATCH] hyperv: Clean up and fix the guest ID comment in hvgdk.h
From: Wei Liu @ 2026-05-27 22:05 UTC (permalink / raw)
  To: Hamza Mahfooz
  Cc: Dexuan Cui, kys, haiyangz, wei.liu, longli, linux-hyperv, gregkh,
	linux-kernel
In-Reply-To: <ahdNVLMHrSgpVKwr@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Wed, May 27, 2026 at 04:00:20PM -0400, Hamza Mahfooz wrote:
> On Wed, May 27, 2026 at 12:21:01PM -0700, Dexuan Cui wrote:
> > Change the "64 bit" to "64-bit", and the "Os" to "OS".
> > 
> > Remove the obsolete paragraph since the guideline has been
> > published in the Hypervisor Top Level Functional Specification
> > for many years.
> > 
> > The "OS Type" is 0x1 for Linux, not 0x100.
> > 
> > No functional change.
> > 
> > Fixes: 83ba0c4f3f31 ("Drivers: hv: Cleanup the guest ID computation")
> > Signed-off-by: Dexuan Cui <decui@microsoft.com>
> 
> Reviewed-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>

Applied. Thanks.

^ permalink raw reply

* Re: [PATCH] hyperv: Clean up and fix the guest ID comment in hvgdk.h
From: Hamza Mahfooz @ 2026-05-27 20:00 UTC (permalink / raw)
  To: Dexuan Cui
  Cc: kys, haiyangz, wei.liu, longli, linux-hyperv, gregkh,
	linux-kernel
In-Reply-To: <20260527192101.1471995-1-decui@microsoft.com>

On Wed, May 27, 2026 at 12:21:01PM -0700, Dexuan Cui wrote:
> Change the "64 bit" to "64-bit", and the "Os" to "OS".
> 
> Remove the obsolete paragraph since the guideline has been
> published in the Hypervisor Top Level Functional Specification
> for many years.
> 
> The "OS Type" is 0x1 for Linux, not 0x100.
> 
> No functional change.
> 
> Fixes: 83ba0c4f3f31 ("Drivers: hv: Cleanup the guest ID computation")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>

Reviewed-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>

> ---
>  include/hyperv/hvgdk.h | 10 ++--------
>  1 file changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h
> index 384c3f3ff4a5..f538144280ca 100644
> --- a/include/hyperv/hvgdk.h
> +++ b/include/hyperv/hvgdk.h
> @@ -10,18 +10,12 @@
>  
>  /*
>   * The guest OS needs to register the guest ID with the hypervisor.
> - * The guest ID is a 64 bit entity and the structure of this ID is
> + * The guest ID is a 64-bit entity and the structure of this ID is
>   * specified in the Hyper-V TLFS specification.
>   *
> - * While the current guideline does not specify how Linux guest ID(s)
> - * need to be generated, our plan is to publish the guidelines for
> - * Linux and other guest operating systems that currently are hosted
> - * on Hyper-V. The implementation here conforms to this yet
> - * unpublished guidelines.
> - *
>   * Bit(s)
>   * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
> - * 62:56 - Os Type; Linux is 0x100
> + * 62:56 - OS Type; Linux is 0x1
>   * 55:48 - Distro specific identification
>   * 47:16 - Linux kernel version number
>   * 15:0  - Distro specific identification
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v2 1/1] mshv: Add conditional VMBus dependency
From: Hardik Garg @ 2026-05-27 19:57 UTC (permalink / raw)
  To: mhklinux, kys, haiyangz, wei.liu, decui, longli, jloeser,
	linux-hyperv
  Cc: linux-kernel, arnd, hamzamahfooz
In-Reply-To: <20260526141304.3924-1-mhklkml@zohomail.com>



On 5/26/2026 7:13 AM, Michael Kelley wrote:
> From: Michael Kelley <mhklinux@outlook.com>
> 
> When the VMBus driver is not part of the kernel (CONFIG_HYPERV_VMBUS=n),
> the MSHV root driver fails to link:
> 
> ERROR: modpost: "hv_vmbus_exists" [drivers/hv/mshv_root.ko] undefined!
> 
> Fix this while meeting these requirements:
> * It must be possible to include the MSHV root driver without the
>   VMBus driver. In such case, the MSHV root driver can be built-in
>   to the kernel image, or it can be built as a separate module.
> * If both the MSHV root driver and the VMBus driver are present, the
>   MSHV root driver and VMBus driver can both be built-in, or they can
>   both be separate modules. Or the MSHV root driver can be a module
>   while the VMBus driver can be built-in, but the reverse is
>   disallowed. Regardless of the build choices, the VMBus driver must
>   be loaded before the MSHV driver in order for the SynIC to be
>   managed properly (see comments in the MSHV SynIC code).
> 
> The fix has two parts:
> * Add a Kconfig entry for MSHV_ROOT to depend on HYPERV_VMBUS if
>   HYPERV_VMBUS is present. The entry disallows MSHV_ROOT being
>   built-in when HYPERV_VMBUS is a module, but without requiring that
>   HYPERV_VMBUS be built.
> * Add a stub implementation of hv_vmbus_exists() for when the
>   VMBus driver is not present so that the MSHV root driver has
>   no module dependency on VMBus. When the VMBus driver *is*
>   present, the module dependency ensures that the VMBus driver
>   loads first when both are built as modules.
> 
> Existing code ensures that the VMBus driver loads first if it is
> built-in. The VMBus driver uses subsys_initcall(), which is
> initcall level 4. The MSHV root driver uses module_init(), which
> becomes device_init() when built-in, and device_init() is
> initcall level 6.
> 
> Reported-by: Arnd Bergmann <arnd@arndb.de>
> Closes: https://lore.kernel.org/all/20260520074044.923728-1-arnd@kernel.org/
> Signed-off-by: Michael Kelley <mhklinux@outlook.com>
> Acked-by: Arnd Bergmann <arnd@arndb.de>
> Reviewed-by: Jork Loeser <jloeser@linux.microsoft.com>
> ---
> Changes in v2:
> * Instead of putting IS_ENABLED(CONFIG_HYPERV_VMBUS) around each of
>   the two calls to hv_vmbus_exists() in mshv_synic.c, provide a stub
>   for hv_vmbus_exists() when CONFIG_HYPERV_VMBUS is not set. The
>   effect is the same as in v1, but the code is cleaner. [Jork Loeser]
> 
> Arnd: I've kept your Ack even though I changed how hv_vmbus_exists()
> is stubbed out since the effect is the same. Let me know if
> you have any concerns.
> 
>  drivers/hv/Kconfig     | 1 +
>  include/linux/hyperv.h | 4 ++++
>  2 files changed, 5 insertions(+)
> 
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 2d0b3fcb0ff8..aa11bcefddf2 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -74,6 +74,7 @@ config MSHV_ROOT
>  	# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
>  	# no particular order, making it impossible to reassemble larger pages
>  	depends on PAGE_SIZE_4KB
> +	depends on HYPERV_VMBUS if HYPERV_VMBUS
>  	select EVENTFD
>  	select VIRT_XFER_TO_GUEST_WORK
>  	select HMM_MIRROR
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 41a3d82f0722..734b7ef98f4d 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1304,7 +1304,11 @@ static inline void *hv_get_drvdata(struct hv_device *dev)
>  
>  struct device *hv_get_vmbus_root_device(void);
>  
> +#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
>  bool hv_vmbus_exists(void);
> +#else
> +static inline bool hv_vmbus_exists(void) { return false; }
> +#endif
>  
>  struct hv_ring_buffer_debug_info {
>  	u32 current_interrupt_mask;

Reviewed-by: Hardik Garg <hargar@linux.microsoft.com>



Thanks,
Hardik

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox