From: Pranjal Shrivastava <praan@google.com>
To: Mike Rapoport <rppt@kernel.org>,
Pasha Tatashin <pasha.tatashin@soleen.com>,
Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>,
Samiullah Khawaja <skhawaja@google.com>,
David Matlack <dmatlack@google.com>,
kexec@lists.infradead.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org,
Pranjal Shrivastava <praan@google.com>
Subject: [RFC PATCH 1/4] kho: Introduce infrastructure to track preserved page types
Date: Fri, 3 Jul 2026 02:08:29 +0000 [thread overview]
Message-ID: <20260703020832.1731864-2-praan@google.com> (raw)
In-Reply-To: <20260703020832.1731864-1-praan@google.com>
The KHO mechanism currently treats all multi-page blocks preserved across
a kexec as split pages during restoration, i.e. every page carries a
refcount of 1.
However, many kernel allocations, most notably DMA buffer allocations
via dma_alloc_coherent(), return high-order non-compound pages. In this
unsplit state, only the head page has a reference count of 1, while tail
pages have a reference count of 0.
Restoring these contiguous & unsplit blocks using the current KHO
restore forces a refcount of 1 on every tail page. This causes the
buddy allocator to trigger a bad page state panic on the free path in
the new kernel when CONFIG_DEBUG_VM is enabled, as it does not expect
tail pages of a high-order block to be refcounted.
Introduce a page_type field to track the refcount pattern of preserved
pages to avoid refcounting the tail pages of high-order non-compound
pages during restore.
The type is stored in the unused high bit (bit 63) of the KHO radix tree
key to ensure it survives the kexec journey (ABI), and is stashed in the
page->private metadata during early boot of the new kernel.
Signed-off-by: Pranjal Shrivastava <praan@google.com>
---
include/linux/kho_radix_tree.h | 17 +++++---
kernel/liveupdate/kexec_handover.c | 62 ++++++++++++++++++++----------
2 files changed, 53 insertions(+), 26 deletions(-)
diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
index 84e918b96e53..9244a3f7a2d4 100644
--- a/include/linux/kho_radix_tree.h
+++ b/include/linux/kho_radix_tree.h
@@ -34,16 +34,22 @@ struct kho_radix_tree {
struct mutex lock; /* protects the tree's structure and root pointer */
};
+enum kho_page_type {
+ KHO_PAGE_CONTIG = 0,
+ KHO_PAGE_SPLIT,
+};
+
typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys,
- unsigned int order);
+ unsigned int order,
+ enum kho_page_type type);
#ifdef CONFIG_KEXEC_HANDOVER
int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn,
- unsigned int order);
+ unsigned int order, enum kho_page_type type);
void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
- unsigned int order);
+ unsigned int order, enum kho_page_type type);
int kho_radix_walk_tree(struct kho_radix_tree *tree,
kho_radix_tree_walk_callback_t cb);
@@ -51,13 +57,14 @@ int kho_radix_walk_tree(struct kho_radix_tree *tree,
#else /* #ifdef CONFIG_KEXEC_HANDOVER */
static inline int kho_radix_add_page(struct kho_radix_tree *tree, long pfn,
- unsigned int order)
+ unsigned int order, enum kho_page_type type)
{
return -EOPNOTSUPP;
}
static inline void kho_radix_del_page(struct kho_radix_tree *tree,
- unsigned long pfn, unsigned int order) { }
+ unsigned long pfn, unsigned int order,
+ enum kho_page_type type) { }
static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
kho_radix_tree_walk_callback_t cb)
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 4834a809985a..f829ffdd00f4 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -43,18 +43,22 @@
/*
* KHO uses page->private, which is an unsigned long, to store page metadata.
- * Use it to store both the magic and the order.
+ * Use it to store the magic, the order, and the type bit.
*/
union kho_page_info {
unsigned long page_private;
struct {
- unsigned int order;
+ unsigned int order : 31;
+ unsigned int type : 1;
unsigned int magic;
};
};
static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
+#define KHO_KEY_TYPE_SHIFT 63
+#define KHO_KEY_TYPE_MASK BIT(KHO_KEY_TYPE_SHIFT)
+
static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
bool kho_is_enabled(void)
@@ -85,42 +89,52 @@ static struct kho_out kho_out = {
};
/**
- * kho_radix_encode_key - Encodes a physical address and order into a radix key.
+ * kho_radix_encode_key - Encodes a physical address, order and type into a radix key.
* @phys: The physical address of the page.
* @order: The order of the page.
+ * @type: The page type.
*
- * This function combines a page's physical address and its order into a
+ * This function combines a page's physical address, its order, and its type into a
* single unsigned long, which is used as a key for all radix tree
* operations.
*
* Return: The encoded unsigned long radix key.
*/
-static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
+static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order,
+ enum kho_page_type type)
{
/* Order bits part */
unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order);
/* Shifted physical address part */
unsigned long l = phys >> (PAGE_SHIFT + order);
+ /* Type bit part */
+ unsigned long t = (unsigned long)type << KHO_KEY_TYPE_SHIFT;
- return h | l;
+ return h | l | t;
}
/**
- * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
+ * kho_radix_decode_key - Decodes a radix key back into physical address, order, and type.
* @key: The unsigned long key to decode.
* @order: An output parameter, a pointer to an unsigned int where the decoded
* page order will be stored.
+ * @type: An output parameter, a pointer to where the decoded type will be stored.
*
* This function reverses the encoding performed by kho_radix_encode_key(),
- * extracting the original physical address and page order from a given key.
+ * extracting the original physical address, page order, and type from a given key.
*
* Return: The decoded physical address.
*/
-static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order)
+static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order,
+ enum kho_page_type *type)
{
- unsigned int order_bit = fls64(key);
+ unsigned int order_bit;
phys_addr_t phys;
+ *type = (key & KHO_KEY_TYPE_MASK) >> KHO_KEY_TYPE_SHIFT;
+ key &= ~KHO_KEY_TYPE_MASK;
+
+ order_bit = fls64(key);
/* order_bit is numbered starting at 1 from fls64 */
*order = KHO_ORDER_0_LOG2 - order_bit + 1;
/* The order is discarded by the shift */
@@ -148,6 +162,7 @@ static unsigned long kho_radix_get_table_index(unsigned long key,
* @tree: The KHO radix tree.
* @pfn: The page frame number of the page to preserve.
* @order: The order of the page.
+ * @type: The page type.
*
* This function traverses the radix tree based on the key derived from @pfn
* and @order. It sets the corresponding bit in the leaf bitmap to mark the
@@ -157,11 +172,12 @@ static unsigned long kho_radix_get_table_index(unsigned long key,
* Return: 0 on success, or a negative error code on failure.
*/
int kho_radix_add_page(struct kho_radix_tree *tree,
- unsigned long pfn, unsigned int order)
+ unsigned long pfn, unsigned int order,
+ enum kho_page_type type)
{
/* Newly allocated nodes for error cleanup */
struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
- unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
+ unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order, type);
struct kho_radix_node *anchor_node = NULL;
struct kho_radix_node *node = tree->root;
struct kho_radix_node *new_node;
@@ -231,15 +247,16 @@ EXPORT_SYMBOL_GPL(kho_radix_add_page);
* @tree: The KHO radix tree.
* @pfn: The page frame number of the page to unpreserve.
* @order: The order of the page.
+ * @type: The page type.
*
* This function traverses the radix tree and clears the bit corresponding to
* the page, effectively removing its "preserved" status. It does not free
* the tree's intermediate nodes, even if they become empty.
*/
void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
- unsigned int order)
+ unsigned int order, enum kho_page_type type)
{
- unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
+ unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order, type);
struct kho_radix_node *node = tree->root;
struct kho_radix_leaf *leaf;
unsigned int i, idx;
@@ -277,14 +294,15 @@ static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
kho_radix_tree_walk_callback_t cb)
{
unsigned long *bitmap = (unsigned long *)leaf;
+ enum kho_page_type type;
unsigned int order;
phys_addr_t phys;
unsigned int i;
int err;
for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
- phys = kho_radix_decode_key(key | i, &order);
- err = cb(phys, order);
+ phys = kho_radix_decode_key(key | i, &order, &type);
+ err = cb(phys, order, type);
if (err)
return err;
}
@@ -485,7 +503,8 @@ static struct page *__init kho_get_preserved_page(phys_addr_t phys,
}
static int __init kho_preserved_memory_reserve(phys_addr_t phys,
- unsigned int order)
+ unsigned int order,
+ enum kho_page_type type)
{
union kho_page_info info;
struct page *page;
@@ -499,6 +518,7 @@ static int __init kho_preserved_memory_reserve(phys_addr_t phys,
memblock_reserved_mark_noinit(phys, sz);
info.magic = KHO_PAGE_MAGIC;
info.order = order;
+ info.type = type;
page->private = info.page_private;
return 0;
@@ -859,7 +879,7 @@ int kho_preserve_folio(struct folio *folio)
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
return -EINVAL;
- return kho_radix_add_page(tree, pfn, order);
+ return kho_radix_add_page(tree, pfn, order, KHO_PAGE_CONTIG);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@@ -877,7 +897,7 @@ void kho_unpreserve_folio(struct folio *folio)
const unsigned long pfn = folio_pfn(folio);
const unsigned int order = folio_order(folio);
- kho_radix_del_page(tree, pfn, order);
+ kho_radix_del_page(tree, pfn, order, KHO_PAGE_CONTIG);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
@@ -906,7 +926,7 @@ static void __kho_unpreserve(struct kho_radix_tree *tree,
while (pfn < end_pfn) {
order = __kho_preserve_pages_order(pfn, end_pfn);
- kho_radix_del_page(tree, pfn, order);
+ kho_radix_del_page(tree, pfn, order, KHO_PAGE_CONTIG);
pfn += 1 << order;
}
@@ -939,7 +959,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages)
while (pfn < end_pfn) {
unsigned int order = __kho_preserve_pages_order(pfn, end_pfn);
- err = kho_radix_add_page(tree, pfn, order);
+ err = kho_radix_add_page(tree, pfn, order, KHO_PAGE_CONTIG);
if (err) {
failed_pfn = pfn;
break;
--
2.55.0.rc0.799.gd6f94ed593-goog
next prev parent reply other threads:[~2026-07-03 2:08 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-07-03 2:08 [RFC PATCH 0/4] kho: Support preserving unsplit high-order pages Pranjal Shrivastava
2026-07-03 2:08 ` Pranjal Shrivastava [this message]
2026-07-03 2:08 ` [RFC PATCH 2/4] kho: Detect preserved page types Pranjal Shrivastava
2026-07-03 2:08 ` [RFC PATCH 3/4] kho: Implement page-aware refcount restoration Pranjal Shrivastava
2026-07-03 2:08 ` [RFC PATCH 4/4] kho: Introduce kho_split_preserved_pages() helper Pranjal Shrivastava
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260703020832.1731864-2-praan@google.com \
--to=praan@google.com \
--cc=dmatlack@google.com \
--cc=graf@amazon.com \
--cc=kexec@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=pasha.tatashin@soleen.com \
--cc=pratyush@kernel.org \
--cc=rppt@kernel.org \
--cc=skhawaja@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox