kexec.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-07  7:00 [PATCH v2 0/2] kho: add support for preserving vmalloc allocations Mike Rapoport
@ 2025-09-07  7:00 ` Mike Rapoport
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-07  7:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
	Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
	kexec, linux-mm, linux-kernel

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

A vmalloc allocation is preserved using a binary structure similar to the
global KHO memory tracker. It's a linked list of pages where each page
is an array of physical addresses of pages in the vmalloc area.

kho_preserve_vmalloc() hands out the physical address of the head page
to the caller. This address is used as the argument to
kho_restore_vmalloc() to restore the mapping in the vmalloc address
space and populate it with the preserved pages.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/kexec_handover.h |  12 ++
 kernel/kexec_handover.c        | 200 +++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..b7bf3bf11019 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -42,8 +42,10 @@ struct kho_serialization;
 bool kho_is_enabled(void);
 
 int kho_preserve_folio(struct folio *folio);
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation);
 int kho_preserve_phys(phys_addr_t phys, size_t size);
 struct folio *kho_restore_folio(phys_addr_t phys);
+void *kho_restore_vmalloc(phys_addr_t preservation);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
@@ -70,11 +72,21 @@ static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
 }
 
+static inline void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	return NULL;
+}
+
 static inline int kho_add_subtree(struct kho_serialization *ser,
 				  const char *name, void *fdt)
 {
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 8079fc4b9189..1177cc5ffa1a 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
 
 #include <asm/early_ioremap.h>
 
@@ -742,6 +743,205 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_phys);
 
+struct kho_vmalloc_chunk;
+
+struct kho_vmalloc_hdr {
+	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+	unsigned int total_pages;	/* only valid in the first chunk */
+	unsigned int flags;		/* only valid in the first chunk */
+	unsigned short order;		/* only valid in the first chunk */
+	unsigned short num_elms;
+};
+
+#define KHO_VMALLOC_SIZE				\
+	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+	 sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+	struct kho_vmalloc_hdr hdr;
+	phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+#define KHO_VMALLOC_FLAGS_MASK	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+	struct kho_vmalloc_chunk *chunk;
+	int err;
+
+	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
+	if (err)
+		goto err_free;
+	if (cur)
+		KHOSER_STORE_PTR(cur->hdr.next, chunk);
+	return chunk;
+
+err_free:
+	kfree(chunk);
+	return NULL;
+}
+
+static void kho_vmalloc_free_chunks(struct kho_vmalloc_chunk *first_chunk)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_vmalloc_chunk *chunk = first_chunk;
+
+	while (chunk) {
+		unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+		struct kho_vmalloc_chunk *tmp = chunk;
+
+		__kho_unpreserve(track, pfn, pfn + 1);
+
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kfree(tmp);
+	}
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: returned physical address of preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_vmalloc_chunk *chunk, *first_chunk;
+	struct vm_struct *vm = find_vm_area(ptr);
+	unsigned int order, flags;
+	int err;
+
+	if (!vm)
+		return -EINVAL;
+
+	if (vm->flags & ~KHO_VMALLOC_FLAGS_MASK)
+		return -EOPNOTSUPP;
+
+	flags = vm->flags & KHO_VMALLOC_FLAGS_MASK;
+	order = get_vm_area_page_order(vm);
+
+	chunk = new_vmalloc_chunk(NULL);
+	if (!chunk)
+		return -ENOMEM;
+	first_chunk = chunk;
+	first_chunk->hdr.total_pages = vm->nr_pages;
+	first_chunk->hdr.flags = flags;
+	first_chunk->hdr.order = order;
+
+	for (int i = 0; i < vm->nr_pages; i += (1 << order)) {
+		phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+		err = __kho_preserve_order(track, PHYS_PFN(phys), order);
+		if (err)
+			goto err_free;
+
+		chunk->phys[chunk->hdr.num_elms] = phys;
+		chunk->hdr.num_elms++;
+		if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->phys)) {
+			chunk = new_vmalloc_chunk(chunk);
+			if (!chunk)
+				goto err_free;
+		}
+	}
+
+	*preservation = virt_to_phys(first_chunk);
+	return 0;
+
+err_free:
+	kho_vmalloc_free_chunks(first_chunk);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: physical address of the preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	struct kho_vmalloc_chunk *chunk = phys_to_virt(preservation);
+	unsigned int align, order, shift, flags;
+	unsigned int idx = 0, nr;
+	unsigned long addr, size;
+	struct vm_struct *area;
+	struct page **pages;
+	int err;
+
+	flags = chunk->hdr.flags;
+	if (flags & ~KHO_VMALLOC_FLAGS_MASK)
+		return NULL;
+
+	nr = chunk->hdr.total_pages;
+	pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return NULL;
+	order = chunk->hdr.order;
+	shift = PAGE_SHIFT + order;
+	align = 1 << shift;
+
+	while (chunk) {
+		struct page *page;
+
+		for (int i = 0; i < chunk->hdr.num_elms; i++) {
+			phys_addr_t phys = chunk->phys[i];
+
+			for (int j = 0; j < (1 << order); j++) {
+				page = phys_to_page(phys);
+				kho_restore_page(page, 0);
+				pages[idx++] = page;
+				phys += PAGE_SIZE;
+			}
+		}
+
+		page = virt_to_page(chunk);
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kho_restore_page(page, 0);
+		__free_page(page);
+	}
+
+	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
+				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
+				  GFP_KERNEL, __builtin_return_address(0));
+	if (!area)
+		goto err_free_pages_array;
+
+	addr = (unsigned long)area->addr;
+	size = get_vm_area_size(area);
+	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+	if (err)
+		goto err_free_vm_area;
+
+	return area->addr;
+
+err_free_vm_area:
+	free_vm_area(area);
+err_free_pages_array:
+	kvfree(pages);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
 /* Handling for debug/kho/out */
 
 static struct dentry *debugfs_root;
-- 
2.50.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v3 0/2] kho: add support for preserving vmalloc allocations
@ 2025-09-08 10:35 Mike Rapoport
  2025-09-08 10:35 ` [PATCH v3 1/2] " Mike Rapoport
  2025-09-08 10:35 ` [PATCH v3 2/2] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
  0 siblings, 2 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-08 10:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
	Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
	kexec, linux-mm, linux-kernel

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Hi,

Following the discussion about preservation of memfd with LUO [1] these
patches add support for preserving vmalloc allocations.

Any KHO use case presumes that there's a data structure that lists
physical addresses of preserved folios (and potentially some additional
metadata). Allowing vmalloc preservations with KHO enables scalable
preservation of such data structures.

For instance, instead of allocating array describing preserved folios in
the fdt, memfd preservation can use vmalloc:

        preserved_folios = vmalloc_array(nr_folios, sizeof(*preserved_folios));
        memfd_luo_preserve_folios(preserved_folios, folios, nr_folios);
        kho_preserve_vmalloc(preserved_folios, &folios_info);

[1] https://lore.kernel.org/all/20250807014442.3829950-30-pasha.tatashin@soleen.com

v3 changes:
* rebase on mm-unstable

v2: https://lore.kernel.org/all/20250905131302.3595582-1-rppt@kernel.org
* support preservation of vmalloc backed by large pages
* add check for supported vmalloc flags and preserve the flags to be
  able to identify incompatible preservations
* don't use kho_preserve_phys()
* add kernel-doc

v1: https://lore.kernel.org/all/20250903063018.3346652-1-rppt@kernel.org

Mike Rapoport (Microsoft) (2):
  kho: add support for preserving vmalloc allocations
  lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt

 include/linux/kexec_handover.h |  12 ++
 kernel/kexec_handover.c        | 200 +++++++++++++++++++++++++++++++++
 lib/test_kho.c                 |  41 +++++--
 3 files changed, 241 insertions(+), 12 deletions(-)


base-commit: b024763926d2726978dff6588b81877d000159c1
-- 
2.50.1



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 10:35 [PATCH v3 0/2] kho: add support for preserving vmalloc allocations Mike Rapoport
@ 2025-09-08 10:35 ` Mike Rapoport
  2025-09-08 14:14   ` Jason Gunthorpe
  2025-09-08 18:12   ` Pratyush Yadav
  2025-09-08 10:35 ` [PATCH v3 2/2] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
  1 sibling, 2 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-08 10:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
	Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
	kexec, linux-mm, linux-kernel

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

A vmalloc allocation is preserved using a binary structure similar to the
global KHO memory tracker. It's a linked list of pages where each page
is an array of physical addresses of pages in the vmalloc area.

kho_preserve_vmalloc() hands out the physical address of the head page
to the caller. This address is used as the argument to
kho_restore_vmalloc() to restore the mapping in the vmalloc address
space and populate it with the preserved pages.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/kexec_handover.h |  12 ++
 kernel/kexec_handover.c        | 200 +++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..b7bf3bf11019 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -42,8 +42,10 @@ struct kho_serialization;
 bool kho_is_enabled(void);
 
 int kho_preserve_folio(struct folio *folio);
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation);
 int kho_preserve_phys(phys_addr_t phys, size_t size);
 struct folio *kho_restore_folio(phys_addr_t phys);
+void *kho_restore_vmalloc(phys_addr_t preservation);
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
 int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
@@ -70,11 +72,21 @@ static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct folio *kho_restore_folio(phys_addr_t phys)
 {
 	return NULL;
 }
 
+static inline void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	return NULL;
+}
+
 static inline int kho_add_subtree(struct kho_serialization *ser,
 				  const char *name, void *fdt)
 {
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 8079fc4b9189..1177cc5ffa1a 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
 
 #include <asm/early_ioremap.h>
 
@@ -742,6 +743,205 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 }
 EXPORT_SYMBOL_GPL(kho_preserve_phys);
 
+struct kho_vmalloc_chunk;
+
+struct kho_vmalloc_hdr {
+	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+	unsigned int total_pages;	/* only valid in the first chunk */
+	unsigned int flags;		/* only valid in the first chunk */
+	unsigned short order;		/* only valid in the first chunk */
+	unsigned short num_elms;
+};
+
+#define KHO_VMALLOC_SIZE				\
+	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+	 sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+	struct kho_vmalloc_hdr hdr;
+	phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+#define KHO_VMALLOC_FLAGS_MASK	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+	struct kho_vmalloc_chunk *chunk;
+	int err;
+
+	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
+	if (err)
+		goto err_free;
+	if (cur)
+		KHOSER_STORE_PTR(cur->hdr.next, chunk);
+	return chunk;
+
+err_free:
+	kfree(chunk);
+	return NULL;
+}
+
+static void kho_vmalloc_free_chunks(struct kho_vmalloc_chunk *first_chunk)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_vmalloc_chunk *chunk = first_chunk;
+
+	while (chunk) {
+		unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+		struct kho_vmalloc_chunk *tmp = chunk;
+
+		__kho_unpreserve(track, pfn, pfn + 1);
+
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kfree(tmp);
+	}
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: returned physical address of preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
+{
+	struct kho_mem_track *track = &kho_out.ser.track;
+	struct kho_vmalloc_chunk *chunk, *first_chunk;
+	struct vm_struct *vm = find_vm_area(ptr);
+	unsigned int order, flags;
+	int err;
+
+	if (!vm)
+		return -EINVAL;
+
+	if (vm->flags & ~KHO_VMALLOC_FLAGS_MASK)
+		return -EOPNOTSUPP;
+
+	flags = vm->flags & KHO_VMALLOC_FLAGS_MASK;
+	order = get_vm_area_page_order(vm);
+
+	chunk = new_vmalloc_chunk(NULL);
+	if (!chunk)
+		return -ENOMEM;
+	first_chunk = chunk;
+	first_chunk->hdr.total_pages = vm->nr_pages;
+	first_chunk->hdr.flags = flags;
+	first_chunk->hdr.order = order;
+
+	for (int i = 0; i < vm->nr_pages; i += (1 << order)) {
+		phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+		err = __kho_preserve_order(track, PHYS_PFN(phys), order);
+		if (err)
+			goto err_free;
+
+		chunk->phys[chunk->hdr.num_elms] = phys;
+		chunk->hdr.num_elms++;
+		if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->phys)) {
+			chunk = new_vmalloc_chunk(chunk);
+			if (!chunk)
+				goto err_free;
+		}
+	}
+
+	*preservation = virt_to_phys(first_chunk);
+	return 0;
+
+err_free:
+	kho_vmalloc_free_chunks(first_chunk);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: physical address of the preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(phys_addr_t preservation)
+{
+	struct kho_vmalloc_chunk *chunk = phys_to_virt(preservation);
+	unsigned int align, order, shift, flags;
+	unsigned int idx = 0, nr;
+	unsigned long addr, size;
+	struct vm_struct *area;
+	struct page **pages;
+	int err;
+
+	flags = chunk->hdr.flags;
+	if (flags & ~KHO_VMALLOC_FLAGS_MASK)
+		return NULL;
+
+	nr = chunk->hdr.total_pages;
+	pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return NULL;
+	order = chunk->hdr.order;
+	shift = PAGE_SHIFT + order;
+	align = 1 << shift;
+
+	while (chunk) {
+		struct page *page;
+
+		for (int i = 0; i < chunk->hdr.num_elms; i++) {
+			phys_addr_t phys = chunk->phys[i];
+
+			for (int j = 0; j < (1 << order); j++) {
+				page = phys_to_page(phys);
+				kho_restore_page(page, 0);
+				pages[idx++] = page;
+				phys += PAGE_SIZE;
+			}
+		}
+
+		page = virt_to_page(chunk);
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		kho_restore_page(page, 0);
+		__free_page(page);
+	}
+
+	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
+				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
+				  GFP_KERNEL, __builtin_return_address(0));
+	if (!area)
+		goto err_free_pages_array;
+
+	addr = (unsigned long)area->addr;
+	size = get_vm_area_size(area);
+	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+	if (err)
+		goto err_free_vm_area;
+
+	return area->addr;
+
+err_free_vm_area:
+	free_vm_area(area);
+err_free_pages_array:
+	kvfree(pages);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
 /* Handling for debug/kho/out */
 
 static struct dentry *debugfs_root;
-- 
2.50.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v3 2/2] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt
  2025-09-08 10:35 [PATCH v3 0/2] kho: add support for preserving vmalloc allocations Mike Rapoport
  2025-09-08 10:35 ` [PATCH v3 1/2] " Mike Rapoport
@ 2025-09-08 10:35 ` Mike Rapoport
  1 sibling, 0 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-08 10:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
	Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
	kexec, linux-mm, linux-kernel

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

The KHO test stores physical addresses of the preserved folios directly
in the fdt.
Use kho_preserve_vmalloc() instead, and kho_restore_vmalloc() to
retrieve the addresses after kexec.

This makes the test more scalable on the one hand and adds test coverage
for kho_preserve_vmalloc() on the other.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 lib/test_kho.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/lib/test_kho.c b/lib/test_kho.c
index fe8504e3407b..c46f577b6aee 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -32,6 +32,7 @@ module_param(max_mem, long, 0644);
 struct kho_test_state {
 	unsigned int nr_folios;
 	struct folio **folios;
+	phys_addr_t *folios_info;
 	struct folio *fdt;
 	__wsum csum;
 };
@@ -67,18 +68,15 @@ static struct notifier_block kho_test_nb = {
 
 static int kho_test_save_data(struct kho_test_state *state, void *fdt)
 {
-	phys_addr_t *folios_info;
+	phys_addr_t *folios_info __free(kvfree) = NULL;
+	phys_addr_t folios_info_phys;
 	int err = 0;
 
-	err |= fdt_begin_node(fdt, "data");
-	err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
-			    sizeof(state->nr_folios));
-	err |= fdt_property_placeholder(fdt, "folios_info",
-					state->nr_folios * sizeof(*folios_info),
-					(void **)&folios_info);
-	err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
-	err |= fdt_end_node(fdt);
+	folios_info = vmalloc_array(state->nr_folios, sizeof(*folios_info));
+	if (!folios_info)
+		return -ENOMEM;
 
+	err = kho_preserve_vmalloc(folios_info, &folios_info_phys);
 	if (err)
 		return err;
 
@@ -93,6 +91,17 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt)
 			break;
 	}
 
+	err |= fdt_begin_node(fdt, "data");
+	err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
+			    sizeof(state->nr_folios));
+	err |= fdt_property(fdt, "folios_info", &folios_info_phys,
+			    sizeof(folios_info_phys));
+	err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
+	err |= fdt_end_node(fdt);
+
+	if (!err)
+		state->folios_info = no_free_ptr(folios_info);
+
 	return err;
 }
 
@@ -210,7 +219,8 @@ static int kho_test_save(void)
 static int kho_test_restore_data(const void *fdt, int node)
 {
 	const unsigned int *nr_folios;
-	const phys_addr_t *folios_info;
+	const phys_addr_t *folios_info_phys;
+	phys_addr_t *folios_info;
 	const __wsum *old_csum;
 	__wsum csum = 0;
 	int len;
@@ -225,8 +235,12 @@ static int kho_test_restore_data(const void *fdt, int node)
 	if (!old_csum || len != sizeof(*old_csum))
 		return -EINVAL;
 
-	folios_info = fdt_getprop(fdt, node, "folios_info", &len);
-	if (!folios_info || len != sizeof(*folios_info) * *nr_folios)
+	folios_info_phys = fdt_getprop(fdt, node, "folios_info", &len);
+	if (!folios_info_phys || len != sizeof(*folios_info_phys))
+		return -EINVAL;
+
+	folios_info = kho_restore_vmalloc(*folios_info_phys);
+	if (!folios_info)
 		return -EINVAL;
 
 	for (int i = 0; i < *nr_folios; i++) {
@@ -246,6 +260,8 @@ static int kho_test_restore_data(const void *fdt, int node)
 		folio_put(folio);
 	}
 
+	vfree(folios_info);
+
 	if (csum != *old_csum)
 		return -EINVAL;
 
@@ -304,6 +320,7 @@ static void kho_test_cleanup(void)
 		folio_put(kho_test_state.folios[i]);
 
 	kvfree(kho_test_state.folios);
+	vfree(kho_test_state.folios_info);
 	folio_put(kho_test_state.fdt);
 }
 
-- 
2.50.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 10:35 ` [PATCH v3 1/2] " Mike Rapoport
@ 2025-09-08 14:14   ` Jason Gunthorpe
  2025-09-15 14:01     ` Mike Rapoport
  2025-09-08 18:12   ` Pratyush Yadav
  1 sibling, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2025-09-08 14:14 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
	linux-kernel

On Mon, Sep 08, 2025 at 01:35:27PM +0300, Mike Rapoport wrote:
> +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
> +{
> +	struct kho_vmalloc_chunk *chunk;
> +	int err;
> +
> +	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!chunk)
> +		return NULL;
> +
> +	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
> +	if (err)
> +		goto err_free;

kzalloc() cannot be preserved, the only thing we support today is
alloc_page(), so this code pattern shouldn't exist.

Call alloc_page() and use a kho_preserve_page/folio() like the luo
patches were doing. The pattern seems common it probably needs a small
alloc/free helper.

> +	for (int i = 0; i < vm->nr_pages; i += (1 << order)) {
> +		phys_addr_t phys = page_to_phys(vm->pages[i]);
> +
> +		err = __kho_preserve_order(track, PHYS_PFN(phys), order);
> +		if (err)
> +			goto err_free;

I think you should make an inline helper to document what is happening here:

/*
 * Preserve a contiguous aligned list of order 0 pages that aggregate
 * to a higher order allocation. Must be restored using
 * kho_restore_page() on each order 0 page.
 */
kho_preserve_pages(page, order);

Jason


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 10:35 ` [PATCH v3 1/2] " Mike Rapoport
  2025-09-08 14:14   ` Jason Gunthorpe
@ 2025-09-08 18:12   ` Pratyush Yadav
  2025-09-08 18:18     ` Jason Gunthorpe
                       ` (2 more replies)
  1 sibling, 3 replies; 21+ messages in thread
From: Pratyush Yadav @ 2025-09-08 18:12 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec,
	linux-mm, linux-kernel

On Mon, Sep 08 2025, Mike Rapoport wrote:

> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> A vmalloc allocation is preserved using binary structure similar to
> global KHO memory tracker. It's a linked list of pages where each page
> is an array of physical address of pages in vmalloc area.
>
> kho_preserve_vmalloc() hands out the physical address of the head page
> to the caller. This address is used as the argument to
> kho_vmalloc_restore() to restore the mapping in the vmalloc address
> space and populate it with the preserved pages.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  include/linux/kexec_handover.h |  12 ++
>  kernel/kexec_handover.c        | 200 +++++++++++++++++++++++++++++++++
>  2 files changed, 212 insertions(+)
>
> diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> index 348844cffb13..b7bf3bf11019 100644
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -42,8 +42,10 @@ struct kho_serialization;
>  bool kho_is_enabled(void);
>  
>  int kho_preserve_folio(struct folio *folio);
> +int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation);
>  int kho_preserve_phys(phys_addr_t phys, size_t size);
>  struct folio *kho_restore_folio(phys_addr_t phys);
> +void *kho_restore_vmalloc(phys_addr_t preservation);
>  int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
>  int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
>  
> @@ -70,11 +72,21 @@ static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
>  	return -EOPNOTSUPP;
>  }
>  
> +static inline int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>  static inline struct folio *kho_restore_folio(phys_addr_t phys)
>  {
>  	return NULL;
>  }
>  
> +static inline void *kho_restore_vmalloc(phys_addr_t preservation)
> +{
> +	return NULL;
> +}
> +
>  static inline int kho_add_subtree(struct kho_serialization *ser,
>  				  const char *name, void *fdt)
>  {
> diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
> index 8079fc4b9189..1177cc5ffa1a 100644
> --- a/kernel/kexec_handover.c
> +++ b/kernel/kexec_handover.c
> @@ -18,6 +18,7 @@
>  #include <linux/memblock.h>
>  #include <linux/notifier.h>
>  #include <linux/page-isolation.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/early_ioremap.h>
>  
> @@ -742,6 +743,205 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
>  }
>  EXPORT_SYMBOL_GPL(kho_preserve_phys);
>  
> +struct kho_vmalloc_chunk;
> +
> +struct kho_vmalloc_hdr {
> +	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
> +	unsigned int total_pages;	/* only valid in the first chunk */
> +	unsigned int flags;		/* only valid in the first chunk */
> +	unsigned short order;		/* only valid in the first chunk */
> +	unsigned short num_elms;

I think the serialization format would be cleaner if these were
defined in a separate structure that holds the metadata, instead of
being defined in each page and then ignored in most of them.

If the caller can save 8 bytes (phys addr of first page), it might as
well save 16 instead. Something like the below perhaps?

struct kho_vmalloc {
	DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
	unsigned int total_pages;
	unsigned short flags;
	unsigned short order;
};

And then kho_vmalloc_hdr becomes simply:

struct kho_vmalloc_hdr {
	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};

You don't even need num_elms since you have the list be zero-terminated.

> +};
> +
> +#define KHO_VMALLOC_SIZE				\
> +	((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
> +	 sizeof(phys_addr_t))
> +
> +struct kho_vmalloc_chunk {
> +	struct kho_vmalloc_hdr hdr;
> +	phys_addr_t phys[KHO_VMALLOC_SIZE];
> +};
> +
> +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
> +
> +#define KHO_VMALLOC_FLAGS_MASK	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)

I don't think it is a good idea to re-use VM flags. This can make adding
more flags later down the line ugly. I think it would be better to
define KHO_VMALLOC_FL* instead.

> +
> +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
> +{
> +	struct kho_vmalloc_chunk *chunk;
> +	int err;
> +
> +	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!chunk)
> +		return NULL;
> +
> +	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
> +	if (err)
> +		goto err_free;
> +	if (cur)
> +		KHOSER_STORE_PTR(cur->hdr.next, chunk);
> +	return chunk;
> +
> +err_free:
> +	kfree(chunk);
> +	return NULL;
> +}
> +
> +static void kho_vmalloc_free_chunks(struct kho_vmalloc_chunk *first_chunk)
> +{
> +	struct kho_mem_track *track = &kho_out.ser.track;
> +	struct kho_vmalloc_chunk *chunk = first_chunk;
> +
> +	while (chunk) {
> +		unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
> +		struct kho_vmalloc_chunk *tmp = chunk;
> +
> +		__kho_unpreserve(track, pfn, pfn + 1);

This doesn't unpreserve the pages contained in the chunk, which
kho_preserve_vmalloc() preserved.

> +
> +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> +		kfree(tmp);
> +	}
> +}
> +
> +/**
> + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
> + * @ptr: pointer to the area in vmalloc address space
> + * @preservation: returned physical address of preservation metadata
> + *
> + * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
> + * physical pages mapped at @ptr will be preserved and on successful return
> + * @preservation will hold the physical address of a structure that describes
> + * the preservation.
> + *
> + * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
> + * restored on the same node
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int kho_preserve_vmalloc(void *ptr, phys_addr_t *preservation)
> +{
> +	struct kho_mem_track *track = &kho_out.ser.track;
> +	struct kho_vmalloc_chunk *chunk, *first_chunk;
> +	struct vm_struct *vm = find_vm_area(ptr);
> +	unsigned int order, flags;
> +	int err;
> +
> +	if (!vm)
> +		return -EINVAL;
> +
> +	if (vm->flags & ~KHO_VMALLOC_FLAGS_MASK)
> +		return -EOPNOTSUPP;
> +
> +	flags = vm->flags & KHO_VMALLOC_FLAGS_MASK;
> +	order = get_vm_area_page_order(vm);
> +
> +	chunk = new_vmalloc_chunk(NULL);
> +	if (!chunk)
> +		return -ENOMEM;
> +	first_chunk = chunk;
> +	first_chunk->hdr.total_pages = vm->nr_pages;
> +	first_chunk->hdr.flags = flags;
> +	first_chunk->hdr.order = order;
> +
> +	for (int i = 0; i < vm->nr_pages; i += (1 << order)) {
> +		phys_addr_t phys = page_to_phys(vm->pages[i]);
> +
> +		err = __kho_preserve_order(track, PHYS_PFN(phys), order);
> +		if (err)
> +			goto err_free;
> +
> +		chunk->phys[chunk->hdr.num_elms] = phys;
> +		chunk->hdr.num_elms++;
> +		if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->phys)) {
> +			chunk = new_vmalloc_chunk(chunk);
> +			if (!chunk)
> +				goto err_free;
> +		}
> +	}
> +
> +	*preservation = virt_to_phys(first_chunk);
> +	return 0;
> +
> +err_free:
> +	kho_vmalloc_free_chunks(first_chunk);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
> +
> +/**
> + * kho_restore_vmalloc - recreates and populates an area in vmalloc address
> + * space from the preserved memory.
> + * @preservation: physical address of the preservation metadata.
> + *
> + * Recreates an area in vmalloc address space and populates it with memory that
> + * was preserved using kho_preserve_vmalloc().
> + *
> + * Return: pointer to the area in the vmalloc address space, NULL on failure.
> + */
> +void *kho_restore_vmalloc(phys_addr_t preservation)
> +{
> +	struct kho_vmalloc_chunk *chunk = phys_to_virt(preservation);
> +	unsigned int align, order, shift, flags;
> +	unsigned int idx = 0, nr;
> +	unsigned long addr, size;
> +	struct vm_struct *area;
> +	struct page **pages;
> +	int err;
> +
> +	flags = chunk->hdr.flags;
> +	if (flags & ~KHO_VMALLOC_FLAGS_MASK)
> +		return NULL;
> +
> +	nr = chunk->hdr.total_pages;
> +	pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
> +	if (!pages)
> +		return NULL;
> +	order = chunk->hdr.order;
> +	shift = PAGE_SHIFT + order;
> +	align = 1 << shift;
> +
> +	while (chunk) {
> +		struct page *page;
> +
> +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
> +			phys_addr_t phys = chunk->phys[i];
> +
> +			for (int j = 0; j < (1 << order); j++) {
> +				page = phys_to_page(phys);
> +				kho_restore_page(page, 0);
> +				pages[idx++] = page;

This can buffer-overflow if the previous kernel was buggy and added too
many pages. Perhaps add a check for this?

> +				phys += PAGE_SIZE;
> +			}
> +		}
> +
> +		page = virt_to_page(chunk);
> +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> +		kho_restore_page(page, 0);
> +		__free_page(page);
> +	}
> +
> +	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
> +				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
> +				  GFP_KERNEL, __builtin_return_address(0));
> +	if (!area)
> +		goto err_free_pages_array;
> +
> +	addr = (unsigned long)area->addr;
> +	size = get_vm_area_size(area);
> +	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
> +	if (err)
> +		goto err_free_vm_area;
> +
> +	return area->addr;

You should free the pages array before returning here.

> +
> +err_free_vm_area:
> +	free_vm_area(area);
> +err_free_pages_array:
> +	kvfree(pages);
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
> +
>  /* Handling for debug/kho/out */
>  
>  static struct dentry *debugfs_root;

-- 
Regards,
Pratyush Yadav


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 18:12   ` Pratyush Yadav
@ 2025-09-08 18:18     ` Jason Gunthorpe
  2025-09-09 14:33     ` Pratyush Yadav
  2025-09-15 14:08     ` Mike Rapoport
  2 siblings, 0 replies; 21+ messages in thread
From: Jason Gunthorpe @ 2025-09-08 18:18 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Andrew Morton, Alexander Graf, Baoquan He,
	Changyuan Lyu, Chris Li, Pasha Tatashin, kexec, linux-mm,
	linux-kernel

On Mon, Sep 08, 2025 at 08:12:59PM +0200, Pratyush Yadav wrote:
> > +#define KHO_VMALLOC_FLAGS_MASK	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
> 
> I don't think it is a good idea to re-use VM flags. This can make adding
> more flags later down the line ugly. I think it would be better to
> define KHO_VMALLOC_FL* instead.

+1

This kind of stuff should be avoided, it becomes a very sneaky ABI.

Jason


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 18:12   ` Pratyush Yadav
  2025-09-08 18:18     ` Jason Gunthorpe
@ 2025-09-09 14:33     ` Pratyush Yadav
  2025-09-15 14:12       ` Mike Rapoport
  2025-09-15 14:08     ` Mike Rapoport
  2 siblings, 1 reply; 21+ messages in thread
From: Pratyush Yadav @ 2025-09-09 14:33 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Mike Rapoport, Andrew Morton, Alexander Graf, Baoquan He,
	Changyuan Lyu, Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

Hi Mike,

Couple more thoughts.

On Mon, Sep 08 2025, Pratyush Yadav wrote:

> On Mon, Sep 08 2025, Mike Rapoport wrote:
>
>> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>>
>> A vmalloc allocation is preserved using binary structure similar to
>> global KHO memory tracker. It's a linked list of pages where each page
>> is an array of physical address of pages in vmalloc area.
>>
>> kho_preserve_vmalloc() hands out the physical address of the head page
>> to the caller. This address is used as the argument to
>> kho_vmalloc_restore() to restore the mapping in the vmalloc address
>> space and populate it with the preserved pages.
>>
>> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
[...]
>> +/**
>> + * kho_restore_vmalloc - recreates and populates an area in vmalloc address
>> + * space from the preserved memory.
>> + * @preservation: physical address of the preservation metadata.
>> + *
>> + * Recreates an area in vmalloc address space and populates it with memory that
>> + * was preserved using kho_preserve_vmalloc().
>> + *
>> + * Return: pointer to the area in the vmalloc address space, NULL on failure.
>> + */
>> +void *kho_restore_vmalloc(phys_addr_t preservation)
>> +{
>> +	struct kho_vmalloc_chunk *chunk = phys_to_virt(preservation);
>> +	unsigned int align, order, shift, flags;
>> +	unsigned int idx = 0, nr;
>> +	unsigned long addr, size;
>> +	struct vm_struct *area;
>> +	struct page **pages;
>> +	int err;
>> +
>> +	flags = chunk->hdr.flags;
>> +	if (flags & ~KHO_VMALLOC_FLAGS_MASK)
>> +		return NULL;
>> +
>> +	nr = chunk->hdr.total_pages;
>> +	pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
>> +	if (!pages)
>> +		return NULL;
>> +	order = chunk->hdr.order;
>> +	shift = PAGE_SHIFT + order;
>> +	align = 1 << shift;
>> +
>> +	while (chunk) {
>> +		struct page *page;
>> +
>> +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
>> +			phys_addr_t phys = chunk->phys[i];
>> +
>> +			for (int j = 0; j < (1 << order); j++) {
>> +				page = phys_to_page(phys);
>> +				kho_restore_page(page, 0);
>> +				pages[idx++] = page;
>
> This can buffer-overflow if the previous kernel was buggy and added too
> many pages. Perhaps keep check for this?

Thinking about this a bit more, I think this should check that we found
_exactly_ chunk->hdr.total_pages pages, and should error out otherwise.
If too few are found, pages array will contain bogus data, if too many,
buffer overflow.

Also, I am not a fan of using kho_restore_page() directly. I think the
vmalloc preservation is a layer above core KHO, and it should use the
public KHO APIs. It really doesn't need to poke into internal APIs. If
any of the public APIs are insufficient, we should add new ones.

I don't suppose I'd insist on it, but something to consider since you
are likely going to do another revision anyway.

>
>> +				phys += PAGE_SIZE;
>> +			}
>> +		}
>> +
>> +		page = virt_to_page(chunk);
>> +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
>> +		kho_restore_page(page, 0);
>> +		__free_page(page);
>> +	}
>> +
>> +	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
>> +				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
>> +				  GFP_KERNEL, __builtin_return_address(0));
>> +	if (!area)
>> +		goto err_free_pages_array;
>> +
>> +	addr = (unsigned long)area->addr;
>> +	size = get_vm_area_size(area);
>> +	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
>> +	if (err)
>> +		goto err_free_vm_area;
>> +
>> +	return area->addr;
>
> You should free the pages array before returning here.
>
>> +
>> +err_free_vm_area:
>> +	free_vm_area(area);
>> +err_free_pages_array:
>> +	kvfree(pages);
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
>> +
>>  /* Handling for debug/kho/out */
>>  
>>  static struct dentry *debugfs_root;

-- 
Regards,
Pratyush Yadav


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 14:14   ` Jason Gunthorpe
@ 2025-09-15 14:01     ` Mike Rapoport
  2025-09-15 14:37       ` Jason Gunthorpe
  0 siblings, 1 reply; 21+ messages in thread
From: Mike Rapoport @ 2025-09-15 14:01 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
	linux-kernel

On Mon, Sep 08, 2025 at 11:14:23AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 01:35:27PM +0300, Mike Rapoport wrote:
> > +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
> > +{
> > +	struct kho_vmalloc_chunk *chunk;
> > +	int err;
> > +
> > +	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > +	if (!chunk)
> > +		return NULL;
> > +
> > +	err = kho_preserve_phys(virt_to_phys(chunk), PAGE_SIZE);
> > +	if (err)
> > +		goto err_free;
> 
> kzalloc() cannot be preserved, the only thing we support today is
> alloc_page(), so this code pattern shouldn't exist.
 
kzalloc(PAGE_SIZE) can be preserved, it's page aligned and we don't have to
restore it into a slab cache. But this may indeed be confusing for those who
copy paste the code, so I'll change it.

> Call alloc_page() and use a kho_preserve_page/folio() like the luo
> patches were doing. The pattern seems common it probably needs a small
> alloc/free helper.
> 
> > +	for (int i = 0; i < vm->nr_pages; i += (1 << order)) {
> > +		phys_addr_t phys = page_to_phys(vm->pages[i]);
> > +
> > +		err = __kho_preserve_order(track, PHYS_PFN(phys), order);
> > +		if (err)
> > +			goto err_free;
> 
> I think you should make a helper inline to document what is happening here:
> 
> /*
>  * Preserve a contiguous aligned list of order 0 pages that aggregate
>  * to a higher order allocation. Must be restored using
>  * kho_restore_page() on each order 0 page.
>  */
> kho_preserve_pages(page, order);

Maybe.
 
> Jason

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-08 18:12   ` Pratyush Yadav
  2025-09-08 18:18     ` Jason Gunthorpe
  2025-09-09 14:33     ` Pratyush Yadav
@ 2025-09-15 14:08     ` Mike Rapoport
  2025-09-16 12:43       ` Pratyush Yadav
  2 siblings, 1 reply; 21+ messages in thread
From: Mike Rapoport @ 2025-09-15 14:08 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec, linux-mm,
	linux-kernel

On Mon, Sep 08, 2025 at 08:12:59PM +0200, Pratyush Yadav wrote:
> On Mon, Sep 08 2025, Mike Rapoport wrote:
> 
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> > A vmalloc allocation is preserved using binary structure similar to
> > global KHO memory tracker. It's a linked list of pages where each page
> > is an array of physical address of pages in vmalloc area.
> >
> > kho_preserve_vmalloc() hands out the physical address of the head page
> > to the caller. This address is used as the argument to
> > kho_vmalloc_restore() to restore the mapping in the vmalloc address
> > space and populate it with the preserved pages.
> >
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

...

> > @@ -742,6 +743,205 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
> >  }
> >  EXPORT_SYMBOL_GPL(kho_preserve_phys);
> >  
> > +struct kho_vmalloc_chunk;
> > +
> > +struct kho_vmalloc_hdr {
> > +	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
> > +	unsigned int total_pages;	/* only valid in the first chunk */
> > +	unsigned int flags;		/* only valid in the first chunk */
> > +	unsigned short order;		/* only valid in the first chunk */
> > +	unsigned short num_elms;
> 
> I think it the serialization format would be cleaner if these were
> defined in a separate structure that holds the metadata instead of being
> defined in each page and then ignored in most of them.
> 
> If the caller can save 8 bytes (phys addr of first page), it might as
> well save 16 instead. Something like the below perhaps?
> 
> struct kho_vmalloc {
> 	DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
> 	unsigned int total_pages;
> 	unsigned short flags;
> 	unsigned short order;
> };
> 
> And then kho_vmalloc_hdr becomes simply:
> 
> struct kho_vmalloc_hdr {
> 	DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
> };
> 
> You don't even need num_elms since you have the list be zero-terminated.

Agree, thanks. 

> > +#define KHO_VMALLOC_FLAGS_MASK	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
> 
> I don't think it is a good idea to re-use VM flags. This can make adding
> more flags later down the line ugly. I think it would be better to
> define KHO_VMALLOC_FL* instead.

Ok.
 
> > +static void kho_vmalloc_free_chunks(struct kho_vmalloc_chunk *first_chunk)
> > +{
> > +	struct kho_mem_track *track = &kho_out.ser.track;
> > +	struct kho_vmalloc_chunk *chunk = first_chunk;
> > +
> > +	while (chunk) {
> > +		unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
> > +		struct kho_vmalloc_chunk *tmp = chunk;
> > +
> > +		__kho_unpreserve(track, pfn, pfn + 1);
> 
> This doesn't unpreserve the pages contained in the chunk, which
> kho_preserve_vmalloc() preserved.

Will fix. 

> > +	while (chunk) {
> > +		struct page *page;
> > +
> > +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
> > +			phys_addr_t phys = chunk->phys[i];
> > +
> > +			for (int j = 0; j < (1 << order); j++) {
> > +				page = phys_to_page(phys);
> > +				kho_restore_page(page, 0);
> > +				pages[idx++] = page;
> 
> This can buffer-overflow if the previous kernel was buggy and added too
> many pages. Perhaps keep check for this?

You mean it added more than total_pages?
But the preserve part adds exactly vm->nr_pages, so once we get it right
what bugs do you expect here? 
 
> > +				phys += PAGE_SIZE;
> > +			}
> > +		}
> > +
> > +		page = virt_to_page(chunk);
> > +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> > +		kho_restore_page(page, 0);
> > +		__free_page(page);
> > +	}
> > +
> > +	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
> > +				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
> > +				  GFP_KERNEL, __builtin_return_address(0));
> > +	if (!area)
> > +		goto err_free_pages_array;
> > +
> > +	addr = (unsigned long)area->addr;
> > +	size = get_vm_area_size(area);
> > +	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
> > +	if (err)
> > +		goto err_free_vm_area;
> > +
> > +	return area->addr;
> 
> You should free the pages array before returning here.

Why? They get into vm->pages.

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-09 14:33     ` Pratyush Yadav
@ 2025-09-15 14:12       ` Mike Rapoport
  2025-09-15 14:43         ` Jason Gunthorpe
  2025-09-16 12:48         ` Pratyush Yadav
  0 siblings, 2 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-15 14:12 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Pratyush Yadav, Andrew Morton, Alexander Graf, Baoquan He,
	Changyuan Lyu, Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Tue, Sep 09, 2025 at 04:33:27PM +0200, Pratyush Yadav wrote:
> Hi Mike,
> 
> Couple more thoughts.
> 
> On Mon, Sep 08 2025, Pratyush Yadav wrote:
> > On Mon, Sep 08 2025, Mike Rapoport wrote:
> >> +
> >> +	while (chunk) {
> >> +		struct page *page;
> >> +
> >> +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
> >> +			phys_addr_t phys = chunk->phys[i];
> >> +
> >> +			for (int j = 0; j < (1 << order); j++) {
> >> +				page = phys_to_page(phys);
> >> +				kho_restore_page(page, 0);
> >> +				pages[idx++] = page;
> >
> > This can buffer-overflow if the previous kernel was buggy and added too
> > many pages. Perhaps keep check for this?
> 
> Thinking about this a bit more, I think this should check that we found
> _exactly_ chunk->hdr.total_pages pages, and should error out otherwise.
> If too few are found, pages array will contain bogus data, if too many,
> buffer overflow.

Sure, I can add the checks, but it feels superfluous to me.
 
> Also, I am not a fan of using kho_restore_page() directly. I think the
> vmalloc preservation is a layer above core KHO, and it should use the
> public KHO APIs. It really doesn't need to poke into internal APIs. If
> any of the public APIs are insufficient, we should add new ones.
> 
> I don't suppose I'd insist on it, but something to consider since you
> are likely going to do another revision anyway.

I think vmalloc is as basic as folio. At some point we probably converge to 
kho_preserve(void *) that will choose the right internal handler, like
folio, vmalloc, kmalloc, etc.

> -- 
> Regards,
> Pratyush Yadav

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:01     ` Mike Rapoport
@ 2025-09-15 14:37       ` Jason Gunthorpe
  2025-09-15 16:11         ` Mike Rapoport
  0 siblings, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2025-09-15 14:37 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
	linux-kernel

On Mon, Sep 15, 2025 at 05:01:01PM +0300, Mike Rapoport wrote:
> > kzalloc() cannot be preserved, the only thing we support today is
> > alloc_page(), so this code pattern shouldn't exist.
>  
> kzalloc(PAGE_SIZE) can be preserved, it's page aligned and we don't have to
> restore it into a slab cache. But this maybe indeed confusing for those who
> copy paste the code, so I'll change it.

It really isn't. The kzalloc should be returning frozen pages for that
allocation and the restoration will not put frozen pages back.

Jason


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:12       ` Mike Rapoport
@ 2025-09-15 14:43         ` Jason Gunthorpe
  2025-09-15 16:36           ` Mike Rapoport
  2025-09-16 12:48         ` Pratyush Yadav
  1 sibling, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2025-09-15 14:43 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Pratyush Yadav, Pratyush Yadav, Andrew Morton, Alexander Graf,
	Baoquan He, Changyuan Lyu, Chris Li, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Mon, Sep 15, 2025 at 05:12:27PM +0300, Mike Rapoport wrote:
> > I don't suppose I'd insist on it, but something to consider since you
> > are likely going to do another revision anyway.
> 
> I think vmalloc is as basic as folio.

vmalloc() ultimately calls vm_area_alloc_pages() -> alloc_pages_bulk_node_noprof()

KHO should have functions that clearly pair with the low level
allocators struct page related allocators, alloc_pages(order),
folio_alloc(), etc etc

ie if you call this allocator X then you call this kho preserve, this
kho restore, and this free function Y.

Under the covers it all uses the generic folio based code we already
have, but we should have appropriate wrappers around that code that
make clear these patterns.

Jason


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:37       ` Jason Gunthorpe
@ 2025-09-15 16:11         ` Mike Rapoport
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-15 16:11 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
	Chris Li, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
	linux-kernel

On Mon, Sep 15, 2025 at 11:37:30AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 15, 2025 at 05:01:01PM +0300, Mike Rapoport wrote:
> > > kzalloc() cannot be preserved, the only thing we support today is
> > > alloc_page(), so this code pattern shouldn't exist.
> >  
> > kzalloc(PAGE_SIZE) can be preserved, it's page aligned and we don't have to
> > restore it into a slab cache. But this maybe indeed confusing for those who
> > copy paste the code, so I'll change it.
> 
> It really isn't. The kzalloc should be returning frozen pages for that
> allocation and the restoration will not put frozen pages back.

But we don't care here, we free that page immediately to buddy on
restoration.
 
> Jason

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:43         ` Jason Gunthorpe
@ 2025-09-15 16:36           ` Mike Rapoport
  2025-09-16 13:05             ` Jason Gunthorpe
  0 siblings, 1 reply; 21+ messages in thread
From: Mike Rapoport @ 2025-09-15 16:36 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Pratyush Yadav, Pratyush Yadav, Andrew Morton, Alexander Graf,
	Baoquan He, Changyuan Lyu, Chris Li, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Mon, Sep 15, 2025 at 11:43:35AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 15, 2025 at 05:12:27PM +0300, Mike Rapoport wrote:
> > > I don't suppose I'd insist on it, but something to consider since you
> > > are likely going to do another revision anyway.
> > 
> > I think vmalloc is as basic as folio.
> 
> vmalloc() ultimately calls vm_area_alloc_pages() -> alloc_pages_bulk_node_noprof()
> 
> KHO should have functions that clearly pair with the low level
> allocators struct page related allocators, alloc_pages(order),
> folio_alloc(), etc etc
> 
> ie if you call this allocator X then you call this kho preserve, this
> kho restore, and this free function Y.
> 
> Under the covers it all uses the generic folio based code we already
> have, but we should have appropriate wrappers around that code that
> make clear these patterns.

Right, but that does not mean that vmalloc preserve/restore should use the
public KHO APIs and avoid using internal methods.
 
> Jason

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:08     ` Mike Rapoport
@ 2025-09-16 12:43       ` Pratyush Yadav
  0 siblings, 0 replies; 21+ messages in thread
From: Pratyush Yadav @ 2025-09-16 12:43 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Pratyush Yadav, Andrew Morton, Alexander Graf, Baoquan He,
	Changyuan Lyu, Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

Hi Mike,

On Mon, Sep 15 2025, Mike Rapoport wrote:

> On Mon, Sep 08, 2025 at 08:12:59PM +0200, Pratyush Yadav wrote:
>> On Mon, Sep 08 2025, Mike Rapoport wrote:
>> 
>> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>> >
>> > A vmalloc allocation is preserved using binary structure similar to
>> > global KHO memory tracker. It's a linked list of pages where each page
>> > is an array of physical address of pages in vmalloc area.
>> >
>> > kho_preserve_vmalloc() hands out the physical address of the head page
>> > to the caller. This address is used as the argument to
>> > kho_vmalloc_restore() to restore the mapping in the vmalloc address
>> > space and populate it with the preserved pages.
>> >
>> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
[...]
>> > +	while (chunk) {
>> > +		struct page *page;
>> > +
>> > +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
>> > +			phys_addr_t phys = chunk->phys[i];
>> > +
>> > +			for (int j = 0; j < (1 << order); j++) {
>> > +				page = phys_to_page(phys);
>> > +				kho_restore_page(page, 0);
>> > +				pages[idx++] = page;
>> 
>> This can buffer-overflow if the previous kernel was buggy and added too
>> many pages. Perhaps keep check for this?
>
> You mean it added more than total_pages?
> But the preserve part adds exactly vm->nr_pages, so once we get it right
> what bugs do you expect here? 

The main reason is defensive programming. My mental model is to treat
the data from the previous kernel as external input, and so we should
always validate it. The kernel that did the preserve and the kernel that
does the restore are very different, and between them either may have a
bug or some incompatibility/disagreement on the data format. Catching
these inconsistencies and failing gracefully is a lot better than
silently corrupting memory and having hard to catch bugs. No one plans
to add bugs, but they always show up somehow :-)

And we do a lot of that KHO already. See the FDT parsing in memblock
preservation for example.

	p_start = fdt_getprop(fdt, offset, "start", &len_start);
	p_size = fdt_getprop(fdt, offset, "size", &len_size);
	if (!p_start || len_start != sizeof(*p_start) || !p_size ||
	    len_size != sizeof(*p_size)) {
		return false;
	}
	[...]

It checks that FDT contains sane data or errors out otherwise. Similar
checks exist in other places too.

>  
>> > +				phys += PAGE_SIZE;
>> > +			}
>> > +		}
>> > +
>> > +		page = virt_to_page(chunk);
>> > +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
>> > +		kho_restore_page(page, 0);
>> > +		__free_page(page);
>> > +	}
>> > +
>> > +	area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, flags,
>> > +				  VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
>> > +				  GFP_KERNEL, __builtin_return_address(0));
>> > +	if (!area)
>> > +		goto err_free_pages_array;
>> > +
>> > +	addr = (unsigned long)area->addr;
>> > +	size = get_vm_area_size(area);
>> > +	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
>> > +	if (err)
>> > +		goto err_free_vm_area;
>> > +
>> > +	return area->addr;
>> 
>> You should free the pages array before returning here.
>
> Why? They get into vm->pages.

Do they? I don't see any path in vmap_pages_range() taking a reference
to the pages array. They use the array, but never take a reference to
it.

The core of vmap_pages_range() is in __vmap_pages_range_noflush(). That
function has two paths:

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					page_to_phys(pages[i]), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;

The latter path (the for loop) _uses_ pages but never takes a reference
to it. The former, vmap_small_pages_range_noflush(), goes through the
sequence of functions vmap_pages_{p4d,pud,pmd}_range(), eventually
landing in vmap_pages_pte_range(). That function's only use of the pages
array is the below:

	do {
		struct page *page = pages[*nr];
		[...]
	} while (pte++, addr += PAGE_SIZE, addr != end);

So I don't see anything setting area->pages here.

Note that vmap() does set area->pages if given the VM_MAP_PUT_PAGES
flag. It also calls vmap_pages_range() first and then sets area->pages
and area->nr_pages. So if you want to mimic that behaviour, I think you
have to explicitly make these assignments in kho_restore_vmalloc().

Similarly for __vmalloc_area_node(), it does area->pages =
__vmalloc_node_noprof() and then does vmap_pages_range() (also sets
area->nr_pages).

Side note: would it be a better idea to teach vmap() to take higher
order pages instead, to make this path simpler? I don't know the subtle
differences between these mapping primitives to know if that makes
sense. It seems to do pretty much what this function is doing with the
exception of only dealing with 0-order pages.

-- 
Regards,
Pratyush Yadav


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 14:12       ` Mike Rapoport
  2025-09-15 14:43         ` Jason Gunthorpe
@ 2025-09-16 12:48         ` Pratyush Yadav
  2025-09-16 14:41           ` Mike Rapoport
  1 sibling, 1 reply; 21+ messages in thread
From: Pratyush Yadav @ 2025-09-16 12:48 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Pratyush Yadav, Pratyush Yadav, Andrew Morton, Alexander Graf,
	Baoquan He, Changyuan Lyu, Chris Li, Jason Gunthorpe,
	Pasha Tatashin, kexec, linux-mm, linux-kernel

On Mon, Sep 15 2025, Mike Rapoport wrote:

> On Tue, Sep 09, 2025 at 04:33:27PM +0200, Pratyush Yadav wrote:
>> Hi Mike,
>> 
>> Couple more thoughts.
>> 
>> On Mon, Sep 08 2025, Pratyush Yadav wrote:
>> > On Mon, Sep 08 2025, Mike Rapoport wrote:
>> >> +
>> >> +	while (chunk) {
>> >> +		struct page *page;
>> >> +
>> >> +		for (int i = 0; i < chunk->hdr.num_elms; i++) {
>> >> +			phys_addr_t phys = chunk->phys[i];
>> >> +
>> >> +			for (int j = 0; j < (1 << order); j++) {
>> >> +				page = phys_to_page(phys);
>> >> +				kho_restore_page(page, 0);
>> >> +				pages[idx++] = page;
>> >
>> > This can buffer-overflow if the previous kernel was buggy and added too
>> > many pages. Perhaps keep check for this?
>> 
>> Thinking about this a bit more, I think this should check that we found
>> _exactly_ chunk->hdr.total_pages pages, and should error out otherwise.
>> If too few are found, pages array will contain bogus data, if too many,
>> buffer overflow.
>
> Sure, I can add the checks, but it feels superfluous to me.

See my reasoning in the other reply:
https://lore.kernel.org/linux-mm/mafs0frcmin3t.fsf@kernel.org/

>  
>> Also, I am not a fan of using kho_restore_page() directly. I think the
>> vmalloc preservation is a layer above core KHO, and it should use the
>> public KHO APIs. It really doesn't need to poke into internal APIs. If
>> any of the public APIs are insufficient, we should add new ones.
>> 
>> I don't suppose I'd insist on it, but something to consider since you
>> are likely going to do another revision anyway.
>
> I think vmalloc is as basic as folio. At some point we probably converge to 
> kho_preserve(void *) that will choose the right internal handler. like
> folio, vmalloc, kmalloc etc.

Sure, but do you need to use the internal APIs? Because doing this way
would miss some improvements for the public APIs. See my patch for
adding more sanity checking to kho_restore_folio() for example:
https://lore.kernel.org/linux-mm/20250910153443.95049-1-pratyush@kernel.org/

vmalloc preservation would miss this improvement since it uses the
internal API, even though it will clearly benefit from it.

-- 
Regards,
Pratyush Yadav


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-15 16:36           ` Mike Rapoport
@ 2025-09-16 13:05             ` Jason Gunthorpe
  2025-09-16 13:21               ` Pratyush Yadav
  2025-09-16 14:34               ` Mike Rapoport
  0 siblings, 2 replies; 21+ messages in thread
From: Jason Gunthorpe @ 2025-09-16 13:05 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Pratyush Yadav, Pratyush Yadav, Andrew Morton, Alexander Graf,
	Baoquan He, Changyuan Lyu, Chris Li, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Mon, Sep 15, 2025 at 07:36:25PM +0300, Mike Rapoport wrote:
> > Under the covers it all uses the generic folio based code we already
> > have, but we should have appropriate wrappers around that code that
> > make clear these patterns.
> 
> Right, but that does not mean that vmalloc preserve/restore should use the
> public KHO APIs and avoid using internal methods.

I think it does, the same way vmalloc is layered on top of the buddy
allocator. Why wouldn't you build things in clean understandable
layers like this?

Jason


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-16 13:05             ` Jason Gunthorpe
@ 2025-09-16 13:21               ` Pratyush Yadav
  2025-09-16 14:34               ` Mike Rapoport
  1 sibling, 0 replies; 21+ messages in thread
From: Pratyush Yadav @ 2025-09-16 13:21 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Mike Rapoport, Pratyush Yadav, Pratyush Yadav, Andrew Morton,
	Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
	Pasha Tatashin, kexec, linux-mm, linux-kernel

On Tue, Sep 16 2025, Jason Gunthorpe wrote:

> On Mon, Sep 15, 2025 at 07:36:25PM +0300, Mike Rapoport wrote:
>> > Under the covers it all uses the generic folio based code we already
>> > have, but we should have appropriate wrappers around that code that
>> > make clear these patterns.
>> 
>> Right, but that does not mean that vmalloc preserve/restore should use the
>> public KHO APIs and avoid using internal methods.
>
> I think it does, the same way vmalloc is layered on top of the buddy
> allocator. Why wouldn't you build things in clean understandable
> layers like this?

+1

-- 
Regards,
Pratyush Yadav


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-16 13:05             ` Jason Gunthorpe
  2025-09-16 13:21               ` Pratyush Yadav
@ 2025-09-16 14:34               ` Mike Rapoport
  1 sibling, 0 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-16 14:34 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Pratyush Yadav, Pratyush Yadav, Andrew Morton, Alexander Graf,
	Baoquan He, Changyuan Lyu, Chris Li, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Tue, Sep 16, 2025 at 10:05:16AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 15, 2025 at 07:36:25PM +0300, Mike Rapoport wrote:
> > > Under the covers it all uses the generic folio based code we already
> > > have, but we should have appropriate wrappers around that code that
> > > make clear these patterns.
> > 
> > Right, but that does not mean that vmalloc preserve/restore should use the
> > public KHO APIs and avoid using internal methods.
> 
> I think it does, the same way vmalloc is layered on top of the buddy
> allocator. Why wouldn't you build things in clean understandable
> layers like this?

Here it's more like __vmalloc_node_range_noprof() and get_vm_area() calling
__get_vm_area()
 
> Jason

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3 1/2] kho: add support for preserving vmalloc allocations
  2025-09-16 12:48         ` Pratyush Yadav
@ 2025-09-16 14:41           ` Mike Rapoport
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Rapoport @ 2025-09-16 14:41 UTC (permalink / raw)
  To: Pratyush Yadav
  Cc: Pratyush Yadav, Andrew Morton, Alexander Graf, Baoquan He,
	Changyuan Lyu, Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec,
	linux-mm, linux-kernel

On Tue, Sep 16, 2025 at 02:48:55PM +0200, Pratyush Yadav wrote:
> On Mon, Sep 15 2025, Mike Rapoport wrote:
> 
> > On Tue, Sep 09, 2025 at 04:33:27PM +0200, Pratyush Yadav wrote:
> >> Hi Mike,
> >> 
> >> Also, I am not a fan of using kho_restore_page() directly. I think the
> >> vmalloc preservation is a layer above core KHO, and it should use the
> >> public KHO APIs. It really doesn't need to poke into internal APIs. If
> >> any of the public APIs are insufficient, we should add new ones.
> >> 
> >> I don't suppose I'd insist on it, but something to consider since you
> >> are likely going to do another revision anyway.
> >
> > I think vmalloc is as basic as folio. At some point we probably converge to 
> > kho_preserve(void *) that will choose the right internal handler. like
> > folio, vmalloc, kmalloc etc.
> 
> Sure, but do you need to use the internal APIs? Because doing this way
> would miss some improvements for the public APIs. See my patch for
> adding more sanity checking to kho_restore_folio() for example:
> https://lore.kernel.org/linux-mm/20250910153443.95049-1-pratyush@kernel.org/
> 
> vmalloc preservation would miss this improvement since it uses the
> internal API, even though it will clearly benefit from it.

The core restore API is kho_restore_page() and the improvements should land
there, IMO.

Then whatever uses that core API will benefit from them.

-- 
Sincerely yours,
Mike.


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2025-09-16 14:41 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-08 10:35 [PATCH v3 0/2] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-08 10:35 ` [PATCH v3 1/2] " Mike Rapoport
2025-09-08 14:14   ` Jason Gunthorpe
2025-09-15 14:01     ` Mike Rapoport
2025-09-15 14:37       ` Jason Gunthorpe
2025-09-15 16:11         ` Mike Rapoport
2025-09-08 18:12   ` Pratyush Yadav
2025-09-08 18:18     ` Jason Gunthorpe
2025-09-09 14:33     ` Pratyush Yadav
2025-09-15 14:12       ` Mike Rapoport
2025-09-15 14:43         ` Jason Gunthorpe
2025-09-15 16:36           ` Mike Rapoport
2025-09-16 13:05             ` Jason Gunthorpe
2025-09-16 13:21               ` Pratyush Yadav
2025-09-16 14:34               ` Mike Rapoport
2025-09-16 12:48         ` Pratyush Yadav
2025-09-16 14:41           ` Mike Rapoport
2025-09-15 14:08     ` Mike Rapoport
2025-09-16 12:43       ` Pratyush Yadav
2025-09-08 10:35 ` [PATCH v3 2/2] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
  -- strict thread matches above, loose matches on Subject: below --
2025-09-07  7:00 [PATCH v2 0/2] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-07  7:00 ` [PATCH v3 1/2] " Mike Rapoport

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).