Linux Documentation
 help / color / mirror / Atom feed
* [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
       [not found] <cover.1780676742.git.tarunsahu@google.com>
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-07  0:35   ` tarunsahu
  2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

From: Pasha Tatashin <pasha.tatashin@soleen.com>

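The core liveupdate mechanism allows userspace to preserve file
descriptors. However, kernel subsystems often manage struct file
objects directly and need to participate in the preservation process
programmatically without relying solely on userspace interaction.

Add liveupdate_get_token_outgoing() and liveupdate_get_file_incoming()
for such in-kernel callers, and pass the owning session to the file
handler callbacks via struct liveupdate_file_op_args.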

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 include/linux/liveupdate.h       | 21 ++++++++++
 kernel/liveupdate/luo_file.c     | 69 ++++++++++++++++++++++++++++++++
 kernel/liveupdate/luo_internal.h | 17 ++++++++
 3 files changed, 107 insertions(+)

diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 30c5a39ff9e9..de052438eaac 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -24,6 +24,7 @@ struct file;
 /**
  * struct liveupdate_file_op_args - Arguments for file operation callbacks.
  * @handler:          The file handler being called.
+ * @session:          The session this file belongs to.
  * @retrieve_status:  The retrieve status for the 'can_finish / finish'
  *                    operation. A value of 0 means the retrieve has not been
  *                    attempted, a positive value means the retrieve was
@@ -44,6 +45,7 @@ struct file;
  */
 struct liveupdate_file_op_args {
 	struct liveupdate_file_handler *handler;
+	struct liveupdate_session *session;
 	int retrieve_status;
 	struct file *file;
 	u64 serialized_data;
@@ -240,6 +242,13 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
 
 int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
 int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
+/* kernel can internally retrieve files */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+				 struct file **filep);
+
+/* Get a token for an outgoing file, or -ENOENT if file is not preserved */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+				  struct file *file, u64 *tokenp);
 
 #else /* CONFIG_LIVEUPDATE */
 
@@ -285,5 +294,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb,
 	return -EOPNOTSUPP;
 }
 
+static inline int liveupdate_get_file_incoming(struct liveupdate_session *s,
+					       u64 token, struct file **filep)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+						struct file *file, u64 *tokenp)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_LIVEUPDATE */
 #endif /* _LINUX_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a0a419085e28..0aa0b4e5339f 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -323,6 +323,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
 	mutex_init(&luo_file->mutex);
 
 	args.handler = fh;
+	args.session = luo_session_from_file_set(file_set);
 	args.file = file;
 	err = fh->ops->preserve(&args);
 	if (err)
@@ -380,6 +381,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
 					   struct luo_file, list);
 
 		args.handler = luo_file->fh;
+		args.session = luo_session_from_file_set(file_set);
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
 		args.private_data = luo_file->private_data;
@@ -411,6 +413,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
 		struct liveupdate_file_op_args args = {0};
 
 		args.handler = luo_file->fh;
+		args.session = luo_session_from_file_set(file_set);
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
 		args.private_data = luo_file->private_data;
@@ -432,6 +435,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
 		struct liveupdate_file_op_args args = {0};
 
 		args.handler = luo_file->fh;
+		args.session = luo_session_from_file_set(file_set);
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
 		args.private_data = luo_file->private_data;
@@ -621,6 +625,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
 	}
 
 	args.handler = luo_file->fh;
+	args.session = luo_session_from_file_set(file_set);
 	args.serialized_data = luo_file->serialized_data;
 	err = luo_file->fh->ops->retrieve(&args);
 	if (err) {
@@ -654,6 +659,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
 		struct liveupdate_file_op_args args = {0};
 
 		args.handler = luo_file->fh;
+		args.session = luo_session_from_file_set(file_set);
 		args.file = luo_file->file;
 		args.serialized_data = luo_file->serialized_data;
 		args.retrieve_status = luo_file->retrieve_status;
@@ -671,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
 	guard(mutex)(&luo_file->mutex);
 
 	args.handler = luo_file->fh;
+	args.session = luo_session_from_file_set(file_set);
 	args.file = luo_file->file;
 	args.serialized_data = luo_file->serialized_data;
 	args.retrieve_status = luo_file->retrieve_status;
@@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
 	luo_flb_unregister_all(fh);
 	list_del(&ACCESS_PRIVATE(fh, list));
 }
+EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
+
+/**
+ * liveupdate_get_token_outgoing - Get the token for a preserved file.
+ * @s:      The outgoing liveupdate session.
+ * @file:   The file object to search for.
+ * @tokenp: Output parameter for the found token.
+ *
+ * Searches the list of preserved files in an outgoing session for a matching
+ * file object. If found, the corresponding user-provided token is returned.
+ *
+ * This function is intended for in-kernel callers that need to correlate a
+ * file with its liveupdate token.
+ *
+ * Context: Must be called with the session mutex held.
+ * Return: 0 on success, -ENOENT if the file is not preserved in this session.
+ */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+				  struct file *file, u64 *tokenp)
+{
+	struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
+	struct luo_file *luo_file;
+	int err = -ENOENT;
+
+	list_for_each_entry(luo_file, &file_set->files_list, list) {
+		if (luo_file->file == file) {
+			if (tokenp)
+				*tokenp = luo_file->token;
+			err = 0;
+			break;
+		}
+	}
+
+	return err;
+}
+
+/**
+ * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
+ * @s:      The incoming liveupdate session (restored from the previous kernel).
+ * @token:  The unique token identifying the file to retrieve.
+ * @filep:  On success, this will be populated with a pointer to the retrieved
+ *          'struct file'.
+ *
+ * Provides a kernel-internal API for other subsystems to retrieve their
+ * preserved files after a live update. This function is a simple wrapper
+ * around luo_retrieve_file(), allowing callers to find a file by its token.
+ *
+ * The caller receives a new reference to the file and must call fput() when it
+ * is no longer needed. The file's lifetime is managed by LUO and any userspace
+ * file descriptors. If the caller needs to hold a reference to the file beyond
+ * the immediate scope, it must call get_file() itself.
+ *
+ * Context: Must be called with the mutex of a restored session held.
+ * Return: 0 on success. Returns -ENOENT if no file with the matching token is
+ *         found, or any other negative errno on failure.
+ */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+				 struct file **filep)
+{
+	return luo_retrieve_file(luo_file_set_from_session_locked(s),
+				 token, filep);
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 875844d7a41d..08b198802e7f 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -79,6 +79,23 @@ struct luo_session {
 
 extern struct rw_semaphore luo_register_rwlock;
 
+static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set)
+{
+	struct luo_session *session;
+
+	session = container_of(file_set, struct luo_session, file_set);
+
+	return (struct liveupdate_session *)session;
+}
+
+static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s)
+{
+	struct luo_session *session = (struct luo_session *)s;
+
+	lockdep_assert_held(&session->mutex);
+	return &session->file_set;
+}
+
 int luo_session_create(const char *name, struct file **filep);
 int luo_session_retrieve(const char *name, struct file **filep);
 int __init luo_session_setup_outgoing(void *fdt);
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option
       [not found] <cover.1780676742.git.tarunsahu@google.com>
  2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Introduce the LIVEUPDATE_GUEST_MEMFD Kconfig option. This option
enables live update support for KVM guest_memfd files, enabling
guest_memfd-backed memory preservation across kernel upgrades.

Currently this supports only guest_memfd files that are fully shared
and pre-faulted.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 kernel/liveupdate/Kconfig | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index 1a8513f16ef7..0bbc4037192e 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -88,4 +88,19 @@ config LIVEUPDATE_MEMFD
 
 	  If unsure, say N.
 
+config LIVEUPDATE_GUEST_MEMFD
+	bool "Live update support for guest_memfd"
+	depends on LIVEUPDATE
+	depends on KVM_GUEST_MEMFD
+	default LIVEUPDATE
+	help
+	  Enable live update support for KVM guest_memfd files. This allows
+	  preserving VM memory backed by guest_memfd files across kernel live
+	  updates.
+
+	  This can only be used for guest_memfd files that are fully shared
+	  and pre-faulted.
+
+	  If unsure, say N.
+
 endmenu
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
       [not found] <cover.1780676742.git.tarunsahu@google.com>
  2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-22 23:59   ` Ackerley Tng
  2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Introduce core infrastructure to support VM preservation with LUO.

The first two changes are pure refactoring with no functional change;
the third introduces a new member in struct kvm.
- Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
- Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
  and anon_inode_getfile() to provide a unified VM file creation API.
- Track a weak reference to the backing file in struct kvm under
  CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
  without circular lifetime dependencies.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 include/linux/kvm_host.h | 14 +++++++
 virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
 virt/kvm/kvm_mm.h        |  3 ++
 3 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb06..9111a28637af 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -874,6 +874,18 @@ struct kvm {
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 	/* Protected by slots_lock (for writes) and RCU (for reads) */
 	struct xarray mem_attr_array;
+#endif
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+	/*
+	 * Weak reference to the VFS file backing this KVM instance. Stored
+	 * without incrementing the file refcount to prevent a circular lifetime
+	 * dependency (since file->private_data already pins this struct kvm).
+	 * Used exclusively to resolve the file pointer back from struct kvm.
+	 *
+	 * Written/cleared via rcu_assign_pointer() and read locklessly under
+	 * RCU (e.g. via get_file_active() to prevent ABA races).
+	 */
+	struct file *vm_file;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
 };
@@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
 bool kvm_get_kvm_safe(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 bool file_is_kvm(struct file *file);
+struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
+void kvm_uevent_notify_vm_create(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1..65f0c5fb353e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -67,9 +67,6 @@
 #include <linux/kvm_dirty_ring.h>
 
 
-/* Worst case buffer size needed for holding an integer. */
-#define ITOA_MAX_LEN 12
-
 MODULE_AUTHOR("Qumranet");
 MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
 MODULE_LICENSE("GPL");
@@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
 	struct kvm *kvm = filp->private_data;
 
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+	/*
+	 * Clear the weak reference of the vm file.
+	 * In case vm file is closed by userspace, but kvm still has
+	 * other users like vCPUs, clearing this pointer ensures
+	 * that we don't have a dangling pointer to a closed file.
+	 *
+	 * Cleared via rcu_assign_pointer() to ensure proper memory visibility
+	 * for concurrent lockless readers under RCU.
+	 */
+	rcu_assign_pointer(kvm->vm_file, NULL);
+#endif
+
 	kvm_irqfd_release(kvm);
 
 	kvm_put_kvm(kvm);
@@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
 
+struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
+{
+	struct kvm *kvm = kvm_create_vm(type, fdname);
+	struct file *file;
+
+	if (IS_ERR(kvm))
+		return ERR_CAST(kvm);
+
+	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	if (IS_ERR(file)) {
+		kvm_put_kvm(kvm);
+		return file;
+	}
+
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+	/*
+	 * Weak reference to the file (without get_file()) to prevent a circular
+	 * dependency. Safe because the file's release path clears this pointer
+	 * and drops its reference to the VM.
+	 *
+	 * Written via rcu_assign_pointer() because the pointer can be read
+	 * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
+	 * get_file_active() to prevent lockless ABA races).
+	 */
+	rcu_assign_pointer(kvm->vm_file, file);
+#endif
+
+	/*
+	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
+	 * already set, with ->release() being kvm_vm_release().  In error
+	 * cases it will be called by the final fput(file) and will take
+	 * care of doing kvm_put_kvm(kvm).
+	 */
+
+	return file;
+}
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
 	char fdname[ITOA_MAX_LEN + 1];
 	int r, fd;
-	struct kvm *kvm;
 	struct file *file;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
@@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 
 	snprintf(fdname, sizeof(fdname), "%d", fd);
 
-	kvm = kvm_create_vm(type, fdname);
-	if (IS_ERR(kvm)) {
-		r = PTR_ERR(kvm);
-		goto put_fd;
-	}
-
-	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	file = kvm_create_vm_file(type, fdname);
 	if (IS_ERR(file)) {
 		r = PTR_ERR(file);
-		goto put_kvm;
+		goto put_fd;
 	}
 
-	/*
-	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
-	 * already set, with ->release() being kvm_vm_release().  In error
-	 * cases it will be called by the final fput(file) and will take
-	 * care of doing kvm_put_kvm(kvm).
-	 */
-	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
+	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
 
 	fd_install(fd, file);
 	return fd;
 
-put_kvm:
-	kvm_put_kvm(kvm);
 put_fd:
 	put_unused_fd(fd);
 	return r;
@@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	kfree(env);
 }
 
+void kvm_uevent_notify_vm_create(struct kvm *kvm)
+{
+	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
+}
+
 static void kvm_init_debug(void)
 {
 	const struct file_operations *fops;
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 9fcc5d5b7f8d..7aa1d65c3d46 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -3,6 +3,9 @@
 #ifndef __KVM_MM_H__
 #define __KVM_MM_H__ 1
 
+/* Worst case buffer size needed for holding an integer as a string. */
+#define ITOA_MAX_LEN 12
+
 /*
  * Architectures can choose whether to use an rwlock or spinlock
  * for the mmu_lock.  These macros, for use in common code
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (2 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Introduce KVM VM preservation support for Live Update Orchestrator.

Register an LUO file handler for KVM files to serialize and
deserialize necessary VM state across live updates. Currently, this
preserves the VM type. This implementation provides the necessary
infrastructure and dependencies for the upcoming guest_memfd
preservation support. And it can be extended to preserve more vm
state in future.

Retrieve simply creates a new kvm instance and populates it with the
retrieved data. The only catch is that there is no way to know which fd
will be assigned to this kvm file, so an atomically incremented id is
used for the fdname instead.

This change also updates the MAINTAINERS list for kvm_luo.c.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>

---
My only worry is whether userspace strictly depends on the fdname being
consistent with vm_fd. More details are discussed in the cover letter.
I would really appreciate alternatives/other approaches.
---
 MAINTAINERS                 |  11 +++
 include/linux/kho/abi/kvm.h |  39 ++++++++
 virt/kvm/Makefile.kvm       |   1 +
 virt/kvm/kvm_luo.c          | 190 ++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c         |   8 ++
 virt/kvm/kvm_mm.h           |   8 ++
 6 files changed, 257 insertions(+)
 create mode 100644 include/linux/kho/abi/kvm.h
 create mode 100644 virt/kvm/kvm_luo.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9ec290e38b44..9bfc3c1f6676 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14409,6 +14409,17 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/leds/backlight/kinetic,ktz8866.yaml
 F:	drivers/video/backlight/ktz8866.c
 
+KVM LIVE UPDATE
+M:	Pasha Tatashin <pasha.tatashin@soleen.com>
+M:	Mike Rapoport <rppt@kernel.org>
+M:	Pratyush Yadav <pratyush@kernel.org>
+R:	Tarun Sahu <tarunsahu@google.com>
+L:	kexec@lists.infradead.org
+L:	kvm@vger.kernel.org
+S:	Maintained
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F:	virt/kvm/kvm_luo.c
+
 KVM PARAVIRT (KVM/paravirt)
 M:	Paolo Bonzini <pbonzini@redhat.com>
 R:	Vitaly Kuznetsov <vkuznets@redhat.com>
diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
new file mode 100644
index 000000000000..718db68a541a
--- /dev/null
+++ b/include/linux/kho/abi/kvm.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * KVM Preservation ABI for Live Update Orchestrator (LUO)
+ */
+#ifndef _LINUX_KHO_ABI_KVM_H
+#define _LINUX_KHO_ABI_KVM_H
+
+#include <linux/types.h>
+#include <linux/kho/abi/kexec_handover.h>
+
+/**
+ * DOC: KVM Live Update ABI
+ *
+ * KVM uses the ABI defined below for preserving its state
+ * across a kexec reboot using the LUO.
+ *
+ * The state is serialized into a packed structure `struct kvm_luo_ser`
+ * which is handed over to the next kernel via the KHO mechanism.
+ *
+ * This interface is a contract. Any modification to the structure layout
+ * constitutes a breaking change. Such changes require incrementing the
+ * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
+ */
+
+/**
+ * struct kvm_luo_ser - Main serialization structure for a KVM VM.
+ * @type:         The type of VM.
+ */
+struct kvm_luo_ser {
+	u64 type;
+} __packed;
+
+/* The compatibility string for KVM VM file handler */
+#define KVM_LUO_FH_COMPATIBLE	"kvm_vm_luo_v1"
+
+#endif /* _LINUX_KHO_ABI_KVM_H */
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index d047d4cf58c9..c1a962159264 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -13,3 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
 kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
+kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
diff --git a/virt/kvm/kvm_luo.c b/virt/kvm/kvm_luo.c
new file mode 100644
index 000000000000..25619f94ace5
--- /dev/null
+++ b/virt/kvm/kvm_luo.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * KVM VM Preservation for Live Update Orchestrator (LUO)
+ */
+
+/**
+ * DOC: KVM VM Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * KVM virtual machines (VMs) can be preserved over a kexec reboot using the
+ * Live Update Orchestrator (LUO) file preservation. This allows userspace
+ * to preserve KVM VM state across kexec reboots.
+ *
+ * The preservation is not intended to be fully transparent. Only specific
+ * VM configuration and state are preserved, while other aspects of the VM
+ * must be re-established or re-configured by userspace after retrieval.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the KVM VM are preserved across kexec:
+ *
+ * VM Type
+ *   The VM type (e.g., on x86 architecture, the vm_type parameter) is
+ *   preserved.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * The preservation does not cover:
+ *
+ * - vCPUs and vCPU states
+ * - Memslots (memory slot layout)
+ * - Interrupt controllers and IRQ routings
+ * - Coalesced MMIO zones
+ * - Device bindings (VFIO/Eventfds)
+ * - Active paging or guest registers state
+ * - etc
+ */
+#include <linux/liveupdate.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/anon_inodes.h>
+#include <linux/magic.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
+#include <linux/kho/abi/kvm.h>
+#include "kvm_mm.h"
+
+static bool kvm_luo_can_preserve(struct liveupdate_file_handler *handler,
+				 struct file *file)
+{
+	return file_is_kvm(file);
+}
+
+static int kvm_luo_preserve(struct liveupdate_file_op_args *args)
+{
+	struct kvm *kvm = args->file->private_data;
+	struct kvm_luo_ser *ser;
+
+	if (kvm->vm_dead || kvm->vm_bugged)
+		return -EINVAL;
+
+	ser = kho_alloc_preserve(sizeof(*ser));
+	if (IS_ERR(ser))
+		return PTR_ERR(ser);
+
+#ifdef CONFIG_X86
+	ser->type = kvm->arch.vm_type;
+#else
+	ser->type = 0;
+#endif
+
+	args->serialized_data = virt_to_phys(ser);
+
+	return 0;
+}
+
+static atomic_t restored_vm_id = ATOMIC_INIT(0);
+
+static int kvm_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+	char fdname[ITOA_MAX_LEN + 1];
+	struct kvm_luo_ser *ser;
+	struct file *file;
+	struct kvm *kvm;
+	int err = 0;
+
+	if (!args->serialized_data)
+		return -EINVAL;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	snprintf(fdname, sizeof(fdname), "%d",
+		 atomic_inc_return(&restored_vm_id));
+
+	file = kvm_create_vm_file(ser->type, fdname);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_free_ser;
+	}
+
+	kvm = file->private_data;
+
+	args->file = file;
+	kho_restore_free(ser);
+
+	kvm_uevent_notify_vm_create(kvm);
+	return 0;
+
+err_free_ser:
+	kho_restore_free(ser);
+	return err;
+}
+
+static void kvm_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct kvm_luo_ser *ser;
+
+	/*
+	 * in case preservation failed, args->serialized_data will
+	 * be NULL and kvm_luo_preserve takes care of cleaning up.
+	 * If preserve succeeds, this condition fails and unpreserve
+	 * function takes care of cleaning up.
+	 */
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	kho_unpreserve_free(ser);
+}
+
+static void kvm_luo_finish(struct liveupdate_file_op_args *args)
+{
+	struct kvm_luo_ser *ser;
+
+	/*
+	 * If retrieve_status is non-zero (retrieve succeeded or failed),
+	 * nothing to do here: kvm_luo_retrieve() already cleaned up.
+	 */
+	if (args->retrieve_status)
+		return;
+
+	if (!args->serialized_data)
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+	kho_restore_free(ser);
+}
+
+static const struct liveupdate_file_ops kvm_luo_file_ops = {
+	.can_preserve = kvm_luo_can_preserve,
+	.preserve = kvm_luo_preserve,
+	.retrieve = kvm_luo_retrieve,
+	.unpreserve = kvm_luo_unpreserve,
+	.finish = kvm_luo_finish,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler kvm_luo_handler = {
+	.ops = &kvm_luo_file_ops,
+	.compatible = KVM_LUO_FH_COMPATIBLE,
+};
+
+int kvm_luo_init(void)
+{
+	int err = liveupdate_register_file_handler(&kvm_luo_handler);
+
+	if (err && err != -EOPNOTSUPP) {
+		pr_err("Could not register kvm_vm_luo handler: %pe\n", ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
+
+void kvm_luo_exit(void)
+{
+	liveupdate_unregister_file_handler(&kvm_luo_handler);
+}
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65f0c5fb353e..c70346906a89 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6576,6 +6576,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (r)
 		goto err_virt;
 
+	r = kvm_luo_init();
+	if (r)
+		goto err_luo;
+
 	/*
 	 * Registration _must_ be the very last thing done, as this exposes
 	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6589,6 +6593,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	return 0;
 
 err_register:
+	kvm_luo_exit();
+err_luo:
 	kvm_uninit_virtualization();
 err_virt:
 	kvm_gmem_exit();
@@ -6618,6 +6624,8 @@ void kvm_exit(void)
 	 */
 	misc_deregister(&kvm_dev);
 
+	kvm_luo_exit();
+
 	kvm_uninit_virtualization();
 
 	debugfs_remove_recursive(kvm_debugfs_dir);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 7aa1d65c3d46..118edc47df83 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -97,4 +97,12 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 }
 #endif /* CONFIG_KVM_GUEST_MEMFD */
 
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+int kvm_luo_init(void);
+void kvm_luo_exit(void);
+#else
+static inline int kvm_luo_init(void) { return 0; }
+static inline void kvm_luo_exit(void) {}
+#endif /* CONFIG_LIVEUPDATE_GUEST_MEMFD */
+
 #endif /* __KVM_MM_H__ */
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (3 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

To support guest_memfd memory preservation with LUO, guest_memfd luo
code needs to access guest_memfd internals and reconstruct guest_memfd
file instances from a preserved state.

Extract gmem_file, gmem_inode, and the GMEM_I() helper from guest_memfd.c
into a new internal header virt/kvm/guest_memfd.h.

Additionally, split __kvm_gmem_create() to expose a non-static
__kvm_gmem_create_file() helper. This helper returns a struct file
instead of a file descriptor, enabling file creation and initialization
without installing it into a file descriptor table.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 virt/kvm/guest_memfd.c | 68 +++++++++++++++++-------------------------
 virt/kvm/guest_memfd.h | 39 ++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 40 deletions(-)
 create mode 100644 virt/kvm/guest_memfd.h

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b2..6740ae2bf948 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,38 +7,12 @@
 #include <linux/mempolicy.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
+#include "guest_memfd.h"
 
 #include "kvm_mm.h"
 
 static struct vfsmount *kvm_gmem_mnt;
 
-/*
- * A guest_memfd instance can be associated multiple VMs, each with its own
- * "view" of the underlying physical memory.
- *
- * The gmem's inode is effectively the raw underlying physical storage, and is
- * used to track properties of the physical memory, while each gmem file is
- * effectively a single VM's view of that storage, and is used to track assets
- * specific to its associated VM, e.g. memslots=>gmem bindings.
- */
-struct gmem_file {
-	struct kvm *kvm;
-	struct xarray bindings;
-	struct list_head entry;
-};
-
-struct gmem_inode {
-	struct shared_policy policy;
-	struct inode vfs_inode;
-	struct list_head gmem_file_list;
-
-	u64 flags;
-};
-
-static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
-{
-	return container_of(inode, struct gmem_inode, vfs_inode);
-}
 
 #define kvm_gmem_for_each_file(f, inode) \
 	list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)
@@ -556,23 +530,17 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
 	return true;
 }
 
-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags)
 {
 	static const char *name = "[kvm-gmem]";
 	struct gmem_file *f;
 	struct inode *inode;
 	struct file *file;
-	int fd, err;
-
-	fd = get_unused_fd_flags(0);
-	if (fd < 0)
-		return fd;
+	int err;
 
 	f = kzalloc_obj(*f);
-	if (!f) {
-		err = -ENOMEM;
-		goto err_fd;
-	}
+	if (!f)
+		return ERR_PTR(-ENOMEM);
 
 	/* __fput() will take care of fops_put(). */
 	if (!fops_get(&kvm_gmem_fops)) {
@@ -611,8 +579,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	xa_init(&f->bindings);
 	list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);
 
-	fd_install(fd, file);
-	return fd;
+	return file;
 
 err_inode:
 	iput(inode);
@@ -620,7 +587,28 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	fops_put(&kvm_gmem_fops);
 err_gmem:
 	kfree(f);
-err_fd:
+	return ERR_PTR(err);
+}
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+{
+	struct file *file;
+	int fd, err;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	file = __kvm_gmem_create_file(kvm, size, flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_put_fd;
+	}
+
+	fd_install(fd, file);
+	return fd;
+
+err_put_fd:
 	put_unused_fd(fd);
 	return err;
 }
diff --git a/virt/kvm/guest_memfd.h b/virt/kvm/guest_memfd.h
new file mode 100644
index 000000000000..c528b046dd69
--- /dev/null
+++ b/virt/kvm/guest_memfd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_GUEST_MEMFD_H__
+#define __KVM_GUEST_MEMFD_H__ 1
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/mempolicy.h>
+
+/*
+ * A guest_memfd instance can be associated multiple VMs, each with its own
+ * "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */
+struct gmem_file {
+	struct kvm *kvm;
+	struct xarray bindings;
+	struct list_head entry;
+};
+
+struct gmem_inode {
+	struct shared_policy policy;
+	struct inode vfs_inode;
+	struct list_head gmem_file_list;
+
+	u64 flags;
+};
+
+static inline struct gmem_inode *GMEM_I(struct inode *inode)
+{
+	return container_of(inode, struct gmem_inode, vfs_inode);
+}
+
+struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags);
+
+#endif /* __KVM_GUEST_MEMFD_H__ */
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (4 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-22 23:54   ` Ackerley Tng
  2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

This patch introduces a freeze state on the gmem_inode which blocks
fallocate calls and any new page fault allocation. This prevents the
gmem file from being modified while it is being preserved.

An SRCU lock is used to synchronise the freeze call: the writer blocks
until all in-flight readers have finished, and readers are re-entrant.

In case a fault fails, it returns -EPERM and VM_EXITs to userspace.
Userspace must handle this properly, as every new fault will fail.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 virt/kvm/guest_memfd.c | 117 +++++++++++++++++++++++++++++++++++++----
 virt/kvm/guest_memfd.h |   5 ++
 2 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 6740ae2bf948..b94639cdf312 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,11 +7,13 @@
 #include <linux/mempolicy.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
+#include <linux/srcu.h>
 #include "guest_memfd.h"
 
 #include "kvm_mm.h"
 
 static struct vfsmount *kvm_gmem_mnt;
+static struct srcu_struct kvm_gmem_freeze_srcu;
 
 
 #define kvm_gmem_for_each_file(f, inode) \
@@ -96,6 +98,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 	/* TODO: Support huge pages. */
 	struct mempolicy *policy;
 	struct folio *folio;
+	int idx;
 
 	/*
 	 * Fast-path: See if folio is already present in mapping to avoid
@@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 	if (!IS_ERR(folio))
 		return folio;
 
+	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
+	if (kvm_gmem_is_frozen(inode)) {
+		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+		return ERR_PTR(-EPERM);
+	}
+
 	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
 	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
 					 FGP_LOCK | FGP_CREAT,
 					 mapping_gfp_mask(inode->i_mapping), policy);
 	mpol_cond_put(policy);
 
+	srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+
 	/*
 	 * External interfaces like kvm_gmem_get_pfn() support dealing
 	 * with hugepages to a degree, but internally, guest_memfd currently
@@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
 			       loff_t len)
 {
+	struct inode *inode = file_inode(file);
 	int ret;
+	int idx;
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE))
-		return -EOPNOTSUPP;
+	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
+	if (kvm_gmem_is_frozen(inode)) {
+		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+		return -EPERM;
+	}
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
-		return -EOPNOTSUPP;
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
-	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
-		return -EINVAL;
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
@@ -291,6 +316,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
 
 	if (!ret)
 		file_modified(file);
+
+out:
+	srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
 	return ret;
 }
 
@@ -944,7 +972,9 @@ static void kvm_gmem_destroy_inode(struct inode *inode)
 
 static void kvm_gmem_free_inode(struct inode *inode)
 {
-	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
+	struct gmem_inode *gi = GMEM_I(inode);
+
+	kmem_cache_free(kvm_gmem_inode_cachep, gi);
 }
 
 static const struct super_operations kvm_gmem_super_operations = {
@@ -1001,12 +1031,21 @@ int kvm_gmem_init(struct module *module)
 	if (!kvm_gmem_inode_cachep)
 		return -ENOMEM;
 
+	ret = init_srcu_struct(&kvm_gmem_freeze_srcu);
+	if (ret)
+		goto err_cache;
+
 	ret = kvm_gmem_init_mount();
-	if (ret) {
-		kmem_cache_destroy(kvm_gmem_inode_cachep);
-		return ret;
-	}
+	if (ret)
+		goto err_srcu;
+
 	return 0;
+
+err_srcu:
+	cleanup_srcu_struct(&kvm_gmem_freeze_srcu);
+err_cache:
+	kmem_cache_destroy(kvm_gmem_inode_cachep);
+	return ret;
 }
 
 void kvm_gmem_exit(void)
@@ -1014,5 +1053,61 @@ void kvm_gmem_exit(void)
 	kern_unmount(kvm_gmem_mnt);
 	kvm_gmem_mnt = NULL;
 	rcu_barrier();
+	cleanup_srcu_struct(&kvm_gmem_freeze_srcu);
 	kmem_cache_destroy(kvm_gmem_inode_cachep);
 }
+
+/**
+ * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
+ * @inode: The guest_memfd inode.
+ * @freeze: True to freeze, false to unfreeze.
+ *
+ * This API is used strictly during the live update / preservation transition
+ * window to prevent host userspace and guest-side faults from making any
+ * mapping modifications (such as fallocate or page fault allocation)
+ * to the guest_memfd page cache.
+ *
+ * Synchronization Strategy (Sleepable RCU):
+ * To avoid high-contention VFS locks (like inode_lock or
+ * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
+ * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
+ * (`kvm_gmem_freeze_srcu`):
+ *
+ * Global vs. Per-Inode SRCU
+ * ======================
+ * A single system-wide global static `srcu_struct` is used instead of a
+ * per-inode SRCU structure to completely prevent unprivileged users from
+ * exhausting the host's per-CPU memory allocator. Because
+ * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
+ * is not accounted by memory cgroups (memcg),
+ * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
+ * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
+ * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
+ *
+ * Flag Modification Note:
+ * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
+ * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
+ * flags are creation-time flags which remain strictly read-only), there is
+ * no possibility of concurrent bit-modification races. Therefore, a standard
+ * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
+ * synchronization loops.
+ */
+void kvm_gmem_freeze(struct inode *inode, bool freeze)
+{
+	u64 flags = READ_ONCE(GMEM_I(inode)->flags);
+
+	if (freeze)
+		flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
+	else
+		flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
+
+	WRITE_ONCE(GMEM_I(inode)->flags, flags);
+
+	if (freeze)
+		synchronize_srcu(&kvm_gmem_freeze_srcu);
+}
+
+bool kvm_gmem_is_frozen(struct inode *inode)
+{
+	return READ_ONCE(GMEM_I(inode)->flags) & GUEST_MEMFD_F_MAPPING_FROZEN;
+}
diff --git a/virt/kvm/guest_memfd.h b/virt/kvm/guest_memfd.h
index c528b046dd69..028c348a1023 100644
--- a/virt/kvm/guest_memfd.h
+++ b/virt/kvm/guest_memfd.h
@@ -29,11 +29,16 @@ struct gmem_inode {
 	u64 flags;
 };
 
+/* Internal kernel-only flags (must not overlap with UAPI flags) */
+#define GUEST_MEMFD_F_MAPPING_FROZEN	(1ULL << 63)
+
 static inline struct gmem_inode *GMEM_I(struct inode *inode)
 {
 	return container_of(inode, struct gmem_inode, vfs_inode);
 }
 
 struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags);
+void kvm_gmem_freeze(struct inode *inode, bool freeze);
+bool kvm_gmem_is_frozen(struct inode *inode);
 
 #endif /* __KVM_GUEST_MEMFD_H__ */
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (5 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-22 23:27   ` Ackerley Tng
  2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

This patch sets up the basic infrastructure to preserve guest_memfd.
Currently this supports only fully shared guest_memfd backed by
PAGE_SIZE pages.

It registers a new LUO file handler for guest_memfd files to serialize
and deserialize guest memory. This allows preserving guest memory backed
by guest_memfd across updates, ensuring that guest instances can be
resumed seamlessly without losing their memory contents.

Preservation is straightforward: it walks through the folios and
serializes them.

A kvm_gmem_freeze() call on preserve freezes the guest_memfd inode. It
prevents any changes to the inode mapping through fallocate calls or
new fault allocations (which fail) on or after preservation. There is
no need to check this during the page fault, as preservation is only
supported for pre-faulted/pre-allocated guest_memfd.

Retrieving the guest_memfd requires the struct kvm in order to create
the new guest_memfd. So it first gets the vm_file from the same session
using the token passed during preservation, and uses it to get
vm_file->kvm.

This change also updates the MAINTAINERS list.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 MAINTAINERS                 |   1 +
 include/linux/kho/abi/kvm.h |  79 +++++-
 virt/kvm/Makefile.kvm       |   2 +-
 virt/kvm/guest_memfd_luo.c  | 485 ++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c         |   7 +
 virt/kvm/kvm_mm.h           |   4 +
 6 files changed, 571 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/guest_memfd_luo.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9bfc3c1f6676..16cba790a84d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14418,6 +14418,7 @@ L:	kexec@lists.infradead.org
 L:	kvm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F:	virt/kvm/guest_memfd_luo.c
 F:	virt/kvm/kvm_luo.c
 
 KVM PARAVIRT (KVM/paravirt)
diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
index 718db68a541a..42074d76e04a 100644
--- a/include/linux/kho/abi/kvm.h
+++ b/include/linux/kho/abi/kvm.h
@@ -9,20 +9,23 @@
 #define _LINUX_KHO_ABI_KVM_H
 
 #include <linux/types.h>
+#include <linux/bits.h>
 #include <linux/kho/abi/kexec_handover.h>
 
 /**
- * DOC: KVM Live Update ABI
+ * DOC: KVM and guest_memfd Live Update ABI
  *
- * KVM uses the ABI defined below for preserving its state
+ * KVM and guest_memfd use the ABI defined below for preserving their states
  * across a kexec reboot using the LUO.
  *
- * The state is serialized into a packed structure `struct kvm_luo_ser`
- * which is handed over to the next kernel via the KHO mechanism.
+ * The state is serialized into packed structures (struct kvm_luo_ser and
+ * struct guest_memfd_luo_ser) which are handed over to the next kernel via
+ * the KHO mechanism.
  *
- * This interface is a contract. Any modification to the structure layout
+ * This interface is a contract. Any modification to the structure layouts
  * constitutes a breaking change. Such changes require incrementing the
- * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
+ * version number in the KVM_LUO_FH_COMPATIBLE or
+ * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
  */
 
 /**
@@ -36,4 +39,68 @@ struct kvm_luo_ser {
 /* The compatibility string for KVM VM file handler */
 #define KVM_LUO_FH_COMPATIBLE	"kvm_vm_luo_v1"
 
+/**
+ * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
+ * @pfn:   Page Frame Number of the folio.
+ * @index: Page offset of the folio within the file.
+ * @flags: State flags associated with the folio.
+ */
+struct guest_memfd_luo_folio_ser {
+	u64 pfn:52;
+	u64 flags:12;
+	u64 index;
+} __packed;
+
+/**
+ * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
+ *
+ * This flag is per folio to check if the folio is uptodate.
+ */
+#define GUEST_MEMFD_LUO_FOLIO_UPTODATE	BIT(0)
+
+
+/**
+ * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
+ *
+ * This flag indicates that the guest_memfd supports host-side mmap.
+ */
+#define GUEST_MEMFD_LUO_FLAG_MMAP		BIT(0)
+
+/**
+ * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
+ *
+ * This flag indicates that the guest_memfd has been initialized as shared
+ * memory.
+ */
+#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED	BIT(1)
+
+/**
+ * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
+ *
+ * A mask of all guest_memfd preservation flags supported by this version
+ * of the KVM LUO ABI.
+ */
+#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS	(GUEST_MEMFD_LUO_FLAG_MMAP | \
+						 GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
+
+/**
+ * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
+ * @size:      The size of the file in bytes.
+ * @flags:     File-level flags.
+ * @nr_folios: Number of folios in the folios array.
+ * @vm_token:  Token of the associated KVM VM instance.
+ * @folios:    KHO vmalloc descriptor pointing to the array of
+ *             struct guest_memfd_luo_folio_ser.
+ */
+struct guest_memfd_luo_ser {
+	u64 size;
+	u64 flags;
+	u64 nr_folios;
+	u64 vm_token;
+	struct kho_vmalloc folios;
+} __packed;
+
+/* The compatibility string for GUEST_MEMFD file handler */
+#define GUEST_MEMFD_LUO_FH_COMPATIBLE	"guest_memfd_luo_v1"
+
 #endif /* _LINUX_KHO_ABI_KVM_H */
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index c1a962159264..d30fca094c42 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
 kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
-kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
+kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
new file mode 100644
index 000000000000..d466f889c9aa
--- /dev/null
+++ b/virt/kvm/guest_memfd_luo.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * Guestmemfd Preservation for Live Update Orchestrator (LUO)
+ */
+
+/**
+ * DOC: Guestmemfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
+ * reboot using the Live Update Orchestrator (LUO) file preservation. This
+ * allows userspace to preserve VM memory across kexec reboots.
+ *
+ * The preservation is not intended to be transparent. Only select properties
+ * of the guest_memfd are preserved, while others are reset to default.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of guest_memfd are preserved across kexec:
+ *
+ * File Size
+ *   The size of the file is preserved.
+ *
+ * File Contents
+ *   All folios present in the page cache are preserved.
+ *
+ * File-level Flags
+ *   The file-level flags (such as MMAP support and INIT_SHARED default mapping)
+ *   are preserved.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * NUMA Memory Policy
+ *   NUMA memory policies associated with the guest_memfd are not preserved.
+ */
+#include <linux/liveupdate.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/anon_inodes.h>
+#include <linux/magic.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
+#include <linux/kho/abi/kvm.h>
+#include "guest_memfd.h"
+
+static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
+		pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
+		u64 *out_count)
+{
+	struct folio_batch fbatch;
+	pgoff_t index = 0;
+	u64 count = 0;
+	int err = 0;
+
+	folio_batch_init(&fbatch);
+	while (index < end_index) {
+		unsigned int nr, i;
+
+		nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
+		if (nr == 0)
+			break;
+
+		for (i = 0; i < nr; i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			if (folios_ser) {
+				if (folio_test_hwpoison(folio)) {
+					err = -EHWPOISON;
+					folio_batch_release(&fbatch);
+					goto out;
+				}
+				err = kho_preserve_folio(folio);
+				if (err) {
+					folio_batch_release(&fbatch);
+					goto out;
+				}
+
+				folios_ser[count].pfn = folio_pfn(folio);
+				folios_ser[count].index = folio->index;
+				folios_ser[count].flags = folio_test_uptodate(folio) ?
+							  GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
+			}
+			count++;
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+out:
+	*out_count = count;
+	return err;
+}
+
+static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct gmem_file *gmem_file = file->private_data;
+	struct kvm *kvm = gmem_file->kvm;
+
+	if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
+		return 0;
+
+	if (kvm_arch_has_private_mem(kvm))
+		return 0;
+
+	if (mapping_large_folio_support(inode->i_mapping))
+		return 0;
+
+	return 1;
+}
+
+static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
+{
+	struct guest_memfd_luo_folio_ser *folios_ser = NULL;
+	u64 count = 0, gmem_flags, abi_flags = 0;
+	struct guest_memfd_luo_ser *ser;
+	struct address_space *mapping;
+	struct gmem_file *gmem_file;
+	struct inode *inode;
+	pgoff_t end_index;
+	struct kvm *kvm;
+	int err = 0;
+	long size;
+
+	inode = file_inode(args->file);
+	kvm_gmem_freeze(inode, true);
+
+	mapping = inode->i_mapping;
+	size = i_size_read(inode);
+	if (!size) {
+		err = -EINVAL;
+		goto err_unfreeze_inode;
+	}
+
+	if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
+		err = -EINVAL;
+		goto err_unfreeze_inode;
+	}
+
+	gmem_file = args->file->private_data;
+	kvm = gmem_file->kvm;
+
+	gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
+	if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED
+				| GUEST_MEMFD_F_MAPPING_FROZEN)) {
+		err = -EOPNOTSUPP;
+		goto err_unfreeze_inode;
+	}
+
+	if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
+		abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
+	if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
+		abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
+
+	end_index = size >> PAGE_SHIFT;
+
+	ser = kho_alloc_preserve(sizeof(*ser));
+	if (IS_ERR(ser)) {
+		err = PTR_ERR(ser);
+		goto err_unfreeze_inode;
+	}
+
+	/* First pass: Count the folios present in the page cache */
+	err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
+	if (err)
+		goto err_free_ser;
+
+	ser->size = size;
+	ser->flags = abi_flags;
+	ser->nr_folios = count;
+	ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()
+
+	if (count > 0) {
+		folios_ser = vcalloc(count, sizeof(*folios_ser));
+		if (!folios_ser) {
+			err = -ENOMEM;
+			goto err_free_ser;
+		}
+
+		/* Second pass: Fill the metadata array and preserve folios */
+		err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);
+		if (err)
+			goto err_unpreserve_unlocked;
+
+		if (WARN_ON_ONCE(count != ser->nr_folios)) {
+			err = -EINVAL;
+			goto err_unpreserve_unlocked;
+		}
+	}
+
+	if (count > 0) {
+		err = kho_preserve_vmalloc(folios_ser, &ser->folios);
+		if (err)
+			goto err_unpreserve_unlocked;
+	}
+
+	args->serialized_data = virt_to_phys(ser);
+	args->private_data = folios_ser;
+
+	return 0;
+
+err_unpreserve_unlocked:
+	for (long i = (long)count - 1; i >= 0; i--) {
+		struct folio *folio = pfn_folio(folios_ser[i].pfn);
+
+		kho_unpreserve_folio(folio);
+	}
+	vfree(folios_ser);
+err_free_ser:
+	kho_unpreserve_free(ser);
+err_unfreeze_inode:
+	kvm_gmem_freeze(inode, false);
+	return err;
+}
+
+static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
+{
+	struct guest_memfd_luo_ser *ser;
+	struct gmem_file *gmem_file;
+	struct kvm *kvm;
+	struct file *kvm_file;
+	u64 vm_token;
+	int err;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return -EINVAL;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	gmem_file = args->file->private_data;
+	kvm = gmem_file->kvm;
+
+	/*
+	 * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
+	 * file memory from being reallocated while it is being processed.
+	 */
+	kvm_file = get_file_active(&kvm->vm_file);
+	if (!kvm_file)
+		return -ENOENT;
+
+	err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
+	fput(kvm_file);
+	if (err)
+		return err;
+
+	ser->vm_token = vm_token;
+	return 0;
+}
+
+static void kvm_gmem_luo_discard_folios(
+	const struct guest_memfd_luo_folio_ser *folios_ser,
+	u64 nr_folios, u64 start_idx)
+{
+	long i;
+
+	for (i = start_idx; i < nr_folios; i++) {
+		struct folio *folio;
+		phys_addr_t phys;
+
+		if (!folios_ser[i].pfn)
+			continue;
+
+		phys = PFN_PHYS(folios_ser[i].pfn);
+		folio = kho_restore_folio(phys);
+		if (folio)
+			folio_put(folio);
+	}
+}
+
+static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
+	struct guest_memfd_luo_ser *ser;
+	long i;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return;
+
+	if (ser->nr_folios > 0)
+		kho_unpreserve_vmalloc(&ser->folios);
+	for (i = ser->nr_folios - 1; i >= 0; i--) {
+		struct folio *folio;
+
+		if (!folios_ser[i].pfn)
+			continue;
+
+		folio = pfn_folio(folios_ser[i].pfn);
+		kho_unpreserve_folio(folio);
+	}
+	vfree(folios_ser);
+
+	kho_unpreserve_free(ser);
+	kvm_gmem_freeze(file_inode(args->file), false);
+}
+
+static int kvm_gmem_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+	struct guest_memfd_luo_folio_ser *folios_ser = NULL;
+	struct guest_memfd_luo_ser *ser;
+	struct kvm *kvm = NULL;
+	struct file *vm_file;
+	struct inode *inode;
+	struct file *file;
+	u64 gmem_flags = 0;
+	int err = 0;
+	long i = 0;
+
+	if (!args->serialized_data)
+		return -EINVAL;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	if (ser->flags & ~GUEST_MEMFD_LUO_SUPPORTED_FLAGS) {
+		err = -EOPNOTSUPP;
+		goto err_free_ser;
+	}
+
+	if (ser->flags & GUEST_MEMFD_LUO_FLAG_MMAP)
+		gmem_flags |= GUEST_MEMFD_FLAG_MMAP;
+	if (ser->flags & GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
+		gmem_flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
+
+	err = liveupdate_get_file_incoming(args->session, ser->vm_token, &vm_file);
+	if (err) {
+		pr_warn("gmem: provided VM FD token (%llx) on preserve is incorrect\n",
+						ser->vm_token);
+		goto err_free_ser;
+	}
+
+	if (file_is_kvm(vm_file))
+		kvm = vm_file->private_data;
+
+	/*
+	 * Release the temporary reference taken by the liveupdate_get_file_incoming
+	 * call. LUO still holds a reference.
+	 */
+	fput(vm_file);
+
+	if (!kvm) {
+		err = -EINVAL;
+		goto err_free_ser;
+	}
+
+	file = __kvm_gmem_create_file(kvm, ser->size, gmem_flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_free_ser;
+	}
+
+	inode = file_inode(file);
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (!folios_ser) {
+			err = -EINVAL;
+			goto err_destroy_file;
+		}
+
+		for (i = 0; i < ser->nr_folios; i++) {
+			struct folio *folio;
+			phys_addr_t phys;
+
+			if (!folios_ser[i].pfn)
+				continue;
+
+			phys = PFN_PHYS(folios_ser[i].pfn);
+			folio = kho_restore_folio(phys);
+			if (!folio) {
+				pr_err("gmem: failed to restore folio at %llx\n", phys);
+				err = -EIO;
+				goto err_put_remaining_folios;
+			}
+
+			err = filemap_add_folio(inode->i_mapping, folio, folios_ser[i].index,
+						GFP_KERNEL);
+			if (err) {
+				pr_err("gmem: failed to add folio to page cache\n");
+				folio_put(folio);
+				goto err_put_remaining_folios;
+			}
+
+			if (folios_ser[i].flags & GUEST_MEMFD_LUO_FOLIO_UPTODATE)
+				folio_mark_uptodate(folio);
+			folio_unlock(folio);
+			folio_put(folio);
+		}
+		vfree(folios_ser);
+	}
+
+	args->file = file;
+	kho_restore_free(ser);
+	return 0;
+
+err_put_remaining_folios:
+	i++;
+err_destroy_file:
+	fput(file);
+err_free_ser:
+	if (ser->nr_folios) {
+		if (!folios_ser)
+			folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (folios_ser) {
+			kvm_gmem_luo_discard_folios(folios_ser, ser->nr_folios, i);
+			vfree(folios_ser);
+		}
+	}
+	kho_restore_free(ser);
+	return err;
+}
+
+static void kvm_gmem_luo_finish(struct liveupdate_file_op_args *args)
+{
+	struct guest_memfd_luo_ser *ser;
+	struct guest_memfd_luo_folio_ser *folios_ser;
+
+	/* Nothing to be done here, if retrieve_status was successful or errored,
+	 * Cleanup is taken care of in retrieval call.
+	 */
+	if (args->retrieve_status)
+		return;
+
+	if (!args->serialized_data)
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return;
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (folios_ser) {
+			kvm_gmem_luo_discard_folios(folios_ser, ser->nr_folios, 0);
+			vfree(folios_ser);
+		}
+	}
+
+	kho_restore_free(ser);
+}
+
+static const struct liveupdate_file_ops kvm_gmem_luo_file_ops = {
+	.can_preserve = kvm_gmem_luo_can_preserve,
+	.preserve = kvm_gmem_luo_preserve,
+	.freeze = kvm_gmem_luo_freeze,
+	.retrieve = kvm_gmem_luo_retrieve,
+	.unpreserve = kvm_gmem_luo_unpreserve,
+	.finish = kvm_gmem_luo_finish,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler kvm_gmem_luo_handler = {
+	.ops = &kvm_gmem_luo_file_ops,
+	.compatible = GUEST_MEMFD_LUO_FH_COMPATIBLE,
+};
+
+int kvm_gmem_luo_init(void)
+{
+	int err = liveupdate_register_file_handler(&kvm_gmem_luo_handler);
+
+	if (err && err != -EOPNOTSUPP) {
+		pr_err("Could not register luo filesystem handler: %pe\n", ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
+
+void kvm_gmem_luo_exit(void)
+{
+	liveupdate_unregister_file_handler(&kvm_gmem_luo_handler);
+}
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c70346906a89..501a5d048418 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6580,6 +6580,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (r)
 		goto err_luo;
 
+	r = kvm_gmem_luo_init();
+	if (r)
+		goto err_gmem_luo;
+
 	/*
 	 * Registration _must_ be the very last thing done, as this exposes
 	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6593,6 +6597,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	return 0;
 
 err_register:
+	kvm_gmem_luo_exit();
+err_gmem_luo:
 	kvm_luo_exit();
 err_luo:
 	kvm_uninit_virtualization();
@@ -6624,6 +6630,7 @@ void kvm_exit(void)
 	 */
 	misc_deregister(&kvm_dev);
 
+	kvm_gmem_luo_exit();
 	kvm_luo_exit();
 
 	kvm_uninit_virtualization();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 118edc47df83..d8ccb68e7e9b 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -100,9 +100,13 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 #ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
 int kvm_luo_init(void);
 void kvm_luo_exit(void);
+int kvm_gmem_luo_init(void);
+void kvm_gmem_luo_exit(void);
 #else
 static inline int kvm_luo_init(void) { return 0; }
 static inline void kvm_luo_exit(void) {}
+static inline int kvm_gmem_luo_init(void) { return 0; }
+static inline void kvm_gmem_luo_exit(void) {}
 #endif /* CONFIG_LIVEUPDATE_GUEST_MEMFD */
 
 #endif /* __KVM_MM_H__ */
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (6 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
  9 siblings, 0 replies; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Add the documentation under the "Preserving file descriptors" section
of LUO's documentation.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 Documentation/core-api/liveupdate.rst |   1 +
 Documentation/liveupdate/vmm.rst      | 103 ++++++++++++++++++++++++++
 MAINTAINERS                           |   1 +
 3 files changed, 105 insertions(+)
 create mode 100644 Documentation/liveupdate/vmm.rst

diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst
index 5a292d0f3706..bac58a363151 100644
--- a/Documentation/core-api/liveupdate.rst
+++ b/Documentation/core-api/liveupdate.rst
@@ -34,6 +34,7 @@ The following types of file descriptors can be preserved
    :maxdepth: 1
 
    ../mm/memfd_preservation
+   ../liveupdate/vmm
 
 Public API
 ==========
diff --git a/Documentation/liveupdate/vmm.rst b/Documentation/liveupdate/vmm.rst
new file mode 100644
index 000000000000..0cd487a0e1a6
--- /dev/null
+++ b/Documentation/liveupdate/vmm.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+=============================
+VM & Guest_Memfd Preservation
+=============================
+
+.. kernel-doc:: virt/kvm/kvm_luo.c
+   :doc: KVM VM Preservation via LUO
+
+.. kernel-doc:: virt/kvm/guest_memfd_luo.c
+   :doc: Guest_Memfd Preservation via LUO
+
+VMM Instructions
+================
+
+This section describes the requirements, scope, conditions, and
+ordering constraints that a Virtual Machine Monitor (VMM) must adhere
+to for successful preservation and retrieval of guest_memfd files
+across a Live Update Orchestrator (LUO) sequence.
+
+Scope and Limitations
+---------------------
+
+At this stage, the scope of guest_memfd preservation is restricted to:
+
+1. **Fully Shared guest_memfd**:
+   Currently, only fully shared guest_memfd is supported. Any system
+   that uses CoCo VMs (which use private guest_memfd) will not support
+   preservation.
+
+2. **Standard Page Size**:
+   Only guest_memfd backed by standard page size (``PAGE_SIZE``,
+   order-0) pages is supported. Large/huge page backing (e.g.,
+   hugetlb guest_memfd) is not supported.
+
+Any Virtual Machine (VM) whose memory is fully backed by such
+guest_memfd files can be preserved across live update.
+
+VMM Actions and Conditions during Live Update
+---------------------------------------------
+
+During the live update sequence, the kernel introduces a *freezing*
+phase for the guest_memfd inode. Freezing prevents any modifications to
+the guest_memfd page cache. Specifically, once a guest_memfd mapping is
+frozen:
+
+- Any subsequent ``fallocate`` calls on the guest_memfd file descriptor
+  will fail and return ``-EPERM``.
+- Any new page faults (guest-side or host-userspace-side) that require
+  folio allocation will fail and return ``-EPERM``.
+
+To prevent vCPUs or VMM helper threads from failing due to these
+``-EPERM`` errors, the VMM must implement one of the following
+strategies:
+
+1. **Pause the VM (Recommended)**:
+   The VMM should pause/suspend all vCPUs before invoking the
+   preservation or freezing of the VM and guest_memfd files. This
+   ensures no new page faults or memory accesses can occur while the
+   guest_memfd is frozen.
+
+2. **Handle Fault Failures**:
+   If the VM is not paused, the VMM must be prepared to handle VM
+   exits or user page fault errors resulting from the ``-EPERM``
+   failures. The VMM must take appropriate action, such as
+   immediately pausing the VM, or aborting the live update sequence
+   (by tearing down or unpreserving the live update session).
+
+Preservation and Retrieval Ordering
+-----------------------------------
+
+Preservation Order
+~~~~~~~~~~~~~~~~~~
+
+There is no strict ordering requirement for initiating the
+preservation of the KVM VM file and the guest_memfd files; they are
+preserved independently. However, if kexec is triggered with a
+guest_memfd preserved but without its VM file preserved, kexec will fail.
+
+Retrieval Order
+~~~~~~~~~~~~~~~
+
+Similarly, there is no strict ordering required for retrieving the VM
+and guest_memfd files. The files can be retrieved in any order.
+
+If the guest_memfd file is retrieved but the VM file is not, and
+luo_finish is called, then the VM file will be lost and the guest_memfd
+file will be left hanging around.
+
+VM & Guest_Memfd Preservation ABI
+=================================
+
+.. kernel-doc:: include/linux/kho/abi/kvm.h
+   :doc: DOC: guest_memfd Live Update ABI
+
+.. kernel-doc:: include/linux/kho/abi/kvm.h
+   :internal:
+
+See Also
+========
+
+- :doc:`/core-api/liveupdate`
+- :doc:`/userspace-api/liveupdate`
diff --git a/MAINTAINERS b/MAINTAINERS
index 16cba790a84d..ca459d032712 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14418,6 +14418,7 @@ L:	kexec@lists.infradead.org
 L:	kvm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F:	Documentation/liveupdate/vmm.rst
 F:	virt/kvm/guest_memfd_luo.c
 F:	virt/kvm/kvm_luo.c
 
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (7 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
  9 siblings, 0 replies; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Refactor `____vm_create()` in the KVM selftest library to extract its
initialization steps into separate, reusable internal helpers.

Introduce `vm_init_fields()` and `vm_init_memory_properties()`. This
allows advanced test setups to perform targeted VM fields or memory
property initializations independently, which is required by upcoming
test cases that restore preserved VMs. No functional changes are
introduced for the existing tests.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 .../testing/selftests/kvm/include/kvm_util.h  |  2 ++
 tools/testing/selftests/kvm/lib/kvm_util.c    | 26 +++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 2ecaaa0e9965..d10cd25d0658 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -471,6 +471,8 @@ const char *vm_guest_mode_string(u32 i);
 
 void kvm_vm_free(struct kvm_vm *vmp);
 void kvm_vm_restart(struct kvm_vm *vmp);
+void vm_init_fields(struct kvm_vm *vm, struct vm_shape shape);
+void vm_init_memory_properties(struct kvm_vm *vm);
 void kvm_vm_release(struct kvm_vm *vmp);
 void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename);
 int kvm_memfd_alloc(size_t size, bool hugepages);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index e08967ef7b7b..d3e6508e9863 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -276,13 +276,8 @@ __weak void vm_populate_gva_bitmap(struct kvm_vm *vm)
 		(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
 }
 
-struct kvm_vm *____vm_create(struct vm_shape shape)
+void vm_init_fields(struct kvm_vm *vm, struct vm_shape shape)
 {
-	struct kvm_vm *vm;
-
-	vm = calloc(1, sizeof(*vm));
-	TEST_ASSERT(vm != NULL, "Insufficient Memory");
-
 	INIT_LIST_HEAD(&vm->vcpus);
 	vm->regions.gpa_tree = RB_ROOT;
 	vm->regions.hva_tree = RB_ROOT;
@@ -380,9 +375,10 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 	if (vm->pa_bits != 40)
 		vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
 #endif
+}
 
-	vm_open(vm);
-
+void vm_init_memory_properties(struct kvm_vm *vm)
+{
 	/* Limit to VA-bit canonical virtual addresses. */
 	vm->vpages_valid = sparsebit_alloc();
 	vm_populate_gva_bitmap(vm);
@@ -392,6 +388,20 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 
 	/* Allocate and setup memory for guest. */
 	vm->vpages_mapped = sparsebit_alloc();
+}
+
+struct kvm_vm *____vm_create(struct vm_shape shape)
+{
+	struct kvm_vm *vm;
+
+	vm = calloc(1, sizeof(*vm));
+	TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+	vm_init_fields(vm, shape);
+
+	vm_open(vm);
+
+	vm_init_memory_properties(vm);
 
 	return vm;
 }
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
       [not found] <cover.1780676742.git.tarunsahu@google.com>
                   ` (8 preceding siblings ...)
  2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
  2026-06-22 23:01   ` Ackerley Tng
  9 siblings, 1 reply; 23+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Add a new KVM selftest `guest_memfd_preservation_test` to verify that
guest memory backed by guest_memfd is preserved properly.

The test leverages the Live Update Orchestrator (LUO) infrastructure
to validate that memory folios and configuration layouts are
successfully saved and then restored during kernel live updates,
preventing any memory loss for the guest.

The test uses the KVM selftests framework to create a new VM and map
two memory slots to it: one holds the code that is executed inside the
VM, and the other is the guest_memfd whose memory is written by the
guest code.

In Phase 1: Once the data is written, the VM exits and the test waits
for the user to trigger the kexec.

In Phase 2: A new VM is created from the retrieved VM file descriptor
and again two memory slots are assigned: one for the guest code, and
another for the retrieved guest_memfd, whose memory is verified by the
executed guest code. If verification succeeds, the test passes.

Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
 MAINTAINERS                                   |   1 +
 tools/testing/selftests/kvm/Makefile.kvm      |   6 +-
 .../kvm/guest_memfd_preservation_test.c       | 230 ++++++++++++++++++
 3 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/kvm/guest_memfd_preservation_test.c

diff --git a/MAINTAINERS b/MAINTAINERS
index ca459d032712..76e59620d2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14419,6 +14419,7 @@ L:	kvm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
 F:	Documentation/liveupdate/vmm.rst
+F:	tools/testing/selftests/kvm/guest_memfd_preservation_test.c
 F:	virt/kvm/guest_memfd_luo.c
 F:	virt/kvm/kvm_luo.c
 
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 9118a5a51b89..68584d4ee1b0 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -161,6 +161,8 @@ TEST_GEN_PROGS_x86 += pre_fault_memory_test
 
 # Compiled outputs used by test targets
 TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test
+# Manual test that forks a persistent background daemon; skip auto CI run
+TEST_GEN_PROGS_EXTENDED_x86 += guest_memfd_preservation_test
 
 TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON)
 TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs
@@ -254,6 +256,7 @@ OVERRIDE_TARGETS = 1
 # which causes the environment variable to override the makefile).
 include ../lib.mk
 include ../cgroup/lib/libcgroup.mk
+include ../liveupdate/lib/libliveupdate.mk
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
@@ -308,7 +311,8 @@ LIBKVM_S := $(filter %.S,$(LIBKVM))
 LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O)
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) \
+						$(LIBCGROUP_O) $(LIBLIVEUPDATE_O)
 SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
 SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS))
 
diff --git a/tools/testing/selftests/kvm/guest_memfd_preservation_test.c b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
new file mode 100644
index 000000000000..74f90c5c4bf5
--- /dev/null
+++ b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026, Google LLC.
+ *
+ * Author: Tarun Sahu <tarunsahu@google.com>
+ *
+ * Test for VM and guest_memfd preservation across kexec (Live Update) via LUO.
+ *
+ * NOTE: This is a MANUAL test and is excluded from automated CI/testing
+ * frameworks because Phase 1 daemonizes into the background to pin resources
+ * and requires a human operator to manually trigger kexec before Phase 2
+ * is executed. Running Phase 1 automatically would leak the background daemon
+ * and cause CI runners to falsely interpret it as a passed test.
+ *
+ * Usage:
+ * Phase 1: ./guest_memfd_preservation_test
+ * Phase 2: ./guest_memfd_preservation_test --phase2
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/sizes.h>
+#include <linux/falloc.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "ucall_common.h"
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+#include <libliveupdate.h>
+
+#define SESSION_NAME "gmem_vm_preservation_session"
+#define VM_TOKEN 0x1001
+#define GMEM_TOKEN 0x1002
+
+#define GMEM_SIZE (16ULL * 1024 * 1024)
+#define DATA_SIZE (5ULL * 1024 * 1024)
+
+static size_t page_size;
+
+/* Deterministic byte pattern generation based on offset */
+static inline uint8_t get_pattern_byte(size_t offset)
+{
+	return (uint8_t)(offset ^ 0x5A);
+}
+
+static void guest_code_phase1(uint64_t gpa, uint64_t size, uint64_t data_size)
+{
+	uint8_t *mem = (uint8_t *)gpa;
+	size_t i;
+
+	for (i = 0; i < data_size; i++)
+		mem[i] = get_pattern_byte(i);
+
+	GUEST_DONE();
+}
+
+static void guest_code_phase2(uint64_t gpa, uint64_t size, uint64_t data_size)
+{
+	uint8_t *mem = (uint8_t *)gpa;
+	size_t i;
+
+	for (i = 0; i < data_size; i++) {
+		uint8_t val = get_pattern_byte(i);
+
+		__GUEST_ASSERT(mem[i] == val,
+			       "Data mismatch at offset %lu! Expected 0x%x, got 0x%x",
+			       i, val, mem[i]);
+	}
+
+	GUEST_DONE();
+}
+
+static void do_phase1(void)
+{
+	uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+	int gmem_fd, dev_luo_fd, session_fd, ret;
+	const uint64_t gpa = SZ_4G;
+	struct kvm_vcpu *vcpu;
+	const int slot = 1;
+	struct kvm_vm *vm;
+
+	vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1,
+					guest_code_phase1);
+	gmem_fd = vm_create_guest_memfd(vm, GMEM_SIZE, flags);
+	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
+				 gmem_fd, 0);
+
+	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
+		virt_pg_map(vm, gpa + i, gpa + i);
+
+	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+	dev_luo_fd = luo_open_device();
+	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
+
+	session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
+	TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
+
+	ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
+	TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
+
+	ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
+	TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
+
+	printf("\n============================================================\n");
+	printf("Phase 1 Complete Successfully!\n");
+	printf("VM file and guest_memfd file have been preserved via LUO.\n");
+	printf("Tokens: VM_TOKEN=0x%x, GMEM_TOKEN=0x%x\n", VM_TOKEN, GMEM_TOKEN);
+	printf("Machine Size: %llu MB, Data Size: %llu MB\n", GMEM_SIZE / SZ_1M,
+				 DATA_SIZE / SZ_1M);
+	printf("------------------------------------------------------------\n");
+
+	daemonize_and_wait();
+}
+
+static struct kvm_vm *vm_create_from_fd(int resurrected_vm_fd,
+					struct vm_shape shape)
+{
+	struct kvm_vm *vm;
+
+	vm = calloc(1, sizeof(*vm));
+	TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+	vm_init_fields(vm, shape);
+
+	vm->kvm_fd = open_path_or_exit(KVM_DEV_PATH, O_RDWR);
+	vm->fd = resurrected_vm_fd;
+
+	if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
+		vm->stats.fd = vm_get_stats_fd(vm);
+	else
+		vm->stats.fd = -1;
+
+	vm_init_memory_properties(vm);
+
+	return vm;
+}
+
+static void do_phase2(void)
+{
+	int retrieved_vm_fd, retrieved_gmem_fd, dev_luo_fd, session_fd;
+	struct vm_shape shape = VM_SHAPE_DEFAULT;
+	const uint64_t gpa = SZ_4G;
+	struct kvm_vcpu *vcpu;
+	const int slot = 1;
+	struct kvm_vm *vm;
+
+	dev_luo_fd = luo_open_device();
+	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
+
+	session_fd = luo_retrieve_session(dev_luo_fd, SESSION_NAME);
+	TEST_ASSERT(session_fd >= 0, "Failed to retrieve LUO session");
+
+	retrieved_vm_fd = luo_session_retrieve_fd(session_fd, VM_TOKEN);
+	TEST_ASSERT(retrieved_vm_fd >= 0, "Failed to retrieve VM file descriptor");
+
+	retrieved_gmem_fd = luo_session_retrieve_fd(session_fd, GMEM_TOKEN);
+	TEST_ASSERT(retrieved_gmem_fd >= 0, "Failed to retrieve guest_memfd file descriptor");
+
+	vm = vm_create_from_fd(retrieved_vm_fd, shape);
+
+	u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+	kvm_vm_elf_load(vm, program_invocation_name);
+
+	for (int i = 0; i < NR_MEM_REGIONS; i++)
+		vm->memslots[i] = 0;
+
+	struct userspace_mem_region *slot0 = memslot2region(vm, 0);
+
+	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
+
+	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
+				   retrieved_gmem_fd, 0);
+
+	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
+		virt_pg_map(vm, gpa + i, gpa + i);
+
+	vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
+	kvm_arch_vm_finalize_vcpus(vm);
+
+	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
+
+	printf("Resuming / Running VM in Phase 2...\n");
+	vcpu_run(vcpu);
+	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+	printf("\nSUCCESS: Phase 2 Complete! All 5MB complex data verified intact!\n");
+
+	luo_session_finish(session_fd);
+	close(session_fd);
+	close(dev_luo_fd);
+	/* This will also close the vm_fd */
+	kvm_vm_free(vm);
+	close(retrieved_gmem_fd);
+}
+
+int main(int argc, char *argv[])
+{
+	bool phase2 = false;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
+	page_size = getpagesize();
+
+	for (int i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "--phase2") == 0)
+			phase2 = true;
+	}
+
+	if (phase2)
+		do_phase2();
+	else
+		do_phase1();
+
+	return 0;
+}
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
  2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
@ 2026-06-07  0:35   ` tarunsahu
  0 siblings, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-07  0:35 UTC (permalink / raw)
  To: Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin, Shuah Khan,
	sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm


Hi,

I am sorry for the incorrect layout of this series. After copy-pasting,
I forgot to change the message ID in the header, so the cover letter is
not attached to the same thread.

Please find the V2 cover letter here:
https://lore.kernel.org/all/cover.1780667929.git.tarunsahu@google.com/

Also for latest discussion related to scope:
https://lore.kernel.org/all/9huzldcrxkch.fsf@tarunix.c.googlers.com/

Thank you

Tarun Sahu <tarunsahu@google.com> writes:

> From: Pasha Tatashin <pasha.tatashin@soleen.com>
>
> The core liveupdate mechanism allows userspace to preserve file
> descriptors. However, kernel subsystems often manage struct file
> objects directly and need to participate in the preservation process
> programmatically without relying solely on userspace interaction.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
>  include/linux/liveupdate.h       | 21 ++++++++++
>  kernel/liveupdate/luo_file.c     | 69 ++++++++++++++++++++++++++++++++
>  kernel/liveupdate/luo_internal.h | 17 ++++++++
>  3 files changed, 107 insertions(+)
>
> diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
> index 30c5a39ff9e9..de052438eaac 100644
> --- a/include/linux/liveupdate.h
> +++ b/include/linux/liveupdate.h
> @@ -24,6 +24,7 @@ struct file;
>  /**
>   * struct liveupdate_file_op_args - Arguments for file operation callbacks.
>   * @handler:          The file handler being called.
> + * @session:          The session this file belongs to.
>   * @retrieve_status:  The retrieve status for the 'can_finish / finish'
>   *                    operation. A value of 0 means the retrieve has not been
>   *                    attempted, a positive value means the retrieve was
> @@ -44,6 +45,7 @@ struct file;
>   */
>  struct liveupdate_file_op_args {
>  	struct liveupdate_file_handler *handler;
> +	struct liveupdate_session *session;
>  	int retrieve_status;
>  	struct file *file;
>  	u64 serialized_data;
> @@ -240,6 +242,13 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
>  
>  int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
>  int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
> +/* kernel can internally retrieve files */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> +				 struct file **filep);
> +
> +/* Get a token for an outgoing file, or -ENOENT if file is not preserved */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> +				  struct file *file, u64 *tokenp);
>  
>  #else /* CONFIG_LIVEUPDATE */
>  
> @@ -285,5 +294,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb,
>  	return -EOPNOTSUPP;
>  }
>  
> +static inline int liveupdate_get_file_incoming(struct liveupdate_session *s,
> +					       u64 token, struct file **filep)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> +						struct file *file, u64 *tokenp)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
>  #endif /* CONFIG_LIVEUPDATE */
>  #endif /* _LINUX_LIVEUPDATE_H */
> diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
> index a0a419085e28..0aa0b4e5339f 100644
> --- a/kernel/liveupdate/luo_file.c
> +++ b/kernel/liveupdate/luo_file.c
> @@ -323,6 +323,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
>  	mutex_init(&luo_file->mutex);
>  
>  	args.handler = fh;
> +	args.session = luo_session_from_file_set(file_set);
>  	args.file = file;
>  	err = fh->ops->preserve(&args);
>  	if (err)
> @@ -380,6 +381,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
>  					   struct luo_file, list);
>  
>  		args.handler = luo_file->fh;
> +		args.session = luo_session_from_file_set(file_set);
>  		args.file = luo_file->file;
>  		args.serialized_data = luo_file->serialized_data;
>  		args.private_data = luo_file->private_data;
> @@ -411,6 +413,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
>  		struct liveupdate_file_op_args args = {0};
>  
>  		args.handler = luo_file->fh;
> +		args.session = luo_session_from_file_set(file_set);
>  		args.file = luo_file->file;
>  		args.serialized_data = luo_file->serialized_data;
>  		args.private_data = luo_file->private_data;
> @@ -432,6 +435,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
>  		struct liveupdate_file_op_args args = {0};
>  
>  		args.handler = luo_file->fh;
> +		args.session = luo_session_from_file_set(file_set);
>  		args.file = luo_file->file;
>  		args.serialized_data = luo_file->serialized_data;
>  		args.private_data = luo_file->private_data;
> @@ -621,6 +625,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
>  	}
>  
>  	args.handler = luo_file->fh;
> +	args.session = luo_session_from_file_set(file_set);
>  	args.serialized_data = luo_file->serialized_data;
>  	err = luo_file->fh->ops->retrieve(&args);
>  	if (err) {
> @@ -654,6 +659,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
>  		struct liveupdate_file_op_args args = {0};
>  
>  		args.handler = luo_file->fh;
> +		args.session = luo_session_from_file_set(file_set);
>  		args.file = luo_file->file;
>  		args.serialized_data = luo_file->serialized_data;
>  		args.retrieve_status = luo_file->retrieve_status;
> @@ -671,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
>  	guard(mutex)(&luo_file->mutex);
>  
>  	args.handler = luo_file->fh;
> +	args.session = luo_session_from_file_set(file_set);
>  	args.file = luo_file->file;
>  	args.serialized_data = luo_file->serialized_data;
>  	args.retrieve_status = luo_file->retrieve_status;
> @@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
>  	luo_flb_unregister_all(fh);
>  	list_del(&ACCESS_PRIVATE(fh, list));
>  }
> +EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
> +
> +/**
> + * liveupdate_get_token_outgoing - Get the token for a preserved file.
> + * @s:      The outgoing liveupdate session.
> + * @file:   The file object to search for.
> + * @tokenp: Output parameter for the found token.
> + *
> + * Searches the list of preserved files in an outgoing session for a matching
> + * file object. If found, the corresponding user-provided token is returned.
> + *
> + * This function is intended for in-kernel callers that need to correlate a
> + * file with its liveupdate token.
> + *
> + * Context: It must be called with session mutex acquired.
> + * Return: 0 on success, -ENOENT if the file is not preserved in this session.
> + */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> +				  struct file *file, u64 *tokenp)
> +{
> +	struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
> +	struct luo_file *luo_file;
> +	int err = -ENOENT;
> +
> +	list_for_each_entry(luo_file, &file_set->files_list, list) {
> +		if (luo_file->file == file) {
> +			if (tokenp)
> +				*tokenp = luo_file->token;
> +			err = 0;
> +			break;
> +		}
> +	}
> +
> +	return err;
> +}
> +
> +/**
> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
> + * @s:      The incoming liveupdate session (restored from the previous kernel).
> + * @token:  The unique token identifying the file to retrieve.
> + * @filep:  On success, this will be populated with a pointer to the retrieved
> + *          'struct file'.
> + *
> + * Provides a kernel-internal API for other subsystems to retrieve their
> + * preserved files after a live update. This function is a simple wrapper
> + * around luo_retrieve_file(), allowing callers to find a file by its token.
> + *
> + * The caller receives a new reference to the file and must call fput() when it
> + * is no longer needed. The file's lifetime is managed by LUO and any userspace
> + * file descriptors. If the caller needs to hold a reference to the file beyond
> + * the immediate scope, it must call get_file() itself.
> + *
> + * Context: It must be called with session mutex acquired of a restored session.
> + * Return: 0 on success. Returns -ENOENT if no file with the matching token is
> + *         found, or any other negative errno on failure.
> + */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> +				 struct file **filep)
> +{
> +	return luo_retrieve_file(luo_file_set_from_session_locked(s),
> +				 token, filep);
> +}
> diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
> index 875844d7a41d..08b198802e7f 100644
> --- a/kernel/liveupdate/luo_internal.h
> +++ b/kernel/liveupdate/luo_internal.h
> @@ -79,6 +79,23 @@ struct luo_session {
>  
>  extern struct rw_semaphore luo_register_rwlock;
>  
> +static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set)
> +{
> +	struct luo_session *session;
> +
> +	session = container_of(file_set, struct luo_session, file_set);
> +
> +	return (struct liveupdate_session *)session;
> +}
> +
> +static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s)
> +{
> +	struct luo_session *session = (struct luo_session *)s;
> +
> +	lockdep_assert_held(&session->mutex);
> +	return &session->file_set;
> +}
> +
>  int luo_session_create(const char *name, struct file **filep);
>  int luo_session_retrieve(const char *name, struct file **filep);
>  int __init luo_session_setup_outgoing(void *fdt);
> -- 
> 2.54.0.1032.g2f8565e1d1-goog

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
  2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
@ 2026-06-22 23:01   ` Ackerley Tng
  0 siblings, 0 replies; 23+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:01 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Tarun Sahu <tarunsahu@google.com> writes:

> Add a new KVM selftest `guest_memfd_preservation_test` to verify that
> guest memory backed by guest_memfd is preserved properly.
>

Don't think using backticks in commit messages is a common practice but
I might be wrong here.

> The test leverages the Live Update Orchestrator (LUO) infrastructure
> to validate that memory folios and configuration layouts are
> successfully saved and then restored during kernel live updates,
> preventing any memory loss for the guest.
>
> Here, I have used the kvm selftests framework by creating a new
> vm and mapping two memory slots to it. One is the code that is executed
> inside the vm and other is the guest_memfd whose memory is being
> written by the guest code.
>

Don't think commit messages with "I" are common either

> In Phase 1: Once data is written the vm exits and wait for the user
> to trigger the kexec.
>
> In Phase 2: A new vm is created with retrieved kvm and again two
> memory slots are assigned. Once for guest code, and another is for
> retrieved guest_memfd where guest_memfd memory is verified by the
> executed guest code. If verification succeeds, The test passes.
>
>
> [...snip...]
>
> +#define SESSION_NAME "gmem_vm_preservation_session"
> +#define VM_TOKEN 0x1001
> +#define GMEM_TOKEN 0x1002
> +
> +#define GMEM_SIZE (16ULL * 1024 * 1024)
> +#define DATA_SIZE (5ULL * 1024 * 1024)
> +
> +static size_t page_size;
> +
> +/* Deterministic byte pattern generation based on offset */
> +static inline uint8_t get_pattern_byte(size_t offset)
> +{
> +	return (uint8_t)(offset ^ 0x5A);
> +}
> +
> +static void guest_code_phase1(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> +	uint8_t *mem = (uint8_t *)gpa;
> +	size_t i;
> +
> +	for (i = 0; i < data_size; i++)
> +		mem[i] = get_pattern_byte(i);
> +
> +	GUEST_DONE();
> +}
> +
> +static void guest_code_phase2(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> +	uint8_t *mem = (uint8_t *)gpa;
> +	size_t i;
> +
> +	for (i = 0; i < data_size; i++) {
> +		uint8_t val = get_pattern_byte(i);
> +
> +		__GUEST_ASSERT(mem[i] == val,
> +			       "Data mismatch at offset %lu! Expected 0x%x, got 0x%x",
> +			       i, val, mem[i]);
> +	}
> +
> +	GUEST_DONE();
> +}
> +
> +static void do_phase1(void)
> +{
> +	uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;

Is there a reason to set GUEST_MEMFD_FLAG_MMAP? We're not really
accessing that memory from the host in this test.

> +	int gmem_fd, dev_luo_fd, session_fd, ret;
> +	const uint64_t gpa = SZ_4G;
> +	struct kvm_vcpu *vcpu;
> +	const int slot = 1;
> +	struct kvm_vm *vm;
> +
> +	vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1,
> +					guest_code_phase1);
> +	gmem_fd = vm_create_guest_memfd(vm, GMEM_SIZE, flags);
> +	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> +				 gmem_fd, 0);
> +
> +	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> +		virt_pg_map(vm, gpa + i, gpa + i);
> +
> +	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);

If GMEM_SIZE and DATA_SIZE are static, I think we don't have to pass them
via vcpu_args_set(); they can be used as macros from within the guest.

> +
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> +	dev_luo_fd = luo_open_device();
> +	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> +	session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
> +	TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
> +
> +	ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
> +	TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
> +
> +	ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
> +	TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
> +

Thanks for showing how this works :)

> +	printf("\n============================================================\n");
> +	printf("Phase 1 Complete Successfully!\n");
> +	printf("VM file and guest_memfd file have been preserved via LUO.\n");
> +	printf("Tokens: VM_TOKEN=0x%x, GMEM_TOKEN=0x%x\n", VM_TOKEN, GMEM_TOKEN);
> +	printf("Machine Size: %llu MB, Data Size: %llu MB\n", GMEM_SIZE / SZ_1M,
> +				 DATA_SIZE / SZ_1M);
> +	printf("------------------------------------------------------------\n");
> +
> +	daemonize_and_wait();
> +}
> +
> +static struct kvm_vm *vm_create_from_fd(int resurrected_vm_fd,
> +					struct vm_shape shape)
> +{
> +	struct kvm_vm *vm;
> +
> +	vm = calloc(1, sizeof(*vm));
> +	TEST_ASSERT(vm != NULL, "Insufficient Memory");
> +
> +	vm_init_fields(vm, shape);

What would happen if the shape was changed between preserving and
restoring?

> +
> +	vm->kvm_fd = open_path_or_exit(KVM_DEV_PATH, O_RDWR);
> +	vm->fd = resurrected_vm_fd;
> +
> +	if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
> +		vm->stats.fd = vm_get_stats_fd(vm);
> +	else
> +		vm->stats.fd = -1;
> +
> +	vm_init_memory_properties(vm);
> +
> +	return vm;
> +}
> +

I think vm_create_from_fd() could be introduced in an earlier patch to
reduce the amount of new code in this patch. Also, I think it could
perhaps be moved to kvm_util.c, assuming that other tests will use it too.

> +static void do_phase2(void)
> +{
> +	int retrieved_vm_fd, retrieved_gmem_fd, dev_luo_fd, session_fd;
> +	struct vm_shape shape = VM_SHAPE_DEFAULT;
> +	const uint64_t gpa = SZ_4G;
> +	struct kvm_vcpu *vcpu;
> +	const int slot = 1;
> +	struct kvm_vm *vm;
> +
> +	dev_luo_fd = luo_open_device();
> +	TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> +	session_fd = luo_retrieve_session(dev_luo_fd, SESSION_NAME);
> +	TEST_ASSERT(session_fd >= 0, "Failed to retrieve LUO session");
> +
> +	retrieved_vm_fd = luo_session_retrieve_fd(session_fd, VM_TOKEN);
> +	TEST_ASSERT(retrieved_vm_fd >= 0, "Failed to retrieve VM file descriptor");
> +
> +	retrieved_gmem_fd = luo_session_retrieve_fd(session_fd, GMEM_TOKEN);
> +	TEST_ASSERT(retrieved_gmem_fd >= 0, "Failed to retrieve guest_memfd file descriptor");
> +
> +	vm = vm_create_from_fd(retrieved_vm_fd, shape);
> +
> +	u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
> +

I don't think declarations are usually mixed with regular code.

> +	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
> +	kvm_vm_elf_load(vm, program_invocation_name);
> +
> +	for (int i = 0; i < NR_MEM_REGIONS; i++)
> +		vm->memslots[i] = 0;
> +
> +	struct userspace_mem_region *slot0 = memslot2region(vm, 0);
> +
> +	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
> +
> +	vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> +				   retrieved_gmem_fd, 0);
> +
> +	for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> +		virt_pg_map(vm, gpa + i, gpa + i);
> +
> +	vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
> +	kvm_arch_vm_finalize_vcpus(vm);
> +
> +	vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
> +
> +	printf("Resuming / Running VM in Phase 2...\n");
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> +	printf("\nSUCCESS: Phase 2 Complete! All 5MB complex data verified intact!\n");
> +
> +	luo_session_finish(session_fd);
> +	close(session_fd);
> +	close(dev_luo_fd);
> +	/* This will also close the vm_fd */
> +	kvm_vm_free(vm);
> +	close(retrieved_gmem_fd);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	bool phase2 = false;
> +
> +	TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
> +	page_size = getpagesize();
> +
> +	for (int i = 1; i < argc; i++) {
> +		if (strcmp(argv[i], "--phase2") == 0)
> +			phase2 = true;
> +	}
> +

Maybe use getopt() here?

> +	if (phase2)
> +		do_phase2();
> +	else
> +		do_phase1();
> +
> +	return 0;
> +}
> --
> 2.54.0.1032.g2f8565e1d1-goog

I think we also need tests for trying to allocate while frozen, and
conversion while frozen, and trying to preserve while preservation is
not allowed.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
  2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
@ 2026-06-22 23:27   ` Ackerley Tng
  2026-06-23 15:26     ` tarunsahu
  0 siblings, 1 reply; 23+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:27 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Tarun Sahu <tarunsahu@google.com> writes:

> This patch sets up the basic infrastructure to preserve the guest_memfd.
> Currently this supports only fully shared guest_memfd and backed by
> PAGE_SIZE pages.
>
> It registers a new LUO file handler for guest_memfd files to serialize
> and deserialize guest memory. This allows preserving guest memory backed
> by guest_memfd across updates, ensuring that guest instances can be
> resumed seamlessly without losing their memory contents.
>
> Preservation is straight forward. It walks through the folios and
> serialize them.
>
> There is kvm_gmem_freeze call on preserve which freeze the guest_memfd
> inode. It avoids any changes to inode mapping with fallocate calls or
> any new fault allocation (fails) on or after preservation. No need to check
> this during the page fault as preservation is only supported for
> pre-faulted/pre-allocated guest_memfd.
>
> While retrieving the guest_memfd, it requires the struct kvm to create
> new guest_memfd. So it first get the vm_file from the same session using
> the token passed during the preservation. And use it to get
> vm_file->kvm.
>
> This change also update the MAINTAINERS list.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
>  MAINTAINERS                 |   1 +
>  include/linux/kho/abi/kvm.h |  79 +++++-
>  virt/kvm/Makefile.kvm       |   2 +-
>  virt/kvm/guest_memfd_luo.c  | 485 ++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c         |   7 +
>  virt/kvm/kvm_mm.h           |   4 +
>  6 files changed, 571 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/guest_memfd_luo.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9bfc3c1f6676..16cba790a84d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -14418,6 +14418,7 @@ L:	kexec@lists.infradead.org
>  L:	kvm@vger.kernel.org
>  S:	Maintained
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
> +F:	virt/kvm/guest_memfd_luo.c
>  F:	virt/kvm/kvm_luo.c
>
>  KVM PARAVIRT (KVM/paravirt)
> diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
> index 718db68a541a..42074d76e04a 100644
> --- a/include/linux/kho/abi/kvm.h
> +++ b/include/linux/kho/abi/kvm.h
> @@ -9,20 +9,23 @@
>  #define _LINUX_KHO_ABI_KVM_H
>
>  #include <linux/types.h>
> +#include <linux/bits.h>
>  #include <linux/kho/abi/kexec_handover.h>
>
>  /**
> - * DOC: KVM Live Update ABI
> + * DOC: KVM and guest_memfd Live Update ABI
>   *
> - * KVM uses the ABI defined below for preserving its state
> + * KVM and guest_memfd use the ABI defined below for preserving their states
>   * across a kexec reboot using the LUO.
>   *
> - * The state is serialized into a packed structure `struct kvm_luo_ser`
> - * which is handed over to the next kernel via the KHO mechanism.
> + * The state is serialized into packed structures (struct kvm_luo_ser and
> + * struct guest_memfd_luo_ser) which are handed over to the next kernel via
> + * the KHO mechanism.
>   *
> - * This interface is a contract. Any modification to the structure layout
> + * This interface is a contract. Any modification to the structure layouts
>   * constitutes a breaking change. Such changes require incrementing the
> - * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
> + * version number in the KVM_LUO_FH_COMPATIBLE or
> + * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
>   */
>
>  /**
> @@ -36,4 +39,68 @@ struct kvm_luo_ser {
>  /* The compatibility string for KVM VM file handler */
>  #define KVM_LUO_FH_COMPATIBLE	"kvm_vm_luo_v1"
>
> +/**
> + * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
> + * @pfn:   Page Frame Number of the folio.
> + * @index: Page offset of the folio within the file.
> + * @flags: State flags associated with the folio.
> + */
> +struct guest_memfd_luo_folio_ser {
> +	u64 pfn:52;
> +	u64 flags:12;
> +	u64 index;
> +} __packed;
> +
> +/**
> + * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
> + *
> + * This flag is per folio to check if the folio is uptodate.
> + */
> +#define GUEST_MEMFD_LUO_FOLIO_UPTODATE	BIT(0)
> +
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
> + *
> + * This flag indicates that the guest_memfd supports host-side mmap.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_MMAP		BIT(0)
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
> + *
> + * This flag indicates that the guest_memfd has been initialized as shared
> + * memory.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED	BIT(1)
> +
> +/**
> + * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
> + *
> + * A mask of all guest_memfd preservation flags supported by this version
> + * of the KVM LUO ABI.
> + */
> +#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS	(GUEST_MEMFD_LUO_FLAG_MMAP | \
> +						 GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
> +
> +/**
> + * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
> + * @size:      The size of the file in bytes.
> + * @flags:     File-level flags.
> + * @nr_folios: Number of folios in the folios array.
> + * @vm_token:  Token of the associated KVM VM instance.
> + * @folios:    KHO vmalloc descriptor pointing to the array of
> + *             struct guest_memfd_luo_folio_ser.
> + */
> +struct guest_memfd_luo_ser {
> +	u64 size;
> +	u64 flags;
> +	u64 nr_folios;
> +	u64 vm_token;
> +	struct kho_vmalloc folios;
> +} __packed;
> +
> +/* The compatibility string for GUEST_MEMFD file handler */
> +#define GUEST_MEMFD_LUO_FH_COMPATIBLE	"guest_memfd_luo_v1"
> +
>  #endif /* _LINUX_KHO_ABI_KVM_H */
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index c1a962159264..d30fca094c42 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
>  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
>  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
>  kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
> -kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
> +kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
> diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
> new file mode 100644
> index 000000000000..d466f889c9aa
> --- /dev/null
> +++ b/virt/kvm/guest_memfd_luo.c
> @@ -0,0 +1,485 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (c) 2026, Google LLC.
> + * Tarun Sahu <tarunsahu@google.com>
> + *
> + * Guestmemfd Preservation for Live Update Orchestrator (LUO)
> + */
> +
> +/**
> + * DOC: Guestmemfd Preservation via LUO
> + *
> + * Overview
> + * ========
> + *
> + * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
> + * reboot using the Live Update Orchestrator (LUO) file preservation. This
> + * allows userspace to preserve VM memory across kexec reboots.
> + *
> + * The preservation is not intended to be transparent. Only select properties
> + * of the guest_memfd are preserved, while others are reset to default.
> + *
> + * Preserved Properties
> + * ====================
> + *
> + * The following properties of guest_memfd are preserved across kexec:
> + *
> + * File Size
> + *   The size of the file is preserved.
> + *
> + * File Contents
> + *   All folios present in the page cache are preserved.
> + *
> + * File-level Flags
> + *   The file-level flags (such as MMAP support and INIT_SHARED default mapping)
> + *   are preserved.
> + *
> + * Non-Preserved Properties
> + * ========================
> + *
> + * NUMA Memory Policy
> + *   NUMA memory policies associated with the guest_memfd are not preserved.
> + */
> +#include <linux/liveupdate.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pagemap.h>
> +#include <linux/file.h>
> +#include <linux/err.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/magic.h>
> +#include <linux/kexec_handover.h>
> +#include <linux/kho/abi/kexec_handover.h>
> +#include <linux/kho/abi/kvm.h>
> +#include "guest_memfd.h"
> +
> +static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
> +		pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
> +		u64 *out_count)
> +{
> +	struct folio_batch fbatch;
> +	pgoff_t index = 0;
> +	u64 count = 0;
> +	int err = 0;
> +
> +	folio_batch_init(&fbatch);
> +	while (index < end_index) {
> +		unsigned int nr, i;
> +
> +		nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
> +		if (nr == 0)
> +			break;
> +
> +		for (i = 0; i < nr; i++) {
> +			struct folio *folio = fbatch.folios[i];
> +
> +			if (folios_ser) {
> +				if (folio_test_hwpoison(folio)) {
> +					err = -EHWPOISON;
> +					folio_batch_release(&fbatch);
> +					goto out;
> +				}
> +				err = kho_preserve_folio(folio);
> +				if (err) {
> +					folio_batch_release(&fbatch);
> +					goto out;
> +				}
> +
> +				folios_ser[count].pfn = folio_pfn(folio);
> +				folios_ser[count].index = folio->index;
> +				folios_ser[count].flags = folio_test_uptodate(folio) ?
> +							  GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
> +			}
> +			count++;
> +		}
> +		folio_batch_release(&fbatch);
> +		cond_resched();
> +	}
> +
> +out:
> +	*out_count = count;
> +	return err;
> +}
> +
> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct gmem_file *gmem_file = file->private_data;
> +	struct kvm *kvm = gmem_file->kvm;
> +
> +	if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
> +		return 0;
> +

How does .can_preserve decide to route to this function? If it already
routes here, wouldn't this inode definitely be a guest_memfd file?

> +	if (kvm_arch_has_private_mem(kvm))
> +		return 0;
> +
> +	if (mapping_large_folio_support(inode->i_mapping))
> +		return 0;
> +
> +	return 1;

Let's return true and false rather than relying on casting.

> +}
> +
> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_folio_ser *folios_ser = NULL;
> +	u64 count = 0, gmem_flags, abi_flags = 0;
> +	struct guest_memfd_luo_ser *ser;
> +	struct address_space *mapping;
> +	struct gmem_file *gmem_file;
> +	struct inode *inode;
> +	pgoff_t end_index;
> +	struct kvm *kvm;
> +	int err = 0;
> +	long size;
> +
> +	inode = file_inode(args->file);

I think to lock out all allocates, you'd have to take
filemap_invalidate_lock() before freezing.

> +	kvm_gmem_freeze(inode, true);
> +
> +	mapping = inode->i_mapping;
> +	size = i_size_read(inode);
> +	if (!size) {
> +		err = -EINVAL;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
> +		err = -EINVAL;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	gmem_file = args->file->private_data;
> +	kvm = gmem_file->kvm;
> +
> +	gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
> +	if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED

Why condition this on MMAP?

After conversions lands, we'd have to iterate to check that the entire
guest_memfd is shared offset-by-offset instead of checking for INIT_SHARED.

> +				| GUEST_MEMFD_F_MAPPING_FROZEN)) {

This would always be true since kvm_gmem_freeze() is done above.

> +		err = -EOPNOTSUPP;
> +		goto err_unfreeze_inode;
> +	}
> +
> +	if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
> +	if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
> +

Is it intentional to have a different set of flags that are actually
preserved? I think we should refactor out a function to transfer the
flags over.

> +	end_index = size >> PAGE_SHIFT;
> +
> +	ser = kho_alloc_preserve(sizeof(*ser));
> +	if (IS_ERR(ser)) {
> +		err = PTR_ERR(ser);
> +		goto err_unfreeze_inode;
> +	}
> +
> +	/* First pass: Count the folios present in the page cache */
> +	err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
> +	if (err)
> +		goto err_free_ser;
> +
> +	ser->size = size;
> +	ser->flags = abi_flags;
> +	ser->nr_folios = count;
> +	ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()

I don't think // is commonly used.

> +
> +	if (count > 0) {
> +		folios_ser = vcalloc(count, sizeof(*folios_ser));
> +		if (!folios_ser) {
> +			err = -ENOMEM;
> +			goto err_free_ser;
> +		}
> +
> +		/* Second pass: Fill the metadata array and preserve folios */
> +		err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);

I think it's clearer to just define 2 functions rather than using the
same function twice to do these different things. The comments on the
two passes can then be dropped.

> +		if (err)
> +			goto err_unpreserve_unlocked;
> +
> +		if (WARN_ON_ONCE(count != ser->nr_folios)) {
> +			err = -EINVAL;
> +			goto err_unpreserve_unlocked;
> +		}
> +	}
> +
> +	if (count > 0) {
> +		err = kho_preserve_vmalloc(folios_ser, &ser->folios);
> +		if (err)
> +			goto err_unpreserve_unlocked;
> +	}
> +
> +	args->serialized_data = virt_to_phys(ser);
> +	args->private_data = folios_ser;
> +
> +	return 0;
> +
> +err_unpreserve_unlocked:
> +	for (long i = (long)count - 1; i >= 0; i--) {

Not sure if it's common to define long i inline.

> +		struct folio *folio = pfn_folio(folios_ser[i].pfn);
> +
> +		kho_unpreserve_folio(folio);
> +	}
> +	vfree(folios_ser);
> +err_free_ser:
> +	kho_unpreserve_free(ser);
> +err_unfreeze_inode:
> +	kvm_gmem_freeze(inode, false);
> +	return err;
> +}
> +
> +static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_ser *ser;
> +	struct gmem_file *gmem_file;
> +	struct kvm *kvm;
> +	struct file *kvm_file;
> +	u64 vm_token;
> +	int err;
> +
> +	if (WARN_ON_ONCE(!args->serialized_data))
> +		return -EINVAL;
> +
> +	ser = phys_to_virt(args->serialized_data);
> +
> +	gmem_file = args->file->private_data;
> +	kvm = gmem_file->kvm;
> +
> +	/*
> +	 * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
> +	 * file memory from being reallocated while it is being processed.
> +	 */
> +	kvm_file = get_file_active(&kvm->vm_file);
> +	if (!kvm_file)
> +		return -ENOENT;
> +
> +	err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
> +	fput(kvm_file);
> +	if (err)
> +		return err;
> +
> +	ser->vm_token = vm_token;
> +	return 0;
> +}
> +
> +static void kvm_gmem_luo_discard_folios(
> +	const struct guest_memfd_luo_folio_ser *folios_ser,
> +	u64 nr_folios, u64 start_idx)
> +{
> +	long i;
> +
> +	for (i = start_idx; i < nr_folios; i++) {
> +		struct folio *folio;
> +		phys_addr_t phys;
> +
> +		if (!folios_ser[i].pfn)
> +			continue;
> +
> +		phys = PFN_PHYS(folios_ser[i].pfn);
> +		folio = kho_restore_folio(phys);
> +		if (folio)
> +			folio_put(folio);
> +	}
> +}
> +
> +static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
> +{
> +	struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
> +	struct guest_memfd_luo_ser *ser;
> +	long i;
> +
> +	if (WARN_ON_ONCE(!args->serialized_data))
> +		return;
> +
> +	ser = phys_to_virt(args->serialized_data);
> +	if (!ser)
> +		return;
> +
> +	if (ser->nr_folios > 0)
> +		kho_unpreserve_vmalloc(&ser->folios);
> +	for (i = ser->nr_folios - 1; i >= 0; i--) {
> +		struct folio *folio;
> +
> +		if (!folios_ser[i].pfn)

Is it possible for pfn to be 0 here? Perhaps this should be a
WARN_ON_ONCE().

> +			continue;
> +
> +		folio = pfn_folio(folios_ser[i].pfn);
> +		kho_unpreserve_folio(folio);
> +	}
> +	vfree(folios_ser);
> +
> +	kho_unpreserve_free(ser);
> +	kvm_gmem_freeze(file_inode(args->file), false);
> +}
> +
>
> [...snip...]
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
@ 2026-06-22 23:54   ` Ackerley Tng
  2026-06-23  0:09     ` Sean Christopherson
                       ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:54 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Tarun Sahu <tarunsahu@google.com> writes:

> This patch introduces the freeze on gmem_inode which prevents

Can't find the reference now, but commit messages should take the
imperative mood and avoid "this patch" [*]

[*] https://lore.kernel.org/all/YKRWNaqzo4GVDxHP@google.com/

> the fallocate call and any new page fault allocation. This will avoid
> gmem file modification when it is being preserved
>
> Used srcu lock to synchronise the freeze call, where write blocks
> until all the reads are free. And reads are re-entrant.
>
> Incase fault fails, It return -EPERM and VM_EXIT to userspace. userspace
> must handle this properly as every new fault will fail.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>
> [...snip...]
>
> @@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  	if (!IS_ERR(folio))
>  		return folio;
>
> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +	if (kvm_gmem_is_frozen(inode)) {
> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +		return ERR_PTR(-EPERM);
> +	}
> +
>  	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
>  	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
>  					 FGP_LOCK | FGP_CREAT,
>  					 mapping_gfp_mask(inode->i_mapping), policy);
>  	mpol_cond_put(policy);
>
> +	srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +
>  	/*
>  	 * External interfaces like kvm_gmem_get_pfn() support dealing
>  	 * with hugepages to a degree, but internally, guest_memfd currently
> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>  			       loff_t len)
>  {
> +	struct inode *inode = file_inode(file);
>  	int ret;
> +	int idx;
>
> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
> -		return -EOPNOTSUPP;
> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +	if (kvm_gmem_is_frozen(inode)) {
> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +		return -EPERM;
> +	}

fallocate may eventually go to kvm_gmem_get_folio(), so that would check
kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?

>
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> -		return -EOPNOTSUPP;
> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
>
> -	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> -		return -EINVAL;
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}

There's some reordering here. Why not let the validation happen like
before, then check kvm_gmem_is_frozen()?

>
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
>  		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>
> [...snip...]
>
> +
> +/**
> + * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
> + * @inode: The guest_memfd inode.
> + * @freeze: True to freeze, false to unfreeze.
> + *
> + * This API is used strictly during the live update / preservation transition
> + * window to prevent host userspace and guest-side faults from making any
> + * mapping modifications (such as fallocate or page fault allocation)
> + * to the guest_memfd page cache.
> + *
> + * Synchronization Strategy (Sleepable RCU):
> + * To avoid high-contention VFS locks (like inode_lock or
> + * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
> + * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
> + * (`kvm_gmem_freeze_srcu`):
> + *
> + * Global vs. Per-Inode SRCU
> + * ======================
> + * A single system-wide global static `srcu_struct` is used instead of a
> + * per-inode SRCU structure to completely prevent unprivileged users from
> + * exhausting the host's per-CPU memory allocator. Because
> + * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
> + * is not accounted by memory cgroups (memcg),
> + * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
> + * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
> + * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
> + *
> + * Flag Modification Note:
> + * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
> + * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
> + * flags are creation-time flags which remain strictly read-only), there is
> + * no possibility of concurrent bit-modification races. Therefore, a standard
> + * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
> + * synchronization loops.
> + */
> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
> +{
> +	u64 flags = READ_ONCE(GMEM_I(inode)->flags);
> +
> +	if (freeze)
> +		flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
> +	else
> +		flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
> +
> +	WRITE_ONCE(GMEM_I(inode)->flags, flags);
> +
> +	if (freeze)
> +		synchronize_srcu(&kvm_gmem_freeze_srcu);

Why only synchronize on freeze but not unfreeze?

> +}
> +
>
> [...snip...]
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
  2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
@ 2026-06-22 23:59   ` Ackerley Tng
  2026-06-23 12:48     ` tarunsahu
  2026-06-23 15:33     ` tarunsahu
  0 siblings, 2 replies; 23+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:59 UTC (permalink / raw)
  To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Tarun Sahu <tarunsahu@google.com> writes:

> Introduce core infrastructure to support VM preservation with LUO.
>
> First two changes are just refactoring, no functional change, third
> change introduces a new member in struct kvm.
> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>   and anon_inode_getfile() to provide a unified VM file creation API.
> - Track a weak reference to the backing file in struct kvm under
>   CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>   without circular lifetime dependencies.
>

Given the above, I think this should be separate patches.

> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
>  include/linux/kvm_host.h | 14 +++++++
>  virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
>  virt/kvm/kvm_mm.h        |  3 ++
>  3 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb06..9111a28637af 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -874,6 +874,18 @@ struct kvm {
>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>  	/* Protected by slots_lock (for writes) and RCU (for reads) */
>  	struct xarray mem_attr_array;
> +#endif
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Weak reference to the VFS file backing this KVM instance. Stored
> +	 * without incrementing the file refcount to prevent a circular lifetime
> +	 * dependency (since file->private_data already pins this struct kvm).
> +	 * Used exclusively to resolve the file pointer back from struct kvm.
> +	 *
> +	 * Written/cleared via rcu_assign_pointer() and read locklessly under
> +	 * RCU (e.g. via get_file_active() to prevent ABA races).
> +	 */
> +	struct file *vm_file;
>  #endif

We didn't really talk about this during the calls, but it seems weird to
preserve a vm_file with pretty much nothing other than the vm type. The
entire VM is re-created, which means it could potentially be a
completely different VM?

In some sense it's more flexible since the guest_memfd can be restored
with some completely different VM, but it seems like it could introduce
other issues.

I think other KVM folks would probably have more thoughts here.

>  	char stats_id[KVM_STATS_NAME_SIZE];
>  };
> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>  bool kvm_get_kvm_safe(struct kvm *kvm);
>  void kvm_put_kvm(struct kvm *kvm);
>  bool file_is_kvm(struct file *file);
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>  void kvm_put_kvm_no_destroy(struct kvm *kvm);
> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>
>  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1..65f0c5fb353e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -67,9 +67,6 @@
>  #include <linux/kvm_dirty_ring.h>
>
>
> -/* Worst case buffer size needed for holding an integer. */
> -#define ITOA_MAX_LEN 12
> -
>  MODULE_AUTHOR("Qumranet");
>  MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>  MODULE_LICENSE("GPL");
> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>  {
>  	struct kvm *kvm = filp->private_data;
>
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Clear the weak reference of the vm file.
> +	 * In case vm file is closed by userspace, but kvm still has
> +	 * other users like vCPUs, clearing this pointer ensures
> +	 * that we don't have a dangling pointer to a closed file.
> +	 *
> +	 * Cleared via rcu_assign_pointer() to ensure proper memory visibility
> +	 * for concurrent lockless readers under RCU.
> +	 */
> +	rcu_assign_pointer(kvm->vm_file, NULL);
> +#endif
> +
>  	kvm_irqfd_release(kvm);
>
>  	kvm_put_kvm(kvm);
> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
> +{
> +	struct kvm *kvm = kvm_create_vm(type, fdname);
> +	struct file *file;
> +
> +	if (IS_ERR(kvm))
> +		return ERR_CAST(kvm);
> +
> +	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> +	if (IS_ERR(file)) {
> +		kvm_put_kvm(kvm);
> +		return file;
> +	}
> +
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> +	/*
> +	 * Weak reference to the file (without get_file()) to prevent a circular
> +	 * dependency. Safe because the file's release path clears this pointer
> +	 * and drops its reference to the VM.
> +	 *
> +	 * Written via rcu_assign_pointer() because the pointer can be read
> +	 * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
> +	 * get_file_active() to prevent lockless ABA races).
> +	 */
> +	rcu_assign_pointer(kvm->vm_file, file);
> +#endif
> +
> +	/*
> +	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
> +	 * already set, with ->release() being kvm_vm_release().  In error
> +	 * cases it will be called by the final fput(file) and will take
> +	 * care of doing kvm_put_kvm(kvm).
> +	 */
> +
> +	return file;
> +}
> +
>  static int kvm_dev_ioctl_create_vm(unsigned long type)
>  {
>  	char fdname[ITOA_MAX_LEN + 1];
>  	int r, fd;
> -	struct kvm *kvm;
>  	struct file *file;
>
>  	fd = get_unused_fd_flags(O_CLOEXEC);
> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>
>  	snprintf(fdname, sizeof(fdname), "%d", fd);
>
> -	kvm = kvm_create_vm(type, fdname);
> -	if (IS_ERR(kvm)) {
> -		r = PTR_ERR(kvm);
> -		goto put_fd;
> -	}
> -
> -	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> +	file = kvm_create_vm_file(type, fdname);
>  	if (IS_ERR(file)) {
>  		r = PTR_ERR(file);
> -		goto put_kvm;
> +		goto put_fd;
>  	}
>
> -	/*
> -	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
> -	 * already set, with ->release() being kvm_vm_release().  In error
> -	 * cases it will be called by the final fput(file) and will take
> -	 * care of doing kvm_put_kvm(kvm).
> -	 */
> -	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);

Notifying with file->private_data threw me off... I would rather inline
the rcu_assign_pointer() in this function and have this line read
notify(..., kvm) like before.

>
>  	fd_install(fd, file);
>  	return fd;
>
> -put_kvm:
> -	kvm_put_kvm(kvm);
>  put_fd:
>  	put_unused_fd(fd);
>  	return r;
> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>  	kfree(env);
>  }
>
> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
> +{
> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +}
> +
>  static void kvm_init_debug(void)
>  {
>  	const struct file_operations *fops;
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -3,6 +3,9 @@
>  #ifndef __KVM_MM_H__
>  #define __KVM_MM_H__ 1
>
> +/* Worst case buffer size needed for holding an integer as a string. */
> +#define ITOA_MAX_LEN 12
> +
>  /*
>   * Architectures can choose whether to use an rwlock or spinlock
>   * for the mmu_lock.  These macros, for use in common code
> --
> 2.54.0.1032.g2f8565e1d1-goog

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-22 23:54   ` Ackerley Tng
@ 2026-06-23  0:09     ` Sean Christopherson
  2026-06-23 14:03       ` tarunsahu
  2026-06-23 14:02     ` tarunsahu
  2026-06-23 14:36     ` tarunsahu
  2 siblings, 1 reply; 23+ messages in thread
From: Sean Christopherson @ 2026-06-23  0:09 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, axelrasmussen, linux-kselftest,
	kexec, linux-kernel, linux-doc, kvm, linux-mm

On Mon, Jun 22, 2026, Ackerley Tng wrote:
> Tarun Sahu <tarunsahu@google.com> writes:
> 
> > This patch introduces the freeze on gmem_inode which prevents
> 
> Can't find the reference now, but commit messages should take the
> imperative mood and avoid "this patch" [*]

From Documentation/process/submitting-patches.rst:

  Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour.

Documentation/process/maintainer-tip.rst and Documentation/process/maintainer-kvm-x86.rst
elaborate more on the preferred style (I do most of the guest_memfd maintenance,
and so for all intents and purposes it's bound by KVM x86 "rules").

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
  2026-06-22 23:59   ` Ackerley Tng
@ 2026-06-23 12:48     ` tarunsahu
  2026-06-23 15:33     ` tarunsahu
  1 sibling, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-23 12:48 UTC (permalink / raw)
  To: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm


Hi,

Thanks for reviewing the patch.


Ackerley Tng <ackerleytng@google.com> writes:

> Tarun Sahu <tarunsahu@google.com> writes:
>
>> Introduce core infrastructure to support VM preservation with LUO.
>>
>> First two changes are just refactoring, no functional change, third
>> change introduces a new member in struct kvm.
>> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
>> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>>   and anon_inode_getfile() to provide a unified VM file creation API.
>> - Track a weak reference to the backing file in struct kvm under
>>   CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>>   without circular lifetime dependencies.
>>
>
> Given the above, I think this should be separate patches.

Okay.

>
>> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>> ---
>>  include/linux/kvm_host.h | 14 +++++++
>>  virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
>>  virt/kvm/kvm_mm.h        |  3 ++
>>  3 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 4c14aee1fb06..9111a28637af 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -874,6 +874,18 @@ struct kvm {
>>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>>  	/* Protected by slots_lock (for writes) and RCU (for reads) */
>>  	struct xarray mem_attr_array;
>> +#endif
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Weak reference to the VFS file backing this KVM instance. Stored
>> +	 * without incrementing the file refcount to prevent a circular lifetime
>> +	 * dependency (since file->private_data already pins this struct kvm).
>> +	 * Used exclusively to resolve the file pointer back from struct kvm.
>> +	 *
>> +	 * Written/cleared via rcu_assign_pointer() and read locklessly under
>> +	 * RCU (e.g. via get_file_active() to prevent ABA races).
>> +	 */
>> +	struct file *vm_file;
>>  #endif
>
> We didn't really talk about this during the calls, but it seems weird to
> preserve a vm_file with pretty much nothing other than the vm type. The
> entire VM is re-created, which means it could potentially be a
> completely different VM?
>
> In some sense it's more flexible since the guest_memfd can be restored
> with some completely different VM, but it seems like it could introduce
> other issues.
>
> I think other KVM folks would probably have more thoughts here.

IIUC,
you are asking "Why preserve vm_fd with guest_memfd when we only
preserve vm_type?"

We discussed this. It is also explained here (also copying it below):
[RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO
https://lore.kernel.org/all/8730c0e11acbd0d645a8b7187cd5cd7de373380e.1780676742.git.tarunsahu@google.com/

and
https://lore.kernel.org/all/cover.1780667929.git.tarunsahu@google.com/
(This cover letter was sent separately from the patches due to a problem
in my automated script)

vm_fd is needed for guest_memfd retrieval, because guest_memfd cannot
be retrieved without struct kvm and there is no other way to pass
that. (We talked about alternatives such as a LINK ioctl, or splitting
the CREATE_GUEST_MEMFD ioctl into two ioctls: one that just creates the
guest_memfd and another that attaches it to the vm_file (struct kvm).)
We discarded the alternative approaches because they change the
guest_memfd design.

This patch also sets up the infrastructure to preserve the vm_fd, which
will be extended in the future when we introduce private support,
where TDX-related data (sPTE) might be preserved via struct kvm. Also,
vCPU state, the IRQ routing table, etc. can be preserved if needed.


>> +	struct file *vm_file;

If you are asking about the diff above (why vm_file is there): there is
otherwise no way to get the vm_file from struct kvm, which is needed
during guest_memfd preservation (in the freeze call) to preserve the
token of the vm_fd. This token is used at retrieval time.


I have sent V3 as well here:
https://lore.kernel.org/all/20260622184851.2309827-1-tarunsahu@google.com/

V3 includes a few minor fixes suggested by sashiko.
We can continue reviewing on V2/V3. I will include all of the
suggestions in V4.

>
>>  	char stats_id[KVM_STATS_NAME_SIZE];
>>  };
>> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>>  bool kvm_get_kvm_safe(struct kvm *kvm);
>>  void kvm_put_kvm(struct kvm *kvm);
>>  bool file_is_kvm(struct file *file);
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>>  void kvm_put_kvm_no_destroy(struct kvm *kvm);
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>>
>>  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
>>  {
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 89489996fbc1..65f0c5fb353e 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -67,9 +67,6 @@
>>  #include <linux/kvm_dirty_ring.h>
>>
>>
>> -/* Worst case buffer size needed for holding an integer. */
>> -#define ITOA_MAX_LEN 12
>> -
>>  MODULE_AUTHOR("Qumranet");
>>  MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>>  MODULE_LICENSE("GPL");
>> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>>  {
>>  	struct kvm *kvm = filp->private_data;
>>
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Clear the weak reference of the vm file.
>> +	 * In case vm file is closed by userspace, but kvm still has
>> +	 * other users like vCPUs, clearing this pointer ensures
>> +	 * that we don't have a dangling pointer to a closed file.
>> +	 *
>> +	 * Cleared via rcu_assign_pointer() to ensure proper memory visibility
>> +	 * for concurrent lockless readers under RCU.
>> +	 */
>> +	rcu_assign_pointer(kvm->vm_file, NULL);
>> +#endif
>> +
>>  	kvm_irqfd_release(kvm);
>>
>>  	kvm_put_kvm(kvm);
>> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>>  }
>>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>>
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
>> +{
>> +	struct kvm *kvm = kvm_create_vm(type, fdname);
>> +	struct file *file;
>> +
>> +	if (IS_ERR(kvm))
>> +		return ERR_CAST(kvm);
>> +
>> +	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +	if (IS_ERR(file)) {
>> +		kvm_put_kvm(kvm);
>> +		return file;
>> +	}
>> +
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Weak reference to the file (without get_file()) to prevent a circular
>> +	 * dependency. Safe because the file's release path clears this pointer
>> +	 * and drops its reference to the VM.
>> +	 *
>> +	 * Written via rcu_assign_pointer() because the pointer can be read
>> +	 * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
>> +	 * get_file_active() to prevent lockless ABA races).
>> +	 */
>> +	rcu_assign_pointer(kvm->vm_file, file);
>> +#endif
>> +
>> +	/*
>> +	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> +	 * already set, with ->release() being kvm_vm_release().  In error
>> +	 * cases it will be called by the final fput(file) and will take
>> +	 * care of doing kvm_put_kvm(kvm).
>> +	 */
>> +
>> +	return file;
>> +}
>> +
>>  static int kvm_dev_ioctl_create_vm(unsigned long type)
>>  {
>>  	char fdname[ITOA_MAX_LEN + 1];
>>  	int r, fd;
>> -	struct kvm *kvm;
>>  	struct file *file;
>>
>>  	fd = get_unused_fd_flags(O_CLOEXEC);
>> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>>
>>  	snprintf(fdname, sizeof(fdname), "%d", fd);
>>
>> -	kvm = kvm_create_vm(type, fdname);
>> -	if (IS_ERR(kvm)) {
>> -		r = PTR_ERR(kvm);
>> -		goto put_fd;
>> -	}
>> -
>> -	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +	file = kvm_create_vm_file(type, fdname);
>>  	if (IS_ERR(file)) {
>>  		r = PTR_ERR(file);
>> -		goto put_kvm;
>> +		goto put_fd;
>>  	}
>>
>> -	/*
>> -	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> -	 * already set, with ->release() being kvm_vm_release().  In error
>> -	 * cases it will be called by the final fput(file) and will take
>> -	 * care of doing kvm_put_kvm(kvm).
>> -	 */
>> -	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
>
> Notifying with file->private_data threw me off... I would rather inline
> the rcu_assign_pointer() in this function and have this line read
> notify(..., kvm) like before.
>
>>
>>  	fd_install(fd, file);
>>  	return fd;
>>
>> -put_kvm:
>> -	kvm_put_kvm(kvm);
>>  put_fd:
>>  	put_unused_fd(fd);
>>  	return r;
>> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>>  	kfree(env);
>>  }
>>
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
>> +{
>> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +}
>> +
>>  static void kvm_init_debug(void)
>>  {
>>  	const struct file_operations *fops;
>> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
>> --- a/virt/kvm/kvm_mm.h
>> +++ b/virt/kvm/kvm_mm.h
>> @@ -3,6 +3,9 @@
>>  #ifndef __KVM_MM_H__
>>  #define __KVM_MM_H__ 1
>>
>> +/* Worst case buffer size needed for holding an integer as a string. */
>> +#define ITOA_MAX_LEN 12
>> +
>>  /*
>>   * Architectures can choose whether to use an rwlock or spinlock
>>   * for the mmu_lock.  These macros, for use in common code
>> --
>> 2.54.0.1032.g2f8565e1d1-goog

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-22 23:54   ` Ackerley Tng
  2026-06-23  0:09     ` Sean Christopherson
@ 2026-06-23 14:02     ` tarunsahu
  2026-06-23 14:36     ` tarunsahu
  2 siblings, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-23 14:02 UTC (permalink / raw)
  To: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm


Thanks for reviewing!

Ackerley Tng <ackerleytng@google.com> writes:

> Tarun Sahu <tarunsahu@google.com> writes:
>
>> This patch introduces the freeze on gmem_inode which prevents
>
> Can't find the reference now, but commit messages should take the
> imperative mood and avoid "this patch" [*]
>
> [*] https://lore.kernel.org/all/YKRWNaqzo4GVDxHP@google.com/
>

ACK. Will take care of it.

>> the fallocate call and any new page fault allocation. This will avoid
>> gmem file modification when it is being preserved
>>
>> Used srcu lock to synchronise the freeze call, where write blocks
>> until all the reads are free. And reads are re-entrant.
>>
>> Incase fault fails, It return -EPERM and VM_EXIT to userspace. userspace
>> must handle this properly as every new fault will fail.
>>
>> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>>
>> [...snip...]
>>
>> @@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>>  	if (!IS_ERR(folio))
>>  		return folio;
>>
>> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>> +	if (kvm_gmem_is_frozen(inode)) {
>> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> +		return ERR_PTR(-EPERM);
>> +	}
>> +
>>  	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
>>  	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
>>  					 FGP_LOCK | FGP_CREAT,
>>  					 mapping_gfp_mask(inode->i_mapping), policy);
>>  	mpol_cond_put(policy);
>>
>> +	srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> +
>>  	/*
>>  	 * External interfaces like kvm_gmem_get_pfn() support dealing
>>  	 * with hugepages to a degree, but internally, guest_memfd currently
>> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>>  			       loff_t len)
>>  {
>> +	struct inode *inode = file_inode(file);
>>  	int ret;
>> +	int idx;
>>
>> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
>> -		return -EOPNOTSUPP;
>> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>> +	if (kvm_gmem_is_frozen(inode)) {
>> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> +		return -EPERM;
>> +	}
>
> fallocate may eventually go to kvm_gmem_get_folio(), so that would check
> kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?
>

Right, it is there to catch the punch hole case. And since the read lock
is re-entrant, I blocked the fallocate call completely.

>>
>> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> -		return -EOPNOTSUPP;
>> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
>> +		ret = -EOPNOTSUPP;
>> +		goto out;
>> +	}
>>
>> -	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>> -		return -EINVAL;
>> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
>> +		ret = -EOPNOTSUPP;
>> +		goto out;
>> +	}
>> +
>> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>
> There's some reordering here. Why not let the validation happen like
> before, then check kvm_gmem_is_frozen()?
>
>>
>>  	if (mode & FALLOC_FL_PUNCH_HOLE)
>>  		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>>
>> [...snip...]
>>
>> +
>> +/**
>> + * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
>> + * @inode: The guest_memfd inode.
>> + * @freeze: True to freeze, false to unfreeze.
>> + *
>> + * This API is used strictly during the live update / preservation transition
>> + * window to prevent host userspace and guest-side faults from making any
>> + * mapping modifications (such as fallocate or page fault allocation)
>> + * to the guest_memfd page cache.
>> + *
>> + * Synchronization Strategy (Sleepable RCU):
>> + * To avoid high-contention VFS locks (like inode_lock or
>> + * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
>> + * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
>> + * (`kvm_gmem_freeze_srcu`):
>> + *
>> + * Global vs. Per-Inode SRCU
>> + * ======================
>> + * A single system-wide global static `srcu_struct` is used instead of a
>> + * per-inode SRCU structure to completely prevent unprivileged users from
>> + * exhausting the host's per-CPU memory allocator. Because
>> + * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
>> + * is not accounted by memory cgroups (memcg),
>> + * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
>> + * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
>> + * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
>> + *
>> + * Flag Modification Note:
>> + * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
>> + * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
>> + * flags are creation-time flags which remain strictly read-only), there is
>> + * no possibility of concurrent bit-modification races. Therefore, a standard
>> + * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
>> + * synchronization loops.
>> + */
>> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
>> +{
>> +	u64 flags = READ_ONCE(GMEM_I(inode)->flags);
>> +
>> +	if (freeze)
>> +		flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
>> +	else
>> +		flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
>> +
>> +	WRITE_ONCE(GMEM_I(inode)->flags, flags);
>> +
>> +	if (freeze)
>> +		synchronize_srcu(&kvm_gmem_freeze_srcu);
>
> Why only synchronize on freeze but not unfreeze?

It was not needed, because:

Freeze => True
When a user sets freeze to true:

"Preservation will be stalled until all the currently ongoing
allocations have finished, and future allocations are already stopped."

Freeze => False
When a user unfreezes, the currently ongoing allocation/fallocate calls
will return -EPERM, and future ones will succeed as freeze is set to
false. Synchronization would only stall the user; the behaviour does
not change.

Unless the user expects that unfreeze should wait for all the ongoing
operations to drain.

>
>> +}
>> +
>>
>> [...snip...]
>>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-23  0:09     ` Sean Christopherson
@ 2026-06-23 14:03       ` tarunsahu
  0 siblings, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-23 14:03 UTC (permalink / raw)
  To: Sean Christopherson, Ackerley Tng
  Cc: Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin, Shuah Khan,
	sagis, aneesh.kumar, skhawaja, vipinsh, Pratyush Yadav, david,
	dmatlack, mark.rutland, Paolo Bonzini, Mike Rapoport,
	Alexander Graf, axelrasmussen, linux-kselftest, kexec,
	linux-kernel, linux-doc, kvm, linux-mm

Sean Christopherson <seanjc@google.com> writes:

> On Mon, Jun 22, 2026, Ackerley Tng wrote:
>> Tarun Sahu <tarunsahu@google.com> writes:
>> 
>> > This patch introduces the freeze on gmem_inode which prevents
>> 
>> Can't find the reference now, but commit messages should take the
>> imperative mood and avoid "this patch" [*]
>
> From Documentation/process/submitting-patches.rst:
>
>   Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
>   instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
>   to do frotz", as if you are giving orders to the codebase to change
>   its behaviour.
>
> Documentation/process/maintainer-tip.rst and Documentation/process/maintainer-kvm-x86.rst
> elaborate more on the preferred style (I do most of the guest_memfd maintenance,
> and so for all intents and purpose it's bound by KVM x86 "rules").


Thanks! Will take care of that.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-22 23:54   ` Ackerley Tng
  2026-06-23  0:09     ` Sean Christopherson
  2026-06-23 14:02     ` tarunsahu
@ 2026-06-23 14:36     ` tarunsahu
  2026-06-23 16:14       ` Pratyush Yadav
  2 siblings, 1 reply; 23+ messages in thread
From: tarunsahu @ 2026-06-23 14:36 UTC (permalink / raw)
  To: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Ackerley Tng <ackerleytng@google.com> writes:

> Tarun Sahu <tarunsahu@google.com> writes:
>
>>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>>  			       loff_t len)
>>  {
>> +	struct inode *inode = file_inode(file);
>>  	int ret;
>> +	int idx;
>>
>> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
>> -		return -EOPNOTSUPP;
>> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>> +	if (kvm_gmem_is_frozen(inode)) {
>> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> +		return -EPERM;
>> +	}
>
> fallocate may eventually go to kvm_gmem_get_folio(), so that would check
> kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?
>
>>
>> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> -		return -EOPNOTSUPP;
>> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
>> +		ret = -EOPNOTSUPP;
>> +		goto out;
>> +	}
>>
>> -	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>> -		return -EINVAL;
>> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
>> +		ret = -EOPNOTSUPP;
>> +		goto out;
>> +	}
>> +
>> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
>> +		ret = -EINVAL;
>> +		goto out;
>> +	}
>
> There's some reordering here. Why not let the validation happen like
> before, then check kvm_gmem_is_frozen()?

To align with the design: "stop the fallocate call if the inode is
frozen, no need to go further". I don't have a strong opinion on this.
I am fine with moving the check so that it only covers the punch hole
path as well, to make it more fine-grained. But then it no longer claims
to stop the fallocate call as a whole (the allocation case is stopped in
a separate path: the fault path), though functionally it does the same
thing.

WDYT?

~Tarun

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
  2026-06-22 23:27   ` Ackerley Tng
@ 2026-06-23 15:26     ` tarunsahu
  0 siblings, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-23 15:26 UTC (permalink / raw)
  To: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm


Thanks for reviewing!

Ackerley Tng <ackerleytng@google.com> writes:

> Tarun Sahu <tarunsahu@google.com> writes:
>
>> This patch sets up the basic infrastructure to preserve the guest_memfd.
>> Currently this supports only fully shared guest_memfd and backed by
>> PAGE_SIZE pages.
>>
>> It registers a new LUO file handler for guest_memfd files to serialize
>> and deserialize guest memory. This allows preserving guest memory backed
>> by guest_memfd across updates, ensuring that guest instances can be
>> resumed seamlessly without losing their memory contents.
>>
>> Preservation is straight forward. It walks through the folios and
>> serialize them.
>>
>> There is kvm_gmem_freeze call on preserve which freeze the guest_memfd
>> inode. It avoids any changes to inode mapping with fallocate calls or
>> any new fault allocation (fails) on or after preservation. No need to check
>> this during the page fault as preservation is only supported for
>> pre-faulted/pre-allocated guest_memfd.
>>
>> While retrieving the guest_memfd, it requires the struct kvm to create
>> new guest_memfd. So it first get the vm_file from the same session using
>> the token passed during the preservation. And use it to get
>> vm_file->kvm.
>>
>> This change also update the MAINTAINERS list.
>>
>> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>> ---
>>  MAINTAINERS                 |   1 +
>>  include/linux/kho/abi/kvm.h |  79 +++++-
>>  virt/kvm/Makefile.kvm       |   2 +-
>>  virt/kvm/guest_memfd_luo.c  | 485 ++++++++++++++++++++++++++++++++++++
>>  virt/kvm/kvm_main.c         |   7 +
>>  virt/kvm/kvm_mm.h           |   4 +
>>  6 files changed, 571 insertions(+), 7 deletions(-)
>>  create mode 100644 virt/kvm/guest_memfd_luo.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 9bfc3c1f6676..16cba790a84d 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -14418,6 +14418,7 @@ L:	kexec@lists.infradead.org
>>  L:	kvm@vger.kernel.org
>>  S:	Maintained
>>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
>> +F:	virt/kvm/guest_memfd_luo.c
>>  F:	virt/kvm/kvm_luo.c
>>
>>  KVM PARAVIRT (KVM/paravirt)
>> diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
>> index 718db68a541a..42074d76e04a 100644
>> --- a/include/linux/kho/abi/kvm.h
>> +++ b/include/linux/kho/abi/kvm.h
>> @@ -9,20 +9,23 @@
>>  #define _LINUX_KHO_ABI_KVM_H
>>
>>  #include <linux/types.h>
>> +#include <linux/bits.h>
>>  #include <linux/kho/abi/kexec_handover.h>
>>
>>  /**
>> - * DOC: KVM Live Update ABI
>> + * DOC: KVM and guest_memfd Live Update ABI
>>   *
>> - * KVM uses the ABI defined below for preserving its state
>> + * KVM and guest_memfd use the ABI defined below for preserving their states
>>   * across a kexec reboot using the LUO.
>>   *
>> - * The state is serialized into a packed structure `struct kvm_luo_ser`
>> - * which is handed over to the next kernel via the KHO mechanism.
>> + * The state is serialized into packed structures (struct kvm_luo_ser and
>> + * struct guest_memfd_luo_ser) which are handed over to the next kernel via
>> + * the KHO mechanism.
>>   *
>> - * This interface is a contract. Any modification to the structure layout
>> + * This interface is a contract. Any modification to the structure layouts
>>   * constitutes a breaking change. Such changes require incrementing the
>> - * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
>> + * version number in the KVM_LUO_FH_COMPATIBLE or
>> + * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
>>   */
>>
>>  /**
>> @@ -36,4 +39,68 @@ struct kvm_luo_ser {
>>  /* The compatibility string for KVM VM file handler */
>>  #define KVM_LUO_FH_COMPATIBLE	"kvm_vm_luo_v1"
>>
>> +/**
>> + * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
>> + * @pfn:   Page Frame Number of the folio.
>> + * @index: Page offset of the folio within the file.
>> + * @flags: State flags associated with the folio.
>> + */
>> +struct guest_memfd_luo_folio_ser {
>> +	u64 pfn:52;
>> +	u64 flags:12;
>> +	u64 index;
>> +} __packed;
>> +
>> +/**
>> + * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
>> + *
>> + * This flag is per folio to check if the folio is uptodate.
>> + */
>> +#define GUEST_MEMFD_LUO_FOLIO_UPTODATE	BIT(0)
>> +
>> +
>> +/**
>> + * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
>> + *
>> + * This flag indicates that the guest_memfd supports host-side mmap.
>> + */
>> +#define GUEST_MEMFD_LUO_FLAG_MMAP		BIT(0)
>> +
>> +/**
>> + * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
>> + *
>> + * This flag indicates that the guest_memfd has been initialized as shared
>> + * memory.
>> + */
>> +#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED	BIT(1)
>> +
>> +/**
>> + * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
>> + *
>> + * A mask of all guest_memfd preservation flags supported by this version
>> + * of the KVM LUO ABI.
>> + */
>> +#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS	(GUEST_MEMFD_LUO_FLAG_MMAP | \
>> +						 GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
>> +
>> +/**
>> + * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
>> + * @size:      The size of the file in bytes.
>> + * @flags:     File-level flags.
>> + * @nr_folios: Number of folios in the folios array.
>> + * @vm_token:  Token of the associated KVM VM instance.
>> + * @folios:    KHO vmalloc descriptor pointing to the array of
>> + *             struct guest_memfd_luo_folio_ser.
>> + */
>> +struct guest_memfd_luo_ser {
>> +	u64 size;
>> +	u64 flags;
>> +	u64 nr_folios;
>> +	u64 vm_token;
>> +	struct kho_vmalloc folios;
>> +} __packed;
>> +
>> +/* The compatibility string for GUEST_MEMFD file handler */
>> +#define GUEST_MEMFD_LUO_FH_COMPATIBLE	"guest_memfd_luo_v1"
>> +
>>  #endif /* _LINUX_KHO_ABI_KVM_H */
>> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
>> index c1a962159264..d30fca094c42 100644
>> --- a/virt/kvm/Makefile.kvm
>> +++ b/virt/kvm/Makefile.kvm
>> @@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
>>  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
>>  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
>>  kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
>> -kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
>> +kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
>> diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
>> new file mode 100644
>> index 000000000000..d466f889c9aa
>> --- /dev/null
>> +++ b/virt/kvm/guest_memfd_luo.c
>> @@ -0,0 +1,485 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +/*
>> + * Copyright (c) 2026, Google LLC.
>> + * Tarun Sahu <tarunsahu@google.com>
>> + *
>> + * Guestmemfd Preservation for Live Update Orchestrator (LUO)
>> + */
>> +
>> +/**
>> + * DOC: Guestmemfd Preservation via LUO
>> + *
>> + * Overview
>> + * ========
>> + *
>> + * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
>> + * reboot using the Live Update Orchestrator (LUO) file preservation. This
>> + * allows userspace to preserve VM memory across kexec reboots.
>> + *
>> + * The preservation is not intended to be transparent. Only select properties
>> + * of the guest_memfd are preserved, while others are reset to default.
>> + *
>> + * Preserved Properties
>> + * ====================
>> + *
>> + * The following properties of guest_memfd are preserved across kexec:
>> + *
>> + * File Size
>> + *   The size of the file is preserved.
>> + *
>> + * File Contents
>> + *   All folios present in the page cache are preserved.
>> + *
>> + * File-level Flags
>> + *   The file-level flags (such as MMAP support and INIT_SHARED default mapping)
>> + *   are preserved.
>> + *
>> + * Non-Preserved Properties
>> + * ========================
>> + *
>> + * NUMA Memory Policy
>> + *   NUMA memory policies associated with the guest_memfd are not preserved.
>> + */
>> +#include <linux/liveupdate.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/pagemap.h>
>> +#include <linux/file.h>
>> +#include <linux/err.h>
>> +#include <linux/anon_inodes.h>
>> +#include <linux/magic.h>
>> +#include <linux/kexec_handover.h>
>> +#include <linux/kho/abi/kexec_handover.h>
>> +#include <linux/kho/abi/kvm.h>
>> +#include "guest_memfd.h"
>> +
>> +static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
>> +		pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
>> +		u64 *out_count)
>> +{
>> +	struct folio_batch fbatch;
>> +	pgoff_t index = 0;
>> +	u64 count = 0;
>> +	int err = 0;
>> +
>> +	folio_batch_init(&fbatch);
>> +	while (index < end_index) {
>> +		unsigned int nr, i;
>> +
>> +		nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
>> +		if (nr == 0)
>> +			break;
>> +
>> +		for (i = 0; i < nr; i++) {
>> +			struct folio *folio = fbatch.folios[i];
>> +
>> +			if (folios_ser) {
>> +				if (folio_test_hwpoison(folio)) {
>> +					err = -EHWPOISON;
>> +					folio_batch_release(&fbatch);
>> +					goto out;
>> +				}
>> +				err = kho_preserve_folio(folio);
>> +				if (err) {
>> +					folio_batch_release(&fbatch);
>> +					goto out;
>> +				}
>> +
>> +				folios_ser[count].pfn = folio_pfn(folio);
>> +				folios_ser[count].index = folio->index;
>> +				folios_ser[count].flags = folio_test_uptodate(folio) ?
>> +							  GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
>> +			}
>> +			count++;
>> +		}
>> +		folio_batch_release(&fbatch);
>> +		cond_resched();
>> +	}
>> +
>> +out:
>> +	*out_count = count;
>> +	return err;
>> +}
>> +
>> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
>> +{
>> +	struct inode *inode = file_inode(file);
>> +	struct gmem_file *gmem_file = file->private_data;
>> +	struct kvm *kvm = gmem_file->kvm;
>> +
>> +	if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
>> +		return 0;
>> +
>
> How does .can_preserve decide route to this function? If it already
> routes here, wouldn't this inode definitely be a guest_memfd file?

No.
kvm_gmem_luo_handler {
                 ...
                 .ops.can_preserve = kvm_gmem_luo_can_preserve;
                 ...
                 }

.can_preserve is an eligibility call. The LUO preservation ioctl can be
called for any type of file (for example, a memfd file). LUO internally
loops through all the registered handlers (memfd_luo handler, kvm_luo
handler, guest_memfd_luo handler, etc.) and calls their can_preserve
functions; for this handler that is kvm_gmem_luo_can_preserve. For a
memfd file it will return false, and for a guest_memfd file it will
return true.

In short: this function is used to select the handler for a guest_memfd
file. We have additional checks so that only the guest_memfd files we
currently support for preservation are accepted.

>
>> +	if (kvm_arch_has_private_mem(kvm))
>> +		return 0;
>> +
>> +	if (mapping_large_folio_support(inode->i_mapping))
>> +		return 0;
>> +
>> +	return 1;
>
> Let's return true and false rather than relying on casting.

ACK. Will update it.
>
>> +}
>> +
>> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
>> +{
>> +	struct guest_memfd_luo_folio_ser *folios_ser = NULL;
>> +	u64 count = 0, gmem_flags, abi_flags = 0;
>> +	struct guest_memfd_luo_ser *ser;
>> +	struct address_space *mapping;
>> +	struct gmem_file *gmem_file;
>> +	struct inode *inode;
>> +	pgoff_t end_index;
>> +	struct kvm *kvm;
>> +	int err = 0;
>> +	long size;
>> +
>> +	inode = file_inode(args->file);
>
> I think to lock out all allocates, you'd have to take
> filemap_invalidate_lock() before freezing.

No need; freeze waits for synchronisation. Only after the current
allocation finishes (future allocations are blocked by freeze = true)
will it proceed further.

>
>> +	kvm_gmem_freeze(inode, true);
>> +
>> +	mapping = inode->i_mapping;
>> +	size = i_size_read(inode);
>> +	if (!size) {
>> +		err = -EINVAL;
>> +		goto err_unfreeze_inode;
>> +	}
>> +
>> +	if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
>> +		err = -EINVAL;
>> +		goto err_unfreeze_inode;
>> +	}
>> +
>> +	gmem_file = args->file->private_data;
>> +	kvm = gmem_file->kvm;
>> +
>> +	gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
>> +	if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED
>
> Why condition this on MMAP?

Here, it is only checking which flags are set in the gmem inode.
Currently, the gmem inode (not talking about preservation here) supports
three flags: MMAP, INIT_SHARED and FROZEN. So, if any flags other than
those are set, we fail preservation with -EOPNOTSUPP.

>
> After conversions lands, we'd have to iterate to check that the entire
> guest_memfd is shared offset-by-offset instead of checking for
> INIT_SHARED.

This is unrelated to the code snippet being commented on here, but we
can discuss it as it is a good question.

I disagree; it will not be needed. Whether in-place conversion lands
before or after this preservation series, there will be no effect on the
preservation series (IMHO, IIUC). I have the following two checks in V3:
https://lore.kernel.org/all/20260622184851.2309827-7-tarunsahu@google.com/#:~:text=static%20bool%20kvm_gmem_luo_can_preserve,0%3B%0A%2B%0A%2B%09return%201%3B%0A%2B%7D

INIT_SHARED: makes sure that the guest_memfd was completely shared
when it was created.
!kvm_arch_has_private_mem(): avoids any future conversion.

So, if these two checks pass, that means the guest_memfd file does
not have any private pages, and we can preserve it. This also aligns
with the logic in the in-place conversion series.

>
>> +				| GUEST_MEMFD_F_MAPPING_FROZEN)) {
>
> This would always be true since kvm_gmem_freeze() is done above.

Like I said above, it is just checking if there are any flags set other
than these three.

>
>> +		err = -EOPNOTSUPP;
>> +		goto err_unfreeze_inode;
>> +	}
>> +
>> +	if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
>> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
>> +	if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
>> +		abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
>> +
>
> Is it intentional to have a different set of flags that are actually
> preserved? I think we should refactor out a function to transfer the
> flags over.

To have backward compatibility with the userspace ABI, the flags
must be constant across different kernel versions. In the file
include/linux/kho/abi/kvm.h I have defined these two flags, and they
are guaranteed to be constant across different kernel versions.
Internal flags might change their names, but externally these will
remain constant.

gmem_inode->flags (internal kernel variability)
                   |
                   v
"userspace defined gmem_inode flags" => preserve

<KEXEC to new kernel version>

retrieve => "userspace defined gmem_inode flags"
                   |
                   v
gmem_inode->flags (internal kernel variability)

Ack on refactoring out the logic: Will take care of that in next
revision v4.

>
>> +	end_index = size >> PAGE_SHIFT;
>> +
>> +	ser = kho_alloc_preserve(sizeof(*ser));
>> +	if (IS_ERR(ser)) {
>> +		err = PTR_ERR(ser);
>> +		goto err_unfreeze_inode;
>> +	}
>> +
>> +	/* First pass: Count the folios present in the page cache */
>> +	err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
>> +	if (err)
>> +		goto err_free_ser;
>> +
>> +	ser->size = size;
>> +	ser->flags = abi_flags;
>> +	ser->nr_folios = count;
>> +	ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()
>
> I don't think // is commonly used.

Will take care of such instances.

>
>> +
>> +	if (count > 0) {
>> +		folios_ser = vcalloc(count, sizeof(*folios_ser));
>> +		if (!folios_ser) {
>> +			err = -ENOMEM;
>> +			goto err_free_ser;
>> +		}
>> +
>> +		/* Second pass: Fill the metadata array and preserve folios */
>> +		err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);
>
> I think it's clearer to just define 2 functions rather than using the
> same function twice to do these different things. The comments on the
> two passes can then be dropped.

I have a single function to avoid duplication of the same code.

>
>> +		if (err)
>> +			goto err_unpreserve_unlocked;
>> +
>> +		if (WARN_ON_ONCE(count != ser->nr_folios)) {
>> +			err = -EINVAL;
>> +			goto err_unpreserve_unlocked;
>> +		}
>> +	}
>> +
>> +	if (count > 0) {
>> +		err = kho_preserve_vmalloc(folios_ser, &ser->folios);
>> +		if (err)
>> +			goto err_unpreserve_unlocked;
>> +	}
>> +
>> +	args->serialized_data = virt_to_phys(ser);
>> +	args->private_data = folios_ser;
>> +
>> +	return 0;
>> +
>> +err_unpreserve_unlocked:
>> +	for (long i = (long)count - 1; i >= 0; i--) {
>
> Not sure if it's common to define long i inline.

I will correct it. Good find, thanks.

>
>> +		struct folio *folio = pfn_folio(folios_ser[i].pfn);
>> +
>> +		kho_unpreserve_folio(folio);
>> +	}
>> +	vfree(folios_ser);
>> +err_free_ser:
>> +	kho_unpreserve_free(ser);
>> +err_unfreeze_inode:
>> +	kvm_gmem_freeze(inode, false);
>> +	return err;
>> +}
>> +
>> +static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
>> +{
>> +	struct guest_memfd_luo_ser *ser;
>> +	struct gmem_file *gmem_file;
>> +	struct kvm *kvm;
>> +	struct file *kvm_file;
>> +	u64 vm_token;
>> +	int err;
>> +
>> +	if (WARN_ON_ONCE(!args->serialized_data))
>> +		return -EINVAL;
>> +
>> +	ser = phys_to_virt(args->serialized_data);
>> +
>> +	gmem_file = args->file->private_data;
>> +	kvm = gmem_file->kvm;
>> +
>> +	/*
>> +	 * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
>> +	 * file memory from being reallocated while it is being processed.
>> +	 */
>> +	kvm_file = get_file_active(&kvm->vm_file);
>> +	if (!kvm_file)
>> +		return -ENOENT;
>> +
>> +	err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
>> +	fput(kvm_file);
>> +	if (err)
>> +		return err;
>> +
>> +	ser->vm_token = vm_token;
>> +	return 0;
>> +}
>> +
>> +static void kvm_gmem_luo_discard_folios(
>> +	const struct guest_memfd_luo_folio_ser *folios_ser,
>> +	u64 nr_folios, u64 start_idx)
>> +{
>> +	long i;
>> +
>> +	for (i = start_idx; i < nr_folios; i++) {
>> +		struct folio *folio;
>> +		phys_addr_t phys;
>> +
>> +		if (!folios_ser[i].pfn)
>> +			continue;
>> +
>> +		phys = PFN_PHYS(folios_ser[i].pfn);
>> +		folio = kho_restore_folio(phys);
>> +		if (folio)
>> +			folio_put(folio);
>> +	}
>> +}
>> +
>> +static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
>> +{
>> +	struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
>> +	struct guest_memfd_luo_ser *ser;
>> +	long i;
>> +
>> +	if (WARN_ON_ONCE(!args->serialized_data))
>> +		return;
>> +
>> +	ser = phys_to_virt(args->serialized_data);
>> +	if (!ser)
>> +		return;
>> +
>> +	if (ser->nr_folios > 0)
>> +		kho_unpreserve_vmalloc(&ser->folios);
>> +	for (i = ser->nr_folios - 1; i >= 0; i--) {
>> +		struct folio *folio;
>> +
>> +		if (!folios_ser[i].pfn)
>
> Is it possible for pfn to be 0 here? Perhaps this should be a
> WARN_ON_ONCE().

In the LUO design, we explicitly assume that pfn 0 is invalid.
Looping in +Pratyush for more context.

Yes, WARN_ON_ONCE makes sense. I will update it.

>
>> +			continue;
>> +
>> +		folio = pfn_folio(folios_ser[i].pfn);
>> +		kho_unpreserve_folio(folio);
>> +	}
>> +	vfree(folios_ser);
>> +
>> +	kho_unpreserve_free(ser);
>> +	kvm_gmem_freeze(file_inode(args->file), false);
>> +}
>> +
>>
>> [...snip...]
>>
Thanks,
Tarun



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
  2026-06-22 23:59   ` Ackerley Tng
  2026-06-23 12:48     ` tarunsahu
@ 2026-06-23 15:33     ` tarunsahu
  1 sibling, 0 replies; 23+ messages in thread
From: tarunsahu @ 2026-06-23 15:33 UTC (permalink / raw)
  To: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
  Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

Ackerley Tng <ackerleytng@google.com> writes:

> Tarun Sahu <tarunsahu@google.com> writes:
>
>> Introduce core infrastructure to support VM preservation with LUO.
>>
>> First two changes are just refactoring, no functional change, third
>> change introduces a new member in struct kvm.
>> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
>> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>>   and anon_inode_getfile() to provide a unified VM file creation API.
>> - Track a weak reference to the backing file in struct kvm under
>>   CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>>   without circular lifetime dependencies.
>>
>
> Given the above, I think this should be separate patches.
>
>> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>> ---
>>  include/linux/kvm_host.h | 14 +++++++
>>  virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
>>  virt/kvm/kvm_mm.h        |  3 ++
>>  3 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 4c14aee1fb06..9111a28637af 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -874,6 +874,18 @@ struct kvm {
>>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>>  	/* Protected by slots_lock (for writes) and RCU (for reads) */
>>  	struct xarray mem_attr_array;
>> +#endif
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Weak reference to the VFS file backing this KVM instance. Stored
>> +	 * without incrementing the file refcount to prevent a circular lifetime
>> +	 * dependency (since file->private_data already pins this struct kvm).
>> +	 * Used exclusively to resolve the file pointer back from struct kvm.
>> +	 *
>> +	 * Written/cleared via rcu_assign_pointer() and read locklessly under
>> +	 * RCU (e.g. via get_file_active() to prevent ABA races).
>> +	 */
>> +	struct file *vm_file;
>>  #endif
>
> We didn't really talk about this during the calls, but it seems weird to
> preserve a vm_file with pretty much nothing other than the vm type. The
> entire VM is re-created, which means it could potentially be a
> completely different VM?
>
> In some sense it's more flexible since the guest_memfd can be restored
> with some completely different VM, but it seems like it could introduce
> other issues.
>
> I think other KVM folks would probably have more thoughts here.
>
>>  	char stats_id[KVM_STATS_NAME_SIZE];
>>  };
>> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>>  bool kvm_get_kvm_safe(struct kvm *kvm);
>>  void kvm_put_kvm(struct kvm *kvm);
>>  bool file_is_kvm(struct file *file);
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>>  void kvm_put_kvm_no_destroy(struct kvm *kvm);
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>>
>>  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
>>  {
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 89489996fbc1..65f0c5fb353e 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -67,9 +67,6 @@
>>  #include <linux/kvm_dirty_ring.h>
>>
>>
>> -/* Worst case buffer size needed for holding an integer. */
>> -#define ITOA_MAX_LEN 12
>> -
>>  MODULE_AUTHOR("Qumranet");
>>  MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>>  MODULE_LICENSE("GPL");
>> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>>  {
>>  	struct kvm *kvm = filp->private_data;
>>
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Clear the weak reference of the vm file.
>> +	 * In case vm file is closed by userspace, but kvm still has
>> +	 * other users like vCPUs, clearing this pointer ensures
>> +	 * that we don't have a dangling pointer to a closed file.
>> +	 *
>> +	 * Cleared via rcu_assign_pointer() to ensure proper memory visibility
>> +	 * for concurrent lockless readers under RCU.
>> +	 */
>> +	rcu_assign_pointer(kvm->vm_file, NULL);
>> +#endif
>> +
>>  	kvm_irqfd_release(kvm);
>>
>>  	kvm_put_kvm(kvm);
>> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>>  }
>>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>>
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
>> +{
>> +	struct kvm *kvm = kvm_create_vm(type, fdname);
>> +	struct file *file;
>> +
>> +	if (IS_ERR(kvm))
>> +		return ERR_CAST(kvm);
>> +
>> +	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +	if (IS_ERR(file)) {
>> +		kvm_put_kvm(kvm);
>> +		return file;
>> +	}
>> +
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +	/*
>> +	 * Weak reference to the file (without get_file()) to prevent a circular
>> +	 * dependency. Safe because the file's release path clears this pointer
>> +	 * and drops its reference to the VM.
>> +	 *
>> +	 * Written via rcu_assign_pointer() because the pointer can be read
>> +	 * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
>> +	 * get_file_active() to prevent lockless ABA races).
>> +	 */
>> +	rcu_assign_pointer(kvm->vm_file, file);
>> +#endif
>> +
>> +	/*
>> +	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> +	 * already set, with ->release() being kvm_vm_release().  In error
>> +	 * cases it will be called by the final fput(file) and will take
>> +	 * care of doing kvm_put_kvm(kvm).
>> +	 */
>> +
>> +	return file;
>> +}
>> +
>>  static int kvm_dev_ioctl_create_vm(unsigned long type)
>>  {
>>  	char fdname[ITOA_MAX_LEN + 1];
>>  	int r, fd;
>> -	struct kvm *kvm;
>>  	struct file *file;
>>
>>  	fd = get_unused_fd_flags(O_CLOEXEC);
>> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>>
>>  	snprintf(fdname, sizeof(fdname), "%d", fd);
>>
>> -	kvm = kvm_create_vm(type, fdname);
>> -	if (IS_ERR(kvm)) {
>> -		r = PTR_ERR(kvm);
>> -		goto put_fd;
>> -	}
>> -
>> -	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +	file = kvm_create_vm_file(type, fdname);
>>  	if (IS_ERR(file)) {
>>  		r = PTR_ERR(file);
>> -		goto put_kvm;
>> +		goto put_fd;
>>  	}
>>
>> -	/*
>> -	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> -	 * already set, with ->release() being kvm_vm_release().  In error
>> -	 * cases it will be called by the final fput(file) and will take
>> -	 * care of doing kvm_put_kvm(kvm).
>> -	 */
>> -	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
>
> Notifying with file->private_data threw me off... I would rather inline
> the rcu_assign_pointer() in this function and have this line read
> notify(..., kvm) like before.

Yes, I will update it to:

     struct kvm *kvm;
     ...
     kvm = file->private_data;
     notify (..., kvm);

About rcu_assign_pointer(), I am not sure. IIUC, that only sets the
kvm->vm_file pointer, which does not have any relation to
file->private_data. Keeping the rcu_assign_pointer(kvm->vm_file,
file) at its current place (inside kvm_create_vm_file()) logically makes
sense, because kvm_create_vm_file() creates both the struct kvm and the
vm_file, so all the related variables should get updated there.

>
>>
>>  	fd_install(fd, file);
>>  	return fd;
>>
>> -put_kvm:
>> -	kvm_put_kvm(kvm);
>>  put_fd:
>>  	put_unused_fd(fd);
>>  	return r;
>> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>>  	kfree(env);
>>  }
>>
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
>> +{
>> +	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +}
>> +
>>  static void kvm_init_debug(void)
>>  {
>>  	const struct file_operations *fops;
>> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
>> --- a/virt/kvm/kvm_mm.h
>> +++ b/virt/kvm/kvm_mm.h
>> @@ -3,6 +3,9 @@
>>  #ifndef __KVM_MM_H__
>>  #define __KVM_MM_H__ 1
>>
>> +/* Worst case buffer size needed for holding an integer as a string. */
>> +#define ITOA_MAX_LEN 12
>> +
>>  /*
>>   * Architectures can choose whether to use an rwlock or spinlock
>>   * for the mmu_lock.  These macros, for use in common code
>> --
>> 2.54.0.1032.g2f8565e1d1-goog

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
  2026-06-23 14:36     ` tarunsahu
@ 2026-06-23 16:14       ` Pratyush Yadav
  0 siblings, 0 replies; 23+ messages in thread
From: Pratyush Yadav @ 2026-06-23 16:14 UTC (permalink / raw)
  To: tarunsahu
  Cc: Ackerley Tng, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
	Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
	Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen,
	linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm

On Tue, Jun 23 2026, tarunsahu@google.com wrote:

> Ackerley Tng <ackerleytng@google.com> writes:
>
>> Tarun Sahu <tarunsahu@google.com> writes:
>>
>>>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>>>  			       loff_t len)
>>>  {
>>> +	struct inode *inode = file_inode(file);
>>>  	int ret;
>>> +	int idx;
>>>
>>> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
>>> -		return -EOPNOTSUPP;
>>> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>>> +	if (kvm_gmem_is_frozen(inode)) {
>>> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>>> +		return -EPERM;
>>> +	}
>>
>> fallocate may eventually go to kvm_gmem_get_folio(), so that would check
>> kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?

Yeah, I reckon you can get away with doing this check only in
kvm_gmem_get_folio(). Normally you'd like to fail early, but as of now I
don't see much of a problem. If you drop the check here and fail in
kvm_gmem_get_folio() you'd end up taking and releasing the mapping
invalidate_lock, but this isn't a fast path anyway so I don't think it
should matter much.

I think either way would work just fine...

>>
>>>
>>> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>>> -		return -EOPNOTSUPP;
>>> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
>>> +		ret = -EOPNOTSUPP;
>>> +		goto out;
>>> +	}
>>>
>>> -	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>>> -		return -EINVAL;
>>> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
>>> +		ret = -EOPNOTSUPP;
>>> +		goto out;
>>> +	}
>>> +
>>> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
>>> +		ret = -EINVAL;
>>> +		goto out;
>>> +	}
>>
>> There's some reordering here. Why not let the validation happen like
>> before, then check kvm_gmem_is_frozen()?

There is no reordering, if I am reading the diff correctly. The diff is
somewhat misleading. The kvm_gmem_is_frozen() call is added at the top
of the function, and then all the later checks are in the same place but
get a goto out (and hence a full body to the if block). So the diff
reads like reordering, but there is none.

It would be very neat if srcu had a cleanup.h-style scope-based locking
function, but at a quick glance I can't see one.

>
> To align with the design: "stop the fallocate call if the inode is
> frozen; no need to go further". I don't have a strong opinion on this. I
> am fine with taking it across punch hole as well to make it more
> fine-grained. But then it will no longer claim to stop the fallocate call
> (the allocation one is stopped in a separate path: the fault path),
> though functionally it does the same thing.
>
> WDYT?
>
> ~Tarun

-- 
Regards,
Pratyush Yadav

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2026-06-23 16:14 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1780676742.git.tarunsahu@google.com>
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
2026-06-07  0:35   ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
2026-06-22 23:59   ` Ackerley Tng
2026-06-23 12:48     ` tarunsahu
2026-06-23 15:33     ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
2026-06-22 23:54   ` Ackerley Tng
2026-06-23  0:09     ` Sean Christopherson
2026-06-23 14:03       ` tarunsahu
2026-06-23 14:02     ` tarunsahu
2026-06-23 14:36     ` tarunsahu
2026-06-23 16:14       ` Pratyush Yadav
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
2026-06-22 23:27   ` Ackerley Tng
2026-06-23 15:26     ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
2026-06-22 23:01   ` Ackerley Tng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox