* [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
[not found] <cover.1780676742.git.tarunsahu@google.com>
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:24 ` sashiko-bot
2026-06-07 0:35 ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
` (8 subsequent siblings)
9 siblings, 2 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
From: Pasha Tatashin <pasha.tatashin@soleen.com>
The core liveupdate mechanism allows userspace to preserve file
descriptors. However, kernel subsystems often manage struct file
objects directly and need to participate in the preservation process
programmatically without relying solely on userspace interaction.
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
include/linux/liveupdate.h | 21 ++++++++++
kernel/liveupdate/luo_file.c | 69 ++++++++++++++++++++++++++++++++
kernel/liveupdate/luo_internal.h | 17 ++++++++
3 files changed, 107 insertions(+)
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 30c5a39ff9e9..de052438eaac 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -24,6 +24,7 @@ struct file;
/**
* struct liveupdate_file_op_args - Arguments for file operation callbacks.
* @handler: The file handler being called.
+ * @session: The session this file belongs to.
* @retrieve_status: The retrieve status for the 'can_finish / finish'
* operation. A value of 0 means the retrieve has not been
* attempted, a positive value means the retrieve was
@@ -44,6 +45,7 @@ struct file;
*/
struct liveupdate_file_op_args {
struct liveupdate_file_handler *handler;
+ struct liveupdate_session *session;
int retrieve_status;
struct file *file;
u64 serialized_data;
@@ -240,6 +242,13 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
+/* kernel can internally retrieve files */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+ struct file **filep);
+
+/* Get a token for an outgoing file, or -ENOENT if file is not preserved */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+ struct file *file, u64 *tokenp);
#else /* CONFIG_LIVEUPDATE */
@@ -285,5 +294,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb,
return -EOPNOTSUPP;
}
+static inline int liveupdate_get_file_incoming(struct liveupdate_session *s,
+ u64 token, struct file **filep)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+ struct file *file, u64 *tokenp)
+{
+ return -EOPNOTSUPP;
+}
+
#endif /* CONFIG_LIVEUPDATE */
#endif /* _LINUX_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a0a419085e28..0aa0b4e5339f 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -323,6 +323,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
mutex_init(&luo_file->mutex);
args.handler = fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = file;
err = fh->ops->preserve(&args);
if (err)
@@ -380,6 +381,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
struct luo_file, list);
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -411,6 +413,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -432,6 +435,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -621,6 +625,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
}
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.serialized_data = luo_file->serialized_data;
err = luo_file->fh->ops->retrieve(&args);
if (err) {
@@ -654,6 +659,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.retrieve_status = luo_file->retrieve_status;
@@ -671,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
guard(mutex)(&luo_file->mutex);
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.retrieve_status = luo_file->retrieve_status;
@@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
luo_flb_unregister_all(fh);
list_del(&ACCESS_PRIVATE(fh, list));
}
+EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
+
+/**
+ * liveupdate_get_token_outgoing - Get the token for a preserved file.
+ * @s: The outgoing liveupdate session.
+ * @file: The file object to search for.
+ * @tokenp: Output parameter for the found token.
+ *
+ * Searches the list of preserved files in an outgoing session for a matching
+ * file object. If found, the corresponding user-provided token is returned.
+ *
+ * This function is intended for in-kernel callers that need to correlate a
+ * file with its liveupdate token.
+ *
+ * Context: It must be called with session mutex acquired.
+ * Return: 0 on success, -ENOENT if the file is not preserved in this session.
+ */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+ struct file *file, u64 *tokenp)
+{
+ struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
+ struct luo_file *luo_file;
+ int err = -ENOENT;
+
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ if (luo_file->file == file) {
+ if (tokenp)
+ *tokenp = luo_file->token;
+ err = 0;
+ break;
+ }
+ }
+
+ return err;
+}
+
+/**
+ * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
+ * @s: The incoming liveupdate session (restored from the previous kernel).
+ * @token: The unique token identifying the file to retrieve.
+ * @filep: On success, this will be populated with a pointer to the retrieved
+ * 'struct file'.
+ *
+ * Provides a kernel-internal API for other subsystems to retrieve their
+ * preserved files after a live update. This function is a simple wrapper
+ * around luo_retrieve_file(), allowing callers to find a file by its token.
+ *
+ * The caller receives a new reference to the file and must call fput() when it
+ * is no longer needed. The file's lifetime is managed by LUO and any userspace
+ * file descriptors. If the caller needs to hold a reference to the file beyond
+ * the immediate scope, it must call get_file() itself.
+ *
+ * Context: It must be called with session mutex acquired of a restored session.
+ * Return: 0 on success. Returns -ENOENT if no file with the matching token is
+ * found, or any other negative errno on failure.
+ */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+ struct file **filep)
+{
+ return luo_retrieve_file(luo_file_set_from_session_locked(s),
+ token, filep);
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 875844d7a41d..08b198802e7f 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -79,6 +79,23 @@ struct luo_session {
extern struct rw_semaphore luo_register_rwlock;
+static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set)
+{
+ struct luo_session *session;
+
+ session = container_of(file_set, struct luo_session, file_set);
+
+ return (struct liveupdate_session *)session;
+}
+
+static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s)
+{
+ struct luo_session *session = (struct luo_session *)s;
+
+ lockdep_assert_held(&session->mutex);
+ return &session->file_set;
+}
+
int luo_session_create(const char *name, struct file **filep);
int luo_session_retrieve(const char *name, struct file **filep);
int __init luo_session_setup_outgoing(void *fdt);
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option
[not found] <cover.1780676742.git.tarunsahu@google.com>
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
` (7 subsequent siblings)
9 siblings, 0 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Introduce the LIVEUPDATE_GUEST_MEMFD Kconfig option. This option
enables live update support for KVM guest_memfd files, enabling
guest_memfd-backed memory preservation across kernel upgrades.
Currently this support only guest_memfd files that are full-shared
and pre-faulted.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
kernel/liveupdate/Kconfig | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index 1a8513f16ef7..0bbc4037192e 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -88,4 +88,19 @@ config LIVEUPDATE_MEMFD
If unsure, say N.
+config LIVEUPDATE_GUEST_MEMFD
+ bool "Live update support for guest_memfd"
+ depends on LIVEUPDATE
+ depends on KVM_GUEST_MEMFD
+ default LIVEUPDATE
+ help
+ Enable live update support for KVM guest_memfd files. This allows
+ preserving VM Memory backed by guest_memfd file across kernel live
+ updates.
+
+ This can only be used for the guest_memfd that are fully-shared
+ and pre-faulted.
+
+ If unsure, say N.
+
endmenu
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
[not found] <cover.1780676742.git.tarunsahu@google.com>
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
2026-06-22 23:59 ` Ackerley Tng
2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
` (6 subsequent siblings)
9 siblings, 2 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Introduce core infrastructure to support VM preservation with LUO.
First two changes are just refactoring, no functional change, third
change introduces a new member in struct kvm.
- Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
- Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
and anon_inode_getfile() to provide a unified VM file creation API.
- Track a weak reference to the backing file in struct kvm under
CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
without circular lifetime dependencies.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
include/linux/kvm_host.h | 14 +++++++
virt/kvm/kvm_main.c | 79 +++++++++++++++++++++++++++++-----------
virt/kvm/kvm_mm.h | 3 ++
3 files changed, 75 insertions(+), 21 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb06..9111a28637af 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -874,6 +874,18 @@ struct kvm {
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
+#endif
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+ /*
+ * Weak reference to the VFS file backing this KVM instance. Stored
+ * without incrementing the file refcount to prevent a circular lifetime
+ * dependency (since file->private_data already pins this struct kvm).
+ * Used exclusively to resolve the file pointer back from struct kvm.
+ *
+ * Written/cleared via rcu_assign_pointer() and read locklessly under
+ * RCU (e.g. via get_file_active() to prevent ABA races).
+ */
+ struct file *vm_file;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
};
@@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
bool kvm_get_kvm_safe(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
bool file_is_kvm(struct file *file);
+struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
void kvm_put_kvm_no_destroy(struct kvm *kvm);
+void kvm_uevent_notify_vm_create(struct kvm *kvm);
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1..65f0c5fb353e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -67,9 +67,6 @@
#include <linux/kvm_dirty_ring.h>
-/* Worst case buffer size needed for holding an integer. */
-#define ITOA_MAX_LEN 12
-
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");
@@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+ /*
+ * Clear the weak reference of the vm file.
+ * In case vm file is closed by userspace, but kvm still has
+ * other users like vCPUs, clearing this pointer ensures
+ * that we don't have a dangling pointer to a closed file.
+ *
+ * Cleared via rcu_assign_pointer() to ensure proper memory visibility
+ * for concurrent lockless readers under RCU.
+ */
+ rcu_assign_pointer(kvm->vm_file, NULL);
+#endif
+
kvm_irqfd_release(kvm);
kvm_put_kvm(kvm);
@@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
+struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
+{
+ struct kvm *kvm = kvm_create_vm(type, fdname);
+ struct file *file;
+
+ if (IS_ERR(kvm))
+ return ERR_CAST(kvm);
+
+ file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+ if (IS_ERR(file)) {
+ kvm_put_kvm(kvm);
+ return file;
+ }
+
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+ /*
+ * Weak reference to the file (without get_file()) to prevent a circular
+ * dependency. Safe because the file's release path clears this pointer
+ * and drops its reference to the VM.
+ *
+ * Written via rcu_assign_pointer() because the pointer can be read
+ * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
+ * get_file_active() to prevent lockless ABA races).
+ */
+ rcu_assign_pointer(kvm->vm_file, file);
+#endif
+
+ /*
+ * Don't call kvm_put_kvm anymore at this point; file->f_op is
+ * already set, with ->release() being kvm_vm_release(). In error
+ * cases it will be called by the final fput(file) and will take
+ * care of doing kvm_put_kvm(kvm).
+ */
+
+ return file;
+}
+
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
char fdname[ITOA_MAX_LEN + 1];
int r, fd;
- struct kvm *kvm;
struct file *file;
fd = get_unused_fd_flags(O_CLOEXEC);
@@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
snprintf(fdname, sizeof(fdname), "%d", fd);
- kvm = kvm_create_vm(type, fdname);
- if (IS_ERR(kvm)) {
- r = PTR_ERR(kvm);
- goto put_fd;
- }
-
- file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+ file = kvm_create_vm_file(type, fdname);
if (IS_ERR(file)) {
r = PTR_ERR(file);
- goto put_kvm;
+ goto put_fd;
}
- /*
- * Don't call kvm_put_kvm anymore at this point; file->f_op is
- * already set, with ->release() being kvm_vm_release(). In error
- * cases it will be called by the final fput(file) and will take
- * care of doing kvm_put_kvm(kvm).
- */
- kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
+ kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
fd_install(fd, file);
return fd;
-put_kvm:
- kvm_put_kvm(kvm);
put_fd:
put_unused_fd(fd);
return r;
@@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
kfree(env);
}
+void kvm_uevent_notify_vm_create(struct kvm *kvm)
+{
+ kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
+}
+
static void kvm_init_debug(void)
{
const struct file_operations *fops;
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 9fcc5d5b7f8d..7aa1d65c3d46 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -3,6 +3,9 @@
#ifndef __KVM_MM_H__
#define __KVM_MM_H__ 1
+/* Worst case buffer size needed for holding an integer as a string. */
+#define ITOA_MAX_LEN 12
+
/*
* Architectures can choose whether to use an rwlock or spinlock
* for the mmu_lock. These macros, for use in common code
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (2 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:26 ` sashiko-bot
2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
` (5 subsequent siblings)
9 siblings, 1 reply; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Introduce KVM VM preservation support for Live Update Orchestrator.
Register an LUO file handler for KVM files to serialize and
deserialize necessary VM state across live updates. Currently, this
preserves the VM type. This implementation provides the necessary
infrastructure and dependencies for the upcoming guest_memfd
preservation support. And it can be extended to preserve more vm
state in future.
Retrieve is simply creating the kvm and populate the retrieved data.
Only catch here is there is no way to know which fd is going to be
assigned to this kvm file hence I am using atomically incremented id
for the fdname.
This change also updates the MAINTAINERS list for kvm_luo.c.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
My only worry is if userspace strictly depends on the fdname, that it
needs to be consistent with vm_fd. Discussed more details in the
cover letter. Would really appreciates the alternatives/other approaches.
---
MAINTAINERS | 11 +++
include/linux/kho/abi/kvm.h | 39 ++++++++
virt/kvm/Makefile.kvm | 1 +
virt/kvm/kvm_luo.c | 190 ++++++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 8 ++
virt/kvm/kvm_mm.h | 8 ++
6 files changed, 257 insertions(+)
create mode 100644 include/linux/kho/abi/kvm.h
create mode 100644 virt/kvm/kvm_luo.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 9ec290e38b44..9bfc3c1f6676 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14409,6 +14409,17 @@ S: Maintained
F: Documentation/devicetree/bindings/leds/backlight/kinetic,ktz8866.yaml
F: drivers/video/backlight/ktz8866.c
+KVM LIVE UPDATE
+M: Pasha Tatashin <pasha.tatashin@soleen.com>
+M: Mike Rapoport <rppt@kernel.org>
+M: Pratyush Yadav <pratyush@kernel.org>
+R: Tarun Sahu <tarunsahu@google.com>
+L: kexec@lists.infradead.org
+L: kvm@vger.kernel.org
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F: virt/kvm/kvm_luo.c
+
KVM PARAVIRT (KVM/paravirt)
M: Paolo Bonzini <pbonzini@redhat.com>
R: Vitaly Kuznetsov <vkuznets@redhat.com>
diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
new file mode 100644
index 000000000000..718db68a541a
--- /dev/null
+++ b/include/linux/kho/abi/kvm.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * KVM Preservation ABI for Live Update Orchestrator (LUO)
+ */
+#ifndef _LINUX_KHO_ABI_KVM_H
+#define _LINUX_KHO_ABI_KVM_H
+
+#include <linux/types.h>
+#include <linux/kho/abi/kexec_handover.h>
+
+/**
+ * DOC: KVM Live Update ABI
+ *
+ * KVM uses the ABI defined below for preserving its state
+ * across a kexec reboot using the LUO.
+ *
+ * The state is serialized into a packed structure `struct kvm_luo_ser`
+ * which is handed over to the next kernel via the KHO mechanism.
+ *
+ * This interface is a contract. Any modification to the structure layout
+ * constitutes a breaking change. Such changes require incrementing the
+ * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
+ */
+
+/**
+ * struct kvm_luo_ser - Main serialization structure for a KVM VM.
+ * @type: The type of VM.
+ */
+struct kvm_luo_ser {
+ u64 type;
+} __packed;
+
+/* The compatibility string for KVM VM file handler */
+#define KVM_LUO_FH_COMPATIBLE "kvm_vm_luo_v1"
+
+#endif /* _LINUX_KHO_ABI_KVM_H */
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index d047d4cf58c9..c1a962159264 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -13,3 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
+kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
diff --git a/virt/kvm/kvm_luo.c b/virt/kvm/kvm_luo.c
new file mode 100644
index 000000000000..25619f94ace5
--- /dev/null
+++ b/virt/kvm/kvm_luo.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * KVM VM Preservation for Live Update Orchestrator (LUO)
+ */
+
+/**
+ * DOC: KVM VM Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * KVM virtual machines (VMs) can be preserved over a kexec reboot using the
+ * Live Update Orchestrator (LUO) file preservation. This allows userspace
+ * to preserve KVM VM state across kexec reboots.
+ *
+ * The preservation is not intended to be fully transparent. Only specific
+ * VM configuration and state are preserved, while other aspects of the VM
+ * must be re-established or re-configured by userspace after retrieval.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the KVM VM are preserved across kexec:
+ *
+ * VM Type
+ * The VM type (e.g., on x86 architecture, the vm_type parameter) is
+ * preserved.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * The preservation does not cover:
+ *
+ * - vCPUs and vCPU states
+ * - Memspots / Memory slot layout (memslots)
+ * - Interrupt controllers and IRQ routings
+ * - Coalesced MMIO zones
+ * - Device bindings (VFIO/Eventfds)
+ * - Active paging or guest registers state
+ * - etc
+ */
+#include <linux/liveupdate.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/anon_inodes.h>
+#include <linux/magic.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
+#include <linux/kho/abi/kvm.h>
+#include "kvm_mm.h"
+
+static bool kvm_luo_can_preserve(struct liveupdate_file_handler *handler,
+ struct file *file)
+{
+ return file_is_kvm(file);
+}
+
+static int kvm_luo_preserve(struct liveupdate_file_op_args *args)
+{
+ struct kvm *kvm = args->file->private_data;
+ struct kvm_luo_ser *ser;
+
+ if (kvm->vm_dead || kvm->vm_bugged)
+ return -EINVAL;
+
+ ser = kho_alloc_preserve(sizeof(*ser));
+ if (IS_ERR(ser))
+ return PTR_ERR(ser);
+
+#ifdef CONFIG_X86
+ ser->type = kvm->arch.vm_type;
+#else
+ ser->type = 0;
+#endif
+
+ args->serialized_data = virt_to_phys(ser);
+
+ return 0;
+}
+
+static atomic_t restored_vm_id = ATOMIC_INIT(0);
+
+static int kvm_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+ char fdname[ITOA_MAX_LEN + 1];
+ struct kvm_luo_ser *ser;
+ struct file *file;
+ struct kvm *kvm;
+ int err = 0;
+
+ if (!args->serialized_data)
+ return -EINVAL;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ snprintf(fdname, sizeof(fdname), "%d",
+ atomic_inc_return(&restored_vm_id));
+
+ file = kvm_create_vm_file(ser->type, fdname);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_free_ser;
+ }
+
+ kvm = file->private_data;
+
+ args->file = file;
+ kho_restore_free(ser);
+
+ kvm_uevent_notify_vm_create(kvm);
+ return 0;
+
+err_free_ser:
+ kho_restore_free(ser);
+ return err;
+}
+
+static void kvm_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+ struct kvm_luo_ser *ser;
+
+ /*
+ * in case preservation failed, args->serialized_data will
+ * be NULL and kvm_luo_preserve takes care of cleaning up.
+ * If preserve succeeds, this condition fails and unpreserve
+ * function takes care of cleaning up.
+ */
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ kho_unpreserve_free(ser);
+}
+
+static void kvm_luo_finish(struct liveupdate_file_op_args *args)
+{
+ struct kvm_luo_ser *ser;
+
+ /*
+ * If retrieve_status is true or set to error, nothing to do here.
+ * Already cleaned up in kvm_luo_retrieve().
+ */
+ if (args->retrieve_status)
+ return;
+
+ if (!args->serialized_data)
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+ kho_restore_free(ser);
+}
+
+static const struct liveupdate_file_ops kvm_luo_file_ops = {
+ .can_preserve = kvm_luo_can_preserve,
+ .preserve = kvm_luo_preserve,
+ .retrieve = kvm_luo_retrieve,
+ .unpreserve = kvm_luo_unpreserve,
+ .finish = kvm_luo_finish,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler kvm_luo_handler = {
+ .ops = &kvm_luo_file_ops,
+ .compatible = KVM_LUO_FH_COMPATIBLE,
+};
+
+int kvm_luo_init(void)
+{
+ int err = liveupdate_register_file_handler(&kvm_luo_handler);
+
+ if (err && err != -EOPNOTSUPP) {
+ pr_err("Could not register kvm_vm_luo handler: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
+
+void kvm_luo_exit(void)
+{
+ liveupdate_unregister_file_handler(&kvm_luo_handler);
+}
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65f0c5fb353e..c70346906a89 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6576,6 +6576,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
goto err_virt;
+ r = kvm_luo_init();
+ if (r)
+ goto err_luo;
+
/*
* Registration _must_ be the very last thing done, as this exposes
* /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6589,6 +6593,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
return 0;
err_register:
+ kvm_luo_exit();
+err_luo:
kvm_uninit_virtualization();
err_virt:
kvm_gmem_exit();
@@ -6618,6 +6624,8 @@ void kvm_exit(void)
*/
misc_deregister(&kvm_dev);
+ kvm_luo_exit();
+
kvm_uninit_virtualization();
debugfs_remove_recursive(kvm_debugfs_dir);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 7aa1d65c3d46..118edc47df83 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -97,4 +97,12 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
}
#endif /* CONFIG_KVM_GUEST_MEMFD */
+#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
+int kvm_luo_init(void);
+void kvm_luo_exit(void);
+#else
+static inline int kvm_luo_init(void) { return 0; }
+static inline void kvm_luo_exit(void) {}
+#endif /* CONFIG_LIVEUPDATE_GUEST_MEMFD */
+
#endif /* __KVM_MM_H__ */
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (3 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
` (4 subsequent siblings)
9 siblings, 0 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
To support guest_memfd memory preservation with LUO, guest_memfd luo
code needs to access guest_memfd internals and reconstruct guest_memfd
file instances from a preserved state.
Extract gmem_file, gmem_inode, and the GMEM_I() helper from guest_memfd.c
into a new internal header virt/kvm/guest_memfd.h.
Additionally, split __kvm_gmem_create() to expose a non-static
__kvm_gmem_create_file() helper. This helper returns a struct file
instead of a file descriptor, enabling file creation and initialization
without installing it into a file descriptor table.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
virt/kvm/guest_memfd.c | 68 +++++++++++++++++-------------------------
virt/kvm/guest_memfd.h | 39 ++++++++++++++++++++++++
2 files changed, 67 insertions(+), 40 deletions(-)
create mode 100644 virt/kvm/guest_memfd.h
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b2..6740ae2bf948 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,38 +7,12 @@
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
+#include "guest_memfd.h"
#include "kvm_mm.h"
static struct vfsmount *kvm_gmem_mnt;
-/*
- * A guest_memfd instance can be associated multiple VMs, each with its own
- * "view" of the underlying physical memory.
- *
- * The gmem's inode is effectively the raw underlying physical storage, and is
- * used to track properties of the physical memory, while each gmem file is
- * effectively a single VM's view of that storage, and is used to track assets
- * specific to its associated VM, e.g. memslots=>gmem bindings.
- */
-struct gmem_file {
- struct kvm *kvm;
- struct xarray bindings;
- struct list_head entry;
-};
-
-struct gmem_inode {
- struct shared_policy policy;
- struct inode vfs_inode;
- struct list_head gmem_file_list;
-
- u64 flags;
-};
-
-static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
-{
- return container_of(inode, struct gmem_inode, vfs_inode);
-}
#define kvm_gmem_for_each_file(f, inode) \
list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)
@@ -556,23 +530,17 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
struct gmem_file *f;
struct inode *inode;
struct file *file;
- int fd, err;
-
- fd = get_unused_fd_flags(0);
- if (fd < 0)
- return fd;
+ int err;
f = kzalloc_obj(*f);
- if (!f) {
- err = -ENOMEM;
- goto err_fd;
- }
+ if (!f)
+ return ERR_PTR(-ENOMEM);
/* __fput() will take care of fops_put(). */
if (!fops_get(&kvm_gmem_fops)) {
@@ -611,8 +579,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
xa_init(&f->bindings);
list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);
- fd_install(fd, file);
- return fd;
+ return file;
err_inode:
iput(inode);
@@ -620,7 +587,28 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
fops_put(&kvm_gmem_fops);
err_gmem:
kfree(f);
-err_fd:
+ return ERR_PTR(err);
+}
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+{
+ struct file *file;
+ int fd, err;
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0)
+ return fd;
+
+ file = __kvm_gmem_create_file(kvm, size, flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ fd_install(fd, file);
+ return fd;
+
+err_put_fd:
put_unused_fd(fd);
return err;
}
diff --git a/virt/kvm/guest_memfd.h b/virt/kvm/guest_memfd.h
new file mode 100644
index 000000000000..c528b046dd69
--- /dev/null
+++ b/virt/kvm/guest_memfd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_GUEST_MEMFD_H__
+#define __KVM_GUEST_MEMFD_H__ 1
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/mempolicy.h>
+
+/*
+ * A guest_memfd instance can be associated multiple VMs, each with its own
+ * "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */
+struct gmem_file {
+ struct kvm *kvm;
+ struct xarray bindings;
+ struct list_head entry;
+};
+
+struct gmem_inode {
+ struct shared_policy policy;
+ struct inode vfs_inode;
+ struct list_head gmem_file_list;
+
+ u64 flags;
+};
+
+static inline struct gmem_inode *GMEM_I(struct inode *inode)
+{
+ return container_of(inode, struct gmem_inode, vfs_inode);
+}
+
+struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags);
+
+#endif /* __KVM_GUEST_MEMFD_H__ */
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (4 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
2026-06-22 23:54 ` Ackerley Tng
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
` (3 subsequent siblings)
9 siblings, 2 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
This patch introduces the freeze on gmem_inode which prevents
the fallocate call and any new page fault allocation. This will avoid
gmem file modification when it is being preserved
Used srcu lock to synchronise the freeze call, where write blocks
until all the reads are free. And reads are re-entrant.
Incase fault fails, It return -EPERM and VM_EXIT to userspace. userspace
must handle this properly as every new fault will fail.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
virt/kvm/guest_memfd.c | 117 +++++++++++++++++++++++++++++++++++++----
virt/kvm/guest_memfd.h | 5 ++
2 files changed, 111 insertions(+), 11 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 6740ae2bf948..b94639cdf312 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,11 +7,13 @@
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
+#include <linux/srcu.h>
#include "guest_memfd.h"
#include "kvm_mm.h"
static struct vfsmount *kvm_gmem_mnt;
+static struct srcu_struct kvm_gmem_freeze_srcu;
#define kvm_gmem_for_each_file(f, inode) \
@@ -96,6 +98,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
/* TODO: Support huge pages. */
struct mempolicy *policy;
struct folio *folio;
+ int idx;
/*
* Fast-path: See if folio is already present in mapping to avoid
@@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
if (!IS_ERR(folio))
return folio;
+ idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
+ if (kvm_gmem_is_frozen(inode)) {
+ srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+ return ERR_PTR(-EPERM);
+ }
+
policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
folio = __filemap_get_folio_mpol(inode->i_mapping, index,
FGP_LOCK | FGP_CREAT,
mapping_gfp_mask(inode->i_mapping), policy);
mpol_cond_put(policy);
+ srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+
/*
* External interfaces like kvm_gmem_get_pfn() support dealing
* with hugepages to a degree, but internally, guest_memfd currently
@@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
+ struct inode *inode = file_inode(file);
int ret;
+ int idx;
- if (!(mode & FALLOC_FL_KEEP_SIZE))
- return -EOPNOTSUPP;
+ idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
+ if (kvm_gmem_is_frozen(inode)) {
+ srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
+ return -EPERM;
+ }
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
- return -EOPNOTSUPP;
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
- if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
- return -EINVAL;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
@@ -291,6 +316,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
if (!ret)
file_modified(file);
+
+out:
+ srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
return ret;
}
@@ -944,7 +972,9 @@ static void kvm_gmem_destroy_inode(struct inode *inode)
static void kvm_gmem_free_inode(struct inode *inode)
{
- kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ kmem_cache_free(kvm_gmem_inode_cachep, gi);
}
static const struct super_operations kvm_gmem_super_operations = {
@@ -1001,12 +1031,21 @@ int kvm_gmem_init(struct module *module)
if (!kvm_gmem_inode_cachep)
return -ENOMEM;
+ ret = init_srcu_struct(&kvm_gmem_freeze_srcu);
+ if (ret)
+ goto err_cache;
+
ret = kvm_gmem_init_mount();
- if (ret) {
- kmem_cache_destroy(kvm_gmem_inode_cachep);
- return ret;
- }
+ if (ret)
+ goto err_srcu;
+
return 0;
+
+err_srcu:
+ cleanup_srcu_struct(&kvm_gmem_freeze_srcu);
+err_cache:
+ kmem_cache_destroy(kvm_gmem_inode_cachep);
+ return ret;
}
void kvm_gmem_exit(void)
@@ -1014,5 +1053,61 @@ void kvm_gmem_exit(void)
kern_unmount(kvm_gmem_mnt);
kvm_gmem_mnt = NULL;
rcu_barrier();
+ cleanup_srcu_struct(&kvm_gmem_freeze_srcu);
kmem_cache_destroy(kvm_gmem_inode_cachep);
}
+
+/**
+ * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
+ * @inode: The guest_memfd inode.
+ * @freeze: True to freeze, false to unfreeze.
+ *
+ * This API is used strictly during the live update / preservation transition
+ * window to prevent host userspace and guest-side faults from making any
+ * mapping modifications (such as fallocate or page fault allocation)
+ * to the guest_memfd page cache.
+ *
+ * Synchronization Strategy (Sleepable RCU):
+ * To avoid high-contention VFS locks (like inode_lock or
+ * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
+ * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
+ * (`kvm_gmem_freeze_srcu`):
+ *
+ * Global vs. Per-Inode SRCU
+ * ======================
+ * A single system-wide global static `srcu_struct` is used instead of a
+ * per-inode SRCU structure to completely prevent unprivileged users from
+ * exhausting the host's per-CPU memory allocator. Because
+ * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
+ * is not accounted by memory cgroups (memcg),
+ * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
+ * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
+ * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
+ *
+ * Flag Modification Note:
+ * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
+ * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
+ * flags are creation-time flags which remain strictly read-only), there is
+ * no possibility of concurrent bit-modification races. Therefore, a standard
+ * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
+ * synchronization loops.
+ */
+void kvm_gmem_freeze(struct inode *inode, bool freeze)
+{
+ u64 flags = READ_ONCE(GMEM_I(inode)->flags);
+
+ if (freeze)
+ flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
+ else
+ flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
+
+ WRITE_ONCE(GMEM_I(inode)->flags, flags);
+
+ if (freeze)
+ synchronize_srcu(&kvm_gmem_freeze_srcu);
+}
+
+bool kvm_gmem_is_frozen(struct inode *inode)
+{
+ return READ_ONCE(GMEM_I(inode)->flags) & GUEST_MEMFD_F_MAPPING_FROZEN;
+}
diff --git a/virt/kvm/guest_memfd.h b/virt/kvm/guest_memfd.h
index c528b046dd69..028c348a1023 100644
--- a/virt/kvm/guest_memfd.h
+++ b/virt/kvm/guest_memfd.h
@@ -29,11 +29,16 @@ struct gmem_inode {
u64 flags;
};
+/* Internal kernel-only flags (must not overlap with UAPI flags) */
+#define GUEST_MEMFD_F_MAPPING_FROZEN (1ULL << 63)
+
static inline struct gmem_inode *GMEM_I(struct inode *inode)
{
return container_of(inode, struct gmem_inode, vfs_inode);
}
struct file *__kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags);
+void kvm_gmem_freeze(struct inode *inode, bool freeze);
+bool kvm_gmem_is_frozen(struct inode *inode);
#endif /* __KVM_GUEST_MEMFD_H__ */
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (5 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:25 ` sashiko-bot
2026-06-22 23:27 ` Ackerley Tng
2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
` (2 subsequent siblings)
9 siblings, 2 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
This patch sets up the basic infrastructure to preserve the guest_memfd.
Currently this supports only fully shared guest_memfd and backed by
PAGE_SIZE pages.
It registers a new LUO file handler for guest_memfd files to serialize
and deserialize guest memory. This allows preserving guest memory backed
by guest_memfd across updates, ensuring that guest instances can be
resumed seamlessly without losing their memory contents.
Preservation is straight forward. It walks through the folios and
serialize them.
There is kvm_gmem_freeze call on preserve which freeze the guest_memfd
inode. It avoids any changes to inode mapping with fallocate calls or
any new fault allocation (fails) on or after preservation. No need to check
this during the page fault as preservation is only supported for
pre-faulted/pre-allocated guest_memfd.
While retrieving the guest_memfd, it requires the struct kvm to create
new guest_memfd. So it first get the vm_file from the same session using
the token passed during the preservation. And use it to get
vm_file->kvm.
This change also update the MAINTAINERS list.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
MAINTAINERS | 1 +
include/linux/kho/abi/kvm.h | 79 +++++-
virt/kvm/Makefile.kvm | 2 +-
virt/kvm/guest_memfd_luo.c | 485 ++++++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 7 +
virt/kvm/kvm_mm.h | 4 +
6 files changed, 571 insertions(+), 7 deletions(-)
create mode 100644 virt/kvm/guest_memfd_luo.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 9bfc3c1f6676..16cba790a84d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14418,6 +14418,7 @@ L: kexec@lists.infradead.org
L: kvm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F: virt/kvm/guest_memfd_luo.c
F: virt/kvm/kvm_luo.c
KVM PARAVIRT (KVM/paravirt)
diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
index 718db68a541a..42074d76e04a 100644
--- a/include/linux/kho/abi/kvm.h
+++ b/include/linux/kho/abi/kvm.h
@@ -9,20 +9,23 @@
#define _LINUX_KHO_ABI_KVM_H
#include <linux/types.h>
+#include <linux/bits.h>
#include <linux/kho/abi/kexec_handover.h>
/**
- * DOC: KVM Live Update ABI
+ * DOC: KVM and guest_memfd Live Update ABI
*
- * KVM uses the ABI defined below for preserving its state
+ * KVM and guest_memfd use the ABI defined below for preserving their states
* across a kexec reboot using the LUO.
*
- * The state is serialized into a packed structure `struct kvm_luo_ser`
- * which is handed over to the next kernel via the KHO mechanism.
+ * The state is serialized into packed structures (struct kvm_luo_ser and
+ * struct guest_memfd_luo_ser) which are handed over to the next kernel via
+ * the KHO mechanism.
*
- * This interface is a contract. Any modification to the structure layout
+ * This interface is a contract. Any modification to the structure layouts
* constitutes a breaking change. Such changes require incrementing the
- * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
+ * version number in the KVM_LUO_FH_COMPATIBLE or
+ * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
*/
/**
@@ -36,4 +39,68 @@ struct kvm_luo_ser {
/* The compatibility string for KVM VM file handler */
#define KVM_LUO_FH_COMPATIBLE "kvm_vm_luo_v1"
+/**
+ * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
+ * @pfn: Page Frame Number of the folio.
+ * @index: Page offset of the folio within the file.
+ * @flags: State flags associated with the folio.
+ */
+struct guest_memfd_luo_folio_ser {
+ u64 pfn:52;
+ u64 flags:12;
+ u64 index;
+} __packed;
+
+/**
+ * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
+ *
+ * This flag is per folio to check if the folio is uptodate.
+ */
+#define GUEST_MEMFD_LUO_FOLIO_UPTODATE BIT(0)
+
+
+/**
+ * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
+ *
+ * This flag indicates that the guest_memfd supports host-side mmap.
+ */
+#define GUEST_MEMFD_LUO_FLAG_MMAP BIT(0)
+
+/**
+ * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
+ *
+ * This flag indicates that the guest_memfd has been initialized as shared
+ * memory.
+ */
+#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED BIT(1)
+
+/**
+ * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
+ *
+ * A mask of all guest_memfd preservation flags supported by this version
+ * of the KVM LUO ABI.
+ */
+#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS (GUEST_MEMFD_LUO_FLAG_MMAP | \
+ GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
+
+/**
+ * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
+ * @size: The size of the file in bytes.
+ * @flags: File-level flags.
+ * @nr_folios: Number of folios in the folios array.
+ * @vm_token: Token of the associated KVM VM instance.
+ * @folios: KHO vmalloc descriptor pointing to the array of
+ * struct guest_memfd_luo_folio_ser.
+ */
+struct guest_memfd_luo_ser {
+ u64 size;
+ u64 flags;
+ u64 nr_folios;
+ u64 vm_token;
+ struct kho_vmalloc folios;
+} __packed;
+
+/* The compatibility string for GUEST_MEMFD file handler */
+#define GUEST_MEMFD_LUO_FH_COMPATIBLE "guest_memfd_luo_v1"
+
#endif /* _LINUX_KHO_ABI_KVM_H */
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index c1a962159264..d30fca094c42 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
-kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
+kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
new file mode 100644
index 000000000000..d466f889c9aa
--- /dev/null
+++ b/virt/kvm/guest_memfd_luo.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Tarun Sahu <tarunsahu@google.com>
+ *
+ * Guestmemfd Preservation for Live Update Orchestrator (LUO)
+ */
+
+/**
+ * DOC: Guestmemfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
+ * reboot using the Live Update Orchestrator (LUO) file preservation. This
+ * allows userspace to preserve VM memory across kexec reboots.
+ *
+ * The preservation is not intended to be transparent. Only select properties
+ * of the guest_memfd are preserved, while others are reset to default.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of guest_memfd are preserved across kexec:
+ *
+ * File Size
+ * The size of the file is preserved.
+ *
+ * File Contents
+ * All folios present in the page cache are preserved.
+ *
+ * File-level Flags
+ * The file-level flags (such as MMAP support and INIT_SHARED default mapping)
+ * are preserved.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * NUMA Memory Policy
+ * NUMA memory policies associated with the guest_memfd are not preserved.
+ */
+#include <linux/liveupdate.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/anon_inodes.h>
+#include <linux/magic.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
+#include <linux/kho/abi/kvm.h>
+#include "guest_memfd.h"
+
+static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
+ pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
+ u64 *out_count)
+{
+ struct folio_batch fbatch;
+ pgoff_t index = 0;
+ u64 count = 0;
+ int err = 0;
+
+ folio_batch_init(&fbatch);
+ while (index < end_index) {
+ unsigned int nr, i;
+
+ nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
+ if (nr == 0)
+ break;
+
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (folios_ser) {
+ if (folio_test_hwpoison(folio)) {
+ err = -EHWPOISON;
+ folio_batch_release(&fbatch);
+ goto out;
+ }
+ err = kho_preserve_folio(folio);
+ if (err) {
+ folio_batch_release(&fbatch);
+ goto out;
+ }
+
+ folios_ser[count].pfn = folio_pfn(folio);
+ folios_ser[count].index = folio->index;
+ folios_ser[count].flags = folio_test_uptodate(folio) ?
+ GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
+ }
+ count++;
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+out:
+ *out_count = count;
+ return err;
+}
+
+static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct gmem_file *gmem_file = file->private_data;
+ struct kvm *kvm = gmem_file->kvm;
+
+ if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
+ return 0;
+
+ if (kvm_arch_has_private_mem(kvm))
+ return 0;
+
+ if (mapping_large_folio_support(inode->i_mapping))
+ return 0;
+
+ return 1;
+}
+
+static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
+{
+ struct guest_memfd_luo_folio_ser *folios_ser = NULL;
+ u64 count = 0, gmem_flags, abi_flags = 0;
+ struct guest_memfd_luo_ser *ser;
+ struct address_space *mapping;
+ struct gmem_file *gmem_file;
+ struct inode *inode;
+ pgoff_t end_index;
+ struct kvm *kvm;
+ int err = 0;
+ long size;
+
+ inode = file_inode(args->file);
+ kvm_gmem_freeze(inode, true);
+
+ mapping = inode->i_mapping;
+ size = i_size_read(inode);
+ if (!size) {
+ err = -EINVAL;
+ goto err_unfreeze_inode;
+ }
+
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
+ err = -EINVAL;
+ goto err_unfreeze_inode;
+ }
+
+ gmem_file = args->file->private_data;
+ kvm = gmem_file->kvm;
+
+ gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
+ if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED
+ | GUEST_MEMFD_F_MAPPING_FROZEN)) {
+ err = -EOPNOTSUPP;
+ goto err_unfreeze_inode;
+ }
+
+ if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
+ abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
+ if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
+ abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
+
+ end_index = size >> PAGE_SHIFT;
+
+ ser = kho_alloc_preserve(sizeof(*ser));
+ if (IS_ERR(ser)) {
+ err = PTR_ERR(ser);
+ goto err_unfreeze_inode;
+ }
+
+ /* First pass: Count the folios present in the page cache */
+ err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
+ if (err)
+ goto err_free_ser;
+
+ ser->size = size;
+ ser->flags = abi_flags;
+ ser->nr_folios = count;
+ ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()
+
+ if (count > 0) {
+ folios_ser = vcalloc(count, sizeof(*folios_ser));
+ if (!folios_ser) {
+ err = -ENOMEM;
+ goto err_free_ser;
+ }
+
+ /* Second pass: Fill the metadata array and preserve folios */
+ err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);
+ if (err)
+ goto err_unpreserve_unlocked;
+
+ if (WARN_ON_ONCE(count != ser->nr_folios)) {
+ err = -EINVAL;
+ goto err_unpreserve_unlocked;
+ }
+ }
+
+ if (count > 0) {
+ err = kho_preserve_vmalloc(folios_ser, &ser->folios);
+ if (err)
+ goto err_unpreserve_unlocked;
+ }
+
+ args->serialized_data = virt_to_phys(ser);
+ args->private_data = folios_ser;
+
+ return 0;
+
+err_unpreserve_unlocked:
+ for (long i = (long)count - 1; i >= 0; i--) {
+ struct folio *folio = pfn_folio(folios_ser[i].pfn);
+
+ kho_unpreserve_folio(folio);
+ }
+ vfree(folios_ser);
+err_free_ser:
+ kho_unpreserve_free(ser);
+err_unfreeze_inode:
+ kvm_gmem_freeze(inode, false);
+ return err;
+}
+
+static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
+{
+ struct guest_memfd_luo_ser *ser;
+ struct gmem_file *gmem_file;
+ struct kvm *kvm;
+ struct file *kvm_file;
+ u64 vm_token;
+ int err;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return -EINVAL;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ gmem_file = args->file->private_data;
+ kvm = gmem_file->kvm;
+
+ /*
+ * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
+ * file memory from being reallocated while it is being processed.
+ */
+ kvm_file = get_file_active(&kvm->vm_file);
+ if (!kvm_file)
+ return -ENOENT;
+
+ err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
+ fput(kvm_file);
+ if (err)
+ return err;
+
+ ser->vm_token = vm_token;
+ return 0;
+}
+
+static void kvm_gmem_luo_discard_folios(
+ const struct guest_memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios, u64 start_idx)
+{
+ long i;
+
+ for (i = start_idx; i < nr_folios; i++) {
+ struct folio *folio;
+ phys_addr_t phys;
+
+ if (!folios_ser[i].pfn)
+ continue;
+
+ phys = PFN_PHYS(folios_ser[i].pfn);
+ folio = kho_restore_folio(phys);
+ if (folio)
+ folio_put(folio);
+ }
+}
+
+static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+ struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
+ struct guest_memfd_luo_ser *ser;
+ long i;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return;
+
+ if (ser->nr_folios > 0)
+ kho_unpreserve_vmalloc(&ser->folios);
+ for (i = ser->nr_folios - 1; i >= 0; i--) {
+ struct folio *folio;
+
+ if (!folios_ser[i].pfn)
+ continue;
+
+ folio = pfn_folio(folios_ser[i].pfn);
+ kho_unpreserve_folio(folio);
+ }
+ vfree(folios_ser);
+
+ kho_unpreserve_free(ser);
+ kvm_gmem_freeze(file_inode(args->file), false);
+}
+
+static int kvm_gmem_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+ struct guest_memfd_luo_folio_ser *folios_ser = NULL;
+ struct guest_memfd_luo_ser *ser;
+ struct kvm *kvm = NULL;
+ struct file *vm_file;
+ struct inode *inode;
+ struct file *file;
+ u64 gmem_flags = 0;
+ int err = 0;
+ long i = 0;
+
+ if (!args->serialized_data)
+ return -EINVAL;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ if (ser->flags & ~GUEST_MEMFD_LUO_SUPPORTED_FLAGS) {
+ err = -EOPNOTSUPP;
+ goto err_free_ser;
+ }
+
+ if (ser->flags & GUEST_MEMFD_LUO_FLAG_MMAP)
+ gmem_flags |= GUEST_MEMFD_FLAG_MMAP;
+ if (ser->flags & GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
+ gmem_flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
+
+ err = liveupdate_get_file_incoming(args->session, ser->vm_token, &vm_file);
+ if (err) {
+ pr_warn("gmem: provided VM FD token (%llx) on preserve is incorrect\n",
+ ser->vm_token);
+ goto err_free_ser;
+ }
+
+ if (file_is_kvm(vm_file))
+ kvm = vm_file->private_data;
+
+ /*
+ * Release the temporary reference taken by the liveupdate_get_file_incoming
+ * call. LUO still holds a reference.
+ */
+ fput(vm_file);
+
+ if (!kvm) {
+ err = -EINVAL;
+ goto err_free_ser;
+ }
+
+ file = __kvm_gmem_create_file(kvm, ser->size, gmem_flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_free_ser;
+ }
+
+ inode = file_inode(file);
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (!folios_ser) {
+ err = -EINVAL;
+ goto err_destroy_file;
+ }
+
+ for (i = 0; i < ser->nr_folios; i++) {
+ struct folio *folio;
+ phys_addr_t phys;
+
+ if (!folios_ser[i].pfn)
+ continue;
+
+ phys = PFN_PHYS(folios_ser[i].pfn);
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_err("gmem: failed to restore folio at %llx\n", phys);
+ err = -EIO;
+ goto err_put_remaining_folios;
+ }
+
+ err = filemap_add_folio(inode->i_mapping, folio, folios_ser[i].index,
+ GFP_KERNEL);
+ if (err) {
+ pr_err("gmem: failed to add folio to page cache\n");
+ folio_put(folio);
+ goto err_put_remaining_folios;
+ }
+
+ if (folios_ser[i].flags & GUEST_MEMFD_LUO_FOLIO_UPTODATE)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ vfree(folios_ser);
+ }
+
+ args->file = file;
+ kho_restore_free(ser);
+ return 0;
+
+err_put_remaining_folios:
+ i++;
+err_destroy_file:
+ fput(file);
+err_free_ser:
+ if (ser->nr_folios) {
+ if (!folios_ser)
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (folios_ser) {
+ kvm_gmem_luo_discard_folios(folios_ser, ser->nr_folios, i);
+ vfree(folios_ser);
+ }
+ }
+ kho_restore_free(ser);
+ return err;
+}
+
+static void kvm_gmem_luo_finish(struct liveupdate_file_op_args *args)
+{
+ struct guest_memfd_luo_ser *ser;
+ struct guest_memfd_luo_folio_ser *folios_ser;
+
+ /* Nothing to be done here, if retrieve_status was successful or errored,
+ * Cleanup is taken care of in retrieval call.
+ */
+ if (args->retrieve_status)
+ return;
+
+ if (!args->serialized_data)
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return;
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (folios_ser) {
+ kvm_gmem_luo_discard_folios(folios_ser, ser->nr_folios, 0);
+ vfree(folios_ser);
+ }
+ }
+
+ kho_restore_free(ser);
+}
+
+static const struct liveupdate_file_ops kvm_gmem_luo_file_ops = {
+ .can_preserve = kvm_gmem_luo_can_preserve,
+ .preserve = kvm_gmem_luo_preserve,
+ .freeze = kvm_gmem_luo_freeze,
+ .retrieve = kvm_gmem_luo_retrieve,
+ .unpreserve = kvm_gmem_luo_unpreserve,
+ .finish = kvm_gmem_luo_finish,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler kvm_gmem_luo_handler = {
+ .ops = &kvm_gmem_luo_file_ops,
+ .compatible = GUEST_MEMFD_LUO_FH_COMPATIBLE,
+};
+
+int kvm_gmem_luo_init(void)
+{
+ int err = liveupdate_register_file_handler(&kvm_gmem_luo_handler);
+
+ if (err && err != -EOPNOTSUPP) {
+ pr_err("Could not register luo filesystem handler: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
+
+void kvm_gmem_luo_exit(void)
+{
+ liveupdate_unregister_file_handler(&kvm_gmem_luo_handler);
+}
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c70346906a89..501a5d048418 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6580,6 +6580,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
goto err_luo;
+ r = kvm_gmem_luo_init();
+ if (r)
+ goto err_gmem_luo;
+
/*
* Registration _must_ be the very last thing done, as this exposes
* /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6593,6 +6597,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
return 0;
err_register:
+ kvm_gmem_luo_exit();
+err_gmem_luo:
kvm_luo_exit();
err_luo:
kvm_uninit_virtualization();
@@ -6624,6 +6630,7 @@ void kvm_exit(void)
*/
misc_deregister(&kvm_dev);
+ kvm_gmem_luo_exit();
kvm_luo_exit();
kvm_uninit_virtualization();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 118edc47df83..d8ccb68e7e9b 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -100,9 +100,13 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
int kvm_luo_init(void);
void kvm_luo_exit(void);
+int kvm_gmem_luo_init(void);
+void kvm_gmem_luo_exit(void);
#else
static inline int kvm_luo_init(void) { return 0; }
static inline void kvm_luo_exit(void) {}
+static inline int kvm_gmem_luo_init(void) { return 0; }
+static inline void kvm_gmem_luo_exit(void) {}
#endif /* CONFIG_LIVEUPDATE_GUEST_MEMFD */
#endif /* __KVM_MM_H__ */
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (6 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
9 siblings, 0 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Add the documentation under the "Preserving file descriptors" section
of LUO's documentation.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
Documentation/core-api/liveupdate.rst | 1 +
Documentation/liveupdate/vmm.rst | 103 ++++++++++++++++++++++++++
MAINTAINERS | 1 +
3 files changed, 105 insertions(+)
create mode 100644 Documentation/liveupdate/vmm.rst
diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst
index 5a292d0f3706..bac58a363151 100644
--- a/Documentation/core-api/liveupdate.rst
+++ b/Documentation/core-api/liveupdate.rst
@@ -34,6 +34,7 @@ The following types of file descriptors can be preserved
:maxdepth: 1
../mm/memfd_preservation
+ ../liveupdate/vmm
Public API
==========
diff --git a/Documentation/liveupdate/vmm.rst b/Documentation/liveupdate/vmm.rst
new file mode 100644
index 000000000000..0cd487a0e1a6
--- /dev/null
+++ b/Documentation/liveupdate/vmm.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+=============================
+VM & Guest_Memfd Preservation
+=============================
+
+.. kernel-doc:: virt/kvm/kvm_luo.c
+ :doc: KVM VM Preservation via LUO
+
+.. kernel-doc:: virt/kvm/guest_memfd_luo.c
+ :doc: Guest_Memfd Preservation via LUO
+
+VMM Instructions
+================
+
+This section describes the requirements, scope, conditions, and
+ordering constraints that a Virtual Machine Monitor (VMM) must adhere
+to for successful preservation and retrieval of guest_memfd files
+across a Live Update Orchestrator (LUO) sequence.
+
+Scope and Limitations
+---------------------
+
+At this stage, the scope of guest_memfd preservation is restricted to:
+
+1. **Fully Shared guest_memfd**:
+ This time only fully shared guest_memfd supported. Any system that
+ supports coco vm (which uses private guest_memfd), will not support
+ the preservation.
+
+2. **Standard Page Size**:
+ Only guest_memfd backed by standard page size (``PAGE_SIZE``,
+ order-0) pages is supported. Large/huge page backing (e.g.,
+ hugetlb guest_memfd) is not supported.
+
+Any Virtual Machine (VM) whose memory is fully backed by such
+guest_memfd files can be preserved across live update.
+
+VMM Actions and Conditions during Live Update
+---------------------------------------------
+
+During the live update sequence, the kernel introduces a *freezing*
+phase for the guest_memfd inode. Freezing prevents any modifications to
+the guest_memfd page cache. Specifically, once a guest_memfd mapping is
+frozen:
+
+- Any subsequent ``fallocate`` calls on the guest_memfd file descriptor
+ will fail and return ``-EPERM``.
+- Any new page faults (guest-side or host-userspace-side) that require
+ folio allocation will fail and return ``-EPERM``.
+
+To prevent vCPUs or VMM helper threads from failing due to these
+``-EPERM`` errors, the VMM must implement one of the following
+strategies:
+
+1. **Pause the VM (Recommended)**:
+ The VMM should pause/suspend all vCPUs before invoking the
+ preservation or freezing of the VM and guest_memfd files. This
+ ensures no new page faults or memory accesses can occur while the
+ guest_memfd is frozen.
+
+2. **Handle Fault Failures**:
+ If the VM is not paused, the VMM must be prepared to handle VM
+ exits or user page fault errors resulting from the ``-EPERM``
+ failures. The VMM must take appropriate action, such as
+ immediately pausing the VM, or aborting the live update sequence
+ (by tearing down or unpreserving the live update session).
+
+Preservation and Retrieval Ordering
+-----------------------------------
+
+Preservation Order
+~~~~~~~~~~~~~~~~~~
+
+There is no strict ordering requirement for initiating the
+preservation of the KVM VM file and the guest_memfd files; they are
+preserved independently. If kexec is triggered with guest_memfd
+preservation without preserving the vm file, kexec will fail.
+
+Retrieval Order
+~~~~~~~~~~~~~~~
+
+Similarly, there is no strict ordering required for retrieving the VM
+and guest_memfd files. Any file can be retrieved at any order.
+
+If guest_memfd file is retrieved and VM file is not retrieved, and
+luo_finish is called, then vm_file will be lost and guest_memfd file
+will be hanging around.
+
+VM & Guest_Memfd Preservation ABI
+=================================
+
+.. kernel-doc:: include/linux/kho/abi/kvm.h
+ :doc: DOC: guest_memfd Live Update ABI
+
+.. kernel-doc:: include/linux/kho/abi/kvm.h
+ :internal:
+
+See Also
+========
+
+- :doc:`/core-api/liveupdate`
+- :doc:`/userspace-api/liveupdate`
diff --git a/MAINTAINERS b/MAINTAINERS
index 16cba790a84d..ca459d032712 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14418,6 +14418,7 @@ L: kexec@lists.infradead.org
L: kvm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F: Documentation/liveupdate/vmm.rst
F: virt/kvm/guest_memfd_luo.c
F: virt/kvm/kvm_luo.c
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (7 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
9 siblings, 0 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Refactor `____vm_create()` in the KVM selftest library to extract its
initialization steps into separate, reusable internal helpers.
Introduce `vm_init_fields()` and `vm_init_memory_properties()`. This
allows advanced test setups to perform targeted VM fields or memory
property initializations independently, which is required by upcoming
test cases that restore preserved VMs. No functional changes are
introduced for the existing tests.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 2 ++
tools/testing/selftests/kvm/lib/kvm_util.c | 26 +++++++++++++------
2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 2ecaaa0e9965..d10cd25d0658 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -471,6 +471,8 @@ const char *vm_guest_mode_string(u32 i);
void kvm_vm_free(struct kvm_vm *vmp);
void kvm_vm_restart(struct kvm_vm *vmp);
+void vm_init_fields(struct kvm_vm *vm, struct vm_shape shape);
+void vm_init_memory_properties(struct kvm_vm *vm);
void kvm_vm_release(struct kvm_vm *vmp);
void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename);
int kvm_memfd_alloc(size_t size, bool hugepages);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index e08967ef7b7b..d3e6508e9863 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -276,13 +276,8 @@ __weak void vm_populate_gva_bitmap(struct kvm_vm *vm)
(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
}
-struct kvm_vm *____vm_create(struct vm_shape shape)
+void vm_init_fields(struct kvm_vm *vm, struct vm_shape shape)
{
- struct kvm_vm *vm;
-
- vm = calloc(1, sizeof(*vm));
- TEST_ASSERT(vm != NULL, "Insufficient Memory");
-
INIT_LIST_HEAD(&vm->vcpus);
vm->regions.gpa_tree = RB_ROOT;
vm->regions.hva_tree = RB_ROOT;
@@ -380,9 +375,10 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
if (vm->pa_bits != 40)
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
#endif
+}
- vm_open(vm);
-
+void vm_init_memory_properties(struct kvm_vm *vm)
+{
/* Limit to VA-bit canonical virtual addresses. */
vm->vpages_valid = sparsebit_alloc();
vm_populate_gva_bitmap(vm);
@@ -392,6 +388,20 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
/* Allocate and setup memory for guest. */
vm->vpages_mapped = sparsebit_alloc();
+}
+
+struct kvm_vm *____vm_create(struct vm_shape shape)
+{
+ struct kvm_vm *vm;
+
+ vm = calloc(1, sizeof(*vm));
+ TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+ vm_init_fields(vm, shape);
+
+ vm_open(vm);
+
+ vm_init_memory_properties(vm);
return vm;
}
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
[not found] <cover.1780676742.git.tarunsahu@google.com>
` (8 preceding siblings ...)
2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
@ 2026-06-05 17:08 ` Tarun Sahu
2026-06-05 17:22 ` sashiko-bot
2026-06-22 23:01 ` Ackerley Tng
9 siblings, 2 replies; 27+ messages in thread
From: Tarun Sahu @ 2026-06-05 17:08 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, Tarun Sahu, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Add a new KVM selftest `guest_memfd_preservation_test` to verify that
guest memory backed by guest_memfd is preserved properly.
The test leverages the Live Update Orchestrator (LUO) infrastructure
to validate that memory folios and configuration layouts are
successfully saved and then restored during kernel live updates,
preventing any memory loss for the guest.
Here, I have used the kvm selftests framework by creating a new
vm and mapping two memory slots to it. One is the code that is executed
inside the vm and other is the guest_memfd whose memory is being
written by the guest code.
In Phase 1: Once data is written the vm exits and wait for the user
to trigger the kexec.
In Phase 2: A new vm is created with retrieved kvm and again two
memory slots are assigned. Once for guest code, and another is for
retrieved guest_memfd where guest_memfd memory is verified by the
executed guest code. If verification succeeds, The test passes.
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
---
MAINTAINERS | 1 +
tools/testing/selftests/kvm/Makefile.kvm | 6 +-
.../kvm/guest_memfd_preservation_test.c | 230 ++++++++++++++++++
3 files changed, 236 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/kvm/guest_memfd_preservation_test.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ca459d032712..76e59620d2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14419,6 +14419,7 @@ L: kvm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
F: Documentation/liveupdate/vmm.rst
+F: tools/testing/selftests/kvm/guest_memfd_preservation_test.c
F: virt/kvm/guest_memfd_luo.c
F: virt/kvm/kvm_luo.c
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 9118a5a51b89..68584d4ee1b0 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -161,6 +161,8 @@ TEST_GEN_PROGS_x86 += pre_fault_memory_test
# Compiled outputs used by test targets
TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test
+# Manual test that forks a persistent background daemon; skip auto CI run
+TEST_GEN_PROGS_EXTENDED_x86 += guest_memfd_preservation_test
TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON)
TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs
@@ -254,6 +256,7 @@ OVERRIDE_TARGETS = 1
# which causes the environment variable to override the makefile).
include ../lib.mk
include ../cgroup/lib/libcgroup.mk
+include ../liveupdate/lib/libliveupdate.mk
INSTALL_HDR_PATH = $(top_srcdir)/usr
LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
@@ -308,7 +311,8 @@ LIBKVM_S := $(filter %.S,$(LIBKVM))
LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O)
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) \
+ $(LIBCGROUP_O) $(LIBLIVEUPDATE_O)
SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS))
diff --git a/tools/testing/selftests/kvm/guest_memfd_preservation_test.c b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
new file mode 100644
index 000000000000..74f90c5c4bf5
--- /dev/null
+++ b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026, Google LLC.
+ *
+ * Author: Tarun Sahu <tarunsahu@google.com>
+ *
+ * Test for VM and guest_memfd preservation across kexec (Live Update) via LUO.
+ *
+ * NOTE: This is a MANUAL test and is excluded from automated CI/testing
+ * frameworks because Phase 1 daemonizes into the background to pin resources
+ * and requires a human operator to manually trigger kexec before Phase 2
+ * is executed. Running Phase 1 automatically would leak the background daemon
+ * and cause CI runners to falsely interpret it as a passed test.
+ *
+ * Usage:
+ * Phase 1: ./guest_memfd_preservation_test
+ * Phase 2: ./guest_memfd_preservation_test --phase2
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/sizes.h>
+#include <linux/falloc.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "ucall_common.h"
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+#include <libliveupdate.h>
+
+#define SESSION_NAME "gmem_vm_preservation_session"
+#define VM_TOKEN 0x1001
+#define GMEM_TOKEN 0x1002
+
+#define GMEM_SIZE (16ULL * 1024 * 1024)
+#define DATA_SIZE (5ULL * 1024 * 1024)
+
+static size_t page_size;
+
+/* Deterministic byte pattern generation based on offset */
+static inline uint8_t get_pattern_byte(size_t offset)
+{
+ return (uint8_t)(offset ^ 0x5A);
+}
+
+static void guest_code_phase1(uint64_t gpa, uint64_t size, uint64_t data_size)
+{
+ uint8_t *mem = (uint8_t *)gpa;
+ size_t i;
+
+ for (i = 0; i < data_size; i++)
+ mem[i] = get_pattern_byte(i);
+
+ GUEST_DONE();
+}
+
+static void guest_code_phase2(uint64_t gpa, uint64_t size, uint64_t data_size)
+{
+ uint8_t *mem = (uint8_t *)gpa;
+ size_t i;
+
+ for (i = 0; i < data_size; i++) {
+ uint8_t val = get_pattern_byte(i);
+
+ __GUEST_ASSERT(mem[i] == val,
+ "Data mismatch at offset %lu! Expected 0x%x, got 0x%x",
+ i, val, mem[i]);
+ }
+
+ GUEST_DONE();
+}
+
+static void do_phase1(void)
+{
+ uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+ int gmem_fd, dev_luo_fd, session_fd, ret;
+ const uint64_t gpa = SZ_4G;
+ struct kvm_vcpu *vcpu;
+ const int slot = 1;
+ struct kvm_vm *vm;
+
+ vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1,
+ guest_code_phase1);
+ gmem_fd = vm_create_guest_memfd(vm, GMEM_SIZE, flags);
+ vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
+ gmem_fd, 0);
+
+ for (size_t i = 0; i < GMEM_SIZE; i += page_size)
+ virt_pg_map(vm, gpa + i, gpa + i);
+
+ vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ dev_luo_fd = luo_open_device();
+ TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
+
+ session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
+ TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
+
+ ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
+ TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
+
+ ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
+ TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
+
+ printf("\n============================================================\n");
+ printf("Phase 1 Complete Successfully!\n");
+ printf("VM file and guest_memfd file have been preserved via LUO.\n");
+ printf("Tokens: VM_TOKEN=0x%x, GMEM_TOKEN=0x%x\n", VM_TOKEN, GMEM_TOKEN);
+ printf("Machine Size: %llu MB, Data Size: %llu MB\n", GMEM_SIZE / SZ_1M,
+ DATA_SIZE / SZ_1M);
+ printf("------------------------------------------------------------\n");
+
+ daemonize_and_wait();
+}
+
+static struct kvm_vm *vm_create_from_fd(int resurrected_vm_fd,
+ struct vm_shape shape)
+{
+ struct kvm_vm *vm;
+
+ vm = calloc(1, sizeof(*vm));
+ TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+ vm_init_fields(vm, shape);
+
+ vm->kvm_fd = open_path_or_exit(KVM_DEV_PATH, O_RDWR);
+ vm->fd = resurrected_vm_fd;
+
+ if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
+ vm->stats.fd = vm_get_stats_fd(vm);
+ else
+ vm->stats.fd = -1;
+
+ vm_init_memory_properties(vm);
+
+ return vm;
+}
+
+static void do_phase2(void)
+{
+ int retrieved_vm_fd, retrieved_gmem_fd, dev_luo_fd, session_fd;
+ struct vm_shape shape = VM_SHAPE_DEFAULT;
+ const uint64_t gpa = SZ_4G;
+ struct kvm_vcpu *vcpu;
+ const int slot = 1;
+ struct kvm_vm *vm;
+
+ dev_luo_fd = luo_open_device();
+ TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
+
+ session_fd = luo_retrieve_session(dev_luo_fd, SESSION_NAME);
+ TEST_ASSERT(session_fd >= 0, "Failed to retrieve LUO session");
+
+ retrieved_vm_fd = luo_session_retrieve_fd(session_fd, VM_TOKEN);
+ TEST_ASSERT(retrieved_vm_fd >= 0, "Failed to retrieve VM file descriptor");
+
+ retrieved_gmem_fd = luo_session_retrieve_fd(session_fd, GMEM_TOKEN);
+ TEST_ASSERT(retrieved_gmem_fd >= 0, "Failed to retrieve guest_memfd file descriptor");
+
+ vm = vm_create_from_fd(retrieved_vm_fd, shape);
+
+ u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+ kvm_vm_elf_load(vm, program_invocation_name);
+
+ for (int i = 0; i < NR_MEM_REGIONS; i++)
+ vm->memslots[i] = 0;
+
+ struct userspace_mem_region *slot0 = memslot2region(vm, 0);
+
+ ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
+
+ vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
+ retrieved_gmem_fd, 0);
+
+ for (size_t i = 0; i < GMEM_SIZE; i += page_size)
+ virt_pg_map(vm, gpa + i, gpa + i);
+
+ vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
+ kvm_arch_vm_finalize_vcpus(vm);
+
+ vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
+
+ printf("Resuming / Running VM in Phase 2...\n");
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ printf("\nSUCCESS: Phase 2 Complete! All 5MB complex data verified intact!\n");
+
+ luo_session_finish(session_fd);
+ close(session_fd);
+ close(dev_luo_fd);
+ /* This will also close the vm_fd */
+ kvm_vm_free(vm);
+ close(retrieved_gmem_fd);
+}
+
+int main(int argc, char *argv[])
+{
+ bool phase2 = false;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
+ page_size = getpagesize();
+
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "--phase2") == 0)
+ phase2 = true;
+ }
+
+ if (phase2)
+ do_phase2();
+ else
+ do_phase1();
+
+ return 0;
+}
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
@ 2026-06-05 17:21 ` sashiko-bot
2026-06-22 23:59 ` Ackerley Tng
1 sibling, 0 replies; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:21 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Low] Missing `__rcu` annotation on the `kvm->vm_file` field triggers Sparse warnings.
--
commit aba59ce9475ece27b2837967edea8ac0058bfe79
Author: Tarun Sahu <tarunsahu@google.com>
kvm: Prepare core VM structs and helpers for LUO support
This introduces core infrastructure to support VM preservation with LUO.
It refactors VM file creation into a new helper and tracks a weak
reference to the backing file in struct kvm.
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb063..9111a28637aff 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
[ ... ]
> @@ -874,6 +874,18 @@ struct kvm {
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> /* Protected by slots_lock (for writes) and RCU (for reads) */
> struct xarray mem_attr_array;
> +#endif
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Weak reference to the VFS file backing this KVM instance. Stored
> + * without incrementing the file refcount to prevent a circular lifetime
> + * dependency (since file->private_data already pins this struct kvm).
> + * Used exclusively to resolve the file pointer back from struct kvm.
> + *
> + * Written/cleared via rcu_assign_pointer() and read locklessly under
> + * RCU (e.g. via get_file_active() to prevent ABA races).
> + */
> + struct file *vm_file;
[Severity: Low]
Since this pointer is meant to be read locklessly under RCU and is updated
using rcu_assign_pointer() in kvm_create_vm_file() and kvm_vm_release(),
should it be declared with the __rcu annotation?
Without the __rcu qualifier, static analysis tools like Sparse will trigger
a "different address spaces" warning during the rcu_assign_pointer() call.
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=3
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
@ 2026-06-05 17:21 ` sashiko-bot
2026-06-08 18:20 ` tarunsahu
2026-06-22 23:54 ` Ackerley Tng
1 sibling, 1 reply; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:21 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] A malicious tenant can inflict a cross-tenant denial-of-service by holding the global `kvm_gmem_freeze_srcu` read lock during massive, unbounded `fallocate()` operations, blocking `kvm_gmem_freeze()` for other VMs.
--
commit 5ee5700d9538d9bd5f29f00de6d11fcc22d28ecc
Author: Tarun Sahu <tarunsahu@google.com>
kvm: guest_memfd: Add support for freezing and unfreezing mappings
This commit introduces freezing and unfreezing for guest_memfd mappings to
prevent fallocate and page fault allocations during preservation. It uses a
global SRCU structure to synchronize read-side operations with
kvm_gmem_freeze().
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 6740ae2bf948d..b94639cdf3126 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
[ ... ]
> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> loff_t len)
> {
> + struct inode *inode = file_inode(file);
> int ret;
> + int idx;
>
> - if (!(mode & FALLOC_FL_KEEP_SIZE))
> - return -EOPNOTSUPP;
> + idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> + if (kvm_gmem_is_frozen(inode)) {
> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> + return -EPERM;
> + }
>
[ ... ]
> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
> + ret = -EINVAL;
> + goto out;
> + }
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> else
> ret = kvm_gmem_allocate(file_inode(file), offset, len);
[Severity: High]
Does this code allow a malicious tenant to inflict a cross-tenant denial of
service?
The srcu_read_lock() is acquired at the beginning of kvm_gmem_fallocate() and
held across the entire kvm_gmem_punch_hole() or kvm_gmem_allocate() execution.
Because a guest_memfd can be terabytes in size, these operations could iterate
over millions of pages and take a long time to complete.
During this entire time, the global SRCU read lock is held uninterruptibly
(yielding via cond_resched() in kvm_gmem_allocate() does not drop SRCU).
>
> if (!ret)
> file_modified(file);
> +
> +out:
> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> return ret;
> }
[ ... ]
> @@ -1014,5 +1053,61 @@ void kvm_gmem_exit(void)
[ ... ]
> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
> +{
> + u64 flags = READ_ONCE(GMEM_I(inode)->flags);
> +
> + if (freeze)
> + flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
> + else
> + flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
> +
> + WRITE_ONCE(GMEM_I(inode)->flags, flags);
> +
> + if (freeze)
> + synchronize_srcu(&kvm_gmem_freeze_srcu);
[Severity: High]
Can this synchronize_srcu() block indefinitely waiting on another tenant?
Since the SRCU is global, calling kvm_gmem_freeze() on a completely unrelated
VM will invoke synchronize_srcu(&kvm_gmem_freeze_srcu). This will wait for the
first tenant's massive fallocate() to finish.
The loop in kvm_gmem_allocate() checks kvm_gmem_is_frozen(inode) on its own
inode, so it will not abort when a different inode is frozen. This effectively
allows an unprivileged tenant to stall live updates and preservation
system-wide.
> +}
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=6
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
@ 2026-06-05 17:22 ` sashiko-bot
2026-06-08 18:26 ` tarunsahu
2026-06-22 23:01 ` Ackerley Tng
1 sibling, 1 reply; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:22 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [Medium] Phase 2 blindly recreates a vCPU on the preserved VM, failing with -EEXIST.
- [Medium] Phase 2 attempts to recreate preserved memslots without explicit deletion, causing KVM_SET_USER_MEMORY_REGION ioctls to fail with -EINVAL.
--
commit fe4115b3a7bf4dcb06bece577462cff5bac12723
Author: Tarun Sahu <tarunsahu@google.com>
selftests: kvm: Add guest_memfd_preservation_test
Add a new KVM selftest guest_memfd_preservation_test to verify that
guest memory backed by guest_memfd is preserved properly. The test leverages
the Live Update Orchestrator (LUO) infrastructure to save and restore memory
folios and configuration layouts during live updates.
> diff --git a/tools/testing/selftests/kvm/guest_memfd_preservation_test.c b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
> new file mode 100644
> index 0000000000000..74f90c5c4bf5b
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/guest_memfd_preservation_test.c
[ ... ]
> +static void do_phase1(void)
> +{
[ ... ]
> + session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
> + TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
> +
> + ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
> + TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
[Severity: Medium]
Since this test creates a VM with vCPU 0 and preserves the VM file descriptor,
but doesn't preserve the vCPU file descriptor, will vCPU 0 still be retained
in KVM's state for phase 2 without a way to access it?
> +
> + ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
> + TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
[ ... ]
> +static void do_phase2(void)
> +{
[ ... ]
> + vm = vm_create_from_fd(retrieved_vm_fd, shape);
> +
> + u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
> +
> + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
[Severity: Medium]
Does this code attempt to overwrite existing memslot 0 with a new userspace
address without deleting it first?
Since the VM's internal memslots are retained in KVM kernel state when
preserved, redefining slot 0 with a new userspace_addr violates KVM's memslot
mutability rules and will likely cause KVM_SET_USER_MEMORY_REGION to fail with
-EINVAL.
> + kvm_vm_elf_load(vm, program_invocation_name);
> +
> + for (int i = 0; i < NR_MEM_REGIONS; i++)
> + vm->memslots[i] = 0;
> +
> + struct userspace_mem_region *slot0 = memslot2region(vm, 0);
> +
> + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
> +
> + vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> + retrieved_gmem_fd, 0);
[Severity: Medium]
Will this also trigger an -EINVAL error from KVM?
Similar to slot 0, redefining the guest_memfd memslot (slot 1) without explicit
deletion first also violates KVM's memslot mutability rules.
> +
> + for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> + virt_pg_map(vm, gpa + i, gpa + i);
> +
> + vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
[Severity: Medium]
Will this blindly recreate a vCPU on the preserved VM, resulting in an
immediate -EEXIST failure?
Because the entire VM state was preserved in phase 1, vCPU 0 already exists in
KVM's state. Calling vm_vcpu_add() for vCPU 0 again causes KVM to reject the
duplicate KVM_CREATE_VCPU request.
> + kvm_arch_vm_finalize_vcpus(vm);
> +
> + vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=10
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
@ 2026-06-05 17:24 ` sashiko-bot
2026-06-07 0:41 ` tarunsahu
2026-06-07 0:35 ` tarunsahu
1 sibling, 1 reply; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:24 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] Nested acquisition of `luo_file->mutex` causes a lockdep splat when subsystems retrieve files programmatically.
- [Low] Missing EXPORT_SYMBOL_GPL for `liveupdate_get_token_outgoing` and `liveupdate_get_file_incoming` causes module build failures.
--
commit 3b62824c5e4a0e08095576efe5b452875a92a685
Author: Pasha Tatashin <pasha.tatashin@soleen.com>
liveupdate: luo_file: Add internal APIs for file preservation
This commit introduces internal APIs to allow kernel subsystems to
programmatically preserve and retrieve file descriptors during a live update.
This enables subsystems managing struct file objects directly to participate
without solely relying on userspace interaction.
> diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
> index a0a419085e283..0aa0b4e5339f3 100644
> --- a/kernel/liveupdate/luo_file.c
> +++ b/kernel/liveupdate/luo_file.c
[ ... ]
> @@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
> luo_flb_unregister_all(fh);
> list_del(&ACCESS_PRIVATE(fh, list));
> }
> +EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
> +
> +/**
> + * liveupdate_get_token_outgoing - Get the token for a preserved file.
[ ... ]
> + */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> + struct file *file, u64 *tokenp)
> +{
[Severity: Low]
Should this function be exported?
Since this API is intended for kernel subsystems that might be built as
loadable modules, will leaving out EXPORT_SYMBOL_GPL() cause module link
errors?
> + struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
> + struct luo_file *luo_file;
> + int err = -ENOENT;
> +
> + list_for_each_entry(luo_file, &file_set->files_list, list) {
> + if (luo_file->file == file) {
> + if (tokenp)
> + *tokenp = luo_file->token;
> + err = 0;
> + break;
> + }
> + }
> +
> + return err;
> +}
> +
> +/**
> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
[ ... ]
> + */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> + struct file **filep)
> +{
[Severity: Low]
Similar to liveupdate_get_token_outgoing(), should this function also be
exported with EXPORT_SYMBOL_GPL() to allow loadable modules to use it?
> + return luo_retrieve_file(luo_file_set_from_session_locked(s),
> + token, filep);
[Severity: High]
Will this cause a lockdep splat if a subsystem uses this API during retrieval?
When a subsystem's .retrieve() callback is invoked from luo_retrieve_file(),
it already holds the target file's luo_file->mutex:
luo_retrieve_file() {
...
guard(mutex)(&luo_file->mutex);
...
err = luo_file->fh->ops->retrieve(&args);
}
If the subsystem callback then calls liveupdate_get_file_incoming(), it
recursively calls luo_retrieve_file() for the dependency file, attempting
to acquire a second luo_file->mutex.
Since all luo_file->mutex instances share the same lock class, won't this
trigger a 'possible recursive locking detected' warning?
> +}
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
@ 2026-06-05 17:25 ` sashiko-bot
2026-06-08 18:22 ` tarunsahu
2026-06-22 23:27 ` Ackerley Tng
1 sibling, 1 reply; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:25 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [Critical] Dereferencing `file->private_data` before validating the file type causes a NULL pointer dereference.
- [High] Logic inversion in `kvm_gmem_luo_finish()` causes a double-free of preservation structures and active guest memory folios on successful retrieval.
- [High] Missing `kho_is_enabled()` gate before invoking KHO serialization APIs causes potential NULL pointer dereference.
--
commit d0ebe234914d6981be774d511af45334f3db4977
Author: Tarun Sahu <tarunsahu@google.com>
Subject: kvm: guest_memfd_luo: add support for guest_memfd preservation
This patch sets up the basic infrastructure to preserve the guest_memfd.
It registers a new LUO file handler for guest_memfd files to serialize
and deserialize guest memory.
> diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
> new file mode 100644
> index 0000000000000..d466f889c9aa1
> --- /dev/null
> +++ b/virt/kvm/guest_memfd_luo.c
> @@ -0,0 +1,485 @@
[ ... ]
> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
> +{
> + struct inode *inode = file_inode(file);
> + struct gmem_file *gmem_file = file->private_data;
> + struct kvm *kvm = gmem_file->kvm;
[Severity: Critical]
Does this code dereference file->private_data before confirming the file
is actually a guest_memfd?
The LUO framework might pass unrelated file descriptors where
private_data is NULL. By extracting gmem_file->kvm before checking
inode->i_sb->s_magic, could this cause a NULL pointer dereference
when evaluating unrelated files?
> +
> + if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
> + return 0;
> +
[ ... ]
> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
> +{
[ ... ]
> + end_index = size >> PAGE_SHIFT;
> +
> + ser = kho_alloc_preserve(sizeof(*ser));
[Severity: High]
Could this call trigger a NULL pointer dereference if the KHO subsystem
is disabled?
According to the KHO subsystem guidelines, all callers must explicitly
gate KHO usage on kho_is_enabled(). It looks like kvm_gmem_luo_init()
registers the file handler unconditionally, and this path invokes
kho_alloc_preserve() and other KHO serialization APIs without checking.
Should this function (or the initialization) verify kho_is_enabled()
before interacting with the KHO subsystem?
> + if (IS_ERR(ser)) {
> + err = PTR_ERR(ser);
> + goto err_unfreeze_inode;
> + }
[ ... ]
> +static int kvm_gmem_luo_retrieve(struct liveupdate_file_op_args *args)
> +{
[ ... ]
> + args->file = file;
> + kho_restore_free(ser);
> + return 0;
> +
> +err_put_remaining_folios:
[ ... ]
> +static void kvm_gmem_luo_finish(struct liveupdate_file_op_args *args)
> +{
> + struct guest_memfd_luo_ser *ser;
> + struct guest_memfd_luo_folio_ser *folios_ser;
> +
> + /* Nothing to be done here, if retrieve_status was successful or errored,
> + * Cleanup is taken care of in retrieval call.
> + */
> + if (args->retrieve_status)
> + return;
[Severity: High]
Does this logic inversion cause a double-free on successful retrieval?
In kvm_gmem_luo_retrieve(), a successful retrieval frees the ser struct
but leaves args->serialized_data intact:
kvm_gmem_luo_retrieve()
kho_restore_free(ser);
return 0;
Because standard Linux error handling uses 0 for success,
if (args->retrieve_status) evaluates to false on success, causing this
function to fall through instead of returning.
Could this lead to accessing the freed ser struct, erroneously putting
active folios, and calling kho_restore_free() a second time?
> +
> + if (!args->serialized_data)
> + return;
> +
> + ser = phys_to_virt(args->serialized_data);
> + if (!ser)
> + return;
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=7
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO
2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
@ 2026-06-05 17:26 ` sashiko-bot
2026-06-08 16:13 ` tarunsahu
0 siblings, 1 reply; 27+ messages in thread
From: sashiko-bot @ 2026-06-05 17:26 UTC (permalink / raw)
To: Tarun Sahu; +Cc: kvm
Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [High] KVM live update silently drops the VM type on non-x86 architectures, leading to incorrect VM initialization during restore.
- [High] Double free attempt of KHO serialized data on the successful retrieve path triggers a kernel warning.
- [Medium] Debugfs directory name collision for restored VMs silently drops statistics for new VMs.
--
commit ded3fe4d8e96caf44b395f665f8fedc9e63e601b
Author: Tarun Sahu <tarunsahu@google.com>
kvm: kvm_luo: Allow kvm preservation with LUO
Introduce KVM VM preservation support for Live Update Orchestrator.
This registers an LUO file handler for KVM files to serialize and
deserialize necessary VM state across live updates.
> diff --git a/virt/kvm/kvm_luo.c b/virt/kvm/kvm_luo.c
> new file mode 100644
> index 0000000000000..25619f94ace5b
> --- /dev/null
> +++ b/virt/kvm/kvm_luo.c
[ ... ]
> +static int kvm_luo_preserve(struct liveupdate_file_op_args *args)
> +{
[ ... ]
> +#ifdef CONFIG_X86
> + ser->type = kvm->arch.vm_type;
> +#else
> + ser->type = 0;
[Severity: High]
Does hardcoding the VM type to 0 on non-x86 architectures discard required
state?
For instance, on ARM64, the VM type parameter encodes architectural
information such as the Intermediate Physical Address (IPA) size and
protection flags.
Will restoring the VM with type 0 cause the new kernel to initialize the
VM with default settings, potentially leading to a mismatched memory layout?
> +#endif
> +
> + args->serialized_data = virt_to_phys(ser);
> +
> + return 0;
> +}
> +
> +static atomic_t restored_vm_id = ATOMIC_INIT(0);
> +
> +static int kvm_luo_retrieve(struct liveupdate_file_op_args *args)
> +{
> + char fdname[ITOA_MAX_LEN + 1];
[ ... ]
> + snprintf(fdname, sizeof(fdname), "%d",
> + atomic_inc_return(&restored_vm_id));
[Severity: Medium]
Since standard VMs use their native file descriptor numbers as fdname, and
those typically start from low numbers, will this auto-incrementing counter
guarantee namespace collisions?
A collision here would result in a duplicate directory error in
kvm_create_vm_debugfs(), which would cause debugfs statistics for newly
created VMs to be skipped.
> +
> + file = kvm_create_vm_file(ser->type, fdname);
[ ... ]
> +static void kvm_luo_finish(struct liveupdate_file_op_args *args)
> +{
> + struct kvm_luo_ser *ser;
> +
> + /*
> + * If retrieve_status is true or set to error, nothing to do here.
> + * Already cleaned up in kvm_luo_retrieve().
> + */
> + if (args->retrieve_status)
> + return;
> +
> + if (!args->serialized_data)
> + return;
> +
> + ser = phys_to_virt(args->serialized_data);
> + kho_restore_free(ser);
[Severity: High]
Can this result in a double-free on the successful retrieve path?
In kvm_luo_retrieve(), kho_restore_free() is explicitly called on success
before returning 0:
args->file = file;
kho_restore_free(ser);
kvm_uevent_notify_vm_create(kvm);
return 0;
Since the retrieve is successful, args->retrieve_status will be 0 here,
and this function will call kho_restore_free() again on the same
serialized data.
> +}
--
Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=4
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
2026-06-05 17:24 ` sashiko-bot
@ 2026-06-07 0:35 ` tarunsahu
1 sibling, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-07 0:35 UTC (permalink / raw)
To: Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin, Shuah Khan,
sagis, aneesh.kumar, skhawaja, vipinsh, ackerleytng,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Hi,
I am sorry for incorrect layout of this series. After copy-pasting
I forgot to change the message id in header and the cover letter is not
attached to the same thread.
Please find the V2 cover letter here:
https://lore.kernel.org/all/cover.1780667929.git.tarunsahu@google.com/
Also for latest discussion related to scope:
https://lore.kernel.org/all/9huzldcrxkch.fsf@tarunix.c.googlers.com/
Thankyou
Tarun Sahu <tarunsahu@google.com> writes:
> From: Pasha Tatashin <pasha.tatashin@soleen.com>
>
> The core liveupdate mechanism allows userspace to preserve file
> descriptors. However, kernel subsystems often manage struct file
> objects directly and need to participate in the preservation process
> programmatically without relying solely on userspace interaction.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
> include/linux/liveupdate.h | 21 ++++++++++
> kernel/liveupdate/luo_file.c | 69 ++++++++++++++++++++++++++++++++
> kernel/liveupdate/luo_internal.h | 17 ++++++++
> 3 files changed, 107 insertions(+)
>
> diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
> index 30c5a39ff9e9..de052438eaac 100644
> --- a/include/linux/liveupdate.h
> +++ b/include/linux/liveupdate.h
> @@ -24,6 +24,7 @@ struct file;
> /**
> * struct liveupdate_file_op_args - Arguments for file operation callbacks.
> * @handler: The file handler being called.
> + * @session: The session this file belongs to.
> * @retrieve_status: The retrieve status for the 'can_finish / finish'
> * operation. A value of 0 means the retrieve has not been
> * attempted, a positive value means the retrieve was
> @@ -44,6 +45,7 @@ struct file;
> */
> struct liveupdate_file_op_args {
> struct liveupdate_file_handler *handler;
> + struct liveupdate_session *session;
> int retrieve_status;
> struct file *file;
> u64 serialized_data;
> @@ -240,6 +242,13 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
>
> int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
> int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
> +/* kernel can internally retrieve files */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> + struct file **filep);
> +
> +/* Get a token for an outgoing file, or -ENOENT if file is not preserved */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> + struct file *file, u64 *tokenp);
>
> #else /* CONFIG_LIVEUPDATE */
>
> @@ -285,5 +294,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb,
> return -EOPNOTSUPP;
> }
>
> +static inline int liveupdate_get_file_incoming(struct liveupdate_session *s,
> + u64 token, struct file **filep)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> +static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> + struct file *file, u64 *tokenp)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> #endif /* CONFIG_LIVEUPDATE */
> #endif /* _LINUX_LIVEUPDATE_H */
> diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
> index a0a419085e28..0aa0b4e5339f 100644
> --- a/kernel/liveupdate/luo_file.c
> +++ b/kernel/liveupdate/luo_file.c
> @@ -323,6 +323,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
> mutex_init(&luo_file->mutex);
>
> args.handler = fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = file;
> err = fh->ops->preserve(&args);
> if (err)
> @@ -380,6 +381,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
> struct luo_file, list);
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = luo_file->file;
> args.serialized_data = luo_file->serialized_data;
> args.private_data = luo_file->private_data;
> @@ -411,6 +413,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
> struct liveupdate_file_op_args args = {0};
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = luo_file->file;
> args.serialized_data = luo_file->serialized_data;
> args.private_data = luo_file->private_data;
> @@ -432,6 +435,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
> struct liveupdate_file_op_args args = {0};
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = luo_file->file;
> args.serialized_data = luo_file->serialized_data;
> args.private_data = luo_file->private_data;
> @@ -621,6 +625,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
> }
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.serialized_data = luo_file->serialized_data;
> err = luo_file->fh->ops->retrieve(&args);
> if (err) {
> @@ -654,6 +659,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
> struct liveupdate_file_op_args args = {0};
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = luo_file->file;
> args.serialized_data = luo_file->serialized_data;
> args.retrieve_status = luo_file->retrieve_status;
> @@ -671,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
> guard(mutex)(&luo_file->mutex);
>
> args.handler = luo_file->fh;
> + args.session = luo_session_from_file_set(file_set);
> args.file = luo_file->file;
> args.serialized_data = luo_file->serialized_data;
> args.retrieve_status = luo_file->retrieve_status;
> @@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
> luo_flb_unregister_all(fh);
> list_del(&ACCESS_PRIVATE(fh, list));
> }
> +EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
> +
> +/**
> + * liveupdate_get_token_outgoing - Get the token for a preserved file.
> + * @s: The outgoing liveupdate session.
> + * @file: The file object to search for.
> + * @tokenp: Output parameter for the found token.
> + *
> + * Searches the list of preserved files in an outgoing session for a matching
> + * file object. If found, the corresponding user-provided token is returned.
> + *
> + * This function is intended for in-kernel callers that need to correlate a
> + * file with its liveupdate token.
> + *
> + * Context: It must be called with session mutex acquired.
> + * Return: 0 on success, -ENOENT if the file is not preserved in this session.
> + */
> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
> + struct file *file, u64 *tokenp)
> +{
> + struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
> + struct luo_file *luo_file;
> + int err = -ENOENT;
> +
> + list_for_each_entry(luo_file, &file_set->files_list, list) {
> + if (luo_file->file == file) {
> + if (tokenp)
> + *tokenp = luo_file->token;
> + err = 0;
> + break;
> + }
> + }
> +
> + return err;
> +}
> +
> +/**
> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
> + * @s: The incoming liveupdate session (restored from the previous kernel).
> + * @token: The unique token identifying the file to retrieve.
> + * @filep: On success, this will be populated with a pointer to the retrieved
> + * 'struct file'.
> + *
> + * Provides a kernel-internal API for other subsystems to retrieve their
> + * preserved files after a live update. This function is a simple wrapper
> + * around luo_retrieve_file(), allowing callers to find a file by its token.
> + *
> + * The caller receives a new reference to the file and must call fput() when it
> + * is no longer needed. The file's lifetime is managed by LUO and any userspace
> + * file descriptors. If the caller needs to hold a reference to the file beyond
> + * the immediate scope, it must call get_file() itself.
> + *
> + * Context: It must be called with session mutex acquired of a restored session.
> + * Return: 0 on success. Returns -ENOENT if no file with the matching token is
> + * found, or any other negative errno on failure.
> + */
> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
> + struct file **filep)
> +{
> + return luo_retrieve_file(luo_file_set_from_session_locked(s),
> + token, filep);
> +}
> diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
> index 875844d7a41d..08b198802e7f 100644
> --- a/kernel/liveupdate/luo_internal.h
> +++ b/kernel/liveupdate/luo_internal.h
> @@ -79,6 +79,23 @@ struct luo_session {
>
> extern struct rw_semaphore luo_register_rwlock;
>
> +static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set)
> +{
> + struct luo_session *session;
> +
> + session = container_of(file_set, struct luo_session, file_set);
> +
> + return (struct liveupdate_session *)session;
> +}
> +
> +static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s)
> +{
> + struct luo_session *session = (struct luo_session *)s;
> +
> + lockdep_assert_held(&session->mutex);
> + return &session->file_set;
> +}
> +
> int luo_session_create(const char *name, struct file **filep);
> int luo_session_retrieve(const char *name, struct file **filep);
> int __init luo_session_setup_outgoing(void *fdt);
> --
> 2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation
2026-06-05 17:24 ` sashiko-bot
@ 2026-06-07 0:41 ` tarunsahu
0 siblings, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-07 0:41 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm, skhawaja
Hi,
skhawaja@google.com is working on next version of this.
I will ignore the sashiko error in this patch, as it is just to make sure
the series has its base requirement. This patch is being developed as part other
series. So I will pull latest/merged version as soon as we get.
Thanks
sashiko-bot@kernel.org writes:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
> - [High] Nested acquisition of `luo_file->mutex` causes a lockdep splat when subsystems retrieve files programmatically.
> - [Low] Missing EXPORT_SYMBOL_GPL for `liveupdate_get_token_outgoing` and `liveupdate_get_file_incoming` causes module build failures.
> --
>
> commit 3b62824c5e4a0e08095576efe5b452875a92a685
> Author: Pasha Tatashin <pasha.tatashin@soleen.com>
>
> liveupdate: luo_file: Add internal APIs for file preservation
>
> This commit introduces internal APIs to allow kernel subsystems to
> programmatically preserve and retrieve file descriptors during a live update.
> This enables subsystems managing struct file objects directly to participate
> without solely relying on userspace interaction.
>
>> diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
>> index a0a419085e283..0aa0b4e5339f3 100644
>> --- a/kernel/liveupdate/luo_file.c
>> +++ b/kernel/liveupdate/luo_file.c
> [ ... ]
>> @@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
>> luo_flb_unregister_all(fh);
>> list_del(&ACCESS_PRIVATE(fh, list));
>> }
>> +EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
>> +
>> +/**
>> + * liveupdate_get_token_outgoing - Get the token for a preserved file.
> [ ... ]
>> + */
>> +int liveupdate_get_token_outgoing(struct liveupdate_session *s,
>> + struct file *file, u64 *tokenp)
>> +{
>
> [Severity: Low]
> Should this function be exported?
>
> Since this API is intended for kernel subsystems that might be built as
> loadable modules, will leaving out EXPORT_SYMBOL_GPL() cause module link
> errors?
>
>> + struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
>> + struct luo_file *luo_file;
>> + int err = -ENOENT;
>> +
>> + list_for_each_entry(luo_file, &file_set->files_list, list) {
>> + if (luo_file->file == file) {
>> + if (tokenp)
>> + *tokenp = luo_file->token;
>> + err = 0;
>> + break;
>> + }
>> + }
>> +
>> + return err;
>> +}
>> +
>> +/**
>> + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use.
> [ ... ]
>> + */
>> +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
>> + struct file **filep)
>> +{
>
> [Severity: Low]
> Similar to liveupdate_get_token_outgoing(), should this function also be
> exported with EXPORT_SYMBOL_GPL() to allow loadable modules to use it?
>
>> + return luo_retrieve_file(luo_file_set_from_session_locked(s),
>> + token, filep);
>
> [Severity: High]
> Will this cause a lockdep splat if a subsystem uses this API during retrieval?
>
> When a subsystem's .retrieve() callback is invoked from luo_retrieve_file(),
> it already holds the target file's luo_file->mutex:
>
> luo_retrieve_file() {
> ...
> guard(mutex)(&luo_file->mutex);
> ...
> err = luo_file->fh->ops->retrieve(&args);
> }
>
> If the subsystem callback then calls liveupdate_get_file_incoming(), it
> recursively calls luo_retrieve_file() for the dependency file, attempting
> to acquire a second luo_file->mutex.
>
> Since all luo_file->mutex instances share the same lock class, won't this
> trigger a 'possible recursive locking detected' warning?
>
>> +}
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO
2026-06-05 17:26 ` sashiko-bot
@ 2026-06-08 16:13 ` tarunsahu
0 siblings, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-08 16:13 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm
Hi,
sashiko-bot@kernel.org writes:
> [ ... ]
>> +#ifdef CONFIG_X86
>> + ser->type = kvm->arch.vm_type;
>> +#else
>> + ser->type = 0;
>
> [Severity: High]
> Does hardcoding the VM type to 0 on non-x86 architectures discard required
> state?
>
> For instance, on ARM64, the VM type parameter encodes architectural
> information such as the Intermediate Physical Address (IPA) size and
> protection flags.
>
> Will restoring the VM with type 0 cause the new kernel to initialize the
> VM with default settings, potentially leading to a mismatched memory layout?
>
Ack. Other than x86_64 arch, vm_type field is not part of struct
kvm_arch. But the type is used in creating the new kvm which we will
need on retrieval path. Currently only ARM64 supports the guest_memfd
other than x86_64, So I suggest the following:
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86)
ser->type = kvm->arch.vm_type;
+#elif defined(CONFIG_ARM64)
+ ser->type = kvm_phys_shift(&kvm->arch.mmu);
+ if (kvm_vm_is_protected(kvm))
+ ser->type |= KVM_VM_TYPE_ARM_PROTECTED;
#else
ser->type = 0;
#endif
>> +
>> +static atomic_t restored_vm_id = ATOMIC_INIT(0);
>> +
>> +static int kvm_luo_retrieve(struct liveupdate_file_op_args *args)
>> +{
>> + char fdname[ITOA_MAX_LEN + 1];
> [ ... ]
>> + snprintf(fdname, sizeof(fdname), "%d",
>> + atomic_inc_return(&restored_vm_id));
>
> [Severity: Medium]
> Since standard VMs use their native file descriptor numbers as fdname, and
> those typically start from low numbers, will this auto-incrementing counter
> guarantee namespace collisions?
>
> A collision here would result in a duplicate directory error in
> kvm_create_vm_debugfs(), which would cause debugfs statistics for newly
> created VMs to be skipped.
>
Yes, I agree, I have a comment in cover letter, Would be happy to
discuss alternative, if this is really an issue.
>> +
>> + file = kvm_create_vm_file(ser->type, fdname);
> [ ... ]
>> +static void kvm_luo_finish(struct liveupdate_file_op_args *args)
>> +{
>> + struct kvm_luo_ser *ser;
>> +
>> + /*
>> + * If retrieve_status is true or set to error, nothing to do here.
>> + * Already cleaned up in kvm_luo_retrieve().
>> + */
>> + if (args->retrieve_status)
>> + return;
>> +
>> + if (!args->serialized_data)
>> + return;
>> +
>> + ser = phys_to_virt(args->serialized_data);
>> + kho_restore_free(ser);
>
> [Severity: High]
> Can this result in a double-free on the successful retrieve path?
>
> In kvm_luo_retrieve(), kho_restore_free() is explicitly called on success
> before returning 0:
>
> args->file = file;
> kho_restore_free(ser);
>
> kvm_uevent_notify_vm_create(kvm);
> return 0;
>
> Since the retrieve is successful, args->retrieve_status will be 0 here,
> and this function will call kho_restore_free() again on the same
> serialized data.
This is incorrect: retrieve_status is 1 when it is successful. So this
review comment is incorrect.
>
>> +}
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=4
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
2026-06-05 17:21 ` sashiko-bot
@ 2026-06-08 18:20 ` tarunsahu
0 siblings, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-08 18:20 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm
sashiko-bot@kernel.org writes:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [High] A malicious tenant can inflict a cross-tenant denial-of-service by holding the global `kvm_gmem_freeze_srcu` read lock during massive, unbounded `fallocate()` operations, blocking `kvm_gmem_freeze()` for other VMs.
> --
>
> commit 5ee5700d9538d9bd5f29f00de6d11fcc22d28ecc
> Author: Tarun Sahu <tarunsahu@google.com>
>
> kvm: guest_memfd: Add support for freezing and unfreezing mappings
>
> This commit introduces freezing and unfreezing for guest_memfd mappings to
> prevent fallocate and page fault allocations during preservation. It uses a
> global SRCU structure to synchronize read-side operations with
> kvm_gmem_freeze().
>
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 6740ae2bf948d..b94639cdf3126 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
> [ ... ]
>> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>> static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>> loff_t len)
>> {
>> + struct inode *inode = file_inode(file);
>> int ret;
>> + int idx;
>>
>> - if (!(mode & FALLOC_FL_KEEP_SIZE))
>> - return -EOPNOTSUPP;
>> + idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>> + if (kvm_gmem_is_frozen(inode)) {
>> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> + return -EPERM;
>> + }
>>
> [ ... ]
>> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>>
>> if (mode & FALLOC_FL_PUNCH_HOLE)
>> ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>> else
>> ret = kvm_gmem_allocate(file_inode(file), offset, len);
>
> [Severity: High]
> Does this code allow a malicious tenant to inflict a cross-tenant denial of
> service?
>
> The srcu_read_lock() is acquired at the beginning of kvm_gmem_fallocate() and
> held across the entire kvm_gmem_punch_hole() or kvm_gmem_allocate() execution.
> Because a guest_memfd can be terabytes in size, these operations could iterate
> over millions of pages and take a long time to complete.
>
> During this entire time, the global SRCU read lock is held uninterruptibly
> (yielding via cond_resched() in kvm_gmem_allocate() does not drop SRCU).
>
>>
>> if (!ret)
>> file_modified(file);
>> +
>> +out:
>> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>> return ret;
>> }
> [ ... ]
>> @@ -1014,5 +1053,61 @@ void kvm_gmem_exit(void)
> [ ... ]
>> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
>> +{
>> + u64 flags = READ_ONCE(GMEM_I(inode)->flags);
>> +
>> + if (freeze)
>> + flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
>> + else
>> + flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
>> +
>> + WRITE_ONCE(GMEM_I(inode)->flags, flags);
>> +
>> + if (freeze)
>> + synchronize_srcu(&kvm_gmem_freeze_srcu);
>
> [Severity: High]
> Can this synchronize_srcu() block indefinitely waiting on another tenant?
>
> Since the SRCU is global, calling kvm_gmem_freeze() on a completely unrelated
> VM will invoke synchronize_srcu(&kvm_gmem_freeze_srcu). This will wait for the
> first tenant's massive fallocate() to finish.
>
> The loop in kvm_gmem_allocate() checks kvm_gmem_is_frozen(inode) on its own
> inode, so it will not abort when a different inode is frozen. This effectively
> allows an unprivileged tenant to stall live updates and preservation
> system-wide.
Agree. With having per-cpu have a problem with the per-cpu structure for
the inode freeze as it is not tracked in cgroup (IIUC). which might result in
OOM on host when a lot of guest_memfd allocation are triggered.
We can use per inode rw_semaphore, but they will not be per-cpu, so
there will be cache line bouncing. Which will affect only slow path
(page fault allocation, fallocate). I would like to discuss on the best
strategy.
>
>> +}
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=6
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
2026-06-05 17:25 ` sashiko-bot
@ 2026-06-08 18:22 ` tarunsahu
0 siblings, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-08 18:22 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm
sashiko-bot@kernel.org writes:
> [ ... ]
>> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
>> +{
>> + struct inode *inode = file_inode(file);
>> + struct gmem_file *gmem_file = file->private_data;
>> + struct kvm *kvm = gmem_file->kvm;
>
> [Severity: Critical]
> Does this code dereference file->private_data before confirming the file
> is actually a guest_memfd?
>
> The LUO framework might pass unrelated file descriptors where
> private_data is NULL. By extracting gmem_file->kvm before checking
> inode->i_sb->s_magic, could this cause a NULL pointer dereference
> when evaluating unrelated files?
Oops, This is a bug, I will update it.
>
>> +
>> + if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
>> + return 0;
>> +
> [ ... ]
>> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
>> +{
> [ ... ]
>> + end_index = size >> PAGE_SHIFT;
>> +
>> + ser = kho_alloc_preserve(sizeof(*ser));
>
> [Severity: High]
> Could this call trigger a NULL pointer dereference if the KHO subsystem
> is disabled?
>
> According to the KHO subsystem guidelines, all callers must explicitly
> gate KHO usage on kho_is_enabled(). It looks like kvm_gmem_luo_init()
> registers the file handler unconditionally, and this path invokes
> kho_alloc_preserve() and other KHO serialization APIs without checking.
>
> Should this function (or the initialization) verify kho_is_enabled()
> before interacting with the KHO subsystem?
Luo already does the check, this review in incorrect.
> [ ... ]
>> +static void kvm_gmem_luo_finish(struct liveupdate_file_op_args *args)
>> +{
>> + struct guest_memfd_luo_ser *ser;
>> + struct guest_memfd_luo_folio_ser *folios_ser;
>> +
>> + /* Nothing to be done here, if retrieve_status was successful or errored,
>> + * Cleanup is taken care of in retrieval call.
>> + */
>> + if (args->retrieve_status)
>> + return;
>
> [Severity: High]
> Does this logic inversion cause a double-free on successful retrieval?
>
> In kvm_gmem_luo_retrieve(), a successful retrieval frees the ser struct
> but leaves args->serialized_data intact:
>
> kvm_gmem_luo_retrieve()
> kho_restore_free(ser);
> return 0;
>
> Because standard Linux error handling uses 0 for success,
> if (args->retrieve_status) evaluates to false on success, causing this
> function to fall through instead of returning.
>
> Could this lead to accessing the freed ser struct, erroneously putting
> active folios, and calling kho_restore_free() a second time?
retrieve_status is 1 when successful not 0, this review is incorrect.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
2026-06-05 17:22 ` sashiko-bot
@ 2026-06-08 18:26 ` tarunsahu
0 siblings, 0 replies; 27+ messages in thread
From: tarunsahu @ 2026-06-08 18:26 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvm
sashiko-bot@kernel.org writes:
>
>> + session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
>> + TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
>> +
>> + ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
>> + TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
>
> [Severity: Medium]
> Since this test creates a VM with vCPU 0 and preserves the VM file descriptor,
> but doesn't preserve the vCPU file descriptor, will vCPU 0 still be retained
> in KVM's state for phase 2 without a way to access it?
This is not the scope of the test.
>
>> +
>> + ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
>> + TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
>
> [ ... ]
>
>> +static void do_phase2(void)
>> +{
>
> [ ... ]
>
>> + vm = vm_create_from_fd(retrieved_vm_fd, shape);
>> +
>> + u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
>> +
>> + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
>
> [Severity: Medium]
> Does this code attempt to overwrite existing memslot 0 with a new userspace
> address without deleting it first?
>
> Since the VM's internal memslots are retained in KVM kernel state when
> preserved, redefining slot 0 with a new userspace_addr violates KVM's memslot
> mutability rules and will likely cause KVM_SET_USER_MEMORY_REGION to fail with
> -EINVAL.
Not preserving the memslots. Only preserving the guest_memfd which is
attached to memslot 1.
>
>> + kvm_vm_elf_load(vm, program_invocation_name);
>> +
>> + for (int i = 0; i < NR_MEM_REGIONS; i++)
>> + vm->memslots[i] = 0;
>> +
>> + struct userspace_mem_region *slot0 = memslot2region(vm, 0);
>> +
>> + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
>> +
>> + vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
>> + retrieved_gmem_fd, 0);
>
> [Severity: Medium]
> Will this also trigger an -EINVAL error from KVM?
>
> Similar to slot 0, redefining the guest_memfd memslot (slot 1) without explicit
> deletion first also violates KVM's memslot mutability rules.
reattaching the guest_memfd not preserving the memslot mapping. so this
is not valid here.
>
>> +
>> + for (size_t i = 0; i < GMEM_SIZE; i += page_size)
>> + virt_pg_map(vm, gpa + i, gpa + i);
>> +
>> + vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
>
> [Severity: Medium]
> Will this blindly recreate a vCPU on the preserved VM, resulting in an
> immediate -EEXIST failure?
Incorrect, THere was no preservation of vCPU.
>
> Because the entire VM state was preserved in phase 1, vCPU 0 already exists in
> KVM's state. Calling vm_vcpu_add() for vCPU 0 again causes KVM to reject the
> duplicate KVM_CREATE_VCPU request.
No. Only guest_memfd and vm file was preserved. vm_file only preserved
with vm_type and everything else was recreated so, incorrect review.
>
>> + kvm_arch_vm_finalize_vcpus(vm);
>> +
>> + vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/cover.1780676742.git.tarunsahu@google.com?part=10
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
2026-06-05 17:22 ` sashiko-bot
@ 2026-06-22 23:01 ` Ackerley Tng
1 sibling, 0 replies; 27+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:01 UTC (permalink / raw)
To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Tarun Sahu <tarunsahu@google.com> writes:
> Add a new KVM selftest `guest_memfd_preservation_test` to verify that
> guest memory backed by guest_memfd is preserved properly.
>
Don't think using backticks in commit messages is a common practice but
I might be wrong here.
> The test leverages the Live Update Orchestrator (LUO) infrastructure
> to validate that memory folios and configuration layouts are
> successfully saved and then restored during kernel live updates,
> preventing any memory loss for the guest.
>
> Here, I have used the kvm selftests framework by creating a new
> vm and mapping two memory slots to it. One is the code that is executed
> inside the vm and other is the guest_memfd whose memory is being
> written by the guest code.
>
Don't think commit messages with "I" are common either
> In Phase 1: Once data is written the vm exits and wait for the user
> to trigger the kexec.
>
> In Phase 2: A new vm is created with retrieved kvm and again two
> memory slots are assigned. Once for guest code, and another is for
> retrieved guest_memfd where guest_memfd memory is verified by the
> executed guest code. If verification succeeds, The test passes.
>
>
> [...snip...]
>
> +#define SESSION_NAME "gmem_vm_preservation_session"
> +#define VM_TOKEN 0x1001
> +#define GMEM_TOKEN 0x1002
> +
> +#define GMEM_SIZE (16ULL * 1024 * 1024)
> +#define DATA_SIZE (5ULL * 1024 * 1024)
> +
> +static size_t page_size;
> +
> +/* Deterministic byte pattern generation based on offset */
> +static inline uint8_t get_pattern_byte(size_t offset)
> +{
> + return (uint8_t)(offset ^ 0x5A);
> +}
> +
> +static void guest_code_phase1(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> + uint8_t *mem = (uint8_t *)gpa;
> + size_t i;
> +
> + for (i = 0; i < data_size; i++)
> + mem[i] = get_pattern_byte(i);
> +
> + GUEST_DONE();
> +}
> +
> +static void guest_code_phase2(uint64_t gpa, uint64_t size, uint64_t data_size)
> +{
> + uint8_t *mem = (uint8_t *)gpa;
> + size_t i;
> +
> + for (i = 0; i < data_size; i++) {
> + uint8_t val = get_pattern_byte(i);
> +
> + __GUEST_ASSERT(mem[i] == val,
> + "Data mismatch at offset %lu! Expected 0x%x, got 0x%x",
> + i, val, mem[i]);
> + }
> +
> + GUEST_DONE();
> +}
> +
> +static void do_phase1(void)
> +{
> + uint64_t flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
Is there a reason to set GUEST_MEMFD_FLAG_MMAP? We're not really
accessing that memory from the host in this test.
> + int gmem_fd, dev_luo_fd, session_fd, ret;
> + const uint64_t gpa = SZ_4G;
> + struct kvm_vcpu *vcpu;
> + const int slot = 1;
> + struct kvm_vm *vm;
> +
> + vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1,
> + guest_code_phase1);
> + gmem_fd = vm_create_guest_memfd(vm, GMEM_SIZE, flags);
> + vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> + gmem_fd, 0);
> +
> + for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> + virt_pg_map(vm, gpa + i, gpa + i);
> +
> + vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
If GMEM_SIZE and DATA_SIZE are static I think we don't have to set those
as vcpu_args_set(), they can be used as macros from within the guest.
> +
> + vcpu_run(vcpu);
> + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> + dev_luo_fd = luo_open_device();
> + TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> + session_fd = luo_create_session(dev_luo_fd, SESSION_NAME);
> + TEST_ASSERT(session_fd >= 0, "Failed to create LUO session");
> +
> + ret = luo_session_preserve_fd(session_fd, vm->fd, VM_TOKEN);
> + TEST_ASSERT(ret == 0, "Failed to preserve VM file descriptor");
> +
> + ret = luo_session_preserve_fd(session_fd, gmem_fd, GMEM_TOKEN);
> + TEST_ASSERT(ret == 0, "Failed to preserve guest_memfd file descriptor");
> +
Thanks for showing how this works :)
> + printf("\n============================================================\n");
> + printf("Phase 1 Complete Successfully!\n");
> + printf("VM file and guest_memfd file have been preserved via LUO.\n");
> + printf("Tokens: VM_TOKEN=0x%x, GMEM_TOKEN=0x%x\n", VM_TOKEN, GMEM_TOKEN);
> + printf("Machine Size: %llu MB, Data Size: %llu MB\n", GMEM_SIZE / SZ_1M,
> + DATA_SIZE / SZ_1M);
> + printf("------------------------------------------------------------\n");
> +
> + daemonize_and_wait();
> +}
> +
> +static struct kvm_vm *vm_create_from_fd(int resurrected_vm_fd,
> + struct vm_shape shape)
> +{
> + struct kvm_vm *vm;
> +
> + vm = calloc(1, sizeof(*vm));
> + TEST_ASSERT(vm != NULL, "Insufficient Memory");
> +
> + vm_init_fields(vm, shape);
What would happen if the shape was changed between preserving and
restoring?
> +
> + vm->kvm_fd = open_path_or_exit(KVM_DEV_PATH, O_RDWR);
> + vm->fd = resurrected_vm_fd;
> +
> + if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD))
> + vm->stats.fd = vm_get_stats_fd(vm);
> + else
> + vm->stats.fd = -1;
> +
> + vm_init_memory_properties(vm);
> +
> + return vm;
> +}
> +
I think vm_create_from_fd() could be introduced in an earlier patch to
reduce the amount of new code in this patch. Also, I think it could
perhaps be moved to kvm_util.c assuming that other test will use it too.
> +static void do_phase2(void)
> +{
> + int retrieved_vm_fd, retrieved_gmem_fd, dev_luo_fd, session_fd;
> + struct vm_shape shape = VM_SHAPE_DEFAULT;
> + const uint64_t gpa = SZ_4G;
> + struct kvm_vcpu *vcpu;
> + const int slot = 1;
> + struct kvm_vm *vm;
> +
> + dev_luo_fd = luo_open_device();
> + TEST_ASSERT(dev_luo_fd >= 0, "Failed to open /dev/liveupdate");
> +
> + session_fd = luo_retrieve_session(dev_luo_fd, SESSION_NAME);
> + TEST_ASSERT(session_fd >= 0, "Failed to retrieve LUO session");
> +
> + retrieved_vm_fd = luo_session_retrieve_fd(session_fd, VM_TOKEN);
> + TEST_ASSERT(retrieved_vm_fd >= 0, "Failed to retrieve VM file descriptor");
> +
> + retrieved_gmem_fd = luo_session_retrieve_fd(session_fd, GMEM_TOKEN);
> + TEST_ASSERT(retrieved_gmem_fd >= 0, "Failed to retrieve guest_memfd file descriptor");
> +
> + vm = vm_create_from_fd(retrieved_vm_fd, shape);
> +
> + u64 nr_pages = 2048; /* 8MB is plenty for slot0 pages */
> +
I don't think declarations are usually mixed with regular code.
> + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
> + kvm_vm_elf_load(vm, program_invocation_name);
> +
> + for (int i = 0; i < NR_MEM_REGIONS; i++)
> + vm->memslots[i] = 0;
> +
> + struct userspace_mem_region *slot0 = memslot2region(vm, 0);
> +
> + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
> +
> + vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, GMEM_SIZE, NULL,
> + retrieved_gmem_fd, 0);
> +
> + for (size_t i = 0; i < GMEM_SIZE; i += page_size)
> + virt_pg_map(vm, gpa + i, gpa + i);
> +
> + vcpu = vm_vcpu_add(vm, 0, guest_code_phase2);
> + kvm_arch_vm_finalize_vcpus(vm);
> +
> + vcpu_args_set(vcpu, 3, gpa, GMEM_SIZE, DATA_SIZE);
> +
> + printf("Resuming / Running VM in Phase 2...\n");
> + vcpu_run(vcpu);
> + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
> +
> + printf("\nSUCCESS: Phase 2 Complete! All 5MB complex data verified intact!\n");
> +
> + luo_session_finish(session_fd);
> + close(session_fd);
> + close(dev_luo_fd);
> + /* This will also close the vm_fd */
> + kvm_vm_free(vm);
> + close(retrieved_gmem_fd);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + bool phase2 = false;
> +
> + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD));
> + page_size = getpagesize();
> +
> + for (int i = 1; i < argc; i++) {
> + if (strcmp(argv[i], "--phase2") == 0)
> + phase2 = true;
> + }
> +
Maybe use getopt() here?
> + if (phase2)
> + do_phase2();
> + else
> + do_phase1();
> +
> + return 0;
> +}
> --
> 2.54.0.1032.g2f8565e1d1-goog
I think we also need tests for trying to allocate while frozen, and
conversion while frozen, and trying to preserve while preservation is
not allowed.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
2026-06-05 17:25 ` sashiko-bot
@ 2026-06-22 23:27 ` Ackerley Tng
1 sibling, 0 replies; 27+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:27 UTC (permalink / raw)
To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Tarun Sahu <tarunsahu@google.com> writes:
> This patch sets up the basic infrastructure to preserve the guest_memfd.
> Currently this supports only fully shared guest_memfd and backed by
> PAGE_SIZE pages.
>
> It registers a new LUO file handler for guest_memfd files to serialize
> and deserialize guest memory. This allows preserving guest memory backed
> by guest_memfd across updates, ensuring that guest instances can be
> resumed seamlessly without losing their memory contents.
>
> Preservation is straight forward. It walks through the folios and
> serialize them.
>
> There is kvm_gmem_freeze call on preserve which freeze the guest_memfd
> inode. It avoids any changes to inode mapping with fallocate calls or
> any new fault allocation (fails) on or after preservation. No need to check
> this during the page fault as preservation is only supported for
> pre-faulted/pre-allocated guest_memfd.
>
> While retrieving the guest_memfd, it requires the struct kvm to create
> new guest_memfd. So it first get the vm_file from the same session using
> the token passed during the preservation. And use it to get
> vm_file->kvm.
>
> This change also update the MAINTAINERS list.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
> MAINTAINERS | 1 +
> include/linux/kho/abi/kvm.h | 79 +++++-
> virt/kvm/Makefile.kvm | 2 +-
> virt/kvm/guest_memfd_luo.c | 485 ++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 7 +
> virt/kvm/kvm_mm.h | 4 +
> 6 files changed, 571 insertions(+), 7 deletions(-)
> create mode 100644 virt/kvm/guest_memfd_luo.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9bfc3c1f6676..16cba790a84d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -14418,6 +14418,7 @@ L: kexec@lists.infradead.org
> L: kvm@vger.kernel.org
> S: Maintained
> T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
> +F: virt/kvm/guest_memfd_luo.c
> F: virt/kvm/kvm_luo.c
>
> KVM PARAVIRT (KVM/paravirt)
> diff --git a/include/linux/kho/abi/kvm.h b/include/linux/kho/abi/kvm.h
> index 718db68a541a..42074d76e04a 100644
> --- a/include/linux/kho/abi/kvm.h
> +++ b/include/linux/kho/abi/kvm.h
> @@ -9,20 +9,23 @@
> #define _LINUX_KHO_ABI_KVM_H
>
> #include <linux/types.h>
> +#include <linux/bits.h>
> #include <linux/kho/abi/kexec_handover.h>
>
> /**
> - * DOC: KVM Live Update ABI
> + * DOC: KVM and guest_memfd Live Update ABI
> *
> - * KVM uses the ABI defined below for preserving its state
> + * KVM and guest_memfd use the ABI defined below for preserving their states
> * across a kexec reboot using the LUO.
> *
> - * The state is serialized into a packed structure `struct kvm_luo_ser`
> - * which is handed over to the next kernel via the KHO mechanism.
> + * The state is serialized into packed structures (struct kvm_luo_ser and
> + * struct guest_memfd_luo_ser) which are handed over to the next kernel via
> + * the KHO mechanism.
> *
> - * This interface is a contract. Any modification to the structure layout
> + * This interface is a contract. Any modification to the structure layouts
> * constitutes a breaking change. Such changes require incrementing the
> - * version number in the KVM_LUO_FH_COMPATIBLE compatibility string.
> + * version number in the KVM_LUO_FH_COMPATIBLE or
> + * GUEST_MEMFD_LUO_FH_COMPATIBLE compatibility strings.
> */
>
> /**
> @@ -36,4 +39,68 @@ struct kvm_luo_ser {
> /* The compatibility string for KVM VM file handler */
> #define KVM_LUO_FH_COMPATIBLE "kvm_vm_luo_v1"
>
> +/**
> + * struct guest_memfd_luo_folio_ser - Serialization layout for a single folio in guest_memfd.
> + * @pfn: Page Frame Number of the folio.
> + * @index: Page offset of the folio within the file.
> + * @flags: State flags associated with the folio.
> + */
> +struct guest_memfd_luo_folio_ser {
> + u64 pfn:52;
> + u64 flags:12;
> + u64 index;
> +} __packed;
> +
> +/**
> + * GUEST_MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
> + *
> + * This flag is per folio to check if the folio is uptodate.
> + */
> +#define GUEST_MEMFD_LUO_FOLIO_UPTODATE BIT(0)
> +
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_MMAP - The guest_memfd supports mmap.
> + *
> + * This flag indicates that the guest_memfd supports host-side mmap.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_MMAP BIT(0)
> +
> +/**
> + * GUEST_MEMFD_LUO_FLAG_INIT_SHARED - Initialize memory as shared.
> + *
> + * This flag indicates that the guest_memfd has been initialized as shared
> + * memory.
> + */
> +#define GUEST_MEMFD_LUO_FLAG_INIT_SHARED BIT(1)
> +
> +/**
> + * GUEST_MEMFD_LUO_SUPPORTED_FLAGS - Supported guest_memfd LUO flags mask.
> + *
> + * A mask of all guest_memfd preservation flags supported by this version
> + * of the KVM LUO ABI.
> + */
> +#define GUEST_MEMFD_LUO_SUPPORTED_FLAGS (GUEST_MEMFD_LUO_FLAG_MMAP | \
> + GUEST_MEMFD_LUO_FLAG_INIT_SHARED)
> +
> +/**
> + * struct guest_memfd_luo_ser - Main serialization structure for guest_memfd.
> + * @size: The size of the file in bytes.
> + * @flags: File-level flags.
> + * @nr_folios: Number of folios in the folios array.
> + * @vm_token: Token of the associated KVM VM instance.
> + * @folios: KHO vmalloc descriptor pointing to the array of
> + * struct guest_memfd_luo_folio_ser.
> + */
> +struct guest_memfd_luo_ser {
> + u64 size;
> + u64 flags;
> + u64 nr_folios;
> + u64 vm_token;
> + struct kho_vmalloc folios;
> +} __packed;
> +
> +/* The compatibility string for GUEST_MEMFD file handler */
> +#define GUEST_MEMFD_LUO_FH_COMPATIBLE "guest_memfd_luo_v1"
> +
> #endif /* _LINUX_KHO_ABI_KVM_H */
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index c1a962159264..d30fca094c42 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -13,4 +13,4 @@ kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
> kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
> kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
> -kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/kvm_luo.o
> +kvm-$(CONFIG_LIVEUPDATE_GUEST_MEMFD) += $(KVM)/guest_memfd_luo.o $(KVM)/kvm_luo.o
> diff --git a/virt/kvm/guest_memfd_luo.c b/virt/kvm/guest_memfd_luo.c
> new file mode 100644
> index 000000000000..d466f889c9aa
> --- /dev/null
> +++ b/virt/kvm/guest_memfd_luo.c
> @@ -0,0 +1,485 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright (c) 2026, Google LLC.
> + * Tarun Sahu <tarunsahu@google.com>
> + *
> + * Guestmemfd Preservation for Live Update Orchestrator (LUO)
> + */
> +
> +/**
> + * DOC: Guestmemfd Preservation via LUO
> + *
> + * Overview
> + * ========
> + *
> + * Guest memory file descriptors (guest_memfd) can be preserved over a kexec
> + * reboot using the Live Update Orchestrator (LUO) file preservation. This
> + * allows userspace to preserve VM memory across kexec reboots.
> + *
> + * The preservation is not intended to be transparent. Only select properties
> + * of the guest_memfd are preserved, while others are reset to default.
> + *
> + * Preserved Properties
> + * ====================
> + *
> + * The following properties of guest_memfd are preserved across kexec:
> + *
> + * File Size
> + * The size of the file is preserved.
> + *
> + * File Contents
> + * All folios present in the page cache are preserved.
> + *
> + * File-level Flags
> + * The file-level flags (such as MMAP support and INIT_SHARED default mapping)
> + * are preserved.
> + *
> + * Non-Preserved Properties
> + * ========================
> + *
> + * NUMA Memory Policy
> + * NUMA memory policies associated with the guest_memfd are not preserved.
> + */
> +#include <linux/liveupdate.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pagemap.h>
> +#include <linux/file.h>
> +#include <linux/err.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/magic.h>
> +#include <linux/kexec_handover.h>
> +#include <linux/kho/abi/kexec_handover.h>
> +#include <linux/kho/abi/kvm.h>
> +#include "guest_memfd.h"
> +
> +static int kvm_gmem_luo_walk_folios(struct address_space *mapping,
> + pgoff_t end_index, struct guest_memfd_luo_folio_ser *folios_ser,
> + u64 *out_count)
> +{
> + struct folio_batch fbatch;
> + pgoff_t index = 0;
> + u64 count = 0;
> + int err = 0;
> +
> + folio_batch_init(&fbatch);
> + while (index < end_index) {
> + unsigned int nr, i;
> +
> + nr = filemap_get_folios(mapping, &index, end_index - 1, &fbatch);
> + if (nr == 0)
> + break;
> +
> + for (i = 0; i < nr; i++) {
> + struct folio *folio = fbatch.folios[i];
> +
> + if (folios_ser) {
> + if (folio_test_hwpoison(folio)) {
> + err = -EHWPOISON;
> + folio_batch_release(&fbatch);
> + goto out;
> + }
> + err = kho_preserve_folio(folio);
> + if (err) {
> + folio_batch_release(&fbatch);
> + goto out;
> + }
> +
> + folios_ser[count].pfn = folio_pfn(folio);
> + folios_ser[count].index = folio->index;
> + folios_ser[count].flags = folio_test_uptodate(folio) ?
> + GUEST_MEMFD_LUO_FOLIO_UPTODATE : 0;
> + }
> + count++;
> + }
> + folio_batch_release(&fbatch);
> + cond_resched();
> + }
> +
> +out:
> + *out_count = count;
> + return err;
> +}
> +
> +static bool kvm_gmem_luo_can_preserve(struct liveupdate_file_handler *handler, struct file *file)
> +{
> + struct inode *inode = file_inode(file);
> + struct gmem_file *gmem_file = file->private_data;
> + struct kvm *kvm = gmem_file->kvm;
> +
> + if (inode->i_sb->s_magic != GUEST_MEMFD_MAGIC)
> + return 0;
> +
How does .can_preserve decide route to this function? If it already
routes here, wouldn't this inode definitely be a guest_memfd file?
> + if (kvm_arch_has_private_mem(kvm))
> + return 0;
> +
> + if (mapping_large_folio_support(inode->i_mapping))
> + return 0;
> +
> + return 1;
Let's return true and false rather than relying on casting.
> +}
> +
> +static int kvm_gmem_luo_preserve(struct liveupdate_file_op_args *args)
> +{
> + struct guest_memfd_luo_folio_ser *folios_ser = NULL;
> + u64 count = 0, gmem_flags, abi_flags = 0;
> + struct guest_memfd_luo_ser *ser;
> + struct address_space *mapping;
> + struct gmem_file *gmem_file;
> + struct inode *inode;
> + pgoff_t end_index;
> + struct kvm *kvm;
> + int err = 0;
> + long size;
> +
> + inode = file_inode(args->file);
I think to lock out all allocates, you'd have to take
filemap_invalidate_lock() before freezing.
> + kvm_gmem_freeze(inode, true);
> +
> + mapping = inode->i_mapping;
> + size = i_size_read(inode);
> + if (!size) {
> + err = -EINVAL;
> + goto err_unfreeze_inode;
> + }
> +
> + if (WARN_ON_ONCE(!PAGE_ALIGNED(size))) {
> + err = -EINVAL;
> + goto err_unfreeze_inode;
> + }
> +
> + gmem_file = args->file->private_data;
> + kvm = gmem_file->kvm;
> +
> + gmem_flags = READ_ONCE(GMEM_I(inode)->flags);
> + if (gmem_flags & ~(GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED
Why condition this on MMAP?
After conversions lands, we'd have to iterate to check that the entire
guest_memfd is shared offset-by-offset instead of checking for INIT_SHARED.
> + | GUEST_MEMFD_F_MAPPING_FROZEN)) {
This would always be true since kvm_gmem_freeze() is done above.
> + err = -EOPNOTSUPP;
> + goto err_unfreeze_inode;
> + }
> +
> + if (gmem_flags & GUEST_MEMFD_FLAG_MMAP)
> + abi_flags |= GUEST_MEMFD_LUO_FLAG_MMAP;
> + if (gmem_flags & GUEST_MEMFD_FLAG_INIT_SHARED)
> + abi_flags |= GUEST_MEMFD_LUO_FLAG_INIT_SHARED;
> +
Is it intentional to have a different set of flags that are actually
preserved? I think we should refactor out a function to transfer the
flags over.
> + end_index = size >> PAGE_SHIFT;
> +
> + ser = kho_alloc_preserve(sizeof(*ser));
> + if (IS_ERR(ser)) {
> + err = PTR_ERR(ser);
> + goto err_unfreeze_inode;
> + }
> +
> + /* First pass: Count the folios present in the page cache */
> + err = kvm_gmem_luo_walk_folios(mapping, end_index, NULL, &count);
> + if (err)
> + goto err_free_ser;
> +
> + ser->size = size;
> + ser->flags = abi_flags;
> + ser->nr_folios = count;
> + ser->vm_token = 0; // It will be set during the kvm_gmem_luo_freeze()
I don't think // is commonly used.
> +
> + if (count > 0) {
> + folios_ser = vcalloc(count, sizeof(*folios_ser));
> + if (!folios_ser) {
> + err = -ENOMEM;
> + goto err_free_ser;
> + }
> +
> + /* Second pass: Fill the metadata array and preserve folios */
> + err = kvm_gmem_luo_walk_folios(mapping, end_index, folios_ser, &count);
I think it's clearer to just define 2 functions rather than using the
same function twice to do these different things. The comments on the
two passes can then be dropped.
> + if (err)
> + goto err_unpreserve_unlocked;
> +
> + if (WARN_ON_ONCE(count != ser->nr_folios)) {
> + err = -EINVAL;
> + goto err_unpreserve_unlocked;
> + }
> + }
> +
> + if (count > 0) {
> + err = kho_preserve_vmalloc(folios_ser, &ser->folios);
> + if (err)
> + goto err_unpreserve_unlocked;
> + }
> +
> + args->serialized_data = virt_to_phys(ser);
> + args->private_data = folios_ser;
> +
> + return 0;
> +
> +err_unpreserve_unlocked:
> + for (long i = (long)count - 1; i >= 0; i--) {
Not sure if it's common to define long i inline.
> + struct folio *folio = pfn_folio(folios_ser[i].pfn);
> +
> + kho_unpreserve_folio(folio);
> + }
> + vfree(folios_ser);
> +err_free_ser:
> + kho_unpreserve_free(ser);
> +err_unfreeze_inode:
> + kvm_gmem_freeze(inode, false);
> + return err;
> +}
> +
> +static int kvm_gmem_luo_freeze(struct liveupdate_file_op_args *args)
> +{
> + struct guest_memfd_luo_ser *ser;
> + struct gmem_file *gmem_file;
> + struct kvm *kvm;
> + struct file *kvm_file;
> + u64 vm_token;
> + int err;
> +
> + if (WARN_ON_ONCE(!args->serialized_data))
> + return -EINVAL;
> +
> + ser = phys_to_virt(args->serialized_data);
> +
> + gmem_file = args->file->private_data;
> + kvm = gmem_file->kvm;
> +
> + /*
> + * Obtain a strong reference to kvm->vm_file to prevent the SLAB_TYPESAFE_BY_RCU
> + * file memory from being reallocated while it is being processed.
> + */
> + kvm_file = get_file_active(&kvm->vm_file);
> + if (!kvm_file)
> + return -ENOENT;
> +
> + err = liveupdate_get_token_outgoing(args->session, kvm_file, &vm_token);
> + fput(kvm_file);
> + if (err)
> + return err;
> +
> + ser->vm_token = vm_token;
> + return 0;
> +}
> +
> +static void kvm_gmem_luo_discard_folios(
> + const struct guest_memfd_luo_folio_ser *folios_ser,
> + u64 nr_folios, u64 start_idx)
> +{
> + long i;
> +
> + for (i = start_idx; i < nr_folios; i++) {
> + struct folio *folio;
> + phys_addr_t phys;
> +
> + if (!folios_ser[i].pfn)
> + continue;
> +
> + phys = PFN_PHYS(folios_ser[i].pfn);
> + folio = kho_restore_folio(phys);
> + if (folio)
> + folio_put(folio);
> + }
> +}
> +
> +static void kvm_gmem_luo_unpreserve(struct liveupdate_file_op_args *args)
> +{
> + struct guest_memfd_luo_folio_ser *folios_ser = args->private_data;
> + struct guest_memfd_luo_ser *ser;
> + long i;
> +
> + if (WARN_ON_ONCE(!args->serialized_data))
> + return;
> +
> + ser = phys_to_virt(args->serialized_data);
> + if (!ser)
> + return;
> +
> + if (ser->nr_folios > 0)
> + kho_unpreserve_vmalloc(&ser->folios);
> + for (i = ser->nr_folios - 1; i >= 0; i--) {
> + struct folio *folio;
> +
> + if (!folios_ser[i].pfn)
Is it possible for pfn to be 0 here? Perhaps this should be a
WARN_ON_ONCE().
> + continue;
> +
> + folio = pfn_folio(folios_ser[i].pfn);
> + kho_unpreserve_folio(folio);
> + }
> + vfree(folios_ser);
> +
> + kho_unpreserve_free(ser);
> + kvm_gmem_freeze(file_inode(args->file), false);
> +}
> +
>
> [...snip...]
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
@ 2026-06-22 23:54 ` Ackerley Tng
2026-06-23 0:09 ` Sean Christopherson
1 sibling, 1 reply; 27+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:54 UTC (permalink / raw)
To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Tarun Sahu <tarunsahu@google.com> writes:
> This patch introduces the freeze on gmem_inode which prevents
Can't find the reference now, but commit messages should take the
imperative mood and avoid "this patch" [*]
[*] https://lore.kernel.org/all/YKRWNaqzo4GVDxHP@google.com/
> the fallocate call and any new page fault allocation. This will avoid
> gmem file modification when it is being preserved
>
> Used srcu lock to synchronise the freeze call, where write blocks
> until all the reads are free. And reads are re-entrant.
>
> Incase fault fails, It return -EPERM and VM_EXIT to userspace. userspace
> must handle this properly as every new fault will fail.
>
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
>
> [...snip...]
>
> @@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> if (!IS_ERR(folio))
> return folio;
>
> + idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> + if (kvm_gmem_is_frozen(inode)) {
> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> + return ERR_PTR(-EPERM);
> + }
> +
> policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
> folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> FGP_LOCK | FGP_CREAT,
> mapping_gfp_mask(inode->i_mapping), policy);
> mpol_cond_put(policy);
>
> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +
> /*
> * External interfaces like kvm_gmem_get_pfn() support dealing
> * with hugepages to a degree, but internally, guest_memfd currently
> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> loff_t len)
> {
> + struct inode *inode = file_inode(file);
> int ret;
> + int idx;
>
> - if (!(mode & FALLOC_FL_KEEP_SIZE))
> - return -EOPNOTSUPP;
> + idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> + if (kvm_gmem_is_frozen(inode)) {
> + srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> + return -EPERM;
> + }
fallocate may eventually go to kvm_gmem_get_folio(), so that would check
kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?
>
> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> - return -EOPNOTSUPP;
> + if (!(mode & FALLOC_FL_KEEP_SIZE)) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
>
> - if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> - return -EINVAL;
> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
> +
> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
> + ret = -EINVAL;
> + goto out;
> + }
There's some reordering here. Why not let the validation happen like
before, then check kvm_gmem_is_frozen()?
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>
> [...snip...]
>
> +
> +/**
> + * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
> + * @inode: The guest_memfd inode.
> + * @freeze: True to freeze, false to unfreeze.
> + *
> + * This API is used strictly during the live update / preservation transition
> + * window to prevent host userspace and guest-side faults from making any
> + * mapping modifications (such as fallocate or page fault allocation)
> + * to the guest_memfd page cache.
> + *
> + * Synchronization Strategy (Sleepable RCU):
> + * To avoid high-contention VFS locks (like inode_lock or
> + * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
> + * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
> + * (`kvm_gmem_freeze_srcu`):
> + *
> + * Global vs. Per-Inode SRCU
> + * ======================
> + * A single system-wide global static `srcu_struct` is used instead of a
> + * per-inode SRCU structure to completely prevent unprivileged users from
> + * exhausting the host's per-CPU memory allocator. Because
> + * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
> + * is not accounted by memory cgroups (memcg),
> + * a per-inode SRCU structure would allow a tenant to bypass cgroup limits and
> + * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
> + * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
> + *
> + * Flag Modification Note:
> + * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
> + * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
> + * flags are creation-time flags which remain strictly read-only), there is
> + * no possibility of concurrent bit-modification races. Therefore, a standard
> + * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
> + * synchronization loops.
> + */
> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
> +{
> + u64 flags = READ_ONCE(GMEM_I(inode)->flags);
> +
> + if (freeze)
> + flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
> + else
> + flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
> +
> + WRITE_ONCE(GMEM_I(inode)->flags, flags);
> +
> + if (freeze)
> + synchronize_srcu(&kvm_gmem_freeze_srcu);
Why only synchronize on freeze but not unfreeze?
> +}
> +
>
> [...snip...]
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
@ 2026-06-22 23:59 ` Ackerley Tng
1 sibling, 0 replies; 27+ messages in thread
From: Ackerley Tng @ 2026-06-22 23:59 UTC (permalink / raw)
To: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, seanjc, axelrasmussen
Cc: linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
Tarun Sahu <tarunsahu@google.com> writes:
> Introduce core infrastructure to support VM preservation with LUO.
>
> First two changes are just refactoring, no functional change, third
> change introduces a new member in struct kvm.
> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
> and anon_inode_getfile() to provide a unified VM file creation API.
> - Track a weak reference to the backing file in struct kvm under
> CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
> without circular lifetime dependencies.
>
Given the above, I think this should be separate patches.
> Signed-off-by: Tarun Sahu <tarunsahu@google.com>
> ---
> include/linux/kvm_host.h | 14 +++++++
> virt/kvm/kvm_main.c | 79 +++++++++++++++++++++++++++++-----------
> virt/kvm/kvm_mm.h | 3 ++
> 3 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb06..9111a28637af 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -874,6 +874,18 @@ struct kvm {
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> /* Protected by slots_lock (for writes) and RCU (for reads) */
> struct xarray mem_attr_array;
> +#endif
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Weak reference to the VFS file backing this KVM instance. Stored
> + * without incrementing the file refcount to prevent a circular lifetime
> + * dependency (since file->private_data already pins this struct kvm).
> + * Used exclusively to resolve the file pointer back from struct kvm.
> + *
> + * Written/cleared via rcu_assign_pointer() and read locklessly under
> + * RCU (e.g. via get_file_active() to prevent ABA races).
> + */
> + struct file *vm_file;
> #endif
We didn't really talk about this during the calls, but it seems weird to
preserve a vm_file with pretty much nothing other than the vm type. The
entire VM is re-created, which means it could potentially be a
completely different VM?
In some sense it's more flexible since the guest_memfd can be restored
with some completely different VM, but it seems like it could introduce
other issues.
I think other KVM folks would probably have more thoughts here.
> char stats_id[KVM_STATS_NAME_SIZE];
> };
> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
> bool kvm_get_kvm_safe(struct kvm *kvm);
> void kvm_put_kvm(struct kvm *kvm);
> bool file_is_kvm(struct file *file);
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
> void kvm_put_kvm_no_destroy(struct kvm *kvm);
> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>
> static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
> {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1..65f0c5fb353e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -67,9 +67,6 @@
> #include <linux/kvm_dirty_ring.h>
>
>
> -/* Worst case buffer size needed for holding an integer. */
> -#define ITOA_MAX_LEN 12
> -
> MODULE_AUTHOR("Qumranet");
> MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
> MODULE_LICENSE("GPL");
> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> {
> struct kvm *kvm = filp->private_data;
>
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Clear the weak reference of the vm file.
> + * In case vm file is closed by userspace, but kvm still has
> + * other users like vCPUs, clearing this pointer ensures
> + * that we don't have a dangling pointer to a closed file.
> + *
> + * Cleared via rcu_assign_pointer() to ensure proper memory visibility
> + * for concurrent lockless readers under RCU.
> + */
> + rcu_assign_pointer(kvm->vm_file, NULL);
> +#endif
> +
> kvm_irqfd_release(kvm);
>
> kvm_put_kvm(kvm);
> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>
> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
> +{
> + struct kvm *kvm = kvm_create_vm(type, fdname);
> + struct file *file;
> +
> + if (IS_ERR(kvm))
> + return ERR_CAST(kvm);
> +
> + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> + if (IS_ERR(file)) {
> + kvm_put_kvm(kvm);
> + return file;
> + }
> +
> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
> + /*
> + * Weak reference to the file (without get_file()) to prevent a circular
> + * dependency. Safe because the file's release path clears this pointer
> + * and drops its reference to the VM.
> + *
> + * Written via rcu_assign_pointer() because the pointer can be read
> + * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
> + * get_file_active() to prevent lockless ABA races).
> + */
> + rcu_assign_pointer(kvm->vm_file, file);
> +#endif
> +
> + /*
> + * Don't call kvm_put_kvm anymore at this point; file->f_op is
> + * already set, with ->release() being kvm_vm_release(). In error
> + * cases it will be called by the final fput(file) and will take
> + * care of doing kvm_put_kvm(kvm).
> + */
> +
> + return file;
> +}
> +
> static int kvm_dev_ioctl_create_vm(unsigned long type)
> {
> char fdname[ITOA_MAX_LEN + 1];
> int r, fd;
> - struct kvm *kvm;
> struct file *file;
>
> fd = get_unused_fd_flags(O_CLOEXEC);
> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>
> snprintf(fdname, sizeof(fdname), "%d", fd);
>
> - kvm = kvm_create_vm(type, fdname);
> - if (IS_ERR(kvm)) {
> - r = PTR_ERR(kvm);
> - goto put_fd;
> - }
> -
> - file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> + file = kvm_create_vm_file(type, fdname);
> if (IS_ERR(file)) {
> r = PTR_ERR(file);
> - goto put_kvm;
> + goto put_fd;
> }
>
> - /*
> - * Don't call kvm_put_kvm anymore at this point; file->f_op is
> - * already set, with ->release() being kvm_vm_release(). In error
> - * cases it will be called by the final fput(file) and will take
> - * care of doing kvm_put_kvm(kvm).
> - */
> - kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
Notifying with file->private_data threw me off... I would rather inline
the rcu_assign_pointer() in this function and have this line read
notify(..., kvm) like before.
>
> fd_install(fd, file);
> return fd;
>
> -put_kvm:
> - kvm_put_kvm(kvm);
> put_fd:
> put_unused_fd(fd);
> return r;
> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
> kfree(env);
> }
>
> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
> +{
> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
> +}
> +
> static void kvm_init_debug(void)
> {
> const struct file_operations *fops;
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -3,6 +3,9 @@
> #ifndef __KVM_MM_H__
> #define __KVM_MM_H__ 1
>
> +/* Worst case buffer size needed for holding an integer as a string. */
> +#define ITOA_MAX_LEN 12
> +
> /*
> * Architectures can choose whether to use an rwlock or spinlock
> * for the mmu_lock. These macros, for use in common code
> --
> 2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
2026-06-22 23:54 ` Ackerley Tng
@ 2026-06-23 0:09 ` Sean Christopherson
0 siblings, 0 replies; 27+ messages in thread
From: Sean Christopherson @ 2026-06-23 0:09 UTC (permalink / raw)
To: Ackerley Tng
Cc: Tarun Sahu, Jonathan Corbet, vannapurve, fvdl, Pasha Tatashin,
Shuah Khan, sagis, aneesh.kumar, skhawaja, vipinsh,
Pratyush Yadav, david, dmatlack, mark.rutland, Paolo Bonzini,
Mike Rapoport, Alexander Graf, axelrasmussen, linux-kselftest,
kexec, linux-kernel, linux-doc, kvm, linux-mm
On Mon, Jun 22, 2026, Ackerley Tng wrote:
> Tarun Sahu <tarunsahu@google.com> writes:
>
> > This patch introduces the freeze on gmem_inode which prevents
>
> Can't find the reference now, but commit messages should take the
> imperative mood and avoid "this patch" [*]
From Documentation/process/submitting-patches.rst:
Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
to do frotz", as if you are giving orders to the codebase to change
its behaviour.
Documentation/process/maintainer-tip.rst and Documentation/process/maintainer-kvm-x86.rst
elaborate more on the preferred style (I do most of the guest_memfd maintenance,
and so for all intents and purpose it's bound by KVM x86 "rules").
^ permalink raw reply [flat|nested] 27+ messages in thread
end of thread, other threads:[~2026-06-23 0:09 UTC | newest]
Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <cover.1780676742.git.tarunsahu@google.com>
2026-06-05 17:08 ` [RFC PATCH v2 01/10] liveupdate: luo_file: Add internal APIs for file preservation Tarun Sahu
2026-06-05 17:24 ` sashiko-bot
2026-06-07 0:41 ` tarunsahu
2026-06-07 0:35 ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 02/10] liveupdate: Add LIVEUPDATE_GUEST_MEMFD config option Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 03/10] kvm: Prepare core VM structs and helpers for LUO support Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
2026-06-22 23:59 ` Ackerley Tng
2026-06-05 17:08 ` [RFC PATCH v2 04/10] kvm: kvm_luo: Allow kvm preservation with LUO Tarun Sahu
2026-06-05 17:26 ` sashiko-bot
2026-06-08 16:13 ` tarunsahu
2026-06-05 17:08 ` [RFC PATCH v2 05/10] kvm: guest_memfd: Move internal definitions and helper to new header Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings Tarun Sahu
2026-06-05 17:21 ` sashiko-bot
2026-06-08 18:20 ` tarunsahu
2026-06-22 23:54 ` Ackerley Tng
2026-06-23 0:09 ` Sean Christopherson
2026-06-05 17:08 ` [RFC PATCH v2 07/10] kvm: guest_memfd_luo: add support for guest_memfd preservation Tarun Sahu
2026-06-05 17:25 ` sashiko-bot
2026-06-08 18:22 ` tarunsahu
2026-06-22 23:27 ` Ackerley Tng
2026-06-05 17:08 ` [RFC PATCH v2 08/10] docs: add documentation for guest_memfd preservation via LUO Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 09/10] selftests: kvm: Split ____vm_create() to expose init helpers Tarun Sahu
2026-06-05 17:08 ` [RFC PATCH v2 10/10] selftests: kvm: Add guest_memfd_preservation_test Tarun Sahu
2026-06-05 17:22 ` sashiko-bot
2026-06-08 18:26 ` tarunsahu
2026-06-22 23:01 ` Ackerley Tng
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.