* [PATCH v2 01/16] liveupdate: luo_file: Add internal APIs for file preservation
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 02/16] iommu: Implement IOMMU Live update FLB callbacks Samiullah Khawaja
` (14 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Pasha Tatashin, Samiullah Khawaja, Robin Murphy, Kevin Tian,
Alex Williamson, Shuah Khan, iommu, linux-kernel, kvm,
Saeed Mahameed, Adithya Jayachandran, Parav Pandit,
Leon Romanovsky, William Tu, Pratyush Yadav, David Matlack,
Andrew Morton, Chris Li, Pranjal Shrivastava, Vipin Sharma,
YiFei Zhu
From: Pasha Tatashin <pasha.tatashin@soleen.com>
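The core liveupdate mechanism allows userspace to preserve file
descriptors. However, kernel subsystems often manage struct file
objects directly and need to participate in the preservation process
programmatically without relying solely on userspace interaction.
Add liveupdate_get_token_outgoing() and liveupdate_get_file_incoming()
so that in-kernel users can look up the token of a preserved file and
retrieve a preserved file by its token, and pass the owning session to
the file handler callbacks through struct liveupdate_file_op_args.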
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
include/linux/liveupdate.h | 21 ++++++++++
kernel/liveupdate/luo_file.c | 69 ++++++++++++++++++++++++++++++++
kernel/liveupdate/luo_internal.h | 17 ++++++++
3 files changed, 107 insertions(+)
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index 88722e5caf02..261f61998fce 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -25,6 +25,7 @@ struct file;
/**
* struct liveupdate_file_op_args - Arguments for file operation callbacks.
* @handler: The file handler being called.
+ * @session: The session this file belongs to.
* @retrieve_status: The retrieve status for the 'can_finish / finish'
* operation. A value of 0 means the retrieve has not been
* attempted, a positive value means the retrieve was
@@ -45,6 +46,7 @@ struct file;
*/
struct liveupdate_file_op_args {
struct liveupdate_file_handler *handler;
+ struct liveupdate_session *session;
int retrieve_status;
struct file *file;
u64 serialized_data;
@@ -243,6 +245,13 @@ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
void liveupdate_flb_put_incoming(struct liveupdate_flb *flb);
int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
+/* Retrieve a preserved file by its token, for in-kernel callers */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+ struct file **filep);
+
+/* Get a token for an outgoing file, or -ENOENT if file is not preserved */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+ struct file *file, u64 *tokenp);
#else /* CONFIG_LIVEUPDATE */
@@ -292,5 +301,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb,
return -EOPNOTSUPP;
}
+static inline int liveupdate_get_file_incoming(struct liveupdate_session *s,
+ u64 token, struct file **filep)
+{
+ return -EOPNOTSUPP; /* stub in the !CONFIG_LIVEUPDATE branch: @filep is left untouched */
+}
+
+static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+ struct file *file, u64 *tokenp)
+{
+ return -EOPNOTSUPP; /* stub in the !CONFIG_LIVEUPDATE branch: @tokenp is left untouched */
+}
+
#endif /* CONFIG_LIVEUPDATE */
#endif /* _LINUX_LIVEUPDATE_H */
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a0a419085e28..0aa0b4e5339f 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -323,6 +323,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
mutex_init(&luo_file->mutex);
args.handler = fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = file;
err = fh->ops->preserve(&args);
if (err)
@@ -380,6 +381,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
struct luo_file, list);
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -411,6 +413,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -432,6 +435,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
@@ -621,6 +625,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
}
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.serialized_data = luo_file->serialized_data;
err = luo_file->fh->ops->retrieve(&args);
if (err) {
@@ -654,6 +659,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
struct liveupdate_file_op_args args = {0};
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.retrieve_status = luo_file->retrieve_status;
@@ -671,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
guard(mutex)(&luo_file->mutex);
args.handler = luo_file->fh;
+ args.session = luo_session_from_file_set(file_set);
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
args.retrieve_status = luo_file->retrieve_status;
@@ -924,3 +931,65 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
luo_flb_unregister_all(fh);
list_del(&ACCESS_PRIVATE(fh, list));
}
+EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler);
+
+/**
+ * liveupdate_get_token_outgoing - Look up the token of a preserved file.
+ * @s: The outgoing liveupdate session to search.
+ * @file: The file object to look for.
+ * @tokenp: Where to store the token on success. May be NULL when the caller
+ *          only wants to know whether @file is preserved.
+ *
+ * Walks the list of files preserved in @s and compares each entry's file
+ * pointer with @file. The token recorded for the first matching entry is
+ * reported through @tokenp.
+ *
+ * This function is intended for in-kernel callers that need to correlate a
+ * file with its liveupdate token.
+ *
+ * Context: The caller must hold the session mutex.
+ * Return: 0 if the file was found, -ENOENT if it is not preserved in @s.
+ */
+int liveupdate_get_token_outgoing(struct liveupdate_session *s,
+				  struct file *file, u64 *tokenp)
+{
+	struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
+	struct luo_file *entry;
+
+	list_for_each_entry(entry, &file_set->files_list, list) {
+		if (entry->file != file)
+			continue;
+
+		if (tokenp)
+			*tokenp = entry->token;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * liveupdate_get_file_incoming - Retrieve a preserved file for in-kernel use.
+ * @s: The incoming liveupdate session (restored from the previous kernel).
+ * @token: The unique token identifying the file to retrieve.
+ * @filep: On success, set to the retrieved 'struct file'.
+ *
+ * Kernel-internal entry point that lets other subsystems fetch their
+ * preserved files after a live update. It resolves the file set of @s and
+ * hands the lookup to luo_retrieve_file(); the result of that call is
+ * returned unchanged.
+ *
+ * The caller receives a new reference to the file and must call fput() when it
+ * is no longer needed.
+ *
+ * NOTE(review): earlier wording of this comment also said that the file's
+ * lifetime is managed by LUO and that a caller wanting to keep the file must
+ * call get_file() itself, which contradicts the sentence above. Confirm the
+ * reference-ownership rule against luo_retrieve_file() before relying on it.
+ *
+ * Context: The caller must hold the mutex of a restored session.
+ * Return: 0 on success. Returns -ENOENT if no file with the matching token is
+ * found, or any other negative errno on failure.
+ */
+int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token,
+				 struct file **filep)
+{
+	struct luo_file_set *file_set = luo_file_set_from_session_locked(s);
+
+	return luo_retrieve_file(file_set, token, filep);
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 875844d7a41d..08b198802e7f 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -79,6 +79,23 @@ struct luo_session {
extern struct rw_semaphore luo_register_rwlock;
+/*
+ * Map a file set back to the struct luo_session that embeds it and return that
+ * session cast to the struct liveupdate_session handle type.
+ */
+static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set)
+{
+	return (struct liveupdate_session *)
+		container_of(file_set, struct luo_session, file_set);
+}
+
+/*
+ * Return the file set embedded in session @s. Lockdep asserts that the
+ * session mutex is held by the caller.
+ */
+static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s)
+{
+	struct luo_session *luo_sess = (struct luo_session *)s;
+
+	lockdep_assert_held(&luo_sess->mutex);
+
+	return &luo_sess->file_set;
+}
+
int luo_session_create(const char *name, struct file **filep);
int luo_session_retrieve(const char *name, struct file **filep);
int __init luo_session_setup_outgoing(void *fdt);
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 02/16] iommu: Implement IOMMU Live update FLB callbacks
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 01/16] liveupdate: luo_file: Add internal APIs for file preservation Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-05-01 21:45 ` David Matlack
2026-04-27 17:56 ` [PATCH v2 03/16] iommu: Implement IOMMU domain preservation Samiullah Khawaja
` (13 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Add liveupdate FLB for IOMMU state preservation. Use KHO preserve memory
alloc/free helper functions to allocate memory for the IOMMU Live update
FLB object and the serialization structs for device, domain and iommu.
During retrieve, walk through the preserved obj array headers and
restore each folio. Also recreate the FLB obj.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
MAINTAINERS | 9 ++
drivers/iommu/Kconfig | 12 ++
drivers/iommu/Makefile | 1 +
drivers/iommu/liveupdate.c | 198 ++++++++++++++++++++++++++++
include/linux/iommu-liveupdate.h | 18 +++
include/linux/kho/abi/iommu.h | 218 +++++++++++++++++++++++++++++++
6 files changed, 456 insertions(+)
create mode 100644 drivers/iommu/liveupdate.c
create mode 100644 include/linux/iommu-liveupdate.h
create mode 100644 include/linux/kho/abi/iommu.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 737c5ed1ce38..980041955abc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13490,6 +13490,15 @@ F: include/linux/iova.h
F: include/linux/of_iommu.h
F: rust/kernel/iommu/
+IOMMU LIVEUPDATE
+M: Samiullah Khawaja <skhawaja@google.com>
+R: Pranjal Shrivastava <praan@google.com>
+L: iommu@lists.linux.dev
+S: Maintained
+F: drivers/iommu/liveupdate.c
+F: include/linux/iommu-liveupdate.h
+F: include/linux/kho/abi/iommu.h
+
IOMMUFD
M: Jason Gunthorpe <jgg@nvidia.com>
M: Kevin Tian <kevin.tian@intel.com>
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f86262b11416..a2f416df3214 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -403,6 +403,18 @@ config IOMMU_DEBUG_PAGEALLOC
line to activate the runtime checks.
If unsure, say N.
+
+config IOMMU_LIVEUPDATE
+ bool "IOMMU live update state preservation support"
+ depends on LIVEUPDATE && IOMMUFD
+ help
+ Enable support for preserving IOMMU state across a kexec live update.
+
+ This allows devices managed by iommufd to maintain their DMA mappings
+ during a kexec-based kernel update.
+
+ If unsure, say N.
+
endif # IOMMU_SUPPORT
source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 0275821f4ef9..c333f4a3ada3 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
+obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
new file mode 100644
index 000000000000..a26099b145c3
--- /dev/null
+++ b/drivers/iommu/liveupdate.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#define pr_fmt(fmt) "iommu: liveupdate: " fmt
+
+#include <linux/kexec_handover.h>
+#include <linux/liveupdate.h>
+#include <linux/iommu-liveupdate.h>
+#include <linux/iommu.h>
+#include <linux/errno.h>
+
+/*
+ * Restore every array of the preserved chain that starts at @array_phys.
+ *
+ * Each array is restored with kho_restore_folio() and the chain is followed
+ * through the header's next_array_phys until that field reads 0.
+ *
+ * Returns the virtual address of the first array, or NULL if @array_phys is 0.
+ */
+static void *iommu_liveupdate_restore_array(u64 array_phys)
+{
+	struct iommu_array_hdr_ser *hdr;
+	u64 phys = array_phys;
+
+	if (!array_phys)
+		return NULL;
+
+	do {
+		/*
+		 * A failed restore of preserved IOMMU state is treated as
+		 * fatal.
+		 *
+		 * The previous kernel left translation enabled on the
+		 * preserved IOMMUs and the preserved devices still have their
+		 * IOMMU domains. If the state cannot be restored, memory
+		 * mapped into those domains may already have been corrupted
+		 * by the devices, and there is no way to verify its
+		 * integrity. BUG_ON is the safest option at this point.
+		 */
+		BUG_ON(!kho_restore_folio(phys));
+		hdr = phys_to_virt(phys);
+		phys = hdr->next_array_phys;
+	} while (phys);
+
+	return phys_to_virt(array_phys);
+}
+
+/*
+ * Hand every array of the chain starting at @array_phys to
+ * kho_unpreserve_free(). A zero @array_phys is a no-op.
+ */
+static void iommu_liveupdate_unpreserve_free(u64 array_phys)
+{
+	u64 phys = array_phys;
+
+	while (phys) {
+		struct iommu_array_hdr_ser *hdr = phys_to_virt(phys);
+
+		/* Read the link before the header is given away below. */
+		phys = hdr->next_array_phys;
+		kho_unpreserve_free(hdr);
+	}
+}
+
+/*
+ * Drop a folio reference on every array of the chain starting at
+ * @array_phys. A zero @array_phys is a no-op.
+ */
+static void iommu_liveupdate_folio_put(u64 array_phys)
+{
+	u64 phys = array_phys;
+
+	while (phys) {
+		struct iommu_array_hdr_ser *hdr = phys_to_virt(phys);
+
+		/* Read the link before the reference on this array is dropped. */
+		phys = hdr->next_array_phys;
+		folio_put(virt_to_folio(hdr));
+	}
+}
+
+/*
+ * Release everything owned by @obj: the three preserved array chains, the
+ * top-level serialization struct, and finally @obj itself.
+ */
+static void iommu_liveupdate_flb_free(struct iommu_flb_obj *obj)
+{
+	struct iommu_flb_ser *ser = obj->ser;
+
+	/*
+	 * iommu_liveupdate_unpreserve_free() loops only while the address is
+	 * non-zero, so a zero address needs no separate guard here.
+	 */
+	iommu_liveupdate_unpreserve_free(ser->iommu_domain_array_phys);
+	iommu_liveupdate_unpreserve_free(ser->device_array_phys);
+	iommu_liveupdate_unpreserve_free(ser->iommu_array_phys);
+
+	kho_unpreserve_free(ser);
+	kfree(obj);
+}
+
+/*
+ * FLB preserve callback: allocate the tracking object and the preserved
+ * serialization memory.
+ *
+ * Allocates a struct iommu_flb_obj with kzalloc_obj() and four buffers with
+ * kho_alloc_preserve(): the top-level struct iommu_flb_ser and one PAGE_SIZE
+ * array each for domains, devices and IOMMUs. The physical address of each
+ * array is recorded in the top-level struct. On success the object is
+ * returned through @argp->obj and the physical address of the top-level
+ * struct through @argp->data.
+ *
+ * Return: 0 on success, -ENOMEM if the tracking object cannot be allocated,
+ * or the error carried by the failing kho_alloc_preserve() result. On failure
+ * everything allocated so far is released.
+ */
+static int iommu_liveupdate_flb_preserve(struct liveupdate_flb_op_args *argp)
+{
+	struct iommu_flb_obj *obj;
+	struct iommu_flb_ser *ser;
+	void *buf;
+	int err;
+
+	/* obj exists only in the current kernel to track preserved state */
+	obj = kzalloc_obj(*obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	mutex_init(&obj->lock);
+
+	/* buf is allocated via KHO and will survive the kexec */
+	buf = kho_alloc_preserve(sizeof(*ser));
+	if (IS_ERR(buf)) {
+		err = PTR_ERR(buf);
+		goto out_free_obj;
+	}
+	ser = buf;
+	obj->ser = ser;
+
+	buf = kho_alloc_preserve(PAGE_SIZE);
+	if (IS_ERR(buf)) {
+		err = PTR_ERR(buf);
+		goto out_free_ser;
+	}
+	obj->curr_domain_array = buf;
+	ser->iommu_domain_array_phys = virt_to_phys(obj->curr_domain_array);
+
+	buf = kho_alloc_preserve(PAGE_SIZE);
+	if (IS_ERR(buf)) {
+		err = PTR_ERR(buf);
+		goto out_free_domains;
+	}
+	obj->curr_device_array = buf;
+	ser->device_array_phys = virt_to_phys(obj->curr_device_array);
+
+	buf = kho_alloc_preserve(PAGE_SIZE);
+	if (IS_ERR(buf)) {
+		err = PTR_ERR(buf);
+		goto out_free_devices;
+	}
+	obj->curr_iommu_array = buf;
+	ser->iommu_array_phys = virt_to_phys(obj->curr_iommu_array);
+
+	argp->obj = obj;
+	argp->data = virt_to_phys(ser);
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+out_free_devices:
+	kho_unpreserve_free(obj->curr_device_array);
+out_free_domains:
+	kho_unpreserve_free(obj->curr_domain_array);
+out_free_ser:
+	kho_unpreserve_free(obj->ser);
+out_free_obj:
+	kfree(obj);
+	return err;
+}
+
+/* FLB unpreserve callback: release the object passed in @argp->obj. */
+static void iommu_liveupdate_flb_unpreserve(struct liveupdate_flb_op_args *argp)
+{
+	struct iommu_flb_obj *obj = argp->obj;
+
+	iommu_liveupdate_flb_free(obj);
+}
+
+/*
+ * FLB finish callback: drop the folio references on the three array chains
+ * and on the top-level serialization struct, then free the tracking object.
+ */
+static void iommu_liveupdate_flb_finish(struct liveupdate_flb_op_args *argp)
+{
+	struct iommu_flb_obj *obj = argp->obj;
+	struct iommu_flb_ser *ser = obj->ser;
+
+	/* The chain heads live in @ser, so walk them before @ser is put. */
+	iommu_liveupdate_folio_put(ser->iommu_domain_array_phys);
+	iommu_liveupdate_folio_put(ser->device_array_phys);
+	iommu_liveupdate_folio_put(ser->iommu_array_phys);
+
+	folio_put(virt_to_folio(ser));
+	kfree(obj);
+}
+
+/*
+ * FLB retrieve callback: rebuild the tracking object from the preserved
+ * serialization struct whose physical address is in @argp->data.
+ *
+ * Restores the top-level struct and every array chain it points to, then
+ * returns the new object through @argp->obj.
+ *
+ * Return: 0 on success, -ENOMEM if the tracking object cannot be allocated.
+ *
+ * NOTE(review): the -ENOMEM path returns before any preserved folio is
+ * restored, while a failed restore is treated as fatal. Confirm whether this
+ * allocation failure should be fatal as well.
+ */
+static int iommu_liveupdate_flb_retrieve(struct liveupdate_flb_op_args *argp)
+{
+	struct iommu_flb_obj *obj;
+	struct iommu_flb_ser *ser;
+
+	obj = kzalloc_obj(*obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	/* Data must be present and valid from the previous kernel */
+	BUG_ON(!kho_restore_folio(argp->data));
+	ser = phys_to_virt(argp->data);
+
+	mutex_init(&obj->lock);
+	obj->ser = ser;
+	obj->curr_domain_array = iommu_liveupdate_restore_array(ser->iommu_domain_array_phys);
+	obj->curr_device_array = iommu_liveupdate_restore_array(ser->device_array_phys);
+	obj->curr_iommu_array = iommu_liveupdate_restore_array(ser->iommu_array_phys);
+
+	argp->obj = obj;
+	return 0;
+}
+
+/* FLB callbacks for the IOMMU, listed in the order they are expected to run. */
+static struct liveupdate_flb_ops iommu_flb_ops = {
+	.preserve = iommu_liveupdate_flb_preserve,
+	.unpreserve = iommu_liveupdate_flb_unpreserve,
+	.retrieve = iommu_liveupdate_flb_retrieve,
+	.finish = iommu_liveupdate_flb_finish,
+};
+
+static struct liveupdate_flb iommu_flb = {
+ .compatible = IOMMU_LUO_FLB_COMPATIBLE, /* version string from the ABI header; it must be bumped when a serialization struct changes */
+ .ops = &iommu_flb_ops,
+};
+
+int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler)
+{
+ return liveupdate_register_flb(handler, &iommu_flb); /* registers the file-scope iommu_flb with @handler; the result is passed through unchanged */
+}
+EXPORT_SYMBOL(iommu_liveupdate_register_flb);
+
+void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler)
+{
+ liveupdate_unregister_flb(handler, &iommu_flb); /* counterpart of iommu_liveupdate_register_flb() for the same iommu_flb */
+}
+EXPORT_SYMBOL(iommu_liveupdate_unregister_flb);
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
new file mode 100644
index 000000000000..3d1c65ed76fa
--- /dev/null
+++ b/include/linux/iommu-liveupdate.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#ifndef _LINUX_IOMMU_LIVEUPDATE_H
+#define _LINUX_IOMMU_LIVEUPDATE_H
+
+#include <linux/iommu.h>
+#include <linux/liveupdate.h>
+#include <linux/kho/abi/iommu.h>
+
+int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler);
+void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler);
+
+#endif /* _LINUX_IOMMU_LIVEUPDATE_H */
diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h
new file mode 100644
index 000000000000..37b967820f14
--- /dev/null
+++ b/include/linux/kho/abi/iommu.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_IOMMU_H
+#define _LINUX_KHO_ABI_IOMMU_H
+
+#include <linux/mutex_types.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: IOMMU File-Lifecycle Bound (FLB) Live Update ABI
+ *
+ * This header defines the ABI for preserving IOMMU state across kexec using
+ * Live Update File-Lifecycle Bound (FLB) data.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the IOMMU_LUO_FLB_COMPATIBLE string.
+ *
+ * Memory Layout of Serialization Structures:
+ * ==========================================
+ *
+ * Each serialized type (IOMMU, Domain, Device) is stored in a linked list of
+ * arrays. The first array is allocated initially. When an array is full, a new
+ * array is allocated and its physical address is stored in the next_array_phys
+ * field of the hdr of the current array.
+ *
+ * Top Level (struct iommu_flb_ser):
+ * +---------------------------+
+ * | - iommu_array_phys |
+ * | - iommu_domain_array_phys |
+ * | - device_array_phys |
+ * +---------------------------+
+ *
+ * Each Array contains the serialized objects of the respective type. For
+ * example see below the representation of struct iommu_domain_array_ser.
+ *
+ * +---------------------------+ +---------------------------+
+ * | iommu_domain_array_ser |-->| iommu_domain_array_ser |--> NULL
+ * | - hdr.next_array_phys | | - hdr.next_array_phys |
+ * | - hdr.nr_objects | | - hdr.nr_objects |
+ * | | | |
+ * | objects[]: | | objects[]: |
+ * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] |
+ * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] |
+ * | ... | | ... |
+ * +---------------------------+ +---------------------------+
+ *
+ * Each object in the array starts with a common header (iommu_hdr_ser).
+ * For example, the layout of struct iommu_domain_ser is:
+ *
+ * +-----------------------------+
+ * | iommu_domain_ser |
+ * | +-------------------------+ |
+ * | | hdr (iommu_hdr_ser) | |
+ * | | - ref_count | |
+ * | | - deleted / incoming | |
+ * | +-------------------------+ |
+ * | - top_table_phys | |
+ * | - top_level | |
+ * | - restored_domain | |
+ * +-----------------------------+
+ *
+ * This pattern applies identically to iommu_device_ser and iommu_hw_ser.
+ */
+
+#define IOMMU_LUO_FLB_COMPATIBLE "iommu-liveupdate-v1"
+
+enum iommu_type_ser {
+ IOMMU_INVALID, /* value 0; the only enumerator in this definition */
+};
+
+/**
+ * struct iommu_hdr_ser - Common header for all serialized IOMMU objects
+ * @ref_count: Reference count for the object
+ * @deleted: Flag indicating if the object is deleted
+ * @incoming: Flag indicating if the object was preserved in previous kernel
+ *
+ * NOTE(review): @deleted and @incoming are C bitfields, whose placement within
+ * the storage unit is implementation-defined. Confirm that this is acceptable
+ * for a layout shared between two different kernel builds.
+ */
+struct iommu_hdr_ser {
+ u32 ref_count;
+ u32 deleted:1;
+ u32 incoming:1;
+} __packed;
+
+/**
+ * struct iommu_domain_ser - Serialized state of an IOMMU domain
+ * @hdr: Common object header
+ * @top_table_phys: Physical address of the top-level page table
+ * @top_level: Level of the top-level page table
+ * @vasz: Virtual Address Size
+ * @sign_extend: FEAT_SIGN_EXTEND is enabled for this domain
+ * @restored_domain: Pointer to the restored domain (valid only after restore)
+ *
+ * NOTE(review): @restored_domain is a kernel pointer stored inside a
+ * serialized struct and @sign_extend is a C bitfield. Confirm that both are
+ * acceptable in a layout shared between two different kernel builds.
+ */
+struct iommu_domain_ser {
+ struct iommu_hdr_ser hdr;
+ u64 top_table_phys;
+ u64 top_level;
+ u32 vasz;
+ u32 sign_extend:1;
+ struct iommu_domain *restored_domain;
+} __packed;
+
+/**
+ * struct iommu_dev_map_ser - Serialized mapping between device, domain,
+ * and IOMMU instance.
+ * @attachment_id: ID of the attachment between device and domain.
+ * @domain_phys: Physical address of the domain (presumably of its serialized
+ *               struct iommu_domain_ser - TODO confirm)
+ * @iommu_phys: Physical address of the IOMMU (presumably of its serialized
+ *              struct iommu_hw_ser - TODO confirm)
+ */
+struct iommu_dev_map_ser {
+ u64 attachment_id;
+ u64 domain_phys;
+ u64 iommu_phys;
+} __packed;
+
+/**
+ * struct iommu_device_ser - Serialized state of a device
+ * @hdr: Common object header
+ * @devid: Device ID
+ * @pci_domain_nr: PCI domain number
+ * @domain_iommu_ser: Mapping of this device to its domain and IOMMU
+ *                    (see struct iommu_dev_map_ser)
+ */
+struct iommu_device_ser {
+ struct iommu_hdr_ser hdr;
+ u32 devid;
+ u32 pci_domain_nr;
+ struct iommu_dev_map_ser domain_iommu_ser;
+} __packed;
+
+/**
+ * struct iommu_hw_ser - Serialized state of an IOMMU instance
+ * @hdr: Common object header
+ * @token: Unique token for the IOMMU
+ * @type: Type of IOMMU this serialized state belongs to (presumably an
+ *        enum iommu_type_ser value stored as a u64 - TODO confirm)
+ */
+struct iommu_hw_ser {
+ struct iommu_hdr_ser hdr;
+ u64 token;
+ u64 type;
+} __packed;
+
+/**
+ * struct iommu_array_hdr_ser - Header for an array of serialized objects
+ * @next_array_phys: Physical address of the next array of objects, or 0 when
+ *                   this is the last array in the chain
+ * @nr_objects: Number of objects in the current array
+ */
+struct iommu_array_hdr_ser {
+ u64 next_array_phys;
+ u64 nr_objects;
+} __packed;
+
+/**
+ * struct iommu_hw_array_ser - An array containing serialized IOMMU HWs
+ * @hdr: Array header
+ * @objects: Array of serialized IOMMU HWs
+ */
+struct iommu_hw_array_ser {
+ struct iommu_array_hdr_ser hdr;
+ struct iommu_hw_ser objects[];
+} __packed;
+
+/**
+ * struct iommu_domain_array_ser - An array containing serialized domains
+ * @hdr: Array header; its nr_objects field counts the entries in @objects
+ * @objects: Array of serialized domains
+ */
+struct iommu_domain_array_ser {
+ struct iommu_array_hdr_ser hdr;
+ struct iommu_domain_ser objects[];
+} __packed;
+
+/**
+ * struct iommu_device_array_ser - An array containing serialized devices
+ * @hdr: Array header; its nr_objects field counts the entries in @objects
+ * @objects: Array of serialized devices
+ */
+struct iommu_device_array_ser {
+ struct iommu_array_hdr_ser hdr;
+ struct iommu_device_ser objects[];
+} __packed;
+
+/**
+ * struct iommu_flb_ser - Top-level serialization structure
+ * @iommu_array_phys: Physical address of the first array of IOMMU HWs
+ * @iommu_domain_array_phys: Physical address of the first array of domains
+ * @device_array_phys: Physical address of the first array of devices
+ *
+ * Each address is the head of a chain of arrays linked through
+ * iommu_array_hdr_ser.next_array_phys.
+ */
+struct iommu_flb_ser {
+ u64 iommu_array_phys;
+ u64 iommu_domain_array_phys;
+ u64 device_array_phys;
+} __packed;
+
+/**
+ * struct iommu_flb_obj - FLB object allocated in current kernel pointing to
+ *                        preserved state in FLB
+ * @lock: Mutex protecting the serialized objects during concurrent
+ *        preservation
+ * @ser: Pointer to the serialized state in FLB
+ * @curr_iommu_array: Pointer to the current array of IOMMU instances
+ * @curr_domain_array: Pointer to the current array of domains
+ * @curr_device_array: Pointer to the current array of devices
+ *
+ * This object exists only in the running kernel and is never serialized, so
+ * unlike the *_ser structs it is deliberately not __packed: packing would
+ * reduce the alignment of the embedded mutex and of the pointers to 1.
+ */
+struct iommu_flb_obj {
+	struct mutex lock;
+	struct iommu_flb_ser *ser;
+
+	struct iommu_hw_array_ser *curr_iommu_array;
+	struct iommu_domain_array_ser *curr_domain_array;
+	struct iommu_device_array_ser *curr_device_array;
+};
+
+#endif /* _LINUX_KHO_ABI_IOMMU_H */
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH v2 02/16] iommu: Implement IOMMU Live update FLB callbacks
2026-04-27 17:56 ` [PATCH v2 02/16] iommu: Implement IOMMU Live update FLB callbacks Samiullah Khawaja
@ 2026-05-01 21:45 ` David Matlack
0 siblings, 0 replies; 22+ messages in thread
From: David Matlack @ 2026-05-01 21:45 UTC (permalink / raw)
To: Samiullah Khawaja
Cc: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, Andrew Morton, Chris Li,
Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
On 2026-04-27 05:56 PM, Samiullah Khawaja wrote:
> Add liveupdate FLB for IOMMU state preservation. Use KHO preserve memory
> alloc/free helper functions to allocate memory for the IOMMU Live update
> FLB object and the serialization structs for device, domain and iommu.
>
> During retrieve, walk through the preserved obj array headers and
> restore each folio. Also recreate the FLB obj.
>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> +static void *iommu_liveupdate_restore_array(u64 array_phys)
> +{
> + struct iommu_array_hdr_ser *array_hdr;
> + void *vaddr = array_phys ? phys_to_virt(array_phys) : NULL;
> +
> + while (array_phys) {
> + /*
> + * Failure to restore preserved IOMMU state is considered fatal.
> + *
> + * This is because the IOMMU translations for preserved IOMMUs
> + * were kept enabled in the previous kernel and the preserved
> + * devices have their IOMMU domains still present. Not being
> + * able to restore means that the memory mapped into preserved
> + * domains might be already corrupted by the preserved devices.
> + *
> + * There is no way to confirm the integrity of the memory that
> + * was mapped. BUG_ON is the safest option at this point.
> + */
> + BUG_ON(!kho_restore_folio(array_phys));
> + array_hdr = phys_to_virt(array_phys);
> + array_phys = array_hdr->next_array_phys;
> + }
> +
> + return vaddr;
> +}
> +static int iommu_liveupdate_flb_retrieve(struct liveupdate_flb_op_args *argp)
> +{
> + struct iommu_flb_obj *obj;
> + struct iommu_flb_ser *ser;
> +
> + obj = kzalloc_obj(*obj, GFP_KERNEL);
> + if (!obj)
> + return -ENOMEM;
Should this be considered fatal for the same reason
iommu_liveupdate_restore_array() is considered fatal? If anything in
iommu_liveupdate_flb_retrieve() fails then the risk of corruption as
described in iommu_liveupdate_restore_array() is possible.
> +
> + /* Data must be present and valid from the previous kernel */
> + BUG_ON(!kho_restore_folio(argp->data));
> +
> + mutex_init(&obj->lock);
> + ser = phys_to_virt(argp->data);
> + obj->ser = ser;
> +
> + obj->curr_domain_array = iommu_liveupdate_restore_array(ser->iommu_domain_array_phys);
> + obj->curr_device_array = iommu_liveupdate_restore_array(ser->device_array_phys);
> + obj->curr_iommu_array = iommu_liveupdate_restore_array(ser->iommu_array_phys);
> + argp->obj = obj;
> + return 0;
> +}
> +
> +static struct liveupdate_flb_ops iommu_flb_ops = {
> + .preserve = iommu_liveupdate_flb_preserve,
> + .unpreserve = iommu_liveupdate_flb_unpreserve,
> + .finish = iommu_liveupdate_flb_finish,
> + .retrieve = iommu_liveupdate_flb_retrieve,
nit: I think it's helpful to put these in the order they are expected to
be called.
.preserve = iommu_liveupdate_flb_preserve,
.unpreserve = iommu_liveupdate_flb_unpreserve,
.retrieve = iommu_liveupdate_flb_retrieve,
.finish = iommu_liveupdate_flb_finish,
> diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h
> new file mode 100644
> index 000000000000..37b967820f14
> +enum iommu_type_ser {
> + IOMMU_INVALID,
> +};
Please document this enum.
> +
> +/**
> + * struct iommu_hdr_ser - Common header for all serialized IOMMU objects
> + * @ref_count: Reference count for the object
> + * @deleted: Flag indicating if the object is deleted
> + * @incoming: Flag indicating if the object was preserved in previous kernel
> + */
> +struct iommu_hdr_ser {
> + u32 ref_count;
> + u32 deleted:1;
> + u32 incoming:1;
Are C bitfields safe to use in Live Update ABI?
> +} __packed;
> +/**
> + * struct iommu_flb_obj - FLB object allocated in current kernel pointing to
> + * preserved state in FLB
> + * @lock: Mutex protecting the object
> + * @ser: Pointer to the serialized state in FLB
> + * @curr_iommu_array: Pointer to the current array of IOMMU instances
> + * @curr_domain_array: Pointer to the current array of domains
> + * @curr_device_array: Pointer to the current array of devices
> + */
> +struct iommu_flb_obj {
> + /* @lock: Protects the serialized objects during concurrent preservation */
> + struct mutex lock;
> + struct iommu_flb_ser *ser;
> +
> + struct iommu_hw_array_ser *curr_iommu_array;
> + struct iommu_domain_array_ser *curr_domain_array;
> + struct iommu_device_array_ser *curr_device_array;
> +} __packed;
This struct is not ABI so it should not be __packed nor defined in this
file. I haven't read the whole series yet but this definition can
probably go in drivers/iommu/liveupdate.c.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v2 03/16] iommu: Implement IOMMU domain preservation
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 01/16] liveupdate: luo_file: Add internal APIs for file preservation Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 02/16] iommu: Implement IOMMU Live update FLB callbacks Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-05-01 22:08 ` David Matlack
2026-04-27 17:56 ` [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation Samiullah Khawaja
` (12 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Add IOMMU domain ops that can be implemented by the IOMMU drivers if
they support IOMMU domain preservation across liveupdate. The new IOMMU
domain preserve, unpreserve and restore APIs call these ops to perform
respective live update operations.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/liveupdate.c | 97 ++++++++++++++++++++++++++++++++
include/linux/iommu-liveupdate.h | 14 +++++
include/linux/iommu.h | 13 +++++
3 files changed, 124 insertions(+)
diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
index a26099b145c3..f71f14518248 100644
--- a/drivers/iommu/liveupdate.c
+++ b/drivers/iommu/liveupdate.c
@@ -13,6 +13,9 @@
#include <linux/iommu.h>
#include <linux/errno.h>
+#define iommu_max_objs_per_page(_array) \
+ ((PAGE_SIZE - sizeof(struct iommu_array_hdr_ser)) / sizeof((_array)->objects[0]))
+
static void *iommu_liveupdate_restore_array(u64 array_phys)
{
struct iommu_array_hdr_ser *array_hdr;
@@ -196,3 +199,97 @@ void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler)
liveupdate_unregister_flb(handler, &iommu_flb);
}
EXPORT_SYMBOL(iommu_liveupdate_unregister_flb);
+
+static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs)
+{
+ struct iommu_array_hdr_ser *curr_array = *curr_array_ptr;
+ struct iommu_array_hdr_ser *next_array;
+
+ if (curr_array->nr_objects >= max_objs) {
+ next_array = kho_alloc_preserve(PAGE_SIZE);
+ if (IS_ERR(next_array))
+ return PTR_ERR(next_array);
+
+ curr_array->next_array_phys = virt_to_phys(next_array);
+ *curr_array_ptr = next_array;
+ curr_array = next_array;
+ }
+
+ return curr_array->nr_objects++;
+}
+
+static struct iommu_domain_ser *alloc_iommu_domain_ser(struct iommu_flb_obj *flb)
+{
+ int idx;
+
+ idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_domain_array,
+ iommu_max_objs_per_page(flb->curr_domain_array));
+ if (idx < 0)
+ return ERR_PTR(idx);
+
+ flb->curr_domain_array->objects[idx].hdr.ref_count = 1;
+ return &flb->curr_domain_array->objects[idx];
+}
+
+int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser)
+{
+ struct iommu_domain_ser *domain_ser;
+ struct iommu_flb_obj *flb_obj;
+ int ret;
+
+ if (!domain->ops->preserve)
+ return -EOPNOTSUPP;
+
+ ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&flb_obj->lock);
+ domain_ser = alloc_iommu_domain_ser(flb_obj);
+ if (IS_ERR(domain_ser))
+ return PTR_ERR(domain_ser);
+
+ ret = domain->ops->preserve(domain, domain_ser);
+ if (ret) {
+ domain_ser->hdr.deleted = true;
+ return ret;
+ }
+
+ domain->preserved_state = domain_ser;
+ *ser = domain_ser;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_preserve);
+
+void iommu_domain_unpreserve(struct iommu_domain *domain)
+{
+ struct iommu_domain_ser *domain_ser;
+ struct iommu_flb_obj *flb_obj;
+ int ret;
+
+ if (!domain->ops->unpreserve)
+ return;
+
+ ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj);
+ if (WARN_ON(ret))
+ return;
+
+ guard(mutex)(&flb_obj->lock);
+
+ if (!domain->preserved_state)
+ return;
+
+ /*
+ * There is no check for attached devices here. The correctness relies
+ * on the Live Update Orchestrator's session lifecycle. All resources
+ * (iommufd, vfio devices) are preserved within a single session. If the
+ * session is torn down, the .unpreserve callbacks for all files will be
+ * invoked, ensuring a consistent cleanup without needing explicit
+ * refcounting for the serialized objects here.
+ */
+ domain_ser = domain->preserved_state;
+ domain->ops->unpreserve(domain, domain_ser);
+ domain_ser->hdr.deleted = true;
+ domain->preserved_state = NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_domain_unpreserve);
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
index 3d1c65ed76fa..6019cfc27428 100644
--- a/include/linux/iommu-liveupdate.h
+++ b/include/linux/iommu-liveupdate.h
@@ -12,6 +12,20 @@
#include <linux/liveupdate.h>
#include <linux/kho/abi/iommu.h>
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
+void iommu_domain_unpreserve(struct iommu_domain *domain);
+#else
+static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommu_domain_unpreserve(struct iommu_domain *domain)
+{
+}
+#endif
+
int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler);
void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6f5d1dec3f89..3853a3946733 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/of.h>
#include <linux/iova_bitmap.h>
+#include <linux/kho/abi/iommu.h>
#include <uapi/linux/iommufd.h>
#define IOMMU_READ (1 << 0)
@@ -249,6 +250,10 @@ struct iommu_domain {
struct list_head next;
};
};
+
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ struct iommu_domain_ser *preserved_state;
+#endif
};
static inline bool iommu_is_dma_domain(struct iommu_domain *domain)
@@ -752,6 +757,11 @@ struct iommu_ops {
* specific mechanisms.
* @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
* @free: Release the domain after use.
+ * @preserve: Preserve the iommu domain for liveupdate.
+ * Returns 0 on success, a negative errno on failure.
+ * @unpreserve: Unpreserve the iommu domain that was preserved earlier.
+ * @restore: Restore the iommu domain after liveupdate.
+ * Returns 0 on success, a negative errno on failure.
*/
struct iommu_domain_ops {
int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
@@ -782,6 +792,9 @@ struct iommu_domain_ops {
unsigned long quirks);
void (*free)(struct iommu_domain *domain);
+ int (*preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser);
+ void (*unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser);
+ int (*restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser);
};
/**
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH v2 03/16] iommu: Implement IOMMU domain preservation
2026-04-27 17:56 ` [PATCH v2 03/16] iommu: Implement IOMMU domain preservation Samiullah Khawaja
@ 2026-05-01 22:08 ` David Matlack
2026-05-04 18:33 ` Samiullah Khawaja
0 siblings, 1 reply; 22+ messages in thread
From: David Matlack @ 2026-05-01 22:08 UTC (permalink / raw)
To: Samiullah Khawaja
Cc: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, Andrew Morton, Chris Li,
Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
On 2026-04-27 05:56 PM, Samiullah Khawaja wrote:
> Add IOMMU domain ops that can be implemented by the IOMMU drivers if
> they support IOMMU domain preservation across liveupdate. The new IOMMU
> domain preserve, unpreserve and restore APIs call these ops to perform
> respective live update operations.
>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> +static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs)
> +{
> + struct iommu_array_hdr_ser *curr_array = *curr_array_ptr;
> + struct iommu_array_hdr_ser *next_array;
> +
There's a trade-off being made in this function to leak deleted array
elements instead of trying to reuse them that warrants a comment.
> + if (curr_array->nr_objects >= max_objs) {
> + next_array = kho_alloc_preserve(PAGE_SIZE);
> + if (IS_ERR(next_array))
> + return PTR_ERR(next_array);
> +
> + curr_array->next_array_phys = virt_to_phys(next_array);
> + *curr_array_ptr = next_array;
> + curr_array = next_array;
> + }
> +
> + return curr_array->nr_objects++;
> +}
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH v2 03/16] iommu: Implement IOMMU domain preservation
2026-05-01 22:08 ` David Matlack
@ 2026-05-04 18:33 ` Samiullah Khawaja
0 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-05-04 18:33 UTC (permalink / raw)
To: David Matlack
Cc: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, Andrew Morton, Chris Li,
Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
On Fri, May 01, 2026 at 10:08:47PM +0000, David Matlack wrote:
>On 2026-04-27 05:56 PM, Samiullah Khawaja wrote:
>> Add IOMMU domain ops that can be implemented by the IOMMU drivers if
>> they support IOMMU domain preservation across liveupdate. The new IOMMU
>> domain preserve, unpreserve and restore APIs call these ops to perform
>> respective live update operations.
>>
>> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
>
>> +static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs)
>> +{
>> + struct iommu_array_hdr_ser *curr_array = *curr_array_ptr;
>> + struct iommu_array_hdr_ser *next_array;
>> +
>
>There's a trade-off being made in this function to leak deleted array
>elements instead of trying to reuse them that warrants a comment.
Are you referring to the "deleted" flag in the HDR of each object? Yes,
for simplicity we don't search the linked list of arrays for deleted
objects to reuse. I will add a comment that says this.
>
>> + if (curr_array->nr_objects >= max_objs) {
>> + next_array = kho_alloc_preserve(PAGE_SIZE);
>> + if (IS_ERR(next_array))
>> + return PTR_ERR(next_array);
>> +
>> + curr_array->next_array_phys = virt_to_phys(next_array);
>> + *curr_array_ptr = next_array;
>> + curr_array = next_array;
>> + }
>> +
>> + return curr_array->nr_objects++;
>> +}
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (2 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 03/16] iommu: Implement IOMMU domain preservation Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-05-01 22:42 ` David Matlack
2026-04-27 17:56 ` [PATCH v2 05/16] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages Samiullah Khawaja
` (11 subsequent siblings)
15 siblings, 1 reply; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Add IOMMU ops to preserve/unpreserve a device. These can be implemented
by the IOMMU drivers that support preservation of devices that have
their IOMMU domains preserved. During device preservation the state of
the associated IOMMU is also preserved as a dependency.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/liveupdate.c | 162 +++++++++++++++++++++++++++++++
include/linux/iommu-liveupdate.h | 33 +++++++
include/linux/iommu.h | 20 ++++
3 files changed, 215 insertions(+)
diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
index f71f14518248..765d042e22e3 100644
--- a/drivers/iommu/liveupdate.c
+++ b/drivers/iommu/liveupdate.c
@@ -11,6 +11,7 @@
#include <linux/liveupdate.h>
#include <linux/iommu-liveupdate.h>
#include <linux/iommu.h>
+#include <linux/pci.h>
#include <linux/errno.h>
#define iommu_max_objs_per_page(_array) \
@@ -293,3 +294,164 @@ void iommu_domain_unpreserve(struct iommu_domain *domain)
domain->preserved_state = NULL;
}
EXPORT_SYMBOL_GPL(iommu_domain_unpreserve);
+
+static struct iommu_hw_ser *alloc_iommu_hw_ser(struct iommu_flb_obj *flb)
+{
+ int idx;
+
+ idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_iommu_array,
+ iommu_max_objs_per_page(flb->curr_iommu_array));
+ if (idx < 0)
+ return ERR_PTR(idx);
+
+ flb->curr_iommu_array->objects[idx].hdr.ref_count = 1;
+ return &flb->curr_iommu_array->objects[idx];
+}
+
+static int iommu_preserve_locked(struct iommu_device *iommu,
+ struct iommu_flb_obj *flb_obj)
+{
+ struct iommu_hw_ser *iommu_hw_ser;
+ int ret;
+
+ if (!iommu->ops->preserve)
+ return -EOPNOTSUPP;
+
+ lockdep_assert_held(&flb_obj->lock);
+ if (iommu->outgoing_preserved_state) {
+ iommu->outgoing_preserved_state->hdr.ref_count++;
+ return 0;
+ }
+
+ iommu_hw_ser = alloc_iommu_hw_ser(flb_obj);
+ if (IS_ERR(iommu_hw_ser))
+ return PTR_ERR(iommu_hw_ser);
+
+ ret = iommu->ops->preserve(iommu, iommu_hw_ser);
+ if (ret) {
+ iommu_hw_ser->hdr.deleted = true;
+ return ret;
+ }
+
+ iommu->outgoing_preserved_state = iommu_hw_ser;
+ return ret;
+}
+
+static void iommu_unpreserve_locked(struct iommu_device *iommu,
+ struct iommu_flb_obj *flb_obj)
+{
+ struct iommu_hw_ser *iommu_hw_ser = iommu->outgoing_preserved_state;
+
+ lockdep_assert_held(&flb_obj->lock);
+ iommu_hw_ser->hdr.ref_count--;
+ if (iommu_hw_ser->hdr.ref_count)
+ return;
+
+ iommu->outgoing_preserved_state = NULL;
+ iommu->ops->unpreserve(iommu, iommu_hw_ser);
+ iommu_hw_ser->hdr.deleted = true;
+}
+
+static struct iommu_device_ser *alloc_iommu_device_ser(struct iommu_flb_obj *flb)
+{
+ int idx;
+
+ idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_device_array,
+ iommu_max_objs_per_page(flb->curr_device_array));
+ if (idx < 0)
+ return ERR_PTR(idx);
+
+ flb->curr_device_array->objects[idx].hdr.ref_count = 1;
+ return &flb->curr_device_array->objects[idx];
+}
+
+int iommu_preserve_device(struct iommu_domain *domain,
+ struct device *dev, u64 *preserved_state)
+{
+ struct iommu_flb_obj *flb_obj;
+ struct iommu_device_ser *device_ser;
+ struct dev_iommu *iommu;
+ struct pci_dev *pdev;
+ int ret;
+
+ if (!dev_is_pci(dev))
+ return -EOPNOTSUPP;
+
+ if (!domain->preserved_state)
+ return -EINVAL;
+
+ if (!iommu_group_dma_owner_claimed(dev->iommu_group))
+ return -EINVAL;
+
+ pdev = to_pci_dev(dev);
+ iommu = dev->iommu;
+ if (!iommu->iommu_dev->ops->preserve_device ||
+ !iommu->iommu_dev->ops->preserve)
+ return -EOPNOTSUPP;
+
+ ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&flb_obj->lock);
+ device_ser = alloc_iommu_device_ser(flb_obj);
+ if (IS_ERR(device_ser))
+ return PTR_ERR(device_ser);
+
+ ret = iommu_preserve_locked(iommu->iommu_dev, flb_obj);
+ if (ret) {
+ device_ser->hdr.deleted = true;
+ return ret;
+ }
+
+ device_ser->domain_iommu_ser.domain_phys = __pa(domain->preserved_state);
+ device_ser->domain_iommu_ser.iommu_phys = __pa(iommu->iommu_dev->outgoing_preserved_state);
+ device_ser->devid = pci_dev_id(pdev);
+ device_ser->pci_domain_nr = pci_domain_nr(pdev->bus);
+
+ ret = iommu->iommu_dev->ops->preserve_device(dev, device_ser);
+ if (ret) {
+ device_ser->hdr.deleted = true;
+ iommu_unpreserve_locked(iommu->iommu_dev, flb_obj);
+ return ret;
+ }
+
+ dev->iommu->device_ser = device_ser;
+ *preserved_state = virt_to_phys(device_ser);
+ return 0;
+}
+
+void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev)
+{
+ struct iommu_flb_obj *flb_obj;
+ struct iommu_device_ser *iommu_device_ser;
+ struct dev_iommu *iommu;
+ struct pci_dev *pdev;
+ int ret;
+
+ if (!dev_is_pci(dev))
+ return;
+
+ if (!iommu_group_dma_owner_claimed(dev->iommu_group))
+ return;
+
+ pdev = to_pci_dev(dev);
+ iommu = dev->iommu;
+ if (!iommu->iommu_dev->ops->unpreserve_device ||
+ !iommu->iommu_dev->ops->unpreserve)
+ return;
+
+ ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj);
+ if (WARN_ON(ret))
+ return;
+
+ guard(mutex)(&flb_obj->lock);
+ iommu_device_ser = dev_iommu_preserved_state(dev);
+ if (WARN_ON(!iommu_device_ser))
+ return;
+
+ iommu->iommu_dev->ops->unpreserve_device(dev, iommu_device_ser);
+ dev->iommu->device_ser = NULL;
+
+ iommu_unpreserve_locked(iommu->iommu_dev, flb_obj);
+}
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
index 6019cfc27428..279c7ab04f09 100644
--- a/include/linux/iommu-liveupdate.h
+++ b/include/linux/iommu-liveupdate.h
@@ -8,14 +8,37 @@
#ifndef _LINUX_IOMMU_LIVEUPDATE_H
#define _LINUX_IOMMU_LIVEUPDATE_H
+#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/liveupdate.h>
#include <linux/kho/abi/iommu.h>
#ifdef CONFIG_IOMMU_LIVEUPDATE
+static inline void *dev_iommu_preserved_state(struct device *dev)
+{
+ struct iommu_device_ser *ser;
+
+ if (!dev->iommu)
+ return NULL;
+
+ ser = dev->iommu->device_ser;
+ if (ser && !ser->hdr.incoming)
+ return ser;
+
+ return NULL;
+}
+
int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
void iommu_domain_unpreserve(struct iommu_domain *domain);
+int iommu_preserve_device(struct iommu_domain *domain,
+ struct device *dev, u64 *preserved_state);
+void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev);
#else
+static inline void *dev_iommu_preserved_state(struct device *dev)
+{
+ return NULL;
+}
+
static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser)
{
return -EOPNOTSUPP;
@@ -24,6 +47,16 @@ static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iomm
static inline void iommu_domain_unpreserve(struct iommu_domain *domain)
{
}
+
+static inline int iommu_preserve_device(struct iommu_domain *domain,
+ struct device *dev, u64 *preserved_state)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev)
+{
+}
#endif
int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3853a3946733..1c424b32c5fc 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -655,6 +655,10 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data,
* resources shared/passed to user space IOMMU instance. Associate
* it with a nesting @parent_domain. It is required for driver to
* set @viommu->ops pointing to its own viommu_ops
+ * @preserve_device: Preserve state of a device for liveupdate.
+ * @unpreserve_device: Unpreserve state that was preserved earlier.
+ * @preserve: Preserve state of iommu translation hardware for liveupdate.
+ * @unpreserve: Unpreserve state of iommu that was preserved earlier.
* @owner: Driver module providing these ops
* @identity_domain: An always available, always attachable identity
* translation.
@@ -711,6 +715,13 @@ struct iommu_ops {
struct iommu_domain *parent_domain,
const struct iommu_user_data *user_data);
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ int (*preserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
+ void (*unpreserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
+ int (*preserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
+ void (*unpreserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
+#endif
+
const struct iommu_domain_ops *default_domain_ops;
struct module *owner;
struct iommu_domain *identity_domain;
@@ -806,6 +817,8 @@ struct iommu_domain_ops {
* @singleton_group: Used internally for drivers that have only one group
* @max_pasids: number of supported PASIDs
* @ready: set once iommu_device_register() has completed successfully
+ * @outgoing_preserved_state: preserved iommu state of outgoing kernel for
+ * liveupdate.
*/
struct iommu_device {
struct list_head list;
@@ -815,6 +828,10 @@ struct iommu_device {
struct iommu_group *singleton_group;
u32 max_pasids;
bool ready;
+
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ struct iommu_hw_ser *outgoing_preserved_state;
+#endif
};
/**
@@ -869,6 +886,9 @@ struct dev_iommu {
u32 pci_32bit_workaround:1;
u32 require_direct:1;
u32 shadow_on_flush:1;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ struct iommu_device_ser *device_ser;
+#endif
};
int iommu_device_register(struct iommu_device *iommu,
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation
2026-04-27 17:56 ` [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation Samiullah Khawaja
@ 2026-05-01 22:42 ` David Matlack
2026-05-04 19:06 ` Samiullah Khawaja
0 siblings, 1 reply; 22+ messages in thread
From: David Matlack @ 2026-05-01 22:42 UTC (permalink / raw)
To: Samiullah Khawaja
Cc: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, Andrew Morton, Chris Li,
Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
On 2026-04-27 05:56 PM, Samiullah Khawaja wrote:
> Add IOMMU ops to preserve/unpreserve a device. These can be implemented
Can you make this comment more specific about what is being preserved?
Saying it preserves a device is vague and maybe even misleading. It's
more about preserving a device's attachment to a specific domain,
correct?
> by the IOMMU drivers that support preservation of devices that have
> their IOMMU domains preserved. During device preservation the state of
> the associated IOMMU is also preserved as dependency.
>
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
> index f71f14518248..765d042e22e3 100644
> --- a/drivers/iommu/liveupdate.c
> +++ b/drivers/iommu/liveupdate.c
> +static struct iommu_device_ser *alloc_iommu_device_ser(struct iommu_flb_obj *flb)
It is unfortunate that struct iommu_device_ser has nothing to do with
struct iommu_device. The former represents a PCI device, while the
latter represents an IOMMU.
> diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
> index 6019cfc27428..279c7ab04f09 100644
> --- a/include/linux/iommu-liveupdate.h
> +++ b/include/linux/iommu-liveupdate.h
> int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
> void iommu_domain_unpreserve(struct iommu_domain *domain);
> +int iommu_preserve_device(struct iommu_domain *domain,
> + struct device *dev, u64 *preserved_state);
> +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev);
The naming scheme is inconsistent... Maybe it can be:
iommu_preserve_domain()
iommu_unpreserve_domain()
iommu_preserve_device() or iommu_preserve_device_attachment()
iommu_unpreserve_device() or iommu_unpreserve_device_attachment()
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 3853a3946733..1c424b32c5fc 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> +#ifdef CONFIG_IOMMU_LIVEUPDATE
> + int (*preserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
> + void (*unpreserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
> + int (*preserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
> + void (*unpreserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
> +#endif
Maybe we can make these names a little more specific:
preserve_device_attachment()
unpreserve_device_attachment()
preserve_iommu()
unpreserve_iommu()
?
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation
2026-05-01 22:42 ` David Matlack
@ 2026-05-04 19:06 ` Samiullah Khawaja
0 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-05-04 19:06 UTC (permalink / raw)
To: David Matlack
Cc: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, Andrew Morton, Chris Li,
Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
On Fri, May 01, 2026 at 10:42:57PM +0000, David Matlack wrote:
>On 2026-04-27 05:56 PM, Samiullah Khawaja wrote:
>> Add IOMMU ops to preserve/unpreserve a device. These can be implemented
>
>Can you make this comment more specific about what is being preserved?
>Saying it preserves a device is vague and maybe even misleading. It's
>more about preserving a device's attachment to a specific domain,
>correct?
There is an attachment ID, but the preservation of a device can have IOMMU
driver specific things, so in core I mostly mention "preserve device
specific state". In later patches in this series, we save the PASID table
using the same callback. I will add more details in the commit message.
>
>> by the IOMMU drivers that support preservation of devices that have
>> their IOMMU domains preserved. During device preservation the state of
>> the associated IOMMU is also preserved as dependency.
>>
>> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
>
>> diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
>> index f71f14518248..765d042e22e3 100644
>> --- a/drivers/iommu/liveupdate.c
>> +++ b/drivers/iommu/liveupdate.c
>
>> +static struct iommu_device_ser *alloc_iommu_device_ser(struct iommu_flb_obj *flb)
>
>It is unfortunate that struct iommu_device_ser has nothing to do with
>struct iommu_device. The former represents a PCI device, while the
>latter represents an IOMMU.
Yes, I went through various iterations of trying to name it in a
different way but keeping the "iommu_" prefix and the "device" state
naturally falls into this. Not sure if iommu_pci_device_ser is suitable?
>
>> diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
>> index 6019cfc27428..279c7ab04f09 100644
>> --- a/include/linux/iommu-liveupdate.h
>> +++ b/include/linux/iommu-liveupdate.h
>
>> int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
>> void iommu_domain_unpreserve(struct iommu_domain *domain);
>> +int iommu_preserve_device(struct iommu_domain *domain,
>> + struct device *dev, u64 *preserved_state);
>> +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev);
>
>The naming scheme is inconsistent... Maybe it can be:
>
> iommu_preserve_domain()
> iommu_unpreserve_domain()
> iommu_preserve_device() or iommu_preserve_device_attachment()
> iommu_unpreserve_device() or iommu_unpreserve_device_attachment()
Agreed, but I am trying to follow the already existing naming scheme
for domains that is used for APIs in iommu.c
iommu_domain_free()
iommu_domain_init()
iommu_domain_preserve()
But I think this is rare, I will update to this as you mentioned above:
iommu_preserve_domain()
iommu_unpreserve_domain()
iommu_preserve_device()
iommu_unpreserve_device()
>
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index 3853a3946733..1c424b32c5fc 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>
>> +#ifdef CONFIG_IOMMU_LIVEUPDATE
>> + int (*preserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
>> + void (*unpreserve_device)(struct device *dev, struct iommu_device_ser *device_ser);
>> + int (*preserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
>> + void (*unpreserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser);
>> +#endif
>
>Maybe we can make these names a little more specific:
>
> preserve_device_attachment()
> unpreserve_device_attachment()
Attachment is too specific. See my comment above.
> preserve_iommu()
> unpreserve_iommu()
These are part of iommu_ops and having preserve_iommu() instead of
preserve() is redundant I think. Note ops like capable(), hw_info() in
the same struct.
>
>?
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH v2 05/16] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (3 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 04/16] iommu: Implement device and IOMMU HW preservation Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 06/16] iommupt: Implement preserve/unpreserve/restore callbacks Samiullah Khawaja
` (10 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
IOMMU pages are allocated and freed through APIs that use struct
ioptdesc. Add helper functions for the proper preservation and
restoration of ioptdesc.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/iommu-pages.c | 108 ++++++++++++++++++++++++++++++++++--
drivers/iommu/iommu-pages.h | 30 ++++++++++
2 files changed, 134 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
index 3bab175d8557..b1ffeb930e6d 100644
--- a/drivers/iommu/iommu-pages.c
+++ b/drivers/iommu/iommu-pages.c
@@ -6,6 +6,7 @@
#include "iommu-pages.h"
#include <linux/dma-mapping.h>
#include <linux/gfp.h>
+#include <linux/kexec_handover.h>
#include <linux/mm.h>
#define IOPTDESC_MATCH(pg_elm, elm) \
@@ -28,6 +29,13 @@ static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
}
+static inline void iommu_folio_update_stats(struct folio *folio,
+ unsigned long nr_pages)
+{
+ mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, nr_pages);
+}
+
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
* specific NUMA node
@@ -80,8 +88,7 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
* rather large, i.e. multiple gigabytes in size.
*/
pgcnt = 1UL << order;
- mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, pgcnt);
- lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, pgcnt);
+ iommu_folio_update_stats(folio, pgcnt);
return folio_address(folio);
}
@@ -95,8 +102,7 @@ static void __iommu_free_desc(struct ioptdesc *iopt)
if (IOMMU_PAGES_USE_DMA_API)
WARN_ON_ONCE(iopt->incoherent);
- mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
- lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
+ iommu_folio_update_stats(folio, -pgcnt);
folio_put(folio);
}
@@ -131,6 +137,100 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+#if IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE)
+/**
+ * iommu_unpreserve_page - Unpreserve a page that was preserved in KHO
+ * @virt: Virtual address of a page
+ */
+void iommu_unpreserve_page(void *virt)
+{
+ kho_unpreserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt)));
+}
+EXPORT_SYMBOL_GPL(iommu_unpreserve_page);
+
+/**
+ * iommu_preserve_page - Preserve a page during kexec handover
+ * @virt: Virtual address of the page to preserve
+ *
+ * Returns 0 on success, negative error on failure
+ */
+int iommu_preserve_page(void *virt)
+{
+ return kho_preserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt)));
+}
+EXPORT_SYMBOL_GPL(iommu_preserve_page);
+
+/**
+ * iommu_unpreserve_pages - Unpreserve pages that were preserved in KHO
+ * @list: List of pages to unpreserve
+ */
+void iommu_unpreserve_pages(struct iommu_pages_list *list)
+{
+ struct ioptdesc *iopt;
+
+ list_for_each_entry(iopt, &list->pages, iopt_freelist_elm)
+ kho_unpreserve_folio(ioptdesc_folio(iopt));
+}
+EXPORT_SYMBOL_GPL(iommu_unpreserve_pages);
+
+/**
+ * iommu_restore_page - Restore a page that was preserved in KHO
+ * @phys: Physical address of a page
+ */
+void iommu_restore_page(u64 phys)
+{
+ struct ioptdesc *iopt;
+ struct folio *folio;
+ unsigned long pgcnt;
+ unsigned int order;
+
+ folio = kho_restore_folio(phys);
+ BUG_ON(!folio);
+
+ iopt = folio_ioptdesc(folio);
+
+ /*
+ * For the restored pages incoherent is set to false as these are not
+ * mapped using the DMA_API. The remapping of these pages using DMA_API
+ * is not needed as these are not going to be written to by the new
+ * kernel.
+ */
+ iopt->incoherent = false;
+
+ order = folio_order(folio);
+ pgcnt = 1UL << order;
+ iommu_folio_update_stats(folio, pgcnt);
+}
+EXPORT_SYMBOL_GPL(iommu_restore_page);
+
+/**
+ * iommu_preserve_pages - Preserve pages during kexec handover
+ * @list: List of pages to preserve
+ *
+ * Returns 0 on success, negative error on failure
+ */
+int iommu_preserve_pages(struct iommu_pages_list *list)
+{
+ struct ioptdesc *iopt;
+ int ret;
+
+ list_for_each_entry(iopt, &list->pages, iopt_freelist_elm) {
+ ret = kho_preserve_folio(ioptdesc_folio(iopt));
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ list_for_each_entry_continue_reverse(iopt, &list->pages, iopt_freelist_elm)
+ kho_unpreserve_folio(ioptdesc_folio(iopt));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_preserve_pages);
+#endif
+
/**
* iommu_pages_start_incoherent - Setup the page for cache incoherent operation
* @virt: The page to setup
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index ae9da4f571f6..7b9b6bb504b2 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -53,6 +53,36 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size);
void iommu_free_pages(void *virt);
void iommu_put_pages_list(struct iommu_pages_list *list);
+#if IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE)
+int iommu_preserve_page(void *virt);
+void iommu_unpreserve_page(void *virt);
+int iommu_preserve_pages(struct iommu_pages_list *list);
+void iommu_unpreserve_pages(struct iommu_pages_list *list);
+void iommu_restore_page(u64 phys);
+#else
+static inline int iommu_preserve_page(void *virt)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommu_unpreserve_page(void *virt)
+{
+}
+
+static inline int iommu_preserve_pages(struct iommu_pages_list *list)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void iommu_unpreserve_pages(struct iommu_pages_list *list)
+{
+}
+
+static inline void iommu_restore_page(u64 phys)
+{
+}
+#endif
+
/**
* iommu_pages_list_add - add the page to a iommu_pages_list
* @list: List to add the page to
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 06/16] iommupt: Implement preserve/unpreserve/restore callbacks
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (4 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 05/16] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 07/16] iommu/vt-d: Implement device and iommu preserve/unpreserve ops Samiullah Khawaja
` (9 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Implement the iommu domain ops for preservation, unpreservation and
restoration of iommu domains for liveupdate. Use the existing page
walker to preserve the ioptdesc of the top_table and the lower tables.
Preserve top_level, VASZ and FEAT Sign Extended to restore the domain in
the next kernel. On restore the domain has only the preserved features
enabled and all the other features are zeroed. This is ok since the
restored domain is made immutable and can only be freed. A kunit test is
added to verify that the IOMMU domain free can be done with trimmed
features.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/generic_pt/iommu_pt.h | 131 ++++++++++++++++++++++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 28 +++++
include/linux/generic_pt/iommu.h | 19 +++-
3 files changed, 177 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 19b6daf88f2a..7bca827e3a55 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -961,6 +961,133 @@ static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
return ret;
}
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+/**
+ * unpreserve() - Unpreserve page tables and other state of a domain.
+ * @domain: Domain to unpreserve
+ */
+void DOMAIN_NS(unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ iommu_unpreserve_pages(&collect.free_list);
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unpreserve), "GENERIC_PT_IOMMU");
+
+/**
+ * preserve() - Preserve page tables and other state of a domain.
+ * @domain: Domain to preserve
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+ int ret;
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ ret = iommu_preserve_pages(&collect.free_list);
+ if (ret)
+ return ret;
+
+ ser->top_table_phys = virt_to_phys(range.top_table);
+ ser->top_level = range.top_level;
+
+ /*
+ * VASZ and SIGN_EXTEND will be needed in next kernel for collector page
+ * table walk to restore and free pages.
+ */
+ ser->vasz = common->max_vasz_lg2;
+ ser->sign_extend = pt_feature(common, PT_FEAT_SIGN_EXTEND);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(preserve), "GENERIC_PT_IOMMU");
+
+static int __restore_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_restore_page(virt_to_phys(pts.table_lower));
+
+ /*
+ * pt_descend() can only fail if pts.table_lower is not
+ * initialized, so the error check below is dead code.
+ */
+ ret = pt_descend(&pts, arg, __restore_tables);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct pt_iommu_ops NS(ops_immutable);
+
+/**
+ * restore() - Restore page tables and other state of a domain.
+ * @domain: Domain to restore
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range;
+
+ common->max_vasz_lg2 = ser->vasz;
+
+ /* Make this domain immutable. */
+ iommu_table->ops = &NS(ops_immutable);
+
+ /*
+ * It is safe to override this here since this domain is immutable and
+ * can only be freed.
+ */
+ common->features = 0;
+ if (ser->sign_extend)
+ common->features |= BIT(PT_FEAT_SIGN_EXTEND);
+
+ range = pt_all_range(common);
+ iommu_restore_page(ser->top_table_phys);
+
+ /* Free new table */
+ iommu_free_pages(range.top_table);
+
+ /* Set the restored top table */
+ pt_top_set(common, phys_to_virt(ser->top_table_phys), ser->top_level);
+
+ /* Restore all pages */
+ range = pt_all_range(common);
+ return pt_walk_range(&range, __restore_tables, NULL);
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(restore), "GENERIC_PT_IOMMU");
+#endif
+
struct pt_unmap_args {
struct iommu_pages_list free_list;
pt_vaddr_t unmapped;
@@ -1138,6 +1265,10 @@ static const struct pt_iommu_ops NS(ops) = {
.deinit = NS(deinit),
};
+static const struct pt_iommu_ops NS(ops_immutable) = {
+ .deinit = NS(deinit),
+};
+
static int pt_init_common(struct pt_common *common)
{
struct pt_range top_range = pt_top_range(common);
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
index e8a63c8ea850..af1918d693ed 100644
--- a/drivers/iommu/generic_pt/kunit_iommu_pt.h
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -426,6 +426,33 @@ static void test_mixed(struct kunit *test)
check_iova(test, start, oa, len);
}
+static void test_restore_free(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ /* Map a large mixed range to populate multiple levels of page tables */
+ do_map(test, start, start, len);
+
+ /*
+ * Simulate a restored state by clearing all features except
+ * SIGN_EXTEND. This verifies that the generic page table free walker
+ * can correctly tear down a populated domain when other features are
+ * zeroed.
+ */
+ priv->common->features &= BIT(PT_FEAT_SIGN_EXTEND);
+
+ /* The domain will be freed when the test exits. */
+}
+
static struct kunit_case iommu_test_cases[] = {
KUNIT_CASE_FMT(test_increase_level),
KUNIT_CASE_FMT(test_map_simple),
@@ -434,6 +461,7 @@ static struct kunit_case iommu_test_cases[] = {
KUNIT_CASE_FMT(test_random_map),
KUNIT_CASE_FMT(test_pgsize_boundary),
KUNIT_CASE_FMT(test_mixed),
+ KUNIT_CASE_FMT(test_restore_free),
{},
};
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index dd0edd02a48a..649b3b9eb1a0 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -13,6 +13,7 @@ struct iommu_iotlb_gather;
struct pt_iommu_ops;
struct pt_iommu_driver_ops;
struct iommu_dirty_bitmap;
+struct iommu_domain_ser;
/**
* DOC: IOMMU Radix Page Table
@@ -251,6 +252,12 @@ struct pt_iommu_cfg {
#define IOMMU_PROTOTYPES(fmt) \
phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
dma_addr_t iova); \
+ int pt_iommu_##fmt##_preserve(struct iommu_domain *domain, \
+ struct iommu_domain_ser *ser); \
+ void pt_iommu_##fmt##_unpreserve(struct iommu_domain *domain, \
+ struct iommu_domain_ser *ser); \
+ int pt_iommu_##fmt##_restore(struct iommu_domain *domain, \
+ struct iommu_domain_ser *ser); \
int pt_iommu_##fmt##_read_and_clear_dirty( \
struct iommu_domain *domain, unsigned long iova, size_t size, \
unsigned long flags, struct iommu_dirty_bitmap *dirty); \
@@ -266,12 +273,22 @@ struct pt_iommu_cfg {
}; \
IOMMU_PROTOTYPES(fmt)
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+#define IOMMU_PT_LIVEUPDATE_OPS(fmt) \
+ , .preserve = &pt_iommu_##fmt##_preserve, \
+ .unpreserve = &pt_iommu_##fmt##_unpreserve, \
+ .restore = &pt_iommu_##fmt##_restore
+#else
+#define IOMMU_PT_LIVEUPDATE_OPS(fmt)
+#endif
+
/*
* A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
* iommu_pt
*/
#define IOMMU_PT_DOMAIN_OPS(fmt) \
- .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys
+ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys \
+ IOMMU_PT_LIVEUPDATE_OPS(fmt)
#define IOMMU_PT_DIRTY_OPS(fmt) \
.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 07/16] iommu/vt-d: Implement device and iommu preserve/unpreserve ops
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (5 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 06/16] iommupt: Implement preserve/unpreserve/restore callbacks Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 08/16] iommu: Add APIs to get iommu and device preserved state Samiullah Khawaja
` (8 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Add implementation of the device and iommu preservation in a separate
file. Also set the device and iommu preserve/unpreserve ops in the
struct iommu_ops.
During normal shutdown the iommu translation is disabled. Since the root
table is preserved during live update, it needs to be cleaned up and the
context entries of the unpreserved devices need to be cleared.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
MAINTAINERS | 1 +
drivers/iommu/intel/Makefile | 1 +
drivers/iommu/intel/iommu.c | 52 +++++++++++-
drivers/iommu/intel/iommu.h | 28 +++++++
drivers/iommu/intel/liveupdate.c | 139 +++++++++++++++++++++++++++++++
drivers/iommu/iommu.c | 18 ++++
include/linux/iommu-liveupdate.h | 10 +++
include/linux/iommu.h | 14 ++++
include/linux/kho/abi/iommu.h | 18 ++++
9 files changed, 277 insertions(+), 4 deletions(-)
create mode 100644 drivers/iommu/intel/liveupdate.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 980041955abc..9f5c02c6c8c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13495,6 +13495,7 @@ M: Samiullah Khawaja <skhawaja@google.com>
R: Pranjal Shrivastava <praan@google.com>
L: iommu@lists.linux.dev
S: Maintained
+F: drivers/iommu/intel/liveupdate.c
F: drivers/iommu/liveupdate.c
F: include/linux/iommu-liveupdate.h
F: include/linux/kho/abi/iommu.h
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index ada651c4a01b..d38fc101bc35 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
+obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index c3d18cd77d2f..68fecd4e57fa 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -16,6 +16,7 @@
#include <linux/crash_dump.h>
#include <linux/dma-direct.h>
#include <linux/dmi.h>
+#include <linux/iommu-liveupdate.h>
#include <linux/memory.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
@@ -52,6 +53,8 @@ static int rwbf_quirk;
#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
+static void clear_unpreserved_context_entries(struct intel_iommu *iommu);
+
/*
* set to 1 to panic kernel if can't successfully enable VT-d
* (used when kernel is launched w/ TXT)
@@ -60,8 +63,6 @@ static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;
-#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
-
/*
* Take a root_entry and return the Lower Context Table Pointer (LCTP)
* if marked present.
@@ -2375,8 +2376,11 @@ void intel_iommu_shutdown(void)
/* Disable PMRs explicitly here. */
iommu_disable_protect_mem_regions(iommu);
- /* Make sure the IOMMUs are switched off */
- iommu_disable_translation(iommu);
+ /* Make sure the IOMMUs are switched off if not preserved. */
+ if (iommu_preserved_state(&iommu->iommu))
+ clear_unpreserved_context_entries(iommu);
+ else
+ iommu_disable_translation(iommu);
}
}
@@ -2899,6 +2903,41 @@ static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
.set_dirty_tracking = intel_iommu_set_dirty_tracking,
};
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+static int clear_unpreserve_context_entry_fn(struct device *dev,
+ struct iommu_device *iommu,
+ void *arg)
+{
+ struct device_domain_info *info;
+
+ info = dev_iommu_priv_get(dev);
+ if (!info)
+ return 0;
+
+ if (dev_is_pci(dev) && dev_iommu_preserved_state(dev))
+ return 0;
+
+ domain_context_clear(info);
+ return 0;
+}
+
+static void clear_unpreserved_context_entries(struct intel_iommu *iommu)
+{
+ struct iommu_dev_iter iter = {
+ .fn = clear_unpreserve_context_entry_fn,
+ .iommu = &iommu->iommu,
+ .arg = NULL,
+
+ };
+
+ iommu_for_each_dev(&iter);
+}
+#else
+static void clear_unpreserved_context_entries(struct intel_iommu *iommu)
+{
+}
+#endif
+
static struct iommu_domain *
intel_iommu_domain_alloc_second_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
@@ -3926,6 +3965,11 @@ const struct iommu_ops intel_iommu_ops = {
.is_attach_deferred = intel_iommu_is_attach_deferred,
.def_domain_type = device_def_domain_type,
.page_response = intel_iommu_page_response,
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ .preserve_device = intel_iommu_preserve_device,
+ .preserve = intel_iommu_preserve,
+ .unpreserve = intel_iommu_unpreserve,
+#endif
};
static void quirk_iommu_igfx(struct pci_dev *dev)
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index ef145560aa98..5e0bc17e76bf 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -552,6 +552,8 @@ struct root_entry {
u64 hi;
};
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(struct root_entry))
+
/*
* low 64 bits:
* 0: present
@@ -1284,6 +1286,32 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new,
return 0;
}
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+int intel_iommu_preserve_device(struct device *dev,
+ struct iommu_device_ser *device_ser);
+int intel_iommu_preserve(struct iommu_device *iommu,
+ struct iommu_hw_ser *iommu_ser);
+void intel_iommu_unpreserve(struct iommu_device *iommu,
+ struct iommu_hw_ser *iommu_ser);
+#else
+static inline int intel_iommu_preserve_device(struct device *dev,
+ struct iommu_device_ser *device_ser)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int intel_iommu_preserve(struct iommu_device *iommu,
+ struct iommu_hw_ser *iommu_ser)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void intel_iommu_unpreserve(struct iommu_device *iommu,
+ struct iommu_hw_ser *iommu_ser)
+{
+}
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c
new file mode 100644
index 000000000000..75fa68b701bf
--- /dev/null
+++ b/drivers/iommu/intel/liveupdate.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: liveupdate: " fmt
+
+#include <linux/kexec_handover.h>
+#include <linux/liveupdate.h>
+#include <linux/iommu-liveupdate.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "iommu.h"
+#include "../iommu-pages.h"
+
+static void unpreserve_iommu_context_table(struct intel_iommu *iommu, int end)
+{
+ struct context_entry *context;
+ int i;
+
+ for (i = 0; i < end; i++) {
+ context = iommu_context_addr(iommu, i, 0, 0);
+ if (context)
+ iommu_unpreserve_page(context);
+
+ if (!sm_supported(iommu))
+ continue;
+
+ context = iommu_context_addr(iommu, i, 0x80, 0);
+ if (context)
+ iommu_unpreserve_page(context);
+ }
+}
+
+static int preserve_iommu_context_table(struct intel_iommu *iommu)
+{
+ struct context_entry *context;
+ int ret;
+ int i;
+
+ for (i = 0; i < ROOT_ENTRY_NR; i++) {
+ /*
+ * Allocate the context tables now to make sure the iommu unit
+ * is properly preserved. These might stay unused and waste
+ * around 32MB max in scalable mode.
+ */
+ spin_lock(&iommu->lock);
+ context = iommu_context_addr(iommu, i, 0, 1);
+ spin_unlock(&iommu->lock);
+ if (!context) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ ret = iommu_preserve_page(context);
+ if (ret)
+ goto error;
+
+ if (!sm_supported(iommu))
+ continue;
+
+ spin_lock(&iommu->lock);
+ context = iommu_context_addr(iommu, i, 0x80, 1);
+ spin_unlock(&iommu->lock);
+ if (!context) {
+ ret = -ENOMEM;
+ goto error_sm;
+ }
+ ret = iommu_preserve_page(context);
+ if (ret)
+ goto error_sm;
+ }
+
+ return 0;
+
+error_sm:
+ context = iommu_context_addr(iommu, i, 0, 0);
+ iommu_unpreserve_page(context);
+error:
+ unpreserve_iommu_context_table(iommu, i);
+ return ret;
+}
+
+int intel_iommu_preserve_device(struct device *dev,
+ struct iommu_device_ser *device_ser)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ if (!dev_is_pci(dev)) {
+ dev_err(dev, "Cannot preserve non-PCI device\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!info)
+ return -EINVAL;
+
+ device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain,
+ info->iommu);
+ return 0;
+}
+
+int intel_iommu_preserve(struct iommu_device *iommu_dev,
+ struct iommu_hw_ser *ser)
+{
+ struct intel_iommu *iommu;
+ int ret;
+
+ iommu = container_of(iommu_dev, struct intel_iommu, iommu);
+
+ ret = preserve_iommu_context_table(iommu);
+ if (ret)
+ return ret;
+
+ ret = iommu_preserve_page(iommu->root_entry);
+ if (ret) {
+ unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR);
+ return ret;
+ }
+
+ ser->intel.phys_addr = iommu->reg_phys;
+ ser->intel.root_table = __pa(iommu->root_entry);
+ ser->type = IOMMU_INTEL;
+ ser->token = ser->intel.phys_addr;
+
+ return 0;
+}
+
+void intel_iommu_unpreserve(struct iommu_device *iommu_dev,
+ struct iommu_hw_ser *iommu_ser)
+{
+ struct intel_iommu *iommu;
+
+ iommu = container_of(iommu_dev, struct intel_iommu, iommu);
+
+ unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR);
+ iommu_unpreserve_page(iommu->root_entry);
+}
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 782e73a9d45f..0561990f46e3 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -307,6 +307,24 @@ void iommu_device_unregister(struct iommu_device *iommu)
}
EXPORT_SYMBOL_GPL(iommu_device_unregister);
+static int _iommu_for_each_dev_cb(struct device *dev, void *data)
+{
+ struct iommu_dev_iter *iter = data;
+
+ if (dev->iommu && dev->iommu->iommu_dev == iter->iommu)
+ return iter->fn(dev, iter->iommu, iter->arg);
+
+ return 0;
+}
+
+void iommu_for_each_dev(struct iommu_dev_iter *iter)
+{
+ for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++)
+ bus_for_each_dev(iommu_buses[i], NULL, iter,
+ _iommu_for_each_dev_cb);
+}
+EXPORT_SYMBOL_GPL(iommu_for_each_dev);
+
#if IS_ENABLED(CONFIG_IOMMUFD_TEST)
void iommu_device_unregister_bus(struct iommu_device *iommu,
const struct bus_type *bus,
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
index 279c7ab04f09..c9d75c6b3be9 100644
--- a/include/linux/iommu-liveupdate.h
+++ b/include/linux/iommu-liveupdate.h
@@ -33,6 +33,11 @@ void iommu_domain_unpreserve(struct iommu_domain *domain);
int iommu_preserve_device(struct iommu_domain *domain,
struct device *dev, u64 *preserved_state);
void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev);
+
+static inline void *iommu_preserved_state(struct iommu_device *iommu)
+{
+ return iommu->outgoing_preserved_state;
+}
#else
static inline void *dev_iommu_preserved_state(struct device *dev)
{
@@ -57,6 +62,11 @@ static inline int iommu_preserve_device(struct iommu_domain *domain,
static inline void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev)
{
}
+
+static inline void *iommu_preserved_state(struct iommu_device *iommu)
+{
+ return NULL;
+}
#endif
int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1c424b32c5fc..999be5127c65 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1207,6 +1207,20 @@ static inline void *dev_iommu_priv_get(struct device *dev)
void dev_iommu_priv_set(struct device *dev, void *priv);
+typedef int (*iommu_dev_iter_fn)(struct device *dev,
+ struct iommu_device *iommu, void *arg);
+
+/**
+ * struct iommu_dev_iter - Iterator for devices attached to an IOMMU
+ */
+struct iommu_dev_iter {
+ struct iommu_device *iommu;
+ iommu_dev_iter_fn fn;
+ void *arg;
+};
+
+void iommu_for_each_dev(struct iommu_dev_iter *iter);
+
extern struct mutex iommu_probe_device_lock;
int iommu_probe_device(struct device *dev);
diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h
index 37b967820f14..5ffedf0dbd5a 100644
--- a/include/linux/kho/abi/iommu.h
+++ b/include/linux/kho/abi/iommu.h
@@ -73,6 +73,7 @@
enum iommu_type_ser {
IOMMU_INVALID,
+ IOMMU_INTEL,
};
/**
@@ -132,16 +133,33 @@ struct iommu_device_ser {
struct iommu_dev_map_ser domain_iommu_ser;
} __packed;
+/**
+ * struct iommu_intel_ser - Serialized state of an Intel IOMMU instance
+ * @restored: Whether IOMMU state is restored
+ * @phys_addr: Physical address of the IOMMU register base
+ * @root_table: Physical address of the root entry table
+ */
+struct iommu_intel_ser {
+ u8 restored;
+ u8 padding[7];
+ u64 phys_addr;
+ u64 root_table;
+};
+
/**
* struct iommu_hw_ser - Serialized state of an IOMMU instance
* @hdr: Common object header
* @token: Unique token for the IOMMU
* @type: IOMMU type serialized state belongs to
+ * @intel: Intel specific serialization data
*/
struct iommu_hw_ser {
struct iommu_hdr_ser hdr;
u64 token;
u64 type;
+ union {
+ struct iommu_intel_ser intel;
+ };
} __packed;
/**
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 08/16] iommu: Add APIs to get iommu and device preserved state
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (6 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 07/16] iommu/vt-d: Implement device and iommu preserve/unpreserve ops Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 09/16] iommu/vt-d: Restore IOMMU state and reclaimed domain ids Samiullah Khawaja
` (7 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
The preserved state of the device and IOMMU needs to be fetched during
shutdown and boot in the next kernel. Add APIs that can be used to fetch
the preserved state of a device and IOMMU. The APIs will only be used
during shutdown and after liveupdate, so no locking is needed.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/liveupdate.c | 57 ++++++++++++++++++++++++++++++++
include/linux/iommu-liveupdate.h | 31 +++++++++++++++++
2 files changed, 88 insertions(+)
diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
index 765d042e22e3..60ee29b0c6bd 100644
--- a/drivers/iommu/liveupdate.c
+++ b/drivers/iommu/liveupdate.c
@@ -17,6 +17,14 @@
#define iommu_max_objs_per_page(_array) \
((PAGE_SIZE - sizeof(struct iommu_array_hdr_ser)) / sizeof((_array)->objects[0]))
+#define iommu_liveupdate_for_each_obj(_arr, _obj, _idx) \
+ for (; (_arr); \
+ (_arr) = (_arr)->hdr.next_array_phys ? \
+ phys_to_virt((_arr)->hdr.next_array_phys) : NULL) \
+ for ((_idx) = 0, (_obj) = (_arr)->objects; \
+ (_idx) < (_arr)->hdr.nr_objects; (_idx)++, (_obj)++) \
+ if (!(_obj)->hdr.deleted)
+
static void *iommu_liveupdate_restore_array(u64 array_phys)
{
struct iommu_array_hdr_ser *array_hdr;
@@ -201,6 +209,55 @@ void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler)
}
EXPORT_SYMBOL(iommu_liveupdate_unregister_flb);
+int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn,
+ void *arg)
+{
+ struct iommu_flb_obj *flb_obj;
+ struct iommu_device_array_ser *array;
+ struct iommu_device_ser *device_ser;
+ int ret, idx;
+
+ ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj);
+ if (ret)
+ return -ENOENT;
+
+ array = phys_to_virt(flb_obj->ser->device_array_phys);
+ iommu_liveupdate_for_each_obj(array, device_ser, idx) {
+ ret = fn(device_ser, arg);
+ if (ret)
+ goto out;
+ }
+
+out:
+ liveupdate_flb_put_incoming(&iommu_flb);
+ return ret;
+}
+EXPORT_SYMBOL(iommu_for_each_preserved_device);
+
+struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type)
+{
+ struct iommu_hw_ser *iommu_ser = NULL;
+ struct iommu_hw_array_ser *array;
+ struct iommu_flb_obj *flb_obj;
+ int ret, idx;
+
+ ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj);
+ if (ret)
+ return NULL;
+
+ array = phys_to_virt(flb_obj->ser->iommu_array_phys);
+ iommu_liveupdate_for_each_obj(array, iommu_ser, idx) {
+ if (iommu_ser->token == token && iommu_ser->type == type)
+ goto out;
+ }
+
+ iommu_ser = NULL;
+out:
+ liveupdate_flb_put_incoming(&iommu_flb);
+ return iommu_ser;
+}
+EXPORT_SYMBOL(iommu_get_preserved_data);
+
static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs)
{
struct iommu_array_hdr_ser *curr_array = *curr_array_ptr;
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
index c9d75c6b3be9..0baf6bc2d93f 100644
--- a/include/linux/iommu-liveupdate.h
+++ b/include/linux/iommu-liveupdate.h
@@ -13,6 +13,8 @@
#include <linux/liveupdate.h>
#include <linux/kho/abi/iommu.h>
+typedef int (*iommu_preserved_device_iter_fn)(struct iommu_device_ser *ser,
+ void *arg);
#ifdef CONFIG_IOMMU_LIVEUPDATE
static inline void *dev_iommu_preserved_state(struct device *dev)
{
@@ -28,6 +30,20 @@ static inline void *dev_iommu_preserved_state(struct device *dev)
return NULL;
}
+static inline void *iommu_domain_restored_state(struct iommu_domain *domain)
+{
+ struct iommu_domain_ser *ser;
+
+ ser = domain->preserved_state;
+ if (ser && ser->hdr.incoming)
+ return ser;
+
+ return NULL;
+}
+
+int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn,
+ void *arg);
+struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type);
int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
void iommu_domain_unpreserve(struct iommu_domain *domain);
int iommu_preserve_device(struct iommu_domain *domain,
@@ -44,6 +60,21 @@ static inline void *dev_iommu_preserved_state(struct device *dev)
return NULL;
}
+static inline void *iommu_domain_restored_state(struct iommu_domain *domain)
+{
+ return NULL;
+}
+
+static inline int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type)
+{
+ return NULL;
+}
+
static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser)
{
return -EOPNOTSUPP;
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 09/16] iommu/vt-d: Restore IOMMU state and reclaimed domain ids
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (7 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 08/16] iommu: Add APIs to get iommu and device preserved state Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 10/16] iommu: Restore and reattach preserved domains to devices Samiullah Khawaja
` (6 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
During boot, fetch the preserved state of each IOMMU unit and, if found,
restore it:
- Reuse the root table that was preserved by the previous kernel.
- Reclaim the domain IDs of the preserved domains for each preserved
device so that they are not acquired by another domain.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/intel/iommu.c | 55 ++++++++++++++++++++++--------
drivers/iommu/intel/iommu.h | 7 ++++
drivers/iommu/intel/liveupdate.c | 57 ++++++++++++++++++++++++++++++++
3 files changed, 105 insertions(+), 14 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 68fecd4e57fa..4118a0861f38 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -670,10 +670,17 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
#endif
/* iommu handling */
-static int iommu_alloc_root_entry(struct intel_iommu *iommu)
+static int iommu_alloc_root_entry(struct intel_iommu *iommu,
+ struct iommu_hw_ser *iommu_ser)
{
struct root_entry *root;
+ if (iommu_ser) {
+ intel_iommu_liveupdate_restore_root_table(iommu, iommu_ser);
+ __iommu_flush_cache(iommu, iommu->root_entry, ROOT_SIZE);
+ return 0;
+ }
+
root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
if (!root) {
pr_err("Allocating root entry for %s failed\n",
@@ -992,15 +999,16 @@ static void disable_dmar_iommu(struct intel_iommu *iommu)
iommu_disable_translation(iommu);
}
-static void free_dmar_iommu(struct intel_iommu *iommu)
+static void free_dmar_iommu(struct intel_iommu *iommu, struct iommu_hw_ser *iommu_ser)
{
if (iommu->copied_tables) {
bitmap_free(iommu->copied_tables);
iommu->copied_tables = NULL;
}
- /* free context mapping */
- free_context_table(iommu);
+ /* free context mapping if there is no serialized state. */
+ if (!iommu_ser)
+ free_context_table(iommu);
if (ecap_prs(iommu->ecap))
intel_iommu_finish_prq(iommu);
@@ -1611,6 +1619,7 @@ static int copy_translation_tables(struct intel_iommu *iommu)
static int __init init_dmars(void)
{
+ struct iommu_hw_ser *iommu_ser = NULL;
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu;
int ret;
@@ -1633,8 +1642,12 @@ static int __init init_dmars(void)
intel_pasid_max_id);
}
+ iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL);
+
intel_iommu_init_qi(iommu);
- init_translation_status(iommu);
+
+ if (!iommu_ser)
+ init_translation_status(iommu);
if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
iommu_disable_translation(iommu);
@@ -1648,7 +1661,7 @@ static int __init init_dmars(void)
* we could share the same root & context tables
* among all IOMMU's. Need to Split it later.
*/
- ret = iommu_alloc_root_entry(iommu);
+ ret = iommu_alloc_root_entry(iommu, iommu_ser);
if (ret)
goto free_iommu;
@@ -1732,8 +1745,12 @@ static int __init init_dmars(void)
free_iommu:
for_each_active_iommu(iommu, drhd) {
- disable_dmar_iommu(iommu);
- free_dmar_iommu(iommu);
+ iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL);
+
+ if (!iommu_ser)
+ disable_dmar_iommu(iommu);
+
+ free_dmar_iommu(iommu, iommu_ser);
}
return ret;
@@ -2107,15 +2124,19 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
struct intel_iommu *iommu = dmaru->iommu;
+ struct iommu_hw_ser *iommu_ser = NULL;
int ret;
+ /* Use IOMMU HW unit MMIO base to identify the preserved state. */
+ iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL);
+
/*
* Disable translation if already enabled prior to OS handover.
*/
- if (iommu->gcmd & DMA_GCMD_TE)
+ if (!iommu_ser && iommu->gcmd & DMA_GCMD_TE)
iommu_disable_translation(iommu);
- ret = iommu_alloc_root_entry(iommu);
+ ret = iommu_alloc_root_entry(iommu, iommu_ser);
if (ret)
goto out;
@@ -2150,9 +2171,10 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
return 0;
disable_iommu:
- disable_dmar_iommu(iommu);
+ if (!iommu_ser)
+ disable_dmar_iommu(iommu);
out:
- free_dmar_iommu(iommu);
+ free_dmar_iommu(iommu, iommu_ser);
return ret;
}
@@ -2160,6 +2182,7 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
int ret = 0;
struct intel_iommu *iommu = dmaru->iommu;
+ struct iommu_hw_ser *iommu_ser;
if (!intel_iommu_enabled)
return 0;
@@ -2169,8 +2192,12 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
if (insert) {
ret = intel_iommu_add(dmaru);
} else {
- disable_dmar_iommu(iommu);
- free_dmar_iommu(iommu);
+ iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL);
+
+ if (!iommu_ser)
+ disable_dmar_iommu(iommu);
+
+ free_dmar_iommu(iommu, iommu_ser);
}
return ret;
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 5e0bc17e76bf..b0ec0b471a43 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1293,6 +1293,8 @@ int intel_iommu_preserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser);
void intel_iommu_unpreserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser);
+void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu,
+ struct iommu_hw_ser *iommu_ser);
#else
static inline int intel_iommu_preserve_device(struct device *dev,
struct iommu_device_ser *device_ser)
@@ -1310,6 +1312,11 @@ static inline void intel_iommu_unpreserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser)
{
}
+
+static inline void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu,
+ struct iommu_hw_ser *iommu_ser)
+{
+}
#endif
#ifdef CONFIG_INTEL_IOMMU_SVM
diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c
index 75fa68b701bf..50a63812533f 100644
--- a/drivers/iommu/intel/liveupdate.c
+++ b/drivers/iommu/intel/liveupdate.c
@@ -83,6 +83,63 @@ static int preserve_iommu_context_table(struct intel_iommu *iommu)
return ret;
}
+/*
+ * Restore the preserved folio backing the context table that @iommu's root
+ * table references for (@bus, @devfn), if such a table exists. A failed
+ * restore is fatal.
+ */
+static void restore_context_table(struct intel_iommu *iommu, int bus, int devfn)
+{
+	struct context_entry *ctx = iommu_context_addr(iommu, bus, devfn, 0);
+
+	if (ctx)
+		BUG_ON(!kho_restore_folio(virt_to_phys(ctx)));
+}
+
+/*
+ * Walk every root entry of @iommu and restore the context tables it points
+ * to. When scalable mode is supported a second lookup is done at devfn 0x80
+ * (presumably the upper-half context table - confirm against
+ * iommu_context_addr()).
+ */
+static void restore_iommu_context(struct intel_iommu *iommu)
+{
+	int bus;
+
+	for (bus = 0; bus < ROOT_ENTRY_NR; bus++) {
+		restore_context_table(iommu, bus, 0);
+
+		if (sm_supported(iommu))
+			restore_context_table(iommu, bus, 0x80);
+	}
+}
+
+/*
+ * Iterator callback for iommu_for_each_preserved_device(): reserve, in the
+ * domain ID allocator of the IOMMU passed through @arg, the domain ID recorded
+ * in the serialized device @ser. Entries that belong to a non-Intel IOMMU or
+ * to a different IOMMU HW unit are skipped. Always returns 0.
+ */
+static int _restore_used_domain_ids(struct iommu_device_ser *ser, void *arg)
+{
+ int id = ser->domain_iommu_ser.attachment_id;
+ struct iommu_hw_ser *iommu_hw_ser;
+ struct intel_iommu *iommu = arg;
+
+ iommu_hw_ser = phys_to_virt(ser->domain_iommu_ser.iommu_phys);
+ if (iommu_hw_ser->type != IOMMU_INTEL)
+ return 0;
+
+ /* Only allocate domain ID from associated IOMMU HW unit */
+ if (iommu_hw_ser->intel.phys_addr != iommu->reg_phys)
+ return 0;
+
+ /*
+ * This can fail because multiple preserved devices can share the same
+ * domain ID. Since this is done during DMAR init, these failures can
+ * be ignored.
+ */
+ ida_alloc_range(&iommu->domain_ida, id, id, GFP_ATOMIC);
+ return 0;
+}
+
+/*
+ * intel_iommu_liveupdate_restore_root_table - Adopt a preserved root table
+ * @iommu: IOMMU whose root_entry pointer is set from the preserved state.
+ * @iommu_ser: Serialized state of this IOMMU HW unit.
+ *
+ * Points @iommu at the root table recorded in @iommu_ser and reserves the
+ * domain IDs used by preserved devices. The folio restore of the root table
+ * and of the context tables is done only while the restored flag in
+ * @iommu_ser is clear, so it is not repeated on later calls with the same
+ * serialized state.
+ */
+void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu,
+ struct iommu_hw_ser *iommu_ser)
+{
+ if (!iommu_ser->intel.restored)
+ BUG_ON(!kho_restore_folio(iommu_ser->intel.root_table));
+
+ /* Must be set before restore_iommu_context(), which walks the root table. */
+ iommu->root_entry = __va(iommu_ser->intel.root_table);
+
+ if (!iommu_ser->intel.restored)
+ restore_iommu_context(iommu);
+
+ iommu_ser->intel.restored = 1;
+ /* NOTE(review): the iterator's return value is not checked - confirm intended. */
+ iommu_for_each_preserved_device(_restore_used_domain_ids, iommu);
+}
+
int intel_iommu_preserve_device(struct device *dev,
struct iommu_device_ser *device_ser)
{
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 10/16] iommu: Restore and reattach preserved domains to devices
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (8 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 09/16] iommu/vt-d: Restore IOMMU state and reclaimed domain ids Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 11/16] iommu/vt-d: preserve PASID table of preserved device Samiullah Khawaja
` (5 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Restore the preserved domains by restoring their page tables using the
restore IOMMU domain op. Reattach each preserved domain to its device
during default domain setup. While attaching, reuse the domain ID that was
used in the previous kernel. The context entry setup is not needed because
the context entries are preserved across the live update.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/intel/iommu.c | 49 ++++++++++++++------
drivers/iommu/intel/iommu.h | 3 +-
drivers/iommu/intel/nested.c | 2 +-
drivers/iommu/iommu.c | 61 ++++++++++++++++++++++++-
drivers/iommu/liveupdate.c | 78 ++++++++++++++++++++++++++++++++
include/linux/iommu-liveupdate.h | 50 ++++++++++++++++++++
6 files changed, 224 insertions(+), 19 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 4118a0861f38..b90757164cd8 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1031,7 +1031,8 @@ static bool first_level_by_default(struct intel_iommu *iommu)
return true;
}
-int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu,
+ int restore_did)
{
struct iommu_domain_info *info, *curr;
int num, ret = -ENOSPC;
@@ -1051,8 +1052,11 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
return 0;
}
- num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
- cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
+ if (restore_did >= IDA_START_DID)
+ num = restore_did;
+ else
+ num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
+ cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
if (num < 0) {
pr_err("%s: No free domain ids\n", iommu->name);
goto err_unlock;
@@ -1320,10 +1324,14 @@ static int dmar_domain_attach_device(struct dmar_domain *domain,
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
+ struct device_ser *device_ser = NULL;
unsigned long flags;
int ret;
- ret = domain_attach_iommu(domain, iommu);
+ device_ser = dev_iommu_restored_state(dev);
+
+ ret = domain_attach_iommu(domain, iommu,
+ dev_iommu_restore_did(dev, &domain->domain));
if (ret)
return ret;
@@ -1336,16 +1344,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain,
if (dev_is_real_dma_subdevice(dev))
return 0;
- if (!sm_supported(iommu))
- ret = domain_context_mapping(domain, dev);
- else if (intel_domain_is_fs_paging(domain))
- ret = domain_setup_first_level(iommu, domain, dev,
- IOMMU_NO_PASID, NULL);
- else if (intel_domain_is_ss_paging(domain))
- ret = domain_setup_second_level(iommu, domain, dev,
- IOMMU_NO_PASID, NULL);
- else if (WARN_ON(true))
- ret = -EINVAL;
+ if (!device_ser) {
+ if (!sm_supported(iommu))
+ ret = domain_context_mapping(domain, dev);
+ else if (intel_domain_is_fs_paging(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (intel_domain_is_ss_paging(domain))
+ ret = domain_setup_second_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
+ }
if (ret)
goto out_block_translation;
@@ -3170,6 +3180,15 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
struct intel_iommu *iommu = info->iommu;
int ret = -EINVAL;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ /*
+ * Restored IOMMU domains are already attached to the device and can
+ * only be freed. So no need to check the compatibility.
+ */
+ if (iommu_domain_restored_state(domain))
+ return 0;
+#endif
+
if (intel_domain_is_fs_paging(dmar_domain))
ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
else if (intel_domain_is_ss_paging(dmar_domain))
@@ -3647,7 +3666,7 @@ domain_add_dev_pasid(struct iommu_domain *domain,
if (!dev_pasid)
return ERR_PTR(-ENOMEM);
- ret = domain_attach_iommu(dmar_domain, iommu);
+ ret = domain_attach_iommu(dmar_domain, iommu, -1);
if (ret)
goto out_free;
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index b0ec0b471a43..8e37acf7de12 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1182,7 +1182,8 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
-int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu,
+ int restore_did);
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void device_block_translation(struct device *dev);
int paging_domain_compatible(struct iommu_domain *domain, struct device *dev);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 2b979bec56ce..6e13f697b463 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
return ret;
}
- ret = domain_attach_iommu(dmar_domain, iommu);
+ ret = domain_attach_iommu(dmar_domain, iommu, -1);
if (ret) {
dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
return ret;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0561990f46e3..e888700da53d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -18,6 +18,7 @@
#include <linux/errno.h>
#include <linux/host1x_context_bus.h>
#include <linux/iommu.h>
+#include <linux/iommu-liveupdate.h>
#include <linux/iommufd.h>
#include <linux/idr.h>
#include <linux/err.h>
@@ -505,6 +506,10 @@ static int iommu_init_device(struct device *dev)
goto err_free;
}
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ dev->iommu->device_ser = iommu_get_device_preserved_data(dev);
+#endif
+
iommu_dev = ops->probe_device(dev);
if (IS_ERR(iommu_dev)) {
ret = PTR_ERR(iommu_dev);
@@ -2204,6 +2209,13 @@ static int __iommu_attach_device(struct iommu_domain *domain,
ret = domain->ops->attach_dev(domain, dev, old);
if (ret)
return ret;
+
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ /* The associated state can be unset once restored. */
+ if (dev_iommu_restored_state(dev))
+ WRITE_ONCE(dev->iommu->device_ser, NULL);
+#endif
+
dev->iommu->attach_deferred = 0;
trace_attach_device_to_domain(dev);
return 0;
@@ -3159,6 +3171,47 @@ int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids)
}
EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+/*
+ * Return the restored (incoming) serialized state of the first device in
+ * @group, or NULL when that device is not a PCI device or has no such state.
+ */
+static inline void *__iommu_group_restored_state(struct iommu_group *group)
+{
+	struct device *first = iommu_group_first_dev(group);
+
+	return dev_is_pci(first) ? dev_iommu_restored_state(first) : NULL;
+}
+
+/*
+ * __iommu_group_restore_domain - Restore the preserved domain of a group
+ * @group: Group whose first device carries restored (incoming) state.
+ *
+ * Must be called with group->mutex held. On success the group is marked as
+ * owned (owner_cnt = 1) by the owner token associated with the preserved
+ * state.
+ *
+ * Return: The restored domain, or NULL when the first device is not a PCI
+ * device, has no restored state, or the restore failed (a warning is emitted
+ * in the failure case).
+ */
+static struct iommu_domain *__iommu_group_restore_domain(struct iommu_group *group)
+{
+	struct iommu_device_ser *device_ser;
+	struct iommu_domain *domain;
+	struct device *dev;
+	void *owner;
+
+	lockdep_assert_held(&group->mutex);
+	dev = iommu_group_first_dev(group);
+	if (!dev_is_pci(dev))
+		return NULL;
+
+	device_ser = dev_iommu_restored_state(dev);
+	if (!device_ser)
+		return NULL;
+
+	/*
+	 * Do not rely on iommu_restore_domain() writing the owner on every
+	 * successful return. Seed it with the device's serialized state, the
+	 * token the preserved state is owned by, so that group->owner is never
+	 * assigned from an uninitialized variable.
+	 */
+	owner = device_ser;
+
+	domain = iommu_restore_domain(dev, device_ser, &owner);
+	if (WARN_ON(IS_ERR(domain)))
+		return NULL;
+
+	/*
+	 * Ownership of groups with preserved devices is set during boot. These
+	 * will be reclaimed later by the entity (iommufd) that preserved them.
+	 */
+	WARN_ON(group->owner);
+	group->owner = owner;
+	group->owner_cnt = 1;
+	return domain;
+}
+
/**
* iommu_setup_default_domain - Set the default_domain for the group
* @group: Group to change
@@ -3173,8 +3226,8 @@ static int iommu_setup_default_domain(struct iommu_group *group,
int target_type)
{
struct iommu_domain *old_dom = group->default_domain;
+ struct iommu_domain *dom, *restored_domain;
struct group_device *gdev;
- struct iommu_domain *dom;
bool direct_failed;
int req_type;
int ret;
@@ -3218,6 +3271,10 @@ static int iommu_setup_default_domain(struct iommu_group *group,
/* We must set default_domain early for __iommu_device_set_domain */
group->default_domain = dom;
if (!group->domain) {
+ if (__iommu_group_restored_state(group))
+ restored_domain = __iommu_group_restore_domain(group);
+ else
+ restored_domain = dom;
/*
* Drivers are not allowed to fail the first domain attach.
* The only way to recover from this is to fail attaching the
@@ -3225,7 +3282,7 @@ static int iommu_setup_default_domain(struct iommu_group *group,
* in group->default_domain so it is freed after.
*/
ret = __iommu_group_set_domain_internal(
- group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED);
+ group, restored_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED);
if (WARN_ON(ret))
goto out_free_old;
} else {
diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c
index 60ee29b0c6bd..0888871784ea 100644
--- a/drivers/iommu/liveupdate.c
+++ b/drivers/iommu/liveupdate.c
@@ -234,6 +234,41 @@ int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn,
}
EXPORT_SYMBOL(iommu_for_each_preserved_device);
+/*
+ * Return true when the serialized device @match describes @pdev, i.e. both
+ * the PCI device ID and the PCI domain number agree.
+ */
+static inline bool match_device_ser(struct iommu_device_ser *match,
+				    struct pci_dev *pdev)
+{
+	if (match->devid != pci_dev_id(pdev))
+		return false;
+
+	return match->pci_domain_nr == pci_domain_nr(pdev->bus);
+}
+
+/**
+ * iommu_get_device_preserved_data - Look up the serialized state of a device
+ * @dev: Device to look up; only PCI devices are considered.
+ *
+ * Walks the array located at device_array_phys in the incoming FLB object and
+ * stops at the first entry that match_device_ser() accepts for @dev. The
+ * matching entry is marked as incoming as a side effect.
+ *
+ * Return: Pointer to the matching entry, or NULL when @dev is not a PCI
+ * device, the incoming FLB object cannot be obtained, or no entry matches.
+ */
+struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev)
+{
+ struct iommu_device_ser *device_ser = NULL;
+ struct iommu_device_array_ser *array;
+ struct iommu_flb_obj *flb_obj;
+ int ret, idx;
+
+ if (!dev_is_pci(dev))
+ return NULL;
+
+ ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj);
+ if (ret)
+ return NULL;
+
+ array = phys_to_virt(flb_obj->ser->device_array_phys);
+ iommu_liveupdate_for_each_obj(array, device_ser, idx) {
+ if (match_device_ser(device_ser, to_pci_dev(dev))) {
+ /* Flag the entry so dev_iommu_restored_state() reports it. */
+ device_ser->hdr.incoming = true;
+ goto out;
+ }
+ }
+
+ /* No match: do not leak the iterator cursor to the caller. */
+ device_ser = NULL;
+out:
+ liveupdate_flb_put_incoming(&iommu_flb);
+ return device_ser;
+}
+EXPORT_SYMBOL(iommu_get_device_preserved_data);
+
struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type)
{
struct iommu_hw_ser *iommu_ser = NULL;
@@ -512,3 +547,46 @@ void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev)
iommu_unpreserve_locked(iommu->iommu_dev, flb_obj);
}
+
+/**
+ * iommu_restore_domain - Restore the preserved domain a device was attached to
+ * @dev: Device used to allocate the paging domain.
+ * @ser: Serialized state of @dev; it references the serialized domain.
+ * @owner: Output; set to @ser on every successful return.
+ *
+ * The first call for a given serialized domain allocates a paging domain,
+ * invokes the domain's restore op on it and caches the result in the
+ * serialized domain. Later calls return the cached domain. The lookup and the
+ * restore are serialized by the FLB object lock.
+ *
+ * Return: The restored domain, or an ERR_PTR() when the incoming FLB object
+ * cannot be obtained, the domain allocation fails, or the restore op fails.
+ * @owner is left untouched on failure.
+ */
+struct iommu_domain *iommu_restore_domain(struct device *dev,
+					  struct iommu_device_ser *ser,
+					  void **owner)
+{
+	struct iommu_domain_ser *domain_ser;
+	struct iommu_flb_obj *flb_obj;
+	struct iommu_domain *domain;
+	int ret;
+
+	domain_ser = phys_to_virt(ser->domain_iommu_ser.domain_phys);
+
+	ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj);
+	if (ret)
+		return ERR_PTR(ret);
+
+	guard(mutex)(&flb_obj->lock);
+	if (domain_ser->restored_domain) {
+		/* Already restored on behalf of another device: reuse it. */
+		domain = domain_ser->restored_domain;
+		goto out_owner;
+	}
+
+	domain_ser->hdr.incoming = true;
+	domain = iommu_paging_domain_alloc(dev);
+	if (IS_ERR(domain))
+		goto out;
+
+	ret = domain->ops->restore(domain, domain_ser);
+	if (ret) {
+		iommu_domain_free(domain);
+		domain = ERR_PTR(ret);
+		goto out;
+	}
+
+	domain->preserved_state = domain_ser;
+	domain_ser->restored_domain = domain;
+
+out_owner:
+	/*
+	 * The device is owned by the preserved state. Report the owner on the
+	 * reuse path as well, so callers never consume an unset value.
+	 */
+	*owner = ser;
+out:
+	liveupdate_flb_put_incoming(&iommu_flb);
+	return domain;
+}
diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h
index 0baf6bc2d93f..75d27256c883 100644
--- a/include/linux/iommu-liveupdate.h
+++ b/include/linux/iommu-liveupdate.h
@@ -30,6 +30,20 @@ static inline void *dev_iommu_preserved_state(struct device *dev)
return NULL;
}
+/*
+ * Return the serialized state recorded for @dev when it is marked as
+ * incoming. Return NULL when @dev has no IOMMU data, no serialized state, or
+ * the state is not incoming.
+ */
+static inline void *dev_iommu_restored_state(struct device *dev)
+{
+	struct iommu_device_ser *state;
+
+	state = dev->iommu ? dev->iommu->device_ser : NULL;
+	if (!state || !state->hdr.incoming)
+		return NULL;
+
+	return state;
+}
+
static inline void *iommu_domain_restored_state(struct iommu_domain *domain)
{
struct iommu_domain_ser *ser;
@@ -41,8 +55,22 @@ static inline void *iommu_domain_restored_state(struct iommu_domain *domain)
return NULL;
}
+/*
+ * Return the attachment ID recorded in the restored state of @dev when both
+ * @dev and @domain carry restored state; return -1 otherwise.
+ */
+static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain)
+{
+	struct iommu_device_ser *dev_state = dev_iommu_restored_state(dev);
+
+	if (!dev_state || !iommu_domain_restored_state(domain))
+		return -1;
+
+	return dev_state->domain_iommu_ser.attachment_id;
+}
+
+struct iommu_domain *iommu_restore_domain(struct device *dev,
+ struct iommu_device_ser *ser,
+ void **owner);
int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn,
void *arg);
+struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev);
struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type);
int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser);
void iommu_domain_unpreserve(struct iommu_domain *domain);
@@ -60,16 +88,38 @@ static inline void *dev_iommu_preserved_state(struct device *dev)
return NULL;
}
+static inline void *dev_iommu_restored_state(struct device *dev)
+{
+ return NULL;
+}
+
+static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain)
+{
+ return -1;
+}
+
static inline void *iommu_domain_restored_state(struct iommu_domain *domain)
{
return NULL;
}
+static inline struct iommu_domain *iommu_restore_domain(struct device *dev,
+ struct iommu_device_ser *ser,
+ void **owner)
+{
+ return NULL;
+}
+
static inline int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg)
{
return -EOPNOTSUPP;
}
+static inline struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev)
+{
+ return NULL;
+}
+
static inline struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type)
{
return NULL;
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 11/16] iommu/vt-d: preserve PASID table of preserved device
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (9 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 10/16] iommu: Restore and reattach preserved domains to devices Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 12/16] iommufd: Implement ioctl to mark HWPT for preservation Samiullah Khawaja
` (4 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
In scalable mode, the PASID table is used to locate the I/O page tables.
Preserve and restore the PASID table of each preserved device.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/intel/iommu.c | 5 +-
drivers/iommu/intel/iommu.h | 12 +++
drivers/iommu/intel/liveupdate.c | 141 +++++++++++++++++++++++++++++++
drivers/iommu/intel/pasid.c | 7 +-
drivers/iommu/intel/pasid.h | 9 ++
include/linux/kho/abi/iommu.h | 13 +++
6 files changed, 184 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b90757164cd8..6d42051dcf7c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2951,8 +2951,10 @@ static int clear_unpreserve_context_entry_fn(struct device *dev,
if (!info)
return 0;
- if (dev_is_pci(dev) && dev_iommu_preserved_state(dev))
+ if (dev_is_pci(dev) && dev_iommu_preserved_state(dev)) {
+ pasid_cleanup_preserved_table(dev);
return 0;
+ }
domain_context_clear(info);
return 0;
@@ -4013,6 +4015,7 @@ const struct iommu_ops intel_iommu_ops = {
.page_response = intel_iommu_page_response,
#ifdef CONFIG_IOMMU_LIVEUPDATE
.preserve_device = intel_iommu_preserve_device,
+ .unpreserve_device = intel_iommu_unpreserve_device,
.preserve = intel_iommu_preserve,
.unpreserve = intel_iommu_unpreserve,
#endif
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 8e37acf7de12..62076a1a0b4d 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1290,12 +1290,15 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new,
#ifdef CONFIG_IOMMU_LIVEUPDATE
int intel_iommu_preserve_device(struct device *dev,
struct iommu_device_ser *device_ser);
+void intel_iommu_unpreserve_device(struct device *dev,
+ struct iommu_device_ser *device_ser);
int intel_iommu_preserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser);
void intel_iommu_unpreserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser);
void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu,
struct iommu_hw_ser *iommu_ser);
+void pasid_cleanup_preserved_table(struct device *dev);
#else
static inline int intel_iommu_preserve_device(struct device *dev,
struct iommu_device_ser *device_ser)
@@ -1303,6 +1306,11 @@ static inline int intel_iommu_preserve_device(struct device *dev,
return -EOPNOTSUPP;
}
+static inline void intel_iommu_unpreserve_device(struct device *dev,
+ struct iommu_device_ser *device_ser)
+{
+}
+
static inline int intel_iommu_preserve(struct iommu_device *iommu,
struct iommu_hw_ser *iommu_ser)
{
@@ -1318,6 +1326,10 @@ static inline void intel_iommu_liveupdate_restore_root_table(struct intel_iommu
struct iommu_hw_ser *iommu_ser)
{
}
+
+static inline void pasid_cleanup_preserved_table(struct device *dev)
+{
+}
#endif
#ifdef CONFIG_INTEL_IOMMU_SVM
diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c
index 50a63812533f..404b485e97b9 100644
--- a/drivers/iommu/intel/liveupdate.c
+++ b/drivers/iommu/intel/liveupdate.c
@@ -14,6 +14,7 @@
#include <linux/pci.h>
#include "iommu.h"
+#include "pasid.h"
#include "../iommu-pages.h"
static void unpreserve_iommu_context_table(struct intel_iommu *iommu, int end)
@@ -140,10 +141,96 @@ void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu,
iommu_for_each_preserved_device(_restore_used_domain_ids, iommu);
}
+/* Operation that pasid_lu_do_op() applies to a single PASID table page. */
+enum pasid_lu_op {
+ /* Preserve the page via iommu_preserve_page(). */
+ PASID_LU_OP_PRESERVE = 1,
+ /* Undo a preservation via iommu_unpreserve_page(). */
+ PASID_LU_OP_UNPRESERVE,
+ /* Restore the page via iommu_restore_page(). */
+ PASID_LU_OP_RESTORE,
+ /* Release the page via iommu_free_pages(). */
+ PASID_LU_OP_FREE,
+};
+
+/*
+ * Apply the live update operation @op to the page at @table.
+ *
+ * Return: The result of iommu_preserve_page() for PASID_LU_OP_PRESERVE; 0 for
+ * every other operation, including an @op value not listed in the switch.
+ */
+static int pasid_lu_do_op(void *table, enum pasid_lu_op op)
+{
+ int ret = 0;
+
+ switch (op) {
+ case PASID_LU_OP_PRESERVE:
+ ret = iommu_preserve_page(table);
+ break;
+ case PASID_LU_OP_UNPRESERVE:
+ iommu_unpreserve_page(table);
+ break;
+ case PASID_LU_OP_RESTORE:
+ /*
+ * NOTE(review): any result of iommu_restore_page() is dropped here, so
+ * a restore can never be reported as failed - confirm intended.
+ */
+ iommu_restore_page(virt_to_phys(table));
+ break;
+ case PASID_LU_OP_FREE:
+ iommu_free_pages(table);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Apply @op to the PASID directory @dir and to the PASID table referenced by
+ * its first entry. Only that first table is handled (the one used for
+ * NO_PASID). The table is processed before the directory; when preserving the
+ * directory fails, the preservation of the table is rolled back.
+ *
+ * Return: 0 on success, -EINVAL when the first directory entry has no table,
+ * or the error reported by pasid_lu_do_op().
+ */
+static int pasid_lu_handle_pd(struct pasid_dir_entry *dir, enum pasid_lu_op op)
+{
+	struct pasid_entry *first_table = get_pasid_table_from_pde(&dir[0]);
+	int err;
+
+	if (!first_table)
+		return -EINVAL;
+
+	err = pasid_lu_do_op(first_table, op);
+	if (err)
+		return err;
+
+	err = pasid_lu_do_op(dir, op);
+	if (err && op == PASID_LU_OP_PRESERVE)
+		pasid_lu_do_op(first_table, PASID_LU_OP_UNPRESERVE);
+
+	return err;
+}
+
+/*
+ * pasid_cleanup_preserved_table - Trim the PASID structures of a preserved device
+ * @dev: Device whose PASID directory and first PASID table are trimmed.
+ *
+ * Zeroes every entry except the first one in both the first PASID table and
+ * the PASID directory, then flushes both ranges from the CPU caches. Does
+ * nothing when @dev has no PASID table or the first directory entry does not
+ * reference a table.
+ */
+void pasid_cleanup_preserved_table(struct device *dev)
+{
+	struct pasid_table *pasid_table;
+	struct pasid_dir_entry *dir;
+	struct pasid_entry *table;
+	size_t dir_size;
+
+	pasid_table = intel_pasid_get_table(dev);
+	if (!pasid_table)
+		return;
+
+	dir = pasid_table->table;
+	table = get_pasid_table_from_pde(&dir[0]);
+	if (!table)
+		return;
+
+	/* Clear everything except the first entry in table. */
+	memset(&table[1], 0, SZ_4K - sizeof(*table));
+
+	/*
+	 * Use the folio order to calculate the size of the PASID directory.
+	 * Shift PAGE_SIZE rather than the int constant 1 so the arithmetic is
+	 * done in unsigned long.
+	 */
+	dir_size = PAGE_SIZE << folio_order(virt_to_folio(dir));
+
+	/* Clear everything except the first entry in directory */
+	memset(&dir[1], 0, dir_size - sizeof(struct pasid_dir_entry));
+
+	clflush_cache_range(&table[0], SZ_4K);
+	clflush_cache_range(&dir[0], dir_size);
+}
+
int intel_iommu_preserve_device(struct device *dev,
struct iommu_device_ser *device_ser)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct pasid_table *pasid_table;
+ int ret;
if (!dev_is_pci(dev)) {
dev_err(dev, "Cannot preserve non-PCI device\n");
@@ -155,9 +242,45 @@ int intel_iommu_preserve_device(struct device *dev,
device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain,
info->iommu);
+
+ if (!sm_supported(info->iommu))
+ return 0;
+
+ pasid_table = intel_pasid_get_table(dev);
+ if (!pasid_table)
+ return -EINVAL;
+
+ ret = pasid_lu_handle_pd(pasid_table->table, PASID_LU_OP_PRESERVE);
+ if (ret)
+ return ret;
+
+ device_ser->intel.pasid_table = virt_to_phys(pasid_table->table);
+ device_ser->intel.max_pasid = pasid_table->max_pasid;
return 0;
}
+/*
+ * intel_iommu_unpreserve_device - Undo the PASID table preservation of a device
+ * @dev: Device being unpreserved.
+ * @device_ser: Serialized state of @dev (not used here).
+ *
+ * Only acts on PCI devices that have driver private data and sit behind an
+ * IOMMU supporting scalable mode; everything else is a no-op.
+ */
+void intel_iommu_unpreserve_device(struct device *dev,
+				   struct iommu_device_ser *device_ser)
+{
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct pasid_table *tbl;
+
+	if (!dev_is_pci(dev) || !info || !sm_supported(info->iommu))
+		return;
+
+	tbl = intel_pasid_get_table(dev);
+	if (tbl)
+		pasid_lu_handle_pd(tbl->table, PASID_LU_OP_UNPRESERVE);
+}
+
int intel_iommu_preserve(struct iommu_device *iommu_dev,
struct iommu_hw_ser *ser)
{
@@ -194,3 +317,21 @@ void intel_iommu_unpreserve(struct iommu_device *iommu_dev,
unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR);
iommu_unpreserve_page(iommu->root_entry);
}
+
+/*
+ * intel_pasid_try_restore_table - Reuse the preserved PASID directory of a device
+ * @dev: Device being set up.
+ * @max_pasid: max_pasid value the caller would use for a fresh table.
+ *
+ * Return: Virtual address of the preserved PASID directory, or NULL when @dev
+ * has no restored state or the preserved max_pasid differs from @max_pasid.
+ * In the mismatch case a one-time warning is emitted and the FREE operation
+ * is applied to the preserved directory and its first table.
+ */
+void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid)
+{
+ struct iommu_device_ser *ser = dev_iommu_restored_state(dev);
+
+ if (!ser)
+ return NULL;
+
+ /* A failure to restore the preserved PASID structures is fatal. */
+ BUG_ON(pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table),
+ PASID_LU_OP_RESTORE));
+ if (WARN_ON_ONCE(ser->intel.max_pasid != max_pasid)) {
+ /*
+ * NOTE(review): this releases tables that were preserved for the
+ * device - confirm nothing still references them at this point.
+ */
+ pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table),
+ PASID_LU_OP_FREE);
+ return NULL;
+ }
+
+ return phys_to_virt(ser->intel.pasid_table);
+}
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 89541b74ab8c..5cac8e95f73b 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -60,8 +60,11 @@ int intel_pasid_alloc_table(struct device *dev)
size = max_pasid >> (PASID_PDE_SHIFT - 3);
order = size ? get_order(size) : 0;
- dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL,
- 1 << (order + PAGE_SHIFT));
+
+ dir = intel_pasid_try_restore_table(dev, 1 << (order + PAGE_SHIFT + 3));
+ if (!dir)
+ dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL,
+ 1 << (order + PAGE_SHIFT));
if (!dir) {
kfree(pasid_table);
return -ENOMEM;
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 48d3bb6b68de..44e673a4ad8f 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -301,6 +301,15 @@ static inline void pasid_set_eafe(struct pasid_entry *pe)
extern unsigned int intel_pasid_max_id;
int intel_pasid_alloc_table(struct device *dev);
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid);
+#else /* !CONFIG_IOMMU_LIVEUPDATE */
+static inline void *intel_pasid_try_restore_table(struct device *dev,
+ u64 max_pasid)
+{
+ return NULL; /* nothing to restore; intel_pasid_alloc_table() then allocates a new table */
+}
+#endif /* CONFIG_IOMMU_LIVEUPDATE */
void intel_pasid_free_table(struct device *dev);
struct pasid_table *intel_pasid_get_table(struct device *dev);
int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev,
diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h
index 5ffedf0dbd5a..5eeb1e0c9bce 100644
--- a/include/linux/kho/abi/iommu.h
+++ b/include/linux/kho/abi/iommu.h
@@ -119,6 +119,16 @@ struct iommu_dev_map_ser {
u64 iommu_phys;
} __packed;
+/**
+ * struct iommu_device_intel_ser - Intel specific state of serialized device
+ * @pasid_table: Physical address of the device's PASID table
+ * @max_pasid: max_pasid value of the preserved PASID table; it is compared
+ *             against the newly computed value when the table is restored
+ */
+struct iommu_device_intel_ser {
+ u64 pasid_table;
+ u64 max_pasid;
+} __packed;
+
/**
* struct iommu_device_ser - Serialized state of a device
* @hdr: Common object header
@@ -131,6 +141,9 @@ struct iommu_device_ser {
u32 devid;
u32 pci_domain_nr;
struct iommu_dev_map_ser domain_iommu_ser;
+ union {
+ struct iommu_device_intel_ser intel;
+ };
} __packed;
/**
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 12/16] iommufd: Implement ioctl to mark HWPT for preservation
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (10 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 11/16] iommu/vt-d: preserve PASID table of preserved device Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 13/16] iommufd: Persist iommu hardware pagetables for live update Samiullah Khawaja
` (3 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: YiFei Zhu, Samiullah Khawaja, Robin Murphy, Kevin Tian,
Alex Williamson, Shuah Khan, iommu, linux-kernel, kvm,
Saeed Mahameed, Adithya Jayachandran, Parav Pandit,
Leon Romanovsky, William Tu, Pratyush Yadav, Pasha Tatashin,
David Matlack, Andrew Morton, Chris Li, Pranjal Shrivastava,
Vipin Sharma
From: YiFei Zhu <zhuyifei@google.com>
Userspace provides a token to mark the HWPT for preservation. Note that
this token is not the LUO token that is used to preserve the iommufd.
Once all the required HWPTs are marked for preservation, the user can
preserve the iommufd into LUO. The iommufd will preserve the HWPTs that
are marked for preservation.
The marked HWPTs are tracked using a new XArray mark protected by a new
liveupdate mutex. This mutex will also be used during iommufd
preservation to protect against any race with the mark preserve ioctl.
The HWPT token will be used during restore to identify this HWPT. The
restoration logic is not implemented and will be added later.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
MAINTAINERS | 1 +
drivers/iommu/iommufd/Makefile | 1 +
drivers/iommu/iommufd/iommufd_private.h | 18 +++++++++
drivers/iommu/iommufd/liveupdate.c | 52 +++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 9 +++++
include/uapi/linux/iommufd.h | 26 +++++++++++++
6 files changed, 107 insertions(+)
create mode 100644 drivers/iommu/iommufd/liveupdate.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 9f5c02c6c8c1..bf6a2ad61989 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13496,6 +13496,7 @@ R: Pranjal Shrivastava <praan@google.com>
L: iommu@lists.linux.dev
S: Maintained
F: drivers/iommu/intel/liveupdate.c
+F: drivers/iommu/iommufd/liveupdate.c
F: drivers/iommu/liveupdate.c
F: include/linux/iommu-liveupdate.h
F: include/linux/kho/abi/iommu.h
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 71d692c9a8f4..c3bf0b6452d3 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
iommufd_driver-y := driver.o
obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o
+obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 6ac1965199e9..111f4d42e210 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -44,6 +44,11 @@ struct iommufd_ctx {
struct file *file;
struct xarray objects;
struct xarray groups;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+#define IOMMUFD_OBJ_LIVEUPDATE_MARK XA_MARK_1
+ /* @liveupdate_mutex: Protects the preservation of HWPTs. */
+ struct mutex liveupdate_mutex;
+#endif
wait_queue_head_t destroy_wait;
struct rw_semaphore ioas_creation_lock;
struct maple_tree mt_mmap;
@@ -373,6 +378,10 @@ struct iommufd_hwpt_paging {
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
bool nest_parent : 1;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ bool liveupdate_preserve : 1;
+ u64 liveupdate_token;
+#endif
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
struct iommufd_sw_msi_maps present_sw_msi;
@@ -706,6 +715,15 @@ iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
struct iommufd_vdevice, obj);
}
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd);
+#else /* !CONFIG_IOMMU_LIVEUPDATE */
+static inline int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
+{
+ return -ENOTTY; /* the ioctl is rejected when live update support is compiled out */
+}
+#endif /* CONFIG_IOMMU_LIVEUPDATE */
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/liveupdate.c b/drivers/iommu/iommufd/liveupdate.c
new file mode 100644
index 000000000000..2d3abfa9e9f8
--- /dev/null
+++ b/drivers/iommu/iommufd/liveupdate.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/file.h>
+#include <linux/iommufd.h>
+#include <linux/liveupdate.h>
+
+#include "iommufd_private.h"
+
+/*
+ * IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE ioctl handler.
+ *
+ * Looks up the paging HWPT named by cmd->hwpt_id, sets
+ * IOMMUFD_OBJ_LIVEUPDATE_MARK on its xarray entry and records
+ * cmd->hwpt_token in it.
+ *
+ * Returns 0 on success, the lookup error if the HWPT cannot be obtained, or
+ * -EADDRINUSE if an already marked HWPT carries the same token.
+ */
+int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hwpt_liveupdate_mark_preserve *cmd = ucmd->cmd;
+	struct iommufd_ctx *ictx = ucmd->ictx;
+	struct iommufd_hwpt_paging *target;
+	struct iommufd_object *cur;
+	bool token_in_use = false;
+	unsigned long id;
+	int rc;
+
+	target = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+	if (IS_ERR(target))
+		return PTR_ERR(target);
+
+	mutex_lock(&ictx->liveupdate_mutex);
+	xa_lock(&ictx->objects);
+
+	/* Tokens must be unique among all marked HWPTs of this iommufd. */
+	xa_for_each_marked(&ictx->objects, id, cur, IOMMUFD_OBJ_LIVEUPDATE_MARK) {
+		struct iommufd_hwpt_paging *marked;
+
+		if (WARN_ON_ONCE(cur->type != IOMMUFD_OBJ_HWPT_PAGING))
+			continue;
+
+		marked = to_hwpt_paging(container_of(cur, struct iommufd_hw_pagetable, obj));
+		if (marked->liveupdate_token == cmd->hwpt_token) {
+			token_in_use = true;
+			break;
+		}
+	}
+
+	if (token_in_use) {
+		rc = -EADDRINUSE;
+	} else {
+		__xa_set_mark(&ictx->objects, target->common.obj.id,
+			      IOMMUFD_OBJ_LIVEUPDATE_MARK);
+		target->liveupdate_token = cmd->hwpt_token;
+		rc = 0;
+	}
+
+	xa_unlock(&ictx->objects);
+	mutex_unlock(&ictx->liveupdate_mutex);
+	iommufd_put_object(ictx, &target->common.obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 8c6d43601afb..0114c1520db4 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -313,6 +313,9 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
init_rwsem(&ictx->ioas_creation_lock);
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ mutex_init(&ictx->liveupdate_mutex);
+#endif
ictx->file = filp;
mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE);
init_waitqueue_head(&ictx->destroy_wait);
@@ -375,6 +378,9 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
* iommufd_object_tombstone_user()
*/
xa_destroy(&ictx->objects);
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ mutex_destroy(&ictx->liveupdate_mutex);
+#endif
WARN_ON(!xa_empty(&ictx->groups));
@@ -420,6 +426,7 @@ union ucmd_buffer {
struct iommu_hwpt_alloc hwpt;
struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
struct iommu_hwpt_invalidate cache;
+ struct iommu_hwpt_liveupdate_mark_preserve mark_preserve;
struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
struct iommu_ioas_alloc alloc;
struct iommu_ioas_allow_iovas allow_iovas;
@@ -493,6 +500,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
__reserved),
IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
struct iommu_viommu_alloc, out_viommu_id),
+ IOCTL_OP(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, iommufd_hwpt_liveupdate_mark_preserve,
+ struct iommu_hwpt_liveupdate_mark_preserve, hwpt_token),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index e998dfbd6960..d96a74b43c9d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -57,6 +57,7 @@ enum {
IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
+ IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE = 0x95,
};
/**
@@ -1347,4 +1348,29 @@ struct iommu_hw_queue_alloc {
__aligned_u64 length;
};
#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
+
+/**
+ * struct iommu_hwpt_liveupdate_mark_preserve - ioctl(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE)
+ * @size: sizeof(struct iommu_hwpt_liveupdate_mark_preserve)
+ * @hwpt_id: Iommufd object ID of the target HWPT; must refer to a paging HWPT
+ * @hwpt_token: Token to identify this HWPT upon restore
+ *
+ * The target HWPT will be preserved during iommufd preservation.
+ * Only file-based memory mappings (e.g. memfd) are supported for HWPTs marked
+ * for preservation. Mapping anonymous memory into a preserved HWPT will result
+ * in a failure during the preservation phase.
+ *
+ * The hwpt_token is chosen by userspace. If the token is already used by a
+ * marked HWPT within this iommufd, the ioctl fails with -EADDRINUSE.
+ *
+ * Note: There is no 'unmark' operation, so any HWPTs pooled in userspace that
+ * are marked for preservation must be destroyed after use.
+ */
+struct iommu_hwpt_liveupdate_mark_preserve {
+ __u32 size;
+ __u32 hwpt_id;
+ __u64 hwpt_token;
+};
+#define IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE)
+
#endif
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 13/16] iommufd: Persist iommu hardware pagetables for live update
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (11 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 12/16] iommufd: Implement ioctl to mark HWPT for preservation Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 14/16] iommufd: Add APIs to preserve/unpreserve a vfio cdev Samiullah Khawaja
` (2 subsequent siblings)
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: YiFei Zhu, Samiullah Khawaja, Robin Murphy, Kevin Tian,
Alex Williamson, Shuah Khan, iommu, linux-kernel, kvm,
Saeed Mahameed, Adithya Jayachandran, Parav Pandit,
Leon Romanovsky, William Tu, Pratyush Yadav, Pasha Tatashin,
David Matlack, Andrew Morton, Chris Li, Pranjal Shrivastava,
Vipin Sharma
From: YiFei Zhu <zhuyifei@google.com>
Register iommufd with the LUO framework and implement the preserve and
unpreserve ops to save marked HWPTs.
To make sure mappings do not change during preserved state, add a
liveupdate_immutable flag to IOAS. When an HWPT is preserved, its IOAS
is marked immutable and any map/unmap attempts will fail with -EBUSY.
This is synchronized using the domains_rwsem to prevent races with
concurrent mapping operations.
The preserve callback iterates over the marked HWPTs, verifies that the
backing memory pages are preserved, and calls iommu_domain_preserve() to
preserve the associated IOMMU domain.
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
MAINTAINERS | 1 +
drivers/iommu/iommufd/io_pagetable.c | 11 +
drivers/iommu/iommufd/io_pagetable.h | 1 +
drivers/iommu/iommufd/iommufd_private.h | 27 ++-
drivers/iommu/iommufd/liveupdate.c | 287 ++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 10 +-
drivers/iommu/iommufd/pages.c | 7 +
include/linux/kho/abi/iommufd.h | 51 +++++
8 files changed, 393 insertions(+), 2 deletions(-)
create mode 100644 include/linux/kho/abi/iommufd.h
diff --git a/MAINTAINERS b/MAINTAINERS
index bf6a2ad61989..6005b737d1c5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13500,6 +13500,7 @@ F: drivers/iommu/iommufd/liveupdate.c
F: drivers/iommu/liveupdate.c
F: include/linux/iommu-liveupdate.h
F: include/linux/kho/abi/iommu.h
+F: include/linux/kho/abi/iommufd.h
IOMMUFD
M: Jason Gunthorpe <jgg@nvidia.com>
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 24d4917105d9..b18dba9dd147 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -384,6 +384,11 @@ int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
return rc;
down_read(&iopt->domains_rwsem);
+ if (iopt_liveupdate_immutable(iopt)) {
+ rc = -EBUSY;
+ goto out_unlock_domains;
+ }
+
rc = iopt_fill_domains_pages(pages_list);
if (rc)
goto out_unlock_domains;
@@ -755,6 +760,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
again:
down_read(&iopt->domains_rwsem);
down_write(&iopt->iova_rwsem);
+
+ if (iopt_liveupdate_immutable(iopt)) {
+ rc = -EBUSY;
+ goto out_unlock_iova;
+ }
+
while ((area = iopt_area_iter_first(iopt, start, last))) {
unsigned long area_last = iopt_area_last_iova(area);
unsigned long area_first = iopt_area_iova(area);
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 27e3e311d395..207ff368d412 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -234,6 +234,7 @@ struct iopt_pages {
struct { /* IOPT_ADDRESS_FILE */
struct file *file;
unsigned long start;
+ u32 seals;
};
/* IOPT_ADDRESS_DMABUF */
struct iopt_pages_dmabuf dmabuf;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 111f4d42e210..3c88aa115d08 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -98,6 +98,9 @@ struct io_pagetable {
/* IOVA that cannot be allocated, struct iopt_reserved */
struct rb_root_cached reserved_itree;
u8 disable_large_pages;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ bool liveupdate_immutable;
+#endif
unsigned long iova_alignment;
};
@@ -379,7 +382,7 @@ struct iommufd_hwpt_paging {
bool enforce_cache_coherency : 1;
bool nest_parent : 1;
#ifdef CONFIG_IOMMU_LIVEUPDATE
- bool liveupdate_preserve : 1;
+ bool liveupdate_preserved : 1;
u64 liveupdate_token;
#endif
/* Head at iommufd_ioas::hwpt_list */
@@ -716,12 +719,34 @@ iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
}
#ifdef CONFIG_IOMMU_LIVEUPDATE
+int iommufd_liveupdate_register(void);
+void iommufd_liveupdate_unregister(void);
+
int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd);
+
+static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt)
+{
+ return iopt->liveupdate_immutable;
+}
#else
+static inline int iommufd_liveupdate_register(void)
+{
+ return 0;
+}
+
+static inline void iommufd_liveupdate_unregister(void)
+{
+}
+
static inline int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
{
return -ENOTTY;
}
+
+static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt)
+{
+ return false;
+}
#endif
#ifdef CONFIG_IOMMUFD_TEST
diff --git a/drivers/iommu/iommufd/liveupdate.c b/drivers/iommu/iommufd/liveupdate.c
index 2d3abfa9e9f8..3cb220557d0d 100644
--- a/drivers/iommu/iommufd/liveupdate.c
+++ b/drivers/iommu/iommufd/liveupdate.c
@@ -9,9 +9,22 @@
#include <linux/file.h>
#include <linux/iommufd.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/iommufd.h>
#include <linux/liveupdate.h>
+#include <linux/iommu-liveupdate.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
#include "iommufd_private.h"
+#include "io_pagetable.h"
+
+/*
+ * Set or clear the live update immutable flag of @ioas. The flag is written
+ * with the IOAS's domains_rwsem held for write.
+ */
+static void ioas_set_immutable(struct iommufd_ioas *ioas, bool immutable)
+{
+	struct io_pagetable *iopt = &ioas->iopt;
+
+	down_write(&iopt->domains_rwsem);
+	iopt->liveupdate_immutable = immutable;
+	up_write(&iopt->domains_rwsem);
+}
int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
{
@@ -50,3 +63,277 @@ int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
iommufd_put_object(ictx, &hwpt_target->common.obj);
return rc;
}
+
+/*
+ * Check that every area mapped in the IOAS of @hwpt is backed by a file that
+ * can be carried across the live update.
+ *
+ * Returns 0 when all areas pass, -EINVAL when an area is not file backed or
+ * lacks the required seals, or the error returned by
+ * liveupdate_get_token_outgoing() for the backing file.
+ */
+static int check_iopt_pages_preserved(struct liveupdate_session *s,
+				      struct iommufd_hwpt_paging *hwpt)
+{
+	const u32 required = F_SEAL_SEAL | F_SEAL_GROW | F_SEAL_SHRINK;
+	struct io_pagetable *iopt = &hwpt->ioas->iopt;
+	struct iopt_area *cur;
+	int rc = 0;
+
+	down_read(&iopt->iova_rwsem);
+	cur = iopt_area_iter_first(iopt, 0, ULONG_MAX);
+	while (cur) {
+		struct iopt_pages *pages = cur->pages;
+
+		/*
+		 * Only file based mappings are allowed, and the seals stored
+		 * in pages->seals must include F_SEAL_GROW, F_SEAL_SHRINK and
+		 * F_SEAL_SEAL. With those seals the memory file cannot have
+		 * been grown or shrunk since it was mapped, so the pages in
+		 * use remain pinned and preserved.
+		 */
+		if (pages->type != IOPT_ADDRESS_FILE ||
+		    (pages->seals & required) != required) {
+			rc = -EINVAL;
+			break;
+		}
+
+		/* Make sure that the file was preserved. */
+		rc = liveupdate_get_token_outgoing(s, pages->file, NULL);
+		if (rc)
+			break;
+
+		cur = iopt_area_iter_next(cur, 0, ULONG_MAX);
+	}
+	up_read(&iopt->iova_rwsem);
+
+	return rc;
+}
+
+/*
+ * Preserve one HWPT and fill in its serialized entry @hwpt_ser.
+ *
+ * If the IOAS of @hwpt is not immutable yet, it is made immutable and its
+ * mappings are checked with check_iopt_pages_preserved(). One IOAS can back
+ * several HWPTs, so on failure the IOAS is made mutable again only when this
+ * call was the one that made it immutable.
+ *
+ * Returns 0 on success or the error from the page check or from
+ * iommu_domain_preserve().
+ */
+static int iommufd_preserve_hwpt(struct iommufd_hwpt_paging *hwpt,
+				 struct iommufd_hwpt_ser *hwpt_ser,
+				 struct liveupdate_session *session)
+{
+	struct iommufd_ioas *ioas = hwpt->ioas;
+	struct iommu_domain_ser *domain_ser;
+	bool made_immutable;
+	int rc;
+
+	made_immutable = !ioas->iopt.liveupdate_immutable;
+	if (made_immutable) {
+		/* Freeze the DMA mappings before inspecting them. */
+		ioas_set_immutable(ioas, true);
+
+		rc = check_iopt_pages_preserved(session, hwpt);
+		if (rc)
+			goto err_mutable;
+	}
+
+	hwpt_ser->token = hwpt->liveupdate_token;
+	hwpt_ser->reclaimed = false;
+
+	rc = iommu_domain_preserve(hwpt->common.domain, &domain_ser);
+	if (rc < 0)
+		goto err_mutable;
+
+	hwpt_ser->domain_data = virt_to_phys(domain_ser);
+	return 0;
+
+err_mutable:
+	if (made_immutable)
+		ioas_set_immutable(ioas, false);
+
+	return rc;
+}
+
+/*
+ * Roll back the preservation of every marked HWPT of @ictx that has
+ * liveupdate_preserved set, then release the serialized state @ser.
+ *
+ * For each such HWPT the domain is unpreserved, its IOAS is made mutable
+ * again if needed, the flag is cleared and the object is put. The xarray lock
+ * is dropped around that work and re-taken before the iteration continues.
+ */
+static void _iommufd_unpreserve(struct iommufd_ctx *ictx,
+				struct iommufd_ser *ser)
+{
+	struct iommufd_object *cur;
+	unsigned long id;
+
+	xa_lock(&ictx->objects);
+	xa_for_each_marked(&ictx->objects, id, cur, IOMMUFD_OBJ_LIVEUPDATE_MARK) {
+		struct iommufd_hwpt_paging *paging;
+
+		if (cur->type != IOMMUFD_OBJ_HWPT_PAGING)
+			continue;
+
+		paging = to_hwpt_paging(container_of(cur, struct iommufd_hw_pagetable, obj));
+		if (paging->liveupdate_preserved) {
+			xa_unlock(&ictx->objects);
+
+			iommu_domain_unpreserve(paging->common.domain);
+			if (paging->ioas->iopt.liveupdate_immutable)
+				ioas_set_immutable(paging->ioas, false);
+
+			paging->liveupdate_preserved = false;
+			iommufd_put_object(ictx, cur);
+
+			xa_lock(&ictx->objects);
+		}
+	}
+	xa_unlock(&ictx->objects);
+
+	kho_unpreserve_free(ser);
+}
+
+/*
+ * LUO preserve callback for an iommufd file.
+ *
+ * Serializes every HWPT carrying IOMMUFD_OBJ_LIVEUPDATE_MARK into a struct
+ * iommufd_ser obtained from kho_alloc_preserve() and stores its physical
+ * address in args->serialized_data. Each successfully preserved HWPT stays
+ * locked (iommufd_lock_obj()) until _iommufd_unpreserve() puts it.
+ *
+ * Returns 0 on success or a negative errno. On failure, HWPTs preserved so
+ * far are rolled back through _iommufd_unpreserve().
+ */
+static int iommufd_liveupdate_preserve(struct liveupdate_file_op_args *args)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file);
+	struct iommufd_hwpt_paging *hwpt;
+	struct iommufd_ser *iommufd_ser;
+	struct iommufd_object *obj;
+	unsigned int nr_hwpts;
+	unsigned long index;
+	unsigned int i;
+	void *mem;
+	int rc;
+
+	if (IS_ERR(ictx))
+		return PTR_ERR(ictx);
+
+	/* Serializes against the mark-preserve ioctl, which takes the same mutex. */
+	mutex_lock(&ictx->liveupdate_mutex);
+
+	/* Count the number of HWPTs to preserve */
+	nr_hwpts = 0;
+	xa_lock(&ictx->objects);
+	xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) {
+		if (obj->type != IOMMUFD_OBJ_HWPT_PAGING)
+			continue;
+
+		hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj));
+		if (!hwpt->common.domain) {
+			rc = -EINVAL;
+			xa_unlock(&ictx->objects);
+			goto out_unlock;
+		}
+		nr_hwpts++;
+	}
+	xa_unlock(&ictx->objects);
+
+	mem = kho_alloc_preserve(struct_size(iommufd_ser,
+					     hwpt_array, nr_hwpts));
+	/*
+	 * NOTE(review): kho_alloc_preserve() is expected to report failure
+	 * with ERR_PTR() rather than NULL - confirm against its definition.
+	 * Both forms are handled so that an error pointer is never
+	 * dereferenced below.
+	 */
+	if (IS_ERR_OR_NULL(mem)) {
+		rc = mem ? PTR_ERR(mem) : -ENOMEM;
+		goto out_unlock;
+	}
+
+	iommufd_ser = mem;
+	iommufd_ser->nr_hwpts = nr_hwpts;
+
+	/* Preserve HWPTs */
+	i = 0;
+	xa_lock(&ictx->objects);
+	xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) {
+		if (obj->type != IOMMUFD_OBJ_HWPT_PAGING)
+			continue;
+
+		if (!iommufd_lock_obj(obj)) {
+			rc = -ENOENT;
+			xa_unlock(&ictx->objects);
+			goto out_unpreserve;
+		}
+
+		/*
+		 * HWPT is locked so it will not be destroyed. The xarray lock
+		 * can be released here before preserving the HWPT.
+		 */
+		xa_unlock(&ictx->objects);
+		hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj));
+		rc = iommufd_preserve_hwpt(hwpt, &iommufd_ser->hwpt_array[i++], args->session);
+		if (rc) {
+			/* Not preserved: drop the lock taken above before rolling back. */
+			iommufd_put_object(ictx, obj);
+			goto out_unpreserve;
+		}
+
+		/* Mark as preserved */
+		hwpt->liveupdate_preserved = true;
+		xa_lock(&ictx->objects);
+	}
+	xa_unlock(&ictx->objects);
+
+	args->serialized_data = virt_to_phys(iommufd_ser);
+	mutex_unlock(&ictx->liveupdate_mutex);
+	iommufd_ctx_put(ictx);
+	return 0;
+
+out_unpreserve:
+	/* Also releases iommufd_ser. */
+	_iommufd_unpreserve(ictx, iommufd_ser);
+out_unlock:
+	mutex_unlock(&ictx->liveupdate_mutex);
+	iommufd_ctx_put(ictx);
+	return rc;
+}
+
+/*
+ * LUO unpreserve callback: roll back what iommufd_liveupdate_preserve() did
+ * for this file, using the serialized state referenced by
+ * args->serialized_data.
+ */
+static void iommufd_liveupdate_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file);
+	struct iommufd_ser *ser;
+
+	if (WARN_ON(IS_ERR(ictx)))
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	mutex_lock(&ictx->liveupdate_mutex);
+	_iommufd_unpreserve(ictx, ser);
+	mutex_unlock(&ictx->liveupdate_mutex);
+
+	iommufd_ctx_put(ictx);
+}
+
+static int iommufd_liveupdate_retrieve(struct liveupdate_file_op_args *args)
+{
+ return -EOPNOTSUPP; /* placeholder: restore is not implemented yet */
+}
+
+static bool iommufd_liveupdate_can_finish(struct liveupdate_file_op_args *args)
+{
+ return false; /* placeholder: restore is not implemented yet */
+}
+
+static void iommufd_liveupdate_finish(struct liveupdate_file_op_args *args)
+{ /* placeholder: intentionally empty */
+}
+
+/*
+ * LUO can_preserve callback: @file can be preserved by this handler exactly
+ * when iommufd_ctx_from_file() accepts it.
+ */
+static bool iommufd_liveupdate_can_preserve(struct liveupdate_file_handler *handler,
+					     struct file *file)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(file);
+	bool is_iommufd = !IS_ERR(ictx);
+
+	/* The context was only needed for the check; drop it again. */
+	if (is_iommufd)
+		iommufd_ctx_put(ictx);
+
+	return is_iommufd;
+}
+
+static struct liveupdate_file_ops iommufd_ser_file_ops = {
+ .can_preserve = iommufd_liveupdate_can_preserve,
+ .preserve = iommufd_liveupdate_preserve,
+ .unpreserve = iommufd_liveupdate_unpreserve,
+ .retrieve = iommufd_liveupdate_retrieve, /* placeholder, returns -EOPNOTSUPP */
+ .can_finish = iommufd_liveupdate_can_finish, /* placeholder, returns false */
+ .finish = iommufd_liveupdate_finish, /* placeholder, empty */
+};
+
+static struct liveupdate_file_handler iommufd_ser_handler = {
+ .compatible = IOMMUFD_LUO_COMPATIBLE, /* ABI version string from <linux/kho/abi/iommufd.h> */
+ .ops = &iommufd_ser_file_ops,
+};
+
+/*
+ * Register the iommufd file handler with LUO and then with the IOMMU live
+ * update FLB. If the second step fails, the first is undone.
+ *
+ * Returns 0 on success or the error of the failing registration.
+ */
+int iommufd_liveupdate_register(void)
+{
+	int rc;
+
+	rc = liveupdate_register_file_handler(&iommufd_ser_handler);
+	if (rc)
+		return rc;
+
+	rc = iommu_liveupdate_register_flb(&iommufd_ser_handler);
+	if (!rc)
+		return 0;
+
+	/* FLB registration failed: drop the file handler again. */
+	liveupdate_unregister_file_handler(&iommufd_ser_handler);
+	return rc;
+}
+
+void iommufd_liveupdate_unregister(void)
+{
+ iommu_liveupdate_unregister_flb(&iommufd_ser_handler); /* reverse order of iommufd_liveupdate_register() */
+ liveupdate_unregister_file_handler(&iommufd_ser_handler);
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 0114c1520db4..0a7e7bb586d7 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -782,11 +782,18 @@ static int __init iommufd_init(void)
if (ret)
goto err_misc;
}
- ret = iommufd_test_init();
+
+ ret = iommufd_liveupdate_register();
if (ret)
goto err_vfio_misc;
+
+ ret = iommufd_test_init();
+ if (ret)
+ goto err_liveupdate;
return 0;
+err_liveupdate:
+ iommufd_liveupdate_unregister();
err_vfio_misc:
if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
misc_deregister(&vfio_misc_dev);
@@ -798,6 +805,7 @@ static int __init iommufd_init(void)
static void __exit iommufd_exit(void)
{
iommufd_test_exit();
+ iommufd_liveupdate_unregister();
if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
misc_deregister(&vfio_misc_dev);
misc_deregister(&iommu_misc_dev);
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 9bdb2945afe1..3b0c0acb8856 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -55,6 +55,7 @@
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
+#include <linux/memfd.h>
#include <linux/vfio_pci_core.h>
#include "double_span.h"
@@ -1421,6 +1422,7 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file,
{
struct iopt_pages *pages;
+ int seals;
pages = iopt_alloc_pages(start_byte, length, writable);
if (IS_ERR(pages))
@@ -1428,6 +1430,11 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file,
pages->file = get_file(file);
pages->start = start - start_byte;
pages->type = IOPT_ADDRESS_FILE;
+
+ seals = memfd_get_seals(file);
+ if (seals > 0)
+ pages->seals = seals;
+
return pages;
}
diff --git a/include/linux/kho/abi/iommufd.h b/include/linux/kho/abi/iommufd.h
new file mode 100644
index 000000000000..e0c13b965cb9
--- /dev/null
+++ b/include/linux/kho/abi/iommufd.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2026, Google LLC
+ * Author: Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_IOMMUFD_H
+#define _LINUX_KHO_ABI_IOMMUFD_H
+
+#include <linux/mutex_types.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: IOMMUFD Live Update ABI
+ *
+ * This header defines the ABI for preserving the state of an IOMMUFD file
+ * across a kexec reboot using LUO.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the IOMMUFD_LUO_COMPATIBLE string.
+ */
+
+#define IOMMUFD_LUO_COMPATIBLE "iommufd-v1"
+
+/**
+ * struct iommufd_hwpt_ser - IOMMUFD HWPT serialized state
+ * @domain_data: Physical address of the serialized state of associated domain
+ * @token: User provided token
+ * @reclaimed: Whether the HWPT is reclaimed
+ * @padding: Explicit padding that rounds the structure up to 24 bytes
+ */
+struct iommufd_hwpt_ser {
+ u64 domain_data;
+ u64 token;
+ u8 reclaimed;
+ u8 padding[7];
+} __packed;
+
+/**
+ * struct iommufd_ser - IOMMUFD serialized state
+ * @nr_hwpts: Number of entries in @hwpt_array
+ * @hwpt_array: Array of serialized state of preserved HWPTs
+ */
+struct iommufd_ser {
+ u64 nr_hwpts;
+ struct iommufd_hwpt_ser hwpt_array[];
+} __packed;
+
+#endif /* _LINUX_KHO_ABI_IOMMUFD_H */
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 14/16] iommufd: Add APIs to preserve/unpreserve a vfio cdev
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (12 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 13/16] iommufd: Persist iommu hardware pagetables for live update Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 15/16] vfio/pci: Preserve the iommufd state of the " Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 16/16] iommufd/selftest: Add test to verify iommufd preservation Samiullah Khawaja
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Add APIs that can be used to preserve and unpreserve a vfio cdev. Use
the APIs exported by the IOMMU core to preserve/unpreserve the device.
The LUO token of the preserved iommufd is fetched and returned to the
caller, as it can be used during restore to get the restored iommufd.
A handle to the preserved state of the device is also returned so that
it can be reassociated with the restored state after live update kexec.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/iommu/iommufd/device.c | 102 ++++++++++++++++++++++++
drivers/iommu/iommufd/iommufd_private.h | 3 +
include/linux/iommufd.h | 29 +++++++
3 files changed, 134 insertions(+)
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 170a7005f0bc..d19fece00da3 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
#include <linux/iommu.h>
+#include <linux/iommu-liveupdate.h>
#include <linux/iommufd.h>
#include <linux/pci-ats.h>
#include <linux/slab.h>
@@ -610,6 +611,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
int rc;
mutex_lock(&igroup->lock);
+ if (iommufd_device_is_preserved(idev)) {
+ rc = -EBUSY;
+ goto err_unlock;
+ }
attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL,
XA_ZERO_ENTRY, GFP_KERNEL);
@@ -1665,3 +1670,100 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
iommufd_put_object(ucmd->ictx, &idev->obj);
return rc;
}
+
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+/*
+ * Returns true if the group's pasid_attach xarray holds any entry at an
+ * index above IOMMU_NO_PASID.
+ */
+static bool _iommufd_device_has_pasid_attachments(struct iommufd_device *idev)
+{
+	unsigned long index = IOMMU_NO_PASID;
+
+	return xa_find_after(&idev->igroup->pasid_attach, &index, UINT_MAX,
+			     XA_PRESENT) != NULL;
+}
+
+/*
+ * Preserve the IOMMU state of @idev for live update.
+ *
+ * On success *@iommufd_tokenp receives the token obtained from
+ * liveupdate_get_token_outgoing() for the iommufd file, @preserved_state is
+ * filled in by iommu_preserve_device(), and the device's group is flagged as
+ * preserved.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the group has PASID attachments,
+ * -ENOENT if nothing is attached at IOMMU_NO_PASID, -EINVAL if the attached
+ * HWPT has no preserved paging HWPT, or the error from the token lookup or
+ * from iommu_preserve_device().
+ */
+int iommufd_device_preserve(struct liveupdate_session *s,
+			    struct iommufd_device *idev,
+			    u64 *iommufd_tokenp,
+			    u64 *preserved_state)
+{
+	struct iommufd_group *igroup = idev->igroup;
+	struct iommufd_hwpt_paging *paging;
+	struct iommufd_attach *attach;
+	int rc;
+
+	mutex_lock(&igroup->lock);
+
+	rc = -EOPNOTSUPP;
+	if (_iommufd_device_has_pasid_attachments(idev))
+		goto unlock;
+
+	rc = -ENOENT;
+	attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
+	if (!attach)
+		goto unlock;
+
+	rc = -EINVAL;
+	paging = find_hwpt_paging(attach->hwpt);
+	if (!paging || !paging->liveupdate_preserved)
+		goto unlock;
+
+	rc = liveupdate_get_token_outgoing(s, idev->ictx->file, iommufd_tokenp);
+	if (rc)
+		goto unlock;
+
+	rc = iommu_preserve_device(paging->common.domain, idev->dev,
+				   preserved_state);
+	if (!rc)
+		igroup->liveupdate_preserved = true;
+
+unlock:
+	mutex_unlock(&igroup->lock);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_preserve, "IOMMUFD");
+
+/*
+ * Undo iommufd_device_preserve() for @idev.
+ *
+ * The group's preserved flag is cleared on every path. The IOMMU core is
+ * asked to unpreserve the device only when an IOMMU_NO_PASID attachment with
+ * a preserved paging HWPT is found; otherwise a warning is emitted.
+ */
+void iommufd_device_unpreserve(struct liveupdate_session *s,
+			       struct iommufd_device *idev)
+{
+	struct iommufd_group *igroup = idev->igroup;
+	struct iommufd_hwpt_paging *paging;
+	struct iommufd_attach *attach;
+
+	mutex_lock(&igroup->lock);
+
+	attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
+	if (!attach) {
+		WARN(1, "IOMMU_NO_PASID attachment not found");
+		goto clear;
+	}
+
+	paging = find_hwpt_paging(attach->hwpt);
+	if (!paging || !paging->liveupdate_preserved) {
+		WARN(1, "Attached domain is not preserved");
+		goto clear;
+	}
+
+	iommu_unpreserve_device(paging->common.domain, idev->dev);
+
+clear:
+	igroup->liveupdate_preserved = false;
+	mutex_unlock(&igroup->lock);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unpreserve, "IOMMUFD");
+
+/*
+ * Returns true if @idev is non-NULL, has a group, and that group is flagged
+ * as preserved for live update.
+ */
+bool iommufd_device_is_preserved(struct iommufd_device *idev)
+{
+	if (!idev || !idev->igroup)
+		return false;
+
+	return idev->igroup->liveupdate_preserved;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_is_preserved, "IOMMUFD");
+#endif
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 3c88aa115d08..9b47eaf92d42 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -486,6 +486,9 @@ struct iommufd_group {
struct xarray pasid_attach;
struct iommufd_sw_msi_maps required_sw_msi;
phys_addr_t sw_msi_start;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ bool liveupdate_preserved;
+#endif
};
/*
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 6e7efe83bc5d..d1fd5d71e0fd 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -9,6 +9,7 @@
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
+#include <linux/liveupdate.h>
#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/xarray.h>
@@ -71,6 +72,34 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);
struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev);
u32 iommufd_device_to_id(struct iommufd_device *idev);
+/*
+ * Live update preservation of an iommufd device. The real implementations are
+ * only declared when CONFIG_IOMMU_LIVEUPDATE is set; otherwise the inline
+ * stubs below are used.
+ */
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+int iommufd_device_preserve(struct liveupdate_session *s,
+ struct iommufd_device *idev,
+ u64 *iommufd_tokenp,
+ u64 *preserved_state);
+void iommufd_device_unpreserve(struct liveupdate_session *s,
+ struct iommufd_device *idev);
+bool iommufd_device_is_preserved(struct iommufd_device *idev);
+#else
+/* Stub: preservation is refused with -EOPNOTSUPP; no argument is touched. */
+static inline int iommufd_device_preserve(struct liveupdate_session *s,
+ struct iommufd_device *idev,
+ u64 *iommufd_tokenp,
+ u64 *preserved_state)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Stub: nothing can have been preserved, so there is nothing to undo. */
+static inline void iommufd_device_unpreserve(struct liveupdate_session *s,
+ struct iommufd_device *idev)
+{
+}
+
+/* Stub: no device is ever reported as preserved. */
+static inline bool iommufd_device_is_preserved(struct iommufd_device *idev)
+{
+ return false;
+}
+#endif /* CONFIG_IOMMU_LIVEUPDATE */
+
struct iommufd_access_ops {
u8 needs_pin_pages : 1;
void (*unmap)(void *data, unsigned long iova, unsigned long length);
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 15/16] vfio/pci: Preserve the iommufd state of the vfio cdev
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (13 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 14/16] iommufd: Add APIs to preserve/unpreserve a vfio cdev Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
2026-04-27 17:56 ` [PATCH v2 16/16] iommufd/selftest: Add test to verify iommufd preservation Samiullah Khawaja
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
If the vfio cdev is attached to an iommufd, also preserve the state of
the attached iommufd. That is, preserve the IOMMU-specific state of the
device as well as the attached IOMMU HW unit.
Once the device and its iommufd attachment are preserved, the device
cannot be detached or attached to another IOAS until it is unpreserved.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
---
drivers/vfio/device_cdev.c | 10 ++++++++
drivers/vfio/pci/vfio_pci_liveupdate.c | 33 +++++++++++++++++++++++++-
2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 6844684a3d8e..23f083c0891a 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -271,6 +271,11 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
}
mutex_lock(&device->dev_set->lock);
+ if (iommufd_device_is_preserved(device->iommufd_device)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
if (attach.flags & VFIO_DEVICE_ATTACH_PASID)
ret = device->ops->pasid_attach_ioas(device,
attach.pasid,
@@ -329,6 +334,11 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
}
mutex_lock(&device->dev_set->lock);
+ if (iommufd_device_is_preserved(device->iommufd_device)) {
+ mutex_unlock(&device->dev_set->lock);
+ return -EBUSY;
+ }
+
if (detach.flags & VFIO_DEVICE_DETACH_PASID)
device->ops->pasid_detach_ioas(device, detach.pasid);
else
diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c b/drivers/vfio/pci/vfio_pci_liveupdate.c
index 976ef17e6103..b56d80379ffc 100644
--- a/drivers/vfio/pci/vfio_pci_liveupdate.c
+++ b/drivers/vfio/pci/vfio_pci_liveupdate.c
@@ -108,10 +108,13 @@
#include <linux/kho/abi/vfio_pci.h>
#include <linux/liveupdate.h>
#include <linux/errno.h>
+#include <linux/iommufd.h>
#include <linux/vfio.h>
#include "vfio_pci_priv.h"
+MODULE_IMPORT_NS("IOMMUFD");
+
static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler *handler,
struct file *file)
{
@@ -153,9 +156,26 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args)
vdev = container_of(device, struct vfio_pci_core_device, vdev);
pdev = vdev->pdev;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ /* If iommufd is attached, preserve the underlying domain */
+ mutex_lock(&device->dev_set->lock);
+ if (device->iommufd_attached) {
+ u64 token, preserved_state;
+
+ ret = iommufd_device_preserve(args->session,
+ device->iommufd_device,
+ &token, &preserved_state);
+ if (ret) {
+ mutex_unlock(&device->dev_set->lock);
+ return ret;
+ }
+ }
+ mutex_unlock(&device->dev_set->lock);
+#endif
+
ret = pci_liveupdate_preserve(pdev);
if (ret)
- return ret;
+ goto err_iommufd_unpreserve;
ser = kho_alloc_preserve(sizeof(*ser));
if (IS_ERR(ser)) {
@@ -170,6 +190,9 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args)
args->serialized_data = virt_to_phys(ser);
return 0;
+err_iommufd_unpreserve:
+ iommufd_device_unpreserve(args->session, device->iommufd_device);
+
err_unpreserve:
pci_liveupdate_unpreserve(pdev);
return ret;
@@ -178,6 +201,14 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args)
static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args *args)
{
struct vfio_device *device = vfio_device_from_file(args->file);
+ struct vfio_pci_core_device_ser *ser;
+
+ ser = phys_to_virt(args->serialized_data);
+ mutex_lock(&device->dev_set->lock);
+ if (device->iommufd_attached)
+ iommufd_device_unpreserve(args->session,
+ device->iommufd_device);
+ mutex_unlock(&device->dev_set->lock);
pci_liveupdate_unpreserve(to_pci_dev(device->dev));
kho_unpreserve_free(phys_to_virt(args->serialized_data));
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread* [PATCH v2 16/16] iommufd/selftest: Add test to verify iommufd preservation
2026-04-27 17:56 [PATCH v2 00/16] iommu: Add live update state preservation Samiullah Khawaja
` (14 preceding siblings ...)
2026-04-27 17:56 ` [PATCH v2 15/16] vfio/pci: Preserve the iommufd state of the " Samiullah Khawaja
@ 2026-04-27 17:56 ` Samiullah Khawaja
15 siblings, 0 replies; 22+ messages in thread
From: Samiullah Khawaja @ 2026-04-27 17:56 UTC (permalink / raw)
To: David Woodhouse, Lu Baolu, Joerg Roedel, Will Deacon,
Jason Gunthorpe
Cc: Samiullah Khawaja, Robin Murphy, Kevin Tian, Alex Williamson,
Shuah Khan, iommu, linux-kernel, kvm, Saeed Mahameed,
Adithya Jayachandran, Parav Pandit, Leon Romanovsky, William Tu,
Pratyush Yadav, Pasha Tatashin, David Matlack, Andrew Morton,
Chris Li, Pranjal Shrivastava, Vipin Sharma, YiFei Zhu
Test iommufd preservation by setting up an iommufd and a vfio cdev and
preserving them across a live update. The test takes the VFIO cdev path
of a device bound to the vfio-pci driver and binds it to the iommufd
being preserved. It also preserves the vfio cdev so that the iommufd
state associated with it is preserved as well.
The restore path is tested by restoring only the preserved vfio cdev. On
restore, the test verifies that binding to a new iommufd fails because
the device is still attached to the restored IOMMU domain. It also
verifies that finishing the LUO session fails because the preserved
iommufd has not been restored.
Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
---
tools/testing/selftests/iommu/Makefile | 12 +
.../iommu/iommufd_liveupdate_kexec_test.c | 239 ++++++++++++++++++
2 files changed, 251 insertions(+)
create mode 100644 tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c
diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index 84abeb2f0949..ab35e8b21580 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -7,4 +7,16 @@ TEST_GEN_PROGS :=
TEST_GEN_PROGS += iommufd
TEST_GEN_PROGS += iommufd_fail_nth
+TEST_GEN_PROGS_EXTENDED += iommufd_liveupdate_kexec_test
+
include ../lib.mk
+include ../liveupdate/lib/libliveupdate.mk
+
+CFLAGS += -I$(top_srcdir)/tools/include
+CFLAGS += -MD
+CFLAGS += $(EXTRA_CFLAGS)
+
+$(TEST_GEN_PROGS_EXTENDED): %: %.o $(LIBLIVEUPDATE_O)
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBLIVEUPDATE_O) $(LDLIBS) -o $@
+
+EXTRA_CLEAN += $(LIBLIVEUPDATE_O)
diff --git a/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c
new file mode 100644
index 000000000000..49f34146ab63
--- /dev/null
+++ b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (c) 2026, Google LLC.
+ * Samiullah Khawaja <skhawaja@google.com>
+ */
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <limits.h>
+
+#define __EXPORTED_HEADERS__
+#include <linux/iommufd.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/sizes.h>
+#include <libliveupdate.h>
+
+#include "../kselftest.h"
+
+/*
+ * Evaluate @condition once; when it is false, report the stringified
+ * expression through fail_exit(). Wrapped in do/while (0) so it can be used
+ * as a single statement.
+ */
+#define ksft_assert(condition) \
+ do { \
+ if (!(condition)) \
+ fail_exit("Failed: %s", #condition); \
+ } while (0)
+
+/* Path of the VFIO cdev under test; set from argv[1] in main(). */
+static const char *device_cdev_path;
+/* LUO session names; both are filled in by main() before luo_test() runs. */
+static char state_session[LIVEUPDATE_SESSION_NAME_LENGTH];
+static char iommufd_session[LIVEUPDATE_SESSION_NAME_LENGTH];
+
+/*
+ * Tokens passed to the LUO helpers to identify each preserved object.
+ * STATE_TOKEN has no initializer and static storage, so its value is 0.
+ */
+static const uint64_t STATE_TOKEN;
+static const uint64_t IOMMUFD_TOKEN = 0x123456;
+static const uint64_t CDEV_TOKEN = 0x654321;
+static const uint64_t HWPT_TOKEN = 0x789012;
+static const uint64_t MEMFD_TOKEN = 0x890123;
+
+/*
+ * Open the VFIO character device at @vfio_cdev_path read/write and return
+ * its file descriptor. When the open fails, ksft_exit_skip() is called with
+ * the offending path.
+ */
+static int open_cdev(const char *vfio_cdev_path)
+{
+	int fd = open(vfio_cdev_path, O_RDWR);
+
+	if (fd >= 0)
+		return fd;
+
+	ksft_exit_skip("Failed to open VFIO cdev: %s\n", vfio_cdev_path);
+	return fd;
+}
+
+/*
+ * Open /dev/iommu read/write and return its file descriptor. When the open
+ * fails, ksft_exit_skip() is called, since the kernel under test lacks usable
+ * IOMMUFD support.
+ */
+static int open_iommufd(void)
+{
+	int fd = open("/dev/iommu", O_RDWR);
+
+	if (fd >= 0)
+		return fd;
+
+	ksft_exit_skip("Failed to open /dev/iommu. IOMMUFD support not enabled.\n");
+	return fd;
+}
+
+/*
+ * Create a memfd named "buffer", size it to @size bytes and seal it against
+ * growing, shrinking and further sealing. Every failing step is reported
+ * through fail_exit(). Returns the memfd file descriptor.
+ */
+static int create_sealed_memfd(size_t size)
+{
+	const int seals = F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL;
+	int memfd;
+
+	memfd = memfd_create("buffer", MFD_ALLOW_SEALING);
+	if (memfd < 0)
+		fail_exit("memfd_create failed");
+
+	if (ftruncate(memfd, size))
+		fail_exit("ftruncate failed");
+
+	if (fcntl(memfd, F_ADD_SEALS, seals))
+		fail_exit("fcntl F_ADD_SEALS failed");
+
+	return memfd;
+}
+
+/*
+ * Issue ioctl(@fd, @cmd, @arg) and call fail_exit() with the stringified
+ * command name when it returns non-zero.
+ */
+#define test_ioctl(fd, cmd, arg) \
+ do { \
+ if (ioctl(fd, cmd, arg)) \
+ fail_exit("ioctl(%s) failed", #cmd); \
+ } while (0)
+
+/*
+ * Preserve @fd in @session under @token; a non-zero return from
+ * luo_session_preserve_fd() is reported through fail_exit() with the
+ * stringified token expression.
+ */
+#define test_luo_session_preserve_fd(session, fd, token) \
+ do { \
+ if (luo_session_preserve_fd(session, fd, token)) \
+ fail_exit("luo_session_preserve_fd(%s) failed", #token); \
+ } while (0)
+
+/*
+ * Retrieve the fd stored under @token in @session. This is a statement
+ * expression that evaluates to the retrieved fd; a negative return from
+ * luo_session_retrieve_fd() is reported through fail_exit().
+ */
+#define test_luo_session_retrieve_fd(session, token) \
+ ({ \
+ int _fd = luo_session_retrieve_fd(session, token); \
+ if (_fd < 0) \
+ fail_exit("luo_session_retrieve_fd(%s) failed", #token); \
+ _fd; \
+ })
+
+/*
+ * Build the iommufd state that the test preserves:
+ *   1. bind @cdev_fd to @iommufd,
+ *   2. allocate an IOAS, then a HWPT on top of it for the bound device,
+ *   3. attach the device to that HWPT,
+ *   4. map the first 1M of @memfd read/write at IOVA 4G in the IOAS,
+ *   5. mark the attached HWPT for preservation under HWPT_TOKEN.
+ * Every ioctl goes through test_ioctl(), so any failure ends in fail_exit().
+ */
+static void setup_iommufd(int iommufd, int memfd, int cdev_fd)
+{
+	struct vfio_device_bind_iommufd bind_args = {
+		.argsz = sizeof(bind_args),
+		.flags = 0,
+		.iommufd = iommufd,
+	};
+	struct iommu_ioas_alloc ioas = {
+		.size = sizeof(ioas),
+		.flags = 0,
+	};
+	struct iommu_hwpt_alloc hwpt = {
+		.size = sizeof(hwpt),
+		.flags = 0,
+	};
+	struct vfio_device_attach_iommufd_pt attach = {
+		.argsz = sizeof(attach),
+		.flags = 0,
+	};
+	struct iommu_ioas_map_file map = {
+		.size = sizeof(map),
+		.length = SZ_1M,
+		.flags = IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE,
+		.iova = SZ_4G,
+		.fd = memfd,
+		.start = 0,
+	};
+	struct iommu_hwpt_liveupdate_mark_preserve mark = {
+		.size = sizeof(mark),
+		.hwpt_token = HWPT_TOKEN,
+	};
+
+	/* Step 1: the bind reports the device id used for the HWPT below. */
+	test_ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind_args);
+
+	/* Step 2: IOAS first, then a HWPT for the bound device on top of it. */
+	test_ioctl(iommufd, IOMMU_IOAS_ALLOC, &ioas);
+	hwpt.dev_id = bind_args.out_devid;
+	hwpt.pt_id = ioas.out_ioas_id;
+	test_ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt);
+
+	/* Step 3: attach the device to the freshly allocated HWPT. */
+	attach.pt_id = hwpt.out_hwpt_id;
+	test_ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
+
+	/* Step 4: back the IOAS with the memfd. */
+	map.ioas_id = ioas.out_ioas_id;
+	test_ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &map);
+
+	/* Step 5: use the pt_id as it stands after the attach ioctl. */
+	mark.hwpt_id = attach.pt_id;
+	test_ioctl(iommufd, IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, &mark);
+}
+
+/*
+ * Pre-kexec stage: build the iommufd/memfd/cdev state, check that the
+ * preservation ordering is enforced, and leave all three fds preserved in the
+ * iommufd_session LUO session.
+ */
+static void before_kexec(int luo_fd)
+{
+ int iommufd, cdev_fd, memfd, session;
+
+ /* NOTE(review): presumably records stage 2 for after_kexec() to read back - confirm. */
+ create_state_file(luo_fd, state_session, STATE_TOKEN, /*next_stage=*/2);
+
+ session = luo_create_session(luo_fd, iommufd_session);
+ if (session < 0)
+ fail_exit("luo_create_session failed");
+
+ iommufd = open_iommufd();
+ memfd = create_sealed_memfd(SZ_1M);
+ cdev_fd = open_cdev(device_cdev_path);
+
+ setup_iommufd(iommufd, memfd, cdev_fd);
+
+ /* Negative check: the cdev must not be preservable before its iommufd. */
+ if (!luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN))
+ fail_exit("Preserving cdev without iommufd should fail");
+
+ /* Negative check: the iommufd must not be preservable before the memfd it maps. */
+ if (!luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN))
+ fail_exit("Preserving iommufd without memfd should fail");
+
+ /* Positive path: preserve in dependency order - memfd, iommufd, cdev. */
+ test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN);
+ test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN);
+ test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN);
+
+ /*
+ * Close the session and create it again under the same name, then
+ * preserve the same fds with the same tokens once more.
+ * NOTE(review): presumably verifies that closing a session unpreserves
+ * everything in it - confirm against the LUO session semantics.
+ */
+ close(session);
+ session = luo_create_session(luo_fd, iommufd_session);
+ if (session < 0)
+ fail_exit("luo_create_session failed");
+
+ test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN);
+ test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN);
+ test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN);
+
+ /* The session and the preserved fds are intentionally left open here. */
+ close(luo_fd);
+ daemonize_and_wait();
+}
+
+/*
+ * Post-kexec stage: retrieve the preserved session, restore only the vfio
+ * cdev, and check the operations that must fail while the preserved iommufd
+ * has not been restored.
+ */
+static void after_kexec(int luo_fd, int state_session_fd)
+{
+ int iommufd, cdev_fd, session, stage;
+ struct vfio_device_bind_iommufd bind = {
+ .argsz = sizeof(bind),
+ .flags = 0,
+ };
+
+ /* The stage read back must be 2, the next_stage value passed in before_kexec(). */
+ restore_and_read_stage(state_session_fd, STATE_TOKEN, &stage);
+ ksft_assert(stage == 2);
+
+ session = luo_retrieve_session(luo_fd, iommufd_session);
+ if (session < 0)
+ fail_exit("luo_retrieve_session failed");
+
+ /* Retrieving the cdev must succeed. */
+ cdev_fd = test_luo_session_retrieve_fd(session, CDEV_TOKEN);
+
+ /* Retrieving the iommufd is expected to fail at this point. */
+ iommufd = luo_session_retrieve_fd(session, IOMMUFD_TOKEN);
+ if (iommufd >= 0)
+ fail_exit("iommufd should not be retrievable yet");
+
+ /* A brand-new iommufd must not be bindable to the restored cdev. */
+ iommufd = open_iommufd();
+
+ bind.iommufd = iommufd;
+ if (ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) == 0 || errno != EPERM)
+ fail_exit("Binding cdev to new iommufd should fail with EPERM");
+
+ /* Finishing the session must fail because the iommufd was never restored. */
+ if (luo_session_finish(session) == 0)
+ fail_exit("luo_session_finish should fail if iommufd is not restored");
+
+ /* Only the fds opened or retrieved successfully here are closed; session stays open. */
+ close(iommufd);
+ close(cdev_fd);
+}
+
+/*
+ * Entry point. argv[1] is the path of the VFIO cdev to test; without it a
+ * usage line is printed and 1 is returned. The two LUO session names are
+ * filled in and control is handed to luo_test() with the before/after kexec
+ * stage callbacks; its return value is the exit status.
+ */
+int main(int argc, char *argv[])
+{
+	if (argc < 2) {
+		printf("Usage: %s <vfio_cdev_path>\n", argv[0]);
+		return 1;
+	}
+
+	device_cdev_path = argv[1];
+
+	/*
+	 * Bound the copies by the destination size instead of relying on the
+	 * names happening to fit in LIVEUPDATE_SESSION_NAME_LENGTH.
+	 */
+	snprintf(iommufd_session, sizeof(iommufd_session), "iommufd-test-cdev");
+	snprintf(state_session, sizeof(state_session), "state-iommufd-cdev");
+
+	return luo_test(argc, argv, state_session, before_kexec, after_kexec);
+}
--
2.54.0.545.g6539524ca2-goog
^ permalink raw reply related [flat|nested] 22+ messages in thread