Kernel KVM virtualization development
 help / color / mirror / Atom feed
From: Baolu Lu <baolu.lu@linux.intel.com>
To: Samiullah Khawaja <skhawaja@google.com>,
	David Woodhouse <dwmw2@infradead.org>,
	Joerg Roedel <joro@8bytes.org>, Will Deacon <will@kernel.org>,
	Jason Gunthorpe <jgg@ziepe.ca>
Cc: Robin Murphy <robin.murphy@arm.com>,
	Kevin Tian <kevin.tian@intel.com>,
	Alex Williamson <alex@shazbot.org>, Shuah Khan <shuah@kernel.org>,
	iommu@lists.linux.dev, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, Pratyush Yadav <pratyush@kernel.org>,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	David Matlack <dmatlack@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Pranjal Shrivastava <praan@google.com>,
	Vipin Sharma <vipinsh@google.com>
Subject: Re: [PATCH v3 07/18] iommu/vt-d: Implement device and iommu preserve/unpreserve ops
Date: Mon, 22 Jun 2026 09:50:36 +0800	[thread overview]
Message-ID: <f3e2fef9-4f8c-4032-8d7b-007f47b3fb61@linux.intel.com> (raw)
In-Reply-To: <20260614233728.2212104-8-skhawaja@google.com>

On 6/15/26 07:37, Samiullah Khawaja wrote:
> Add implementation of the device and iommu presevation in a separate
> file. Also set the device and iommu preserve/unpreserve ops in the
> struct iommu_ops.
> 
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> ---
>   MAINTAINERS                      |   8 ++
>   drivers/iommu/intel/Makefile     |   1 +
>   drivers/iommu/intel/iommu.c      |   8 +-
>   drivers/iommu/intel/iommu.h      |  28 +++++
>   drivers/iommu/intel/liveupdate.c | 175 +++++++++++++++++++++++++++++++
>   include/linux/kho/abi/iommu.h    |  21 ++++
>   6 files changed, 239 insertions(+), 2 deletions(-)
>   create mode 100644 drivers/iommu/intel/liveupdate.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cef90bb50781..4dfb56962426 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13002,6 +13002,14 @@ S:	Supported
>   T:	git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git
>   F:	drivers/iommu/intel/
>   
> +INTEL IOMMU LIVEUPDATE (VT-d)
> +M:	Samiullah Khawaja <skhawaja@google.com>
> +M:	Lu Baolu <baolu.lu@linux.intel.com>
> +L:	iommu@lists.linux.dev
> +S:	Maintained
> +T:	git git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux.git
> +F:	drivers/iommu/intel/liveupdate.c
> +
>   INTEL IPU3 CSI-2 CIO2 DRIVER
>   M:	Yong Zhi <yong.zhi@intel.com>
>   M:	Sakari Ailus <sakari.ailus@linux.intel.com>
> diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
> index ada651c4a01b..d38fc101bc35 100644
> --- a/drivers/iommu/intel/Makefile
> +++ b/drivers/iommu/intel/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
>   obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
>   obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
>   obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
> +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index c3d18cd77d2f..715b538e7efe 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -16,6 +16,7 @@
>   #include <linux/crash_dump.h>
>   #include <linux/dma-direct.h>
>   #include <linux/dmi.h>
> +#include <linux/iommu-liveupdate.h>
>   #include <linux/memory.h>
>   #include <linux/pci.h>
>   #include <linux/pci-ats.h>
> @@ -60,8 +61,6 @@ static int force_on = 0;
>   static int intel_iommu_tboot_noforce;
>   static int no_platform_optin;
>   
> -#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
> -
>   /*
>    * Take a root_entry and return the Lower Context Table Pointer (LCTP)
>    * if marked present.
> @@ -3926,6 +3925,11 @@ const struct iommu_ops intel_iommu_ops = {
>   	.is_attach_deferred	= intel_iommu_is_attach_deferred,
>   	.def_domain_type	= device_def_domain_type,
>   	.page_response		= intel_iommu_page_response,
> +#ifdef CONFIG_IOMMU_LIVEUPDATE
> +	.preserve_device	= intel_iommu_preserve_device,
> +	.preserve		= intel_iommu_preserve,
> +	.unpreserve		= intel_iommu_unpreserve,
> +#endif

Any reason why an unpreserve_device callback is missing here?

>   };
>   
>   static void quirk_iommu_igfx(struct pci_dev *dev)
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index ef145560aa98..5e0bc17e76bf 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -552,6 +552,8 @@ struct root_entry {
>   	u64     hi;
>   };
>   
> +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(struct root_entry))
> +
>   /*
>    * low 64 bits:
>    * 0: present
> @@ -1284,6 +1286,32 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new,
>   	return 0;
>   }
>   
> +#ifdef CONFIG_IOMMU_LIVEUPDATE
> +int intel_iommu_preserve_device(struct device *dev,
> +				struct iommu_device_ser *device_ser);
> +int intel_iommu_preserve(struct iommu_device *iommu,
> +			 struct iommu_hw_ser *iommu_ser);
> +void intel_iommu_unpreserve(struct iommu_device *iommu,
> +			    struct iommu_hw_ser *iommu_ser);
> +#else
> +static inline int intel_iommu_preserve_device(struct device *dev,
> +					      struct iommu_device_ser *device_ser)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int intel_iommu_preserve(struct iommu_device *iommu,
> +				       struct iommu_hw_ser *iommu_ser)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void intel_iommu_unpreserve(struct iommu_device *iommu,
> +					  struct iommu_hw_ser *iommu_ser)
> +{
> +}
> +#endif
> +
>   #ifdef CONFIG_INTEL_IOMMU_SVM
>   void intel_svm_check(struct intel_iommu *iommu);
>   struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
> diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c
> new file mode 100644
> index 000000000000..b5613e4ad43a
> --- /dev/null
> +++ b/drivers/iommu/intel/liveupdate.c
> @@ -0,0 +1,175 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * Copyright (C) 2026, Google LLC
> + * Author: Samiullah Khawaja <skhawaja@google.com>
> + */
> +
> +#define pr_fmt(fmt)    "DMAR: liveupdate: " fmt
> +
> +#include <linux/kexec_handover.h>
> +#include <linux/liveupdate.h>
> +#include <linux/iommu-liveupdate.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +
> +#include "iommu.h"
> +#include "../iommu-pages.h"
> +
> +/* 2 tables per bus in scalable mode with upper table at odd bit */
> +#define CONTEXT_TABLE_PRESERVED_BIT(bus, devfn) ((bus << 1) + (devfn >> 7))
> +static bool is_context_table_preserved(struct intel_iommu *iommu,
> +				       struct iommu_hw_ser *ser,
> +				       u8 bus, u8 devfn)
> +{
> +	return test_bit(CONTEXT_TABLE_PRESERVED_BIT(bus, devfn),
> +			(unsigned long *)&ser->intel.context_tables_bitmap[0]);
> +}
> +
> +static void unpreserve_context_table(struct intel_iommu *iommu,
> +				     struct iommu_hw_ser *ser,
> +				     u8 bus, u8 devfn)
> +{
> +	struct context_entry *context;
> +
> +	spin_lock(&iommu->lock);
> +	context = iommu_context_addr(iommu, bus, devfn, 0);
> +	spin_unlock(&iommu->lock);

The spinlock is dropped immediately after reading the address pointer.
If this is guaranteed to be safe, please add a comment to explain why a
UAF or race is avoided here. Otherwise, the locking scope needs to be
extened to protect both the pointer lookup and use.

> +	if (context && is_context_table_preserved(iommu, ser, bus, devfn)) {
> +		iommu_unpreserve_pages(context);
> +		clear_bit(CONTEXT_TABLE_PRESERVED_BIT(bus, devfn),
> +			  (unsigned long *)&ser->intel.context_tables_bitmap[0]);
> +	}
> +}
> +
> +static int preserve_context_table(struct intel_iommu *iommu,
> +				  struct iommu_hw_ser *ser,
> +				  u8 bus, u8 devfn)
> +{
> +	struct context_entry *context;
> +	int ret;
> +
> +	spin_lock(&iommu->lock);
> +	context = iommu_context_addr(iommu, bus, devfn, 0);
> +	spin_unlock(&iommu->lock);

Ditto.

> +	if (context && !is_context_table_preserved(iommu, ser, bus, devfn)) {
> +		ret = iommu_preserve_pages(context);
> +		if (ret)
> +			return ret;
> +
> +		set_bit(CONTEXT_TABLE_PRESERVED_BIT(bus, devfn),
> +			(unsigned long *)&ser->intel.context_tables_bitmap[0]);
> +	}
> +
> +	return 0;
> +}
> +
> +static void unpreserve_iommu_context_tables(struct intel_iommu *iommu,
> +					    struct iommu_hw_ser *ser)
> +{
> +	int i;
> +
> +	for (i = 0; i < ROOT_ENTRY_NR; i++) {
> +		unpreserve_context_table(iommu, ser, i, 0);
> +
> +		if (!sm_supported(iommu))
> +			continue;
> +
> +		unpreserve_context_table(iommu, ser, i, 0x80);
> +	}
> +}
> +
> +static int preserve_iommu_context_tables(struct device_domain_info *info)
> +{
> +	struct iommu_hw_ser *iommu_ser;
> +	struct intel_iommu *iommu;
> +	int ret;
> +	int i;
> +
> +	/* IOMMU for this device should already preserved.*/
> +	iommu = info->iommu;
> +	iommu_ser = iommu_preserved_state(&iommu->iommu);
> +	if (!iommu_ser)
> +		return -EINVAL;
> +
> +	/*
> +	 * We could do preservation of context tables only for the bus of this
> +	 * device, but these devices can have PCI aliases, so context tables for
> +	 * those will also require preservation. Also unpreserve would require
> +	 * some kind of refcounting where the context table will only be
> +	 * unpreserved when the last device associated with it is unpreserved.
> +	 *
> +	 * This introduces unnecessary complication with minimum benefits as the
> +	 * unpreserved context tables will probably be recreated by the next
> +	 * kernel as these are all active devices. We follow simpler approach by
> +	 * just preserving the currently active context tables.
> +	 */
> +	for (i = 0; i < ROOT_ENTRY_NR; i++) {
> +		ret = preserve_context_table(iommu, iommu_ser, i, 0);
> +		if (ret)
> +			return ret;
> +
> +		if (!sm_supported(iommu))
> +			continue;
> +
> +		ret = preserve_context_table(iommu, iommu_ser, i, 0x80);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +int intel_iommu_preserve_device(struct device *dev,
> +				struct iommu_device_ser *device_ser)
> +{
> +	struct device_domain_info *info = dev_iommu_priv_get(dev);
> +	int ret;
> +
> +	if (!dev_is_pci(dev)) {
> +		dev_err(dev, "Cannot preserve non-PCI device\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (!info || !info->domain)
> +		return -EINVAL;
> +
> +	ret = preserve_iommu_context_tables(info);
> +	if (ret)
> +		return ret;
> +
> +	device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain,
> +								     info->iommu);
> +	return 0;
> +}
> +
> +int intel_iommu_preserve(struct iommu_device *iommu_dev,
> +			 struct iommu_hw_ser *ser)
> +{
> +	struct intel_iommu *iommu;
> +	int ret;
> +
> +	iommu = container_of(iommu_dev, struct intel_iommu, iommu);
> +
> +	ret = iommu_preserve_pages(iommu->root_entry);
> +	if (ret)
> +		return ret;
> +
> +	ser->intel.phys_addr = iommu->reg_phys;
> +	ser->intel.root_table = __pa(iommu->root_entry);
> +	ser->type = IOMMU_INTEL;
> +	ser->token = ser->intel.phys_addr;
> +
> +	return 0;
> +}
> +
> +void intel_iommu_unpreserve(struct iommu_device *iommu_dev,
> +			    struct iommu_hw_ser *ser)
> +{
> +	struct intel_iommu *iommu;
> +
> +	iommu = container_of(iommu_dev, struct intel_iommu, iommu);
> +
> +	unpreserve_iommu_context_tables(iommu, ser);
> +	iommu_unpreserve_pages(iommu->root_entry);
> +}
> diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h
> index d06f251a2df4..ad760c497e13 100644
> --- a/include/linux/kho/abi/iommu.h
> +++ b/include/linux/kho/abi/iommu.h
> @@ -80,6 +80,7 @@
>    */
>   enum iommu_type_ser {
>   	IOMMU_INVALID,
> +	IOMMU_INTEL,
>   };
>   
>   #define IOMMU_SER_FLAG_DELETED	(1 << 0)
> @@ -140,16 +141,36 @@ struct iommu_device_ser {
>   	struct iommu_dev_map_ser domain_iommu_ser;
>   } __packed;
>   
> +/**
> + * struct iommu_intel_ser - Serialized state of an Intel IOMMU instance
> + * @restored: Whether IOMMU state is restored
> + * @phys_addr: Physical address of the IOMMU register base
> + * @root_table: Physical address of the root entry table
> + * @context_tables_bitmap: Bitmap representing the context tables that are
> + * preserved.
> + */
> +struct iommu_intel_ser {
> +	u8 restored;
> +	u8 padding[7];
> +	u64 phys_addr;
> +	u64 root_table;
> +	u64 context_tables_bitmap[8]; /* Tracks upto 512 context tables */

To avoid open-coded magic numbers, how about something like,

#define VTD_PRESERVED_BITMAP_LONGS  DIV_ROUND_UP(512, BITS_PER_LONG_LONG)
u64 context_tables_bitmap[VTD_PRESERVED_BITMAP_LONGS];

?

> +};
> +
>   /**
>    * struct iommu_hw_ser - Serialized state of an IOMMU instance
>    * @hdr: Common object header
>    * @token: Unique token for the IOMMU
>    * @type: IOMMU type serialized state belongs to
> + * @intel: Intel specific serialization data
>    */
>   struct iommu_hw_ser {
>   	struct iommu_hdr_ser hdr;
>   	u64 token;
>   	u64 type;
> +	union {
> +		struct iommu_intel_ser intel;
> +	};
>   } __packed;
>   
>   /**

Thanks,
baolu

  reply	other threads:[~2026-06-22  1:51 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-14 23:37 [PATCH v3 00/18] iommu: Add live update state preservation Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 01/18] memfd: export memfd_get_seals() Samiullah Khawaja
2026-06-15  5:14   ` Ankit Soni
2026-06-15 11:45   ` Pratyush Yadav
2026-06-14 23:37 ` [PATCH v3 02/18] iommu: Implement IOMMU Live update FLB callbacks Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 03/18] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 04/18] iommupt: Implement preserve/unpreserve/restore callbacks Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 05/18] iommu: Implement IOMMU domain preservation Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 06/18] iommu: Implement device and IOMMU HW preservation Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 07/18] iommu/vt-d: Implement device and iommu preserve/unpreserve ops Samiullah Khawaja
2026-06-22  1:50   ` Baolu Lu [this message]
2026-06-22 19:19     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 08/18] iommu/vt-d: clear unpreserved context entries during shutdown Samiullah Khawaja
2026-06-22  2:47   ` Baolu Lu
2026-06-22 22:56     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 09/18] iommu: Add APIs to get iommu and device preserved state Samiullah Khawaja
2026-06-22  3:10   ` Baolu Lu
2026-06-22 23:27     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 10/18] iommu/vt-d: Restore IOMMU state and reclaimed domain ids Samiullah Khawaja
2026-06-22  5:14   ` Baolu Lu
2026-06-22 23:30     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 11/18] iommu: Restore and reattach preserved domains to devices Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 12/18] iommu/vt-d: Handle reattach of the restored domain Samiullah Khawaja
2026-06-22  5:44   ` Baolu Lu
2026-06-23  0:26     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 13/18] iommu/vt-d: preserve PASID table of preserved device Samiullah Khawaja
2026-06-22  6:01   ` Baolu Lu
2026-06-23  0:36     ` Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 14/18] iommufd: Implement ioctl to mark HWPT for preservation Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 15/18] iommufd: Persist iommu hardware pagetables for live update Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 16/18] iommufd: Add APIs to preserve/unpreserve a vfio cdev Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 17/18] vfio/pci: Preserve the iommufd state of the " Samiullah Khawaja
2026-06-14 23:37 ` [PATCH v3 18/18] iommufd/selftest: Add test to verify iommufd preservation Samiullah Khawaja

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f3e2fef9-4f8c-4032-8d7b-007f47b3fb61@linux.intel.com \
    --to=baolu.lu@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex@shazbot.org \
    --cc=dmatlack@google.com \
    --cc=dwmw2@infradead.org \
    --cc=iommu@lists.linux.dev \
    --cc=jgg@ziepe.ca \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pasha.tatashin@soleen.com \
    --cc=praan@google.com \
    --cc=pratyush@kernel.org \
    --cc=robin.murphy@arm.com \
    --cc=shuah@kernel.org \
    --cc=skhawaja@google.com \
    --cc=vipinsh@google.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox