All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pranjal Shrivastava <praan@google.com>
To: Samiullah Khawaja <skhawaja@google.com>
Cc: David Woodhouse <dwmw2@infradead.org>,
	Lu Baolu <baolu.lu@linux.intel.com>,
	Joerg Roedel <joro@8bytes.org>, Will Deacon <will@kernel.org>,
	Jason Gunthorpe <jgg@ziepe.ca>,
	Robin Murphy <robin.murphy@arm.com>,
	Kevin Tian <kevin.tian@intel.com>,
	Alex Williamson <alex@shazbot.org>, Shuah Khan <shuah@kernel.org>,
	iommu@lists.linux.dev, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, Saeed Mahameed <saeedm@nvidia.com>,
	Adithya Jayachandran <ajayachandra@nvidia.com>,
	Parav Pandit <parav@nvidia.com>,
	Leon Romanovsky <leonro@nvidia.com>, William Tu <witu@nvidia.com>,
	Pratyush Yadav <pratyush@kernel.org>,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	David Matlack <dmatlack@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Chris Li <chrisl@kernel.org>, Vipin Sharma <vipinsh@google.com>,
	YiFei Zhu <zhuyifei@google.com>
Subject: Re: [PATCH 06/14] iommu/vt-d: Implement device and iommu preserve/unpreserve ops
Date: Fri, 20 Mar 2026 23:01:59 +0000	[thread overview]
Message-ID: <ab3R54-kyHGDEW9L@google.com> (raw)
In-Reply-To: <20260203220948.2176157-7-skhawaja@google.com>

On Tue, Feb 03, 2026 at 10:09:40PM +0000, Samiullah Khawaja wrote:
> Add implementation of the device and iommu presevation in a separate
> file. Also set the device and iommu preserve/unpreserve ops in the
> struct iommu_ops.
> 
> During normal shutdown the iommu translation is disabled. Since the root
> table is preserved during live update, it needs to be cleaned up and the
> context entries of the unpreserved devices need to be cleared.
> 
> Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
> ---
>  drivers/iommu/intel/Makefile     |   1 +
>  drivers/iommu/intel/iommu.c      |  47 ++++++++++-
>  drivers/iommu/intel/iommu.h      |  27 +++++++
>  drivers/iommu/intel/liveupdate.c | 134 +++++++++++++++++++++++++++++++
>  4 files changed, 205 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/iommu/intel/liveupdate.c
> 
> diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
> index ada651c4a01b..d38fc101bc35 100644
> --- a/drivers/iommu/intel/Makefile
> +++ b/drivers/iommu/intel/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
>  obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
>  obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
>  obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
> +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 134302fbcd92..c95de93fb72f 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -16,6 +16,7 @@
>  #include <linux/crash_dump.h>
>  #include <linux/dma-direct.h>
>  #include <linux/dmi.h>
> +#include <linux/iommu-lu.h>
>  #include <linux/memory.h>
>  #include <linux/pci.h>
>  #include <linux/pci-ats.h>
> @@ -52,6 +53,8 @@ static int rwbf_quirk;
>  
>  #define rwbf_required(iommu)	(rwbf_quirk || cap_rwbf((iommu)->cap))
>  
> +static bool __maybe_clean_unpreserved_context_entries(struct intel_iommu *iommu);
> +
>  /*
>   * set to 1 to panic kernel if can't successfully enable VT-d
>   * (used when kernel is launched w/ TXT)
> @@ -60,8 +63,6 @@ static int force_on = 0;
>  static int intel_iommu_tboot_noforce;
>  static int no_platform_optin;
>  
> -#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
> -
>  /*
>   * Take a root_entry and return the Lower Context Table Pointer (LCTP)
>   * if marked present.
> @@ -2378,8 +2379,10 @@ void intel_iommu_shutdown(void)
>  		/* Disable PMRs explicitly here. */
>  		iommu_disable_protect_mem_regions(iommu);
>  
> -		/* Make sure the IOMMUs are switched off */
> -		iommu_disable_translation(iommu);
> +		if (!__maybe_clean_unpreserved_context_entries(iommu)) {
> +			/* Make sure the IOMMUs are switched off */
> +			iommu_disable_translation(iommu);
> +		}
>  	}
>  }
>  
> @@ -2902,6 +2905,38 @@ static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
>  	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
>  };
>  
> +#ifdef CONFIG_IOMMU_LIVEUPDATE
> +static bool __maybe_clean_unpreserved_context_entries(struct intel_iommu *iommu)
> +{
> +	struct device_domain_info *info;
> +	struct pci_dev *pdev = NULL;
> +
> +	if (!iommu->iommu.outgoing_preserved_state)
> +		return false;
> +
> +	for_each_pci_dev(pdev) {

Shouldn't we *also* clean context entries for non-pci devices?

AFAICT, in the current code, we simply turn off translation during
shutdowm. However, with this change, because this cleanup loop only runs
for_each_pci_dev(), it completely ignores any non-PCI devices (like 
platform devices) that might be attached to this IOMMU?

Since the IOMMU is left powered on during the handover, skipping non-PCI
devices in this cleanup loop means their context entries are left
perfectly intact and active in the hardware root table. There could be a
chance that the new kernel doesn't reclaim/overwrite these entries in
which case it could result in zombie DMAs surviving the KHO.

> +		info = dev_iommu_priv_get(&pdev->dev);
> +		if (!info)
> +			continue;
> +
> +		if (info->iommu != iommu)
> +			continue;
> +
> +		if (dev_iommu_preserved_state(&pdev->dev))
> +			continue;
> +
> +		domain_context_clear(info);
> +	}
> +
> +	return true;
> +}
> +#else
> +static bool __maybe_clean_unpreserved_context_entries(struct intel_iommu *iommu)
> +{
> +	return false;
> +}
> +#endif
> +
>  static struct iommu_domain *
>  intel_iommu_domain_alloc_second_stage(struct device *dev,
>  				      struct intel_iommu *iommu, u32 flags)
> @@ -3925,6 +3960,10 @@ const struct iommu_ops intel_iommu_ops = {
>  	.is_attach_deferred	= intel_iommu_is_attach_deferred,
>  	.def_domain_type	= device_def_domain_type,
>  	.page_response		= intel_iommu_page_response,
> +	.preserve_device	= intel_iommu_preserve_device,
> +	.unpreserve_device	= intel_iommu_unpreserve_device,
> +	.preserve		= intel_iommu_preserve,
> +	.unpreserve		= intel_iommu_unpreserve,
>  };
>  
>  static void quirk_iommu_igfx(struct pci_dev *dev)
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 25c5e22096d4..70032e86437d 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -557,6 +557,8 @@ struct root_entry {
>  	u64     hi;
>  };
>  
> +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(struct root_entry))
> +
>  /*
>   * low 64 bits:
>   * 0: present
> @@ -1276,6 +1278,31 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_IOMMU_LIVEUPDATE
> +int intel_iommu_preserve_device(struct device *dev, struct device_ser *device_ser);
> +void intel_iommu_unpreserve_device(struct device *dev, struct device_ser *device_ser);
> +int intel_iommu_preserve(struct iommu_device *iommu, struct iommu_ser *iommu_ser);
> +void intel_iommu_unpreserve(struct iommu_device *iommu, struct iommu_ser *iommu_ser);
> +#else
> +static inline int intel_iommu_preserve_device(struct device *dev, struct device_ser *device_ser)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void intel_iommu_unpreserve_device(struct device *dev, struct device_ser *device_ser)
> +{
> +}
> +
> +static inline int intel_iommu_preserve(struct iommu_device *iommu, struct iommu_ser *iommu_ser)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void intel_iommu_unpreserve(struct iommu_device *iommu, struct iommu_ser *iommu_ser)
> +{
> +}
> +#endif
> +
>  #ifdef CONFIG_INTEL_IOMMU_SVM
>  void intel_svm_check(struct intel_iommu *iommu);
>  struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
> diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c
> new file mode 100644
> index 000000000000..82ba1daf1711
> --- /dev/null
> +++ b/drivers/iommu/intel/liveupdate.c
> @@ -0,0 +1,134 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * Copyright (C) 2025, Google LLC
> + * Author: Samiullah Khawaja <skhawaja@google.com>
> + */
> +
> +#define pr_fmt(fmt)    "iommu: liveupdate: " fmt
> +
> +#include <linux/kexec_handover.h>
> +#include <linux/liveupdate.h>
> +#include <linux/iommu-lu.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +
> +#include "iommu.h"
> +#include "../iommu-pages.h"
> +
> +static void unpreserve_iommu_context(struct intel_iommu *iommu, int end)
> +{
> +	struct context_entry *context;
> +	int i;
> +
> +	if (end < 0)
> +		end = ROOT_ENTRY_NR;
> +
> +	for (i = 0; i < end; i++) {
> +		context = iommu_context_addr(iommu, i, 0, 0);
> +		if (context)
> +			iommu_unpreserve_page(context);
> +
> +		if (!sm_supported(iommu))
> +			continue;
> +
> +		context = iommu_context_addr(iommu, i, 0x80, 0);
> +		if (context)
> +			iommu_unpreserve_page(context);
> +	}
> +}
> +
> +static int preserve_iommu_context(struct intel_iommu *iommu)
> +{
> +	struct context_entry *context;
> +	int ret;
> +	int i;
> +
> +	for (i = 0; i < ROOT_ENTRY_NR; i++) {
> +		context = iommu_context_addr(iommu, i, 0, 0);
> +		if (context) {
> +			ret = iommu_preserve_page(context);
> +			if (ret)
> +				goto error;
> +		}
> +
> +		if (!sm_supported(iommu))
> +			continue;
> +
> +		context = iommu_context_addr(iommu, i, 0x80, 0);
> +		if (context) {
> +			ret = iommu_preserve_page(context);
> +			if (ret)
> +				goto error_sm;
> +		}
> +	}
> +
> +	return 0;
> +
> +error_sm:
> +	context = iommu_context_addr(iommu, i, 0, 0);
> +	iommu_unpreserve_page(context);
> +error:
> +	unpreserve_iommu_context(iommu, i);
> +	return ret;
> +}
> +
> +int intel_iommu_preserve_device(struct device *dev, struct device_ser *device_ser)
> +{
> +	struct device_domain_info *info = dev_iommu_priv_get(dev);
> +
> +	if (!dev_is_pci(dev))
> +		return -EOPNOTSUPP;

Maybe also WARN_ON_ONCE ?

> +
> +	if (!info)
> +		return -EINVAL;
> +
> +	device_ser->domain_iommu_ser.did = domain_id_iommu(info->domain, info->iommu);
> +	return 0;
> +}
> +
> +void intel_iommu_unpreserve_device(struct device *dev, struct device_ser *device_ser)
> +{
> +}

Nit: We can just not populate the unpreserve_device op here, we don't
need an empty function

> +
> +int intel_iommu_preserve(struct iommu_device *iommu_dev, struct iommu_ser *ser)
> +{
> +	struct intel_iommu *iommu;
> +	int ret;
> +
> +	iommu = container_of(iommu_dev, struct intel_iommu, iommu);
> +
> +	spin_lock(&iommu->lock);

Minor note: We call this with the session->mutex acquired which is a
sleepable/preemptable context whereas spin_lock disables preemption and
puts us in an atomic context. 

While iommu_context_addr() happens to be safe here because it uses
GFP_ATOMIC & we pass alloc=0, we are heavily relying on the assumption
that iommu_context_addr() or other helpers will never sleep or attempt
a GFP_KERNEL allocation under the hood.

Given how easy it would be for a future refactor to accidentally
introduce a sleeping lock inside any of the other helpers, I'd recommend
adding an explicit comment here warning that this runs in an atomic
context (same for unpreserve below).

>r +	ret = preserve_iommu_context(iommu);
> +	if (ret)
> +		goto err;
> +
> +	ret = iommu_preserve_page(iommu->root_entry);
> +	if (ret) {
> +		unpreserve_iommu_context(iommu, -1);

Nit: The same overloading discussion as Vipin pointed out in the other
patch.

> +		goto err;
> +	}
> +
> +	ser->intel.phys_addr = iommu->reg_phys;
> +	ser->intel.root_table = __pa(iommu->root_entry);
> +	ser->type = IOMMU_INTEL;
> +	ser->token = ser->intel.phys_addr;
> +	spin_unlock(&iommu->lock);
> +
> +	return 0;
> +err:
> +	spin_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +void intel_iommu_unpreserve(struct iommu_device *iommu_dev, struct iommu_ser *iommu_ser)
> +{
> +	struct intel_iommu *iommu;
> +
> +	iommu = container_of(iommu_dev, struct intel_iommu, iommu);
> +
> +	spin_lock(&iommu->lock);
> +	unpreserve_iommu_context(iommu, -1);
> +	iommu_unpreserve_page(iommu->root_entry);
> +	spin_unlock(&iommu->lock);
> +}

Thanks,
Praan

  parent reply	other threads:[~2026-03-20 23:02 UTC|newest]

Thread overview: 109+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-03 22:09 [PATCH 00/14] iommu: Add live update state preservation Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 01/14] iommu: Implement IOMMU LU FLB callbacks Samiullah Khawaja
2026-03-11 21:07   ` Pranjal Shrivastava
2026-03-12 16:43     ` Samiullah Khawaja
2026-03-12 23:43       ` Pranjal Shrivastava
2026-03-13 16:47         ` Samiullah Khawaja
2026-03-13 15:36       ` Pranjal Shrivastava
2026-03-13 16:58         ` Samiullah Khawaja
2026-04-10 13:51     ` Jason Gunthorpe
2026-04-13  6:41       ` Tian, Kevin
2026-03-16 22:54   ` Vipin Sharma
2026-03-17  1:06     ` Samiullah Khawaja
2026-03-23 23:27       ` Vipin Sharma
2026-02-03 22:09 ` [PATCH 02/14] iommu: Implement IOMMU core liveupdate skeleton Samiullah Khawaja
2026-03-12 23:10   ` Pranjal Shrivastava
2026-03-13 18:42     ` Samiullah Khawaja
2026-03-17 20:09       ` Pranjal Shrivastava
2026-03-17 20:13         ` Samiullah Khawaja
2026-03-17 20:23           ` Pranjal Shrivastava
2026-03-17 21:03             ` Vipin Sharma
2026-03-18 18:51               ` Pranjal Shrivastava
2026-03-18 17:49             ` Samiullah Khawaja
2026-03-17 19:58   ` Vipin Sharma
2026-03-17 20:33     ` Samiullah Khawaja
2026-03-24 19:06       ` Vipin Sharma
2026-03-24 19:45         ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 03/14] liveupdate: luo_file: Add internal APIs for file preservation Samiullah Khawaja
2026-03-18 10:00   ` Pranjal Shrivastava
2026-03-18 16:54     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 04/14] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages Samiullah Khawaja
2026-03-03 16:42   ` Ankit Soni
2026-03-03 18:41     ` Samiullah Khawaja
2026-03-20 17:27       ` Pranjal Shrivastava
2026-03-20 18:12         ` Samiullah Khawaja
2026-04-10 14:13           ` Jason Gunthorpe
2026-04-10 22:13             ` Samiullah Khawaja
2026-03-17 20:59   ` Vipin Sharma
2026-03-20  9:28     ` Pranjal Shrivastava
2026-03-20 18:27       ` Samiullah Khawaja
2026-03-20 11:01     ` Pranjal Shrivastava
2026-03-20 18:56       ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 05/14] iommupt: Implement preserve/unpreserve/restore callbacks Samiullah Khawaja
2026-03-20 21:57   ` Pranjal Shrivastava
2026-03-23 16:41     ` Samiullah Khawaja
2026-04-10 14:16     ` Jason Gunthorpe
2026-04-10 23:02       ` Samiullah Khawaja
2026-04-10 23:16         ` Jason Gunthorpe
2026-04-13 19:31           ` Samiullah Khawaja
2026-04-13 22:33             ` Jason Gunthorpe
2026-04-13 23:28               ` Samiullah Khawaja
2026-04-13 23:40                 ` Jason Gunthorpe
2026-02-03 22:09 ` [PATCH 06/14] iommu/vt-d: Implement device and iommu preserve/unpreserve ops Samiullah Khawaja
2026-03-19 16:04   ` Vipin Sharma
2026-03-19 16:27     ` Samiullah Khawaja
2026-03-20 23:01   ` Pranjal Shrivastava [this message]
2026-03-21 13:27     ` Pranjal Shrivastava
2026-03-23 18:32     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 07/14] iommu/vt-d: Restore IOMMU state and reclaimed domain ids Samiullah Khawaja
2026-03-19 20:54   ` Vipin Sharma
2026-03-20  1:05     ` Samiullah Khawaja
2026-03-22 19:51   ` Pranjal Shrivastava
2026-03-23 19:33     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 08/14] iommu: Restore and reattach preserved domains to devices Samiullah Khawaja
2026-03-10  5:16   ` Ankit Soni
2026-03-10 21:47     ` Samiullah Khawaja
2026-03-22 21:59   ` Pranjal Shrivastava
2026-03-23 18:02     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 09/14] iommu/vt-d: preserve PASID table of preserved device Samiullah Khawaja
2026-03-23 18:19   ` Pranjal Shrivastava
2026-03-23 18:51     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 10/14] iommufd-lu: Implement ioctl to let userspace mark an HWPT to be preserved Samiullah Khawaja
2026-03-19 23:35   ` Vipin Sharma
2026-03-20  0:40     ` Samiullah Khawaja
2026-03-20 23:34       ` Vipin Sharma
2026-03-23 16:24         ` Samiullah Khawaja
2026-03-25 14:37   ` Pranjal Shrivastava
2026-03-25 17:31     ` Samiullah Khawaja
2026-03-25 18:55       ` Pranjal Shrivastava
2026-03-25 20:19         ` Samiullah Khawaja
2026-03-25 20:36           ` Pranjal Shrivastava
2026-03-25 20:46             ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 11/14] iommufd-lu: Persist iommu hardware pagetables for live update Samiullah Khawaja
2026-02-25 23:47   ` Samiullah Khawaja
2026-03-03  5:56   ` Ankit Soni
2026-03-03 18:51     ` Samiullah Khawaja
2026-03-23 20:28   ` Vipin Sharma
2026-03-23 21:34     ` Samiullah Khawaja
2026-03-25 20:08   ` Pranjal Shrivastava
2026-03-25 20:32     ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 12/14] iommufd: Add APIs to preserve/unpreserve a vfio cdev Samiullah Khawaja
2026-03-23 20:59   ` Vipin Sharma
2026-03-23 21:38     ` Samiullah Khawaja
2026-03-25 20:24   ` Pranjal Shrivastava
2026-03-25 20:41     ` Samiullah Khawaja
2026-03-25 21:23       ` Pranjal Shrivastava
2026-03-26  0:16         ` Samiullah Khawaja
2026-02-03 22:09 ` [PATCH 13/14] vfio/pci: Preserve the iommufd state of the " Samiullah Khawaja
2026-02-17  4:18   ` Ankit Soni
2026-03-03 18:35     ` Samiullah Khawaja
2026-03-23 21:17   ` Vipin Sharma
2026-03-23 22:07     ` Samiullah Khawaja
2026-03-24 20:30       ` Vipin Sharma
2026-03-25 20:55   ` Pranjal Shrivastava
2026-02-03 22:09 ` [PATCH 14/14] iommufd/selftest: Add test to verify iommufd preservation Samiullah Khawaja
2026-03-23 22:18   ` Vipin Sharma
2026-03-27 18:32     ` Samiullah Khawaja
2026-03-25 21:05   ` Pranjal Shrivastava
2026-03-27 18:25     ` Samiullah Khawaja
2026-03-27 18:40       ` Samiullah Khawaja

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ab3R54-kyHGDEW9L@google.com \
    --to=praan@google.com \
    --cc=ajayachandra@nvidia.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex@shazbot.org \
    --cc=baolu.lu@linux.intel.com \
    --cc=chrisl@kernel.org \
    --cc=dmatlack@google.com \
    --cc=dwmw2@infradead.org \
    --cc=iommu@lists.linux.dev \
    --cc=jgg@ziepe.ca \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=leonro@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=parav@nvidia.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=pratyush@kernel.org \
    --cc=robin.murphy@arm.com \
    --cc=saeedm@nvidia.com \
    --cc=shuah@kernel.org \
    --cc=skhawaja@google.com \
    --cc=vipinsh@google.com \
    --cc=will@kernel.org \
    --cc=witu@nvidia.com \
    --cc=zhuyifei@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.