ARM Sunxi Platform Development
 help / color / mirror / Atom feed
From: Nicolin Chen <nicolinc@nvidia.com>
To: <joro@8bytes.org>, <jgg@nvidia.com>, <bhelgaas@google.com>
Cc: <suravee.suthikulpanit@amd.com>, <will@kernel.org>,
	<robin.murphy@arm.com>, <sven@kernel.org>, <j@jannau.net>,
	<alyssa@rosenzweig.io>, <neal@gompa.dev>,
	<robin.clark@oss.qualcomm.com>, <m.szyprowski@samsung.com>,
	<krzk@kernel.org>, <alim.akhtar@samsung.com>,
	<dwmw2@infradead.org>, <baolu.lu@linux.intel.com>,
	<kevin.tian@intel.com>, <yong.wu@mediatek.com>,
	<matthias.bgg@gmail.com>,
	<angelogioacchino.delregno@collabora.com>,
	<tjeznach@rivosinc.com>, <paul.walmsley@sifive.com>,
	<palmer@dabbelt.com>, <aou@eecs.berkeley.edu>, <alex@ghiti.fr>,
	<heiko@sntech.de>, <schnelle@linux.ibm.com>,
	<mjrosato@linux.ibm.com>, <gerald.schaefer@linux.ibm.com>,
	<orsonzhai@gmail.com>, <baolin.wang@linux.alibaba.com>,
	<zhang.lyra@gmail.com>, <wens@csie.org>,
	<jernej.skrabec@gmail.com>, <samuel@sholland.org>,
	<jean-philippe@linaro.org>, <rafael@kernel.org>,
	<lenb@kernel.org>, <yi.l.liu@intel.com>, <cwabbott0@gmail.com>,
	<quic_pbrahma@quicinc.com>, <iommu@lists.linux.dev>,
	<linux-kernel@vger.kernel.org>, <asahi@lists.linux.dev>,
	<linux-arm-kernel@lists.infradead.org>,
	<linux-arm-msm@vger.kernel.org>,
	<linux-samsung-soc@vger.kernel.org>,
	<linux-mediatek@lists.infradead.org>,
	<linux-riscv@lists.infradead.org>,
	<linux-rockchip@lists.infradead.org>,
	<linux-s390@vger.kernel.org>, <linux-sunxi@lists.linux.dev>,
	<linux-tegra@vger.kernel.org>, <virtualization@lists.linux.dev>,
	<linux-acpi@vger.kernel.org>, <linux-pci@vger.kernel.org>,
	<patches@lists.linux.dev>, <vsethi@nvidia.com>,
	<helgaas@kernel.org>, <etzhao1900@gmail.com>
Subject: [PATCH v4 7/7] pci: Suspend iommu function prior to resetting a device
Date: Sun, 31 Aug 2025 16:31:59 -0700	[thread overview]
Message-ID: <cbceeb65dd248fd06e5665dbcb6df4484b2d8958.1756682135.git.nicolinc@nvidia.com> (raw)
In-Reply-To: <cover.1756682135.git.nicolinc@nvidia.com>

PCIe permits a device to ignore ATS invalidation TLPs while it is processing
a reset. This creates a problem visible to the OS, where an ATS invalidation
command will time out: e.g. an SVA domain has no coordination with a reset
event and can racily issue ATS invalidations to a device that is resetting.

The IMPLEMENTATION NOTE in sec 10.3.1 of the PCIe spec recommends disabling
and blocking ATS before initiating a Function Level Reset. It also mentions
that other reset methods could have the same vulnerability.

The iommu_dev_reset_prepare() and iommu_dev_reset_done() helpers are now
introduced for this purpose. Use them in all the existing reset functions,
which will attach the device to an IOMMU_DOMAIN_BLOCKED domain during a
reset, so as to allow the IOMMU driver to:
 - invoke pci_disable_ats() and pci_enable_ats(), if necessary
 - wait for all ATS invalidations to complete
 - stop issuing new ATS invalidations
 - fence any incoming ATS queries

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/pci/pci.h      |  2 ++
 drivers/pci/pci-acpi.c | 12 ++++++--
 drivers/pci/pci.c      | 68 ++++++++++++++++++++++++++++++++++++++----
 drivers/pci/quirks.c   | 18 ++++++++++-
 4 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 34f65d69662e9..9700ebca55771 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -106,6 +106,8 @@ void pci_init_reset_methods(struct pci_dev *dev);
 int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
 int pci_bus_error_reset(struct pci_dev *dev);
 int __pci_reset_bus(struct pci_bus *bus);
+int pci_reset_iommu_prepare(struct pci_dev *dev);
+void pci_reset_iommu_done(struct pci_dev *dev);
 
 struct pci_cap_saved_data {
 	u16		cap_nr;
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index ddb25960ea47d..3291424730824 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -969,6 +969,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev)
 int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
 {
 	acpi_handle handle = ACPI_HANDLE(&dev->dev);
+	int ret = 0;
 
 	if (!handle || !acpi_has_method(handle, "_RST"))
 		return -ENOTTY;
@@ -976,12 +977,19 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
 	if (probe)
 		return 0;
 
+	ret = pci_reset_iommu_prepare(dev);
+	if (ret) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return ret;
+	}
+
 	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) {
 		pci_warn(dev, "ACPI _RST failed\n");
-		return -ENOTTY;
+		ret = -ENOTTY;
 	}
 
-	return 0;
+	pci_reset_iommu_done(dev);
+	return ret;
 }
 
 bool acpi_pci_power_manageable(struct pci_dev *dev)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b0f4d98036cdd..b4ca44ea6f494 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/dmi.h>
 #include <linux/init.h>
+#include <linux/iommu.h>
 #include <linux/msi.h>
 #include <linux/of.h>
 #include <linux/pci.h>
@@ -25,6 +26,7 @@
 #include <linux/logic_pio.h>
 #include <linux/device.h>
 #include <linux/pm_runtime.h>
+#include <linux/pci-ats.h>
 #include <linux/pci_hotplug.h>
 #include <linux/vmalloc.h>
 #include <asm/dma.h>
@@ -95,6 +97,23 @@ bool pci_reset_supported(struct pci_dev *dev)
 	return dev->reset_methods[0] != 0;
 }
 
+/*
+ * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS before
+ * initiating a reset. Notify the iommu driver that enabled ATS.
+ */
+int pci_reset_iommu_prepare(struct pci_dev *dev)
+{
+	if (pci_ats_supported(dev))
+		return iommu_dev_reset_prepare(&dev->dev);
+	return 0;
+}
+
+void pci_reset_iommu_done(struct pci_dev *dev)
+{
+	if (pci_ats_supported(dev))
+		iommu_dev_reset_done(&dev->dev);
+}
+
 #ifdef CONFIG_PCI_DOMAINS
 int pci_domains_supported = 1;
 #endif
@@ -4529,13 +4548,22 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction);
  */
 int pcie_flr(struct pci_dev *dev)
 {
+	int ret = 0;
+
 	if (!pci_wait_for_pending_transaction(dev))
 		pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n");
 
+	/* Have to call it after waiting for pending DMA transaction */
+	ret = pci_reset_iommu_prepare(dev);
+	if (ret) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return ret;
+	}
+
 	pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR);
 
 	if (dev->imm_ready)
-		return 0;
+		goto done;
 
 	/*
 	 * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within
@@ -4544,7 +4572,10 @@ int pcie_flr(struct pci_dev *dev)
 	 */
 	msleep(100);
 
-	return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
+	ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
+done:
+	pci_reset_iommu_done(dev);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pcie_flr);
 
@@ -4572,6 +4603,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr);
 
 static int pci_af_flr(struct pci_dev *dev, bool probe)
 {
+	int ret = 0;
 	int pos;
 	u8 cap;
 
@@ -4598,10 +4630,17 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
 				 PCI_AF_STATUS_TP << 8))
 		pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n");
 
+	/* Have to call it after waiting for pending DMA transaction */
+	ret = pci_reset_iommu_prepare(dev);
+	if (ret) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return ret;
+	}
+
 	pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
 
 	if (dev->imm_ready)
-		return 0;
+		goto done;
 
 	/*
 	 * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006,
@@ -4611,7 +4650,10 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
 	 */
 	msleep(100);
 
-	return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
+	ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
+done:
+	pci_reset_iommu_done(dev);
+	return ret;
 }
 
 /**
@@ -4632,6 +4674,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
 static int pci_pm_reset(struct pci_dev *dev, bool probe)
 {
 	u16 csr;
+	int ret;
 
 	if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET)
 		return -ENOTTY;
@@ -4646,6 +4689,12 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
 	if (dev->current_state != PCI_D0)
 		return -EINVAL;
 
+	ret = pci_reset_iommu_prepare(dev);
+	if (ret) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return ret;
+	}
+
 	csr &= ~PCI_PM_CTRL_STATE_MASK;
 	csr |= PCI_D3hot;
 	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
@@ -4656,7 +4705,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
 	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
 	pci_dev_d3_sleep(dev);
 
-	return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
+	ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
+	pci_reset_iommu_done(dev);
+	return ret;
 }
 
 /**
@@ -5111,6 +5162,12 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
 	if (rc)
 		return -ENOTTY;
 
+	rc = pci_reset_iommu_prepare(dev);
+	if (rc) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return rc;
+	}
+
 	if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) {
 		val = reg;
 	} else {
@@ -5125,6 +5182,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
 		pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
 				      reg);
 
+	pci_reset_iommu_done(dev);
 	return rc;
 }
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index d97335a401930..c1c32e57fe267 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4225,6 +4225,22 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
 	{ 0 }
 };
 
+static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
+				    const struct pci_dev_reset_methods *i)
+{
+	int ret;
+
+	ret = pci_reset_iommu_prepare(dev);
+	if (ret) {
+		pci_err(dev, "failed to stop IOMMU\n");
+		return ret;
+	}
+
+	ret = i->reset(dev, probe);
+	pci_reset_iommu_done(dev);
+	return ret;
+}
+
 /*
  * These device-specific reset methods are here rather than in a driver
  * because when a host assigns a device to a guest VM, the host may need
@@ -4239,7 +4255,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
 		     i->vendor == (u16)PCI_ANY_ID) &&
 		    (i->device == dev->device ||
 		     i->device == (u16)PCI_ANY_ID))
-			return i->reset(dev, probe);
+			return __pci_dev_specific_reset(dev, probe, i);
 	}
 
 	return -ENOTTY;
-- 
2.43.0


      parent reply	other threads:[~2025-08-31 23:33 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-31 23:31 [PATCH v4 0/7] Disable ATS via iommu during PCI resets Nicolin Chen
2025-08-31 23:31 ` [PATCH v4 1/7] iommu/arm-smmu-v3: Add release_domain to attach prior to release_dev() Nicolin Chen
2025-09-12  9:33   ` Tian, Kevin
2025-09-15 12:35     ` Jason Gunthorpe
2025-09-19 22:47       ` Nicolin Chen
2025-09-23 17:22         ` Jason Gunthorpe
2025-09-23 17:37           ` Nicolin Chen
2025-09-23 17:44             ` Jason Gunthorpe
2025-09-23 19:46               ` Nicolin Chen
2025-08-31 23:31 ` [PATCH v4 2/7] iommu: Lock group->mutex in iommu_deferred_attach() Nicolin Chen
2025-09-12  9:34   ` Tian, Kevin
2025-08-31 23:31 ` [PATCH v4 3/7] iommu: Pass in gdev to __iommu_device_set_domain Nicolin Chen
2025-08-31 23:31 ` [PATCH v4 4/7] iommu: Pass in old domain to attach_dev callback functions Nicolin Chen
2025-09-12  9:35   ` Tian, Kevin
2025-09-19 22:56   ` Nicolin Chen
2025-09-24 18:43   ` Jason Gunthorpe
2025-09-24 19:18     ` Nicolin Chen
2025-09-24 19:22       ` Jason Gunthorpe
2025-08-31 23:31 ` [PATCH v4 5/7] iommu: Add iommu_get_domain_for_dev_locked() helper Nicolin Chen
2025-09-12  9:36   ` Tian, Kevin
2025-09-19 23:14     ` Nicolin Chen
2025-09-24 19:10   ` Jason Gunthorpe
2025-09-24 19:49     ` Nicolin Chen
2025-09-24 19:52       ` Jason Gunthorpe
2025-09-24 20:02         ` Nicolin Chen
2025-09-24 21:02           ` Jason Gunthorpe
2025-08-31 23:31 ` [PATCH v4 6/7] iommu: Introduce iommu_dev_reset_prepare() and iommu_dev_reset_done() Nicolin Chen
2025-09-12  9:49   ` Tian, Kevin
2025-09-15 12:53     ` Jason Gunthorpe
2025-09-22 19:39     ` Nicolin Chen
2025-08-31 23:31 ` Nicolin Chen [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=cbceeb65dd248fd06e5665dbcb6df4484b2d8958.1756682135.git.nicolinc@nvidia.com \
    --to=nicolinc@nvidia.com \
    --cc=alex@ghiti.fr \
    --cc=alim.akhtar@samsung.com \
    --cc=alyssa@rosenzweig.io \
    --cc=angelogioacchino.delregno@collabora.com \
    --cc=aou@eecs.berkeley.edu \
    --cc=asahi@lists.linux.dev \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=baolu.lu@linux.intel.com \
    --cc=bhelgaas@google.com \
    --cc=cwabbott0@gmail.com \
    --cc=dwmw2@infradead.org \
    --cc=etzhao1900@gmail.com \
    --cc=gerald.schaefer@linux.ibm.com \
    --cc=heiko@sntech.de \
    --cc=helgaas@kernel.org \
    --cc=iommu@lists.linux.dev \
    --cc=j@jannau.net \
    --cc=jean-philippe@linaro.org \
    --cc=jernej.skrabec@gmail.com \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=krzk@kernel.org \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-arm-msm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mediatek@lists.infradead.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=linux-rockchip@lists.infradead.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=linux-samsung-soc@vger.kernel.org \
    --cc=linux-sunxi@lists.linux.dev \
    --cc=linux-tegra@vger.kernel.org \
    --cc=m.szyprowski@samsung.com \
    --cc=matthias.bgg@gmail.com \
    --cc=mjrosato@linux.ibm.com \
    --cc=neal@gompa.dev \
    --cc=orsonzhai@gmail.com \
    --cc=palmer@dabbelt.com \
    --cc=patches@lists.linux.dev \
    --cc=paul.walmsley@sifive.com \
    --cc=quic_pbrahma@quicinc.com \
    --cc=rafael@kernel.org \
    --cc=robin.clark@oss.qualcomm.com \
    --cc=robin.murphy@arm.com \
    --cc=samuel@sholland.org \
    --cc=schnelle@linux.ibm.com \
    --cc=suravee.suthikulpanit@amd.com \
    --cc=sven@kernel.org \
    --cc=tjeznach@rivosinc.com \
    --cc=virtualization@lists.linux.dev \
    --cc=vsethi@nvidia.com \
    --cc=wens@csie.org \
    --cc=will@kernel.org \
    --cc=yi.l.liu@intel.com \
    --cc=yong.wu@mediatek.com \
    --cc=zhang.lyra@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox