[PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC

Kernel KVM virtualization development
 help / color / mirror / Atom feed

* [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
@ 2026-05-28  9:38 Ankit Agrawal
  2026-05-28 10:22 ` sashiko-bot
  2026-05-28 17:56 ` Alex Williamson
  0 siblings, 2 replies; 5+ messages in thread
From: Ankit Agrawal @ 2026-05-28  9:38 UTC (permalink / raw)
  To: alex, kvm
  Cc: jgg, yishaih, skolothumtho, kevin.tian, ankita, bhelgaas,
	linux-kernel, linux-pci

Add a CXL DVSEC-based readiness check for Blackwell-Next GPUs alongside
the existing legacy BAR0 polling path. On probe and after reset, the
driver reads the CXL Device DVSEC capability to determine whether the
GPU memory is ready. The CXL DVSEC offset is discovered at probe time;
subsequent paths branch on its presence to invoke either the legacy
BAR0 polling or the CXL DVSEC polling.

Memory readiness is checked by polling the Memory_Active bit for up to
the duration encoded in Memory_Active_Timeout. The driver also checks
that MEM_INFO_VALID is set within 1 second and returns an error if it
is not. This is based on the CXL spec r4.0 sec 8.1.3.8.2.

Because the worst-case wait on the CXL path is 256s, the wait is done
outside of memory_lock. The memory_lock is taken only after the wait,
to check the state.

Add PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT to pci_regs.h for the timeout
field encoding.

Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Suggested-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 144 ++++++++++++++++++++++++++--
 include/uapi/linux/pci_regs.h       |   1 +
 2 files changed, 136 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index fa056b69f899..04fcc0d088f5 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -3,7 +3,9 @@
  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
  */
 
+#include <linux/bitfield.h>
 #include <linux/sizes.h>
+#include <linux/time64.h>
 #include <linux/vfio_pci_core.h>
 #include <linux/delay.h>
 #include <linux/jiffies.h>
@@ -64,6 +66,8 @@ struct nvgrace_gpu_pci_core_device {
 	bool has_mig_hw_bug;
 	/* GPU has just been reset */
 	bool reset_done;
+	/* CXL Device DVSEC offset; 0 if not present (legacy GB path) */
+	int cxl_dvsec;
 };
 
 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -242,7 +246,7 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
-static int nvgrace_gpu_wait_device_ready(void __iomem *io)
+static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io)
 {
 	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
 
@@ -256,10 +260,89 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
 	return -ETIME;
 }
 
+/*
+ * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low
+ * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2:
+ * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s,
+ * 101b-111b = reserved (clamped to 256s).
+ */
+static inline unsigned long cxl_mem_active_timeout_ms(u8 timeout)
+{
+	return MSEC_PER_SEC << (2 * min_t(u8, timeout, 4));
+}
+
+/*
+ * Check if CXL DVSEC reports memory as valid and active.
+ */
+static inline bool cxl_dvsec_mem_is_active(u32 status)
+{
+	return (status & PCI_DVSEC_CXL_MEM_INFO_VALID) &&
+	       (status & PCI_DVSEC_CXL_MEM_ACTIVE);
+}
+
+static int nvgrace_gpu_test_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev,
+					     u32 *status)
+{
+	struct pci_dev *pdev = nvdev->core_device.pdev;
+	int cxl_dvsec = nvdev->cxl_dvsec;
+	u32 val;
+
+	pci_read_config_dword(pdev,
+			      cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
+			      &val);
+
+	if (val == ~0U)
+		return -ENODEV;
+
+	if (status)
+		*status = val;
+
+	if (cxl_dvsec_mem_is_active(val))
+		return 0;
+
+	return -EAGAIN;
+}
+
+/*
+ * As per CXL spec r4.0 sec 8.1.3.8.2, MEM_INFO_VALID needs to be set
+ * within 1s and MEM_ACTIVE within Memory_Active_Timeout (up to ~256s)
+ * after reset and bootup.
+ */
+static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+	unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS);
+	bool active_phase = false;
+	u32 status;
+	int ret;
+
+	for (;;) {
+		ret = nvgrace_gpu_test_device_ready_cxl(nvdev, &status);
+		if (ret != -EAGAIN)
+			return ret;
+
+		if (!active_phase && (status & PCI_DVSEC_CXL_MEM_INFO_VALID)) {
+			u8 t = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT, status);
+
+			deadline = jiffies +
+				   msecs_to_jiffies(cxl_mem_active_timeout_ms(t));
+			active_phase = true;
+		}
+
+		if (time_after(jiffies, deadline))
+			return -ETIME;
+
+		msleep(POLL_QUANTUM_MS);
+	}
+}
+
 /*
  * If the GPU memory is accessed by the CPU while the GPU is not ready
  * after reset, it can cause harmless corrected RAS events to be logged.
  * Make sure the GPU is ready before establishing the mappings.
+ *
+ * Since the CXL polling wait could take 256s, it happens outside
+ * memory_lock. Only do quick readiness check under the lock. Legacy
+ * keeps the in-lock poll.
  */
 static int
 nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
@@ -275,7 +358,10 @@ nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
 	if (!__vfio_pci_memory_enabled(vdev))
 		return -EIO;
 
-	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+	if (nvdev->cxl_dvsec)
+		ret = nvgrace_gpu_test_device_ready_cxl(nvdev, NULL);
+	else
+		ret = nvgrace_gpu_wait_device_ready_legacy(vdev->barmap[0]);
 	if (ret)
 		return ret;
 
@@ -313,6 +399,21 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
 	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
 
 	if (is_aligned_for_order(vma, addr, pfn, order)) {
+		/*
+		 * Exit early under memory_lock to avoid a potentially lengthy
+		 * device readiness wait on a runtime-suspended device. Any
+		 * race after the lock is dropped is benign as the re-check
+		 * inside the scoped guard below catches it.
+		 */
+		scoped_guard(rwsem_read, &vdev->memory_lock) {
+			if (vdev->pm_runtime_engaged)
+				return VM_FAULT_SIGBUS;
+		}
+
+		if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) &&
+		    nvgrace_gpu_wait_device_ready_cxl(nvdev))
+			return VM_FAULT_SIGBUS;
+
 		scoped_guard(rwsem_read, &vdev->memory_lock) {
 			if (vdev->pm_runtime_engaged ||
 			    nvgrace_gpu_check_device_ready(nvdev))
@@ -712,6 +813,12 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	else
 		mem_count = min(count, memregion->memlength - (size_t)offset);
 
+	if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) {
+		ret = nvgrace_gpu_wait_device_ready_cxl(nvdev);
+		if (ret)
+			return ret;
+	}
+
 	scoped_guard(rwsem_read, &vdev->memory_lock) {
 		ret = nvgrace_gpu_check_device_ready(nvdev);
 		if (ret)
@@ -846,6 +953,12 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
 	 */
 	mem_count = min(count, memregion->memlength - (size_t)offset);
 
+	if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) {
+		ret = nvgrace_gpu_wait_device_ready_cxl(nvdev);
+		if (ret)
+			return ret;
+	}
+
 	scoped_guard(rwsem_read, &vdev->memory_lock) {
 		ret = nvgrace_gpu_check_device_ready(nvdev);
 		if (ret)
@@ -1143,14 +1256,24 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
  * is beneficial to make the check to ensure the device is in an
  * expected state.
  *
- * Ensure that the BAR0 region is enabled before accessing the
+ * On Blackwell-Next systems, memory readiness is determined via the
+ * CXL Device DVSEC in PCI config space and does not require BAR0.
+ * For the legacy path, ensure BAR0 is enabled before accessing the
  * registers.
  */
-static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
+static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
 {
+	struct pci_dev *pdev = nvdev->core_device.pdev;
 	void __iomem *io;
 	int ret;
 
+	/*
+	 * Note that the worst-case wait here is ~256s (vs ~30s on the
+	 * legacy path) and may block device unbind/sysfs for the duration.
+	 */
+	if (nvdev->cxl_dvsec)
+		return nvgrace_gpu_wait_device_ready_cxl(nvdev);
+
 	ret = pci_enable_device(pdev);
 	if (ret)
 		return ret;
@@ -1165,7 +1288,7 @@ static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
 		goto iomap_exit;
 	}
 
-	ret = nvgrace_gpu_wait_device_ready(io);
+	ret = nvgrace_gpu_wait_device_ready_legacy(io);
 
 	pci_iounmap(pdev, io);
 iomap_exit:
@@ -1183,10 +1306,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	u64 memphys, memlength;
 	int ret;
 
-	ret = nvgrace_gpu_probe_check_device_ready(pdev);
-	if (ret)
-		return ret;
-
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
 	if (!ret)
 		ops = &nvgrace_gpu_pci_ops;
@@ -1196,6 +1315,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	if (IS_ERR(nvdev))
 		return PTR_ERR(nvdev);
 
+	nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+						     PCI_DVSEC_CXL_DEVICE);
+
+	ret = nvgrace_gpu_probe_check_device_ready(nvdev);
+	if (ret)
+		goto out_put_vdev;
+
 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
 
 	if (ops == &nvgrace_gpu_pci_ops) {
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 14f634ab9350..718fb630f5bb 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1357,6 +1357,7 @@
 #define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
 #define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
 #define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
+#define   PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT		__GENMASK(15, 13)
 #define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
 #define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
 #define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
  2026-05-28  9:38 [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC Ankit Agrawal
@ 2026-05-28 10:22 ` sashiko-bot
  2026-05-28 17:56 ` Alex Williamson
  1 sibling, 0 replies; 5+ messages in thread
From: sashiko-bot @ 2026-05-28 10:22 UTC (permalink / raw)
  To: Ankit Agrawal; +Cc: linux-pci, kvm

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] TOCTOU race in lockless device readiness checks causes unexpected -EAGAIN or VM_FAULT_SIGBUS errors instead of waiting for the device.
- [High] The 256-second device readiness polling uses uninterruptible `msleep()` loops, which can trigger hung task panics and renders user tasks unkillable.
--

commit a85819b2171b67b275d15e9b87eb7447b4dd63a4
Author: Ankit Agrawal <ankita@nvidia.com>

vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC

Adds a CXL DVSEC-based readiness check for Blackwell-Next GPUs in addition
to the existing legacy BAR0 polling. The device readiness is determined by
polling the Memory_Active bit outside the memory lock for up to 256 seconds.

> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index fa056b69f899a..04fcc0d088f54 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c

[ ... ]

> +static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev)
> +{
> +	unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS);
> +	bool active_phase = false;
> +	u32 status;
> +	int ret;
> +
> +	for (;;) {

[ ... ]

> +		if (time_after(jiffies, deadline))
> +			return -ETIME;
> +
> +		msleep(POLL_QUANTUM_MS);
> +	}
> +}

[Severity: High]
Will this uninterruptible sleep trigger hung task panics?

Since the CXL device readiness can take up to 256 seconds and msleep() places
the task in the TASK_UNINTERRUPTIBLE state, processes accessing the device
during a reset will become unresponsive and immune to signals (including
SIGKILL) for up to 4 minutes.

With the default Linux hung_task_timeout_secs at 120 seconds, blocking here
is highly likely to trigger the hung task detector and cause a kernel panic
on systems with hung_task_panic enabled. Would it be better to use an
interruptible sleep and abort on pending fatal signals?

[ ... ]

> @@ -313,6 +399,21 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
>  	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
>  
>  	if (is_aligned_for_order(vma, addr, pfn, order)) {
> +		/*
> +		 * Exit early under memory_lock to avoid a potentially lengthy
> +		 * device readiness wait on a runtime-suspended device. Any
> +		 * race after the lock is dropped is benign as the re-check
> +		 * inside the scoped guard below catches it.
> +		 */
> +		scoped_guard(rwsem_read, &vdev->memory_lock) {
> +			if (vdev->pm_runtime_engaged)
> +				return VM_FAULT_SIGBUS;
> +		}
> +
> +		if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) &&
> +		    nvgrace_gpu_wait_device_ready_cxl(nvdev))
> +			return VM_FAULT_SIGBUS;

[Severity: High]
Can this lockless check race with a concurrent device reset and bypass the
intended polling mechanism?

If a concurrent device reset finishes and sets reset_done = true right after
this lockless check evaluates to false, the thread proceeds without polling
and blocks waiting for memory_lock.

Once the lock is acquired, the thread calls nvgrace_gpu_check_device_ready()
which observes reset_done == true and invokes the fallback
nvgrace_gpu_test_device_ready_cxl() under the lock. Because this CXL test is
non-blocking, it immediately returns -EAGAIN, causing
nvgrace_gpu_vfio_pci_huge_fault() to return VM_FAULT_SIGBUS and crash user
applications instead of waiting.

> +
>  		scoped_guard(rwsem_read, &vdev->memory_lock) {
>  			if (vdev->pm_runtime_engaged ||
>  			    nvgrace_gpu_check_device_ready(nvdev))

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260528093840.4018-1-ankita@nvidia.com?part=1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
  2026-05-28  9:38 [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC Ankit Agrawal
  2026-05-28 10:22 ` sashiko-bot
@ 2026-05-28 17:56 ` Alex Williamson
  2026-05-29 11:27   ` Ankit Agrawal
  2026-06-08 15:47   ` Bjorn Helgaas
  1 sibling, 2 replies; 5+ messages in thread
From: Alex Williamson @ 2026-05-28 17:56 UTC (permalink / raw)
  To: Ankit Agrawal
  Cc: kvm, jgg, yishaih, skolothumtho, kevin.tian, bhelgaas,
	linux-kernel, linux-pci, alex

On Thu, 28 May 2026 09:38:40 +0000
Ankit Agrawal <ankita@nvidia.com> wrote:
...
>  drivers/vfio/pci/nvgrace-gpu/main.c | 144 ++++++++++++++++++++++++++--
>  include/uapi/linux/pci_regs.h       |   1 +

Bjorn, I assume you don't object to this trivial addition:

> diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
> index 14f634ab9350..718fb630f5bb 100644
> --- a/include/uapi/linux/pci_regs.h
> +++ b/include/uapi/linux/pci_regs.h
> @@ -1357,6 +1357,7 @@
>  #define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
>  #define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
>  #define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
> +#define   PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT		__GENMASK(15, 13)
>  #define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
>  #define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
>  #define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))

>  2 files changed, 136 insertions(+), 9 deletions(-)

Replying to the sashiko comments in thread to keep all the reviewers in
the loop...

On Thu, 28 May 2026 10:22:14 +0000
sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
> - [High] TOCTOU race in lockless device readiness checks causes unexpected -EAGAIN or VM_FAULT_SIGBUS errors instead of waiting for the device.
> - [High] The 256-second device readiness polling uses uninterruptible `msleep()` loops, which can trigger hung task panics and renders user tasks unkillable.
> --
> 
> commit a85819b2171b67b275d15e9b87eb7447b4dd63a4
> Author: Ankit Agrawal <ankita@nvidia.com>
> 
> vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
> 
> Adds a CXL DVSEC-based readiness check for Blackwell-Next GPUs in addition
> to the existing legacy BAR0 polling. The device readiness is determined by
> polling the Memory_Active bit outside the memory lock for up to 256 seconds.
> 
> > diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> > index fa056b69f899a..04fcc0d088f54 100644
> > --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> > +++ b/drivers/vfio/pci/nvgrace-gpu/main.c  
> 
> [ ... ]
> 
> > +static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev)
> > +{
> > +	unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS);
> > +	bool active_phase = false;
> > +	u32 status;
> > +	int ret;
> > +
> > +	for (;;) {  
> 
> [ ... ]
> 
> > +		if (time_after(jiffies, deadline))
> > +			return -ETIME;
> > +
> > +		msleep(POLL_QUANTUM_MS);
> > +	}
> > +}  
> 
> [Severity: High]
> Will this uninterruptible sleep trigger hung task panics?
> 
> Since the CXL device readiness can take up to 256 seconds and msleep() places
> the task in the TASK_UNINTERRUPTIBLE state, processes accessing the device
> during a reset will become unresponsive and immune to signals (including
> SIGKILL) for up to 4 minutes.
> 
> With the default Linux hung_task_timeout_secs at 120 seconds, blocking here
> is highly likely to trigger the hung task detector and cause a kernel panic
> on systems with hung_task_panic enabled. Would it be better to use an
> interruptible sleep and abort on pending fatal signals?

The legacy path already has this same issue, but the timeout is 30s
rather than 256s.  Both locations could pretty trivially change to:

	msleep_interruptible(POLL_QUANTUM_MS);
	if (fatal_signal_pending(current))
		return -EINTR;

It's maybe worth a follow-up to address both for a little more
robustness.

> > @@ -313,6 +399,21 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
> >  	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
> >  
> >  	if (is_aligned_for_order(vma, addr, pfn, order)) {
> > +		/*
> > +		 * Exit early under memory_lock to avoid a potentially lengthy
> > +		 * device readiness wait on a runtime-suspended device. Any
> > +		 * race after the lock is dropped is benign as the re-check
> > +		 * inside the scoped guard below catches it.
> > +		 */
> > +		scoped_guard(rwsem_read, &vdev->memory_lock) {
> > +			if (vdev->pm_runtime_engaged)
> > +				return VM_FAULT_SIGBUS;
> > +		}
> > +
> > +		if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) &&
> > +		    nvgrace_gpu_wait_device_ready_cxl(nvdev))
> > +			return VM_FAULT_SIGBUS;  
> 
> [Severity: High]
> Can this lockless check race with a concurrent device reset and bypass the
> intended polling mechanism?
> 
> If a concurrent device reset finishes and sets reset_done = true right after
> this lockless check evaluates to false, the thread proceeds without polling
> and blocks waiting for memory_lock.
> 
> Once the lock is acquired, the thread calls nvgrace_gpu_check_device_ready()
> which observes reset_done == true and invokes the fallback
> nvgrace_gpu_test_device_ready_cxl() under the lock. Because this CXL test is
> non-blocking, it immediately returns -EAGAIN, causing
> nvgrace_gpu_vfio_pci_huge_fault() to return VM_FAULT_SIGBUS and crash user
> applications instead of waiting.

This doesn't really seem to break our contract with userspace: they're
issuing a reset while the device is taking a fault, and reset necessarily
disables memory, so a SIGBUS is a valid outcome.  We could make
this a bit more robust to such a race with a retry, but it would be
contingent on the interruptible msleep above or else the user could
potentially trigger endless resets during fault.  Something like this:

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 01b819156c03..9e7f8a7e1abc 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -415,14 +415,21 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
                        if (vdev->pm_runtime_engaged)
                                return VM_FAULT_SIGBUS;
                }
-
+retry:
                if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) &&
                    nvgrace_gpu_wait_device_ready_cxl(nvdev))
                        return VM_FAULT_SIGBUS;
 
                scoped_guard(rwsem_read, &vdev->memory_lock) {
-                       if (vdev->pm_runtime_engaged ||
-                           nvgrace_gpu_check_device_ready(nvdev))
+                       int rc;
+
+                       if (vdev->pm_runtime_engaged)
+                               return VM_FAULT_SIGBUS;
+
+                       rc = nvgrace_gpu_check_device_ready(nvdev);
+                       if (rc == -EAGAIN)
+                               goto retry;
+                       if (rc)
                                return VM_FAULT_SIGBUS;
 
                        ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);

Note that the read/write paths also have this gap where we can wait for
the device to be ready, but the check under memory_lock returns
-EAGAIN.  The difference is that userspace will already automatically
handle the -EAGAIN, whereas the SIGBUS could be fatal.

I think both of the above would be good improvements, though I don't
think either is strictly necessary.  I don't spot any other must-fix
issues, any concerns from others?  Thanks,

Alex

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
  2026-05-28 17:56 ` Alex Williamson
@ 2026-05-29 11:27   ` Ankit Agrawal
  2026-06-08 15:47   ` Bjorn Helgaas
  1 sibling, 0 replies; 5+ messages in thread
From: Ankit Agrawal @ 2026-05-29 11:27 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm@vger.kernel.org, jgg@ziepe.ca, Yishai Hadas,
	Shameer Kolothum Thodi, kevin.tian@intel.com, bhelgaas@google.com,
	linux-kernel@vger.kernel.org, linux-pci@vger.kernel.org

> The legacy path already has this same issue, but the timeout is 30s
> rather than 256s.  Both locations could pretty trivially change to:
>     msleep_interruptible(POLL_QUANTUM_MS);
>     if (fatal_signal_pending(current))
>             return -EINTR;

Good point, thanks. I'll fold this into both the CXL and legacy waits
for v8.

> We could make this a bit more robust to such a race with a retry, but
> it would be contingent on the interruptible msleep above

Sounds good, I'll add the retry in nvgrace_gpu_vfio_pci_huge_fault() as
you have it.

> Note that the read/write paths also have this gap [...] userspace will
> already automatically handle the -EAGAIN

Agreed.

Thanks for the review!  I'll send v8 with both changes.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
  2026-05-28 17:56 ` Alex Williamson
  2026-05-29 11:27   ` Ankit Agrawal
@ 2026-06-08 15:47   ` Bjorn Helgaas
  1 sibling, 0 replies; 5+ messages in thread
From: Bjorn Helgaas @ 2026-06-08 15:47 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Ankit Agrawal, kvm, jgg, yishaih, skolothumtho, kevin.tian,
	bhelgaas, linux-kernel, linux-pci

On Thu, May 28, 2026 at 11:56:13AM -0600, Alex Williamson wrote:
> On Thu, 28 May 2026 09:38:40 +0000
> Ankit Agrawal <ankita@nvidia.com> wrote:
> ...
> >  drivers/vfio/pci/nvgrace-gpu/main.c | 144 ++++++++++++++++++++++++++--
> >  include/uapi/linux/pci_regs.h       |   1 +
> 
> Bjorn, I assume you don't object to this trivial addition:
> 
> > diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
> > index 14f634ab9350..718fb630f5bb 100644
> > --- a/include/uapi/linux/pci_regs.h
> > +++ b/include/uapi/linux/pci_regs.h
> > @@ -1357,6 +1357,7 @@
> >  #define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
> >  #define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
> >  #define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
> > +#define   PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT		__GENMASK(15, 13)
> >  #define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
> >  #define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
> >  #define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))

Yep, looks fine to me.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-06-08 15:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-28  9:38 [PATCH v7 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC Ankit Agrawal
2026-05-28 10:22 ` sashiko-bot
2026-05-28 17:56 ` Alex Williamson
2026-05-29 11:27   ` Ankit Agrawal
2026-06-08 15:47   ` Bjorn Helgaas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox