public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
@ 2026-04-09 13:36 Ankit Agrawal
  2026-04-09 18:30 ` Alex Williamson
  0 siblings, 1 reply; 2+ messages in thread
From: Ankit Agrawal @ 2026-04-09 13:36 UTC (permalink / raw)
  To: alex
  Cc: kvm, jgg, yishaih, skolothumtho, kevin.tian, ankita, bhelgaas,
	linux-kernel, linux-pci

Add a CXL DVSEC-based readiness check for Blackwell-Next GPUs alongside
the existing legacy BAR0 polling path. On probe and after reset, the
driver reads the CXL Device DVSEC capability to determine whether the
GPU memory is valid. This is done by polling the Memory_Active bit until
it is set or the period indicated by Memory_Active_Timeout expires.

A static inline wrapper dispatches to the appropriate readiness check
based on whether the CXL DVSEC capability is present.

Suggested-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 75 ++++++++++++++++++++++++++---
 include/uapi/linux/pci_regs.h       |  1 +
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index fa056b69f899..52f7e3a3054a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -64,6 +64,8 @@ struct nvgrace_gpu_pci_core_device {
 	bool has_mig_hw_bug;
 	/* GPU has just been reset */
 	bool reset_done;
+	/* CXL Device DVSEC offset; 0 if not present (legacy GB path) */
+	int cxl_dvsec;
 };
 
 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -242,7 +244,7 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
-static int nvgrace_gpu_wait_device_ready(void __iomem *io)
+static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io)
 {
 	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
 
@@ -256,6 +258,59 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
 	return -ETIME;
 }
 
+/*
+ * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low
+ * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2:
+ * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s,
+ * 101b-111b = reserved (clamped to 256s).
+ */
+static inline unsigned long nvgrace_gpu_cxl_mem_active_timeout_ms(u8 timeout)
+{
+	return 1000UL << (2 * min_t(u8, timeout, 4));
+}
+
+static int nvgrace_gpu_wait_device_ready_bw_next(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+	struct pci_dev *pdev = nvdev->core_device.pdev;
+	int pcie_dvsec = nvdev->cxl_dvsec;
+	unsigned long timeout;
+	u32 dvsec_memory_status;
+	u8 mem_active_timeout;
+
+	pci_read_config_dword(pdev, pcie_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
+			      &dvsec_memory_status);
+
+	if (!(dvsec_memory_status & PCI_DVSEC_CXL_MEM_INFO_VALID))
+		return -ENODEV;
+
+	mem_active_timeout = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT,
+				       dvsec_memory_status);
+
+	timeout = jiffies +
+		  msecs_to_jiffies(nvgrace_gpu_cxl_mem_active_timeout_ms(mem_active_timeout));
+
+	do {
+		pci_read_config_dword(pdev,
+				      pcie_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
+				      &dvsec_memory_status);
+
+		if (dvsec_memory_status & PCI_DVSEC_CXL_MEM_ACTIVE)
+			return 0;
+
+		msleep(POLL_QUANTUM_MS);
+	} while (!time_after(jiffies, timeout));
+
+	return -ETIME;
+}
+
+static inline int nvgrace_gpu_wait_device_ready(struct nvgrace_gpu_pci_core_device *nvdev,
+						void __iomem *io)
+{
+	return nvdev->cxl_dvsec ?
+		nvgrace_gpu_wait_device_ready_bw_next(nvdev) :
+		nvgrace_gpu_wait_device_ready_legacy(io);
+}
+
 /*
  * If the GPU memory is accessed by the CPU while the GPU is not ready
  * after reset, it can cause harmless corrected RAS events to be logged.
@@ -275,7 +330,7 @@ nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
 	if (!__vfio_pci_memory_enabled(vdev))
 		return -EIO;
 
-	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+	ret = nvgrace_gpu_wait_device_ready(nvdev, vdev->barmap[0]);
 	if (ret)
 		return ret;
 
@@ -1146,8 +1201,9 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
  * Ensure that the BAR0 region is enabled before accessing the
  * registers.
  */
-static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
+static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
 {
+	struct pci_dev *pdev = nvdev->core_device.pdev;
 	void __iomem *io;
 	int ret;
 
@@ -1165,7 +1221,7 @@ static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
 		goto iomap_exit;
 	}
 
-	ret = nvgrace_gpu_wait_device_ready(io);
+	ret = nvgrace_gpu_wait_device_ready(nvdev, io);
 
 	pci_iounmap(pdev, io);
 iomap_exit:
@@ -1183,10 +1239,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	u64 memphys, memlength;
 	int ret;
 
-	ret = nvgrace_gpu_probe_check_device_ready(pdev);
-	if (ret)
-		return ret;
-
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
 	if (!ret)
 		ops = &nvgrace_gpu_pci_ops;
@@ -1198,6 +1250,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 
 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
 
+	nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+						     PCI_DVSEC_CXL_DEVICE);
+
+	ret = nvgrace_gpu_probe_check_device_ready(nvdev);
+	if (ret)
+		goto out_put_vdev;
+
 	if (ops == &nvgrace_gpu_pci_ops) {
 		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 14f634ab9350..718fb630f5bb 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1357,6 +1357,7 @@
 #define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
 #define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
 #define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
+#define   PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT		__GENMASK(15, 13)
 #define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
 #define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
 #define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v2 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
  2026-04-09 13:36 [PATCH v2 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC Ankit Agrawal
@ 2026-04-09 18:30 ` Alex Williamson
  0 siblings, 0 replies; 2+ messages in thread
From: Alex Williamson @ 2026-04-09 18:30 UTC (permalink / raw)
  To: Ankit Agrawal, bhelgaas
  Cc: kvm, jgg, yishaih, skolothumtho, kevin.tian, linux-kernel,
	linux-pci, alex

On Thu, 9 Apr 2026 13:36:51 +0000
Ankit Agrawal <ankita@nvidia.com> wrote:

> Add a CXL DVSEC-based readiness check for Blackwell-Next GPUs alongside
> the existing legacy BAR0 polling path. On probe and after reset, the
> driver reads the CXL Device DVSEC capability to determine whether the
> GPU memory is valid. This is checked by polling on the Memory_Active bit
> based on the Memory_Active_Timeout.
> 
> A static inline wrapper dispatches to the appropriate readiness check
> based on whether the CXL DVSEC capability is present.
> 
> Suggested-by: Alex Williamson <alex@shazbot.org>
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 75 ++++++++++++++++++++++++++---
>  include/uapi/linux/pci_regs.h       |  1 +
>  2 files changed, 68 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index fa056b69f899..52f7e3a3054a 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -64,6 +64,8 @@ struct nvgrace_gpu_pci_core_device {
>  	bool has_mig_hw_bug;
>  	/* GPU has just been reset */
>  	bool reset_done;
> +	/* CXL Device DVSEC offset; 0 if not present (legacy GB path) */
> +	int cxl_dvsec;
>  };
>  
>  static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
> @@ -242,7 +244,7 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
>  	vfio_pci_core_close_device(core_vdev);
>  }
>  
> -static int nvgrace_gpu_wait_device_ready(void __iomem *io)
> +static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io)
>  {
>  	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
>  
> @@ -256,6 +258,59 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
>  	return -ETIME;
>  }
>  
> +/*
> + * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low
> + * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2:
> + * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s,
> + * 101b-111b = reserved (clamped to 256s).
> + */
> +static inline unsigned long nvgrace_gpu_cxl_mem_active_timeout_ms(u8 timeout)
> +{
> +	return 1000UL << (2 * min_t(u8, timeout, 4));
> +}
> +
> +static int nvgrace_gpu_wait_device_ready_bw_next(struct nvgrace_gpu_pci_core_device *nvdev)
> +{
> +	struct pci_dev *pdev = nvdev->core_device.pdev;
> +	int pcie_dvsec = nvdev->cxl_dvsec;
> +	unsigned long timeout;
> +	u32 dvsec_memory_status;
> +	u8 mem_active_timeout;
> +
> +	pci_read_config_dword(pdev, pcie_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
> +			      &dvsec_memory_status);
> +
> +	if (!(dvsec_memory_status & PCI_DVSEC_CXL_MEM_INFO_VALID))
> +		return -ENODEV;

Nit, if MEM_ACTIVE is already set, we still read it twice rather than
exit here:

	if (dvsec_memory_status & PCI_DVSEC_CXL_MEM_ACTIVE)
		return 0;


> +
> +	mem_active_timeout = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT,
> +				       dvsec_memory_status);
> +
> +	timeout = jiffies +
> +		  msecs_to_jiffies(nvgrace_gpu_cxl_mem_active_timeout_ms(mem_active_timeout));
> +
> +	do {
> +		pci_read_config_dword(pdev,
> +				      pcie_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
> +				      &dvsec_memory_status);
> +
> +		if (dvsec_memory_status & PCI_DVSEC_CXL_MEM_ACTIVE)
> +			return 0;

Do we need to monitor PCI_DVSEC_CXL_MEM_INFO_VALID in the loop too?

> +
> +		msleep(POLL_QUANTUM_MS);
> +	} while (!time_after(jiffies, timeout));
> +
> +	return -ETIME;
> +}
> +
> +static inline int nvgrace_gpu_wait_device_ready(struct nvgrace_gpu_pci_core_device *nvdev,
> +						void __iomem *io)
> +{
> +	return nvdev->cxl_dvsec ?
> +		nvgrace_gpu_wait_device_ready_bw_next(nvdev) :
> +		nvgrace_gpu_wait_device_ready_legacy(io);
> +}
> +
>  /*
>   * If the GPU memory is accessed by the CPU while the GPU is not ready
>   * after reset, it can cause harmless corrected RAS events to be logged.
> @@ -275,7 +330,7 @@ nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
>  	if (!__vfio_pci_memory_enabled(vdev))
>  		return -EIO;
>  
> -	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
> +	ret = nvgrace_gpu_wait_device_ready(nvdev, vdev->barmap[0]);
>  	if (ret)
>  		return ret;
>  
> @@ -1146,8 +1201,9 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
>   * Ensure that the BAR0 region is enabled before accessing the
>   * registers.
>   */
> -static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
> +static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
>  {
> +	struct pci_dev *pdev = nvdev->core_device.pdev;
>  	void __iomem *io;
>  	int ret;
>  
> @@ -1165,7 +1221,7 @@ static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
>  		goto iomap_exit;
>  	}
>  
> -	ret = nvgrace_gpu_wait_device_ready(io);
> +	ret = nvgrace_gpu_wait_device_ready(nvdev, io);
>  
>  	pci_iounmap(pdev, io);
>  iomap_exit:
> @@ -1183,10 +1239,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  	u64 memphys, memlength;
>  	int ret;
>  
> -	ret = nvgrace_gpu_probe_check_device_ready(pdev);
> -	if (ret)
> -		return ret;
> -
>  	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
>  	if (!ret)
>  		ops = &nvgrace_gpu_pci_ops;
> @@ -1198,6 +1250,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  
>  	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
>  
> +	nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
> +						     PCI_DVSEC_CXL_DEVICE);
> +
> +	ret = nvgrace_gpu_probe_check_device_ready(nvdev);
> +	if (ret)
> +		goto out_put_vdev;
> +
>  	if (ops == &nvgrace_gpu_pci_ops) {
>  		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
>  
> diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
> index 14f634ab9350..718fb630f5bb 100644
> --- a/include/uapi/linux/pci_regs.h
> +++ b/include/uapi/linux/pci_regs.h
> @@ -1357,6 +1357,7 @@
>  #define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
>  #define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
>  #define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
> +#define   PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT		__GENMASK(15, 13)

Bjorn, please ack if this is ok to go through vfio.  Thanks,

Alex

>  #define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
>  #define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
>  #define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-04-09 18:30 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-09 13:36 [PATCH v2 1/1] vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC Ankit Agrawal
2026-04-09 18:30 ` Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox