All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] vfio/pci: Ensure D0 power state before guest access
@ 2026-02-09 16:20 Narayana Murty N
  2026-02-10 22:33 ` Alex Williamson
  0 siblings, 1 reply; 2+ messages in thread
From: Narayana Murty N @ 2026-02-09 16:20 UTC (permalink / raw)
  To: alex, clg, vaibhav, harshpb; +Cc: qemu-devel

Add vfio_ensure_d0_state() to safely transition PCI devices from D3hot/D3cold
to D0 before QEMU guest access, preventing config space inaccessibility and
tg3 IRQ crashes during VFIO realize.

Key changes:
- D3hot: Direct PMCSR write (offset 0x44) to force PowerState=00 (D0)
- D3cold: pm_runtime_resume() + pm_runtime_get_sync() for full power restore
- Polling loop verifies D0 transition completion
- No-op for already D0 devices

Fixes PowerPC EEH races where devices enter low-power states during VFIO
handover, causing config space access failures.

Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
---
 hw/vfio/pci.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c734472721..851cd789aa 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3392,6 +3392,51 @@ bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
     return true;
 }
 
+static int write_sysfs(const char *path, const char *value)
+{
+    FILE *f = fopen(path, "w");
+    if (!f) {
+        return -1;
+    }
+    int ret = fprintf(f, "%s", value);
+    fclose(f);
+    return (ret > 0) ? 0 : -1;
+}
+
+static void vfio_ensure_d0_state(VFIOPCIDevice *vdev)
+{
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    char sysfs_power_path[PATH_MAX];
+
+    /*
+     * Test config region accessibility (D3cold-safe, no PCI config
+     * reads!)
+     */
+    struct vfio_region_info reg_info = {
+        .argsz = sizeof(reg_info),
+        .index = VFIO_PCI_CONFIG_REGION_INDEX,
+        .offset = 0,
+        .size = 0
+    };
+
+    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info) < 0) {
+        warn_report("vfio: %s config region probe failed (D3cold): %s",
+                    vbasedev->name, strerror(errno));
+
+        /* D3cold confirmed → sysfs power control (EEH-safe) */
+        snprintf(sysfs_power_path, sizeof(sysfs_power_path),
+                 "/sys/bus/pci/devices/%s/power/control", vbasedev->name;
+
+        /* Force runtime resume */
+        if (write_sysfs(sysfs_power_path, "on") == 0) {
+            g_usleep(10000);  /* 10ms settle */
+            write_sysfs(sysfs_power_path, "auto");
+            info_report("vfio: %s D3cold → D0 via sysfs", vbasedev->name);
+        }
+    }
+    return;
+}
+
 static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
 {
     ERRP_GUARD();
@@ -3401,6 +3446,13 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
     char uuid[UUID_STR_LEN];
     g_autofree char *name = NULL;
 
+    /*
+     * ensure the power state of the pci device to D0,
+     * otherwise it will set to D0, before accessing the
+     * config space.
+     */
+    vfio_ensure_d0_state(vdev);
+
     if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
         if (!(~vdev->host.domain || ~vdev->host.bus ||
               ~vdev->host.slot || ~vdev->host.function)) {
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH 1/1] vfio/pci: Ensure D0 power state before guest access
  2026-02-09 16:20 [PATCH 1/1] vfio/pci: Ensure D0 power state before guest access Narayana Murty N
@ 2026-02-10 22:33 ` Alex Williamson
  0 siblings, 0 replies; 2+ messages in thread
From: Alex Williamson @ 2026-02-10 22:33 UTC (permalink / raw)
  To: Narayana Murty N; +Cc: clg, vaibhav, harshpb, qemu-devel

On Mon,  9 Feb 2026 21:50:48 +0530
Narayana Murty N <nnmlinux@linux.ibm.com> wrote:

> Add vfio_ensure_d0_state() to safely transition PCI devices from D3hot/D3cold
> to D0 before QEMU guest access, preventing config space inaccessibility and
> tg3 IRQ crashes during VFIO realize.
> 
> Key changes:
> - D3hot: Direct PMCSR write (offset 0x44) to force PowerState=00 (D0)
> - D3cold: pm_runtime_resume() + pm_runtime_get_sync() for full power restore
> - Polling loop verifies D0 transition completion
> - No-op for already D0 devices
> 
> Fixes PowerPC EEH races where devices enter low-power states during VFIO
> handover, causing config space access failures.
> 
> Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
> ---
>  hw/vfio/pci.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)

NAK.  This is very broken.

QEMU cannot write to arbitrary sysfs attributes.  QEMU should not write
power state controls to sysfs nor impose a device power state policy.
vbasedev->fd is more than likely invalid where we're performing an
ioctl test, making the entire premise of the test invalid.

When the device is opened by QEMU, vfio-pci will issue a
pm_runtime_resume_and_get(), incrementing the PM usage counter and
waking the device.  This should properly bring the device to the D0
power state and keep it there regardless of any ill-timed race to low
power state.  If it does not, then fix it in the kernel or block
vfio-pci from using low power states, i.e. disable_idle_d3.  Thanks,

Alex

> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index c734472721..851cd789aa 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3392,6 +3392,51 @@ bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
>      return true;
>  }
>  
> +static int write_sysfs(const char *path, const char *value)
> +{
> +    FILE *f = fopen(path, "w");
> +    if (!f) {
> +        return -1;
> +    }
> +    int ret = fprintf(f, "%s", value);
> +    fclose(f);
> +    return (ret > 0) ? 0 : -1;
> +}
> +
> +static void vfio_ensure_d0_state(VFIOPCIDevice *vdev)
> +{
> +    VFIODevice *vbasedev = &vdev->vbasedev;
> +    char sysfs_power_path[PATH_MAX];
> +
> +    /*
> +     * Test config region accessibility (D3cold-safe, no PCI config
> +     * reads!)
> +     */
> +    struct vfio_region_info reg_info = {
> +        .argsz = sizeof(reg_info),
> +        .index = VFIO_PCI_CONFIG_REGION_INDEX,
> +        .offset = 0,
> +        .size = 0
> +    };
> +
> +    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info) < 0) {
> +        warn_report("vfio: %s config region probe failed (D3cold): %s",
> +                    vbasedev->name, strerror(errno));
> +
> +        /* D3cold confirmed → sysfs power control (EEH-safe) */
> +        snprintf(sysfs_power_path, sizeof(sysfs_power_path),
> +                 "/sys/bus/pci/devices/%s/power/control", vbasedev->name;
> +
> +        /* Force runtime resume */
> +        if (write_sysfs(sysfs_power_path, "on") == 0) {
> +            g_usleep(10000);  /* 10ms settle */
> +            write_sysfs(sysfs_power_path, "auto");
> +            info_report("vfio: %s D3cold → D0 via sysfs", vbasedev->name);
> +        }
> +    }
> +    return;
> +}
> +
>  static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
>  {
>      ERRP_GUARD();
> @@ -3401,6 +3446,13 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp)
>      char uuid[UUID_STR_LEN];
>      g_autofree char *name = NULL;
>  
> +    /*
> +     * ensure the power state of the pci device to D0,
> +     * otherwise it will set to D0, before accessing the
> +     * config space.
> +     */
> +    vfio_ensure_d0_state(vdev);
> +
>      if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
>          if (!(~vdev->host.domain || ~vdev->host.bus ||
>                ~vdev->host.slot || ~vdev->host.function)) {



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-02-10 22:34 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-09 16:20 [PATCH 1/1] vfio/pci: Ensure D0 power state before guest access Narayana Murty N
2026-02-10 22:33 ` Alex Williamson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.