qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Eric Auger <eric.auger@redhat.com>
To: eric.auger.pro@gmail.com, eric.auger@redhat.com,
	qemu-devel@nongnu.org, alex.williamson@redhat.com,
	clg@redhat.com, zhenzhong.duan@intel.com
Subject: [RFC 2/2] hw/vfio/pci: Prevents BARs from being dma mapped in d3hot state
Date: Wed, 19 Feb 2025 18:59:00 +0100	[thread overview]
Message-ID: <20250219175941.135390-3-eric.auger@redhat.com> (raw)
In-Reply-To: <20250219175941.135390-1-eric.auger@redhat.com>

Since kernel commit:
2b2c651baf1c ("vfio/pci: Invalidate mmaps and block the access
in D3hot power state")
any attempt to do an mmap access to a BAR when the device is in D3hot
state will generate a fault.

On system_powerdown, if the VFIO device is translated by an IOMMU,
the device is moved to D3hot state and then the vIOMMU gets disabled
by the guest. As a result of this latter operation, the address space is
swapped from translated to untranslated. When re-enabling the aliased
regions, the RAM regions are dma-mapped again and this causes DMA_MAP
faults when attempting the operation on BARs.

To avoid remapping those BARs, we need to know whether the device is
in a state that is incompatible with DMA mapping.

Implement the vfio_is_dma_map_allowed() callback for PCI devices.
If the device is in D3hot state, skip the DMA MAP in
vfio_listener_region_add().

To ease the implementation, vfio_section_is_vfio_pci now returns
a VFIOPCIDevice pointer and the function is moved before the first
caller.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
 hw/vfio/common.c     | 57 +++++++++++++++++++++++++++-----------------
 hw/vfio/pci.c        | 22 +++++++++++++++++
 hw/vfio/trace-events |  1 +
 3 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 173fb3a997..96f401f10a 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -555,11 +555,34 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
     return true;
 }
 
+static VFIOPCIDevice *vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                     VFIOContainerBase *bcontainer)
+{
+    VFIOPCIDevice *pcidev;
+    VFIODevice *vbasedev;
+    Object *owner;
+
+    owner = memory_region_owner(section->mr);
+
+    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+            continue;
+        }
+        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+        if (OBJECT(pcidev) == owner) {
+            return pcidev;
+        }
+    }
+
+    return NULL;
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
     VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
                                                  listener);
+    VFIOPCIDevice *vdev;
     hwaddr iova, end;
     Int128 llend, llsize;
     void *vaddr;
@@ -630,6 +653,18 @@ static void vfio_listener_region_add(MemoryListener *listener,
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
+    /* skip if the region is a BAR and the power state forbids DMA MAP */
+    vdev = vfio_section_is_vfio_pci(section, bcontainer);
+    if (vdev) {
+        VFIODevice *vbasedev = &vdev->vbasedev;
+        assert(vbasedev->ops->vfio_is_dma_map_allowed);
+        if (!vbasedev->ops->vfio_is_dma_map_allowed(vbasedev)) {
+            trace_vfio_listener_region_add_skip(section->mr->name);
+            return;
+        }
+    }
+
+
     /*
      * For RAM memory regions with a RamDiscardManager, we only want to map the
      * actually populated parts - and update the mapping whenever we're notified
@@ -804,28 +839,6 @@ typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;
 
-static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
-                                     VFIOContainerBase *bcontainer)
-{
-    VFIOPCIDevice *pcidev;
-    VFIODevice *vbasedev;
-    Object *owner;
-
-    owner = memory_region_owner(section->mr);
-
-    QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
-        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
-            continue;
-        }
-        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
-        if (OBJECT(pcidev) == owner) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range,
                                              hwaddr iova, hwaddr end,
                                              bool update_pci)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ab17a98ee5..314dddae4a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2653,6 +2653,26 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
     return ret;
 }
 
+/*
+ * BARs cannot be dma-mapped if the device is in D3hot state since
+ * linux commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block
+ * the access in D3hot power state")
+ */
+static bool vfio_pci_is_dma_map_allowed(VFIODevice *vbasedev)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    uint16_t pmcsr;
+    uint8_t state;
+
+    pmcsr = vfio_pci_read_config(&vdev->pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+    state = pmcsr & PCI_PM_CTRL_STATE_MASK;
+    if (state == 3) {
+        return false;
+    }
+    return true;
+}
+
+
 static VFIODeviceOps vfio_pci_ops = {
     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
@@ -2660,6 +2680,7 @@ static VFIODeviceOps vfio_pci_ops = {
     .vfio_get_object = vfio_pci_get_object,
     .vfio_save_config = vfio_pci_save_config,
     .vfio_load_config = vfio_pci_load_config,
+    .vfio_is_dma_map_allowed = vfio_pci_is_dma_map_allowed,
 };
 
 bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
@@ -3477,3 +3498,4 @@ static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index c5385e1a4f..a0d5868c2f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -121,6 +121,7 @@ vfio_legacy_dma_unmap_overflow_workaround(void) ""
 vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
 vfio_reset_handler(void) ""
+vfio_listener_region_add_skip(const char *name) "DMA MAP would fail on region %s due to incompatible power state, skip it"
 
 # platform.c
 vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
-- 
2.47.1



  parent reply	other threads:[~2025-02-19 18:00 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-19 17:58 [RFC 0/2] hw/vfio/pci: Prevent BARs from being dma mapped in d3hot state Eric Auger
2025-02-19 17:58 ` [RFC 1/2] hw/vfio: Introduce vfio_is_dma_map_allowed() callback Eric Auger
2025-02-19 17:59 ` Eric Auger [this message]
2025-02-19 18:58 ` [RFC 0/2] hw/vfio/pci: Prevent BARs from being dma mapped in d3hot state Alex Williamson
2025-02-19 21:19   ` Alex Williamson
2025-02-20 10:31     ` Eric Auger
2025-02-20 10:45       ` Eric Auger
2025-02-20 15:07         ` Alex Williamson
2025-02-20 15:48           ` Alex Williamson
2025-02-20  4:24   ` Duan, Zhenzhong
2025-02-20  5:05     ` Alex Williamson
2025-02-20  8:25       ` Duan, Zhenzhong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250219175941.135390-3-eric.auger@redhat.com \
    --to=eric.auger@redhat.com \
    --cc=alex.williamson@redhat.com \
    --cc=clg@redhat.com \
    --cc=eric.auger.pro@gmail.com \
    --cc=qemu-devel@nongnu.org \
    --cc=zhenzhong.duan@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).