qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: qemu-devel@nongnu.org
Cc: Peter Maydell <peter.maydell@linaro.org>,
	Alejandro Jimenez <alejandro.j.jimenez@oracle.com>,
	Marcel Apfelbaum <marcel.apfelbaum@gmail.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Richard Henderson <richard.henderson@linaro.org>,
	Eduardo Habkost <eduardo@habkost.net>
Subject: [PULL 60/75] amd_iommu: Use iova_tree records to determine large page size on UNMAP
Date: Sun, 5 Oct 2025 15:18:15 -0400	[thread overview]
Message-ID: <95bc772592eb5560e9c78d313b118788ea6a3826.1759691708.git.mst@redhat.com> (raw)
In-Reply-To: <cover.1759691708.git.mst@redhat.com>

From: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>

Keep a record of mapped IOVA ranges per address space, using the iova_tree
implementation. Besides enabling optimizations like avoiding unnecessary
notifications, a record of existing <IOVA, size> mappings makes it possible
to determine if a specific IOVA is mapped by the guest using a large page,
and adjust the size when notifying UNMAP events.

When unmapping a large page, the information in the guest PTE encoding the
page size is lost, since the guest clears the PTE before issuing the
invalidation command to the IOMMU. In such case, the size of the original
mapping can be retrieved from the iova_tree and used to issue the UNMAP
notification. Using the correct size is essential since the VFIO IOMMU
Type1v2 driver in the host kernel will reject unmap requests that do not
fully cover previous mappings.

Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Message-ID: <20250919213515.917111-12-alejandro.j.jimenez@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/amd_iommu.c | 95 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 6 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index caae65c4b3..4376e977f8 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -33,6 +33,7 @@
 #include "hw/i386/apic-msidef.h"
 #include "hw/qdev-properties.h"
 #include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"
 
 /* used AMD-Vi MMIO registers */
 const char *amdvi_mmio_low[] = {
@@ -71,6 +72,8 @@ struct AMDVIAddressSpace {
     IOMMUNotifierFlag notifier_flags;
     /* entry in list of Address spaces with registered notifiers */
     QLIST_ENTRY(AMDVIAddressSpace) next;
+    /* Record DMA translation ranges */
+    IOVATree *iova_tree;
 };
 
 /* AMDVI cache entry */
@@ -685,6 +688,75 @@ static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
     return 0;
 }
 
+/*
+ * Invoke notifiers registered for the address space. Update record of mapped
+ * ranges in IOVA Tree.
+ */
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event)
+{
+    IOMMUTLBEntry *entry = &event->entry;
+
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+
+    /*
+     * Search the IOVA Tree for an existing translation for the target, and skip
+     * the notification if the mapping is already recorded.
+     * When the guest uses large pages, comparing against the record makes it
+     * possible to determine the size of the original MAP and adjust the UNMAP
+     * request to match it. This avoids failed checks against the mappings kept
+     * by the VFIO kernel driver.
+     */
+    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (event->type == IOMMU_NOTIFIER_UNMAP) {
+        if (!mapped) {
+            /* No record exists of this mapping, nothing to do */
+            return;
+        }
+        /*
+         * Adjust the size based on the original record. This is essential to
+         * determine when large/contiguous pages are used, since the guest has
+         * already cleared the PTE (erasing the pagesize encoded on it) before
+         * issuing the invalidation command.
+         */
+        if (mapped->size != target.size) {
+            assert(mapped->size > target.size);
+            target.size = mapped->size;
+            /* Adjust event to invoke notifier with correct range */
+            entry->addr_mask = mapped->size;
+        }
+        iova_tree_remove(as->iova_tree, target);
+    } else { /* IOMMU_NOTIFIER_MAP */
+        if (mapped) {
+            /*
+             * If a mapping is present and matches the request, skip the
+             * notification.
+             */
+            if (!memcmp(mapped, &target, sizeof(DMAMap))) {
+                return;
+            } else {
+                /*
+                 * This should never happen unless a buggy guest OS omits or
+                 * sends incorrect invalidation(s). Report an error in the event
+                 * it does happen.
+                 */
+                error_report("Found conflicting translation. This could be due "
+                             "to an incorrect or missing invalidation command");
+            }
+        }
+        /* Record the new mapping */
+        iova_tree_insert(as->iova_tree, &target);
+    }
+
+    /* Invoke the notifiers registered for this address space */
+    memory_region_notify_iommu(&as->iommu, 0, *event);
+}
+
 /*
  * Walk the guest page table for an IOVA and range and signal the registered
  * notifiers to sync the shadow page tables in the host.
@@ -696,7 +768,7 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
 {
     IOMMUTLBEvent event;
 
-    hwaddr iova_next, page_mask, pagesize;
+    hwaddr page_mask, pagesize;
     hwaddr iova = addr;
     hwaddr end = iova + size - 1;
 
@@ -719,7 +791,6 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
         /* PTE has been validated for major errors and pagesize is set */
         assert(pagesize);
         page_mask = ~(pagesize - 1);
-        iova_next = (iova & page_mask) + pagesize;
 
         if (ret == -AMDVI_FR_PT_ENTRY_INV) {
             /*
@@ -752,15 +823,26 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
             event.type = IOMMU_NOTIFIER_MAP;
         }
 
-        /* Invoke the notifiers registered for this address space */
-        memory_region_notify_iommu(&as->iommu, 0, event);
+        /*
+         * The following call might need to adjust event.entry.size in cases
+         * where the guest unmapped a series of large pages.
+         */
+        amdvi_notify_iommu(as, &event);
+        /*
+         * In the special scenario where the guest is unmapping a large page,
+         * addr_mask has been adjusted before sending the notification. Update
+         * pagesize accordingly in order to correctly compute the next IOVA.
+         */
+        pagesize = event.entry.addr_mask + 1;
 
 next:
+        iova &= ~(pagesize - 1);
+
         /* Check for 64-bit overflow and terminate walk in such cases */
-        if (iova_next < iova) {
+        if ((iova + pagesize) < iova) {
             break;
         } else {
-            iova = iova_next;
+            iova += pagesize;
         }
     }
 }
@@ -1845,6 +1927,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
         iommu_as[devfn]->devfn = (uint8_t)devfn;
         iommu_as[devfn]->iommu_state = s;
         iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE;
+        iommu_as[devfn]->iova_tree = iova_tree_new();
 
         amdvi_dev_as = iommu_as[devfn];
 
-- 
MST



  parent reply	other threads:[~2025-10-05 19:19 UTC|newest]

Thread overview: 85+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-05 19:16 [PULL 00/75] virtio,pci,pc: features, fixes Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 01/75] net: bundle all offloads in a single struct Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 02/75] linux-headers: deal with counted_by annotation Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 03/75] linux-headers: Update to Linux v6.17-rc1 Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 04/75] virtio: introduce extended features type Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 05/75] virtio: serialize extended features state Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 06/75] virtio: add support for negotiating extended features Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 07/75] virtio-pci: implement support for " Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 08/75] vhost: add support for negotiating " Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 09/75] qmp: update virtio features map to support " Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 10/75] vhost-backend: implement extended features support Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 11/75] vhost-net: " Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 12/75] virtio-net: " Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 13/75] net: implement tunnel probing Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 14/75] net: implement UDP tunnel features offloading Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 15/75] Revert "hw/acpi/ghes: Make ghes_record_cper_errors() static" Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 16/75] acpi/ghes: Cleanup the code which gets ghes ged state Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 17/75] acpi/ghes: prepare to change the way HEST offsets are calculated Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 18/75] acpi/ghes: add a firmware file with HEST address Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 19/75] acpi/ghes: Use HEST table offsets when preparing GHES records Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 20/75] acpi/ghes: don't hard-code the number of sources for HEST table Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 21/75] acpi/ghes: add a notifier to notify when error data is ready Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 22/75] acpi/generic_event_device: Update GHES migration to cover hest addr Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 23/75] acpi/generic_event_device: add logic to detect if HEST addr is available Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 24/75] acpi/generic_event_device: add an APEI error device Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 25/75] tests/acpi: virt: allow acpi table changes at DSDT and HEST tables Michael S. Tsirkin
2025-10-05 19:16 ` [PULL 26/75] arm/virt: Wire up a GED error device for ACPI / GHES Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 27/75] qapi/acpi-hest: add an interface to do generic CPER error injection Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 28/75] acpi/generic_event_device.c: enable use_hest_addr for QEMU 10.x Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 29/75] tests/acpi: virt: update HEST and DSDT tables Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 30/75] docs: hest: add new "etc/acpi_table_hest_addr" and update workflow Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 31/75] scripts/ghes_inject: add a script to generate GHES error inject Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 32/75] hw/smbios: allow clearing the VM bit in SMBIOS table 0 Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 33/75] hw/i386/pc: Avoid overlap between CXL window and PCI 64bit BARs in QEMU Michael S. Tsirkin
2025-10-06 17:08   ` Michael Tokarev
2025-10-28 19:26     ` Michael Tokarev
2025-10-05 19:17 ` [PULL 34/75] pcie_sriov: Fix broken MMIO accesses from SR-IOV VFs Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 35/75] hw/virtio: rename vhost-user-device and make user creatable Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 36/75] smbios: cap DIMM size to 2Tb as workaround for broken Windows Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 37/75] pcie: Add a way to get the outstanding page request allocation (pri) from the config space Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 38/75] intel_iommu: Bypass barrier wait descriptor Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 39/75] intel_iommu: Declare PRI constants and structures Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 40/75] intel_iommu: Declare registers for PRI Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 41/75] intel_iommu: Add PRI operations support Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 42/75] x86: ich9: fix default value of 'No Reboot' bit in GCS Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 43/75] vhost: use virtio_config_get_guest_notifier() Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 44/75] virtio: unify virtio_notify_irqfd() and virtio_notify() Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 45/75] virtio: support irqfd in virtio_notify_config() Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 46/75] tests/libqos: extract qvirtqueue_set_avail_idx() Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 47/75] tests/virtio-scsi: add a virtio_error() IOThread test Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 48/75] pcie_sriov: make pcie_sriov_pf_exit() safe on non-SR-IOV devices Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 49/75] virtio: Add function name to error messages Michael S. Tsirkin
2025-10-05 20:13   ` Alessandro Ratti
2025-10-05 20:24     ` Michael S. Tsirkin
2025-10-08 10:01     ` Michael S. Tsirkin
2025-10-08 16:53       ` Alessandro Ratti
2025-10-05 20:19   ` [PULL v2 75/75] virtio: improve virtqueue mapping " Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 50/75] memory: Adjust event ranges to fit within notifier boundaries Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 51/75] amd_iommu: Document '-device amd-iommu' common options Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 52/75] amd_iommu: Reorder device and page table helpers Michael S. Tsirkin
2025-10-05 19:17 ` [PULL 53/75] amd_iommu: Helper to decode size of page invalidation command Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 54/75] amd_iommu: Add helper function to extract the DTE Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 55/75] amd_iommu: Return an error when unable to read PTE from guest memory Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 56/75] amd_iommu: Add helpers to walk AMD v1 Page Table format Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 57/75] amd_iommu: Add a page walker to sync shadow page tables on invalidation Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 58/75] amd_iommu: Add basic structure to support IOMMU notifier updates Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 59/75] amd_iommu: Sync shadow page tables on page invalidation Michael S. Tsirkin
2025-10-05 19:18 ` Michael S. Tsirkin [this message]
2025-10-05 19:18 ` [PULL 61/75] amd_iommu: Unmap all address spaces under the AMD IOMMU on reset Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 62/75] amd_iommu: Add replay callback Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 63/75] amd_iommu: Invalidate address translations on INVALIDATE_IOMMU_ALL Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 64/75] amd_iommu: Toggle memory regions based on address translation mode Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 65/75] amd_iommu: Set all address spaces to use passthrough mode on reset Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 66/75] amd_iommu: Add dma-remap property to AMD vIOMMU device Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 67/75] amd_iommu: Toggle address translation mode on devtab entry invalidation Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 68/75] amd_iommu: Do not assume passthrough translation when DTE[TV]=0 Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 69/75] amd_iommu: Refactor amdvi_page_walk() to use common code for page walk Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 70/75] intel-iommu: Move dma_translation to x86-iommu Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 71/75] amd_iommu: HATDis/HATS=11 support Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 72/75] vdpa-dev: add get_vhost() callback for vhost-vdpa device Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 73/75] intel_iommu: Enable Enhanced Set Root Table Pointer Support (ESRTPS) Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 74/75] intel_iommu: Simplify caching mode check with VFIO device Michael S. Tsirkin
2025-10-05 19:18 ` [PULL 75/75] pci: Fix wrong parameter passing to pci_device_get_iommu_bus_devfn() Michael S. Tsirkin
2025-10-05 20:20 ` [PULL 00/75] virtio,pci,pc: features, fixes Michael S. Tsirkin
2025-10-06 21:59 ` Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=95bc772592eb5560e9c78d313b118788ea6a3826.1759691708.git.mst@redhat.com \
    --to=mst@redhat.com \
    --cc=alejandro.j.jimenez@oracle.com \
    --cc=eduardo@habkost.net \
    --cc=marcel.apfelbaum@gmail.com \
    --cc=pbonzini@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).