qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
To: qemu-devel@nongnu.org
Cc: mst@redhat.com, clement.mathieu--drif@eviden.com,
	pbonzini@redhat.com, richard.henderson@linaro.org,
	eduardo@habkost.net, peterx@redhat.com, david@redhat.com,
	philmd@linaro.org, marcel.apfelbaum@gmail.com,
	alex.williamson@redhat.com, imammedo@redhat.com,
	anisinha@redhat.com, vasant.hegde@amd.com,
	suravee.suthikulpanit@amd.com, santosh.shukla@amd.com,
	sarunkod@amd.com, Wei.Huang2@amd.com, Ankit.Soni@amd.com,
	ethan.milon@eviden.com, joao.m.martins@oracle.com,
	boris.ostrovsky@oracle.com, alejandro.j.jimenez@oracle.com
Subject: [PATCH v3 11/22] amd_iommu: Use iova_tree records to determine large page size on UNMAP
Date: Fri, 19 Sep 2025 21:35:04 +0000	[thread overview]
Message-ID: <20250919213515.917111-12-alejandro.j.jimenez@oracle.com> (raw)
In-Reply-To: <20250919213515.917111-1-alejandro.j.jimenez@oracle.com>

Keep a record of mapped IOVA ranges per address space, using the iova_tree
implementation. Besides enabling optimizations like avoiding unnecessary
notifications, a record of existing <IOVA, size> mappings makes it possible
to determine if a specific IOVA is mapped by the guest using a large page,
and adjust the size when notifying UNMAP events.

When unmapping a large page, the information in the guest PTE encoding the
page size is lost, since the guest clears the PTE before issuing the
invalidation command to the IOMMU. In such case, the size of the original
mapping can be retrieved from the iova_tree and used to issue the UNMAP
notification. Using the correct size is essential since the VFIO IOMMU
Type1v2 driver in the host kernel will reject unmap requests that do not
fully cover previous mappings.

Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
---
 hw/i386/amd_iommu.c | 95 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 6 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index caae65c4b3565..4376e977f8886 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -33,6 +33,7 @@
 #include "hw/i386/apic-msidef.h"
 #include "hw/qdev-properties.h"
 #include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"
 
 /* used AMD-Vi MMIO registers */
 const char *amdvi_mmio_low[] = {
@@ -71,6 +72,8 @@ struct AMDVIAddressSpace {
     IOMMUNotifierFlag notifier_flags;
     /* entry in list of Address spaces with registered notifiers */
     QLIST_ENTRY(AMDVIAddressSpace) next;
+    /* Record DMA translation ranges */
+    IOVATree *iova_tree;
 };
 
 /* AMDVI cache entry */
@@ -685,6 +688,75 @@ static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
     return 0;
 }
 
+/*
+ * Invoke notifiers registered for the address space. Update record of mapped
+ * ranges in IOVA Tree.
+ */
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event)
+{
+    IOMMUTLBEntry *entry = &event->entry;
+
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+
+    /*
+     * Search the IOVA Tree for an existing translation for the target, and skip
+     * the notification if the mapping is already recorded.
+     * When the guest uses large pages, comparing against the record makes it
+     * possible to determine the size of the original MAP and adjust the UNMAP
+     * request to match it. This avoids failed checks against the mappings kept
+     * by the VFIO kernel driver.
+     */
+    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (event->type == IOMMU_NOTIFIER_UNMAP) {
+        if (!mapped) {
+            /* No record exists of this mapping, nothing to do */
+            return;
+        }
+        /*
+         * Adjust the size based on the original record. This is essential to
+         * determine when large/contiguous pages are used, since the guest has
+         * already cleared the PTE (erasing the pagesize encoded on it) before
+         * issuing the invalidation command.
+         */
+        if (mapped->size != target.size) {
+            assert(mapped->size > target.size);
+            target.size = mapped->size;
+            /* Adjust event to invoke notifier with correct range */
+            entry->addr_mask = mapped->size;
+        }
+        iova_tree_remove(as->iova_tree, target);
+    } else { /* IOMMU_NOTIFIER_MAP */
+        if (mapped) {
+            /*
+             * If a mapping is present and matches the request, skip the
+             * notification.
+             */
+            if (!memcmp(mapped, &target, sizeof(DMAMap))) {
+                return;
+            } else {
+                /*
+                 * This should never happen unless a buggy guest OS omits or
+                 * sends incorrect invalidation(s). Report an error in the event
+                 * it does happen.
+                 */
+                error_report("Found conflicting translation. This could be due "
+                             "to an incorrect or missing invalidation command");
+            }
+        }
+        /* Record the new mapping */
+        iova_tree_insert(as->iova_tree, &target);
+    }
+
+    /* Invoke the notifiers registered for this address space */
+    memory_region_notify_iommu(&as->iommu, 0, *event);
+}
+
 /*
  * Walk the guest page table for an IOVA and range and signal the registered
  * notifiers to sync the shadow page tables in the host.
@@ -696,7 +768,7 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
 {
     IOMMUTLBEvent event;
 
-    hwaddr iova_next, page_mask, pagesize;
+    hwaddr page_mask, pagesize;
     hwaddr iova = addr;
     hwaddr end = iova + size - 1;
 
@@ -719,7 +791,6 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
         /* PTE has been validated for major errors and pagesize is set */
         assert(pagesize);
         page_mask = ~(pagesize - 1);
-        iova_next = (iova & page_mask) + pagesize;
 
         if (ret == -AMDVI_FR_PT_ENTRY_INV) {
             /*
@@ -752,15 +823,26 @@ static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
             event.type = IOMMU_NOTIFIER_MAP;
         }
 
-        /* Invoke the notifiers registered for this address space */
-        memory_region_notify_iommu(&as->iommu, 0, event);
+        /*
+         * The following call might need to adjust event.entry.size in cases
+         * where the guest unmapped a series of large pages.
+         */
+        amdvi_notify_iommu(as, &event);
+        /*
+         * In the special scenario where the guest is unmapping a large page,
+         * addr_mask has been adjusted before sending the notification. Update
+         * pagesize accordingly in order to correctly compute the next IOVA.
+         */
+        pagesize = event.entry.addr_mask + 1;
 
 next:
+        iova &= ~(pagesize - 1);
+
         /* Check for 64-bit overflow and terminate walk in such cases */
-        if (iova_next < iova) {
+        if ((iova + pagesize) < iova) {
             break;
         } else {
-            iova = iova_next;
+            iova += pagesize;
         }
     }
 }
@@ -1845,6 +1927,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
         iommu_as[devfn]->devfn = (uint8_t)devfn;
         iommu_as[devfn]->iommu_state = s;
         iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE;
+        iommu_as[devfn]->iova_tree = iova_tree_new();
 
         amdvi_dev_as = iommu_as[devfn];
 
-- 
2.43.5



  parent reply	other threads:[~2025-09-19 21:37 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-19 21:34 [PATCH v3 00/22] AMD vIOMMU: DMA remapping support for VFIO devices Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 01/22] memory: Adjust event ranges to fit within notifier boundaries Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 02/22] amd_iommu: Document '-device amd-iommu' common options Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 03/22] amd_iommu: Reorder device and page table helpers Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 04/22] amd_iommu: Helper to decode size of page invalidation command Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 05/22] amd_iommu: Add helper function to extract the DTE Alejandro Jimenez
2025-09-19 21:34 ` [PATCH v3 06/22] amd_iommu: Return an error when unable to read PTE from guest memory Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 07/22] amd_iommu: Add helpers to walk AMD v1 Page Table format Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 08/22] amd_iommu: Add a page walker to sync shadow page tables on invalidation Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 09/22] amd_iommu: Add basic structure to support IOMMU notifier updates Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 10/22] amd_iommu: Sync shadow page tables on page invalidation Alejandro Jimenez
2025-09-19 21:35 ` Alejandro Jimenez [this message]
2025-09-19 21:35 ` [PATCH v3 12/22] amd_iommu: Unmap all address spaces under the AMD IOMMU on reset Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 13/22] amd_iommu: Add replay callback Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 14/22] amd_iommu: Invalidate address translations on INVALIDATE_IOMMU_ALL Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 15/22] amd_iommu: Toggle memory regions based on address translation mode Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 16/22] amd_iommu: Set all address spaces to use passthrough mode on reset Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 17/22] amd_iommu: Add dma-remap property to AMD vIOMMU device Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 18/22] amd_iommu: Toggle address translation mode on devtab entry invalidation Alejandro Jimenez
2025-10-06  6:08   ` Sairaj Kodilkar
2025-10-06  6:15     ` Michael S. Tsirkin
2025-10-06  6:25       ` Sairaj Kodilkar
2025-10-06 16:03         ` Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 19/22] amd_iommu: Do not assume passthrough translation when DTE[TV]=0 Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 20/22] amd_iommu: Refactor amdvi_page_walk() to use common code for page walk Alejandro Jimenez
2025-09-19 21:35 ` [PATCH v3 21/22] i386/intel-iommu: Move dma_translation to x86-iommu Alejandro Jimenez
2025-09-22  5:33   ` CLEMENT MATHIEU--DRIF
2025-09-19 21:35 ` [PATCH v3 22/22] amd_iommu: HATDis/HATS=11 support Alejandro Jimenez
2025-10-06 16:07 ` [PATCH v3 00/22] AMD vIOMMU: DMA remapping support for VFIO devices Cédric Le Goater
2025-10-06 18:44   ` Alejandro Jimenez
2025-10-07  5:45     ` Cédric Le Goater
2025-10-07  8:17       ` Vasant Hegde
2025-10-07 19:04       ` Joao Martins
2025-10-07 20:41         ` Cédric Le Goater

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250919213515.917111-12-alejandro.j.jimenez@oracle.com \
    --to=alejandro.j.jimenez@oracle.com \
    --cc=Ankit.Soni@amd.com \
    --cc=Wei.Huang2@amd.com \
    --cc=alex.williamson@redhat.com \
    --cc=anisinha@redhat.com \
    --cc=boris.ostrovsky@oracle.com \
    --cc=clement.mathieu--drif@eviden.com \
    --cc=david@redhat.com \
    --cc=eduardo@habkost.net \
    --cc=ethan.milon@eviden.com \
    --cc=imammedo@redhat.com \
    --cc=joao.m.martins@oracle.com \
    --cc=marcel.apfelbaum@gmail.com \
    --cc=mst@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=philmd@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    --cc=santosh.shukla@amd.com \
    --cc=sarunkod@amd.com \
    --cc=suravee.suthikulpanit@amd.com \
    --cc=vasant.hegde@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).