From: Avihai Horon <avihaih@nvidia.com>
To: <qemu-devel@nongnu.org>
Cc: "Alex Williamson" <alex.williamson@redhat.com>,
	"Cédric Le Goater" <clg@redhat.com>,
	"Juan Quintela" <quintela@redhat.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	"Peter Xu" <peterx@redhat.com>,
	"Jason Wang" <jasowang@redhat.com>,
	"Marcel Apfelbaum" <marcel.apfelbaum@gmail.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Richard Henderson" <richard.henderson@linaro.org>,
	"Eduardo Habkost" <eduardo@habkost.net>,
	"David Hildenbrand" <david@redhat.com>,
	"Philippe Mathieu-Daudé" <philmd@linaro.org>,
	"Yishai Hadas" <yishaih@nvidia.com>,
	"Jason Gunthorpe" <jgg@nvidia.com>,
	"Maor Gottlieb" <maorg@nvidia.com>,
	"Avihai Horon" <avihaih@nvidia.com>,
	"Kirti Wankhede" <kwankhede@nvidia.com>,
	"Tarun Gupta" <targupta@nvidia.com>,
	"Joao Martins" <joao.m.martins@oracle.com>
Subject: [PATCH v2 17/20] vfio/common: Support device dirty page tracking with vIOMMU
Date: Wed, 22 Feb 2023 19:49:12 +0200
Message-ID: <20230222174915.5647-18-avihaih@nvidia.com>
In-Reply-To: <20230222174915.5647-1-avihaih@nvidia.com>

Currently, device dirty page tracking with vIOMMU is not supported: RAM
pages are perpetually marked dirty in this case.

When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as
the vIOMMU maps/unmaps them. These IOVA ranges can potentially be mapped
anywhere in the vIOMMU IOVA space.

Due to this dynamic nature of vIOMMU mapping/unmapping, tracking only
the currently mapped IOVA ranges, as done in the non-vIOMMU case,
doesn't work: the ranges to track are set once, when tracking is
started, so IOVA ranges that the vIOMMU maps later would not be covered.

Instead, to support device dirty tracking when vIOMMU is enabled, track
the entire vIOMMU IOVA space. If that fails (the IOVA space can be
rather big and we might hit HW limitations), try tracking a smaller
range while marking the untracked ranges dirty.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/common.c              | 196 +++++++++++++++++++++++++++++++---
 2 files changed, 181 insertions(+), 17 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 1f21e1fa43..1dc00cabcd 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -95,6 +95,8 @@ typedef struct VFIOContainer {
     unsigned int dma_max_mappings;
     IOVATree *mappings;
     QemuMutex mappings_mutex;
+    /* Device dirty tracking covers the range [0, giommu_tracked_range) */
+    hwaddr giommu_tracked_range;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4a7fff6eeb..1024788bcc 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -45,6 +45,8 @@
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
 #include "qemu/iova-tree.h"
+#include "hw/boards.h"
+#include "hw/mem/memory-device.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -430,6 +432,38 @@ void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }
 
+static uint64_t vfio_get_ram_size(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint64_t plugged_size;
+
+    plugged_size = get_plugged_memory_size();
+    if (plugged_size == (uint64_t)-1) {
+        plugged_size = 0;
+    }
+
+    return ms->ram_size + plugged_size;
+}
+
+static int vfio_iommu_get_max_iova(VFIOContainer *container, hwaddr *max_iova)
+{
+    VFIOGuestIOMMU *giommu;
+    int ret;
+
+    giommu = QLIST_FIRST(&container->giommu_list);
+    if (!giommu) {
+        return -ENOENT;
+    }
+
+    ret = memory_region_iommu_get_attr(giommu->iommu_mr, IOMMU_ATTR_MAX_IOVA,
+                                       max_iova);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
 static bool vfio_have_giommu(VFIOContainer *container)
 {
     return !QLIST_EMPTY(&container->giommu_list);
@@ -1510,7 +1544,8 @@ static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
 }
 
 static struct vfio_device_feature *
-vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
+vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
+                                             bool giommu)
 {
     struct vfio_device_feature *feature;
     size_t feature_size;
@@ -1529,6 +1564,16 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
     control = (struct vfio_device_feature_dma_logging_control *)feature->data;
     control->page_size = qemu_real_host_page_size();
 
+    if (giommu) {
+        ranges = g_malloc0(sizeof(*ranges));
+        ranges->iova = 0;
+        ranges->length = container->giommu_tracked_range;
+        control->num_ranges = 1;
+        control->ranges = (uint64_t)ranges;
+
+        return feature;
+    }
+
     QEMU_LOCK_GUARD(&container->mappings_mutex);
 
     /*
@@ -1578,12 +1623,12 @@ static void vfio_device_feature_dma_logging_start_destroy(
     g_free(feature);
 }
 
-static int vfio_devices_dma_logging_start(VFIOContainer *container)
+static int vfio_devices_dma_logging_start(VFIOContainer *container, bool giommu)
 {
     struct vfio_device_feature *feature;
     int ret;
 
-    feature = vfio_device_feature_dma_logging_start_create(container);
+    feature = vfio_device_feature_dma_logging_start_create(container, giommu);
     if (!feature) {
         return -errno;
     }
@@ -1598,18 +1643,128 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
     return ret;
 }
 
+typedef struct {
+    hwaddr *ranges;
+    unsigned int ranges_num;
+} VFIOGIOMMUDeviceDTRanges;
+
+/*
+ * This value is used in the second attempt to start device dirty tracking
+ * with vIOMMU, or if the vIOMMU fails to report its max IOVA.
+ * It is a compromise: small enough that devices with HW limitations can
+ * still start dirty tracking, yet large enough to cover a fair amount of
+ * the IOVA space.
+ *
+ * This arbitrary value was chosen because it is the minimum Intel IOMMU
+ * max IOVA, and mlx5 devices support tracking a range of this size.
+ */
+#define VFIO_IOMMU_DEFAULT_MAX_IOVA ((1ULL << 39) - 1)
+
+#define VFIO_IOMMU_RANGES_NUM 3
+static VFIOGIOMMUDeviceDTRanges *
+vfio_iommu_device_dirty_tracking_ranges_create(VFIOContainer *container)
+{
+    hwaddr iommu_max_iova = VFIO_IOMMU_DEFAULT_MAX_IOVA;
+    hwaddr retry_iova;
+    hwaddr ram_size = vfio_get_ram_size();
+    VFIOGIOMMUDeviceDTRanges *dt_ranges;
+    int ret;
+
+    dt_ranges = g_try_new0(VFIOGIOMMUDeviceDTRanges, 1);
+    if (!dt_ranges) {
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    dt_ranges->ranges_num = VFIO_IOMMU_RANGES_NUM;
+
+    dt_ranges->ranges = g_try_new0(hwaddr, dt_ranges->ranges_num);
+    if (!dt_ranges->ranges) {
+        g_free(dt_ranges);
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    /*
+     * With vIOMMU we try to track the entire IOVA space. As the IOVA space can
+     * be rather big, devices might not be able to track it due to HW
+     * limitations. In that case:
+     * (1) Retry tracking a smaller part of the IOVA space.
+     * (2) Retry tracking a range in the size of the physical memory.
+     */
+    ret = vfio_iommu_get_max_iova(container, &iommu_max_iova);
+    if (!ret) {
+        /* Check 2^64 wrap around */
+        if (!REAL_HOST_PAGE_ALIGN(iommu_max_iova)) {
+            iommu_max_iova -= qemu_real_host_page_size();
+        }
+    }
+
+    retry_iova = MIN(iommu_max_iova / 2, VFIO_IOMMU_DEFAULT_MAX_IOVA);
+
+    dt_ranges->ranges[0] = REAL_HOST_PAGE_ALIGN(iommu_max_iova);
+    dt_ranges->ranges[1] = REAL_HOST_PAGE_ALIGN(retry_iova);
+    dt_ranges->ranges[2] = REAL_HOST_PAGE_ALIGN(MIN(ram_size, retry_iova / 2));
+
+    return dt_ranges;
+}
+
+static void vfio_iommu_device_dirty_tracking_ranges_destroy(
+    VFIOGIOMMUDeviceDTRanges *dt_ranges)
+{
+    g_free(dt_ranges->ranges);
+    g_free(dt_ranges);
+}
+
+static int vfio_devices_start_dirty_page_tracking(VFIOContainer *container)
+{
+    VFIOGIOMMUDeviceDTRanges *dt_ranges;
+    int ret;
+    int i;
+
+    if (!vfio_have_giommu(container)) {
+        return vfio_devices_dma_logging_start(container, false);
+    }
+
+    dt_ranges = vfio_iommu_device_dirty_tracking_ranges_create(container);
+    if (!dt_ranges) {
+        return -errno;
+    }
+
+    for (i = 0; i < dt_ranges->ranges_num; i++) {
+        container->giommu_tracked_range = dt_ranges->ranges[i];
+        ret = vfio_devices_dma_logging_start(container, true);
+        if (!ret) {
+            break;
+        }
+
+        if (i < dt_ranges->ranges_num - 1) {
+            warn_report("Failed to start device dirty tracking with vIOMMU "
+                        "for range of size 0x%" HWADDR_PRIx
+                        ", err: %d. Retrying with range "
+                        "of size 0x%" HWADDR_PRIx,
+                        dt_ranges->ranges[i], ret, dt_ranges->ranges[i + 1]);
+        } else {
+            error_report("Failed to start device dirty tracking with vIOMMU "
+                         "for range of size 0x%" HWADDR_PRIx ", err: %d",
+                         dt_ranges->ranges[i], ret);
+        }
+    }
+
+    vfio_iommu_device_dirty_tracking_ranges_destroy(dt_ranges);
+
+    return ret;
+}
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;
 
     if (vfio_devices_all_device_dirty_tracking(container)) {
-        if (vfio_have_giommu(container)) {
-            /* Device dirty page tracking currently doesn't support vIOMMU */
-            return;
-        }
-
-        ret = vfio_devices_dma_logging_start(container);
+        ret = vfio_devices_start_dirty_page_tracking(container);
     } else {
         ret = vfio_set_dirty_page_tracking(container, true);
     }
@@ -1627,11 +1782,6 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
     int ret;
 
     if (vfio_devices_all_device_dirty_tracking(container)) {
-        if (vfio_have_giommu(container)) {
-            /* Device dirty page tracking currently doesn't support vIOMMU */
-            return;
-        }
-
         ret = vfio_devices_dma_logging_stop(container);
     } else {
         ret = vfio_set_dirty_page_tracking(container, false);
@@ -1670,6 +1820,17 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
     return 0;
 }
 
+static bool vfio_iommu_range_is_device_tracked(VFIOContainer *container,
+                                               hwaddr iova, hwaddr size)
+{
+    /* Check for 2^64 wrap around */
+    if (!(iova + size)) {
+        return false;
+    }
+
+    return iova + size <= container->giommu_tracked_range;
+}
+
 static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
                                            VFIOBitmap *vbmap, hwaddr iova,
                                            hwaddr size)
@@ -1679,10 +1840,11 @@ static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
     int ret;
 
     if (vfio_have_giommu(container)) {
-        /* Device dirty page tracking currently doesn't support vIOMMU */
-        bitmap_set(vbmap->bitmap, 0, vbmap->pages);
+        if (!vfio_iommu_range_is_device_tracked(container, iova, size)) {
+            bitmap_set(vbmap->bitmap, 0, vbmap->pages);
 
-        return 0;
+            return 0;
+        }
     }
 
     QLIST_FOREACH(group, &container->group_list, container_next) {
-- 
2.26.3
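
For illustration (not part of the patch), a standalone sketch of the
candidate tracking ranges the retry ladder above produces, assuming a 4K
host page size, a 39-bit Intel vIOMMU (max IOVA 2^39 - 1) and 16 GiB of
guest RAM:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE        4096ULL
    #define PAGE_ALIGN(x)    (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
    #define MIN(a, b)        ((a) < (b) ? (a) : (b))
    #define DEFAULT_MAX_IOVA ((1ULL << 39) - 1) /* VFIO_IOMMU_DEFAULT_MAX_IOVA */

    int main(void)
    {
        uint64_t max_iova = (1ULL << 39) - 1;   /* from IOMMU_ATTR_MAX_IOVA */
        uint64_t ram_size = 16ULL << 30;        /* 16 GiB of guest RAM */
        uint64_t retry_iova = MIN(max_iova / 2, DEFAULT_MAX_IOVA);
        uint64_t ranges[3];

        ranges[0] = PAGE_ALIGN(max_iova);                      /* 0x8000000000 */
        ranges[1] = PAGE_ALIGN(retry_iova);                    /* 0x4000000000 */
        ranges[2] = PAGE_ALIGN(MIN(ram_size, retry_iova / 2)); /* 0x400000000 */

        printf("try 0x%llx, then 0x%llx, then 0x%llx\n",
               (unsigned long long)ranges[0],
               (unsigned long long)ranges[1],
               (unsigned long long)ranges[2]);
        return 0;
    }

That is, tracking is first attempted over the full 512 GiB vIOMMU IOVA
space, then over half of it, and finally over a RAM-sized window;
whichever attempt succeeds defines giommu_tracked_range, and IOVAs above
it are reported dirty unconditionally.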


