qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: David Hildenbrand <david@redhat.com>
To: qemu-devel@nongnu.org
Cc: Eduardo Habkost <ehabkost@redhat.com>,
	"Michael S . Tsirkin" <mst@redhat.com>,
	Igor Kotrasinski <i.kotrasinsk@partner.samsung.com>,
	David Hildenbrand <david@redhat.com>,
	"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
	Peter Xu <peterx@redhat.com>,
	Alex Williamson <alex.williamson@redhat.com>,
	Murilo Opsfelder Araujo <muriloo@linux.ibm.com>,
	Igor Mammedov <imammedo@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Richard Henderson <rth@twiddle.net>
Subject: [PATCH v4 12/15] util: vfio-helpers: Implement ram_block_resized()
Date: Thu,  5 Mar 2020 15:29:42 +0100	[thread overview]
Message-ID: <20200305142945.216465-13-david@redhat.com> (raw)
In-Reply-To: <20200305142945.216465-1-david@redhat.com>

Let's implement ram_block_resized(), allowing resizeable mappings.

For resizeable mappings, we reserve $max_size IOVA address space, but only
map $size of it. When resizing, unmap the old part and remap the new
part. We'll need, e.g., a new ioctl to do this atomically (e.g., to resize
while the guest is running).

Right now, we only resize RAM blocks during incoming migration (when
syncing RAM block sizes during the precopy phase) or after guest
resets when building ACPI tables. Any future user of resizeable RAM has to
be aware that vfio has to be treated with care.

Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 util/trace-events   |  7 ++--
 util/vfio-helpers.c | 95 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/util/trace-events b/util/trace-events
index 83b6639018..a4d39eca5e 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -74,10 +74,11 @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
 
 # vfio-helpers.c
 qemu_vfio_dma_reset_temporary(void *s) "s %p"
-qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
-qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size, size_t max_size) "s %p host %p size 0x%zx max_size 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size, size_t max_size) "s %p host %p size 0x%zx max_size 0x%zx"
+qemu_vfio_ram_block_resized(void *s, void *p, size_t old_size, size_t new_size) "s %p host %p old_size 0x%zx new_size 0x%zx"
 qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
-qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64
+qemu_vfio_new_mapping(void *s, void *host, size_t size, size_t max_size, int index, uint64_t iova) "s %p host %p size %zu max_size %zu index %d iova 0x%"PRIx64
 qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64
 qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p"
 qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index f0c77f0d69..789faf38bd 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -36,6 +36,7 @@ typedef struct {
     /* Page aligned addr. */
     void *host;
     size_t size;
+    size_t max_size;
     uint64_t iova;
 } IOVAMapping;
 
@@ -372,14 +373,20 @@ fail_container:
     return ret;
 }
 
+static int qemu_vfio_dma_map_resizeable(QEMUVFIOState *s, void *host,
+                                        size_t size, size_t max_size,
+                                        bool temporary, uint64_t *iova);
+static void qemu_vfio_dma_map_resize(QEMUVFIOState *s, void *host,
+                                     size_t old_size, size_t new_size);
+
 static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
                                       size_t size, size_t max_size)
 {
     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
     int ret;
 
-    trace_qemu_vfio_ram_block_added(s, host, max_size);
-    ret = qemu_vfio_dma_map(s, host, max_size, false, NULL);
+    trace_qemu_vfio_ram_block_added(s, host, size, max_size);
+    ret = qemu_vfio_dma_map_resizeable(s, host, size, max_size, false, NULL);
     if (ret) {
         error_report("qemu_vfio_dma_map(%p, %zu) failed: %s", host, max_size,
                      strerror(-ret));
@@ -391,16 +398,28 @@ static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
 {
     QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
     if (host) {
-        trace_qemu_vfio_ram_block_removed(s, host, max_size);
+        trace_qemu_vfio_ram_block_removed(s, host, size, max_size);
         qemu_vfio_dma_unmap(s, host);
     }
 }
 
+static void qemu_vfio_ram_block_resized(RAMBlockNotifier *n, void *host,
+                                        size_t old_size, size_t new_size)
+{
+    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+
+    if (host) {
+        trace_qemu_vfio_ram_block_resized(s, host, old_size, new_size);
+        qemu_vfio_dma_map_resize(s, host, old_size, new_size);
+    }
+}
+
 static void qemu_vfio_open_common(QEMUVFIOState *s)
 {
     qemu_mutex_init(&s->lock);
     s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
     s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+    s->ram_notifier.ram_block_resized = qemu_vfio_ram_block_resized;
     s->low_water_mark = QEMU_VFIO_IOVA_MIN;
     s->high_water_mark = QEMU_VFIO_IOVA_MAX;
     ram_block_notifier_add(&s->ram_notifier);
@@ -495,16 +514,23 @@ static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
  */
 static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                           void *host, size_t size,
-                                          int index, uint64_t iova)
+                                          size_t max_size, int index,
+                                          uint64_t iova)
 {
+    const IOVAMapping m = {
+        .host = host,
+        .size = size,
+        .max_size = max_size,
+        .iova = iova,
+    };
     int shift;
-    IOVAMapping m = {.host = host, .size = size, .iova = iova};
     IOVAMapping *insert;
 
     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+    assert(size <= max_size);
     assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
     assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
-    trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+    trace_qemu_vfio_new_mapping(s, host, size, max_size, index, iova);
 
     assert(index >= 0);
     s->nr_mappings++;
@@ -597,9 +623,14 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
  * the result in @iova if not NULL. The caller need to make sure the area is
  * aligned to page size, and mustn't overlap with existing mapping areas (split
  * mapping status within this area is not allowed).
+ *
+ * If size < max_size, a region of max_size in the IOVA address space is
+ * reserved, such that the mapping can later be resized. Resizeable mappings
+ * are only allowed for !temporary mappings.
  */
-int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
-                      bool temporary, uint64_t *iova)
+static int qemu_vfio_dma_map_resizeable(QEMUVFIOState *s, void *host,
+                                        size_t size, size_t max_size,
+                                        bool temporary, uint64_t *iova)
 {
     int ret = 0;
     int index;
@@ -608,19 +639,24 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
 
     assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
     assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+    assert(QEMU_IS_ALIGNED(max_size, qemu_real_host_page_size));
+    assert(size == max_size || !temporary);
+    assert(size <= max_size);
+
     trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
     qemu_mutex_lock(&s->lock);
     mapping = qemu_vfio_find_mapping(s, host, &index);
     if (mapping) {
         iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
     } else {
-        if (s->high_water_mark - s->low_water_mark + 1 < size) {
+        if (s->high_water_mark - s->low_water_mark + 1 < max_size) {
             ret = -ENOMEM;
             goto out;
         }
         if (!temporary) {
             iova0 = s->low_water_mark;
-            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
+            mapping = qemu_vfio_add_mapping(s, host, size, max_size, index + 1,
+                                            iova0);
             if (!mapping) {
                 ret = -ENOMEM;
                 goto out;
@@ -631,7 +667,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                 qemu_vfio_remove_mapping(s, mapping);
                 goto out;
             }
-            s->low_water_mark += size;
+            s->low_water_mark += max_size;
             qemu_vfio_dump_mappings(s);
         } else {
             iova0 = s->high_water_mark - size;
@@ -650,6 +686,12 @@ out:
     return ret;
 }
 
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+                      bool temporary, uint64_t *iova)
+{
+    return qemu_vfio_dma_map_resizeable(s, host, size, size, temporary, iova);
+}
+
 /* Reset the high watermark and free all "temporary" mappings. */
 int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
 {
@@ -694,6 +736,37 @@ out:
     qemu_mutex_unlock(&s->lock);
 }
 
+static void qemu_vfio_dma_map_resize(QEMUVFIOState *s, void *host,
+                                     size_t old_size, size_t new_size)
+{
+    IOVAMapping *m;
+    int index = 0;
+
+    qemu_mutex_lock(&s->lock);
+    m = qemu_vfio_find_mapping(s, host, &index);
+    if (!m) {
+        goto out; /* no mapping for @host, but s->lock must be released */
+    }
+    assert(m->size == old_size);
+    assert(new_size <= m->max_size);
+
+    /*
+     * For now, we must unmap the whole mapped range first and remap with
+     * the new size. The reason is that VFIO_IOMMU_UNMAP_DMA might fail
+     * when partially unmapping previous mappings. Although we could add
+     * new mappings to extend the old range, we won't be able to always
+     * shrink. The side effect is that it's never safe to resize during VM
+     * execution and we'll e.g., need a new IOCTL to make this work.
+     */
+    qemu_vfio_undo_mapping(s, m->iova, m->size);
+    qemu_vfio_do_mapping(s, host, new_size, m->iova);
+    /* NOTE(review): the remap's return value is ignored; confirm intended. */
+    m->size = new_size;
+    assert(qemu_vfio_verify_mappings(s));
+out:
+    qemu_mutex_unlock(&s->lock);
+}
+
 static void qemu_vfio_reset(QEMUVFIOState *s)
 {
     ioctl(s->device, VFIO_DEVICE_RESET);
-- 
2.24.1



  parent reply	other threads:[~2020-03-05 14:37 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-05 14:29 [PATCH v4 00/15] Ram blocks with resizeable anonymous allocations under POSIX David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 01/15] util: vfio-helpers: Fix qemu_vfio_close() David Hildenbrand
2020-04-17 10:22   ` Philippe Mathieu-Daudé
2020-03-05 14:29 ` [PATCH v4 02/15] util: vfio-helpers: Remove Error parameter from qemu_vfio_undo_mapping() David Hildenbrand
2020-03-25 14:32   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 03/15] util: vfio-helpers: Factor out removal " David Hildenbrand
2020-03-25 14:45   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 04/15] exec: Factor out setting ram settings (madvise ...) into qemu_ram_apply_settings() David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 05/15] exec: Reuse qemu_ram_apply_settings() in qemu_ram_remap() David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 06/15] exec: Drop "shared" parameter from ram_block_add() David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 07/15] util/mmap-alloc: Factor out calculation of the pagesize for the guard page David Hildenbrand
2020-03-25 15:03   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 08/15] util/mmap-alloc: Factor out reserving of a memory region to mmap_reserve() David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 09/15] util/mmap-alloc: Factor out activating of memory to mmap_activate() David Hildenbrand
2020-03-05 14:29 ` [PATCH v4 10/15] util/mmap-alloc: Prepare for resizeable mmaps David Hildenbrand
2020-03-25 15:09   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 11/15] util/mmap-alloc: Implement " David Hildenbrand
2020-03-25 15:14   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` David Hildenbrand [this message]
2020-03-25 15:17   ` [PATCH v4 12/15] util: vfio-helpers: Implement ram_block_resized() Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 13/15] util: oslib: Resizeable anonymous allocations under POSIX David Hildenbrand
2020-03-25 15:20   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 14/15] numa: Introduce ram_block_notifiers_support_resize() David Hildenbrand
2020-03-25 15:24   ` Murilo Opsfelder Araújo
2020-03-05 14:29 ` [PATCH v4 15/15] exec: Ram blocks with resizeable anonymous allocations under POSIX David Hildenbrand
2020-03-25 15:34   ` Murilo Opsfelder Araújo
2020-03-27 11:24     ` David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200305142945.216465-13-david@redhat.com \
    --to=david@redhat.com \
    --cc=alex.williamson@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=ehabkost@redhat.com \
    --cc=i.kotrasinsk@partner.samsung.com \
    --cc=imammedo@redhat.com \
    --cc=mst@redhat.com \
    --cc=muriloo@linux.ibm.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=rth@twiddle.net \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).