* [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
2024-02-14 11:50 [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
@ 2024-02-14 11:50 ` Si-Wei Liu
2024-03-06 18:42 ` Eugenio Perez Martin
2024-03-12 18:35 ` Michael S. Tsirkin
2024-02-14 18:42 ` [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
2024-03-06 17:27 ` Eugenio Perez Martin
2 siblings, 2 replies; 8+ messages in thread
From: Si-Wei Liu @ 2024-02-14 11:50 UTC (permalink / raw)
To: qemu-devel; +Cc: mst, jasowang, joao.m.martins, si-wei.liu
On setups with one or more virtio-net devices with vhost on,
the cost of each dirty tracking iteration increases with the
number of queues set up, e.g. on idle guests migration the
following is observed with virtio-net with vhost=on:
48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
With high memory dirtying rates the symptom is lack of convergence
as soon as there is a vhost device with a sufficiently high number
of queues, or a sufficient number of vhost devices.
On every migration iteration (every 100msecs) the *shared log* is
redundantly queried once for each queue configured with vhost in
the guest. For the virtqueue data this is necessary, but not for
the memory sections, which are the same across devices. So
essentially we end up scanning the dirty log too often.
To fix that, select a vhost device responsible for scanning the
log with regards to memory sections dirty tracking. It is selected
when we enable the logger (during migration) and cleared when we
disable the logger. If the vhost logger device goes away for some
reason, the logger will be re-selected from the rest of vhost
devices.
Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
---
hw/virtio/vhost.c | 75 +++++++++++++++++++++++++++++++++++++++++++----
include/hw/virtio/vhost.h | 1 +
2 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index ef6d9b5..997d560 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -45,6 +45,9 @@
static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
+static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
+static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
+ QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;
@@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
}
}
+static bool vhost_log_dev_enabled(struct vhost_dev *dev)
+{
+ assert(dev->vhost_ops);
+ assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
+ assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
+
+ return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
+}
+
+static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
+{
+ struct vhost_dev *logdev = NULL;
+ VhostBackendType backend_type;
+ bool reelect = false;
+
+ assert(hdev->vhost_ops);
+ assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
+ assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
+
+ backend_type = hdev->vhost_ops->backend_type;
+
+ if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
+ reelect = !vhost_mem_logger[backend_type];
+ QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
+ } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
+ reelect = vhost_mem_logger[backend_type] == hdev;
+ QLIST_REMOVE(hdev, logdev_entry);
+ }
+
+ if (!reelect)
+ return;
+
+ QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
+ if (!hdev->vhost_ops ||
+ hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
+ hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)
+ continue;
+
+ if (hdev->vhost_ops->backend_type == backend_type) {
+ logdev = hdev;
+ break;
+ }
+ }
+
+ vhost_mem_logger[backend_type] = logdev;
+}
+
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
MemoryRegionSection *section,
hwaddr first,
@@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
start_addr = MAX(first, start_addr);
end_addr = MIN(last, end_addr);
- for (i = 0; i < dev->mem->nregions; ++i) {
- struct vhost_memory_region *reg = dev->mem->regions + i;
- vhost_dev_sync_region(dev, section, start_addr, end_addr,
- reg->guest_phys_addr,
- range_get_last(reg->guest_phys_addr,
- reg->memory_size));
+ if (vhost_log_dev_enabled(dev)) {
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ vhost_dev_sync_region(dev, section, start_addr, end_addr,
+ reg->guest_phys_addr,
+ range_get_last(reg->guest_phys_addr,
+ reg->memory_size));
+ }
}
for (i = 0; i < dev->nvqs; ++i) {
struct vhost_virtqueue *vq = dev->vqs + i;
@@ -382,6 +434,7 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
g_free(log);
}
+ vhost_mlog_set_dev(dev, false);
dev->log = NULL;
dev->log_size = 0;
}
@@ -997,6 +1050,15 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
goto err_vq;
}
}
+
+ /*
+ * At log start we select our vhost_device logger that will scan the
+ * memory sections and skip for the others. This is possible because
+ * the log is shared amongst all vhost devices for a given type of
+ * backend.
+ */
+ vhost_mlog_set_dev(dev, enable_log);
+
return 0;
err_vq:
for (; i >= 0; --i) {
@@ -2072,6 +2134,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
goto fail_log;
}
+ vhost_mlog_set_dev(hdev, true);
}
if (vrings) {
r = vhost_dev_set_vring_enable(hdev, true);
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 0247778..d75faf4 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -129,6 +129,7 @@ struct vhost_dev {
void *opaque;
struct vhost_log *log;
QLIST_ENTRY(vhost_dev) entry;
+ QLIST_ENTRY(vhost_dev) logdev_entry;
QLIST_HEAD(, vhost_iommu) iommu_list;
IOMMUNotifier n;
const VhostDevConfigOps *config_ops;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
2024-02-14 11:50 ` [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
@ 2024-03-06 18:42 ` Eugenio Perez Martin
2024-03-12 18:35 ` Michael S. Tsirkin
1 sibling, 0 replies; 8+ messages in thread
From: Eugenio Perez Martin @ 2024-03-06 18:42 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, jasowang, joao.m.martins
On Wed, Feb 14, 2024 at 2:02 PM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
> On setups with one or more virtio-net devices with vhost on,
> dirty tracking iteration increases cost the bigger the number
> amount of queues are set up e.g. on idle guests migration the
> following is observed with virtio-net with vhost=on:
>
> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>
I think the after benchmark should also be included.
> With high memory rates the symptom is lack of convergence as soon
> as it has a vhost device with a sufficiently high number of queues,
> the sufficient number of vhost devices.
>
> On every migration iteration (every 100msecs) it will redundantly
> query the *shared log* the number of queues configured with vhost
> that exist in the guest. For the virtqueue data, this is necessary,
> but not for the memory sections which are the same. So
> essentially we end up scanning the dirty log too often.
>
> To fix that, select a vhost device responsible for scanning the
> log with regards to memory sections dirty tracking. It is selected
> when we enable the logger (during migration) and cleared when we
> disable the logger. If the vhost logger device goes away for some
> reason, the logger will be re-selected from the rest of vhost
> devices.
>
> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> ---
> hw/virtio/vhost.c | 75 +++++++++++++++++++++++++++++++++++++++++++----
> include/hw/virtio/vhost.h | 1 +
> 2 files changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index ef6d9b5..997d560 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -45,6 +45,9 @@
>
> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
> +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
> + QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> }
> }
>
> +static bool vhost_log_dev_enabled(struct vhost_dev *dev)
"Enabled" sounds misleading to me. Maybe vhost_dev_should_log? More
suggestions below.
> +{
> + assert(dev->vhost_ops);
> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> + return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
> +}
> +
> +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
> +{
> + struct vhost_dev *logdev = NULL;
> + VhostBackendType backend_type;
> + bool reelect = false;
> +
> + assert(hdev->vhost_ops);
> + assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> + assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> + backend_type = hdev->vhost_ops->backend_type;
> +
> + if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
> + reelect = !vhost_mem_logger[backend_type];
> + QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
> + } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
> + reelect = vhost_mem_logger[backend_type] == hdev;
> + QLIST_REMOVE(hdev, logdev_entry);
> + }
> +
> + if (!reelect)
> + return;
> +
> + QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
> + if (!hdev->vhost_ops ||
> + hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
> + hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)
Aren't comparisons with ops->backend_type already contained in the
following "hdev->vhost_ops->backend_type == backend_type" ?
> + continue;
> +
> + if (hdev->vhost_ops->backend_type == backend_type) {
> + logdev = hdev;
> + break;
> + }
Why not use VHOST_BACKEND_TYPE_MAX QLISTs, and then simply check if
*dev is the head at vhost_log_dev_enabled?
That way we can remove this foreach and vhost_log_dev_enabled
entirely, as the check is simpler. I think it could even remove this
function entirely and inline QLIST_INSERT / QLIST_REMOVE at callers.
What do you think?
Thanks!
> + }
> +
> + vhost_mem_logger[backend_type] = logdev;
> +}
> +
> static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> MemoryRegionSection *section,
> hwaddr first,
> @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> start_addr = MAX(first, start_addr);
> end_addr = MIN(last, end_addr);
>
> - for (i = 0; i < dev->mem->nregions; ++i) {
> - struct vhost_memory_region *reg = dev->mem->regions + i;
> - vhost_dev_sync_region(dev, section, start_addr, end_addr,
> - reg->guest_phys_addr,
> - range_get_last(reg->guest_phys_addr,
> - reg->memory_size));
> + if (vhost_log_dev_enabled(dev)) {
> + for (i = 0; i < dev->mem->nregions; ++i) {
> + struct vhost_memory_region *reg = dev->mem->regions + i;
> + vhost_dev_sync_region(dev, section, start_addr, end_addr,
> + reg->guest_phys_addr,
> + range_get_last(reg->guest_phys_addr,
> + reg->memory_size));
> + }
> }
> for (i = 0; i < dev->nvqs; ++i) {
> struct vhost_virtqueue *vq = dev->vqs + i;
> @@ -382,6 +434,7 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
> g_free(log);
> }
>
> + vhost_mlog_set_dev(dev, false);
> dev->log = NULL;
> dev->log_size = 0;
> }
> @@ -997,6 +1050,15 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> goto err_vq;
> }
> }
> +
> + /*
> + * At log start we select our vhost_device logger that will scan the
> + * memory sections and skip for the others. This is possible because
> + * the log is shared amongst all vhost devices for a given type of
> + * backend.
> + */
> + vhost_mlog_set_dev(dev, enable_log);
> +
> return 0;
> err_vq:
> for (; i >= 0; --i) {
> @@ -2072,6 +2134,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
> goto fail_log;
> }
> + vhost_mlog_set_dev(hdev, true);
> }
> if (vrings) {
> r = vhost_dev_set_vring_enable(hdev, true);
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 0247778..d75faf4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -129,6 +129,7 @@ struct vhost_dev {
> void *opaque;
> struct vhost_log *log;
> QLIST_ENTRY(vhost_dev) entry;
> + QLIST_ENTRY(vhost_dev) logdev_entry;
> QLIST_HEAD(, vhost_iommu) iommu_list;
> IOMMUNotifier n;
> const VhostDevConfigOps *config_ops;
> --
> 1.8.3.1
>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
2024-02-14 11:50 ` [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
2024-03-06 18:42 ` Eugenio Perez Martin
@ 2024-03-12 18:35 ` Michael S. Tsirkin
1 sibling, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2024-03-12 18:35 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, jasowang, joao.m.martins
On Wed, Feb 14, 2024 at 03:50:19AM -0800, Si-Wei Liu wrote:
> On setups with one or more virtio-net devices with vhost on,
> dirty tracking iteration increases cost the bigger the number
> amount of queues are set up e.g. on idle guests migration the
> following is observed with virtio-net with vhost=on:
>
> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
Given the drastic slowdown I am prepared to treat this as
a bugfix if a version addressing all comments and rebased
is sent early during the freeze.
> With high memory rates the symptom is lack of convergence as soon
> as it has a vhost device with a sufficiently high number of queues,
> the sufficient number of vhost devices.
>
> On every migration iteration (every 100msecs) it will redundantly
> query the *shared log* the number of queues configured with vhost
> that exist in the guest. For the virtqueue data, this is necessary,
> but not for the memory sections which are the same. So
> essentially we end up scanning the dirty log too often.
>
> To fix that, select a vhost device responsible for scanning the
> log with regards to memory sections dirty tracking. It is selected
> when we enable the logger (during migration) and cleared when we
> disable the logger. If the vhost logger device goes away for some
> reason, the logger will be re-selected from the rest of vhost
> devices.
>
> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> ---
> hw/virtio/vhost.c | 75 +++++++++++++++++++++++++++++++++++++++++++----
> include/hw/virtio/vhost.h | 1 +
> 2 files changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index ef6d9b5..997d560 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -45,6 +45,9 @@
>
> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
> +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
> + QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> }
> }
>
> +static bool vhost_log_dev_enabled(struct vhost_dev *dev)
> +{
> + assert(dev->vhost_ops);
> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> + return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
> +}
> +
> +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
> +{
> + struct vhost_dev *logdev = NULL;
> + VhostBackendType backend_type;
> + bool reelect = false;
> +
> + assert(hdev->vhost_ops);
> + assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> + assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> + backend_type = hdev->vhost_ops->backend_type;
> +
> + if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
> + reelect = !vhost_mem_logger[backend_type];
> + QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
> + } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
> + reelect = vhost_mem_logger[backend_type] == hdev;
> + QLIST_REMOVE(hdev, logdev_entry);
> + }
> +
> + if (!reelect)
> + return;
> +
> + QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
> + if (!hdev->vhost_ops ||
> + hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
> + hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)
> + continue;
> +
> + if (hdev->vhost_ops->backend_type == backend_type) {
> + logdev = hdev;
> + break;
> + }
> + }
> +
> + vhost_mem_logger[backend_type] = logdev;
> +}
> +
> static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> MemoryRegionSection *section,
> hwaddr first,
> @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
> start_addr = MAX(first, start_addr);
> end_addr = MIN(last, end_addr);
>
> - for (i = 0; i < dev->mem->nregions; ++i) {
> - struct vhost_memory_region *reg = dev->mem->regions + i;
> - vhost_dev_sync_region(dev, section, start_addr, end_addr,
> - reg->guest_phys_addr,
> - range_get_last(reg->guest_phys_addr,
> - reg->memory_size));
> + if (vhost_log_dev_enabled(dev)) {
> + for (i = 0; i < dev->mem->nregions; ++i) {
> + struct vhost_memory_region *reg = dev->mem->regions + i;
> + vhost_dev_sync_region(dev, section, start_addr, end_addr,
> + reg->guest_phys_addr,
> + range_get_last(reg->guest_phys_addr,
> + reg->memory_size));
> + }
> }
> for (i = 0; i < dev->nvqs; ++i) {
> struct vhost_virtqueue *vq = dev->vqs + i;
> @@ -382,6 +434,7 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
> g_free(log);
> }
>
> + vhost_mlog_set_dev(dev, false);
> dev->log = NULL;
> dev->log_size = 0;
> }
> @@ -997,6 +1050,15 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> goto err_vq;
> }
> }
> +
> + /*
> + * At log start we select our vhost_device logger that will scan the
> + * memory sections and skip for the others. This is possible because
> + * the log is shared amongst all vhost devices for a given type of
> + * backend.
> + */
> + vhost_mlog_set_dev(dev, enable_log);
> +
> return 0;
> err_vq:
> for (; i >= 0; --i) {
> @@ -2072,6 +2134,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
> goto fail_log;
> }
> + vhost_mlog_set_dev(hdev, true);
> }
> if (vrings) {
> r = vhost_dev_set_vring_enable(hdev, true);
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 0247778..d75faf4 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -129,6 +129,7 @@ struct vhost_dev {
> void *opaque;
> struct vhost_log *log;
> QLIST_ENTRY(vhost_dev) entry;
> + QLIST_ENTRY(vhost_dev) logdev_entry;
> QLIST_HEAD(, vhost_iommu) iommu_list;
> IOMMUNotifier n;
> const VhostDevConfigOps *config_ops;
> --
> 1.8.3.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] vhost: dirty log should be per backend type
2024-02-14 11:50 [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
2024-02-14 11:50 ` [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
@ 2024-02-14 18:42 ` Si-Wei Liu
2024-03-12 15:07 ` Michael S. Tsirkin
2024-03-06 17:27 ` Eugenio Perez Martin
2 siblings, 1 reply; 8+ messages in thread
From: Si-Wei Liu @ 2024-02-14 18:42 UTC (permalink / raw)
To: mst; +Cc: jasowang, joao.m.martins, qemu-devel
Hi Michael,
I'm taking off for 2+ weeks, but please feel free to provide comment and
feedback while I'm off. I'll be checking emails still, and am about to
address any opens as soon as I am back.
Thanks,
-Siwei
On 2/14/2024 3:50 AM, Si-Wei Liu wrote:
> There could be a mix of both vhost-user and vhost-kernel clients
> in the same QEMU process, where separate vhost loggers for the
> specific vhost type have to be used. Make the vhost logger per
> backend type, and have them properly reference counted.
>
> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> ---
> hw/virtio/vhost.c | 49 +++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 37 insertions(+), 12 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 2c9ac79..ef6d9b5 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -43,8 +43,8 @@
> do { } while (0)
> #endif
>
> -static struct vhost_log *vhost_log;
> -static struct vhost_log *vhost_log_shm;
> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -287,6 +287,8 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> r = -1;
> }
>
> + assert(dev->vhost_ops->backend_type == backend_type || r < 0);
> +
> return r;
> }
>
> @@ -319,16 +321,23 @@ static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
> return log;
> }
>
> -static struct vhost_log *vhost_log_get(uint64_t size, bool share)
> +static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
> + uint64_t size, bool share)
> {
> - struct vhost_log *log = share ? vhost_log_shm : vhost_log;
> + struct vhost_log *log;
> +
> + if (backend_type == VHOST_BACKEND_TYPE_NONE ||
> + backend_type >= VHOST_BACKEND_TYPE_MAX)
> + return NULL;
> +
> + log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
>
> if (!log || log->size != size) {
> log = vhost_log_alloc(size, share);
> if (share) {
> - vhost_log_shm = log;
> + vhost_log_shm[backend_type] = log;
> } else {
> - vhost_log = log;
> + vhost_log[backend_type] = log;
> }
> } else {
> ++log->refcnt;
> @@ -340,11 +349,20 @@ static struct vhost_log *vhost_log_get(uint64_t size, bool share)
> static void vhost_log_put(struct vhost_dev *dev, bool sync)
> {
> struct vhost_log *log = dev->log;
> + VhostBackendType backend_type;
>
> if (!log) {
> return;
> }
>
> + assert(dev->vhost_ops);
> + backend_type = dev->vhost_ops->backend_type;
> +
> + if (backend_type == VHOST_BACKEND_TYPE_NONE ||
> + backend_type >= VHOST_BACKEND_TYPE_MAX) {
> + return;
> + }
> +
> --log->refcnt;
> if (log->refcnt == 0) {
> /* Sync only the range covered by the old log */
> @@ -352,13 +370,13 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
> vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
> }
>
> - if (vhost_log == log) {
> + if (vhost_log[backend_type] == log) {
> g_free(log->log);
> - vhost_log = NULL;
> - } else if (vhost_log_shm == log) {
> + vhost_log[backend_type] = NULL;
> + } else if (vhost_log_shm[backend_type] == log) {
> qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
> log->fd);
> - vhost_log_shm = NULL;
> + vhost_log_shm[backend_type] = NULL;
> }
>
> g_free(log);
> @@ -376,7 +394,8 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>
> static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> {
> - struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
> + struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
> + size, vhost_dev_log_is_shared(dev));
> uint64_t log_base = (uintptr_t)log->log;
> int r;
>
> @@ -2037,8 +2056,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> uint64_t log_base;
>
> hdev->log_size = vhost_get_log_size(hdev);
> - hdev->log = vhost_log_get(hdev->log_size,
> + hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
> + hdev->log_size,
> vhost_dev_log_is_shared(hdev));
> + if (!hdev->log) {
> + VHOST_OPS_DEBUG(r, "vhost_log_get failed");
> + goto fail_vq;
> + }
> +
> log_base = (uintptr_t)hdev->log->log;
> r = hdev->vhost_ops->vhost_set_log_base(hdev,
> hdev->log_size ? log_base : 0,
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] vhost: dirty log should be per backend type
2024-02-14 18:42 ` [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
@ 2024-03-12 15:07 ` Michael S. Tsirkin
2024-03-13 19:13 ` Si-Wei Liu
0 siblings, 1 reply; 8+ messages in thread
From: Michael S. Tsirkin @ 2024-03-12 15:07 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: jasowang, joao.m.martins, qemu-devel
On Wed, Feb 14, 2024 at 10:42:29AM -0800, Si-Wei Liu wrote:
> Hi Michael,
>
> I'm taking off for 2+ weeks, but please feel free to provide comment and
> feedback while I'm off. I'll be checking emails still, and am about to
> address any opens as soon as I am back.
>
> Thanks,
> -Siwei
Eugenio sent some comments. I don't have more, just address these
please. Thanks!
--
MST
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] vhost: dirty log should be per backend type
2024-03-12 15:07 ` Michael S. Tsirkin
@ 2024-03-13 19:13 ` Si-Wei Liu
0 siblings, 0 replies; 8+ messages in thread
From: Si-Wei Liu @ 2024-03-13 19:13 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: jasowang, joao.m.martins, qemu-devel
On 3/12/2024 8:07 AM, Michael S. Tsirkin wrote:
> On Wed, Feb 14, 2024 at 10:42:29AM -0800, Si-Wei Liu wrote:
>> Hi Michael,
>>
>> I'm taking off for 2+ weeks, but please feel free to provide comment and
>> feedback while I'm off. I'll be checking emails still, and am about to
>> address any opens as soon as I am back.
>>
>> Thanks,
>> -Siwei
> Eugenio sent some comments. I don't have more, just address these
> please. Thanks!
Thanks Michael, good to know you don't have more other than the one from
Eugenio. I will post a v3 shortly to address his comments.
-Siwei
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2 1/2] vhost: dirty log should be per backend type
2024-02-14 11:50 [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
2024-02-14 11:50 ` [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
2024-02-14 18:42 ` [PATCH v2 1/2] vhost: dirty log should be per backend type Si-Wei Liu
@ 2024-03-06 17:27 ` Eugenio Perez Martin
2 siblings, 0 replies; 8+ messages in thread
From: Eugenio Perez Martin @ 2024-03-06 17:27 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, jasowang, joao.m.martins
On Wed, Feb 14, 2024 at 2:01 PM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
> There could be a mix of both vhost-user and vhost-kernel clients
> in the same QEMU process, where separate vhost loggers for the
> specific vhost type have to be used. Make the vhost logger per
> backend type, and have them properly reference counted.
>
> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
It seems to me you missed the cover letter and sent 01/02 as the first message?
> ---
> hw/virtio/vhost.c | 49 +++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 37 insertions(+), 12 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 2c9ac79..ef6d9b5 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -43,8 +43,8 @@
> do { } while (0)
> #endif
>
> -static struct vhost_log *vhost_log;
> -static struct vhost_log *vhost_log_shm;
> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -287,6 +287,8 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> r = -1;
> }
>
> + assert(dev->vhost_ops->backend_type == backend_type || r < 0);
Is this a debug leftover (at least the r<0)? This should never be
reached effectively, but then it does not make sense to leave the
default switch branch.
> +
> return r;
> }
>
> @@ -319,16 +321,23 @@ static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
> return log;
> }
>
> -static struct vhost_log *vhost_log_get(uint64_t size, bool share)
> +static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
> + uint64_t size, bool share)
> {
> - struct vhost_log *log = share ? vhost_log_shm : vhost_log;
> + struct vhost_log *log;
> +
> + if (backend_type == VHOST_BACKEND_TYPE_NONE ||
> + backend_type >= VHOST_BACKEND_TYPE_MAX)
> + return NULL;
The callers (vhost_log_resize, etc) don't expect vhost_log_get to
return NULL. I think all of these should be an assertion, if any.
The rest looks good to me.
> +
> + log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
>
> if (!log || log->size != size) {
> log = vhost_log_alloc(size, share);
> if (share) {
> - vhost_log_shm = log;
> + vhost_log_shm[backend_type] = log;
> } else {
> - vhost_log = log;
> + vhost_log[backend_type] = log;
> }
> } else {
> ++log->refcnt;
> @@ -340,11 +349,20 @@ static struct vhost_log *vhost_log_get(uint64_t size, bool share)
> static void vhost_log_put(struct vhost_dev *dev, bool sync)
> {
> struct vhost_log *log = dev->log;
> + VhostBackendType backend_type;
>
> if (!log) {
> return;
> }
>
> + assert(dev->vhost_ops);
> + backend_type = dev->vhost_ops->backend_type;
> +
> + if (backend_type == VHOST_BACKEND_TYPE_NONE ||
> + backend_type >= VHOST_BACKEND_TYPE_MAX) {
> + return;
> + }
> +
> --log->refcnt;
> if (log->refcnt == 0) {
> /* Sync only the range covered by the old log */
> @@ -352,13 +370,13 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
> vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
> }
>
> - if (vhost_log == log) {
> + if (vhost_log[backend_type] == log) {
> g_free(log->log);
> - vhost_log = NULL;
> - } else if (vhost_log_shm == log) {
> + vhost_log[backend_type] = NULL;
> + } else if (vhost_log_shm[backend_type] == log) {
> qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
> log->fd);
> - vhost_log_shm = NULL;
> + vhost_log_shm[backend_type] = NULL;
> }
>
> g_free(log);
> @@ -376,7 +394,8 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>
> static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
> {
> - struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
> + struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
> + size, vhost_dev_log_is_shared(dev));
> uint64_t log_base = (uintptr_t)log->log;
> int r;
>
> @@ -2037,8 +2056,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
> uint64_t log_base;
>
> hdev->log_size = vhost_get_log_size(hdev);
> - hdev->log = vhost_log_get(hdev->log_size,
> + hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
> + hdev->log_size,
> vhost_dev_log_is_shared(hdev));
> + if (!hdev->log) {
> + VHOST_OPS_DEBUG(r, "vhost_log_get failed");
> + goto fail_vq;
> + }
> +
> log_base = (uintptr_t)hdev->log->log;
> r = hdev->vhost_ops->vhost_set_log_base(hdev,
> hdev->log_size ? log_base : 0,
> --
> 1.8.3.1
>
>
^ permalink raw reply [flat|nested] 8+ messages in thread