* [PATCH v4 1/2] vhost: dirty log should be per backend type
@ 2024-03-14 20:27 Si-Wei Liu
2024-03-14 20:27 ` [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
2024-03-15 3:50 ` [PATCH v4 1/2] vhost: dirty log should be per backend type Jason Wang
0 siblings, 2 replies; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-14 20:27 UTC (permalink / raw)
To: qemu-devel; +Cc: mst, jasowang, eperezma, joao.m.martins, si-wei.liu
There could be a mix of both vhost-user and vhost-kernel clients
in the same QEMU process, in which case separate vhost loggers have
to be used for each specific vhost backend type. Make the vhost
logger per backend type, and have the loggers properly reference
counted.
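As a rough standalone sketch (not part of the patch; all names below
are made up), the logger lifetime after this change behaves like a
per-type, reference-counted singleton, so vhost-kernel and vhost-user
users no longer clobber each other's log:

    /* Toy model of the per-backend-type, refcounted logger; the real
     * code is in the hunks below. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    enum { TYPE_NONE, TYPE_KERNEL, TYPE_USER, TYPE_MAX };

    struct toy_log { uint64_t size; int refcnt; };

    static struct toy_log *toy_logs[TYPE_MAX];

    static struct toy_log *toy_log_get(int type, uint64_t size)
    {
        assert(type > TYPE_NONE && type < TYPE_MAX);
        struct toy_log *log = toy_logs[type];

        if (!log || log->size != size) {
            /* first user of this type (or of a new size) allocates */
            log = calloc(1, sizeof(*log));
            log->size = size;
            log->refcnt = 1;
            toy_logs[type] = log;
        } else {
            ++log->refcnt;      /* later users of the same type share it */
        }
        return log;
    }

    static void toy_log_put(int type, struct toy_log *log)
    {
        if (--log->refcnt == 0) {
            if (toy_logs[type] == log) {
                toy_logs[type] = NULL;
            }
            free(log);
        }
    }

    int main(void)
    {
        struct toy_log *k = toy_log_get(TYPE_KERNEL, 16);
        struct toy_log *u = toy_log_get(TYPE_USER, 16);

        assert(k != u);         /* independent loggers per backend type */
        toy_log_put(TYPE_USER, u);
        toy_log_put(TYPE_KERNEL, k);
        return 0;
    }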
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
---
v3->v4:
- remove checking NULL return value from vhost_log_get
v2->v3:
- remove non-effective assertion that can never be reached
- do not return NULL from vhost_log_get()
- add necessary assertions to vhost_log_get()
---
hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
1 file changed, 33 insertions(+), 12 deletions(-)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac79..612f4db 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -43,8 +43,8 @@
do { } while (0)
#endif
-static struct vhost_log *vhost_log;
-static struct vhost_log *vhost_log_shm;
+static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
+static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;
@@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
r = -1;
}
+ if (r == 0) {
+ assert(dev->vhost_ops->backend_type == backend_type);
+ }
+
return r;
}
@@ -319,16 +323,22 @@ static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
return log;
}
-static struct vhost_log *vhost_log_get(uint64_t size, bool share)
+static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
+ uint64_t size, bool share)
{
- struct vhost_log *log = share ? vhost_log_shm : vhost_log;
+ struct vhost_log *log;
+
+ assert(backend_type > VHOST_BACKEND_TYPE_NONE);
+ assert(backend_type < VHOST_BACKEND_TYPE_MAX);
+
+ log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
if (!log || log->size != size) {
log = vhost_log_alloc(size, share);
if (share) {
- vhost_log_shm = log;
+ vhost_log_shm[backend_type] = log;
} else {
- vhost_log = log;
+ vhost_log[backend_type] = log;
}
} else {
++log->refcnt;
@@ -340,11 +350,20 @@ static struct vhost_log *vhost_log_get(uint64_t size, bool share)
static void vhost_log_put(struct vhost_dev *dev, bool sync)
{
struct vhost_log *log = dev->log;
+ VhostBackendType backend_type;
if (!log) {
return;
}
+ assert(dev->vhost_ops);
+ backend_type = dev->vhost_ops->backend_type;
+
+ if (backend_type == VHOST_BACKEND_TYPE_NONE ||
+ backend_type >= VHOST_BACKEND_TYPE_MAX) {
+ return;
+ }
+
--log->refcnt;
if (log->refcnt == 0) {
/* Sync only the range covered by the old log */
@@ -352,13 +371,13 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
}
- if (vhost_log == log) {
+ if (vhost_log[backend_type] == log) {
g_free(log->log);
- vhost_log = NULL;
- } else if (vhost_log_shm == log) {
+ vhost_log[backend_type] = NULL;
+ } else if (vhost_log_shm[backend_type] == log) {
qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
log->fd);
- vhost_log_shm = NULL;
+ vhost_log_shm[backend_type] = NULL;
}
g_free(log);
@@ -376,7 +395,8 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
{
- struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
+ struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
+ size, vhost_dev_log_is_shared(dev));
uint64_t log_base = (uintptr_t)log->log;
int r;
@@ -2037,7 +2057,8 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
uint64_t log_base;
hdev->log_size = vhost_get_log_size(hdev);
- hdev->log = vhost_log_get(hdev->log_size,
+ hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
+ hdev->log_size,
vhost_dev_log_is_shared(hdev));
log_base = (uintptr_t)hdev->log->log;
r = hdev->vhost_ops->vhost_set_log_base(hdev,
--
1.8.3.1
* [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-14 20:27 [PATCH v4 1/2] vhost: dirty log should be per backend type Si-Wei Liu
@ 2024-03-14 20:27 ` Si-Wei Liu
2024-03-15 4:03 ` Jason Wang
2024-03-15 3:50 ` [PATCH v4 1/2] vhost: dirty log should be per backend type Jason Wang
1 sibling, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-14 20:27 UTC (permalink / raw)
To: qemu-devel; +Cc: mst, jasowang, eperezma, joao.m.martins, si-wei.liu
On setups with one or more virtio-net devices with vhost on, the
cost of a dirty tracking iteration increases with the number of
queues that are set up, e.g. on idle guest migration the following
is observed with virtio-net with vhost=on:
48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
With high memory dirtying rates the symptom is lack of convergence,
as soon as there is a vhost device with a sufficiently high number
of queues, or a sufficient number of vhost devices.
On every migration iteration (every 100 msecs) the *shared log* is
redundantly queried as many times as there are vhost queues
configured in the guest. For the virtqueue data this is necessary,
but not for the memory sections, which are the same across devices.
So essentially we end up scanning the dirty log too often.
To fix that, select one vhost device to be responsible for scanning
the log with regard to memory-section dirty tracking. It is selected
when we enable the logger (during migration) and cleared when we
disable the logger. If the vhost logger device goes away for some
reason, the logger will be re-selected from the rest of the vhost
devices.
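As a rough standalone sketch of the election scheme (illustrative
only; the real QLIST-based code is in the hunks below, and this toy
uses the BSD <sys/queue.h> macros that QLIST mirrors), the elected
logger is simply the first device on a per-backend-type list, and
new devices are linked in after the head so the election does not
churn:

    #include <stdio.h>
    #include <sys/queue.h>

    struct dev {
        const char *name;
        LIST_ENTRY(dev) logdev_entry;
    };

    static LIST_HEAD(, dev) log_devs = LIST_HEAD_INITIALIZER(log_devs);

    static void elect_mem_logger(struct dev *d, int add)
    {
        if (add) {
            if (LIST_EMPTY(&log_devs)) {
                LIST_INSERT_HEAD(&log_devs, d, logdev_entry);
            } else {
                /* newcomers go after the head: the current logger stays */
                LIST_INSERT_AFTER(LIST_FIRST(&log_devs), d, logdev_entry);
            }
        } else {
            LIST_REMOVE(d, logdev_entry);
        }
    }

    static int should_log(struct dev *d)
    {
        /* only the head of the list scans the memory sections */
        return d == LIST_FIRST(&log_devs);
    }

    int main(void)
    {
        struct dev a = { "a" }, b = { "b" };

        elect_mem_logger(&a, 1);
        elect_mem_logger(&b, 1);
        printf("a scans: %d, b scans: %d\n", should_log(&a), should_log(&b));
        elect_mem_logger(&a, 0);                      /* logger goes away... */
        printf("b scans now: %d\n", should_log(&b));  /* ...b is re-elected  */
        return 0;
    }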
After making the mem-section logger a singleton instance, a constant
cost of 7%-9% (similar to the 1 queue report) is seen, no matter how
many queues or how many vhost devices are configured:
48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
---
v3 -> v4:
- add comment to clarify effect on cache locality and
performance
v2 -> v3:
- add after-fix benchmark to commit log
- rename vhost_log_dev_enabled to vhost_dev_should_log
- remove unneeded comparisons for backend_type
- use QLIST array instead of single flat list to store vhost
logger devices
- simplify logger election logic
---
hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
include/hw/virtio/vhost.h | 1 +
2 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 612f4db..58522f1 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -45,6 +45,7 @@
static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
+static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;
@@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
}
}
+static inline bool vhost_dev_should_log(struct vhost_dev *dev)
+{
+ assert(dev->vhost_ops);
+ assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
+ assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
+
+ return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
+}
+
+static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
+{
+ VhostBackendType backend_type;
+
+ assert(hdev->vhost_ops);
+
+ backend_type = hdev->vhost_ops->backend_type;
+ assert(backend_type > VHOST_BACKEND_TYPE_NONE);
+ assert(backend_type < VHOST_BACKEND_TYPE_MAX);
+
+ if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
+ if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
+ QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
+ hdev, logdev_entry);
+ } else {
+ /*
+ * The first vhost_device in the list is selected as the shared
+ * logger to scan memory sections. Put new entry next to the head
+ * to avoid inadvertent change to the underlying logger device.
+ * This is done in order to get better cache locality and to avoid
+ * performance churn on the hot path for log scanning. Even when
+ * new devices come and go quickly, it wouldn't end up changing
+ * the active leading logger device at all.
+ */
+ QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
+ hdev, logdev_entry);
+ }
+ } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
+ QLIST_REMOVE(hdev, logdev_entry);
+ }
+}
+
static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
MemoryRegionSection *section,
hwaddr first,
@@ -166,12 +208,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
start_addr = MAX(first, start_addr);
end_addr = MIN(last, end_addr);
- for (i = 0; i < dev->mem->nregions; ++i) {
- struct vhost_memory_region *reg = dev->mem->regions + i;
- vhost_dev_sync_region(dev, section, start_addr, end_addr,
- reg->guest_phys_addr,
- range_get_last(reg->guest_phys_addr,
- reg->memory_size));
+ if (vhost_dev_should_log(dev)) {
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ vhost_dev_sync_region(dev, section, start_addr, end_addr,
+ reg->guest_phys_addr,
+ range_get_last(reg->guest_phys_addr,
+ reg->memory_size));
+ }
}
for (i = 0; i < dev->nvqs; ++i) {
struct vhost_virtqueue *vq = dev->vqs + i;
@@ -383,6 +427,7 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
g_free(log);
}
+ vhost_dev_elect_mem_logger(dev, false);
dev->log = NULL;
dev->log_size = 0;
}
@@ -998,6 +1043,15 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
goto err_vq;
}
}
+
+ /*
+ * At log start we select our vhost_device logger that will scan the
+ * memory sections and skip for the others. This is possible because
+ * the log is shared amongst all vhost devices for a given type of
+ * backend.
+ */
+ vhost_dev_elect_mem_logger(dev, enable_log);
+
return 0;
err_vq:
for (; i >= 0; --i) {
@@ -2068,6 +2122,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
goto fail_log;
}
+ vhost_dev_elect_mem_logger(hdev, true);
}
if (vrings) {
r = vhost_dev_set_vring_enable(hdev, true);
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 0247778..d75faf4 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -129,6 +129,7 @@ struct vhost_dev {
void *opaque;
struct vhost_log *log;
QLIST_ENTRY(vhost_dev) entry;
+ QLIST_ENTRY(vhost_dev) logdev_entry;
QLIST_HEAD(, vhost_iommu) iommu_list;
IOMMUNotifier n;
const VhostDevConfigOps *config_ops;
--
1.8.3.1
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-14 20:27 [PATCH v4 1/2] vhost: dirty log should be per backend type Si-Wei Liu
2024-03-14 20:27 ` [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
@ 2024-03-15 3:50 ` Jason Wang
2024-03-15 18:33 ` Si-Wei Liu
1 sibling, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-15 3:50 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
> There could be a mix of both vhost-user and vhost-kernel clients
> in the same QEMU process, where separate vhost loggers for the
> specific vhost type have to be used. Make the vhost logger per
> backend type, and have them properly reference counted.
It's better to describe what the advantage of doing this is.
>
> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>
> ---
> v3->v4:
> - remove checking NULL return value from vhost_log_get
>
> v2->v3:
> - remove non-effective assertion that never be reached
> - do not return NULL from vhost_log_get()
> - add neccessary assertions to vhost_log_get()
> ---
> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
> 1 file changed, 33 insertions(+), 12 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 2c9ac79..612f4db 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -43,8 +43,8 @@
> do { } while (0)
> #endif
>
> -static struct vhost_log *vhost_log;
> -static struct vhost_log *vhost_log_shm;
> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> r = -1;
> }
>
> + if (r == 0) {
> + assert(dev->vhost_ops->backend_type == backend_type);
> + }
> +
Under which condition could we hit this? It does not seem good to
assert on purely local logic.
Thanks
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-14 20:27 ` [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
@ 2024-03-15 4:03 ` Jason Wang
2024-03-15 18:44 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-15 4:03 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
> On setups with one or more virtio-net devices with vhost on,
> dirty tracking iteration increases cost the bigger the number
> amount of queues are set up e.g. on idle guests migration the
> following is observed with virtio-net with vhost=on:
>
> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>
> With high memory rates the symptom is lack of convergence as soon
> as it has a vhost device with a sufficiently high number of queues,
> the sufficient number of vhost devices.
>
> On every migration iteration (every 100msecs) it will redundantly
> query the *shared log* the number of queues configured with vhost
> that exist in the guest. For the virtqueue data, this is necessary,
> but not for the memory sections which are the same. So essentially
> we end up scanning the dirty log too often.
>
> To fix that, select a vhost device responsible for scanning the
> log with regards to memory sections dirty tracking. It is selected
> when we enable the logger (during migration) and cleared when we
> disable the logger. If the vhost logger device goes away for some
> reason, the logger will be re-selected from the rest of vhost
> devices.
>
> After making mem-section logger a singleton instance, constant cost
> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> queues or how many vhost devices are configured:
>
> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>
> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>
> ---
> v3 -> v4:
> - add comment to clarify effect on cache locality and
> performance
>
> v2 -> v3:
> - add after-fix benchmark to commit log
> - rename vhost_log_dev_enabled to vhost_dev_should_log
> - remove unneeded comparisons for backend_type
> - use QLIST array instead of single flat list to store vhost
> logger devices
> - simplify logger election logic
> ---
> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> include/hw/virtio/vhost.h | 1 +
> 2 files changed, 62 insertions(+), 6 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 612f4db..58522f1 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -45,6 +45,7 @@
>
> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>
> /* Memslots used by backends that support private memslots (without an fd). */
> static unsigned int used_memslots;
> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> }
> }
>
> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> +{
> + assert(dev->vhost_ops);
> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
A dumb question: why not simply check
dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
?
Thanks
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-15 3:50 ` [PATCH v4 1/2] vhost: dirty log should be per backend type Jason Wang
@ 2024-03-15 18:33 ` Si-Wei Liu
2024-03-18 3:20 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-15 18:33 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/14/2024 8:50 PM, Jason Wang wrote:
> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>> There could be a mix of both vhost-user and vhost-kernel clients
>> in the same QEMU process, where separate vhost loggers for the
>> specific vhost type have to be used. Make the vhost logger per
>> backend type, and have them properly reference counted.
> It's better to describe what's the advantage of doing this.
Yes, I can add that to the log. Although it's a niche use case, it was
actually a long-standing limitation / bug that vhost-user and
vhost-kernel loggers can't co-exist in the same QEMU process; today
this just ends up as a silent failure. This bug fix removes that
implicit limitation in the code.
>
>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>
>> ---
>> v3->v4:
>> - remove checking NULL return value from vhost_log_get
>>
>> v2->v3:
>> - remove non-effective assertion that never be reached
>> - do not return NULL from vhost_log_get()
>> - add neccessary assertions to vhost_log_get()
>> ---
>> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
>> 1 file changed, 33 insertions(+), 12 deletions(-)
>>
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index 2c9ac79..612f4db 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -43,8 +43,8 @@
>> do { } while (0)
>> #endif
>>
>> -static struct vhost_log *vhost_log;
>> -static struct vhost_log *vhost_log_shm;
>> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>
>> /* Memslots used by backends that support private memslots (without an fd). */
>> static unsigned int used_memslots;
>> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
>> r = -1;
>> }
>>
>> + if (r == 0) {
>> + assert(dev->vhost_ops->backend_type == backend_type);
>> + }
>> +
> Under which condition could we hit this?
Just in case some other function inadvertently corrupted this earlier,
we have to capture the discrepancy in the first place... On the other
hand, it will be helpful for other vhost backend writers to diagnose
day-one bugs in the code. I feel a code comment alone here would not be
sufficient/helpful.
> It seems not good to assert a local logic.
It seems to me quite a few local asserts are in the same file already:
vhost_save_backend_state, vhost_load_backend_state,
vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why would
a local assert be a problem?
Thanks,
-Siwei
> Thanks
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-15 4:03 ` Jason Wang
@ 2024-03-15 18:44 ` Si-Wei Liu
2024-03-18 3:22 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-15 18:44 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/14/2024 9:03 PM, Jason Wang wrote:
> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>> On setups with one or more virtio-net devices with vhost on,
>> dirty tracking iteration increases cost the bigger the number
>> amount of queues are set up e.g. on idle guests migration the
>> following is observed with virtio-net with vhost=on:
>>
>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>
>> With high memory rates the symptom is lack of convergence as soon
>> as it has a vhost device with a sufficiently high number of queues,
>> the sufficient number of vhost devices.
>>
>> On every migration iteration (every 100msecs) it will redundantly
>> query the *shared log* the number of queues configured with vhost
>> that exist in the guest. For the virtqueue data, this is necessary,
>> but not for the memory sections which are the same. So essentially
>> we end up scanning the dirty log too often.
>>
>> To fix that, select a vhost device responsible for scanning the
>> log with regards to memory sections dirty tracking. It is selected
>> when we enable the logger (during migration) and cleared when we
>> disable the logger. If the vhost logger device goes away for some
>> reason, the logger will be re-selected from the rest of vhost
>> devices.
>>
>> After making mem-section logger a singleton instance, constant cost
>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>> queues or how many vhost devices are configured:
>>
>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>
>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>
>> ---
>> v3 -> v4:
>> - add comment to clarify effect on cache locality and
>> performance
>>
>> v2 -> v3:
>> - add after-fix benchmark to commit log
>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>> - remove unneeded comparisons for backend_type
>> - use QLIST array instead of single flat list to store vhost
>> logger devices
>> - simplify logger election logic
>> ---
>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>> include/hw/virtio/vhost.h | 1 +
>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>> index 612f4db..58522f1 100644
>> --- a/hw/virtio/vhost.c
>> +++ b/hw/virtio/vhost.c
>> @@ -45,6 +45,7 @@
>>
>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>
>> /* Memslots used by backends that support private memslots (without an fd). */
>> static unsigned int used_memslots;
>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>> }
>> }
>>
>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>> +{
>> + assert(dev->vhost_ops);
>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>> +
>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> A dumb question, why not simple check
>
> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
Because we are not sure if the logger comes from vhost_log_shm[] or
vhost_log[]. I don't want to complicate the check here by calling into
vhost_dev_log_is_shared() every time .log_sync() is called.
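I.e. (just a sketch, not proposed code) the suggested comparison would
have to grow into something along the lines of

    VhostBackendType t = dev->vhost_ops->backend_type;
    return dev->log == (vhost_dev_log_is_shared(dev) ? vhost_log_shm[t]
                                                     : vhost_log[t]);

which adds an indirect backend call on every .log_sync().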
-Siwei
> ?
>
> Thanks
>
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-15 18:33 ` Si-Wei Liu
@ 2024-03-18 3:20 ` Jason Wang
2024-03-18 22:06 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-18 3:20 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Sat, Mar 16, 2024 at 2:33 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/14/2024 8:50 PM, Jason Wang wrote:
> > On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >> There could be a mix of both vhost-user and vhost-kernel clients
> >> in the same QEMU process, where separate vhost loggers for the
> >> specific vhost type have to be used. Make the vhost logger per
> >> backend type, and have them properly reference counted.
> > It's better to describe what's the advantage of doing this.
> Yes, I can add that to the log. Although it's a niche use case, it was
> actually a long standing limitation / bug that vhost-user and
> vhost-kernel loggers can't co-exist per QEMU process, but today it's
> just silent failure that may be ended up with. This bug fix removes that
> implicit limitation in the code.
Ok.
> >
> >> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> >> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>
> >> ---
> >> v3->v4:
> >> - remove checking NULL return value from vhost_log_get
> >>
> >> v2->v3:
> >> - remove non-effective assertion that never be reached
> >> - do not return NULL from vhost_log_get()
> >> - add neccessary assertions to vhost_log_get()
> >> ---
> >> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
> >> 1 file changed, 33 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >> index 2c9ac79..612f4db 100644
> >> --- a/hw/virtio/vhost.c
> >> +++ b/hw/virtio/vhost.c
> >> @@ -43,8 +43,8 @@
> >> do { } while (0)
> >> #endif
> >>
> >> -static struct vhost_log *vhost_log;
> >> -static struct vhost_log *vhost_log_shm;
> >> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>
> >> /* Memslots used by backends that support private memslots (without an fd). */
> >> static unsigned int used_memslots;
> >> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> >> r = -1;
> >> }
> >>
> >> + if (r == 0) {
> >> + assert(dev->vhost_ops->backend_type == backend_type);
> >> + }
> >> +
> > Under which condition could we hit this?
> Just in case some other function inadvertently corrupted this earlier,
> we have to capture discrepancy in the first place... On the other hand,
> it will be helpful for other vhost backend writers to diagnose day-one
> bug in the code. I feel just code comment here will not be
> sufficient/helpful.
See below.
>
> > It seems not good to assert a local logic.
> It seems to me quite a few local asserts are in the same file already,
> vhost_save_backend_state,
For example it has an assert for
assert(!dev->started);
which is not the logic of the function itself but requires
vhost_dev_start() not to have been called before.
But it looks like in this patch you assert on code just a few lines
above the assert itself?
dev->vhost_ops = &xxx_ops;
...
assert(dev->vhost_ops->backend_type == backend_type)
?
Thanks
> vhost_load_backend_state,
> vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why local
> assert a problem?
>
> Thanks,
> -Siwei
>
> > Thanks
> >
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-15 18:44 ` Si-Wei Liu
@ 2024-03-18 3:22 ` Jason Wang
2024-03-18 22:16 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-18 3:22 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/14/2024 9:03 PM, Jason Wang wrote:
> > On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >> On setups with one or more virtio-net devices with vhost on,
> >> dirty tracking iteration increases cost the bigger the number
> >> amount of queues are set up e.g. on idle guests migration the
> >> following is observed with virtio-net with vhost=on:
> >>
> >> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>
> >> With high memory rates the symptom is lack of convergence as soon
> >> as it has a vhost device with a sufficiently high number of queues,
> >> the sufficient number of vhost devices.
> >>
> >> On every migration iteration (every 100msecs) it will redundantly
> >> query the *shared log* the number of queues configured with vhost
> >> that exist in the guest. For the virtqueue data, this is necessary,
> >> but not for the memory sections which are the same. So essentially
> >> we end up scanning the dirty log too often.
> >>
> >> To fix that, select a vhost device responsible for scanning the
> >> log with regards to memory sections dirty tracking. It is selected
> >> when we enable the logger (during migration) and cleared when we
> >> disable the logger. If the vhost logger device goes away for some
> >> reason, the logger will be re-selected from the rest of vhost
> >> devices.
> >>
> >> After making mem-section logger a singleton instance, constant cost
> >> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >> queues or how many vhost devices are configured:
> >>
> >> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>
> >> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>
> >> ---
> >> v3 -> v4:
> >> - add comment to clarify effect on cache locality and
> >> performance
> >>
> >> v2 -> v3:
> >> - add after-fix benchmark to commit log
> >> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >> - remove unneeded comparisons for backend_type
> >> - use QLIST array instead of single flat list to store vhost
> >> logger devices
> >> - simplify logger election logic
> >> ---
> >> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >> include/hw/virtio/vhost.h | 1 +
> >> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >> index 612f4db..58522f1 100644
> >> --- a/hw/virtio/vhost.c
> >> +++ b/hw/virtio/vhost.c
> >> @@ -45,6 +45,7 @@
> >>
> >> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>
> >> /* Memslots used by backends that support private memslots (without an fd). */
> >> static unsigned int used_memslots;
> >> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >> }
> >> }
> >>
> >> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >> +{
> >> + assert(dev->vhost_ops);
> >> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >> +
> >> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> > A dumb question, why not simple check
> >
> > dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> Because we are not sure if the logger comes from vhost_log_shm[] or
> vhost_log[]. Don't want to complicate the check here by calling into
> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
It has very low overhead, doesn't it?
static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
return dev->vhost_ops->vhost_requires_shm_log &&
dev->vhost_ops->vhost_requires_shm_log(dev);
}
And it helps to simplify the logic.
Thanks
>
> -Siwei
> > ?
> >
> > Thanks
> >
>
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-18 3:20 ` Jason Wang
@ 2024-03-18 22:06 ` Si-Wei Liu
2024-03-20 3:25 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-18 22:06 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/17/2024 8:20 PM, Jason Wang wrote:
> On Sat, Mar 16, 2024 at 2:33 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/14/2024 8:50 PM, Jason Wang wrote:
>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>> There could be a mix of both vhost-user and vhost-kernel clients
>>>> in the same QEMU process, where separate vhost loggers for the
>>>> specific vhost type have to be used. Make the vhost logger per
>>>> backend type, and have them properly reference counted.
>>> It's better to describe what's the advantage of doing this.
>> Yes, I can add that to the log. Although it's a niche use case, it was
>> actually a long standing limitation / bug that vhost-user and
>> vhost-kernel loggers can't co-exist per QEMU process, but today it's
>> just silent failure that may be ended up with. This bug fix removes that
>> implicit limitation in the code.
> Ok.
>
>>>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>
>>>> ---
>>>> v3->v4:
>>>> - remove checking NULL return value from vhost_log_get
>>>>
>>>> v2->v3:
>>>> - remove non-effective assertion that never be reached
>>>> - do not return NULL from vhost_log_get()
>>>> - add neccessary assertions to vhost_log_get()
>>>> ---
>>>> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
>>>> 1 file changed, 33 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>> index 2c9ac79..612f4db 100644
>>>> --- a/hw/virtio/vhost.c
>>>> +++ b/hw/virtio/vhost.c
>>>> @@ -43,8 +43,8 @@
>>>> do { } while (0)
>>>> #endif
>>>>
>>>> -static struct vhost_log *vhost_log;
>>>> -static struct vhost_log *vhost_log_shm;
>>>> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>
>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>> static unsigned int used_memslots;
>>>> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
>>>> r = -1;
>>>> }
>>>>
>>>> + if (r == 0) {
>>>> + assert(dev->vhost_ops->backend_type == backend_type);
>>>> + }
>>>> +
>>> Under which condition could we hit this?
>> Just in case some other function inadvertently corrupted this earlier,
>> we have to capture discrepancy in the first place... On the other hand,
>> it will be helpful for other vhost backend writers to diagnose day-one
>> bug in the code. I feel just code comment here will not be
>> sufficient/helpful.
> See below.
>
>>> It seems not good to assert a local logic.
>> It seems to me quite a few local asserts are in the same file already,
>> vhost_save_backend_state,
> For example it has assert for
>
> assert(!dev->started);
>
> which is not the logic of the function itself but require
> vhost_dev_start() not to be called before.
>
> But it looks like this patch you assert the code just a few lines
> above the assert itself?
Yes, that was the intent - e.g. xxx_ops may already contain a corrupted
xxx_ops.backend_type before coming to this vhost_set_backend_type()
function. And we may capture this corrupted state by asserting the
expected xxx_ops.backend_type (to be consistent with the backend_type
passed in), which needs to be done in the first place when this
discrepancy is detected. In practice I think there is no harm in adding
this assert, and it will add a warranted guarantee to the current code.
Regards,
-Siwei
>
> dev->vhost_ops = &xxx_ops;
>
> ...
>
> assert(dev->vhost_ops->backend_type == backend_type)
>
> ?
>
> Thanks
>
>> vhost_load_backend_state,
>> vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why local
>> assert a problem?
>>
>> Thanks,
>> -Siwei
>>
>>> Thanks
>>>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-18 3:22 ` Jason Wang
@ 2024-03-18 22:16 ` Si-Wei Liu
2024-03-20 3:27 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-18 22:16 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/17/2024 8:22 PM, Jason Wang wrote:
> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/14/2024 9:03 PM, Jason Wang wrote:
>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>> On setups with one or more virtio-net devices with vhost on,
>>>> dirty tracking iteration increases cost the bigger the number
>>>> amount of queues are set up e.g. on idle guests migration the
>>>> following is observed with virtio-net with vhost=on:
>>>>
>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>>>
>>>> With high memory rates the symptom is lack of convergence as soon
>>>> as it has a vhost device with a sufficiently high number of queues,
>>>> the sufficient number of vhost devices.
>>>>
>>>> On every migration iteration (every 100msecs) it will redundantly
>>>> query the *shared log* the number of queues configured with vhost
>>>> that exist in the guest. For the virtqueue data, this is necessary,
>>>> but not for the memory sections which are the same. So essentially
>>>> we end up scanning the dirty log too often.
>>>>
>>>> To fix that, select a vhost device responsible for scanning the
>>>> log with regards to memory sections dirty tracking. It is selected
>>>> when we enable the logger (during migration) and cleared when we
>>>> disable the logger. If the vhost logger device goes away for some
>>>> reason, the logger will be re-selected from the rest of vhost
>>>> devices.
>>>>
>>>> After making mem-section logger a singleton instance, constant cost
>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>>>> queues or how many vhost devices are configured:
>>>>
>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>>>
>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>
>>>> ---
>>>> v3 -> v4:
>>>> - add comment to clarify effect on cache locality and
>>>> performance
>>>>
>>>> v2 -> v3:
>>>> - add after-fix benchmark to commit log
>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>>>> - remove unneeded comparisons for backend_type
>>>> - use QLIST array instead of single flat list to store vhost
>>>> logger devices
>>>> - simplify logger election logic
>>>> ---
>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>>>> include/hw/virtio/vhost.h | 1 +
>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>> index 612f4db..58522f1 100644
>>>> --- a/hw/virtio/vhost.c
>>>> +++ b/hw/virtio/vhost.c
>>>> @@ -45,6 +45,7 @@
>>>>
>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>>>
>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>> static unsigned int used_memslots;
>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>>>> }
>>>> }
>>>>
>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>>>> +{
>>>> + assert(dev->vhost_ops);
>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>>>> +
>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
>>> A dumb question, why not simple check
>>>
>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>> Because we are not sure if the logger comes from vhost_log_shm[] or
>> vhost_log[]. Don't want to complicate the check here by calling into
>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
> It has very low overhead, isn't it?
Whether this has low overhead will depend on the specific backend's
implementation of .vhost_requires_shm_log(); the common vhost layer
should not make assumptions about, or rely on, the current
implementation.
>
> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> {
> return dev->vhost_ops->vhost_requires_shm_log &&
> dev->vhost_ops->vhost_requires_shm_log(dev);
> }
>
> And it helps to simplify the logic.
Generally yes, but when it comes to hot-path operations the performance
consideration could override this principle. I think there's no harm in
checking against the logger device cached in the vhost layer itself,
and the current patch does not create a lot of complexity or
performance side effects (actually I think the conditional should be
very straightforward to turn into just a couple of compare-and-branch
instructions rather than an indirect call through another function
pointer).
-Siwei
>
> Thanks
>
>> -Siwei
>>> ?
>>>
>>> Thanks
>>>
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-18 22:06 ` Si-Wei Liu
@ 2024-03-20 3:25 ` Jason Wang
2024-03-20 20:29 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-20 3:25 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Tue, Mar 19, 2024 at 6:06 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/17/2024 8:20 PM, Jason Wang wrote:
> > On Sat, Mar 16, 2024 at 2:33 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/14/2024 8:50 PM, Jason Wang wrote:
> >>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>> There could be a mix of both vhost-user and vhost-kernel clients
> >>>> in the same QEMU process, where separate vhost loggers for the
> >>>> specific vhost type have to be used. Make the vhost logger per
> >>>> backend type, and have them properly reference counted.
> >>> It's better to describe what's the advantage of doing this.
> >> Yes, I can add that to the log. Although it's a niche use case, it was
> >> actually a long standing limitation / bug that vhost-user and
> >> vhost-kernel loggers can't co-exist per QEMU process, but today it's
> >> just silent failure that may be ended up with. This bug fix removes that
> >> implicit limitation in the code.
> > Ok.
> >
> >>>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> >>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>
> >>>> ---
> >>>> v3->v4:
> >>>> - remove checking NULL return value from vhost_log_get
> >>>>
> >>>> v2->v3:
> >>>> - remove non-effective assertion that never be reached
> >>>> - do not return NULL from vhost_log_get()
> >>>> - add neccessary assertions to vhost_log_get()
> >>>> ---
> >>>> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
> >>>> 1 file changed, 33 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>> index 2c9ac79..612f4db 100644
> >>>> --- a/hw/virtio/vhost.c
> >>>> +++ b/hw/virtio/vhost.c
> >>>> @@ -43,8 +43,8 @@
> >>>> do { } while (0)
> >>>> #endif
> >>>>
> >>>> -static struct vhost_log *vhost_log;
> >>>> -static struct vhost_log *vhost_log_shm;
> >>>> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>
> >>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>> static unsigned int used_memslots;
> >>>> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> >>>> r = -1;
> >>>> }
> >>>>
> >>>> + if (r == 0) {
> >>>> + assert(dev->vhost_ops->backend_type == backend_type);
> >>>> + }
> >>>> +
> >>> Under which condition could we hit this?
> >> Just in case some other function inadvertently corrupted this earlier,
> >> we have to capture discrepancy in the first place... On the other hand,
> >> it will be helpful for other vhost backend writers to diagnose day-one
> >> bug in the code. I feel just code comment here will not be
> >> sufficient/helpful.
> > See below.
> >
> >>> It seems not good to assert a local logic.
> >> It seems to me quite a few local asserts are in the same file already,
> >> vhost_save_backend_state,
> > For example it has assert for
> >
> > assert(!dev->started);
> >
> > which is not the logic of the function itself but require
> > vhost_dev_start() not to be called before.
> >
> > But it looks like this patch you assert the code just a few lines
> > above the assert itself?
> Yes, that was the intent - for e.g. xxx_ops may contain corrupted
> xxx_ops.backend_type already before coming to this
> vhost_set_backend_type() function. And we may capture this corrupted
> state by asserting the expected xxx_ops.backend_type (to be consistent
> with the backend_type passed in),
This can happen for all variables. Not sure why backend_ops is special.
> which needs be done in the first place
> when this discrepancy is detected. In practice I think there should be
> no harm to add this assert, but this will add warranted guarantee to the
> current code.
For example, such corruption can happen after the assert(), so it is a TOCTOU issue.
Thanks
>
> Regards,
> -Siwei
>
> >
> > dev->vhost_ops = &xxx_ops;
> >
> > ...
> >
> > assert(dev->vhost_ops->backend_type == backend_type)
> >
> > ?
> >
> > Thanks
> >
> >> vhost_load_backend_state,
> >> vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why local
> >> assert a problem?
> >>
> >> Thanks,
> >> -Siwei
> >>
> >>> Thanks
> >>>
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-18 22:16 ` Si-Wei Liu
@ 2024-03-20 3:27 ` Jason Wang
2024-03-20 21:02 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-20 3:27 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/17/2024 8:22 PM, Jason Wang wrote:
> > On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/14/2024 9:03 PM, Jason Wang wrote:
> >>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>> On setups with one or more virtio-net devices with vhost on,
> >>>> dirty tracking iteration increases cost the bigger the number
> >>>> amount of queues are set up e.g. on idle guests migration the
> >>>> following is observed with virtio-net with vhost=on:
> >>>>
> >>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>>>
> >>>> With high memory rates the symptom is lack of convergence as soon
> >>>> as it has a vhost device with a sufficiently high number of queues,
> >>>> the sufficient number of vhost devices.
> >>>>
> >>>> On every migration iteration (every 100msecs) it will redundantly
> >>>> query the *shared log* the number of queues configured with vhost
> >>>> that exist in the guest. For the virtqueue data, this is necessary,
> >>>> but not for the memory sections which are the same. So essentially
> >>>> we end up scanning the dirty log too often.
> >>>>
> >>>> To fix that, select a vhost device responsible for scanning the
> >>>> log with regards to memory sections dirty tracking. It is selected
> >>>> when we enable the logger (during migration) and cleared when we
> >>>> disable the logger. If the vhost logger device goes away for some
> >>>> reason, the logger will be re-selected from the rest of vhost
> >>>> devices.
> >>>>
> >>>> After making mem-section logger a singleton instance, constant cost
> >>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >>>> queues or how many vhost devices are configured:
> >>>>
> >>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>>>
> >>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>
> >>>> ---
> >>>> v3 -> v4:
> >>>> - add comment to clarify effect on cache locality and
> >>>> performance
> >>>>
> >>>> v2 -> v3:
> >>>> - add after-fix benchmark to commit log
> >>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >>>> - remove unneeded comparisons for backend_type
> >>>> - use QLIST array instead of single flat list to store vhost
> >>>> logger devices
> >>>> - simplify logger election logic
> >>>> ---
> >>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >>>> include/hw/virtio/vhost.h | 1 +
> >>>> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>> index 612f4db..58522f1 100644
> >>>> --- a/hw/virtio/vhost.c
> >>>> +++ b/hw/virtio/vhost.c
> >>>> @@ -45,6 +45,7 @@
> >>>>
> >>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>>>
> >>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>> static unsigned int used_memslots;
> >>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >>>> }
> >>>> }
> >>>>
> >>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >>>> +{
> >>>> + assert(dev->vhost_ops);
> >>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >>>> +
> >>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>> A dumb question, why not simple check
> >>>
> >>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >> Because we are not sure if the logger comes from vhost_log_shm[] or
> >> vhost_log[]. Don't want to complicate the check here by calling into
> >> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
> > It has very low overhead, isn't it?
> Whether this has low overhead will have to depend on the specific
> backend's implementation for .vhost_requires_shm_log(), which the common
> vhost layer should not assume upon or rely on the current implementation.
>
> >
> > static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> > {
> > return dev->vhost_ops->vhost_requires_shm_log &&
> > dev->vhost_ops->vhost_requires_shm_log(dev);
> > }
For example, if I understand the code correctly, the log type won't
change during runtime, so we could end up with a boolean to record that
instead of querying the ops?
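Something like the following (just a sketch; the cached field is
hypothetical and not part of the patch):

    /* once, when the backend is set up */
    hdev->log_is_shm = vhost_dev_log_is_shared(hdev);

    /* the hot path then avoids the indirect op call */
    log = hdev->log_is_shm ? vhost_log_shm[backend_type]
                           : vhost_log[backend_type];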
> >
> > And it helps to simplify the logic.
> Generally yes, but when it comes to hot path operations the performance
> consideration could override this principle. I think there's no harm to
> check against logger device cached in vhost layer itself, and the
> current patch does not create a lot of complexity or performance side
> effect (actually I think the conditional should be very straightforward
> to turn into just a couple of assembly compare and branch instructions
> rather than indirection through another jmp call).
Thanks
>
> -Siwei
>
> >
> > Thanks
> >
> >> -Siwei
> >>> ?
> >>>
> >>> Thanks
> >>>
>
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-20 3:25 ` Jason Wang
@ 2024-03-20 20:29 ` Si-Wei Liu
2024-03-21 3:53 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-20 20:29 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/19/2024 8:25 PM, Jason Wang wrote:
> On Tue, Mar 19, 2024 at 6:06 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/17/2024 8:20 PM, Jason Wang wrote:
>>> On Sat, Mar 16, 2024 at 2:33 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 3/14/2024 8:50 PM, Jason Wang wrote:
>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> There could be a mix of both vhost-user and vhost-kernel clients
>>>>>> in the same QEMU process, where separate vhost loggers for the
>>>>>> specific vhost type have to be used. Make the vhost logger per
>>>>>> backend type, and have them properly reference counted.
>>>>> It's better to describe what's the advantage of doing this.
>>>> Yes, I can add that to the log. Although it's a niche use case, it was
>>>> actually a long standing limitation / bug that vhost-user and
>>>> vhost-kernel loggers can't co-exist per QEMU process, but today it's
>>>> just silent failure that may be ended up with. This bug fix removes that
>>>> implicit limitation in the code.
>>> Ok.
>>>
>>>>>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>>>
>>>>>> ---
>>>>>> v3->v4:
>>>>>> - remove checking NULL return value from vhost_log_get
>>>>>>
>>>>>> v2->v3:
>>>>>> - remove non-effective assertion that never be reached
>>>>>> - do not return NULL from vhost_log_get()
>>>>>> - add neccessary assertions to vhost_log_get()
>>>>>> ---
>>>>>> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
>>>>>> 1 file changed, 33 insertions(+), 12 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>>> index 2c9ac79..612f4db 100644
>>>>>> --- a/hw/virtio/vhost.c
>>>>>> +++ b/hw/virtio/vhost.c
>>>>>> @@ -43,8 +43,8 @@
>>>>>> do { } while (0)
>>>>>> #endif
>>>>>>
>>>>>> -static struct vhost_log *vhost_log;
>>>>>> -static struct vhost_log *vhost_log_shm;
>>>>>> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>>>> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>>>
>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>>>> static unsigned int used_memslots;
>>>>>> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
>>>>>> r = -1;
>>>>>> }
>>>>>>
>>>>>> + if (r == 0) {
>>>>>> + assert(dev->vhost_ops->backend_type == backend_type);
>>>>>> + }
>>>>>> +
>>>>> Under which condition could we hit this?
>>>> Just in case some other function inadvertently corrupted this earlier,
>>>> we have to capture discrepancy in the first place... On the other hand,
>>>> it will be helpful for other vhost backend writers to diagnose day-one
>>>> bug in the code. I feel just code comment here will not be
>>>> sufficient/helpful.
>>> See below.
>>>
>>>>> It seems not good to assert a local logic.
>>>> It seems to me quite a few local asserts are in the same file already,
>>>> vhost_save_backend_state,
>>> For example it has assert for
>>>
>>> assert(!dev->started);
>>>
>>> which is not the logic of the function itself but require
>>> vhost_dev_start() not to be called before.
>>>
>>> But it looks like this patch you assert the code just a few lines
>>> above the assert itself?
>> Yes, that was the intent - for e.g. xxx_ops may contain corrupted
>> xxx_ops.backend_type already before coming to this
>> vhost_set_backend_type() function. And we may capture this corrupted
>> state by asserting the expected xxx_ops.backend_type (to be consistent
>> with the backend_type passed in),
> This can happen for all variables. Not sure why backend_ops is special.
The assert is only checking the backend_type field. The other op
fields in backend_ops have a similar assert within the op function
itself. For example, vhost_user_requires_shm_log() and a lot of other
vhost_user ops have the following:

assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);

vhost_vdpa_vq_get_addr() and a lot of other vhost_vdpa ops have:

assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);

The vhost_kernel ops have similar assertions as well.

The reason it has to be checked here is that the callers of
vhost_log_get() now pass dev->vhost_ops->backend_type to the API and
are unable to verify the validity of the backend_type by themselves.
vhost_log_get() has the necessary asserts to bounds-check the
vhost_log[] or vhost_log_shm[] array, but a specific assert against
the exact backend type in vhost_set_backend_type() further hardens
the implementation of vhost_log_get() and of the other backend ops.
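
To illustrate, the caller side ends up looking roughly like this
(simplified sketch, not the exact hunk; vhost_dev_log_resize() is one
such caller):

    log = vhost_log_get(dev->vhost_ops->backend_type, size,
                        vhost_dev_log_is_shared(dev));

so the only validation the API itself can do is the array bound check
on backend_type.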
>
>> which needs be done in the first place
>> when this discrepancy is detected. In practice I think there should be
>> no harm to add this assert, but this will add warranted guarantee to the
>> current code.
> For example, such corruption can happen after the assert() so a TOCTOU issue.
Sure, it's best effort only. As pointed out earlier, I think that,
together with this one, the other similar asserts already present in
the various backend ops could be helpful to nail down the earliest
point, or at least a specific range, where things may go wrong in the
first place.
Thanks,
-Siwei
>
> Thanks
>
>> Regards,
>> -Siwei
>>
>>> dev->vhost_ops = &xxx_ops;
>>>
>>> ...
>>>
>>> assert(dev->vhost_ops->backend_type == backend_type)
>>>
>>> ?
>>>
>>> Thanks
>>>
>>>> vhost_load_backend_state,
>>>> vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why local
>>>> assert a problem?
>>>>
>>>> Thanks,
>>>> -Siwei
>>>>
>>>>> Thanks
>>>>>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-20 3:27 ` Jason Wang
@ 2024-03-20 21:02 ` Si-Wei Liu
2024-03-21 3:56 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-20 21:02 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/19/2024 8:27 PM, Jason Wang wrote:
> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/17/2024 8:22 PM, Jason Wang wrote:
>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> On setups with one or more virtio-net devices with vhost on,
>>>>>> dirty tracking iteration increases cost the bigger the number
>>>>>> amount of queues are set up e.g. on idle guests migration the
>>>>>> following is observed with virtio-net with vhost=on:
>>>>>>
>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>>>>>
>>>>>> With high memory rates the symptom is lack of convergence as soon
>>>>>> as it has a vhost device with a sufficiently high number of queues,
>>>>>> the sufficient number of vhost devices.
>>>>>>
>>>>>> On every migration iteration (every 100msecs) it will redundantly
>>>>>> query the *shared log* the number of queues configured with vhost
>>>>>> that exist in the guest. For the virtqueue data, this is necessary,
>>>>>> but not for the memory sections which are the same. So essentially
>>>>>> we end up scanning the dirty log too often.
>>>>>>
>>>>>> To fix that, select a vhost device responsible for scanning the
>>>>>> log with regards to memory sections dirty tracking. It is selected
>>>>>> when we enable the logger (during migration) and cleared when we
>>>>>> disable the logger. If the vhost logger device goes away for some
>>>>>> reason, the logger will be re-selected from the rest of vhost
>>>>>> devices.
>>>>>>
>>>>>> After making mem-section logger a singleton instance, constant cost
>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>>>>>> queues or how many vhost devices are configured:
>>>>>>
>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>>>>>
>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>>>
>>>>>> ---
>>>>>> v3 -> v4:
>>>>>> - add comment to clarify effect on cache locality and
>>>>>> performance
>>>>>>
>>>>>> v2 -> v3:
>>>>>> - add after-fix benchmark to commit log
>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>>>>>> - remove unneeded comparisons for backend_type
>>>>>> - use QLIST array instead of single flat list to store vhost
>>>>>> logger devices
>>>>>> - simplify logger election logic
>>>>>> ---
>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>>>>>> include/hw/virtio/vhost.h | 1 +
>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>>> index 612f4db..58522f1 100644
>>>>>> --- a/hw/virtio/vhost.c
>>>>>> +++ b/hw/virtio/vhost.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>>
>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>>>>>
>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>>>> static unsigned int used_memslots;
>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>>>>>> }
>>>>>> }
>>>>>>
>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>>>>>> +{
>>>>>> + assert(dev->vhost_ops);
>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>>>>>> +
>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
>>>>> A dumb question, why not simple check
>>>>>
>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
>>>> vhost_log[]. Don't want to complicate the check here by calling into
>>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
>>> It has very low overhead, isn't it?
>> Whether this has low overhead will have to depend on the specific
>> backend's implementation for .vhost_requires_shm_log(), which the common
>> vhost layer should not assume upon or rely on the current implementation.
>>
>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>>> {
>>> return dev->vhost_ops->vhost_requires_shm_log &&
>>> dev->vhost_ops->vhost_requires_shm_log(dev);
>>> }
> For example, if I understand the code correctly, the log type won't be
> changed during runtime, so we can endup with a boolean to record that
> instead of a query ops?
Right now the log type won't change during runtime, but I am not sure
whether that should rule out a future revisit to allow changing it at
runtime; there would then be complex code involved to maintain that
state.

Other than this, I think it's insufficient to just check the shm log
v.s. the normal log. The logging code needs to identify a leading
logger device, which gets elected in vhost_dev_elect_mem_logger(),
because all the dev->log pointers refer to the same reference-counted
logger; otherwise we would have to add an extra field and complex
logic to maintain the election status. I thought Eugenio's earlier
suggestion was meant to simplify the logic in
vhost_dev_elect_mem_logger(): the QLIST_FIRST macro, which expands to
use the lh_first field of the QLIST, simply satisfies the basic need.
Why add extra logic to make the check ever more complex, and is there
any benefit in adding more fields to vhost_dev?
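
In other words, after macro expansion the hot path check is just a
pointer comparison against the per-type list head (sketch):

    return dev == vhost_log_devs[dev->vhost_ops->backend_type].lh_first;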
Thanks,
-Siwei
>
>>> And it helps to simplify the logic.
>> Generally yes, but when it comes to hot path operations the performance
>> consideration could override this principle. I think there's no harm to
>> check against logger device cached in vhost layer itself, and the
>> current patch does not create a lot of complexity or performance side
>> effect (actually I think the conditional should be very straightforward
>> to turn into just a couple of assembly compare and branch instructions
>> rather than indirection through another jmp call).
> Thanks
>
>> -Siwei
>>
>>> Thanks
>>>
>>>> -Siwei
>>>>> ?
>>>>>
>>>>> Thanks
>>>>>
* Re: [PATCH v4 1/2] vhost: dirty log should be per backend type
2024-03-20 20:29 ` Si-Wei Liu
@ 2024-03-21 3:53 ` Jason Wang
0 siblings, 0 replies; 22+ messages in thread
From: Jason Wang @ 2024-03-21 3:53 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Thu, Mar 21, 2024 at 4:29 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/19/2024 8:25 PM, Jason Wang wrote:
> > On Tue, Mar 19, 2024 at 6:06 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/17/2024 8:20 PM, Jason Wang wrote:
> >>> On Sat, Mar 16, 2024 at 2:33 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 3/14/2024 8:50 PM, Jason Wang wrote:
> >>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> There could be a mix of both vhost-user and vhost-kernel clients
> >>>>>> in the same QEMU process, where separate vhost loggers for the
> >>>>>> specific vhost type have to be used. Make the vhost logger per
> >>>>>> backend type, and have them properly reference counted.
> >>>>> It's better to describe what's the advantage of doing this.
> >>>> Yes, I can add that to the log. Although it's a niche use case, it was
> >>>> actually a long standing limitation / bug that vhost-user and
> >>>> vhost-kernel loggers can't co-exist per QEMU process, but today it's
> >>>> just silent failure that may be ended up with. This bug fix removes that
> >>>> implicit limitation in the code.
> >>> Ok.
> >>>
> >>>>>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> >>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>>>
> >>>>>> ---
> >>>>>> v3->v4:
> >>>>>> - remove checking NULL return value from vhost_log_get
> >>>>>>
> >>>>>> v2->v3:
> >>>>>> - remove non-effective assertion that never be reached
> >>>>>> - do not return NULL from vhost_log_get()
> >>>>>> - add neccessary assertions to vhost_log_get()
> >>>>>> ---
> >>>>>> hw/virtio/vhost.c | 45 +++++++++++++++++++++++++++++++++------------
> >>>>>> 1 file changed, 33 insertions(+), 12 deletions(-)
> >>>>>>
> >>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>>>> index 2c9ac79..612f4db 100644
> >>>>>> --- a/hw/virtio/vhost.c
> >>>>>> +++ b/hw/virtio/vhost.c
> >>>>>> @@ -43,8 +43,8 @@
> >>>>>> do { } while (0)
> >>>>>> #endif
> >>>>>>
> >>>>>> -static struct vhost_log *vhost_log;
> >>>>>> -static struct vhost_log *vhost_log_shm;
> >>>>>> +static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>>>> +static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>>>
> >>>>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>>>> static unsigned int used_memslots;
> >>>>>> @@ -287,6 +287,10 @@ static int vhost_set_backend_type(struct vhost_dev *dev,
> >>>>>> r = -1;
> >>>>>> }
> >>>>>>
> >>>>>> + if (r == 0) {
> >>>>>> + assert(dev->vhost_ops->backend_type == backend_type);
> >>>>>> + }
> >>>>>> +
> >>>>> Under which condition could we hit this?
> >>>> Just in case some other function inadvertently corrupted this earlier,
> >>>> we have to capture discrepancy in the first place... On the other hand,
> >>>> it will be helpful for other vhost backend writers to diagnose day-one
> >>>> bug in the code. I feel just code comment here will not be
> >>>> sufficient/helpful.
> >>> See below.
> >>>
> >>>>> It seems not good to assert a local logic.
> >>>> It seems to me quite a few local asserts are in the same file already,
> >>>> vhost_save_backend_state,
> >>> For example it has assert for
> >>>
> >>> assert(!dev->started);
> >>>
> >>> which is not the logic of the function itself but require
> >>> vhost_dev_start() not to be called before.
> >>>
> >>> But it looks like this patch you assert the code just a few lines
> >>> above the assert itself?
> >> Yes, that was the intent - for e.g. xxx_ops may contain corrupted
> >> xxx_ops.backend_type already before coming to this
> >> vhost_set_backend_type() function. And we may capture this corrupted
> >> state by asserting the expected xxx_ops.backend_type (to be consistent
> >> with the backend_type passed in),
> > This can happen for all variables. Not sure why backend_ops is special.
> The assert is just checking the backend_type field only. The other op
> fields in backend_ops have similar assert within the op function itself
> also. For e.g. vhost_user_requires_shm_log() and a lot of other
> vhost_user ops have the following:
>
> assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
>
> vhost_vdpa_vq_get_addr() and a lot of other vhost_vdpa ops have:
>
> assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
>
> vhost_kernel ops has similar assertions as well.
>
> The reason why it has to be checked against here is now the callers of
> vhost_log_get(), would pass in dev->vhost_ops->backend_type to the API,
> which are unable to verify the validity of the backend_type by
> themselves. The vhost_log_get() has necessary asserts to make bound
> check for the vhost_log[] or vhost_log_shm[] array, but specific assert
> against the exact backend type in vhost_set_backend_type() will further
> harden the implementation in vhost_log_get() and other backend ops.
As discussed, those assertions are there to enforce logic dependencies
on other functions. (The assignment of vhost_ops doesn't happen in
those functions.)
>
> >
> >> which needs be done in the first place
> >> when this discrepancy is detected. In practice I think there should be
> >> no harm to add this assert, but this will add warranted guarantee to the
> >> current code.
> > For example, such corruption can happen after the assert() so a TOCTOU issue.
> Sure, it's best effort only. As pointed out earlier, I think together
> with this, there are other similar asserts already in various backend
> ops, which could be helpful to nail down the earliest point or a
> specific range where things may go wrong in the first place.
Ok, I think I'd leave the decision to Michael.
Thanks
>
> Thanks,
> -Siwei
>
> >
> > Thanks
> >
> >> Regards,
> >> -Siwei
> >>
> >>> dev->vhost_ops = &xxx_ops;
> >>>
> >>> ...
> >>>
> >>> assert(dev->vhost_ops->backend_type == backend_type)
> >>>
> >>> ?
> >>>
> >>> Thanks
> >>>
> >>>> vhost_load_backend_state,
> >>>> vhost_virtqueue_mask, vhost_config_mask, just to name a few. Why local
> >>>> assert a problem?
> >>>>
> >>>> Thanks,
> >>>> -Siwei
> >>>>
> >>>>> Thanks
> >>>>>
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-20 21:02 ` Si-Wei Liu
@ 2024-03-21 3:56 ` Jason Wang
2024-03-21 21:42 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-21 3:56 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/19/2024 8:27 PM, Jason Wang wrote:
> > On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/17/2024 8:22 PM, Jason Wang wrote:
> >>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
> >>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> On setups with one or more virtio-net devices with vhost on,
> >>>>>> dirty tracking iteration increases cost the bigger the number
> >>>>>> amount of queues are set up e.g. on idle guests migration the
> >>>>>> following is observed with virtio-net with vhost=on:
> >>>>>>
> >>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>>>>>
> >>>>>> With high memory rates the symptom is lack of convergence as soon
> >>>>>> as it has a vhost device with a sufficiently high number of queues,
> >>>>>> the sufficient number of vhost devices.
> >>>>>>
> >>>>>> On every migration iteration (every 100msecs) it will redundantly
> >>>>>> query the *shared log* the number of queues configured with vhost
> >>>>>> that exist in the guest. For the virtqueue data, this is necessary,
> >>>>>> but not for the memory sections which are the same. So essentially
> >>>>>> we end up scanning the dirty log too often.
> >>>>>>
> >>>>>> To fix that, select a vhost device responsible for scanning the
> >>>>>> log with regards to memory sections dirty tracking. It is selected
> >>>>>> when we enable the logger (during migration) and cleared when we
> >>>>>> disable the logger. If the vhost logger device goes away for some
> >>>>>> reason, the logger will be re-selected from the rest of vhost
> >>>>>> devices.
> >>>>>>
> >>>>>> After making mem-section logger a singleton instance, constant cost
> >>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >>>>>> queues or how many vhost devices are configured:
> >>>>>>
> >>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>>>>>
> >>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>>>
> >>>>>> ---
> >>>>>> v3 -> v4:
> >>>>>> - add comment to clarify effect on cache locality and
> >>>>>> performance
> >>>>>>
> >>>>>> v2 -> v3:
> >>>>>> - add after-fix benchmark to commit log
> >>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >>>>>> - remove unneeded comparisons for backend_type
> >>>>>> - use QLIST array instead of single flat list to store vhost
> >>>>>> logger devices
> >>>>>> - simplify logger election logic
> >>>>>> ---
> >>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >>>>>> include/hw/virtio/vhost.h | 1 +
> >>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>>>>>
> >>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>>>> index 612f4db..58522f1 100644
> >>>>>> --- a/hw/virtio/vhost.c
> >>>>>> +++ b/hw/virtio/vhost.c
> >>>>>> @@ -45,6 +45,7 @@
> >>>>>>
> >>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>>>>>
> >>>>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>>>> static unsigned int used_memslots;
> >>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >>>>>> }
> >>>>>> }
> >>>>>>
> >>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >>>>>> +{
> >>>>>> + assert(dev->vhost_ops);
> >>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >>>>>> +
> >>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>>>> A dumb question, why not simple check
> >>>>>
> >>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >>>> Because we are not sure if the logger comes from vhost_log_shm[] or
> >>>> vhost_log[]. Don't want to complicate the check here by calling into
> >>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
> >>> It has very low overhead, isn't it?
> >> Whether this has low overhead will have to depend on the specific
> >> backend's implementation for .vhost_requires_shm_log(), which the common
> >> vhost layer should not assume upon or rely on the current implementation.
> >>
> >>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> >>> {
> >>> return dev->vhost_ops->vhost_requires_shm_log &&
> >>> dev->vhost_ops->vhost_requires_shm_log(dev);
> >>> }
> > For example, if I understand the code correctly, the log type won't be
> > changed during runtime, so we can endup with a boolean to record that
> > instead of a query ops?
> Right now the log type won't change during runtime, but I am not sure if
> this may prohibit future revisit to allow change at the runtime,
We can be bothered when we have such a request then.
> then
> there'll be complex code involvled to maintain the state.
>
> Other than this, I think it's insufficient to just check the shm log
> v.s. normal log. The logger device requires to identify a leading logger
> device that gets elected in vhost_dev_elect_mem_logger(), as all the
> dev->log points to the same logger that is refenerce counted, that we
> have to add extra field and complex logic to maintain the election
> status.
One thing I don't understand here (and in the changelog) is why do we
need an election here?
> I thought that Eugenio's previous suggestion tried to simplify
> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
> gets expanded to use the lh_first field for the QLIST would simply
> satisfy the basic need. Why extra logic to make the check ever more
> complex, is there any benefit by adding more fields to the vhost_dev?
I don't get here, the idea is to just pick one shared log which should
be much more simpler than what is proposed here.
Thanks
>
>
> Thanks,
> -Siwei
>
> >
> >>> And it helps to simplify the logic.
> >> Generally yes, but when it comes to hot path operations the performance
> >> consideration could override this principle. I think there's no harm to
> >> check against logger device cached in vhost layer itself, and the
> >> current patch does not create a lot of complexity or performance side
> >> effect (actually I think the conditional should be very straightforward
> >> to turn into just a couple of assembly compare and branch instructions
> >> rather than indirection through another jmp call).
> > Thanks
> >
> >> -Siwei
> >>
> >>> Thanks
> >>>
> >>>> -Siwei
> >>>>> ?
> >>>>>
> >>>>> Thanks
> >>>>>
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-21 3:56 ` Jason Wang
@ 2024-03-21 21:42 ` Si-Wei Liu
2024-03-22 5:08 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-21 21:42 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/20/2024 8:56 PM, Jason Wang wrote:
> On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/19/2024 8:27 PM, Jason Wang wrote:
>>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
>>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
>>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>> On setups with one or more virtio-net devices with vhost on,
>>>>>>>> dirty tracking iteration increases cost the bigger the number
>>>>>>>> amount of queues are set up e.g. on idle guests migration the
>>>>>>>> following is observed with virtio-net with vhost=on:
>>>>>>>>
>>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>>>>>>>
>>>>>>>> With high memory rates the symptom is lack of convergence as soon
>>>>>>>> as it has a vhost device with a sufficiently high number of queues,
>>>>>>>> the sufficient number of vhost devices.
>>>>>>>>
>>>>>>>> On every migration iteration (every 100msecs) it will redundantly
>>>>>>>> query the *shared log* the number of queues configured with vhost
>>>>>>>> that exist in the guest. For the virtqueue data, this is necessary,
>>>>>>>> but not for the memory sections which are the same. So essentially
>>>>>>>> we end up scanning the dirty log too often.
>>>>>>>>
>>>>>>>> To fix that, select a vhost device responsible for scanning the
>>>>>>>> log with regards to memory sections dirty tracking. It is selected
>>>>>>>> when we enable the logger (during migration) and cleared when we
>>>>>>>> disable the logger. If the vhost logger device goes away for some
>>>>>>>> reason, the logger will be re-selected from the rest of vhost
>>>>>>>> devices.
>>>>>>>>
>>>>>>>> After making mem-section logger a singleton instance, constant cost
>>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>>>>>>>> queues or how many vhost devices are configured:
>>>>>>>>
>>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>>>>>>>
>>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>>>>>
>>>>>>>> ---
>>>>>>>> v3 -> v4:
>>>>>>>> - add comment to clarify effect on cache locality and
>>>>>>>> performance
>>>>>>>>
>>>>>>>> v2 -> v3:
>>>>>>>> - add after-fix benchmark to commit log
>>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>>>>>>>> - remove unneeded comparisons for backend_type
>>>>>>>> - use QLIST array instead of single flat list to store vhost
>>>>>>>> logger devices
>>>>>>>> - simplify logger election logic
>>>>>>>> ---
>>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>>>>>>>> include/hw/virtio/vhost.h | 1 +
>>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>>>>> index 612f4db..58522f1 100644
>>>>>>>> --- a/hw/virtio/vhost.c
>>>>>>>> +++ b/hw/virtio/vhost.c
>>>>>>>> @@ -45,6 +45,7 @@
>>>>>>>>
>>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>
>>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>>>>>> static unsigned int used_memslots;
>>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>>>>>>>> }
>>>>>>>> }
>>>>>>>>
>>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>>>>>>>> +{
>>>>>>>> + assert(dev->vhost_ops);
>>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>>>>>>>> +
>>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
>>>>>>> A dumb question, why not simple check
>>>>>>>
>>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
>>>>>> vhost_log[]. Don't want to complicate the check here by calling into
>>>>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
>>>>> It has very low overhead, isn't it?
>>>> Whether this has low overhead will have to depend on the specific
>>>> backend's implementation for .vhost_requires_shm_log(), which the common
>>>> vhost layer should not assume upon or rely on the current implementation.
>>>>
>>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>>>>> {
>>>>> return dev->vhost_ops->vhost_requires_shm_log &&
>>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
>>>>> }
>>> For example, if I understand the code correctly, the log type won't be
>>> changed during runtime, so we can endup with a boolean to record that
>>> instead of a query ops?
>> Right now the log type won't change during runtime, but I am not sure if
>> this may prohibit future revisit to allow change at the runtime,
> We can be bothered when we have such a request then.
>
>> then
>> there'll be complex code involvled to maintain the state.
>>
>> Other than this, I think it's insufficient to just check the shm log
>> v.s. normal log. The logger device requires to identify a leading logger
>> device that gets elected in vhost_dev_elect_mem_logger(), as all the
>> dev->log points to the same logger that is refenerce counted, that we
>> have to add extra field and complex logic to maintain the election
>> status.
> One thing I don't understand here (and in the changelog) is why do we
> need an election here?
vhost_sync_dirty_bitmap() not only scans the guest memory sections but
also the specific regions for the virtqueues (used rings). To save as
many CPU cycles as possible, the guest memory must be scanned only
once per log iteration, while the logging for the used rings still has
to go through the specific vhost instance, so every vhost_dev instance
still keeps its dev->log pointer to the shared log as-is. Generally
the memory-section logger could be picked from an arbitrary vhost_dev
instance, but to keep the code simple, performant and predictable, the
logger selection is made on the control path at vhost add/remove time
rather than being determined at dirty log collection time, the latter
of which is in the hot path.
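
Conceptually the sync path then looks like this (pseudocode only, in
the same spirit as the patch; the region loop and exact signatures
are elided):

    vhost_sync_dirty_bitmap(dev, section, first, last) {
        if (vhost_dev_should_log(dev)) {
            /* only the elected device scans the guest memory sections */
            for each memory region:
                vhost_dev_sync_region(dev, section, first, last,
                                      reg_start, reg_end);
        }
        /* every device still syncs the used rings of its own vqs */
        for each vq of dev:
            vhost_dev_sync_region(dev, section, first, last,
                                  vq->used_phys,
                                  vq->used_phys + vq->used_size);
    }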
>
>> I thought that Eugenio's previous suggestion tried to simplify
>> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
>> gets expanded to use the lh_first field for the QLIST would simply
>> satisfy the basic need. Why extra logic to make the check ever more
>> complex, is there any benefit by adding more fields to the vhost_dev?
> I don't get here, the idea is to just pick one shared log which should
> be much more simpler than what is proposed here.
The code you showed earlier won't work, as all vhost_dev instances
point to the same dev->log device...
Regards,
-Siwei
>
> Thanks
>
>>
>> Thanks,
>> -Siwei
>>
>>>>> And it helps to simplify the logic.
>>>> Generally yes, but when it comes to hot path operations the performance
>>>> consideration could override this principle. I think there's no harm to
>>>> check against logger device cached in vhost layer itself, and the
>>>> current patch does not create a lot of complexity or performance side
>>>> effect (actually I think the conditional should be very straightforward
>>>> to turn into just a couple of assembly compare and branch instructions
>>>> rather than indirection through another jmp call).
>>> Thanks
>>>
>>>> -Siwei
>>>>
>>>>> Thanks
>>>>>
>>>>>> -Siwei
>>>>>>> ?
>>>>>>>
>>>>>>> Thanks
>>>>>>>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-21 21:42 ` Si-Wei Liu
@ 2024-03-22 5:08 ` Jason Wang
2024-03-22 21:13 ` Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-22 5:08 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/20/2024 8:56 PM, Jason Wang wrote:
> > On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/19/2024 8:27 PM, Jason Wang wrote:
> >>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
> >>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
> >>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>> On setups with one or more virtio-net devices with vhost on,
> >>>>>>>> dirty tracking iteration increases cost the bigger the number
> >>>>>>>> amount of queues are set up e.g. on idle guests migration the
> >>>>>>>> following is observed with virtio-net with vhost=on:
> >>>>>>>>
> >>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>
> >>>>>>>> With high memory rates the symptom is lack of convergence as soon
> >>>>>>>> as it has a vhost device with a sufficiently high number of queues,
> >>>>>>>> the sufficient number of vhost devices.
> >>>>>>>>
> >>>>>>>> On every migration iteration (every 100msecs) it will redundantly
> >>>>>>>> query the *shared log* the number of queues configured with vhost
> >>>>>>>> that exist in the guest. For the virtqueue data, this is necessary,
> >>>>>>>> but not for the memory sections which are the same. So essentially
> >>>>>>>> we end up scanning the dirty log too often.
> >>>>>>>>
> >>>>>>>> To fix that, select a vhost device responsible for scanning the
> >>>>>>>> log with regards to memory sections dirty tracking. It is selected
> >>>>>>>> when we enable the logger (during migration) and cleared when we
> >>>>>>>> disable the logger. If the vhost logger device goes away for some
> >>>>>>>> reason, the logger will be re-selected from the rest of vhost
> >>>>>>>> devices.
> >>>>>>>>
> >>>>>>>> After making mem-section logger a singleton instance, constant cost
> >>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >>>>>>>> queues or how many vhost devices are configured:
> >>>>>>>>
> >>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>
> >>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>>>>>
> >>>>>>>> ---
> >>>>>>>> v3 -> v4:
> >>>>>>>> - add comment to clarify effect on cache locality and
> >>>>>>>> performance
> >>>>>>>>
> >>>>>>>> v2 -> v3:
> >>>>>>>> - add after-fix benchmark to commit log
> >>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >>>>>>>> - remove unneeded comparisons for backend_type
> >>>>>>>> - use QLIST array instead of single flat list to store vhost
> >>>>>>>> logger devices
> >>>>>>>> - simplify logger election logic
> >>>>>>>> ---
> >>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >>>>>>>> include/hw/virtio/vhost.h | 1 +
> >>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>>>>>> index 612f4db..58522f1 100644
> >>>>>>>> --- a/hw/virtio/vhost.c
> >>>>>>>> +++ b/hw/virtio/vhost.c
> >>>>>>>> @@ -45,6 +45,7 @@
> >>>>>>>>
> >>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>
> >>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>>>>>> static unsigned int used_memslots;
> >>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >>>>>>>> }
> >>>>>>>> }
> >>>>>>>>
> >>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >>>>>>>> +{
> >>>>>>>> + assert(dev->vhost_ops);
> >>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >>>>>>>> +
> >>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>>>>>> A dumb question, why not simple check
> >>>>>>>
> >>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
> >>>>>> vhost_log[]. Don't want to complicate the check here by calling into
> >>>>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
> >>>>> It has very low overhead, isn't it?
> >>>> Whether this has low overhead will have to depend on the specific
> >>>> backend's implementation for .vhost_requires_shm_log(), which the common
> >>>> vhost layer should not assume upon or rely on the current implementation.
> >>>>
> >>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> >>>>> {
> >>>>> return dev->vhost_ops->vhost_requires_shm_log &&
> >>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
> >>>>> }
> >>> For example, if I understand the code correctly, the log type won't be
> >>> changed during runtime, so we can endup with a boolean to record that
> >>> instead of a query ops?
> >> Right now the log type won't change during runtime, but I am not sure if
> >> this may prohibit future revisit to allow change at the runtime,
> > We can be bothered when we have such a request then.
> >
> >> then
> >> there'll be complex code involvled to maintain the state.
> >>
> >> Other than this, I think it's insufficient to just check the shm log
> >> v.s. normal log. The logger device requires to identify a leading logger
> >> device that gets elected in vhost_dev_elect_mem_logger(), as all the
> >> dev->log points to the same logger that is refenerce counted, that we
> >> have to add extra field and complex logic to maintain the election
> >> status.
> > One thing I don't understand here (and in the changelog) is why do we
> > need an election here?
>
> vhost_sync_dirty_bitmap() not just scans the guest memory sections but
> the specific one for virtqueues (used rings) also. To save more CPU
> cycles to the best extend, the guest memory must be scanned only once in
> each log iteration, though the logging for used rings would still have
> to use the specific vhost instance, so all vhost_device instance still
> keeps the dev->log pointer to the shared log as-is. Generally the shared
> memory logger can be picked from an arbitrary vhost_device instance, but
> to keep the code simple, performant and predictable
This is the point: I don't see why election is simpler than picking an
arbitrary shared log in this case.
> , logger selection is
> made on the control path at the vhost add/remove time rather than be
> determined at the dirty log collection runtime, the latter of which is
> in the hotpath.
>
> >
> >> I thought that Eugenio's previous suggestion tried to simplify
> >> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
> >> gets expanded to use the lh_first field for the QLIST would simply
> >> satisfy the basic need. Why extra logic to make the check ever more
> >> complex, is there any benefit by adding more fields to the vhost_dev?
> > I don't get here, the idea is to just pick one shared log which should
> > be much more simpler than what is proposed here.
> The code you showed earlier won't work as all vhost_device instance
> points to the same dev->log device...
This part I don't understand.
Thanks
>
> Regards,
> -Siwei
> >
> > Thanks
> >
> >>
> >> Thanks,
> >> -Siwei
> >>
> >>>>> And it helps to simplify the logic.
> >>>> Generally yes, but when it comes to hot path operations the performance
> >>>> consideration could override this principle. I think there's no harm to
> >>>> check against logger device cached in vhost layer itself, and the
> >>>> current patch does not create a lot of complexity or performance side
> >>>> effect (actually I think the conditional should be very straightforward
> >>>> to turn into just a couple of assembly compare and branch instructions
> >>>> rather than indirection through another jmp call).
> >>> Thanks
> >>>
> >>>> -Siwei
> >>>>
> >>>>> Thanks
> >>>>>
> >>>>>> -Siwei
> >>>>>>> ?
> >>>>>>>
> >>>>>>> Thanks
> >>>>>>>
>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-22 5:08 ` Jason Wang
@ 2024-03-22 21:13 ` Si-Wei Liu
2024-03-25 6:13 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-22 21:13 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/21/2024 10:08 PM, Jason Wang wrote:
> On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/20/2024 8:56 PM, Jason Wang wrote:
>>> On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 3/19/2024 8:27 PM, Jason Wang wrote:
>>>>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
>>>>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
>>>>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>>>> On setups with one or more virtio-net devices with vhost on,
>>>>>>>>>> dirty tracking iteration increases cost the bigger the number
>>>>>>>>>> amount of queues are set up e.g. on idle guests migration the
>>>>>>>>>> following is observed with virtio-net with vhost=on:
>>>>>>>>>>
>>>>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>>>>>>>>>
>>>>>>>>>> With high memory rates the symptom is lack of convergence as soon
>>>>>>>>>> as it has a vhost device with a sufficiently high number of queues,
>>>>>>>>>> the sufficient number of vhost devices.
>>>>>>>>>>
>>>>>>>>>> On every migration iteration (every 100msecs) it will redundantly
>>>>>>>>>> query the *shared log* the number of queues configured with vhost
>>>>>>>>>> that exist in the guest. For the virtqueue data, this is necessary,
>>>>>>>>>> but not for the memory sections which are the same. So essentially
>>>>>>>>>> we end up scanning the dirty log too often.
>>>>>>>>>>
>>>>>>>>>> To fix that, select a vhost device responsible for scanning the
>>>>>>>>>> log with regards to memory sections dirty tracking. It is selected
>>>>>>>>>> when we enable the logger (during migration) and cleared when we
>>>>>>>>>> disable the logger. If the vhost logger device goes away for some
>>>>>>>>>> reason, the logger will be re-selected from the rest of vhost
>>>>>>>>>> devices.
>>>>>>>>>>
>>>>>>>>>> After making mem-section logger a singleton instance, constant cost
>>>>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>>>>>>>>>> queues or how many vhost devices are configured:
>>>>>>>>>>
>>>>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>>>>>>>>>
>>>>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>> v3 -> v4:
>>>>>>>>>> - add comment to clarify effect on cache locality and
>>>>>>>>>> performance
>>>>>>>>>>
>>>>>>>>>> v2 -> v3:
>>>>>>>>>> - add after-fix benchmark to commit log
>>>>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>>>>>>>>>> - remove unneeded comparisons for backend_type
>>>>>>>>>> - use QLIST array instead of single flat list to store vhost
>>>>>>>>>> logger devices
>>>>>>>>>> - simplify logger election logic
>>>>>>>>>> ---
>>>>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>>>>>>>>>> include/hw/virtio/vhost.h | 1 +
>>>>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>>>>>>> index 612f4db..58522f1 100644
>>>>>>>>>> --- a/hw/virtio/vhost.c
>>>>>>>>>> +++ b/hw/virtio/vhost.c
>>>>>>>>>> @@ -45,6 +45,7 @@
>>>>>>>>>>
>>>>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>>
>>>>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>>>>>>>> static unsigned int used_memslots;
>>>>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>>>>>>>>>> }
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>>>>>>>>>> +{
>>>>>>>>>> + assert(dev->vhost_ops);
>>>>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>>>>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>>>>>>>>>> +
>>>>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
>>>>>>>>> A dumb question, why not simple check
>>>>>>>>>
>>>>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>>>>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
>>>>>>>> vhost_log[]. Don't want to complicate the check here by calling into
>>>>>>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
>>>>>>> It has very low overhead, isn't it?
>>>>>> Whether this has low overhead will have to depend on the specific
>>>>>> backend's implementation for .vhost_requires_shm_log(), which the common
>>>>>> vhost layer should not assume upon or rely on the current implementation.
>>>>>>
>>>>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>>>>>>> {
>>>>>>> return dev->vhost_ops->vhost_requires_shm_log &&
>>>>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
>>>>>>> }
>>>>> For example, if I understand the code correctly, the log type won't be
>>>>> changed during runtime, so we can endup with a boolean to record that
>>>>> instead of a query ops?
>>>> Right now the log type won't change during runtime, but I am not sure if
>>>> this may prohibit future revisit to allow change at the runtime,
>>> We can be bothered when we have such a request then.
>>>
>>>> then
>>>> there'll be complex code involvled to maintain the state.
>>>>
>>>> Other than this, I think it's insufficient to just check the shm log
>>>> v.s. normal log. The logger device requires to identify a leading logger
>>>> device that gets elected in vhost_dev_elect_mem_logger(), as all the
>>>> dev->log points to the same logger that is refenerce counted, that we
>>>> have to add extra field and complex logic to maintain the election
>>>> status.
>>> One thing I don't understand here (and in the changelog) is why do we
>>> need an election here?
>> vhost_sync_dirty_bitmap() not just scans the guest memory sections but
>> the specific one for virtqueues (used rings) also. To save more CPU
>> cycles to the best extend, the guest memory must be scanned only once in
>> each log iteration, though the logging for used rings would still have
>> to use the specific vhost instance, so all vhost_device instance still
>> keeps the dev->log pointer to the shared log as-is. Generally the shared
>> memory logger can be picked from an arbitrary vhost_device instance, but
>> to keep the code simple, performant and predictable
> This is the point, I don't see why election is simpler than picking an
> arbitrary shared log in this case.
Maybe I missed your point, but I am confused and fail to understand
why electing a fixed vhost_dev is not as simple. Regardless of the
specifics, I think the point is that one _fixed_ vhost_dev has to be
picked, amongst all the instances of a backend type, to be in charge
of logging guest memory, no matter whether it sits at the start of the
list or not.
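
For reference, the election itself is meant to be trivial; roughly
what vhost_dev_elect_mem_logger() in the patch does (simplified
sketch, and the QLIST_ENTRY field name logdev_entry is assumed here):

    static void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
    {
        VhostBackendType backend_type = hdev->vhost_ops->backend_type;

        if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
            if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
                /* first device of this backend type becomes the logger */
                QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
                                  hdev, logdev_entry);
            } else {
                /*
                 * Later devices queue up behind the current head, so
                 * the elected logger (and its cache locality) stays put.
                 */
                QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
                                   hdev, logdev_entry);
            }
        } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
            /* logger going away: the next device on the list takes over */
            QLIST_REMOVE(hdev, logdev_entry);
        }
    }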
>
>> , logger selection is
>> made on the control path at the vhost add/remove time rather than be
>> determined at the dirty log collection runtime, the latter of which is
>> in the hotpath.
>>
>>>> I thought that Eugenio's previous suggestion tried to simplify
>>>> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
>>>> gets expanded to use the lh_first field for the QLIST would simply
>>>> satisfy the basic need. Why extra logic to make the check ever more
>>>> complex, is there any benefit by adding more fields to the vhost_dev?
>>> I don't get here, the idea is to just pick one shared log which should
>>> be much more simpler than what is proposed here.
>> The code you showed earlier won't work as all vhost_device instance
>> points to the same dev->log device...
> This part I don't understand.
vhost_log_get() has the following:
    log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];

    if (!log || log->size != size) {
        log = vhost_log_alloc(size, share);
        if (share) {
            vhost_log_shm[backend_type] = log;
        } else {
            vhost_log[backend_type] = log;
        }
    } else {
        ++log->refcnt;
    }
So for a specific backend type, vhost_log_get() would return the same
logger device (stored at either vhost_log_shm[] or vhost_log[]) to all
vhost_dev instances, and the check you suggested earlier:
dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
will always hold true if the vhost_dev instance (representing the
specific virtqueue) is active.
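
To make it concrete (hypothetical example, not code from the patch):

    /* two vhost devices of the same backend type, non-shm log */
    d1->log = vhost_log_get(VHOST_BACKEND_TYPE_KERNEL, size, false);
    d2->log = vhost_log_get(VHOST_BACKEND_TYPE_KERNEL, size, false); /* refcnt++ */

    /*
     * Both now satisfy d->log == vhost_log[VHOST_BACKEND_TYPE_KERNEL],
     * so that comparison is true for every active device and cannot
     * single out one instance as the memory-section logger.
     */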
Regards,
-Siwei
>
> Thanks
>
>> Regards,
>> -Siwei
>>> Thanks
>>>
>>>> Thanks,
>>>> -Siwei
>>>>
>>>>>>> And it helps to simplify the logic.
>>>>>> Generally yes, but when it comes to hot path operations the performance
>>>>>> consideration could override this principle. I think there's no harm to
>>>>>> check against logger device cached in vhost layer itself, and the
>>>>>> current patch does not create a lot of complexity or performance side
>>>>>> effect (actually I think the conditional should be very straightforward
>>>>>> to turn into just a couple of assembly compare and branch instructions
>>>>>> rather than indirection through another jmp call).
>>>>> Thanks
>>>>>
>>>>>> -Siwei
>>>>>>
>>>>>>> Thanks
>>>>>>>
>>>>>>>> -Siwei
>>>>>>>>> ?
>>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>>
* Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-22 21:13 ` Si-Wei Liu
@ 2024-03-25 6:13 ` Jason Wang
2024-03-25 23:20 ` [External] : " Si-Wei Liu
0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2024-03-25 6:13 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Sat, Mar 23, 2024 at 5:14 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/21/2024 10:08 PM, Jason Wang wrote:
> > On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/20/2024 8:56 PM, Jason Wang wrote:
> >>> On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 3/19/2024 8:27 PM, Jason Wang wrote:
> >>>>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
> >>>>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
> >>>>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>>>> On setups with one or more virtio-net devices with vhost on,
> >>>>>>>>>> dirty tracking iteration increases cost the bigger the number
> >>>>>>>>>> amount of queues are set up e.g. on idle guests migration the
> >>>>>>>>>> following is observed with virtio-net with vhost=on:
> >>>>>>>>>>
> >>>>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>>>
> >>>>>>>>>> With high memory rates the symptom is lack of convergence as soon
> >>>>>>>>>> as it has a vhost device with a sufficiently high number of queues,
> >>>>>>>>>> the sufficient number of vhost devices.
> >>>>>>>>>>
> >>>>>>>>>> On every migration iteration (every 100msecs) it will redundantly
> >>>>>>>>>> query the *shared log* the number of queues configured with vhost
> >>>>>>>>>> that exist in the guest. For the virtqueue data, this is necessary,
> >>>>>>>>>> but not for the memory sections which are the same. So essentially
> >>>>>>>>>> we end up scanning the dirty log too often.
> >>>>>>>>>>
> >>>>>>>>>> To fix that, select a vhost device responsible for scanning the
> >>>>>>>>>> log with regards to memory sections dirty tracking. It is selected
> >>>>>>>>>> when we enable the logger (during migration) and cleared when we
> >>>>>>>>>> disable the logger. If the vhost logger device goes away for some
> >>>>>>>>>> reason, the logger will be re-selected from the rest of vhost
> >>>>>>>>>> devices.
> >>>>>>>>>>
> >>>>>>>>>> After making mem-section logger a singleton instance, constant cost
> >>>>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >>>>>>>>>> queues or how many vhost devices are configured:
> >>>>>>>>>>
> >>>>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>>>
> >>>>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>>>>>>>
> >>>>>>>>>> ---
> >>>>>>>>>> v3 -> v4:
> >>>>>>>>>> - add comment to clarify effect on cache locality and
> >>>>>>>>>> performance
> >>>>>>>>>>
> >>>>>>>>>> v2 -> v3:
> >>>>>>>>>> - add after-fix benchmark to commit log
> >>>>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >>>>>>>>>> - remove unneeded comparisons for backend_type
> >>>>>>>>>> - use QLIST array instead of single flat list to store vhost
> >>>>>>>>>> logger devices
> >>>>>>>>>> - simplify logger election logic
> >>>>>>>>>> ---
> >>>>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >>>>>>>>>> include/hw/virtio/vhost.h | 1 +
> >>>>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>>>>>>>> index 612f4db..58522f1 100644
> >>>>>>>>>> --- a/hw/virtio/vhost.c
> >>>>>>>>>> +++ b/hw/virtio/vhost.c
> >>>>>>>>>> @@ -45,6 +45,7 @@
> >>>>>>>>>>
> >>>>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>>
> >>>>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>>>>>>>> static unsigned int used_memslots;
> >>>>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >>>>>>>>>> }
> >>>>>>>>>> }
> >>>>>>>>>>
> >>>>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >>>>>>>>>> +{
> >>>>>>>>>> + assert(dev->vhost_ops);
> >>>>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >>>>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >>>>>>>>>> +
> >>>>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>>>>>>>> A dumb question, why not simple check
> >>>>>>>>>
> >>>>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >>>>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
> >>>>>>>> vhost_log[]. Don't want to complicate the check here by calling into
> >>>>>>>> vhost_dev_log_is_shared() everytime when the .log_sync() is called.
> >>>>>>> It has very low overhead, isn't it?
> >>>>>> Whether this has low overhead will have to depend on the specific
> >>>>>> backend's implementation for .vhost_requires_shm_log(), which the common
> >>>>>> vhost layer should not assume upon or rely on the current implementation.
> >>>>>>
> >>>>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> >>>>>>> {
> >>>>>>> return dev->vhost_ops->vhost_requires_shm_log &&
> >>>>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
> >>>>>>> }
> >>>>> For example, if I understand the code correctly, the log type won't be
> >>>>> changed during runtime, so we can endup with a boolean to record that
> >>>>> instead of a query ops?
> >>>> Right now the log type won't change during runtime, but I am not sure if
> >>>> this may prohibit future revisit to allow change at the runtime,
> >>> We can be bothered when we have such a request then.
> >>>
> >>>> then
> >>>> there'll be complex code involvled to maintain the state.
> >>>>
> >>>> Other than this, I think it's insufficient to just check the shm log
> >>>> vs. the normal log. The logger code needs to identify a leading logger
> >>>> device that gets elected in vhost_dev_elect_mem_logger(); as all the
> >>>> dev->log pointers refer to the same reference-counted logger, we would
> >>>> have to add an extra field and complex logic to maintain the election
> >>>> status.
> >>> One thing I don't understand here (and in the changelog) is why do we
> >>> need an election here?
> >> vhost_sync_dirty_bitmap() scans not just the guest memory sections but
> >> also the specific regions for the virtqueues (used rings). To save as many
> >> CPU cycles as possible, the guest memory must be scanned only once in
> >> each log iteration, though the logging for used rings would still have
> >> to use the specific vhost instance, so all vhost_device instance still
> >> keeps the dev->log pointer to the shared log as-is. Generally the shared
> >> memory logger can be picked from an arbitrary vhost_device instance, but
> >> to keep the code simple, performant and predictable
> > This is the point, I don't see why election is simpler than picking an
> > arbitrary shared log in this case.
> Maybe I missed your point, but I am confused and fail to understand why
> electing a fixed vhost_dev is not as simple? Regardless of the
> specifics, I think the point is one _fixed_ vhost_dev has to be picked
> amongst all the instances per backend type in charge of logging guest
> memory, no matter whether it's at the start of the list or not.
See below.
>
> >
> >> , logger selection is
> >> made on the control path at vhost add/remove time rather than being
> >> determined at dirty log collection time, which is
> >> in the hot path.
> >>
> >>>> I thought that Eugenio's previous suggestion tried to simplify
> >>>> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
> >>>> gets expanded to use the lh_first field for the QLIST would simply
> >>>> satisfy the basic need. Why add extra logic to make the check ever more
> >>>> complex; is there any benefit to adding more fields to the vhost_dev?
> >>> I don't get here, the idea is to just pick one shared log which should
> >>> be much more simpler than what is proposed here.
> >> The code you showed earlier won't work as all vhost_device instances
> >> point to the same dev->log device...
> > This part I don't understand.
>
> vhost_log_get() has the following:
>
> log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
>
> if (!log || log->size != size) {
> log = vhost_log_alloc(size, share);
> if (share) {
> vhost_log_shm[backend_type] = log;
> } else {
> vhost_log[backend_type] = log;
> }
> } else {
> ++log->refcnt;
> }
>
> So for a specific backend type, vhost_log_get() would return the same
> logger device (stored at either vhost_log_shm[] or vhost_log[]) to all
> vhost_dev instances, and the check you suggested earlier:
>
> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>
> will always hold true if the vhost_dev instance (representing the
> specific virtqueue) is active.
Right, so the point is to see if we can find something simpler to avoid
the QLIST, as it involves vhost_dev, which seems unnecessary.
Does something like a counter work?
vhost_sync_dirty_bitmap() {
rounds ++;
vhost_dev_sync_region(rounds);
}
vhost_dev_sync_region(rounds) {
if (dev->log->rounds == rounds)
return;
else
dev->log->rounds = rounds;
}
(pseudo codes, just used to demonstrate the idea).
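Spelled out slightly more, the idea is to let the shared log itself remember
which sync iteration it was last scanned in. The names below (shared_log,
rounds, sync_rounds) are made up purely for illustration and are not existing
QEMU fields:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins, not the real QEMU structures. */
struct shared_log {
    uint64_t rounds;         /* last sync iteration this log was scanned in */
};

struct dev {
    struct shared_log *log;  /* all devices of one backend share this log */
};

static uint64_t sync_rounds; /* bumped once per dirty-log sync iteration */

/* Only the first device to reach the shared log in a given iteration would
 * scan the memory sections; later callers see the round already claimed. */
static bool should_scan_mem_sections(struct dev *d)
{
    if (d->log->rounds == sync_rounds) {
        return false;
    }
    d->log->rounds = sync_rounds;
    return true;
}

The open question is what bumps sync_rounds exactly once per iteration for
all devices of a backend type.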
Thanks
>
> Regards,
> -Siwei
> >
> > Thanks
> >
> >> Regards,
> >> -Siwei
> >>> Thanks
> >>>
> >>>> Thanks,
> >>>> -Siwei
> >>>>
> >>>>>>> And it helps to simplify the logic.
> >>>>>> Generally yes, but when it comes to hot path operations the performance
> >>>>>> consideration could override this principle. I think there's no harm to
> >>>>>> check against logger device cached in vhost layer itself, and the
> >>>>>> current patch does not create a lot of complexity or performance side
> >>>>>> effect (actually I think the conditional should be very straightforward
> >>>>>> to turn into just a couple of assembly compare and branch instructions
> >>>>>> rather than indirection through another jmp call).
> >>>>> Thanks
> >>>>>
> >>>>>> -Siwei
> >>>>>>
> >>>>>>> Thanks
> >>>>>>>
> >>>>>>>> -Siwei
> >>>>>>>>> ?
> >>>>>>>>>
> >>>>>>>>> Thanks
> >>>>>>>>>
>
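To recap the mechanism being debated above: the patch gates only the
memory-section walk on the elected device, while every vhost_dev keeps
syncing its own used rings. A rough, paraphrased sketch of that shape as it
would sit in hw/virtio/vhost.c (loop bodies elided; the helper name and loop
bounds are paraphrased from the discussion, not the literal patch):

/* Paraphrased sketch of the hot path, not the literal code. */
static void vhost_sync_dirty_bitmap_sketch(struct vhost_dev *dev)
{
    int i;

    if (vhost_dev_should_log(dev)) {
        /* Only the elected logger walks the guest memory sections. */
        for (i = 0; i < dev->mem->nregions; ++i) {
            /* vhost_dev_sync_region(...) over memory region i */
        }
    }

    /* Every vhost_dev still syncs its own virtqueues' used rings. */
    for (i = 0; i < dev->nvqs; ++i) {
        /* vhost_dev_sync_region(...) over the used ring of vq i */
    }
}

This is what keeps the per-iteration cost roughly constant no matter how many
queues or vhost devices are configured.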
* Re: [External] : Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-25 6:13 ` Jason Wang
@ 2024-03-25 23:20 ` Si-Wei Liu
2024-03-26 4:36 ` Jason Wang
0 siblings, 1 reply; 22+ messages in thread
From: Si-Wei Liu @ 2024-03-25 23:20 UTC (permalink / raw)
To: Jason Wang; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On 3/24/2024 11:13 PM, Jason Wang wrote:
> On Sat, Mar 23, 2024 at 5:14 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>
>>
>> On 3/21/2024 10:08 PM, Jason Wang wrote:
>>> On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>
>>>> On 3/20/2024 8:56 PM, Jason Wang wrote:
>>>>> On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>> On 3/19/2024 8:27 PM, Jason Wang wrote:
>>>>>>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
>>>>>>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
>>>>>>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>>>>>>>>>>>> On setups with one or more virtio-net devices with vhost on,
> >>>>>>>>>>>> the cost of each dirty tracking iteration increases with the number
> >>>>>>>>>>>> of queues that are set up, e.g. on idle guest migration the
>>>>>>>>>>>> following is observed with virtio-net with vhost=on:
>>>>>>>>>>>>
>>>>>>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
>>>>>>>>>>>>
>>>>>>>>>>>> With high memory rates the symptom is lack of convergence as soon
> >>>>>>>>>>>> as there is a vhost device with a sufficiently high number of queues,
> >>>>>>>>>>>> or a sufficient number of vhost devices.
>>>>>>>>>>>>
>>>>>>>>>>>> On every migration iteration (every 100msecs) it will redundantly
> >>>>>>>>>>>> query the *shared log* once for every queue configured with vhost
> >>>>>>>>>>>> that exists in the guest. For the virtqueue data, this is necessary,
> >>>>>>>>>>>> but not for the memory sections, which are the same. So essentially
>>>>>>>>>>>> we end up scanning the dirty log too often.
>>>>>>>>>>>>
>>>>>>>>>>>> To fix that, select a vhost device responsible for scanning the
>>>>>>>>>>>> log with regards to memory sections dirty tracking. It is selected
>>>>>>>>>>>> when we enable the logger (during migration) and cleared when we
>>>>>>>>>>>> disable the logger. If the vhost logger device goes away for some
>>>>>>>>>>>> reason, the logger will be re-selected from the rest of vhost
>>>>>>>>>>>> devices.
>>>>>>>>>>>>
>>>>>>>>>>>> After making mem-section logger a singleton instance, constant cost
>>>>>>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
>>>>>>>>>>>> queues or how many vhost devices are configured:
>>>>>>>>>>>>
>>>>>>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
>>>>>>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
>>>>>>>>>>>>
>>>>>>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>>>>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>>>>>>>>>
>>>>>>>>>>>> ---
>>>>>>>>>>>> v3 -> v4:
>>>>>>>>>>>> - add comment to clarify effect on cache locality and
>>>>>>>>>>>> performance
>>>>>>>>>>>>
>>>>>>>>>>>> v2 -> v3:
>>>>>>>>>>>> - add after-fix benchmark to commit log
>>>>>>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
>>>>>>>>>>>> - remove unneeded comparisons for backend_type
>>>>>>>>>>>> - use QLIST array instead of single flat list to store vhost
>>>>>>>>>>>> logger devices
>>>>>>>>>>>> - simplify logger election logic
>>>>>>>>>>>> ---
>>>>>>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
>>>>>>>>>>>> include/hw/virtio/vhost.h | 1 +
>>>>>>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
>>>>>>>>>>>> index 612f4db..58522f1 100644
>>>>>>>>>>>> --- a/hw/virtio/vhost.c
>>>>>>>>>>>> +++ b/hw/virtio/vhost.c
>>>>>>>>>>>> @@ -45,6 +45,7 @@
>>>>>>>>>>>>
>>>>>>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
>>>>>>>>>>>>
>>>>>>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
>>>>>>>>>>>> static unsigned int used_memslots;
>>>>>>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>>>>>>>>>>>> }
>>>>>>>>>>>> }
>>>>>>>>>>>>
>>>>>>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
>>>>>>>>>>>> +{
>>>>>>>>>>>> + assert(dev->vhost_ops);
>>>>>>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
>>>>>>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
>>>>>>>>>>>> +
>>>>>>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>>>>>>>>>> A dumb question, why not simply check
>>>>>>>>>>>
>>>>>>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>>>>>>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
>>>>>>>>>> vhost_log[]. Don't want to complicate the check here by calling into
> >>>>>>>>>> vhost_dev_log_is_shared() every time .log_sync() is called.
> >>>>>>>>> It has very low overhead, doesn't it?
>>>>>>>> Whether this has low overhead will have to depend on the specific
> >>>>>>>> backend's implementation of .vhost_requires_shm_log(), which the common
> >>>>>>>> vhost layer should not make assumptions about or rely on.
>>>>>>>>
>>>>>>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
>>>>>>>>> {
>>>>>>>>> return dev->vhost_ops->vhost_requires_shm_log &&
>>>>>>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
>>>>>>>>> }
>>>>>>> For example, if I understand the code correctly, the log type won't be
> >>>>>>> changed during runtime, so we can end up with a boolean to record that
> >>>>>>> instead of querying the ops?
>>>>>> Right now the log type won't change during runtime, but I am not sure if
> >>>>>> this may prohibit a future revisit to allow changes at runtime,
>>>>> We can be bothered when we have such a request then.
>>>>>
>>>>>> then
> >>>>>> there'll be complex code involved to maintain the state.
>>>>>>
>>>>>> Other than this, I think it's insufficient to just check the shm log
> >>>>>> vs. the normal log. The logger code needs to identify a leading logger
> >>>>>> device that gets elected in vhost_dev_elect_mem_logger(); as all the
> >>>>>> dev->log pointers refer to the same reference-counted logger, we would
> >>>>>> have to add an extra field and complex logic to maintain the election
>>>>>> status.
>>>>> One thing I don't understand here (and in the changelog) is why do we
>>>>> need an election here?
> >>>> vhost_sync_dirty_bitmap() scans not just the guest memory sections but
> >>>> also the specific regions for the virtqueues (used rings). To save as many
> >>>> CPU cycles as possible, the guest memory must be scanned only once in
>>>> each log iteration, though the logging for used rings would still have
>>>> to use the specific vhost instance, so all vhost_device instance still
>>>> keeps the dev->log pointer to the shared log as-is. Generally the shared
>>>> memory logger can be picked from an arbitrary vhost_device instance, but
>>>> to keep the code simple, performant and predictable
>>> This is the point, I don't see why election is simpler than picking an
>>> arbitrary shared log in this case.
>> Maybe I missed your point, but I am confused and fail to understand why
>> electing a fixed vhost_dev is not as simple? Regardless of the
>> specifics, I think the point is one _fixed_ vhost_dev has to be picked
>> amongst all the instances per backend type in charge of logging guest
> >> memory, no matter whether it's at the start of the list or not.
> See below.
>
>>>> , logger selection is
> >>>> made on the control path at vhost add/remove time rather than being
> >>>> determined at dirty log collection time, which is
> >>>> in the hot path.
>>>>
>>>>>> I thought that Eugenio's previous suggestion tried to simplify
>>>>>> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
>>>>>> gets expanded to use the lh_first field for the QLIST would simply
> >>>>>> satisfy the basic need. Why add extra logic to make the check ever more
> >>>>>> complex; is there any benefit to adding more fields to the vhost_dev?
>>>>> I don't get here, the idea is to just pick one shared log which should
>>>>> be much more simpler than what is proposed here.
> >>>> The code you showed earlier won't work as all vhost_device instances
> >>>> point to the same dev->log device...
>>> This part I don't understand.
>> vhost_log_get() has the following:
>>
>> log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
>>
>> if (!log || log->size != size) {
>> log = vhost_log_alloc(size, share);
>> if (share) {
>> vhost_log_shm[backend_type] = log;
>> } else {
>> vhost_log[backend_type] = log;
>> }
>> } else {
>> ++log->refcnt;
>> }
>>
>> So for a specific backend type, vhost_log_get() would return the same
>> logger device (stored at either vhost_log_shm[] or vhost_log[]) to all
>> vhost_dev instances, and the check you suggested earlier:
>>
>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
>>
>> will always hold true if the vhost_dev instance (representing the
>> specific virtqueue) is active.
> Right, so the point is to see if we can find something simpler to avoid
> the QLIST, as it involves vhost_dev, which seems unnecessary.
To make it independent of the specific vhost_dev, it would require the
framework (migration dirty logger) to pass down "dirty_sync_count"-like
information to the vhost layer through the memory listener .log_sync
interface. I'm not sure if this change is worth the effort; as this
patch is meant to fix a long-standing bug, I suppose we need to find
a way that is applicable to back-port to past -stable releases.
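For context, the hook referred to here is the MemoryListener callback, which
today has roughly the shape below and carries no per-iteration counter; the
extended variant is purely hypothetical:

/* Roughly the current shape of the callback vhost hooks into: */
/*   void (*log_sync)(MemoryListener *listener, MemoryRegionSection *section); */

/* A hypothetical extension that would make a per-iteration counter usable: */
/*   void (*log_sync)(MemoryListener *listener, MemoryRegionSection *section,
 *                    uint64_t dirty_sync_count); */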
>
> Does something like a counter work?
It won't. It seems the "rounds" counter is still per vhost_dev instance,
but we need it per migration log_sync iteration across all vhost_dev
instances of the same backend type. Maybe I miss something, but I don't see
how easy it would be to come up with proper accounting for "rounds"
without going through the list of vhost_dev instances at run time
(which is what I tried to avoid).
Thanks,
-Siwei
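To make the control-path side concrete, here is a condensed sketch of the
election helper in the spirit of the hunks quoted earlier in the thread; the
list-linkage field name (logdev) and the exact body are illustrative
reconstructions, not the literal patch:

/* One list per backend type; the head entry is the elected logger.
 * The array itself is the vhost_log_devs[] added by the patch. */
static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];

/* Assumed linkage added to struct vhost_dev: QLIST_ENTRY(vhost_dev) logdev; */

static void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
{
    VhostBackendType backend_type = hdev->vhost_ops->backend_type;

    if (add && !QLIST_IS_INSERTED(hdev, logdev)) {
        if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
            QLIST_INSERT_HEAD(&vhost_log_devs[backend_type], hdev, logdev);
        } else {
            /* Insert behind the current head so the elected logger (and the
             * cache lines the hot path touches) stays stable as devices
             * come and go. */
            QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
                               hdev, logdev);
        }
    } else if (!add && QLIST_IS_INSERTED(hdev, logdev)) {
        QLIST_REMOVE(hdev, logdev);
    }
}

With that in place, the hot-path check quoted earlier, vhost_dev_should_log(),
reduces to a single comparison against QLIST_FIRST().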
>
> vhost_sync_dirty_bitmap() {
> rounds ++;
> vhost_dev_sync_region(rounds);
> }
>
> vhost_dev_sync_region(rounds) {
> if (dev->log->rounds == rounds)
> return;
> else
> dev->log->rounds = rounds;
> }
>
> (pseudo codes, just used to demonstrate the idea).
>
> Thanks
>
>> Regards,
>> -Siwei
>>> Thanks
>>>
>>>> Regards,
>>>> -Siwei
>>>>> Thanks
>>>>>
>>>>>> Thanks,
>>>>>> -Siwei
>>>>>>
>>>>>>>>> And it helps to simplify the logic.
>>>>>>>> Generally yes, but when it comes to hot path operations the performance
>>>>>>>> consideration could override this principle. I think there's no harm to
>>>>>>>> check against logger device cached in vhost layer itself, and the
>>>>>>>> current patch does not create a lot of complexity or performance side
>>>>>>>> effect (actually I think the conditional should be very straightforward
>>>>>>>> to turn into just a couple of assembly compare and branch instructions
>>>>>>>> rather than indirection through another jmp call).
>>>>>>> Thanks
>>>>>>>
>>>>>>>> -Siwei
>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>>
>>>>>>>>>> -Siwei
>>>>>>>>>>> ?
>>>>>>>>>>>
>>>>>>>>>>> Thanks
>>>>>>>>>>>
* Re: [External] : Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
2024-03-25 23:20 ` [External] : " Si-Wei Liu
@ 2024-03-26 4:36 ` Jason Wang
0 siblings, 0 replies; 22+ messages in thread
From: Jason Wang @ 2024-03-26 4:36 UTC (permalink / raw)
To: Si-Wei Liu; +Cc: qemu-devel, mst, eperezma, joao.m.martins
On Tue, Mar 26, 2024 at 7:21 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 3/24/2024 11:13 PM, Jason Wang wrote:
> > On Sat, Mar 23, 2024 at 5:14 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>
> >>
> >> On 3/21/2024 10:08 PM, Jason Wang wrote:
> >>> On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>
> >>>> On 3/20/2024 8:56 PM, Jason Wang wrote:
> >>>>> On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>> On 3/19/2024 8:27 PM, Jason Wang wrote:
> >>>>>>> On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>> On 3/17/2024 8:22 PM, Jason Wang wrote:
> >>>>>>>>> On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>>>> On 3/14/2024 9:03 PM, Jason Wang wrote:
> >>>>>>>>>>> On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >>>>>>>>>>>> On setups with one or more virtio-net devices with vhost on,
> >>>>>>>>>>>> the cost of each dirty tracking iteration increases with the number
> >>>>>>>>>>>> of queues that are set up, e.g. on idle guest migration the
> >>>>>>>>>>>> following is observed with virtio-net with vhost=on:
> >>>>>>>>>>>>
> >>>>>>>>>>>> 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>>>> 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>>>> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>>>> 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>>>>>
> >>>>>>>>>>>> With high memory rates the symptom is lack of convergence as soon
> >>>>>>>>>>>> as there is a vhost device with a sufficiently high number of queues,
> >>>>>>>>>>>> or a sufficient number of vhost devices.
> >>>>>>>>>>>>
> >>>>>>>>>>>> On every migration iteration (every 100msecs) it will redundantly
> >>>>>>>>>>>> query the *shared log* once for every queue configured with vhost
> >>>>>>>>>>>> that exists in the guest. For the virtqueue data, this is necessary,
> >>>>>>>>>>>> but not for the memory sections, which are the same. So essentially
> >>>>>>>>>>>> we end up scanning the dirty log too often.
> >>>>>>>>>>>>
> >>>>>>>>>>>> To fix that, select a vhost device responsible for scanning the
> >>>>>>>>>>>> log with regards to memory sections dirty tracking. It is selected
> >>>>>>>>>>>> when we enable the logger (during migration) and cleared when we
> >>>>>>>>>>>> disable the logger. If the vhost logger device goes away for some
> >>>>>>>>>>>> reason, the logger will be re-selected from the rest of vhost
> >>>>>>>>>>>> devices.
> >>>>>>>>>>>>
> >>>>>>>>>>>> After making mem-section logger a singleton instance, constant cost
> >>>>>>>>>>>> of 7%-9% (like the 1 queue report) will be seen, no matter how many
> >>>>>>>>>>>> queues or how many vhost devices are configured:
> >>>>>>>>>>>>
> >>>>>>>>>>>> 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> >>>>>>>>>>>> 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> >>>>>>>>>>>>
> >>>>>>>>>>>> Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>>>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >>>>>>>>>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >>>>>>>>>>>>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>> v3 -> v4:
> >>>>>>>>>>>> - add comment to clarify effect on cache locality and
> >>>>>>>>>>>> performance
> >>>>>>>>>>>>
> >>>>>>>>>>>> v2 -> v3:
> >>>>>>>>>>>> - add after-fix benchmark to commit log
> >>>>>>>>>>>> - rename vhost_log_dev_enabled to vhost_dev_should_log
> >>>>>>>>>>>> - remove unneeded comparisons for backend_type
> >>>>>>>>>>>> - use QLIST array instead of single flat list to store vhost
> >>>>>>>>>>>> logger devices
> >>>>>>>>>>>> - simplify logger election logic
> >>>>>>>>>>>> ---
> >>>>>>>>>>>> hw/virtio/vhost.c | 67 ++++++++++++++++++++++++++++++++++++++++++-----
> >>>>>>>>>>>> include/hw/virtio/vhost.h | 1 +
> >>>>>>>>>>>> 2 files changed, 62 insertions(+), 6 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>>>>>>>>>> index 612f4db..58522f1 100644
> >>>>>>>>>>>> --- a/hw/virtio/vhost.c
> >>>>>>>>>>>> +++ b/hw/virtio/vhost.c
> >>>>>>>>>>>> @@ -45,6 +45,7 @@
> >>>>>>>>>>>>
> >>>>>>>>>>>> static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>>>> static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>>>> +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> >>>>>>>>>>>>
> >>>>>>>>>>>> /* Memslots used by backends that support private memslots (without an fd). */
> >>>>>>>>>>>> static unsigned int used_memslots;
> >>>>>>>>>>>> @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> >>>>>>>>>>>> }
> >>>>>>>>>>>> }
> >>>>>>>>>>>>
> >>>>>>>>>>>> +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> >>>>>>>>>>>> +{
> >>>>>>>>>>>> + assert(dev->vhost_ops);
> >>>>>>>>>>>> + assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> >>>>>>>>>>>> + assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> >>>>>>>>>>>> +
> >>>>>>>>>>>> + return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
> >>>>>>>>>>> A dumb question, why not simply check
> >>>>>>>>>>>
> >>>>>>>>>>> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >>>>>>>>>> Because we are not sure if the logger comes from vhost_log_shm[] or
> >>>>>>>>>> vhost_log[]. Don't want to complicate the check here by calling into
> >>>>>>>>>> vhost_dev_log_is_shared() every time .log_sync() is called.
> >>>>>>>>> It has very low overhead, doesn't it?
> >>>>>>>> Whether this has low overhead will have to depend on the specific
> >>>>>>>> backend's implementation of .vhost_requires_shm_log(), which the common
> >>>>>>>> vhost layer should not make assumptions about or rely on.
> >>>>>>>>
> >>>>>>>>> static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
> >>>>>>>>> {
> >>>>>>>>> return dev->vhost_ops->vhost_requires_shm_log &&
> >>>>>>>>> dev->vhost_ops->vhost_requires_shm_log(dev);
> >>>>>>>>> }
> >>>>>>> For example, if I understand the code correctly, the log type won't be
> >>>>>>> changed during runtime, so we can end up with a boolean to record that
> >>>>>>> instead of querying the ops?
> >>>>>> Right now the log type won't change during runtime, but I am not sure if
> >>>>>> this may prohibit a future revisit to allow changes at runtime,
> >>>>> We can be bothered when we have such a request then.
> >>>>>
> >>>>>> then
> >>>>>> there'll be complex code involved to maintain the state.
> >>>>>>
> >>>>>> Other than this, I think it's insufficient to just check the shm log
> >>>>>> vs. the normal log. The logger code needs to identify a leading logger
> >>>>>> device that gets elected in vhost_dev_elect_mem_logger(); as all the
> >>>>>> dev->log pointers refer to the same reference-counted logger, we would
> >>>>>> have to add an extra field and complex logic to maintain the election
> >>>>>> status.
> >>>>> One thing I don't understand here (and in the changelog) is why do we
> >>>>> need an election here?
> >>>> vhost_sync_dirty_bitmap() scans not just the guest memory sections but
> >>>> also the specific regions for the virtqueues (used rings). To save as many
> >>>> CPU cycles as possible, the guest memory must be scanned only once in
> >>>> each log iteration, though the logging for used rings would still have
> >>>> to use the specific vhost instance, so all vhost_device instance still
> >>>> keeps the dev->log pointer to the shared log as-is. Generally the shared
> >>>> memory logger can be picked from an arbitrary vhost_device instance, but
> >>>> to keep the code simple, performant and predictable
> >>> This is the point, I don't see why election is simpler than picking an
> >>> arbitrary shared log in this case.
> >> Maybe I missed your point, but I am confused and fail to understand why
> >> electing a fixed vhost_dev is not as simple? Regardless of the
> >> specifics, I think the point is one _fixed_ vhost_dev has to be picked
> >> amongst all the instances per backend type in charge of logging guest
> >> memory, no matter whether it's at the start of the list or not.
> > See below.
> >
> >>>> , logger selection is
> >>>> made on the control path at vhost add/remove time rather than being
> >>>> determined at dirty log collection time, which is
> >>>> in the hot path.
> >>>>
> >>>>>> I thought that Eugenio's previous suggestion tried to simplify
> >>>>>> the logic in vhost_dev_elect_mem_logger(), as the QLIST_FIRST macro that
> >>>>>> gets expanded to use the lh_first field for the QLIST would simply
> >>>>>> satisfy the basic need. Why add extra logic to make the check ever more
> >>>>>> complex; is there any benefit to adding more fields to the vhost_dev?
> >>>>> I don't get here, the idea is to just pick one shared log which should
> >>>>> be much more simpler than what is proposed here.
> >>>> The code you showed earlier won't work as all vhost_device instances
> >>>> point to the same dev->log device...
> >>> This part I don't understand.
> >> vhost_log_get() has the following:
> >>
> >> log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
> >>
> >> if (!log || log->size != size) {
> >> log = vhost_log_alloc(size, share);
> >> if (share) {
> >> vhost_log_shm[backend_type] = log;
> >> } else {
> >> vhost_log[backend_type] = log;
> >> }
> >> } else {
> >> ++log->refcnt;
> >> }
> >>
> >> So for a specific backend type, vhost_log_get() would return the same
> >> logger device (stored at either vhost_log_shm[] or vhost_log[]) to all
> >> vhost_dev instances, and the check you suggested earlier:
> >>
> >> dev->log == vhost_log_shm[dev->vhost_ops->backend_type]
> >>
> >> will always hold true if the vhost_dev instance (representing the
> >> specific virtqueue) is active.
> > Right, so the point is to see if we can find something simpler to avoid
> > the QLIST, as it involves vhost_dev, which seems unnecessary.
> To make it independent of the specific vhost_dev, it would require the
> framework (migration dirty logger) to pass down "dirty_sync_count"-like
> information to the vhost layer through the memory listener .log_sync
> interface. I'm not sure if this change is worth the effort; as this
> patch is meant to fix a long-standing bug,
I may be missing something, but it looks to be a performance optimization.
> I suppose we need to find
> a way that is applicable to back-port to past -stable releases.
>
> >
> > Does something like a counter work?
> It won't. It seems the "rounds" counter is still per vhost_dev instance,
> but we need it per migration log_sync iteration across all vhost_dev
> instances of the same backend type. Maybe I miss something, but I don't see
> how easy it would be to come up with proper accounting for "rounds"
> without going through the list of vhost_dev instances at run time
> (which is what I tried to avoid).
Ok, so I think I'm fine with this now.
So the root cause is the multiple listeners, which we could tweak in the future.
Acked-by: Jason Wang <jasowang@redhat.com>
Thanks
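To unpack that closing remark: each vhost_dev registers its own
MemoryListener, so with N devices the memory core invokes N .log_sync
callbacks per iteration, which is why the scan has to be deduplicated inside
vhost. A rough sketch of that shape (callback and helper names abbreviated,
not the literal code):

/* Rough shape only: every vhost_dev brings its own listener. */
static void vhost_log_sync_sketch(MemoryListener *listener,
                                  MemoryRegionSection *section)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    /* Without the election, every one of these N calls would rescan the
     * same guest memory sections of the shared per-backend log. */
}

static void vhost_register_listener_sketch(struct vhost_dev *hdev)
{
    hdev->memory_listener = (MemoryListener) {
        .log_sync = vhost_log_sync_sketch,
        /* ... other callbacks ... */
    };
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
}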
>
>
> Thanks,
> -Siwei
> >
> > vhost_sync_dirty_bitmap() {
> > rounds ++;
> > vhost_dev_sync_region(rounds);
> > }
> >
> > vhost_dev_sync_region(rounds) {
> > if (dev->log->rounds == rounds)
> > return;
> > else
> > dev->log->rounds = rounds;
> > }
> >
> > (pseudo codes, just used to demonstrate the idea).
> >
> > Thanks
> >
> >> Regards,
> >> -Siwei
> >>> Thanks
> >>>
> >>>> Regards,
> >>>> -Siwei
> >>>>> Thanks
> >>>>>
> >>>>>> Thanks,
> >>>>>> -Siwei
> >>>>>>
> >>>>>>>>> And it helps to simplify the logic.
> >>>>>>>> Generally yes, but when it comes to hot path operations the performance
> >>>>>>>> consideration could override this principle. I think there's no harm to
> >>>>>>>> check against logger device cached in vhost layer itself, and the
> >>>>>>>> current patch does not create a lot of complexity or performance side
> >>>>>>>> effect (actually I think the conditional should be very straightforward
> >>>>>>>> to turn into just a couple of assembly compare and branch instructions
> >>>>>>>> rather than indirection through another jmp call).
> >>>>>>> Thanks
> >>>>>>>
> >>>>>>>> -Siwei
> >>>>>>>>
> >>>>>>>>> Thanks
> >>>>>>>>>
> >>>>>>>>>> -Siwei
> >>>>>>>>>>> ?
> >>>>>>>>>>>
> >>>>>>>>>>> Thanks
> >>>>>>>>>>>
>
Thread overview: 22+ messages (newest: 2024-03-26 4:38 UTC)
2024-03-14 20:27 [PATCH v4 1/2] vhost: dirty log should be per backend type Si-Wei Liu
2024-03-14 20:27 ` [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration Si-Wei Liu
2024-03-15 4:03 ` Jason Wang
2024-03-15 18:44 ` Si-Wei Liu
2024-03-18 3:22 ` Jason Wang
2024-03-18 22:16 ` Si-Wei Liu
2024-03-20 3:27 ` Jason Wang
2024-03-20 21:02 ` Si-Wei Liu
2024-03-21 3:56 ` Jason Wang
2024-03-21 21:42 ` Si-Wei Liu
2024-03-22 5:08 ` Jason Wang
2024-03-22 21:13 ` Si-Wei Liu
2024-03-25 6:13 ` Jason Wang
2024-03-25 23:20 ` [External] : " Si-Wei Liu
2024-03-26 4:36 ` Jason Wang
2024-03-15 3:50 ` [PATCH v4 1/2] vhost: dirty log should be per backend type Jason Wang
2024-03-15 18:33 ` Si-Wei Liu
2024-03-18 3:20 ` Jason Wang
2024-03-18 22:06 ` Si-Wei Liu
2024-03-20 3:25 ` Jason Wang
2024-03-20 20:29 ` Si-Wei Liu
2024-03-21 3:53 ` Jason Wang