* [PATCH v2] virtio-blk: iothread-vq-mapping coroutine pool sizing
@ 2024-03-12 15:12 Stefan Hajnoczi
  2024-03-12 18:02 ` Michael S. Tsirkin
  2024-03-18 18:37 ` Stefan Hajnoczi
  0 siblings, 2 replies; 3+ messages in thread
From: Stefan Hajnoczi @ 2024-03-12 15:12 UTC (permalink / raw)
  To: qemu-devel
  Cc: qemu-block, Michael S. Tsirkin, Kevin Wolf, Hanna Reitz,
	Stefan Hajnoczi, Sanjay Rao, Boaz Ben Shabat, Joe Mario

It is possible to hit the sysctl vm.max_map_count limit when the
coroutine pool size becomes large. Each coroutine requires two mappings
(one for the stack and one for the guard page). QEMU can crash with
"failed to set up stack guard page" or "failed to allocate memory for
stack" when this happens.

Coroutine pool sizing is simple when there is only one thread: sum up
all I/O requests across all virtqueues.

When the iothread-vq-mapping option is used we should calculate tighter
bounds because each thread may serve only a subset of the device's
virtqueues: take the maximum, across threads, of the number of I/O
requests in the virtqueues that each thread serves. A thread does not
need coroutine pool space for I/O requests that are handled by other
threads.
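
For illustration only (not part of this patch), a standalone sketch of
that calculation. The queue_to_thread[] mapping, num_threads, and
requests_per_queue parameters are hypothetical stand-ins for the
vq_aio_context[] array and the queue_size / 2 heuristic used below:

/*
 * Sum the request estimate per thread and return the largest total.
 * The single-thread bound would simply be num_queues * requests_per_queue.
 */
static unsigned per_thread_pool_bound(const unsigned *queue_to_thread,
                                      unsigned num_queues,
                                      unsigned num_threads,
                                      unsigned requests_per_queue)
{
    unsigned totals[num_threads]; /* per-thread request totals (C99 VLA) */
    unsigned max = 0;

    for (unsigned t = 0; t < num_threads; t++) {
        totals[t] = 0;
    }

    for (unsigned i = 0; i < num_queues; i++) {
        unsigned t = queue_to_thread[i];

        totals[t] += requests_per_queue;
        if (totals[t] > max) {
            max = totals[t];
        }
    }

    return max;
}

For example, 64 queues spread evenly across 4 threads with
requests_per_queue = 128 give a per-thread bound of 2,048 instead of a
single sum of 8,192. The pool_max_size figures below come from a full
guest configuration (including a second disk), so they differ from this
toy calculation.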

This is not a solution to hitting vm.max_map_count, but it helps. A
guest with 64 vCPUs (hence 64 virtqueues) across 4 IOThreads with one
iothread-vq-mapping virtio-blk device and a root disk without
iothread-vq-mapping goes from pool_max_size 16,448 to 10,304.

Reported-by: Sanjay Rao <srao@redhat.com>
Reported-by: Boaz Ben Shabat <bbenshab@redhat.com>
Reported-by: Joe Mario <jmario@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
v2:
- State that the tighter bounds reflect the fact that threads may only
  process a subset of the total I/O requests from a device [Kevin]
- Add Reported-by: Joe Mario, who has been investigating this issue.

 include/hw/virtio/virtio-blk.h |  2 ++
 hw/block/virtio-blk.c          | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 5c14110c4b..ac29700ad4 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -74,6 +74,8 @@ struct VirtIOBlock {
     uint64_t host_features;
     size_t config_size;
     BlockRAMRegistrar blk_ram_registrar;
+
+    unsigned coroutine_pool_size;
 };
 
 typedef struct VirtIOBlockReq {
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 738cb2ac36..0a14b2b175 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1957,6 +1957,35 @@ static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
     s->ioeventfd_stopping = false;
 }
 
+/* Increase the coroutine pool size to include our I/O requests */
+static void virtio_blk_inc_coroutine_pool_size(VirtIOBlock *s)
+{
+    VirtIOBlkConf *conf = &s->conf;
+    unsigned max_requests = 0;
+
+    /* Tracks the total number of requests for AioContext */
+    g_autoptr(GHashTable) counters = g_hash_table_new(NULL, NULL);
+
+    /* Call this function after setting up vq_aio_context[] */
+    assert(s->vq_aio_context);
+
+    for (unsigned i = 0; i < conf->num_queues; i++) {
+        AioContext *ctx = s->vq_aio_context[i];
+        unsigned n = GPOINTER_TO_UINT(g_hash_table_lookup(counters, ctx));
+
+        n += conf->queue_size / 2; /* this is a heuristic */
+
+        g_hash_table_insert(counters, ctx, GUINT_TO_POINTER(n));
+
+        if (n > max_requests) {
+            max_requests = n;
+        }
+    }
+
+    qemu_coroutine_inc_pool_size(max_requests);
+    s->coroutine_pool_size = max_requests; /* stash it for ->unrealize() */
+}
+
 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -2048,7 +2077,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     for (i = 0; i < conf->num_queues; i++) {
         virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
     }
-    qemu_coroutine_inc_pool_size(conf->num_queues * conf->queue_size / 2);
 
     /* Don't start ioeventfd if transport does not support notifiers. */
     if (!virtio_device_ioeventfd_enabled(vdev)) {
@@ -2065,6 +2093,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    virtio_blk_inc_coroutine_pool_size(s);
+
     /*
      * This must be after virtio_init() so virtio_blk_dma_restart_cb() gets
      * called after ->start_ioeventfd() has already set blk's AioContext.
@@ -2096,7 +2126,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
     for (i = 0; i < conf->num_queues; i++) {
         virtio_del_queue(vdev, i);
     }
-    qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
+    qemu_coroutine_dec_pool_size(s->coroutine_pool_size);
     qemu_mutex_destroy(&s->rq_lock);
     blk_ram_registrar_destroy(&s->blk_ram_registrar);
     qemu_del_vm_change_state_handler(s->change);
-- 
2.44.0




* Re: [PATCH v2] virtio-blk: iothread-vq-mapping coroutine pool sizing
  2024-03-12 15:12 [PATCH v2] virtio-blk: iothread-vq-mapping coroutine pool sizing Stefan Hajnoczi
@ 2024-03-12 18:02 ` Michael S. Tsirkin
  2024-03-18 18:37 ` Stefan Hajnoczi
  1 sibling, 0 replies; 3+ messages in thread
From: Michael S. Tsirkin @ 2024-03-12 18:02 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: qemu-devel, qemu-block, Kevin Wolf, Hanna Reitz, Sanjay Rao,
	Boaz Ben Shabat, Joe Mario

On Tue, Mar 12, 2024 at 11:12:04AM -0400, Stefan Hajnoczi wrote:
> It is possible to hit the sysctl vm.max_map_count limit when the
> coroutine pool size becomes large. Each coroutine requires two mappings
> (one for the stack and one for the guard page). QEMU can crash with
> "failed to set up stack guard page" or "failed to allocate memory for
> stack" when this happens.
> 
> Coroutine pool sizing is simple when there is only one thread: sum up
> all I/O requests across all virtqueues.
> 
> When the iothread-vq-mapping option is used we should calculate tighter
> bounds because each thread may serve only a subset of the device's
> virtqueues: take the maximum, across threads, of the number of I/O
> requests in the virtqueues that each thread serves. A thread does not
> need coroutine pool space for I/O requests that are handled by other
> threads.
> 
> This is not a solution to hitting vm.max_map_count, but it helps. A
> guest with 64 vCPUs (hence 64 virtqueues) across 4 IOThreads with one
> iothread-vq-mapping virtio-blk device and a root disk without
> iothread-vq-mapping goes from pool_max_size 16,448 to 10,304.
> 
> Reported-by: Sanjay Rao <srao@redhat.com>
> Reported-by: Boaz Ben Shabat <bbenshab@redhat.com>
> Reported-by: Joe Mario <jmario@redhat.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>


Looks reasonable.

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>

if you want me to merge it, let me know pls.

> ---
> v2:
> - State that the tighter bounds reflect the fact that threads may only
>   process a subset of the total I/O requests from a device [Kevin]
> - Add Reported-by: Joe Mario, who has been investigating this issue.
> 
>  include/hw/virtio/virtio-blk.h |  2 ++
>  hw/block/virtio-blk.c          | 34 ++++++++++++++++++++++++++++++++--
>  2 files changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
> index 5c14110c4b..ac29700ad4 100644
> --- a/include/hw/virtio/virtio-blk.h
> +++ b/include/hw/virtio/virtio-blk.h
> @@ -74,6 +74,8 @@ struct VirtIOBlock {
>      uint64_t host_features;
>      size_t config_size;
>      BlockRAMRegistrar blk_ram_registrar;
> +
> +    unsigned coroutine_pool_size;
>  };
>  
>  typedef struct VirtIOBlockReq {
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 738cb2ac36..0a14b2b175 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -1957,6 +1957,35 @@ static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
>      s->ioeventfd_stopping = false;
>  }
>  
> +/* Increase the coroutine pool size to include our I/O requests */
> +static void virtio_blk_inc_coroutine_pool_size(VirtIOBlock *s)
> +{
> +    VirtIOBlkConf *conf = &s->conf;
> +    unsigned max_requests = 0;
> +
> +    /* Tracks the total number of requests for AioContext */
> +    g_autoptr(GHashTable) counters = g_hash_table_new(NULL, NULL);
> +
> +    /* Call this function after setting up vq_aio_context[] */
> +    assert(s->vq_aio_context);
> +
> +    for (unsigned i = 0; i < conf->num_queues; i++) {
> +        AioContext *ctx = s->vq_aio_context[i];
> +        unsigned n = GPOINTER_TO_UINT(g_hash_table_lookup(counters, ctx));
> +
> +        n += conf->queue_size / 2; /* this is a heuristic */
> +
> +        g_hash_table_insert(counters, ctx, GUINT_TO_POINTER(n));
> +
> +        if (n > max_requests) {
> +            max_requests = n;
> +        }
> +    }
> +
> +    qemu_coroutine_inc_pool_size(max_requests);
> +    s->coroutine_pool_size = max_requests; /* stash it for ->unrealize() */
> +}
> +
>  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -2048,7 +2077,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>      for (i = 0; i < conf->num_queues; i++) {
>          virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
>      }
> -    qemu_coroutine_inc_pool_size(conf->num_queues * conf->queue_size / 2);
>  
>      /* Don't start ioeventfd if transport does not support notifiers. */
>      if (!virtio_device_ioeventfd_enabled(vdev)) {
> @@ -2065,6 +2093,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    virtio_blk_inc_coroutine_pool_size(s);
> +
>      /*
>       * This must be after virtio_init() so virtio_blk_dma_restart_cb() gets
>       * called after ->start_ioeventfd() has already set blk's AioContext.
> @@ -2096,7 +2126,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
>      for (i = 0; i < conf->num_queues; i++) {
>          virtio_del_queue(vdev, i);
>      }
> -    qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
> +    qemu_coroutine_dec_pool_size(s->coroutine_pool_size);
>      qemu_mutex_destroy(&s->rq_lock);
>      blk_ram_registrar_destroy(&s->blk_ram_registrar);
>      qemu_del_vm_change_state_handler(s->change);
> -- 
> 2.44.0




* Re: [PATCH v2] virtio-blk: iothread-vq-mapping coroutine pool sizing
  2024-03-12 15:12 [PATCH v2] virtio-blk: iothread-vq-mapping coroutine pool sizing Stefan Hajnoczi
  2024-03-12 18:02 ` Michael S. Tsirkin
@ 2024-03-18 18:37 ` Stefan Hajnoczi
  1 sibling, 0 replies; 3+ messages in thread
From: Stefan Hajnoczi @ 2024-03-18 18:37 UTC (permalink / raw)
  To: qemu-devel
  Cc: qemu-block, Michael S. Tsirkin, Kevin Wolf, Hanna Reitz,
	Sanjay Rao, Boaz Ben Shabat, Joe Mario


On Tue, Mar 12, 2024 at 11:12:04AM -0400, Stefan Hajnoczi wrote:
> It is possible to hit the sysctl vm.max_map_count limit when the
> coroutine pool size becomes large. Each coroutine requires two mappings
> (one for the stack and one for the guard page). QEMU can crash with
> "failed to set up stack guard page" or "failed to allocate memory for
> stack" when this happens.
> 
> Coroutine pool sizing is simple when there is only one thread: sum up
> all I/O requests across all virtqueues.
> 
> When the iothread-vq-mapping option is used we should calculate tighter
> bounds because each thread may serve only a subset of the device's
> virtqueues: take the maximum, across threads, of the number of I/O
> requests in the virtqueues that each thread serves. A thread does not
> need coroutine pool space for I/O requests that are handled by other
> threads.
> 
> This is not a solution to hitting vm.max_map_count, but it helps. A
> guest with 64 vCPUs (hence 64 virtqueues) across 4 IOThreads with one
> iothread-vq-mapping virtio-blk device and a root disk without
> iothread-vq-mapping goes from pool_max_size 16,448 to 10,304.
> 
> Reported-by: Sanjay Rao <srao@redhat.com>
> Reported-by: Boaz Ben Shabat <bbenshab@redhat.com>
> Reported-by: Joe Mario <jmario@redhat.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> v2:
> - State that the tighter bounds reflect the fact that threads may only
>   process a subset of the total I/O requests from a device [Kevin]
> - Add Reported-by: Joe Mario, who has been investigating this issue.

I have sent a new patch series that obsoletes this patch. Please do not
apply this patch.

The new series is here:
https://lore.kernel.org/qemu-devel/20240318183429.1039340-1-stefanha@redhat.com/T/#u

> 
>  include/hw/virtio/virtio-blk.h |  2 ++
>  hw/block/virtio-blk.c          | 34 ++++++++++++++++++++++++++++++++--
>  2 files changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
> index 5c14110c4b..ac29700ad4 100644
> --- a/include/hw/virtio/virtio-blk.h
> +++ b/include/hw/virtio/virtio-blk.h
> @@ -74,6 +74,8 @@ struct VirtIOBlock {
>      uint64_t host_features;
>      size_t config_size;
>      BlockRAMRegistrar blk_ram_registrar;
> +
> +    unsigned coroutine_pool_size;
>  };
>  
>  typedef struct VirtIOBlockReq {
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 738cb2ac36..0a14b2b175 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -1957,6 +1957,35 @@ static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
>      s->ioeventfd_stopping = false;
>  }
>  
> +/* Increase the coroutine pool size to include our I/O requests */
> +static void virtio_blk_inc_coroutine_pool_size(VirtIOBlock *s)
> +{
> +    VirtIOBlkConf *conf = &s->conf;
> +    unsigned max_requests = 0;
> +
> +    /* Tracks the total number of requests for AioContext */
> +    g_autoptr(GHashTable) counters = g_hash_table_new(NULL, NULL);
> +
> +    /* Call this function after setting up vq_aio_context[] */
> +    assert(s->vq_aio_context);
> +
> +    for (unsigned i = 0; i < conf->num_queues; i++) {
> +        AioContext *ctx = s->vq_aio_context[i];
> +        unsigned n = GPOINTER_TO_UINT(g_hash_table_lookup(counters, ctx));
> +
> +        n += conf->queue_size / 2; /* this is a heuristic */
> +
> +        g_hash_table_insert(counters, ctx, GUINT_TO_POINTER(n));
> +
> +        if (n > max_requests) {
> +            max_requests = n;
> +        }
> +    }
> +
> +    qemu_coroutine_inc_pool_size(max_requests);
> +    s->coroutine_pool_size = max_requests; /* stash it for ->unrealize() */
> +}
> +
>  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -2048,7 +2077,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>      for (i = 0; i < conf->num_queues; i++) {
>          virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
>      }
> -    qemu_coroutine_inc_pool_size(conf->num_queues * conf->queue_size / 2);
>  
>      /* Don't start ioeventfd if transport does not support notifiers. */
>      if (!virtio_device_ioeventfd_enabled(vdev)) {
> @@ -2065,6 +2093,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    virtio_blk_inc_coroutine_pool_size(s);
> +
>      /*
>       * This must be after virtio_init() so virtio_blk_dma_restart_cb() gets
>       * called after ->start_ioeventfd() has already set blk's AioContext.
> @@ -2096,7 +2126,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
>      for (i = 0; i < conf->num_queues; i++) {
>          virtio_del_queue(vdev, i);
>      }
> -    qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
> +    qemu_coroutine_dec_pool_size(s->coroutine_pool_size);
>      qemu_mutex_destroy(&s->rq_lock);
>      blk_ram_registrar_destroy(&s->blk_ram_registrar);
>      qemu_del_vm_change_state_handler(s->change);
> -- 
> 2.44.0
> 


