qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Dongli Zhang <dongli.zhang@oracle.com>
To: Mark Kanda <mark.kanda@oracle.com>, qemu-devel@nongnu.org
Cc: david@redhat.com, pbonzini@redhat.com, berrange@redhat.com
Subject: Re: [PATCH v4 1/1] oslib-posix: initialize backend memory objects in parallel
Date: Sat, 3 Feb 2024 14:43:37 -0800	[thread overview]
Message-ID: <8bec3d5c-03a2-02bc-ffff-4b46beeb206d@oracle.com> (raw)
In-Reply-To: <20240131165327.3154970-2-mark.kanda@oracle.com>



On 1/31/24 08:53, Mark Kanda wrote:
> QEMU initializes preallocated backend memory as the objects are parsed from
> the command line. This is not optimal in some cases (e.g. memory spanning
> multiple NUMA nodes) because the memory objects are initialized in series.
> 
> Allow the initialization to occur in parallel (asynchronously). In order to
> ensure optimal thread placement, asynchronous initialization requires prealloc
> context threads to be in use.
> 
> Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  backends/hostmem.c     |   7 ++-
>  hw/virtio/virtio-mem.c |   4 +-
>  include/hw/qdev-core.h |   5 ++
>  include/qemu/osdep.h   |  18 +++++-
>  system/vl.c            |   9 +++
>  util/oslib-posix.c     | 131 +++++++++++++++++++++++++++++++----------
>  util/oslib-win32.c     |   8 ++-
>  7 files changed, 145 insertions(+), 37 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 30f69b2cb5..17221e422a 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -20,6 +20,7 @@
>  #include "qom/object_interfaces.h"
>  #include "qemu/mmap-alloc.h"
>  #include "qemu/madvise.h"
> +#include "hw/qdev-core.h"
>  
>  #ifdef CONFIG_NUMA
>  #include <numaif.h>
> @@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
>          uint64_t sz = memory_region_size(&backend->mr);
>  
>          if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
> -                               backend->prealloc_context, errp)) {
> +                               backend->prealloc_context, false, errp)) {
>              return;
>          }
>          backend->prealloc = true;
> @@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
>      HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
>      void *ptr;
>      uint64_t sz;
> +    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
>  
>      if (!bc->alloc) {
>          return;
> @@ -398,7 +400,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
>      if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
>                                                  ptr, sz,
>                                                  backend->prealloc_threads,
> -                                                backend->prealloc_context, errp)) {
> +                                                backend->prealloc_context,
> +                                                async, errp)) {
>          return;
>      }
>  }
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 99ab989852..ffd119ebac 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
>          int fd = memory_region_get_fd(&vmem->memdev->mr);
>          Error *local_err = NULL;
>  
> -        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> +        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
>              static bool warned;
>  
>              /*
> @@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
>      int fd = memory_region_get_fd(&vmem->memdev->mr);
>      Error *local_err = NULL;
>  
> -    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> +    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
>          error_report_err(local_err);
>          return -ENOMEM;
>      }
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index 151d968238..83dd9e2485 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -1071,6 +1071,11 @@ typedef enum MachineInitPhase {
>       */
>      PHASE_ACCEL_CREATED,
>  
> +    /*
> +     * Late backend objects have been created and initialized.
> +     */
> +    PHASE_LATE_BACKENDS_CREATED,
> +
>      /*
>       * machine_class->init has been called, thus creating any embedded
>       * devices and validating machine properties.  Devices created at
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index c9692cc314..7d359dabc4 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
>   * @area: start address of the are to preallocate
>   * @sz: the size of the area to preallocate
>   * @max_threads: maximum number of threads to use
> + * @tc: prealloc context threads pointer, NULL if not in use
> + * @async: request asynchronous preallocation, requires @tc
>   * @errp: returns an error if this function fails
>   *
>   * Preallocate memory (populate/prefault page tables writable) for the virtual
> @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
>   * each page in the area was faulted in writable at least once, for example,
>   * after allocating file blocks for mapped files.
>   *
> + * When setting @async, allocation might be performed asynchronously.
> + * qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
> + * preallocation.
> + *
>   * Return: true on success, else false setting @errp with error.
>   */
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp);
> +                       ThreadContext *tc, bool async, Error **errp);
> +
> +/**
> + * qemu_finish_async_prealloc_mem:
> + * @errp: returns an error if this function fails
> + *
> + * Finish all outstanding asynchronous memory preallocation.
> + *
> + * Return: true on success, else false setting @errp with error.
> + */
> +bool qemu_finish_async_prealloc_mem(Error **errp);
>  
>  /**
>   * qemu_get_pid_name:
> diff --git a/system/vl.c b/system/vl.c
> index 788d88ea03..e6bc5d9dd9 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>  
>      object_option_foreach_add(object_create_late);
>  
> +    /*
> +     * Wait for any outstanding memory prealloc from created memory
> +     * backends to complete.
> +     */
> +    if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
> +        exit(1);
> +    }
> +
>      if (tpm_init() < 0) {
>          exit(1);
>      }
> @@ -3695,6 +3703,7 @@ void qemu_init(int argc, char **argv)
>       * over memory-backend-file objects).
>       */
>      qemu_create_late_backends();
> +    phase_advance(PHASE_LATE_BACKENDS_CREATED);
>  
>      /*
>       * Note: creates a QOM object, must run only after global and
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 7c297003b9..dada4722f6 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -42,6 +42,7 @@
>  #include "qemu/cutils.h"
>  #include "qemu/units.h"
>  #include "qemu/thread-context.h"
> +#include "qemu/main-loop.h"
>  
>  #ifdef CONFIG_LINUX
>  #include <sys/syscall.h>
> @@ -63,11 +64,15 @@
>  
>  struct MemsetThread;
>  
> +static QLIST_HEAD(, MemsetContext) memset_contexts =
> +    QLIST_HEAD_INITIALIZER(memset_contexts);
> +
>  typedef struct MemsetContext {
>      bool all_threads_created;
>      bool any_thread_failed;
>      struct MemsetThread *threads;
>      int num_threads;
> +    QLIST_ENTRY(MemsetContext) next;
>  } MemsetContext;
>  
>  struct MemsetThread {
> @@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
>      return ret;
>  }
>  
> +static int wait_and_free_mem_prealloc_context(MemsetContext *context)
> +{
> +    int i, ret = 0, tmp;
> +
> +    for (i = 0; i < context->num_threads; i++) {
> +        tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
> +
> +        if (tmp) {
> +            ret = tmp;
> +        }
> +    }
> +    g_free(context->threads);
> +    g_free(context);
> +    return ret;
> +}
> +
>  static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> -                           int max_threads, ThreadContext *tc,
> +                           int max_threads, ThreadContext *tc, bool async,
>                             bool use_madv_populate_write)
>  {
>      static gsize initialized = 0;
> -    MemsetContext context = {
> -        .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
> -    };
> +    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
>      size_t numpages_per_thread, leftover;
>      void *(*touch_fn)(void *);
> -    int ret = 0, i = 0;
> +    int ret, i = 0;
>      char *addr = area;
>  
> +    /*
> +     * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
> +     * and prealloc context for thread placement.
> +     */
> +    if (!use_madv_populate_write || !tc) {
> +        async = false;
> +    }
> +
> +    context->num_threads =
> +        get_memset_num_threads(hpagesize, numpages, max_threads);
> +
>      if (g_once_init_enter(&initialized)) {
>          qemu_mutex_init(&page_mutex);
>          qemu_cond_init(&page_cond);
> @@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>      }
>  
>      if (use_madv_populate_write) {
> -        /* Avoid creating a single thread for MADV_POPULATE_WRITE */
> -        if (context.num_threads == 1) {
> +        /*
> +         * Avoid creating a single thread for MADV_POPULATE_WRITE when
> +         * preallocating synchronously.
> +         */
> +        if (context->num_threads == 1 && !async) {
>              if (qemu_madvise(area, hpagesize * numpages,
>                               QEMU_MADV_POPULATE_WRITE)) {
>                  return -errno;
> @@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>          touch_fn = do_touch_pages;
>      }
>  
> -    context.threads = g_new0(MemsetThread, context.num_threads);
> -    numpages_per_thread = numpages / context.num_threads;
> -    leftover = numpages % context.num_threads;
> -    for (i = 0; i < context.num_threads; i++) {
> -        context.threads[i].addr = addr;
> -        context.threads[i].numpages = numpages_per_thread + (i < leftover);
> -        context.threads[i].hpagesize = hpagesize;
> -        context.threads[i].context = &context;
> +    context->threads = g_new0(MemsetThread, context->num_threads);
> +    numpages_per_thread = numpages / context->num_threads;
> +    leftover = numpages % context->num_threads;
> +    for (i = 0; i < context->num_threads; i++) {
> +        context->threads[i].addr = addr;
> +        context->threads[i].numpages = numpages_per_thread + (i < leftover);
> +        context->threads[i].hpagesize = hpagesize;
> +        context->threads[i].context = context;
>          if (tc) {
> -            thread_context_create_thread(tc, &context.threads[i].pgthread,
> +            thread_context_create_thread(tc, &context->threads[i].pgthread,
>                                           "touch_pages",
> -                                         touch_fn, &context.threads[i],
> +                                         touch_fn, &context->threads[i],
>                                           QEMU_THREAD_JOINABLE);
>          } else {
> -            qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
> -                               touch_fn, &context.threads[i],
> +            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
> +                               touch_fn, &context->threads[i],
>                                 QEMU_THREAD_JOINABLE);
>          }
> -        addr += context.threads[i].numpages * hpagesize;
> +        addr += context->threads[i].numpages * hpagesize;
> +    }
> +
> +    if (async) {
> +        /*
> +         * async requests currently require the BQL. Add it to the list and kick
> +         * preallocation off during qemu_finish_async_prealloc_mem().
> +         */
> +        assert(bql_locked());
> +        QLIST_INSERT_HEAD(&memset_contexts, context, next);
> +        return 0;
>      }
>  
>      if (!use_madv_populate_write) {
> -        sigbus_memset_context = &context;
> +        sigbus_memset_context = context;
>      }
>  
>      qemu_mutex_lock(&page_mutex);
> -    context.all_threads_created = true;
> +    context->all_threads_created = true;
>      qemu_cond_broadcast(&page_cond);
>      qemu_mutex_unlock(&page_mutex);
>  
> -    for (i = 0; i < context.num_threads; i++) {
> -        int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
> +    ret = wait_and_free_mem_prealloc_context(context);
>  
> +    if (!use_madv_populate_write) {
> +        sigbus_memset_context = NULL;
> +    }
> +    return ret;
> +}
> +
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> +    int ret, tmp;

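Shouldn't 'ret' above be initialized (e.g. to 0)? It is only assigned inside the loop when wait_and_free_mem_prealloc_context() returns non-zero, so it is read uninitialized by the 'if (ret)' check when every context succeeds.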

I did a build test and encountered the following error:

In file included from ../util/oslib-posix.c:36:
../util/oslib-posix.c: In function ‘qemu_finish_async_prealloc_mem’:
/home/libvirt/vm/software/qemu/include/qapi/error.h:334:5: error: ‘ret’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
  334 |     error_setg_errno_internal((errp), __FILE__, __LINE__, __func__,     \
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~
../util/oslib-posix.c:531:9: note: ‘ret’ was declared here
  531 |     int ret, tmp;
      |         ^~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.
make: *** [Makefile:162: run-ninja] Error 1

Thank you very much!

Dongli Zhang

> +    MemsetContext *context, *next_context;
> +
> +    /* Waiting for preallocation requires the BQL. */
> +    assert(bql_locked());
> +    if (QLIST_EMPTY(&memset_contexts)) {
> +        return true;
> +    }
> +
> +    qemu_mutex_lock(&page_mutex);
> +    QLIST_FOREACH(context, &memset_contexts, next) {
> +        context->all_threads_created = true;
> +    }
> +    qemu_cond_broadcast(&page_cond);
> +    qemu_mutex_unlock(&page_mutex);
> +
> +    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
> +        QLIST_REMOVE(context, next);
> +        tmp = wait_and_free_mem_prealloc_context(context);
>          if (tmp) {
>              ret = tmp;
>          }
>      }
>  
> -    if (!use_madv_populate_write) {
> -        sigbus_memset_context = NULL;
> +    if (ret) {
> +        error_setg_errno(errp, -ret,
> +                         "qemu_prealloc_mem: preallocating memory failed");
> +        return false;
>      }
> -    g_free(context.threads);
> -
> -    return ret;
> +    return true;
>  }
>  
>  static bool madv_populate_write_possible(char *area, size_t pagesize)
> @@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
>  }
>  
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp)
> +                       ThreadContext *tc, bool async, Error **errp)
>  {
>      static gsize initialized;
>      int ret;
> @@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
>      }
>  
>      /* touch pages simultaneously */
> -    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
> +    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
>                            use_madv_populate_write);
>      if (ret) {
>          error_setg_errno(errp, -ret,
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index c4a5f05a49..b623830d62 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -265,7 +265,7 @@ int getpagesize(void)
>  }
>  
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp)
> +                       ThreadContext *tc, bool async, Error **errp)
>  {
>      int i;
>      size_t pagesize = qemu_real_host_page_size();
> @@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
>      return true;
>  }
>  
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> +    /* async prealloc not supported, there is nothing to finish */
> +    return true;
> +}
> +
>  char *qemu_get_pid_name(pid_t pid)
>  {
>      /* XXX Implement me */


  parent reply	other threads:[~2024-02-03 22:44 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-31 16:53 [PATCH v4 0/1] Initialize backend memory objects in parallel Mark Kanda
2024-01-31 16:53 ` [PATCH v4 1/1] oslib-posix: initialize " Mark Kanda
2024-01-31 18:22   ` David Hildenbrand
2024-02-02  8:30     ` Mario Casquero
2024-02-03 22:43   ` Dongli Zhang [this message]
2024-02-04 16:54     ` David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8bec3d5c-03a2-02bc-ffff-4b46beeb206d@oracle.com \
    --to=dongli.zhang@oracle.com \
    --cc=berrange@redhat.com \
    --cc=david@redhat.com \
    --cc=mark.kanda@oracle.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).