From: Mark Kanda <mark.kanda@oracle.com>
To: "David Hildenbrand" <david@redhat.com>,
	"Daniel P. Berrangé" <berrange@redhat.com>
Cc: qemu-devel@nongnu.org, pbonzini@redhat.com
Subject: Re: [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads
Date: Tue, 9 Jan 2024 12:42:01 -0600
Message-ID: <c44c60b2-bc82-4884-a7df-98b327737495@oracle.com>
In-Reply-To: <fc3a6a13-6a2d-44c4-9cf0-a91dc2cf1b97@redhat.com>



On 1/9/24 8:25 AM, David Hildenbrand wrote:
> On 09.01.24 15:15, Daniel P. Berrangé wrote:
>> On Tue, Jan 09, 2024 at 03:02:00PM +0100, David Hildenbrand wrote:
>>> On 08.01.24 19:40, Mark Kanda wrote:
>>>> On 1/8/24 9:40 AM, David Hildenbrand wrote:
>>>>> On 08.01.24 16:10, Mark Kanda wrote:
>>>>>> Refactor the memory prealloc threads support:
>>>>>> - Make memset context a global qlist
>>>>>> - Move the memset thread join/cleanup code to a separate routine
>>>>>>
>>>>>> This is functionally equivalent and facilitates multiple memset contexts
>>>>>> (used in a subsequent patch).
>>>>>>
>>>>>> Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
>>>>>> ---
>>>>>>  util/oslib-posix.c | 104 +++++++++++++++++++++++++++++----------------
>>>>>>  1 file changed, 68 insertions(+), 36 deletions(-)
>>>>>>
>>>>>> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
>>>>>> index e86fd64e09..293297ac6c 100644
>>>>>> --- a/util/oslib-posix.c
>>>>>> +++ b/util/oslib-posix.c
>>>>>> @@ -63,11 +63,15 @@
>>>>>>
>>>>>>  struct MemsetThread;
>>>>>>
>>>>>> +static QLIST_HEAD(, MemsetContext) memset_contexts =
>>>>>> +    QLIST_HEAD_INITIALIZER(memset_contexts);
>>>>>> +
>>>>>>  typedef struct MemsetContext {
>>>>>>      bool all_threads_created;
>>>>>>      bool any_thread_failed;
>>>>>>      struct MemsetThread *threads;
>>>>>>      int num_threads;
>>>>>> +    QLIST_ENTRY(MemsetContext) next;
>>>>>>  } MemsetContext;
>>>>>>
>>>>>>  struct MemsetThread {
>>>>>> @@ -81,7 +85,7 @@ struct MemsetThread {
>>>>>>  typedef struct MemsetThread MemsetThread;
>>>>>>
>>>>>>  /* used by sigbus_handler() */
>>>>>> -static MemsetContext *sigbus_memset_context;
>>>>>> +static bool sigbus_memset_context;
>>>>>>  struct sigaction sigbus_oldact;
>>>>>>  static QemuMutex sigbus_mutex;
>>>>>>
>>>>>> @@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
>>>>>>  #endif /* CONFIG_LINUX */
>>>>>>  {
>>>>>>      int i;
>>>>>> +    MemsetContext *context;
>>>>>>
>>>>>>      if (sigbus_memset_context) {
>>>>>> -        for (i = 0; i < sigbus_memset_context->num_threads; i++) {
>>>>>> -            MemsetThread *thread = &sigbus_memset_context->threads[i];
>>>>>> +        QLIST_FOREACH(context, &memset_contexts, next) {
>>>>>> +            for (i = 0; i < context->num_threads; i++) {
>>>>>> +                MemsetThread *thread = &context->threads[i];
>>>>>>
>>>>>> -            if (qemu_thread_is_self(&thread->pgthread)) {
>>>>>> -                siglongjmp(thread->env, 1);
>>>>>> +                if (qemu_thread_is_self(&thread->pgthread)) {
>>>>>> +                    siglongjmp(thread->env, 1);
>>>>>> +                }
>>>>>>              }
>>>>>>          }
>>>>>>      }
>>>>>> @@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>>>>>>                             bool use_madv_populate_write)
>>>>>>  {
>>>>>>      static gsize initialized = 0;
>>>>>> -    MemsetContext context = {
>>>>>> -        .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
>>>>>> -    };
>>>>>> +    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
>>>>>>      size_t numpages_per_thread, leftover;
>>>>>>      void *(*touch_fn)(void *);
>>>>>> -    int ret = 0, i = 0;
>>>>>> +    int i = 0;
>>>>>>      char *addr = area;
>>>>>>
>>>>>> +    context->num_threads =
>>>>>> +        get_memset_num_threads(hpagesize, numpages, max_threads);
>>>>>> +
>>>>>>      if (g_once_init_enter(&initialized)) {
>>>>>>          qemu_mutex_init(&page_mutex);
>>>>>>          qemu_cond_init(&page_cond);
>>>>>> @@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>>>>>>
>>>>>>      if (use_madv_populate_write) {
>>>>>>          /* Avoid creating a single thread for MADV_POPULATE_WRITE */
>>>>>> -        if (context.num_threads == 1) {
>>>>>> +        if (context->num_threads == 1) {
>>>>>>              if (qemu_madvise(area, hpagesize * numpages,
>>>>>>                               QEMU_MADV_POPULATE_WRITE)) {
>>>>>>                  return -errno;
>>>>>> @@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>>>>>>          touch_fn = do_touch_pages;
>>>>>>      }
>>>>>>
>>>>>> -    context.threads = g_new0(MemsetThread, context.num_threads);
>>>>>> -    numpages_per_thread = numpages / context.num_threads;
>>>>>> -    leftover = numpages % context.num_threads;
>>>>>> -    for (i = 0; i < context.num_threads; i++) {
>>>>>> -        context.threads[i].addr = addr;
>>>>>> -        context.threads[i].numpages = numpages_per_thread + (i < leftover);
>>>>>> -        context.threads[i].hpagesize = hpagesize;
>>>>>> -        context.threads[i].context = &context;
>>>>>> +    context->threads = g_new0(MemsetThread, context->num_threads);
>>>>>> +    numpages_per_thread = numpages / context->num_threads;
>>>>>> +    leftover = numpages % context->num_threads;
>>>>>> +    for (i = 0; i < context->num_threads; i++) {
>>>>>> +        context->threads[i].addr = addr;
>>>>>> +        context->threads[i].numpages = numpages_per_thread + (i < leftover);
>>>>>> +        context->threads[i].hpagesize = hpagesize;
>>>>>> +        context->threads[i].context = context;
>>>>>>          if (tc) {
>>>>>> -            thread_context_create_thread(tc, &context.threads[i].pgthread,
>>>>>> +            thread_context_create_thread(tc, &context->threads[i].pgthread,
>>>>>>                                           "touch_pages",
>>>>>> -                                         touch_fn, &context.threads[i],
>>>>>> +                                         touch_fn, &context->threads[i],
>>>>>>                                           QEMU_THREAD_JOINABLE);
>>>>>>          } else {
>>>>>> -            qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
>>>>>> -                               touch_fn, &context.threads[i],
>>>>>> +            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
>>>>>> +                               touch_fn, &context->threads[i],
>>>>>>                                 QEMU_THREAD_JOINABLE);
>>>>>>          }
>>>>>> -        addr += context.threads[i].numpages * hpagesize;
>>>>>> +        addr += context->threads[i].numpages * hpagesize;
>>>>>>      }
>>>>>>
>>>>>> +    QLIST_INSERT_HEAD(&memset_contexts, context, next);
>>>>>> +
>>>>>>      if (!use_madv_populate_write) {
>>>>>> -        sigbus_memset_context = &context;
>>>>>> +        sigbus_memset_context = true;
>>>>>
>>>> Thanks David,
>>>>
>>>>> Could we just leave the sigbus handling alone and support parallel init
>>>>> only when using MADV_POPULATE_WRITE, where we don't have to mess with
>>>>> signal handlers?
>>>>>
>>>>
>>>> Ideally, we're hoping to support this with earlier kernels which don't
>>>> support MADV_POPULATE_WRITE. But I will check to see if we really
>>>> need it.
>>>
>>> That's been around since Linux 5.14, so please try simplifying.
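
Right - and whether the running kernel actually accepts it is cheap to probe
up front; roughly like this (untested sketch, the helper name is made up):

/* Sketch: probe once whether this kernel accepts MADV_POPULATE_WRITE. */
static bool madv_populate_write_possible(char *area, size_t pagesize)
{
    /* Kernels without MADV_POPULATE_WRITE fail the advice with EINVAL. */
    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
           errno != EINVAL;
}
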
>>>
>>>>
>>>>> Further, how do your changes interact with other callers of
>>>>> qemu_prealloc_mem(), like virtio-mem?
>>>>>
>>>>
>>>> I'm not familiar with the intricacies of virtio-mem, but the basic 
>>>> idea
>>>> of this series is to *only* allow parallel init during the startup
>>>> phase (while prealloc_init == false). Once we have parsed all the
>>>> command line args, we set prealloc_init = true
>>>> (wait_mem_prealloc_init()) which causes all subsequent calls to
>>>> qemu_prealloc_mem() to perform initialization synchronously. So, I
>>>> *think* this covers the virtio-mem use case. Am I missing something?
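
To put that gating into code form, the idea is roughly the following
(illustrative sketch only, not the actual patch; the join helper name is
made up):

/* Illustrative sketch of the startup gating described above. */
static bool prealloc_init;      /* set once the command line has been parsed */

void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
                       ThreadContext *tc, Error **errp)
{
    /* ... spawn the touch/madvise threads as today ... */

    if (prealloc_init) {
        /* Past startup (e.g. virtio-mem): join right away, i.e. synchronous. */
        wait_mem_prealloc_threads(errp);    /* made-up helper name */
    }
    /*
     * During startup the threads are left running; wait_mem_prealloc_init()
     * joins them after command-line parsing and then sets prealloc_init = true.
     */
}
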
>>>
>>> Good, so this should likely not affect virtio-mem (which also ends up
>>> preallocating memory when loading from a vmstate).
>>>
>>> To make this all a bit clearer, what about the following:
>>>
>>> * Optimize for MADV_POPULATE_WRITE. If we really need support for
>>>    !MADV_POPULATE_WRITE, this is better added on top later.
>>> * Pass in via a parameter that the caller requests async handling.
>>>    "bool async" should be good enough. Then, pass that only from the memory
>>>    backend call, while QEMU is still initializing (we can find a way to
>>>    make that work).
>>> * Provide a function that waits for any remaining async os-mem-prealloc
>>>    activity. That is essentially "wait_mem_prealloc_init", but without
>>>    the special internal flag handling.
>>>
>>> Further, I do wonder if we want to make that behavior configurable. For
>>> example, one might want to initialize backends sequentially using 16 threads
>>> max, instead of consuming multiple times 16 threads concurrently.
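
Taking the three bullets above literally, I think the external shape for v2
would be roughly this (sketch only; the parameter and the name of the wait
function are placeholders, nothing is settled):

/* Sketch of the suggested interface; names are placeholders. */

/*
 * Memory backends pass async=true while QEMU is still initializing
 * (MADV_POPULATE_WRITE only); all other callers pass async=false and keep
 * today's synchronous behaviour.
 */
void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
                       ThreadContext *tc, bool async, Error **errp);

/*
 * Called once after all backends have been created: joins any remaining
 * async prealloc work and reports the first error, if any.
 */
bool qemu_finish_async_prealloc_mem(Error **errp);
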
>>
>> Seems to me that no matter what parallelisation we use (within
>> mem regions, or across mem regions, or a mix of both), we should
>> never use more threads than there are host CPUs.
>
> Yes. It gets tricky with multiple NUMA nodes, though. And that's what's
> being requested here.
>
>>
>> Can we have a pool of threads sized to the available host CPUs that
>> QEMU can access? Then for each memory backend, fire off a set of
>> init jobs that get distributed to "appropriate" threads in the
>> pool. By appropriate I mean threads with the same NUMA affinity
>> as the memory backend. This would give parallelisation both
>> within a single large memory region, as well as across memory
>> regions.
>>
>>
>> e.g. 4 host CPUs, 3 memory regions (1GB for the 1st NUMA node, 1GB
>> for the 2nd NUMA node, and 1GB with no NUMA affinity). If we
>> spread the init work, then we end up with:
>
> I'll note that having a mixture of NUMA + no NUMA is not a common
> config we should try to optimize for.
>
>>
>>   1st thread gets 500MB to init from 1st memory region, and
>>       250MB to init from 3rd memory region
>>
>>   2nd thread gets 500MB to init from 1st memory region, and
>>       250MB to init from 3rd memory region
>>
>>   3rd thread gets 500MB to init from 2nd memory region, and
>>       250MB to init from 3rd memory region
>>
>>   4th thread gets 500MB to init from 2nd memory region, and
>>       250MB to init from 3rd memory region
>
> We do have prealloc contexts that are used to craft new threads with the
> right NUMA affinity. Prealloc contexts are set for memory backends. So 
> extending prealloc context to limit/reuse threads and do the pooling 
> might be possible.
>
>>
>>> Could either be a machine option or a memory backend option (async-prealloc=on).
>>> When hotplugging such backends, we would disable it for now, but could allow
>>> it in the future.
>>
>> IMHO "do the right thing" without user config is preferable to adding
>> yet more CLI options here.
>
> Sure, if there is an easy way.
>

Thank you David and Daniel for your feedback. I plan to address your 
suggestions in v2 (including optimizing for MADV_POPULATE_WRITE).
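
For the MADV_POPULATE_WRITE-only direction, I currently picture the deferred
join along these lines (untested sketch; the per-context join/cleanup helper
is the one this patch factors out, and its final name may differ):

/* Untested sketch: once the command line has been fully processed, join all
 * outstanding prealloc contexts, then make later callers synchronous again.
 */
void wait_mem_prealloc_init(void)
{
    MemsetContext *context, *next_context;

    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
        QLIST_REMOVE(context, next);
        wait_and_free_memset_context(context);  /* join threads, free context */
    }
    prealloc_init = true;   /* subsequent qemu_prealloc_mem() is synchronous */
}
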

Best regards,
-Mark


