From: Mark Kanda <mark.kanda@oracle.com>
To: "David Hildenbrand" <david@redhat.com>,
"Daniel P. Berrangé" <berrange@redhat.com>
Cc: qemu-devel@nongnu.org, pbonzini@redhat.com
Subject: Re: [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads
Date: Tue, 9 Jan 2024 12:42:01 -0600 [thread overview]
Message-ID: <c44c60b2-bc82-4884-a7df-98b327737495@oracle.com> (raw)
In-Reply-To: <fc3a6a13-6a2d-44c4-9cf0-a91dc2cf1b97@redhat.com>
On 1/9/24 8:25 AM, David Hildenbrand wrote:
> On 09.01.24 15:15, Daniel P. Berrangé wrote:
>> On Tue, Jan 09, 2024 at 03:02:00PM +0100, David Hildenbrand wrote:
>>> On 08.01.24 19:40, Mark Kanda wrote:
>>>> On 1/8/24 9:40 AM, David Hildenbrand wrote:
>>>>> On 08.01.24 16:10, Mark Kanda wrote:
>>>>>> Refactor the memory prealloc threads support:
>>>>>> - Make memset context a global qlist
>>>>>> - Move the memset thread join/cleanup code to a separate routine
>>>>>>
>>>>>> This is functionally equivalent and facilitates multiple memset
>>>>>> contexts
>>>>>> (used in a subsequent patch).
>>>>>>
>>>>>> Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
>>>>>> ---
>>>>>> util/oslib-posix.c | 104
>>>>>> +++++++++++++++++++++++++++++----------------
>>>>>> 1 file changed, 68 insertions(+), 36 deletions(-)
>>>>>>
>>>>>> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
>>>>>> index e86fd64e09..293297ac6c 100644
>>>>>> --- a/util/oslib-posix.c
>>>>>> +++ b/util/oslib-posix.c
>>>>>> @@ -63,11 +63,15 @@
>>>>>> struct MemsetThread;
>>>>>> +static QLIST_HEAD(, MemsetContext) memset_contexts =
>>>>>> + QLIST_HEAD_INITIALIZER(memset_contexts);
>>>>>> +
>>>>>> typedef struct MemsetContext {
>>>>>> bool all_threads_created;
>>>>>> bool any_thread_failed;
>>>>>> struct MemsetThread *threads;
>>>>>> int num_threads;
>>>>>> + QLIST_ENTRY(MemsetContext) next;
>>>>>> } MemsetContext;
>>>>>> struct MemsetThread {
>>>>>> @@ -81,7 +85,7 @@ struct MemsetThread {
>>>>>> typedef struct MemsetThread MemsetThread;
>>>>>> /* used by sigbus_handler() */
>>>>>> -static MemsetContext *sigbus_memset_context;
>>>>>> +static bool sigbus_memset_context;
>>>>>> struct sigaction sigbus_oldact;
>>>>>> static QemuMutex sigbus_mutex;
>>>>>> @@ -295,13 +299,16 @@ static void sigbus_handler(int signal)
>>>>>> #endif /* CONFIG_LINUX */
>>>>>> {
>>>>>> int i;
>>>>>> + MemsetContext *context;
>>>>>> if (sigbus_memset_context) {
>>>>>> - for (i = 0; i < sigbus_memset_context->num_threads; i++) {
>>>>>> - MemsetThread *thread =
>>>>>> &sigbus_memset_context->threads[i];
>>>>>> + QLIST_FOREACH(context, &memset_contexts, next) {
>>>>>> + for (i = 0; i < context->num_threads; i++) {
>>>>>> + MemsetThread *thread = &context->threads[i];
>>>>>> - if (qemu_thread_is_self(&thread->pgthread)) {
>>>>>> - siglongjmp(thread->env, 1);
>>>>>> + if (qemu_thread_is_self(&thread->pgthread)) {
>>>>>> + siglongjmp(thread->env, 1);
>>>>>> + }
>>>>>> }
>>>>>> }
>>>>>> }
>>>>>> @@ -417,14 +424,15 @@ static int touch_all_pages(char *area, size_t
>>>>>> hpagesize, size_t numpages,
>>>>>> bool use_madv_populate_write)
>>>>>> {
>>>>>> static gsize initialized = 0;
>>>>>> - MemsetContext context = {
>>>>>> - .num_threads = get_memset_num_threads(hpagesize, numpages,
>>>>>> max_threads),
>>>>>> - };
>>>>>> + MemsetContext *context = g_malloc0(sizeof(MemsetContext));
>>>>>> size_t numpages_per_thread, leftover;
>>>>>> void *(*touch_fn)(void *);
>>>>>> - int ret = 0, i = 0;
>>>>>> + int i = 0;
>>>>>> char *addr = area;
>>>>>> + context->num_threads =
>>>>>> + get_memset_num_threads(hpagesize, numpages, max_threads);
>>>>>> +
>>>>>> if (g_once_init_enter(&initialized)) {
>>>>>> qemu_mutex_init(&page_mutex);
>>>>>> qemu_cond_init(&page_cond);
>>>>>> @@ -433,7 +441,7 @@ static int touch_all_pages(char *area, size_t
>>>>>> hpagesize, size_t numpages,
>>>>>> if (use_madv_populate_write) {
>>>>>> /* Avoid creating a single thread for
>>>>>> MADV_POPULATE_WRITE */
>>>>>> - if (context.num_threads == 1) {
>>>>>> + if (context->num_threads == 1) {
>>>>>> if (qemu_madvise(area, hpagesize * numpages,
>>>>>> QEMU_MADV_POPULATE_WRITE)) {
>>>>>> return -errno;
>>>>>> @@ -445,49 +453,74 @@ static int touch_all_pages(char *area, size_t
>>>>>> hpagesize, size_t numpages,
>>>>>> touch_fn = do_touch_pages;
>>>>>> }
>>>>>> - context.threads = g_new0(MemsetThread,
>>>>>> context.num_threads);
>>>>>> - numpages_per_thread = numpages / context.num_threads;
>>>>>> - leftover = numpages % context.num_threads;
>>>>>> - for (i = 0; i < context.num_threads; i++) {
>>>>>> - context.threads[i].addr = addr;
>>>>>> - context.threads[i].numpages = numpages_per_thread + (i <
>>>>>> leftover);
>>>>>> - context.threads[i].hpagesize = hpagesize;
>>>>>> - context.threads[i].context = &context;
>>>>>> + context->threads = g_new0(MemsetThread, context->num_threads);
>>>>>> + numpages_per_thread = numpages / context->num_threads;
>>>>>> + leftover = numpages % context->num_threads;
>>>>>> + for (i = 0; i < context->num_threads; i++) {
>>>>>> + context->threads[i].addr = addr;
>>>>>> + context->threads[i].numpages = numpages_per_thread + (i <
>>>>>> leftover);
>>>>>> + context->threads[i].hpagesize = hpagesize;
>>>>>> + context->threads[i].context = context;
>>>>>> if (tc) {
>>>>>> - thread_context_create_thread(tc,
>>>>>> &context.threads[i].pgthread,
>>>>>> + thread_context_create_thread(tc,
>>>>>> &context->threads[i].pgthread,
>>>>>> "touch_pages",
>>>>>> - touch_fn,
>>>>>> &context.threads[i],
>>>>>> + touch_fn,
>>>>>> &context->threads[i],
>>>>>> QEMU_THREAD_JOINABLE);
>>>>>> } else {
>>>>>> - qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
>>>>>> - touch_fn, &context.threads[i],
>>>>>> + qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
>>>>>> + touch_fn, &context->threads[i],
>>>>>> QEMU_THREAD_JOINABLE);
>>>>>> }
>>>>>> - addr += context.threads[i].numpages * hpagesize;
>>>>>> + addr += context->threads[i].numpages * hpagesize;
>>>>>> }
>>>>>> + QLIST_INSERT_HEAD(&memset_contexts, context, next);
>>>>>> +
>>>>>> if (!use_madv_populate_write) {
>>>>>> - sigbus_memset_context = &context;
>>>>>> + sigbus_memset_context = true;
>>>>>
>>>> Thanks David,
>>>>
>>>>> Could we just use the sigbus handling alone and support parallel init
>>>>> only when using MADV_POPULATE_WRITE where we don't have to mess with
>>>>> signal handlers?
>>>>>
>>>>
>>>> Ideally, we're hoping to support this with earlier kernels which don't
>>>> support MADV_POPULATE _WRITE. But, I will check to see if we really
>>>> need it.
>>>
>>> That's around since Linux 5.14, so please try simplifying.
>>>
>>>>
>>>>> Further, how do you changes interact with other callers of
>>>>> qemu_prealloc_mem(), like virtio-mem?
>>>>>
>>>>
>>>> I'm not familiar with the intricacies of virtio-mem, but the basic
>>>> idea
>>>> of this series is to *only* allow parallel init during the start up
>>>> phase (while prealloc_init == false). Once we have parsed all the
>>>> command line args, we set prealloc_init = true
>>>> (wait_mem_prealloc_init()) which causes all subsequent calls to
>>>> qemu_prealloc_mem() to perform initialization synchronously. So, I
>>>> *think* this covers the virtio-mem use case. Am I missing something?
>>>
>>> Good, so this should likely not affect virtio-mem (which also ends up
>>> preallocating memory when loading from a vmstate).
>>>
>>> To make this all a bit clearer, what about the following to make this:
>>>
>>> * Optimize for MADV_POPULATE_WRITE. If we really need support for
>>> !MADV_POPULATE_WRITE, this is better added on top later.
>>> * Pass in via a parameter that the caller requests async handling.
>>> "bool
>>> async" should be good enough. Then, pass that only from the memory
>>> backend call, while QEMU is still initializing (we can find a way to
>>> make that work).
>>> * Provide a function that waits for any remaining async os-mem-prealloc
>>> activity. That is essentially "wait_mem_prealloc_init", but without
>>> the special internal flag handling.
>>>
>>> Further, I do wonder if we want to make that behavior configurable. For
>>> example, one might want to initialize backends sequentially using 16
>>> threads
>>> max, instead of consuming multiple times 16 threads concurrently.
>>
>> Seems to me that no matter what parallelisation we use (within
>> mem regions, or across mem regions, or a mix of both), we should
>> never use more threads than there are host CPUs.
>
> Yes. It gets tricky with multipe NUMA nodes, though. And that's what's
> being requested here.
>
>>
>> Can we have a pool of threads sized per available host CPUs that
>> QEMU can access. Then for each memory backend fire off a set of
>> init jobs that get distributed to "appropriate" threads in the
>> pool. By appropriate I mean threads with the same NUMA affinity
>> as the memory backend. This would give parallelisation both
>> within a single large memory region, as well as across memory
>> regions.
>>
>>
>> eg 4 host CPUs, 3 memory regions (1GB for 1st numa node, 1GB
>> for 2nd numa node, and 1 GB with no numa affinity). If we
>> spread the init work then we end up with
>
> I'll note that having a mixture of numa + no numa is not a common
> config we should try to optimize.
>
>>
>> 1st thread gets 500MB to init from 1st memory region, and
>> 250MB to init from 3rd memory region
>>
>> 2st thread gets 500MB to init from 1st memory region, and
>> 250MB to init from 3rd memory region
>>
>> 3st thread gets 500MB to init from 2nd memory region, and
>> 250MB to init from 3rd memory region
>>
>> 4th thread gets 500MB to init from 2nd memory region, and
>> 250MB to init from 3rd memory region
>
> We do have prealloc contexts that are used to craft new CPUs with the
> right NUMA affinity. Prealloc contexts are set for memory backends. So
> extending prealloc context to limit/reuse threads and do the pooling
> might be possible.
>
>>
>>> Could
>>> either be a machine option or a memory backend option
>>> (async-prealloc=on).
>>> Once hotplugging such backends, we would disable it for now, but
>>> could allow
>>> it in the future.
>>
>> IMHO "do the right thing" without user config is preferrable to adding
>> yet more cli options here.
>
> Sure, if there is an easy way.
>
Thank you David and Daniel for your feedback. I plan to address your
suggestions in v2 (including optimizing for MADV_POPULATE_WRITE).
Best regards,
-Mark
next prev parent reply other threads:[~2024-01-09 18:42 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-08 15:10 [PATCH v1 0/2] Initialize backend memory objects in parallel Mark Kanda
2024-01-08 15:10 ` [PATCH v1 1/2] oslib-posix: refactor memory prealloc threads Mark Kanda
2024-01-08 15:40 ` David Hildenbrand
2024-01-08 18:40 ` Mark Kanda
2024-01-09 14:02 ` David Hildenbrand
2024-01-09 14:15 ` Daniel P. Berrangé
2024-01-09 14:25 ` David Hildenbrand
2024-01-09 18:42 ` Mark Kanda [this message]
2024-01-17 14:44 ` Mark Kanda
2024-01-17 14:51 ` David Hildenbrand
2024-01-08 15:10 ` [PATCH v1 2/2] oslib-posix: initialize backend memory objects in parallel Mark Kanda
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c44c60b2-bc82-4884-a7df-98b327737495@oracle.com \
--to=mark.kanda@oracle.com \
--cc=berrange@redhat.com \
--cc=david@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).