All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chegu Vinod <chegu_vinod@hp.com>
To: Eduardo Habkost <ehabkost@redhat.com>
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: Re: [Qemu-devel] [PATCH] Fixes related to processing of qemu's -numa option
Date: Mon, 18 Jun 2012 14:55:14 -0700	[thread overview]
Message-ID: <4FDFA3C2.9030602@hp.com> (raw)
In-Reply-To: <20120618202956.GA32417@otherpad.lan.raisama.net>

On 6/18/2012 1:29 PM, Eduardo Habkost wrote:
> On Sun, Jun 17, 2012 at 01:12:31PM -0700, Chegu Vinod wrote:
>> The -numa option to qemu is used to create [fake] numa nodes
>> and expose them to the guest OS instance.
>>
>> There are a couple of issues with the -numa option:
>>
>> a) Max VCPU's that can be specified for a guest while using
>>     the qemu's -numa option is 64. Due to a typecasting issue
>>     when the number of VCPUs is>  32 the VCPUs don't show up
>>     under the specified [fake] numa nodes.
>>
>> b) KVM currently has support for 160VCPUs per guest. The
>>     qemu's -numa option has only support for upto 64VCPUs
>>     per guest.
>>
>> This patch addresses these two issues. [ Note: This
>> patch has been verified by Eduardo Habkost ].
> I was going to add a Tested-by line, but this patch breaks the automatic
> round-robin assignment when no CPU is added to any node on the
> command-line.

Pl. see below.
>
> Additional questions below:
>
> [...]
>> diff --git a/cpus.c b/cpus.c
>> index b182b3d..f9cee60 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -1145,7 +1145,7 @@ void set_numa_modes(void)
>>
>>       for (env = first_cpu; env != NULL; env = env->next_cpu) {
>>           for (i = 0; i<  nb_numa_nodes; i++) {
>> -            if (node_cpumask[i]&  (1<<  env->cpu_index)) {
>> +            if (CPU_ISSET_S(env->cpu_index, cpumask_size, node_cpumask[i])) {
>>                   env->numa_node = i;
>>               }
>>           }
>> diff --git a/hw/pc.c b/hw/pc.c
>> index 8368701..f0c3665 100644
>> --- a/hw/pc.c
>> +++ b/hw/pc.c
>> @@ -639,7 +639,7 @@ static void *bochs_bios_init(void)
>>       numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
>>       for (i = 0; i<  max_cpus; i++) {
>>           for (j = 0; j<  nb_numa_nodes; j++) {
>> -            if (node_cpumask[j]&  (1<<  i)) {
>> +            if (CPU_ISSET_S(i, cpumask_size, node_cpumask[j])) {
>>                   numa_fw_cfg[i + 1] = cpu_to_le64(j);
>>                   break;
>>               }
>> diff --git a/sysemu.h b/sysemu.h
>> index bc2c788..6e4d342 100644
>> --- a/sysemu.h
>> +++ b/sysemu.h
>> @@ -9,6 +9,7 @@
>>   #include "qapi-types.h"
>>   #include "notify.h"
>>   #include "main-loop.h"
>> +#include<sched.h>
>>
>>   /* vl.c */
>>
>> @@ -133,9 +134,11 @@ extern uint8_t qemu_extra_params_fw[2];
>>   extern QEMUClock *rtc_clock;
>>
>>   #define MAX_NODES 64
>> +#define KVM_MAX_VCPUS 254
> Do we really need this constant? Why not just use max_cpus when
> allocating the CPU sets, instead?

Hmm.... I had thought about this earlier too max_cpus was not getting 
set at the point where the allocations were being done. I have now moved 
that code to a bit later and switched to using
max_cpus.

>
>
>>   extern int nb_numa_nodes;
>>   extern uint64_t node_mem[MAX_NODES];
>> -extern uint64_t node_cpumask[MAX_NODES];
>> +extern cpu_set_t *node_cpumask[MAX_NODES];
>> +extern size_t cpumask_size;
>>
>>   #define MAX_OPTION_ROMS 16
>>   typedef struct QEMUOptionRom {
>> diff --git a/vl.c b/vl.c
>> index 204d85b..1906412 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -28,6 +28,7 @@
>>   #include<errno.h>
>>   #include<sys/time.h>
>>   #include<zlib.h>
>> +#include<sched.h>
>>
>>   /* Needed early for CONFIG_BSD etc. */
>>   #include "config-host.h"
>> @@ -240,7 +241,8 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order
>>
>>   int nb_numa_nodes;
>>   uint64_t node_mem[MAX_NODES];
>> -uint64_t node_cpumask[MAX_NODES];
>> +cpu_set_t *node_cpumask[MAX_NODES];
>> +size_t cpumask_size;
>>
>>   uint8_t qemu_uuid[16];
>>
>> @@ -950,6 +952,9 @@ static void numa_add(const char *optarg)
>>       char *endptr;
>>       unsigned long long value, endvalue;
>>       int nodenr;
>> +    int i;
>> +
>> +    value = endvalue = 0;
>>
>>       optarg = get_opt_name(option, 128, optarg, ',') + 1;
>>       if (!strcmp(option, "node")) {
>> @@ -970,27 +975,17 @@ static void numa_add(const char *optarg)
>>               }
>>               node_mem[nodenr] = sval;
>>           }
>> -        if (get_param_value(option, 128, "cpus", optarg) == 0) {
>> -            node_cpumask[nodenr] = 0;
>> -        } else {
>> +        if (get_param_value(option, 128, "cpus", optarg) != 0) {
>>               value = strtoull(option,&endptr, 10);
>> -            if (value>= 64) {
>> -                value = 63;
>> -                fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
>> +            if (*endptr == '-') {
>> +                endvalue = strtoull(endptr+1,&endptr, 10);
>>               } else {
>> -                if (*endptr == '-') {
>> -                    endvalue = strtoull(endptr+1,&endptr, 10);
>> -                    if (endvalue>= 63) {
>> -                        endvalue = 62;
>> -                        fprintf(stderr,
>> -                            "only 63 CPUs in NUMA mode supported.\n");
>> -                    }
>> -                    value = (2ULL<<  endvalue) - (1ULL<<  value);
>> -                } else {
>> -                    value = 1ULL<<  value;
>> -                }
>> +                endvalue = value;
>> +            }
>> +
>> +            for (i = value; i<= endvalue; ++i) {
>> +                CPU_SET_S(i, cpumask_size, node_cpumask[nodenr]);
> What if endvalue is larger than the cpu mask size we allocated?

I can add a check (endvalue >= max_cpus) and print an error.
Should we force set the endvalue to max_cpus-1 in that case ?


>
>
>>               }
>> -            node_cpumask[nodenr] = value;
>>           }
>>           nb_numa_nodes++;
>>       }
>> @@ -2331,7 +2326,9 @@ int main(int argc, char **argv, char **envp)
>>
>>       for (i = 0; i<  MAX_NODES; i++) {
>>           node_mem[i] = 0;
>> -        node_cpumask[i] = 0;
>> +        node_cpumask[i] = CPU_ALLOC(KVM_MAX_VCPUS);
>> +        cpumask_size = CPU_ALLOC_SIZE(KVM_MAX_VCPUS);
>> +        CPU_ZERO_S(cpumask_size, node_cpumask[i]);
>>       }
>>
>>       nb_numa_nodes = 0;
>> @@ -3465,8 +3462,9 @@ int main(int argc, char **argv, char **envp)
>>           }
>>
>>           for (i = 0; i<  nb_numa_nodes; i++) {
>> -            if (node_cpumask[i] != 0)
>> +            if (node_cpumask[i] != NULL) {
> This will be true for every node (as you preallocate all the CPU
> sets in the beginning), so the loop will always end with i==0, in turn
> unconditionally disabling the round-robin CPU assignment code below.
> You can use CPU_COUNT_S, instead.

Argh... I should have tested this. My bad !  Thanks for catching this. I 
made the change and tested it and its doing the round-robin assignment now.

Will post the next version of the patch soon.

Thanks for your feedback.
Vinod
>
>>                   break;
>> +            }
>>           }
>>           /* assigning the VCPUs round-robin is easier to implement, guest OSes
>>            * must cope with this anyway, because there are BIOSes out there in
>> @@ -3474,7 +3472,7 @@ int main(int argc, char **argv, char **envp)
>>            */
>>           if (i == nb_numa_nodes) {
>>               for (i = 0; i<  max_cpus; i++) {
>> -                node_cpumask[i % nb_numa_nodes] |= 1<<  i;
>> +                CPU_SET_S(i, cpumask_size, node_cpumask[i % nb_numa_nodes]);
>>               }
>>           }
>>       }
>> -- 
>> 1.7.1
>>


  reply	other threads:[~2012-06-18 21:55 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-06-17 20:12 [PATCH] Fixes related to processing of qemu's -numa option Chegu Vinod
2012-06-17 20:12 ` [Qemu-devel] " Chegu Vinod
2012-06-18 20:29 ` Eduardo Habkost
2012-06-18 20:29   ` [Qemu-devel] " Eduardo Habkost
2012-06-18 21:55   ` Chegu Vinod [this message]
2012-06-18 22:05 ` Andreas Färber
2012-06-18 22:05   ` [Qemu-devel] " Andreas Färber
2012-06-18 22:11   ` Eric Blake
2012-06-18 22:11     ` Eric Blake
2012-06-18 22:15     ` Chegu Vinod
2012-06-18 22:15       ` Chegu Vinod
     [not found]     ` <4FE0638F.9080200@hp.com>
     [not found]       ` <20120619203502.GB5073@otherpad.lan.raisama.net>
2012-06-19 20:51         ` Stefan Weil

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4FDFA3C2.9030602@hp.com \
    --to=chegu_vinod@hp.com \
    --cc=ehabkost@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.