From: Andre Przywara <andre.przywara@amd.com>
To: <avi@redhat.com>, <anthony@codemonkey.ws>
Cc: <kvm@vger.kernel.org>, Andre Przywara <andre.przywara@amd.com>
Subject: [PATCH 1/4] NUMA: change existing NUMA guest code to use new bitmap implementation
Date: Wed, 11 Aug 2010 15:52:15 +0200 [thread overview]
Message-ID: <1281534738-8310-2-git-send-email-andre.przywara@amd.com> (raw)
In-Reply-To: <1281534738-8310-1-git-send-email-andre.przywara@amd.com>
The current NUMA guest implementation uses a "poor-man's-bitmap"
consisting of a single uint64_t. This patch reworks this by
leveraging the new generic bitmap code and thus lifts the 64-VCPU
limit for NUMA guests.
Besides that, it improves the NUMA data structures in preparation
for future host binding code.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
cpus.c | 2 +-
hw/pc.c | 4 +-
monitor.c | 2 +-
sysemu.h | 11 ++++++-
vl.c | 94 +++++++++++++++++++++++++++++++++++++++----------------------
5 files changed, 73 insertions(+), 40 deletions(-)
diff --git a/cpus.c b/cpus.c
index 2e40814..86a0a47 100644
--- a/cpus.c
+++ b/cpus.c
@@ -805,7 +805,7 @@ void set_numa_modes(void)
for (env = first_cpu; env != NULL; env = env->next_cpu) {
for (i = 0; i < nb_numa_nodes; i++) {
- if (node_cpumask[i] & (1 << env->cpu_index)) {
+ if (test_bit(env->cpu_index, numa_info[i].guest_cpu)) {
env->numa_node = i;
}
}
diff --git a/hw/pc.c b/hw/pc.c
index 89bd4af..1b24409 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -529,14 +529,14 @@ static void *bochs_bios_init(void)
numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
for (i = 0; i < smp_cpus; i++) {
for (j = 0; j < nb_numa_nodes; j++) {
- if (node_cpumask[j] & (1 << i)) {
+ if (test_bit(i, numa_info[j].guest_cpu)) {
numa_fw_cfg[i + 1] = cpu_to_le64(j);
break;
}
}
}
for (i = 0; i < nb_numa_nodes; i++) {
- numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+ numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(numa_info[i].guest_mem);
}
fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
(1 + smp_cpus + nb_numa_nodes) * 8);
diff --git a/monitor.c b/monitor.c
index e51df62..74da6c4 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1983,7 +1983,7 @@ static void do_info_numa(Monitor *mon)
}
monitor_printf(mon, "\n");
monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i,
- node_mem[i] >> 20);
+ numa_info[i].guest_mem >> 20);
}
}
diff --git a/sysemu.h b/sysemu.h
index bf1d68a..e5f88d1 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -7,6 +7,7 @@
#include "qemu-queue.h"
#include "qemu-timer.h"
#include "notify.h"
+#include "bitmap.h"
#ifdef _WIN32
#include <windows.h>
@@ -136,9 +137,15 @@ extern QEMUClock *rtc_clock;
extern long hpagesize;
#define MAX_NODES 64
+#ifndef MAX_NUMA_VCPUS
+#define MAX_NUMA_VCPUS 256
+#endif
extern int nb_numa_nodes;
-extern uint64_t node_mem[MAX_NODES];
-extern uint64_t node_cpumask[MAX_NODES];
+struct numa_info {
+ uint64_t guest_mem;
+ DECLARE_BITMAP(guest_cpu, MAX_NUMA_VCPUS);
+};
+extern struct numa_info numa_info[MAX_NODES];
#define MAX_OPTION_ROMS 16
extern const char *option_rom[MAX_OPTION_ROMS];
diff --git a/vl.c b/vl.c
index 3d8298e..40fac59 100644
--- a/vl.c
+++ b/vl.c
@@ -161,6 +161,7 @@ int main(int argc, char **argv)
#include "qemu-queue.h"
#include "cpus.h"
#include "arch_init.h"
+#include "bitmap.h"
//#define DEBUG_NET
//#define DEBUG_SLIRP
@@ -230,8 +231,7 @@ const char *nvram = NULL;
int boot_menu;
int nb_numa_nodes;
-uint64_t node_mem[MAX_NODES];
-uint64_t node_cpumask[MAX_NODES];
+struct numa_info numa_info[MAX_NODES];
static QEMUTimer *nographic_timer;
@@ -717,11 +717,51 @@ static void restore_boot_devices(void *opaque)
qemu_free(standard_boot_devices);
}
+static int parse_bitmap(const char *str, unsigned long *bm, int maxlen)
+{
+ unsigned long long value, endvalue;
+ char *endptr;
+ unsigned int flags = 0;
+
+ if (str[0] == '!') {
+ flags |= 2;
+ bitmap_fill(bm, maxlen);
+ str++;
+ }
+ if (str[0] == '+') {
+ flags |= 1;
+ str++;
+ }
+ value = strtoull(str, &endptr, 10);
+ if (endptr == str) {
+ if (strcmp(str, "all"))
+ return -1;
+ bitmap_fill(bm, maxlen);
+ return flags;
+ }
+ if (value >= maxlen)
+ return -value;
+ if (*endptr == '-') {
+ endvalue = strtoull(endptr + 1, &endptr, 10);
+ if (endvalue >= maxlen)
+ endvalue = maxlen;
+ } else {
+ endvalue = value;
+ }
+
+ if (flags & 2)
+ bitmap_clear(bm, value, endvalue + 1 - value);
+ else
+ bitmap_set(bm, value, endvalue + 1 - value);
+
+ return flags;
+}
+
static void numa_add(const char *optarg)
{
char option[128];
char *endptr;
- unsigned long long value, endvalue;
+ unsigned long long value;
int nodenr;
optarg = get_opt_name(option, 128, optarg, ',') + 1;
@@ -733,7 +773,7 @@ static void numa_add(const char *optarg)
}
if (get_param_value(option, 128, "mem", optarg) == 0) {
- node_mem[nodenr] = 0;
+ numa_info[nodenr].guest_mem = 0;
} else {
value = strtoull(option, &endptr, 0);
switch (*endptr) {
@@ -744,29 +784,12 @@ static void numa_add(const char *optarg)
value <<= 30;
break;
}
- node_mem[nodenr] = value;
+ numa_info[nodenr].guest_mem = value;
}
if (get_param_value(option, 128, "cpus", optarg) == 0) {
- node_cpumask[nodenr] = 0;
+ bitmap_zero(numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
} else {
- value = strtoull(option, &endptr, 10);
- if (value >= 64) {
- value = 63;
- fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
- } else {
- if (*endptr == '-') {
- endvalue = strtoull(endptr+1, &endptr, 10);
- if (endvalue >= 63) {
- endvalue = 62;
- fprintf(stderr,
- "only 63 CPUs in NUMA mode supported.\n");
- }
- value = (2ULL << endvalue) - (1ULL << value);
- } else {
- value = 1ULL << value;
- }
- }
- node_cpumask[nodenr] = value;
+ parse_bitmap(option, numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
}
nb_numa_nodes++;
}
@@ -1870,8 +1893,8 @@ int main(int argc, char **argv, char **envp)
translation = BIOS_ATA_TRANSLATION_AUTO;
for (i = 0; i < MAX_NODES; i++) {
- node_mem[i] = 0;
- node_cpumask[i] = 0;
+ numa_info[i].guest_mem = 0;
+ bitmap_zero(numa_info[i].guest_cpu, MAX_NUMA_VCPUS);
}
assigned_devices_index = 0;
@@ -2887,7 +2910,7 @@ int main(int argc, char **argv, char **envp)
* and distribute the available memory equally across all nodes
*/
for (i = 0; i < nb_numa_nodes; i++) {
- if (node_mem[i] != 0)
+ if (numa_info[i].guest_mem != 0)
break;
}
if (i == nb_numa_nodes) {
@@ -2897,14 +2920,18 @@ int main(int argc, char **argv, char **envp)
* the final node gets the rest.
*/
for (i = 0; i < nb_numa_nodes - 1; i++) {
- node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
- usedmem += node_mem[i];
+ numa_info[i].guest_mem = (ram_size / nb_numa_nodes) &
+ ~((1 << 23UL) - 1);
+ usedmem += numa_info[i].guest_mem;
}
- node_mem[i] = ram_size - usedmem;
+ numa_info[i].guest_mem = ram_size - usedmem;
}
+ /* check whether any guest CPU number has been specified.
+ * If not, we use an automatic assignment algorithm.
+ */
for (i = 0; i < nb_numa_nodes; i++) {
- if (node_cpumask[i] != 0)
+ if (!bitmap_empty(numa_info[i].guest_cpu, MAX_NUMA_VCPUS))
break;
}
/* assigning the VCPUs round-robin is easier to implement, guest OSes
@@ -2912,9 +2939,8 @@ int main(int argc, char **argv, char **envp)
* real machines which also use this scheme.
*/
if (i == nb_numa_nodes) {
- for (i = 0; i < smp_cpus; i++) {
- node_cpumask[i % nb_numa_nodes] |= 1 << i;
- }
+ for (i = 0; i < smp_cpus; i++)
+ set_bit(i, numa_info[i % nb_numa_nodes].guest_cpu);
}
}
--
1.6.4
next prev parent reply other threads:[~2010-08-11 13:52 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-08-11 13:52 [PATCH 0/4]: NUMA: add host binding Andre Przywara
2010-08-11 13:52 ` Andre Przywara [this message]
2010-08-11 13:52 ` [PATCH 2/4] NUMA: add Linux libnuma detection Andre Przywara
2010-08-11 13:52 ` [PATCH 3/4] NUMA: parse new host dependent command line options Andre Przywara
2010-08-11 13:52 ` [PATCH 4/4] NUMA: realize NUMA memory pinning Andre Przywara
2010-08-23 18:59 ` Marcelo Tosatti
2010-08-23 19:27 ` Anthony Liguori
2010-08-23 21:16 ` Andre Przywara
2010-08-23 21:27 ` Anthony Liguori
2010-08-31 20:54 ` Andrew Theurer
2010-08-31 22:03 ` Anthony Liguori
2010-09-01 3:38 ` Andrew Theurer
2010-09-09 20:00 ` Andre Przywara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1281534738-8310-2-git-send-email-andre.przywara@amd.com \
--to=andre.przywara@amd.com \
--cc=anthony@codemonkey.ws \
--cc=avi@redhat.com \
--cc=kvm@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox