[PATCH 1/4] NUMA: change existing NUMA guest code to use new bitmap implementation

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

From: Andre Przywara <andre.przywara@amd.com>
To: <avi@redhat.com>, <anthony@codemonkey.ws>
Cc: <kvm@vger.kernel.org>, Andre Przywara <andre.przywara@amd.com>
Subject: [PATCH 1/4] NUMA: change existing NUMA guest code to use new bitmap implementation
Date: Wed, 11 Aug 2010 15:52:15 +0200	[thread overview]
Message-ID: <1281534738-8310-2-git-send-email-andre.przywara@amd.com> (raw)
In-Reply-To: <1281534738-8310-1-git-send-email-andre.przywara@amd.com>

The current NUMA guest implementation uses a "poor-man's-bitmap"
consisting of a single uint64_t. This patch reworks this by
leveraging the new generic bitmap code, and thus lifts the 64-VCPU
limit for NUMA guests.
Besides that, it improves the NUMA data structures in preparation
for future host binding code.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 cpus.c    |    2 +-
 hw/pc.c   |    4 +-
 monitor.c |    2 +-
 sysemu.h  |   11 ++++++-
 vl.c      |   94 +++++++++++++++++++++++++++++++++++++++----------------------
 5 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/cpus.c b/cpus.c
index 2e40814..86a0a47 100644
--- a/cpus.c
+++ b/cpus.c
@@ -805,7 +805,7 @@ void set_numa_modes(void)
 
     for (env = first_cpu; env != NULL; env = env->next_cpu) {
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_cpumask[i] & (1 << env->cpu_index)) {
+            if (test_bit(env->cpu_index, numa_info[i].guest_cpu)) {
                 env->numa_node = i;
             }
         }
diff --git a/hw/pc.c b/hw/pc.c
index 89bd4af..1b24409 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -529,14 +529,14 @@ static void *bochs_bios_init(void)
     numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
     for (i = 0; i < smp_cpus; i++) {
         for (j = 0; j < nb_numa_nodes; j++) {
-            if (node_cpumask[j] & (1 << i)) {
+            if (test_bit(i, numa_info[j].guest_cpu)) {
                 numa_fw_cfg[i + 1] = cpu_to_le64(j);
                 break;
             }
         }
     }
     for (i = 0; i < nb_numa_nodes; i++) {
-        numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+        numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(numa_info[i].guest_mem);
     }
     fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
                      (1 + smp_cpus + nb_numa_nodes) * 8);
diff --git a/monitor.c b/monitor.c
index e51df62..74da6c4 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1983,7 +1983,7 @@ static void do_info_numa(Monitor *mon)
         }
         monitor_printf(mon, "\n");
         monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i,
-            node_mem[i] >> 20);
+            numa_info[i].guest_mem >> 20);
     }
 }
 
diff --git a/sysemu.h b/sysemu.h
index bf1d68a..e5f88d1 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -7,6 +7,7 @@
 #include "qemu-queue.h"
 #include "qemu-timer.h"
 #include "notify.h"
+#include "bitmap.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -136,9 +137,15 @@ extern QEMUClock *rtc_clock;
 extern long hpagesize;
 
 #define MAX_NODES 64
+#ifndef MAX_NUMA_VCPUS
+#define MAX_NUMA_VCPUS 256
+#endif
 extern int nb_numa_nodes;
-extern uint64_t node_mem[MAX_NODES];
-extern uint64_t node_cpumask[MAX_NODES];
+struct numa_info {
+    uint64_t guest_mem;                         /* node RAM size in bytes */
+    DECLARE_BITMAP(guest_cpu, MAX_NUMA_VCPUS);  /* bit n: VCPU n on node */
+};
+extern struct numa_info numa_info[MAX_NODES];
 
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];
diff --git a/vl.c b/vl.c
index 3d8298e..40fac59 100644
--- a/vl.c
+++ b/vl.c
@@ -161,6 +161,7 @@ int main(int argc, char **argv)
 #include "qemu-queue.h"
 #include "cpus.h"
 #include "arch_init.h"
+#include "bitmap.h"
 
 //#define DEBUG_NET
 //#define DEBUG_SLIRP
@@ -230,8 +231,7 @@ const char *nvram = NULL;
 int boot_menu;
 
 int nb_numa_nodes;
-uint64_t node_mem[MAX_NODES];
-uint64_t node_cpumask[MAX_NODES];
+struct numa_info numa_info[MAX_NODES];
 
 static QEMUTimer *nographic_timer;
 
@@ -717,11 +717,51 @@ static void restore_boot_devices(void *opaque)
     qemu_free(standard_boot_devices);
 }
 
+/* Parse "[!][+]{N|N-M|all}" into bm (maxlen bits); returns flags or < 0. */
+static int parse_bitmap(const char *str, unsigned long *bm, int maxlen)
+{
+    unsigned long long value, endvalue;
+    char *endptr;
+    unsigned int flags = 0;     /* bit 0: '+' prefix seen, bit 1: '!' seen */
+
+    if (str[0] == '!') {        /* negation: start full, clear the range */
+        flags |= 2;
+        bitmap_fill(bm, maxlen);
+        str++;
+    }
+    if (str[0] == '+') {        /* only reported back via the return value */
+        flags |= 1;
+        str++;
+    }
+    value = strtoull(str, &endptr, 10);
+    if (endptr == str) {        /* no digits: only "all" is accepted */
+        if (strcmp(str, "all"))
+            return -1;
+        bitmap_fill(bm, maxlen);
+        return flags;
+    }
+    if (value >= maxlen)
+        return -value;          /* out of range: negated start index */
+    endvalue = value;           /* a single number is a one-bit range */
+    if (*endptr == '-') {
+        endvalue = strtoull(endptr + 1, &endptr, 10);
+        if (endvalue < value)
+            return -1;          /* reversed or missing range end */
+        if (endvalue >= maxlen)
+            endvalue = maxlen - 1;  /* last valid bit index is maxlen - 1 */
+    }
+    if (flags & 2)
+        bitmap_clear(bm, value, endvalue + 1 - value);
+    else
+        bitmap_set(bm, value, endvalue + 1 - value);
+    return flags;
+}
+
 static void numa_add(const char *optarg)
 {
     char option[128];
     char *endptr;
-    unsigned long long value, endvalue;
+    unsigned long long value;
     int nodenr;
 
     optarg = get_opt_name(option, 128, optarg, ',') + 1;
@@ -733,7 +773,7 @@ static void numa_add(const char *optarg)
         }
 
         if (get_param_value(option, 128, "mem", optarg) == 0) {
-            node_mem[nodenr] = 0;
+            numa_info[nodenr].guest_mem = 0;
         } else {
             value = strtoull(option, &endptr, 0);
             switch (*endptr) {
@@ -744,29 +784,12 @@ static void numa_add(const char *optarg)
                 value <<= 30;
                 break;
             }
-            node_mem[nodenr] = value;
+            numa_info[nodenr].guest_mem = value;
         }
         if (get_param_value(option, 128, "cpus", optarg) == 0) {
-            node_cpumask[nodenr] = 0;
+            bitmap_zero(numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
         } else {
-            value = strtoull(option, &endptr, 10);
-            if (value >= 64) {
-                value = 63;
-                fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
-            } else {
-                if (*endptr == '-') {
-                    endvalue = strtoull(endptr+1, &endptr, 10);
-                    if (endvalue >= 63) {
-                        endvalue = 62;
-                        fprintf(stderr,
-                            "only 63 CPUs in NUMA mode supported.\n");
-                    }
-                    value = (2ULL << endvalue) - (1ULL << value);
-                } else {
-                    value = 1ULL << value;
-                }
-            }
-            node_cpumask[nodenr] = value;
+            parse_bitmap(option, numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
         }
         nb_numa_nodes++;
     }
@@ -1870,8 +1893,8 @@ int main(int argc, char **argv, char **envp)
     translation = BIOS_ATA_TRANSLATION_AUTO;
 
     for (i = 0; i < MAX_NODES; i++) {
-        node_mem[i] = 0;
-        node_cpumask[i] = 0;
+        numa_info[i].guest_mem = 0;
+        bitmap_zero(numa_info[i].guest_cpu, MAX_NUMA_VCPUS);
     }
 
     assigned_devices_index = 0;
@@ -2887,7 +2910,7 @@ int main(int argc, char **argv, char **envp)
          * and distribute the available memory equally across all nodes
          */
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_mem[i] != 0)
+            if (numa_info[i].guest_mem != 0)
                 break;
         }
         if (i == nb_numa_nodes) {
@@ -2897,14 +2920,18 @@ int main(int argc, char **argv, char **envp)
              * the final node gets the rest.
              */
             for (i = 0; i < nb_numa_nodes - 1; i++) {
-                node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
-                usedmem += node_mem[i];
+                numa_info[i].guest_mem = (ram_size / nb_numa_nodes) &
+                    ~((1 << 23UL) - 1);
+                usedmem += numa_info[i].guest_mem;
             }
-            node_mem[i] = ram_size - usedmem;
+            numa_info[i].guest_mem = ram_size - usedmem;
         }
 
+        /* check whether any guest CPU number has been specified.
+         * If not, we use an automatic assignment algorithm.
+         */
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_cpumask[i] != 0)
+            if (!bitmap_empty(numa_info[i].guest_cpu, MAX_NUMA_VCPUS))
                 break;
         }
         /* assigning the VCPUs round-robin is easier to implement, guest OSes
@@ -2912,9 +2939,8 @@ int main(int argc, char **argv, char **envp)
          * real machines which also use this scheme.
          */
         if (i == nb_numa_nodes) {
-            for (i = 0; i < smp_cpus; i++) {
-                node_cpumask[i % nb_numa_nodes] |= 1 << i;
-            }
+            for (i = 0; i < smp_cpus; i++)
+                set_bit(i, numa_info[i % nb_numa_nodes].guest_cpu);
         }
     }
 
-- 
1.6.4

next prev parent reply	other threads:[~2010-08-11 13:52 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-08-11 13:52 [PATCH 0/4]: NUMA: add host binding Andre Przywara
2010-08-11 13:52 ` Andre Przywara [this message]
2010-08-11 13:52 ` [PATCH 2/4] NUMA: add Linux libnuma detection Andre Przywara
2010-08-11 13:52 ` [PATCH 3/4] NUMA: parse new host dependent command line options Andre Przywara
2010-08-11 13:52 ` [PATCH 4/4] NUMA: realize NUMA memory pinning Andre Przywara
2010-08-23 18:59   ` Marcelo Tosatti
2010-08-23 19:27     ` Anthony Liguori
2010-08-23 21:16       ` Andre Przywara
2010-08-23 21:27         ` Anthony Liguori
2010-08-31 20:54           ` Andrew Theurer
2010-08-31 22:03             ` Anthony Liguori
2010-09-01  3:38               ` Andrew Theurer
2010-09-09 20:00               ` Andre Przywara

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2e40814 dfblob:86a0a47 dfblob:89bd4af dfblob:1b24409
dfblob:e51df62 dfblob:74da6c4 dfblob:bf1d68a dfblob:e5f88d1
dfblob:3d8298e dfblob:40fac59 )
 OR (
bs:"[PATCH 1/4] NUMA: change existing NUMA guest code to use new bitmap implementation" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1281534738-8310-2-git-send-email-andre.przywara@amd.com \
    --to=andre.przywara@amd.com \
    --cc=anthony@codemonkey.ws \
    --cc=avi@redhat.com \
    --cc=kvm@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox