* [PATCH 2/3] v2: KVM-userspace: allocate guest resources from different host nodes
@ 2008-12-05 13:33 Andre Przywara
0 siblings, 0 replies; only message in thread
From: Andre Przywara @ 2008-12-05 13:33 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Daniel P. Berrange
[-- Attachment #1: Type: text/plain, Size: 857 bytes --]
According to the host node map given on the command line, the VCPUs are
pinned to their respective nodes (which still allows scheduling among the
cores belonging to that node). The mmap'ed guest memory will be bound to
the correct host nodes (this will of course not take effect until the
memory actually faults in).
The presence of libnuma will be auto-detected.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
[-- Attachment #2: kvmnuma_hostalloc.patch --]
[-- Type: text/x-patch, Size: 6977 bytes --]
commit 0bc93b19ba3132140d5b34746523d0c7c8169093
Author: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Date: Fri Dec 5 14:05:44 2008 +0100
allocate guest resources from different host NUMA nodes
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 05ace8e..690903e 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -698,6 +698,10 @@ LIBS += -lkvm
DEPLIBS += ../libkvm/libkvm.a
endif
+ifdef CONFIG_NUMA
+LIBS += -lnuma
+endif
+
ifdef CONFIG_VNC_TLS
CPPFLAGS += $(CONFIG_VNC_TLS_CFLAGS)
LIBS += $(CONFIG_VNC_TLS_LIBS)
diff --git a/qemu/configure b/qemu/configure
index 63a85d6..3e2c9f9 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -121,6 +121,7 @@ bluez="yes"
kvm="yes"
kvm_cap_pit="no"
kvm_cap_device_assignment="no"
+getcpu="no"
kerneldir=""
aix="no"
blobs="yes"
@@ -391,6 +392,8 @@ for opt do
;;
--enable-mixemu) mixemu="yes"
;;
+ --disable-numa) numa="no"
+ ;;
--disable-aio) aio="no"
;;
--disable-blobs) blobs="no"
@@ -489,6 +492,7 @@ echo " Available drivers: $audio_possible_drivers"
echo " --audio-card-list=LIST set list of additional emulated audio cards"
echo " Available cards: ac97 adlib cs4231a gus"
echo " --enable-mixemu enable mixer emulation"
+echo " --disable-numa disable NUMA support (host side)"
echo " --disable-brlapi disable BrlAPI"
echo " --disable-vnc-tls disable TLS encryption for VNC server"
echo " --disable-curses disable curses output"
@@ -985,6 +989,29 @@ for drv in $audio_drv_list; do
done
##########################################
+# libnuma probe
+
+if test -z "$numa" ; then
+ numa=no
+
+ cat > $TMPC << EOF
+#include <numa.h>
+int main(void) { return numa_available(); }
+EOF
+ if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC -lnuma 2> /dev/null ; then
+ numa=yes
+ fi
+fi
+
+cat > $TMPC << EOF
+#include <sched.h>
+int main(void) { return sched_getcpu(); }
+EOF
+if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+ getcpu=yes
+fi
+
+##########################################
# BrlAPI probe
if test -z "$brlapi" ; then
@@ -1181,6 +1208,7 @@ echo "mingw32 support $mingw32"
echo "Audio drivers $audio_drv_list"
echo "Extra audio cards $audio_card_list"
echo "Mixer emulation $mixemu"
+echo "NUMA support $numa"
echo "VNC TLS support $vnc_tls"
if test "$vnc_tls" = "yes" ; then
echo " TLS CFLAGS $vnc_tls_cflags"
@@ -1415,6 +1443,13 @@ if test "$mixemu" = "yes" ; then
echo "CONFIG_MIXEMU=yes" >> $config_mak
echo "#define CONFIG_MIXEMU 1" >> $config_h
fi
+if test "$numa" = "yes" ; then
+ echo "CONFIG_NUMA=yes" >> $config_mak
+ echo "#define CONFIG_NUMA 1" >> $config_h
+fi
+if test "$getcpu" = "yes" ; then
+ echo "#define HAVE_GETCPU 1" >> $config_h
+fi
if test "$vnc_tls" = "yes" ; then
echo "CONFIG_VNC_TLS=yes" >> $config_mak
echo "CONFIG_VNC_TLS_CFLAGS=$vnc_tls_cflags" >> $config_mak
diff --git a/qemu/hw/fw_cfg.h b/qemu/hw/fw_cfg.h
index ef8f378..b370e4e 100644
--- a/qemu/hw/fw_cfg.h
+++ b/qemu/hw/fw_cfg.h
@@ -8,6 +8,9 @@
#define FW_CFG_NOGRAPHIC 0x04
#define FW_CFG_NB_CPUS 0x05
#define FW_CFG_MACHINE_ID 0x06
+#define FW_CFG_NUMA_NODES 0x07
+#define FW_CFG_NUMA_NODE_CPUS 0x08
+#define FW_CFG_NUMA_NODE_MEM 0x09
#define FW_CFG_MAX_ENTRY 0x10
#define FW_CFG_WRITE_CHANNEL 0x4000
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 6de460c..b723125 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -439,6 +439,12 @@ static void bochs_bios_init(void)
fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0);
fw_cfg_add_i32(fw_cfg, FW_CFG_ID, 1);
fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
+ fw_cfg_add_i16(fw_cfg, FW_CFG_NUMA_NODES, numnumanodes);
+
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_MEM, (uint8_t*)node_mem,
+ sizeof(node_mem[0]) * numnumanodes);
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_CPUS, (uint8_t*)node_to_cpus,
+ sizeof(node_to_cpus[0]) * numnumanodes);
}
/* Generate an initial boot sector which sets state and jump to
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index a7cfa24..63afe85 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -28,6 +28,10 @@ int kvm_pit = 1;
#include <sys/syscall.h>
#include <sys/mman.h>
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
+
#define false 0
#define true 1
@@ -424,6 +428,36 @@ static int kvm_main_loop_cpu(CPUState *env)
return 0;
}
+#ifdef CONFIG_NUMA
+
+#ifndef HAVE_GETCPU
+static int get_cur_node (void)
+{
+ return -1;
+}
+#else /* HAVE_GETCPU */
+
+#define NUMA_MASK_SIZE 16
+/* Return the host NUMA node of the CPU we run on (0 if undetermined). */
+static int get_cur_node (void)
+{
+int cpunr, node, maskbits;
+unsigned long maskbuf[NUMA_MASK_SIZE];
+    maskbits = sizeof(maskbuf[0]) * 8;    /* bits per mask word */
+    cpunr = sched_getcpu();               /* -1 on failure */
+    if (cpunr < 0 || cpunr >= maskbits * NUMA_MASK_SIZE) return 0;
+    for (node = 0; node <= numa_max_node(); node++) {
+        if (numa_node_to_cpus (node, maskbuf, sizeof(maskbuf)) < 0) continue;
+        /* 1UL: mask words are unsigned long, the bit index may exceed 31 */
+        if (maskbuf[cpunr / maskbits] & (1UL << (cpunr % maskbits)))
+            return node;
+    }
+    return 0;
+}
+#endif /* HAVE_GETCPU */
+
+#endif /* CONFIG_NUMA */
+
static void *ap_main_loop(void *_env)
{
CPUState *env = _env;
@@ -432,6 +466,32 @@ static void *ap_main_loop(void *_env)
current_env = env;
env->thread_id = kvm_get_thread_id();
+
+#ifdef CONFIG_NUMA
+    /* Pin this VCPU thread to the host node assigned to its guest node. */
+    if (numnumanodes > 0 && numa_available() != -1) {
+        int i;
+        for (i = 0; i < numnumanodes; i++) {
+            /* 64-bit one: shifting a plain int by cpu_index >= 31 is undefined */
+            if (!(node_to_cpus[i] & ((uint64_t)1 << env->cpu_index))) continue;
+            if (hostnodes[i] == (uint64_t)-1) {
+                int j;
+                unsigned long offset = 0;
+                hostnodes[i] = get_cur_node(); /* unset: adopt the node we run on */
+                if (hostnodes[i] != (uint64_t)-1) {
+                    /* guest node i starts after the memory of nodes 0..i-1 */
+                    for (j = 0; j < i; ++j) offset += node_mem[j];
+                    numa_tonode_memory (phys_ram_base + offset,
+                        node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+                }
+            }
+            if (hostnodes[i] != (uint64_t)-1)
+                numa_run_on_node (hostnodes[i] % (numa_max_node() + 1));
+            break;
+        }
+    }
+#endif
+
sigfillset(&signals);
sigprocmask(SIG_BLOCK, &signals, NULL);
kvm_create_vcpu(kvm_context, env->cpu_index);
@@ -840,6 +900,21 @@ int kvm_setup_guest_memory(void *area, unsigned long size)
if (ret)
perror ("madvise");
+#ifdef CONFIG_NUMA
+ if (numnumanodes > 0 && numa_available() != -1) {
+ unsigned long offset = 0;
+ int i;
+
+ for (i = 0; i < numnumanodes; ++i) {
+ if (hostnodes[i] != (uint64_t)-1) {
+ numa_tonode_memory ((char*)area + offset,
+ node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+ }
+ offset += node_mem[i];
+ }
+ }
+#endif
+
return ret;
}
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2008-12-05 13:32 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-05 13:33 [PATCH 2/3] v2: KVM-userspace: allocate guest resources from different host nodes Andre Przywara
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).