From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andre Przywara Subject: [PATCH 2/3] v2: KVM-userspace: allocate guest resources from different host nodes Date: Fri, 5 Dec 2008 14:33:14 +0100 Message-ID: <49392D9A.9080105@amd.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------010201060905090104060802" Cc: kvm@vger.kernel.org, "Daniel P. Berrange" To: Avi Kivity Return-path: Received: from outbound-wa4.frontbridge.com ([216.32.181.16]:17926 "EHLO WA4EHSOBE001.bigfish.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758496AbYLENc7 (ORCPT ); Fri, 5 Dec 2008 08:32:59 -0500 Sender: kvm-owner@vger.kernel.org List-ID: --------------010201060905090104060802 Content-Type: text/plain; charset="ISO-8859-1"; format=flowed Content-Transfer-Encoding: 7bit According to the host node map given on the command line the VCPUs are pinned to the respective node (allowing at least scheduling between the cores belonging to this node). The mmap'ed guest memory will be bound to the correct host nodes (this will of course not take effect until the memory actually faults in). The presence of libnuma will be auto-detected. Signed-off-by: Andre Przywara -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 277-84917 ----to satisfy European Law for business letters: AMD Saxony Limited Liability Company & Co. KG, Wilschdorfer Landstr. 101, 01109 Dresden, Germany Register Court Dresden: HRA 4896, General Partner authorized to represent: AMD Saxony LLC (Wilmington, Delaware, US) General Manager of AMD Saxony LLC: Dr. Hans-R. 
Deppe, Thomas McCoy

--------------010201060905090104060802
Content-Type: text/x-patch;
 name="kvmnuma_hostalloc.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="kvmnuma_hostalloc.patch"

commit 0bc93b19ba3132140d5b34746523d0c7c8169093
Author: Andre Przywara
Date:   Fri Dec 5 14:05:44 2008 +0100

    allocate guest resources from different host NUMA nodes

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 05ace8e..690903e 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -698,6 +698,10 @@ LIBS += -lkvm
 DEPLIBS += ../libkvm/libkvm.a
 endif
 
+ifdef CONFIG_NUMA
+LIBS += -lnuma
+endif
+
 ifdef CONFIG_VNC_TLS
 CPPFLAGS += $(CONFIG_VNC_TLS_CFLAGS)
 LIBS += $(CONFIG_VNC_TLS_LIBS)
diff --git a/qemu/configure b/qemu/configure
index 63a85d6..3e2c9f9 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -121,6 +121,7 @@ bluez="yes"
 kvm="yes"
 kvm_cap_pit="no"
 kvm_cap_device_assignment="no"
+getcpu="no"
 kerneldir=""
 aix="no"
 blobs="yes"
@@ -391,6 +392,8 @@ for opt do
   ;;
   --enable-mixemu) mixemu="yes"
   ;;
+  --disable-numa) numa="no"
+  ;;
   --disable-aio) aio="no"
   ;;
   --disable-blobs) blobs="no"
@@ -489,6 +492,7 @@ echo " Available drivers: $audio_possible_drivers"
 echo " --audio-card-list=LIST set list of additional emulated audio cards"
 echo " Available cards: ac97 adlib cs4231a gus"
 echo " --enable-mixemu enable mixer emulation"
+echo " --disable-numa disable NUMA support (host side)"
 echo " --disable-brlapi disable BrlAPI"
 echo " --disable-vnc-tls disable TLS encryption for VNC server"
 echo " --disable-curses disable curses output"
@@ -985,6 +989,29 @@ for drv in $audio_drv_list; do
 done
 
 ##########################################
+# libnuma probe
+
+if test -z "$numa" ; then
+  numa=no
+
+  cat > $TMPC << EOF
+#include <numa.h>
+int main(void) { return numa_available(); }
+EOF
+  if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC -lnuma 2> /dev/null ; then
+    numa=yes
+  fi
+fi
+
+cat > $TMPC << EOF
+#include <sched.h>
+int main(void) { return sched_getcpu(); }
+EOF
+if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+  getcpu=yes
+fi
+
+##########################################
 # BrlAPI probe
 
 if test -z "$brlapi" ; then
@@ -1181,6 +1208,7 @@ echo "mingw32 support $mingw32"
 echo "Audio drivers $audio_drv_list"
 echo "Extra audio cards $audio_card_list"
 echo "Mixer emulation $mixemu"
+echo "NUMA support $numa"
 echo "VNC TLS support $vnc_tls"
 if test "$vnc_tls" = "yes" ; then
     echo " TLS CFLAGS $vnc_tls_cflags"
@@ -1415,6 +1443,13 @@ if test "$mixemu" = "yes" ; then
   echo "CONFIG_MIXEMU=yes" >> $config_mak
   echo "#define CONFIG_MIXEMU 1" >> $config_h
 fi
+if test "$numa" = "yes" ; then
+  echo "CONFIG_NUMA=yes" >> $config_mak
+  echo "#define CONFIG_NUMA 1" >> $config_h
+fi
+if test "$getcpu" = "yes" ; then
+  echo "#define HAVE_GETCPU 1" >> $config_h
+fi
 if test "$vnc_tls" = "yes" ; then
   echo "CONFIG_VNC_TLS=yes" >> $config_mak
   echo "CONFIG_VNC_TLS_CFLAGS=$vnc_tls_cflags" >> $config_mak
diff --git a/qemu/hw/fw_cfg.h b/qemu/hw/fw_cfg.h
index ef8f378..b370e4e 100644
--- a/qemu/hw/fw_cfg.h
+++ b/qemu/hw/fw_cfg.h
@@ -8,6 +8,9 @@
 #define FW_CFG_NOGRAPHIC        0x04
 #define FW_CFG_NB_CPUS          0x05
 #define FW_CFG_MACHINE_ID       0x06
+#define FW_CFG_NUMA_NODES       0x07
+#define FW_CFG_NUMA_NODE_CPUS   0x08
+#define FW_CFG_NUMA_NODE_MEM    0x09
 #define FW_CFG_MAX_ENTRY        0x10
 
 #define FW_CFG_WRITE_CHANNEL    0x4000
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 6de460c..b723125 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -439,6 +439,12 @@ static void bochs_bios_init(void)
     fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0);
     fw_cfg_add_i32(fw_cfg, FW_CFG_ID, 1);
     fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
+    fw_cfg_add_i16(fw_cfg, FW_CFG_NUMA_NODES, numnumanodes);
+
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_MEM, (uint8_t*)node_mem,
+        sizeof(node_mem[0]) * numnumanodes);
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_CPUS, (uint8_t*)node_to_cpus,
+        sizeof(node_to_cpus[0]) * numnumanodes);
 }
 
 /* Generate an initial boot sector
which sets state and jump to
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index a7cfa24..63afe85 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -28,6 +28,10 @@ int kvm_pit = 1;
 #include
 #include
 
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
+
 #define false 0
 #define true 1
 
@@ -424,6 +428,36 @@ static int kvm_main_loop_cpu(CPUState *env)
     return 0;
 }
 
+#ifdef CONFIG_NUMA
+
+#ifndef HAVE_GETCPU
+static int get_cur_node (void) /* no sched_getcpu(): report "unknown" (-1) */
+{
+    return -1;
+}
+#else /* HAVE_GETCPU */
+
+#define NUMA_MASK_SIZE 16
+static int get_cur_node (void) /* host node whose CPU mask holds the current CPU, else 0 */
+{
+    int cpunr, node, maskbits;
+    unsigned long maskbuf[NUMA_MASK_SIZE];
+
+    maskbits = sizeof(maskbuf[0]) * 8;
+    cpunr = sched_getcpu();
+    if (cpunr < 0 || cpunr >= maskbits * NUMA_MASK_SIZE) return 0; /* error or CPU beyond mask */
+    for (node = 0; node <= numa_max_node(); node++)
+    {
+        if (numa_node_to_cpus (node, maskbuf, sizeof(maskbuf)) < 0) continue; /* mask not filled */
+        if (maskbuf[cpunr / maskbits] & (1UL << (cpunr % maskbits))) /* 1UL: bit may be >= 32 */
+            return node;
+    }
+    return 0;
+}
+#endif /* HAVE_GETCPU */
+
+#endif /* CONFIG_NUMA */
+
 static void *ap_main_loop(void *_env)
 {
     CPUState *env = _env;
@@ -432,6 +466,32 @@ static void *ap_main_loop(void *_env)
 
     current_env = env;
     env->thread_id = kvm_get_thread_id();
+
+#ifdef CONFIG_NUMA
+    if (numnumanodes > 0 && numa_available() != -1)
+    {
+        int i;
+        for (i = 0; i < numnumanodes; i++) {
+            if (!(node_to_cpus[i] & (1ULL << env->cpu_index))) continue; /* VCPU not in guest node i */
+            if (hostnodes[i] == (uint64_t)-1) {
+                int j;
+                unsigned long offset = 0;
+
+                hostnodes[i] = get_cur_node();
+                if (hostnodes[i] != (uint64_t)-1) {
+                    for (j = 0; j < i; ++j) offset += node_mem[j]; /* skip memory of guest nodes 0..i-1 */
+                    numa_tonode_memory (phys_ram_base + offset,
+                        node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+                }
+            }
+
+            if (hostnodes[i] != (uint64_t)-1)
+                numa_run_on_node (hostnodes[i] % (numa_max_node() + 1));
+            break;
+        }
+    }
+#endif
+
     sigfillset(&signals);
     sigprocmask(SIG_BLOCK, &signals, NULL);
     kvm_create_vcpu(kvm_context, env->cpu_index);
@@ -840,6 +900,21 @@ int kvm_setup_guest_memory(void *area, unsigned long size)
     if (ret)
         perror ("madvise");
 
+#ifdef CONFIG_NUMA
+    if (numnumanodes > 0 && numa_available() != -1) {
+        unsigned long offset = 0;
+        int i;
+
+        for (i = 0; i < numnumanodes; ++i) {
+            if (hostnodes[i] != (uint64_t)-1) { /* only nodes with an assigned host node */
+                numa_tonode_memory ((char*)area + offset,
+                    node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+            }
+            offset += node_mem[i]; /* guest nodes are laid out back to back in area */
+        }
+    }
+#endif
+
     return ret;
 }
 
--------------010201060905090104060802--