# HG changeset patch
# User Andre Przywara
# Date 1215083831 -7200
# Node ID b84c5f2fe83bd7c94ed956ba412689e614177f5c
# Parent a0dccef499b005ba13eb70bf6cac856af44a10a0
make guest memory allocation NUMA aware

diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/libxc/xc_hvm_build.c Thu Jul 03 13:17:11 2008 +0200
@@ -18,6 +18,8 @@
 #include "xc_e820.h"
 
 #include
+
+#include
 
 #define SUPERPAGE_PFN_SHIFT  9
 #define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
@@ -155,8 +157,244 @@
     return rc;
 }
 
+/* Return the number of bits set in value. */
+static int hweight_long(unsigned long value)
+{
+    int ret = 0;
+
+    while ( value != 0 )
+    {
+        ret += value & 1;
+        value >>= 1;
+    }
+    return ret;
+}
+
+/*
+ * Build one physical-CPU bitmask per NUMA node.
+ *
+ * On success *nodemasks points to a calloc()ed array holding one 64-bit
+ * mask per node (bit N set: physical CPU N sits on that node), which the
+ * caller must free().  Returns the number of CPUs examined, or -1 on
+ * failure, in which case *nodemasks is NULL.
+ */
+static int get_nodemasks(int xc_handle, uint64_t **nodemasks)
+{
+#define MAX_CPU_ID 255
+    xc_physinfo_t physinfo = { 0 };
+    xc_cpu_to_node_t *cpumap;
+    int nrcpus = -1, i;
+
+    *nodemasks = NULL;
+
+    /* CPU ids run from 0 to MAX_CPU_ID inclusive. */
+    cpumap = malloc(sizeof(*cpumap) * (MAX_CPU_ID + 1));
+    if ( cpumap == NULL )
+        return -1;
+    set_xen_guest_handle(physinfo.cpu_to_node, cpumap);
+    /* IN: highest cpumap index Xen may fill (see xen_sysctl_physinfo). */
+    physinfo.max_cpu_id = MAX_CPU_ID;
+
+    if ( xc_physinfo(xc_handle, &physinfo) != 0 )
+        goto out;
+
+    *nodemasks = calloc(physinfo.nr_nodes, sizeof(**nodemasks));
+    if ( *nodemasks == NULL )
+        goto out;
+
+    /* NOTE(review): this product assumes one socket per node - confirm. */
+    nrcpus = physinfo.threads_per_core * physinfo.cores_per_socket *
+        physinfo.nr_nodes;
+    /* A mask has 64 bits; this bound also keeps i inside cpumap[]. */
+    if ( nrcpus > 64 )
+        nrcpus = 64;
+
+    for ( i = 0; i < nrcpus; i++ )
+    {
+        /* Skip CPUs whose reported node has no slot in the array. */
+        if ( cpumap[i] < physinfo.nr_nodes )
+            (*nodemasks)[cpumap[i]] |= 1ULL << i;
+    }
+
+ out:
+    free(cpumap);
+    return nrcpus;
+}
+
+/*
+ * Distribute the VCPUs to the given NUMA nodes.
+ * Use xc_vcpu_setaffinity to pin physical CPUs to the VCPUs.
+ *
+ * VCPUs are handed out in contiguous runs, one run per bit set in
+ * nodemask (lowest bit first); when the VCPU count does not divide
+ * evenly, the first nodes get one extra VCPU each.
+ * Returns 0 on success, -1 if the topology or domain info is unavailable.
+ */
+static int setup_numa_affinity(int xc_handle, uint32_t dom,
+                               unsigned long nodemask)
+{
+    uint64_t *nodemasks, usemask = 0;
+    xc_dominfo_t dominfo;
+    int i, nrnodes, nrvcpus, curnode, vcpusleft;
+    int rc = -1;
+
+    nrnodes = hweight_long(nodemask);
+    if ( nrnodes == 0 )
+        return -1;
+
+    if ( get_nodemasks(xc_handle, &nodemasks) < 0 )
+    {
+        ERROR("Unable to get NUMA topology.");
+        return -1;
+    }
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &dominfo) != 1 )
+    {
+        ERROR("Unable to get domain info.");
+        goto out;
+    }
+    nrvcpus = dominfo.max_vcpu_id + 1;
+
+    curnode = -1;
+    vcpusleft = 0;
+    for ( i = 0; i < nrvcpus; i++ )
+    {
+        if ( vcpusleft == 0 )
+        {
+            /* Current node is full: switch to the next node in the mask. */
+            vcpusleft = nrvcpus / nrnodes;
+            if ( ++curnode < (nrvcpus % nrnodes) )
+                vcpusleft++;
+            /* NOTE(review): assumes every bit in nodemask names an
+             * existing node; a higher bit indexes past nodemasks[]. */
+            usemask = nodemasks[__ffs(nodemask)];
+            nodemask &= ~(1UL << __ffs(nodemask));
+        }
+        /* Best effort: the result of the pin is not checked. */
+        xc_vcpu_setaffinity(xc_handle, dom, i, usemask);
+        vcpusleft--;
+    }
+    rc = 0;
+
+ out:
+    free(nodemasks);
+    return rc;
+}
+
+/*
+ * Populate nr_pages guest pages, starting at page_array[*cur_pages], with
+ * memory from the given NUMA node.  Superpage extents are tried first,
+ * 4kB extents are the fallback.  *cur_pages is advanced past the pages
+ * handled.  Returns 0 on success, otherwise the non-zero result of
+ * xc_domain_memory_populate_physmap().
+ */
+static int populate_on_node(int xc_handle, uint32_t dom,
+                            unsigned long *cur_pages,
+                            unsigned long nr_pages,
+                            int node, xen_pfn_t *page_array)
+{
+    int rc = 0;
+    unsigned long i;
+
+    while ( (rc == 0) && (nr_pages > 0) )
+    {
+        /* Clip count to maximum 8MB extent. */
+        unsigned long count = nr_pages;
+        if ( count > 2048 )
+            count = 2048;
+
+        /* Clip partial superpage extents to superpage boundaries. */
+        if ( ((*cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+             (count > (-*cur_pages & (SUPERPAGE_NR_PFNS-1))) )
+            count = -*cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
+        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+                  (count > SUPERPAGE_NR_PFNS) )
+            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
+
+        /* Attempt to allocate superpage extents. */
+        if ( ((count | *cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
+        {
+            long done;
+            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
+            struct xen_memory_reservation sp_req = {
+                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
+                .extent_order = SUPERPAGE_PFN_SHIFT,
+                .mem_flags    = XENMEM_set_node(node),
+                .domid        = dom
+            };
+            set_xen_guest_handle(sp_req.extent_start, sp_extents);
+            for ( i = 0; i < sp_req.nr_extents; i++ )
+                sp_extents[i] = page_array[*cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
+            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
+            if ( done > 0 )
+            {
+                done <<= SUPERPAGE_PFN_SHIFT;
+                *cur_pages += done;
+                count -= done;
+                nr_pages -= done;
+            }
+        }
+
+        /* Fall back to 4kB extents. */
+        if ( count != 0 )
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, 0, node, &page_array[*cur_pages]);
+            *cur_pages += count;
+            nr_pages -= count;
+        }
+    }
+    return rc;
+}
+
+/*
+ * Spread the guest's nr_pages pages evenly over the nodes in nodemask.
+ * Pages below *cur_pages count as already handled and are charged to the
+ * first node; the last node takes whatever remainder is left.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int setup_numa_mem(int xc_handle, uint32_t dom,
+                          unsigned long *cur_pages, unsigned long nr_pages,
+                          unsigned long nodemask, xen_pfn_t *page_array)
+{
+    int i, rc;
+    unsigned long cur_node_pages;
+    unsigned long pages_per_node;
+    int numanodes;
+
+    numanodes = hweight_long(nodemask);
+    if ( numanodes == 0 )
+        return -1;
+
+    pages_per_node = ((nr_pages+0xFF)&(~0xFFUL))/numanodes;
+
+    /* Reject guests too small to split: the subtractions below would wrap. */
+    if ( (pages_per_node < *cur_pages) ||
+         ((numanodes - 1) * pages_per_node > nr_pages) )
+        return -1;
+
+    for ( i = 0; i < numanodes; i++ )
+    {
+        if ( i == numanodes - 1 )
+            cur_node_pages = nr_pages - i * pages_per_node;
+        else
+            cur_node_pages = pages_per_node;
+        if ( i == 0 )
+            cur_node_pages -= *cur_pages;
+
+        rc = populate_on_node(xc_handle, dom, cur_pages, cur_node_pages,
+                              __ffs(nodemask), page_array);
+        if ( rc != 0 )
+            return rc;
+
+        /* This node is done; drop it from the mask. */
+        nodemask &= ~(1UL << __ffs(nodemask));
+    }
+    return 0;
+}
+
 static int setup_guest(int xc_handle,
-                       uint32_t dom, int memsize,
+                       uint32_t dom, int memsize, unsigned long nodemask,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
@@ -169,6 +407,7 @@
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
+    int node;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory.
*/ @@ -217,60 +383,30 @@ * We allocate pages in batches of no more than 8MB to ensure that * we can be preempted and hence dom0 remains responsive. */ + + if ( nodemask == 0 ) node = XENMEM_DEFAULT_NODE; + else node = __ffs (nodemask); + rc = xc_domain_memory_populate_physmap( - xc_handle, dom, 0xa0, 0, 0, XENMEM_DEFAULT_NODE, &page_array[0x00]); + xc_handle, dom, 0xa0, 0, 0, node, &page_array[0x00]); cur_pages = 0xc0; - while ( (rc == 0) && (nr_pages > cur_pages) ) - { - /* Clip count to maximum 8MB extent. */ - unsigned long count = nr_pages - cur_pages; - if ( count > 2048 ) - count = 2048; - /* Clip partial superpage extents to superpage boundaries. */ - if ( ((cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) && - (count > (-cur_pages & (SUPERPAGE_NR_PFNS-1))) ) - count = -cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */ - else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) && - (count > SUPERPAGE_NR_PFNS) ) - count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */ - - /* Attempt to allocate superpage extents. */ - if ( ((count | cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 ) - { - long done; - xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT]; - struct xen_memory_reservation sp_req = { - .nr_extents = count >> SUPERPAGE_PFN_SHIFT, - .extent_order = SUPERPAGE_PFN_SHIFT, - .mem_flags = XENMEM_set_node(XENMEM_DEFAULT_NODE), - .domid = dom - }; - set_xen_guest_handle(sp_req.extent_start, sp_extents); - for ( i = 0; i < sp_req.nr_extents; i++ ) - sp_extents[i] = page_array[cur_pages+(i< 0 ) - { - done <<= SUPERPAGE_PFN_SHIFT; - cur_pages += done; - count -= done; - } - } - - /* Fall back to 4kB extents. 
*/ - if ( count != 0 ) - { - rc = xc_domain_memory_populate_physmap( - xc_handle, dom, count, 0, 0, XENMEM_DEFAULT_NODE, &page_array[cur_pages]); - cur_pages += count; - } - } + if ( hweight_long (nodemask) > 1 ) + rc = setup_numa_mem (xc_handle, dom, &cur_pages, nr_pages, + nodemask, page_array); + else + rc = populate_on_node (xc_handle, dom, &cur_pages, nr_pages - cur_pages, + node, page_array); if ( rc != 0 ) { PERROR("Could not allocate memory for HVM guest.\n"); goto error_out; + } + + if ( hweight_long (nodemask) > 1 ) + { + setup_numa_affinity (xc_handle, dom, nodemask); } if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 ) @@ -365,6 +501,7 @@ static int xc_hvm_build_internal(int xc_handle, uint32_t domid, int memsize, + unsigned long nodemask, char *image, unsigned long image_size) { @@ -374,7 +511,7 @@ return -1; } - return setup_guest(xc_handle, domid, memsize, image, image_size); + return setup_guest(xc_handle, domid, memsize, nodemask, image, image_size); } static inline int is_loadable_phdr(Elf32_Phdr *phdr) @@ -389,6 +526,7 @@ int xc_hvm_build(int xc_handle, uint32_t domid, int memsize, + unsigned long nodemask, const char *image_name) { char *image; @@ -399,7 +537,8 @@ ((image = xc_read_image(image_name, &image_size)) == NULL) ) return -1; - sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size); + sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask, + image, image_size); free(image); @@ -412,6 +551,7 @@ int xc_hvm_build_mem(int xc_handle, uint32_t domid, int memsize, + unsigned long nodemask, const char *image_buffer, unsigned long image_size) { @@ -434,7 +574,7 @@ return -1; } - sts = xc_hvm_build_internal(xc_handle, domid, memsize, + sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask, img, img_len); /* xc_inflate_buffer may return the original buffer pointer (for diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Thu Jul 03 13:04:01 2008 +0200 +++ 
b/tools/libxc/xenguest.h Thu Jul 03 13:17:11 2008 +0200
@@ -128,11 +128,13 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name);
 
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size);
 
diff -r a0dccef499b0 -r b84c5f2fe83b tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/libxc/xg_private.c Thu Jul 03 13:17:11 2008 +0200
@@ -177,6 +177,7 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name)
 {
     errno = ENOSYS;
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:17:11 2008 +0200
@@ -857,16 +857,17 @@
 #endif
     char *image;
     int memsize, vcpus = 1, acpi = 0, apic = 1;
+    unsigned long nodemask = 0; /* optional keyword, 0 if omitted */
 
     static char *kwd_list[] = { "domid",
                                 "memsize", "image", "vcpus", "acpi",
-                                "apic", NULL };
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iii", kwd_list,
-                                      &dom, &memsize,
-                                      &image, &vcpus, &acpi, &apic) )
+                                "apic", "nodemask", NULL };
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iiil", kwd_list,
+                                      &dom, &memsize, &image,
+                                      &vcpus, &acpi, &apic, &nodemask) )
         return NULL;
 
-    if ( xc_hvm_build(self->xc_handle, dom, memsize, image) != 0 )
+    if ( xc_hvm_build(self->xc_handle, dom, memsize, nodemask, image) != 0 )
         return pyxc_error_to_exception();
 
 #if !defined(__ia64__)
diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Thu Jul 03 13:04:01 2008 +0200
+++ b/tools/python/xen/xend/XendConfig.py Thu Jul 03 13:17:11 2008 +0200
@@ -162,6 +162,7 @@
     'vhpt': int,
     'guest_os_type': str,
     'hap': int,
+    'guestnodes': int,
 }
 
 # Xen API console 'other_config' keys.
@@ -374,6 +375,7 @@ 'other_config': {}, 'platform': {}, 'target': 0, + 'guestnodes': 0, } return defaults @@ -569,7 +571,10 @@ cfg["memory"] = int(sxp.child_value(sxp_cfg, "memory")) if sxp.child_value(sxp_cfg, "maxmem") != None: cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem")) - + + if sxp.child_value(sxp_cfg, "guestnodes") != None: + cfg["guestnodes"] = int(sxp.child_value(sxp_cfg, "guestnodes")) + # Convert scheduling parameters to vcpus_params if 'vcpus_params' not in cfg: cfg['vcpus_params'] = {} diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Thu Jul 03 13:04:01 2008 +0200 +++ b/tools/python/xen/xend/XendDomainInfo.py Thu Jul 03 13:17:11 2008 +0200 @@ -2053,7 +2053,7 @@ if self.info['cpus'][v]: xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v]) else: - def find_relaxed_node(node_list): + def find_relaxed_node(node_list, numnodes): import sys nr_nodes = info['nr_nodes'] if node_list is None: @@ -2078,21 +2078,36 @@ nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i])) else: nodeload[i] = sys.maxint - index = nodeload.index( min(nodeload) ) - return index + + if numnodes == 0: + return nodeload.index( min(nodeload) ) + else: + nodemask = 0 + for i in range (0,numnodes): + index = min((n, i) for i, n in enumerate(nodeload))[1] + nodemask = nodemask | (1 << index) + nodeload[index] = sys.maxint + return nodemask info = xc.physinfo() + nodemask = 0 if info['nr_nodes'] > 1: node_memory_list = info['node_to_memory'] needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024 + if self.image.guestnodes > 1: + needmem = needmem / self.image.guestnodes candidate_node_list = [] for i in range(0, info['nr_nodes']): if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0: candidate_node_list.append(i) - index = find_relaxed_node(candidate_node_list) - cpumask = info['node_to_cpu'][index] - for v in range(0, self.info['VCPUs_max']): 
- xc.vcpu_setaffinity(self.domid, v, cpumask) + nodemask = find_relaxed_node(candidate_node_list, + self.image.guestnodes) + if self.image.guestnodes < 1: + cpumask = info['node_to_cpu'][nodemask] + for v in range(0, self.info['VCPUs_max']): + xc.vcpu_setaffinity(self.domid, v, cpumask) + else: + self.image.nodemask = nodemask # Use architecture- and image-specific calculations to determine # the various headrooms necessary, given the raw configured diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Thu Jul 03 13:04:01 2008 +0200 +++ b/tools/python/xen/xend/image.py Thu Jul 03 13:17:11 2008 +0200 @@ -127,6 +127,9 @@ self.cpuid = vmConfig['cpuid']; if 'cpuid_check' in vmConfig: self.cpuid_check = vmConfig['cpuid_check'] + + self.guestnodes = int(vmConfig['platform'].get('guestnodes',0)) + self.nodemask = 0 def cleanupBootloading(self): if self.bootloader: @@ -696,6 +699,7 @@ self.apic = int(vmConfig['platform'].get('apic', 0)) self.acpi = int(vmConfig['platform'].get('acpi', 0)) self.guest_os_type = vmConfig['platform'].get('guest_os_type') + self.guestnodes = int(vmConfig['platform'].get('guestnodes', 0)) # Return a list of cmd line args to the device models based on the @@ -797,13 +801,16 @@ log.debug("vcpus = %d", self.vm.getVCpuCount()) log.debug("acpi = %d", self.acpi) log.debug("apic = %d", self.apic) + log.debug("guestnodes = %d", self.guestnodes) + log.debug("nodemask = %d", self.nodemask) rc = xc.hvm_build(domid = self.vm.getDomid(), image = self.loader, memsize = mem_mb, vcpus = self.vm.getVCpuCount(), acpi = self.acpi, - apic = self.apic) + apic = self.apic, + nodemask = self.nodemask) rc['notes'] = { 'SUSPEND_CANCEL': 1 } rc['store_mfn'] = xc.hvm_get_param(self.vm.getDomid(), diff -r a0dccef499b0 -r b84c5f2fe83b tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Thu Jul 03 13:04:01 2008 +0200 +++ b/tools/python/xen/xm/create.py Thu Jul 03 13:17:11 2008 +0200 @@ -557,6 +557,10 @@ 
fn=append_value, default=[], use="""Cpuid check description.""") +gopts.var('guestnodes', val="GUESTNODES", + fn=set_int, default=0, + use="""Number of NUMA nodes to appear in the guest.""") + def err(msg): """Print an error to stderr and exit. """ @@ -765,7 +769,8 @@ 'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten', 'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor', 'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet', - 'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check'] + 'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check', + 'guestnodes' ] for a in args: if a in vals.__dict__ and vals.__dict__[a] is not None: