From mboxrd@z Thu Jan 1 00:00:00 1970 From: Juergen Gross Subject: Re: [PATCH v2] xl: add memory allocation logic for numa platform Date: Tue, 09 Aug 2011 08:10:05 +0200 Message-ID: <4E40CF3D.50305@ts.fujitsu.com> References: <749B9D3DBF0F054390025D9EAFF47F2212D10A330D@shsmsx501.ccr.corp.intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <749B9D3DBF0F054390025D9EAFF47F2212D10A330D@shsmsx501.ccr.corp.intel.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xensource.com Errors-To: xen-devel-bounces@lists.xensource.com To: "Zhang, Yang Z" Cc: "xen-devel@lists.xensource.com" , "'Stefano Stabellini (stefano.stabellini@eu.citrix.com)'" List-Id: xen-devel@lists.xenproject.org Hi again, sorry, I didn't spot this before: you should use the generic cpumap functions (libxl_cpumap_alloc, libxl_cpumap_test, libxl_cpumap_set, libxl_cpumap_reset) and the libxl_cpumap type for the cpumaps. This will also remove the overkill of using an inline asm construct just to set a bit. On 08/09/2011 07:52 AM, Zhang, Yang Z wrote: > Thanks Juergen's comments. Here is the revised patch which add cpupool check. > > For numa platform, we need to allocate memory for guest on which guest cpu reside. > This patch add this feature for xl. Just use the simple algorithm to select the best node. > > Signed-off-by: Zhang Yang > > diff -r 9aa47ef52e4d tools/libxl/libxl.c > --- a/tools/libxl/libxl.c Mon Jul 04 06:08:05 2011 +0800 > +++ b/tools/libxl/libxl.c Fri Aug 12 13:51:41 2011 +0800 > @@ -2259,6 +2259,108 @@ > return ERROR_FAIL; > } > > +static inline void set_bit(int nr, volatile void *addr) > +{ > + asm volatile ( > + "btsl %1,%0" > + : "=m" (ADDR) > + : "Ir" (nr), "m" (ADDR) : "memory"); This x86-specific inline asm (btsl) breaks other architectures (e.g. ia64). 
> +} > + > +int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo) > +{ > + xc_numainfo_t ninfo = { 0 }; > + libxl_physinfo physinfo = { 0 }; > + libxl_topologyinfo topoinfo; > + int i, max_nodes, max_cpus, node; > + libxl_nodeinfo_t *nodeinfo; > + DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, node_memsize); > + DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, node_memfree); > + > + if (libxl_get_physinfo(ctx,&physinfo)) > + goto out; > + > + max_cpus = physinfo.max_cpu_id + 1; > + max_nodes = NUMA_NO_NODE + 1; > + numainfo->max_cpus = max_cpus; > + > + numainfo->cpu_to_node = calloc(max_cpus, sizeof (unsigned long)); > + if (numainfo->cpu_to_node == NULL) > + goto out; > + > + numainfo->nodeinfo = (char *)calloc(max_nodes, sizeof(libxl_nodeinfo_t)); > + if (numainfo->nodeinfo == NULL) > + goto out; > + > + nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo; > + node_memsize = xc_hypercall_buffer_alloc(ctx->xch, node_memsize, sizeof(*node_memsize) * max_nodes); > + if ( node_memsize == NULL ) > + goto out; > + node_memfree = xc_hypercall_buffer_alloc(ctx->xch, node_memfree, sizeof(*node_memfree) * max_nodes); > + if ( node_memfree == NULL ) > + goto out; > + > + set_xen_guest_handle(ninfo.node_to_memsize, node_memsize); > + set_xen_guest_handle(ninfo.node_to_memfree, node_memfree); > + ninfo.max_node_index = max_nodes - 1; > + > + if ( xc_numainfo(ctx->xch,&ninfo) != 0 ) > + goto out; > + > + max_nodes = ninfo.max_node_index + 1; > + numainfo->max_nodes = max_nodes; > + > + if (libxl_get_topologyinfo(ctx,&topoinfo)) > + goto out; > + > + for ( i = 0; i<= max_nodes; i++ ) { > + if (node_memsize[i] != INVALID_MEM_NODE) { > + nodeinfo[i].online = 1; > + nodeinfo[i].cpumap = malloc(BITS_TO_LONGS(max_cpus) * sizeof (unsigned long)); > + bzero(nodeinfo[i].cpumap, BITS_TO_LONGS(max_cpus) * sizeof (unsigned long)); > + > + /* Total Memory */ > + nodeinfo[i].total_memkb = node_memsize[i]>> 10; /* KB */ > + > + /* Free Memory */ > + nodeinfo[i].free_memkb = 
node_memfree[i]>> 10; /* KB */ > + } else > + nodeinfo[i].online = 0; > + } > + > + for (i = 0; i< max_cpus; i++) > + if (topoinfo.coremap.array[i] != LIBXL_CPUARRAY_INVALID_ENTRY) { > + node = topoinfo.nodemap.array[i]; > + set_bit(i, nodeinfo[node].cpumap); > + numainfo->cpu_to_node[i] = node; > + } > + libxl_topologyinfo_destroy(&topoinfo); > + > + xc_hypercall_buffer_free(ctx->xch, node_memsize); > + xc_hypercall_buffer_free(ctx->xch, node_memfree); > + return 0; > + > +out: > + if (numainfo->cpu_to_node) > + free(numainfo->cpu_to_node); > + if (numainfo->nodeinfo); > + free(numainfo->nodeinfo); > + xc_hypercall_buffer_free(ctx->xch, node_memsize); > + xc_hypercall_buffer_free(ctx->xch, node_memfree); > + return ERROR_FAIL; > +} > + > +void libxl_free_numainfo(libxl_numainfo_t *numainfo) > +{ > + int i; > + libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo; > + > + for(i = 0; i< numainfo->max_nodes; i++) > + if(nodeinfo[i].cpumap) > + free(nodeinfo[i].cpumap); > + free(numainfo->cpu_to_node); > + free(numainfo->nodeinfo); > +} > const libxl_version_info* libxl_get_version_info(libxl_ctx *ctx) > { > union { > diff -r 9aa47ef52e4d tools/libxl/libxl.h > --- a/tools/libxl/libxl.h Mon Jul 04 06:08:05 2011 +0800 > +++ b/tools/libxl/libxl.h Fri Aug 12 13:51:41 2011 +0800 > @@ -496,6 +496,16 @@ > > int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo); > int libxl_get_topologyinfo(libxl_ctx *ctx, libxl_topologyinfo *info); > + > +#define NUMA_NO_NODE 0xFF > +#define INVALID_MEM_NODE 0ul > +#define BITS_PER_LONG (sizeof(unsigned long) * 8) > +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) > +#define ADDR (*(volatile long *) addr) > + > +int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo); > +void libxl_free_numainfo(libxl_numainfo_t *numainfo); > + > libxl_vcpuinfo *libxl_list_vcpu(libxl_ctx *ctx, uint32_t domid, > int *nb_vcpu, int *nrcpus); > int libxl_set_vcpuaffinity(libxl_ctx *ctx, uint32_t domid, 
uint32_t vcpuid, > diff -r 9aa47ef52e4d tools/libxl/libxl.idl > --- a/tools/libxl/libxl.idl Mon Jul 04 06:08:05 2011 +0800 > +++ b/tools/libxl/libxl.idl Fri Aug 12 13:51:41 2011 +0800 > @@ -291,6 +291,19 @@ > ("socketmap", libxl_cpuarray, False, "cpu to socket map"), > ("nodemap", libxl_cpuarray, False, "cpu to node map"), > ]) > +libxl_nodeinfo = Struct("nodeinfo_t", [ > + ("free_memkb", uint32), > + ("total_memkb", uint32), > + ("candidate", uint32), > + ("online", uint32), > + ("cpumap", string), > + ]) > +libxl_numainfo = Struct("numainfo_t", [ > + ("nodeinfo", string), > + ("max_nodes", uint32), > + ("cpu_to_node", string), > + ("max_cpus", uint32), > + ]) > > libxl_sched_credit = Struct("sched_credit", [ > ("weight", integer), > diff -r 9aa47ef52e4d tools/libxl/libxl_create.c > --- a/tools/libxl/libxl_create.c Mon Jul 04 06:08:05 2011 +0800 > +++ b/tools/libxl/libxl_create.c Fri Aug 12 13:51:41 2011 +0800 > @@ -143,17 +143,110 @@ > console->build_state = state; > return 0; > } > +static int find_best_node(libxl_ctx *ctx, libxl_numainfo_t *numainfo) > +{ > + int nr_doms, i, j, nr_vcpu, nrcpus, best_node, pcpu, node_id; > + unsigned long max_nodes = numainfo->max_nodes; > + unsigned long *nodeload; > + libxl_dominfo *dominfo; > + libxl_vcpuinfo *vcpuinfo; > + libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo; > + > + nodeload = malloc(max_nodes * sizeof(*nodeload)); > + bzero(nodeload, max_nodes * sizeof(*nodeload)); > + > + if (!(dominfo = libxl_list_domain(ctx,&nr_doms))) > + goto out; > + > + for (i = 0; i< nr_doms; i++) { > + vcpuinfo = libxl_list_vcpu(ctx, dominfo[i].domid,&nr_vcpu,&nrcpus); > + if (!vcpuinfo) > + goto out; > + for (j = 0; j< nr_vcpu; j++) { > + if (!vcpuinfo[j].online) > + continue; > + pcpu = vcpuinfo[j].cpu; > + node_id = numainfo->cpu_to_node[pcpu]; > + if (nodeinfo[node_id].candidate) > + nodeload[node_id]++; > + else > + nodeload[node_id] += 8; > + } > + free(vcpuinfo); > + } > + best_node = 0; > + for (i = 1; i< 
max_nodes; i++) > + if(nodeinfo[i].candidate&& nodeinfo[i].online > +&& nodeload[i]< nodeload[best_node]) > + best_node = i; > + > + return best_node; > +out: > + if (dominfo) > + free(dominfo); > + return -1; > +} > + > +static int libxl_node_select(libxl_ctx *ctx, libxl_domain_build_info *b_info, uint32_t domid) > +{ > + unsigned long i, best_node; > + unsigned long needmem = b_info->max_memkb; > + libxl_numainfo_t numainfo ={ 0 }; > + libxl_nodeinfo_t *nodeinfo; > + libxl_cpupoolinfo *poolinfo; > + int n_pools; > + > + poolinfo = libxl_list_cpupool(ctx,&n_pools); > + for (i = 0; i< n_pools; i++) > + libxl_cpupoolinfo_destroy(poolinfo + i); > + if (n_pools> 1) { > + printf("cpupools are being used - skip numa optimization.\n"); > + return 0; > + } > + > + > + if (libxl_get_numainfo(ctx,&numainfo)) { > + fprintf(stderr, "libxl_get_topologyinfo failed.\n"); > + return -1; > + } > + > + if (numainfo.max_nodes< 2) { > + printf("max_nodes = %d\n", numainfo.max_nodes); > + return 0; > + } > + > + nodeinfo = (libxl_nodeinfo_t *)numainfo.nodeinfo; > + for (i = 0; i< numainfo.max_nodes; i++) > + if (nodeinfo[i].free_memkb> needmem) > + nodeinfo[i].candidate = 1; > + > + best_node = find_best_node(ctx,&numainfo); > + if (best_node == -1) { > + libxl_numainfo_t_destroy(&numainfo); > + return -1; > + } > + > + for (i = 0; i< b_info->max_vcpus; i++) > + xc_vcpu_setaffinity(ctx->xch, domid, i, (uint8_t *)(nodeinfo[best_node].cpumap)); > + > + libxl_numainfo_t_destroy(&numainfo); > + return 0; > +} > > int libxl__domain_build(libxl__gc *gc, libxl_domain_build_info *info, uint32_t domid, libxl_domain_build_state *state) > { > char **vments = NULL, **localents = NULL; > struct timeval start_time; > int i, ret; > + libxl_ctx *ctx = libxl__gc_owner(gc); > > ret = libxl__build_pre(gc, domid, info, state); > if (ret) > goto out; > > + if (libxl_node_select(ctx, info, domid)) > + printf("Cannot find best node, using defaul algorithm\n"); > + > gettimeofday(&start_time, NULL); > > if 
(info->hvm) { Juergen -- Juergen Gross Principal Developer Operating Systems PDG ES&S SWE OS6 Telephone: +49 (0) 89 3222 2967 Fujitsu Technology Solutions e-mail: juergen.gross@ts.fujitsu.com Domagkstr. 28 Internet: ts.fujitsu.com D-80807 Muenchen Company details: ts.fujitsu.com/imprint.html