From: Dario Faggioli
Subject: [PATCH 1 of 3] libxl: take node distances into account during NUMA placement
Date: Tue, 16 Oct 2012 19:26:26 +0200
To: xen-devel@lists.xen.org
Cc: Andre Przywara, Ian Jackson, Ian Campbell

Among placement candidates spanning the same number of nodes, the closer
those nodes are to each other, the better the performance for a domain
placed there.

Signed-off-by: Dario Faggioli

diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -105,6 +105,9 @@ out:
  * - the number of vcpus runnable on the candidates is considered, and
  *   candidates with fewer of them are preferred. If two candidate have
  *   the same number of runnable vcpus,
+ * - the sum of the node distances in the candidates is considered, and
+ *   candidates with a smaller total distance are preferred. If the total
+ *   distance is the same for the two candidates,
  * - the amount of free memory in the candidates is considered, and the
  *   candidate with greater amount of it is preferred.
  *
@@ -114,6 +117,10 @@ out:
  * overloading large (from a memory POV) nodes. That's right the effect
  * that counting the vcpus able to run on the nodes tries to prevent.
  *
+ * The relative distances among the nodes of a candidate are considered
+ * because the closer the nodes are, the better it is for the domain
+ * that ends up placed on that candidate.
+ *
  * Note that this completely ignore the number of nodes each candidate span,
  * as the fact that fewer nodes is better is already accounted for in the
  * algorithm.
@@ -124,6 +131,9 @@ static int numa_cmpf(const libxl__numa_c
     if (c1->nr_vcpus != c2->nr_vcpus)
         return c1->nr_vcpus - c2->nr_vcpus;
 
+    if (c1->dists_sum != c2->dists_sum)
+        return c1->dists_sum - c2->dists_sum;
+
     return c2->free_memkb - c1->free_memkb;
 }
 
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2732,6 +2732,7 @@ static inline void libxl__ctx_unlock(lib
 typedef struct {
     int nr_cpus, nr_nodes;
     int nr_vcpus;
+    int dists_sum;
     uint32_t free_memkb;
     libxl_bitmap nodemap;
 } libxl__numa_candidate;
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -218,6 +218,40 @@ static int nodemap_to_nr_vcpus(libxl__gc
     return nr_vcpus;
 }
 
+/* Sum the relative distances of the nodes in the nodemap, to help find
+ * out which candidate is the "tightest" one. */
+static int nodemap_to_dists_sum(libxl_numainfo *ninfo, libxl_bitmap *nodemap)
+{
+    int tot_dist = 0;
+    int i, j, a = 0, b;
+
+    for (i = 0; i < libxl_bitmap_count_set(nodemap); i++) {
+        while (!libxl_bitmap_test(nodemap, a))
+            a++;
+
+        /* As it is usually non-zero, we do take the latency of
+         * a node to itself into account. */
+        b = a;
+        for (j = 0; j < libxl_bitmap_count_set(nodemap) - i; j++) {
+            while (!libxl_bitmap_test(nodemap, b))
+                b++;
+
+            /*
+             * On most architectures, going from node A to node B costs
+             * exactly as much as going from B to A does. However, let's
+             * not rely on this and consider both contributions, just to
+             * be ready for whatever the future might reserve for us.
+             */
+            tot_dist += ninfo[a].dists[b];
+            tot_dist += ninfo[b].dists[a];
+            b++;
+        }
+        a++;
+    }
+
+    return tot_dist;
+}
+
 /*
  * This function tries to figure out if the host has a consistent number
  * of cpus along all its NUMA nodes. In fact, if that is the case, we can
@@ -415,6 +449,7 @@ int libxl__get_numa_candidate(libxl__gc
      */
     libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
     new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap);
+    new_cndt.dists_sum = nodemap_to_dists_sum(ninfo, &nodemap);
     new_cndt.free_memkb = nodes_free_memkb;
     new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
     new_cndt.nr_cpus = nodes_cpus;
@@ -430,12 +465,14 @@ int libxl__get_numa_candidate(libxl__gc
 
             LOG(DEBUG, "New best NUMA placement candidate found: "
                        "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
-                       "free_memkb=%"PRIu32"", new_cndt.nr_nodes,
-                       new_cndt.nr_cpus, new_cndt.nr_vcpus,
+                       "dists_sum=%d, free_memkb=%"PRIu32"",
+                       new_cndt.nr_nodes, new_cndt.nr_cpus,
+                       new_cndt.nr_vcpus, new_cndt.dists_sum,
                        new_cndt.free_memkb / 1024);
 
             libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
             cndt_out->nr_vcpus = new_cndt.nr_vcpus;
+            cndt_out->dists_sum = new_cndt.dists_sum;
             cndt_out->free_memkb = new_cndt.free_memkb;
             cndt_out->nr_nodes = new_cndt.nr_nodes;
             cndt_out->nr_cpus = new_cndt.nr_cpus;
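
For illustration only (not part of the patch): below is a minimal, standalone
sketch of the same pairwise-distance accumulation, using plain C arrays and a
made-up 4-node SLIT-style distance table instead of libxl_numainfo and
libxl_bitmap. It shows why, between two candidates spanning two nodes each,
the one whose nodes are closer ends up with the smaller dists_sum and thus
wins the new tie-break in numa_cmpf().

#include <stdio.h>

#define NR_NODES 4

/* Made-up SLIT-style distance table: nodes {0,1} and {2,3} form two
 * close pairs, while crossing between the pairs is more expensive. */
static const int node_dists[NR_NODES][NR_NODES] = {
    { 10, 20, 40, 40 },
    { 20, 10, 40, 40 },
    { 40, 40, 10, 20 },
    { 40, 40, 20, 10 },
};

/* Same accumulation as nodemap_to_dists_sum(), with the nodemap as a
 * plain array of flags: every pair of nodes in the candidate (a node
 * paired with itself included, as that distance is usually non-zero)
 * contributes both the A->B and the B->A distance. */
static int dists_sum(const int nodemap[NR_NODES])
{
    int a, b, tot = 0;

    for (a = 0; a < NR_NODES; a++) {
        if (!nodemap[a])
            continue;
        for (b = a; b < NR_NODES; b++) {
            if (!nodemap[b])
                continue;
            tot += node_dists[a][b];
            tot += node_dists[b][a];
        }
    }

    return tot;
}

int main(void)
{
    /* Two hypothetical candidates, both spanning two nodes. */
    int tight[NR_NODES]  = { 1, 1, 0, 0 };  /* nodes 0 and 1: close   */
    int spread[NR_NODES] = { 1, 0, 1, 0 };  /* nodes 0 and 2: distant */

    /* tight:  2*10 + 2*10 (self) + 2*20 (pair) =  80
     * spread: 2*10 + 2*10 (self) + 2*40 (pair) = 120
     * so the "tight" candidate wins the dists_sum tie-break. */
    printf("tight=%d spread=%d\n", dists_sum(tight), dists_sum(spread));

    return 0;
}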