From mboxrd@z Thu Jan 1 00:00:00 1970 From: Juergen Gross Subject: Re: [PATCH 08 of 11 v3] libxl: optimize the calculation of how many VCPUs can run on a candidate Date: Fri, 01 Feb 2013 15:28:51 +0100 Message-ID: <510BD123.1010203@ts.fujitsu.com> References: <4e5a5f4ba12771415995.1359716478@Solace> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii"; Format="flowed" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <4e5a5f4ba12771415995.1359716478@Solace> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Dario Faggioli Cc: Marcus Granado , Dan Magenheimer , Ian Campbell , Anil Madhavapeddy , George Dunlap , Andrew Cooper , Ian Jackson , xen-devel@lists.xen.org, Jan Beulich , Daniel De Graaf , Matt Wilson List-Id: xen-devel@lists.xenproject.org Am 01.02.2013 12:01, schrieb Dario Faggioli: > For choosing the best NUMA placement candidate, we need to figure out > how many VCPUs are runnable on each of them. That requires going through > all the VCPUs of all the domains and checking their affinities. > > With this change, instead of doing the above for each candidate, we > do it once for all, populating an array while counting. This way, when > we later are evaluating candidates, all we need is summing up the right > elements of the array itself. > > This reduces the complexity of the overall algorithm, as it moves a > potentially expensive operation (for_each_vcpu_of_each_domain {}) > outside from the core placement loop, so that it is performed only > once instead of (potentially) tens or hundreds of times. More > specifically, we go from a worst case computation time complexity of: > > O(2^n_nodes) * O(n_domains*n_domain_vcpus) > > To, with this change: > > O(n_domains*n_domain_vcpus) + O(2^n_nodes) = O(2^n_nodes) > > (with n_nodes<=16, otherwise the algorithm suggests partitioning with > cpupools and does not even start.) 
> > Signed-off-by: Dario Faggioli Acked-by: Juergen Gross > --- > Changes from v2: > * in nr_vcpus_on_nodes(), vcpu_nodemap renamed to something more > sensible, as suggested during review. > > diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c > --- a/tools/libxl/libxl_numa.c > +++ b/tools/libxl/libxl_numa.c > @@ -165,22 +165,34 @@ static uint32_t nodemap_to_free_memkb(li > return free_memkb; > } > > -/* Retrieve the number of vcpus able to run on the cpus of the nodes > - * that are part of the nodemap. */ > -static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo, > +/* Retrieve the number of vcpus able to run on the nodes in nodemap */ > +static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[], > const libxl_bitmap *nodemap) > { > + int i, nr_vcpus = 0; > + > + libxl_for_each_set_bit(i, *nodemap) > + nr_vcpus += vcpus_on_node[i]; > + > + return nr_vcpus; > +} > + > +/* Number of vcpus able to run on the cpus of the various nodes > + * (reported by filling the array vcpus_on_node[]). */ > +static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, > + const libxl_bitmap *suitable_cpumap, > + int vcpus_on_node[]) > +{ > libxl_dominfo *dinfo = NULL; > - libxl_bitmap vcpu_nodemap; > + libxl_bitmap nodes_counted; > int nr_doms, nr_cpus; > - int nr_vcpus = 0; > int i, j, k; > > dinfo = libxl_list_domain(CTX,&nr_doms); > if (dinfo == NULL) > return ERROR_FAIL; > > - if (libxl_node_bitmap_alloc(CTX,&vcpu_nodemap, 0)< 0) { > + if (libxl_node_bitmap_alloc(CTX,&nodes_counted, 0)< 0) { > libxl_dominfo_list_free(dinfo, nr_doms); > return ERROR_FAIL; > } > @@ -193,19 +205,17 @@ static int nodemap_to_nr_vcpus(libxl__gc > if (vinfo == NULL) > continue; > > - /* For each vcpu of each domain ... 
*/ > for (j = 0; j< nr_dom_vcpus; j++) { > + /* For each vcpu of each domain, increment the elements of > + * the array corresponding to the nodes where the vcpu runs */ > + libxl_bitmap_set_none(&nodes_counted); > + libxl_for_each_set_bit(k, vinfo[j].cpumap) { > + int node = tinfo[k].node; > > - /* Build up a map telling on which nodes the vcpu is runnable on */ > - libxl_bitmap_set_none(&vcpu_nodemap); > - libxl_for_each_set_bit(k, vinfo[j].cpumap) > - libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node); > - > - /* And check if that map has any intersection with our nodemap */ > - libxl_for_each_set_bit(k, vcpu_nodemap) { > - if (libxl_bitmap_test(nodemap, k)) { > - nr_vcpus++; > - break; > + if (libxl_bitmap_test(suitable_cpumap, k)&& > + !libxl_bitmap_test(&nodes_counted, node)) { > + libxl_bitmap_set(&nodes_counted, node); > + vcpus_on_node[node]++; > } > } > } > @@ -213,9 +223,9 @@ static int nodemap_to_nr_vcpus(libxl__gc > libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); > } > > - libxl_bitmap_dispose(&vcpu_nodemap); > + libxl_bitmap_dispose(&nodes_counted); > libxl_dominfo_list_free(dinfo, nr_doms); > - return nr_vcpus; > + return 0; > } > > /* > @@ -270,7 +280,7 @@ int libxl__get_numa_candidate(libxl__gc > libxl_numainfo *ninfo = NULL; > int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0; > libxl_bitmap suitable_nodemap, nodemap; > - int rc = 0; > + int *vcpus_on_node, rc = 0; > > libxl_bitmap_init(&nodemap); > libxl_bitmap_init(&suitable_nodemap); > @@ -281,6 +291,8 @@ int libxl__get_numa_candidate(libxl__gc > if (ninfo == NULL) > return ERROR_FAIL; > > + GCNEW_ARRAY(vcpus_on_node, nr_nodes); > + > /* > * The good thing about this solution is that it is based on heuristics > * (implemented in numa_cmpf() ), but we at least can evaluate it on > @@ -330,6 +342,19 @@ int libxl__get_numa_candidate(libxl__gc > goto out; > > /* > + * Later on, we will try to figure out how many vcpus are runnable on > + * each candidate (as a part of choosing the best one of them). 
That > + * requires going through all the vcpus of all the domains and check > + * their affinities. So, instead of doing that for each candidate, > + * let's count here the number of vcpus runnable on each node, so that > + * all we have to do later is summing up the right elements of the > + * vcpus_on_node array. > + */ > + rc = nr_vcpus_on_nodes(gc, tinfo, suitable_cpumap, vcpus_on_node); > + if (rc) > + goto out; > + > + /* > * If the minimum number of NUMA nodes is not explicitly specified > * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes > * from where to start generating candidates, if possible (or just start > @@ -414,7 +439,8 @@ int libxl__get_numa_candidate(libxl__gc > * current best one (if any). > */ > libxl__numa_candidate_put_nodemap(gc,&new_cndt,&nodemap); > - new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo,&nodemap); > + new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node, > +&nodemap); > new_cndt.free_memkb = nodes_free_memkb; > new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap); > new_cndt.nr_cpus = nodes_cpus; > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel > > -- Juergen Gross Principal Developer Operating Systems PBG PDG ES&S SWE OS6 Telephone: +49 (0) 89 3222 2967 Fujitsu Technology Solutions e-mail: juergen.gross@ts.fujitsu.com Domagkstr. 28 Internet: ts.fujitsu.com D-80807 Muenchen Company details: ts.fujitsu.com/imprint.html