From: Dulloor <dulloor@gmail.com>
To: xen-devel@lists.xensource.com
Subject: [vNUMA v2][PATCH 4/8] allocation strategies
Date: Sun, 1 Aug 2010 15:03:27 -0700 [thread overview]
Message-ID: <AANLkTi=Ny0F99+a5hZgt-dGn2v5Exky4eub5VMgFW=Pw@mail.gmail.com> (raw)
In-Reply-To: <AANLkTinI-bhXc-j1YJJpzHk_K1D1NwkX1kaaKRXh8wu0@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 143 bytes --]
Implement the core routines that select physical NUMA nodes for a given
allocation strategy (CONFINE, SPLIT, STRIPE, DONTCARE, or AUTO, which
tries each of them in turn).
-dulloor
Signed-off-by: Dulloor <dulloor@gmail.com>
[-- Attachment #2: xen-04-allocation-strategies.patch --]
[-- Type: text/x-patch, Size: 32922 bytes --]
vNUMA : Implement allocation strategies
diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -28,6 +28,7 @@ CTRL_SRCS-y += xc_mem_event.c
CTRL_SRCS-y += xc_mem_paging.c
CTRL_SRCS-y += xc_memshr.c
CTRL_SRCS-y += xc_cpumap.c
+CTRL_SRCS-y += xc_dom_numa.c
CTRL_SRCS-y += xtl_core.c
CTRL_SRCS-y += xtl_logger_stdio.c
CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
diff --git a/tools/libxc/xc_dom_numa.c b/tools/libxc/xc_dom_numa.c
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.c
@@ -0,0 +1,901 @@
+/* XEN Guest NUMA support
+ * Author : Dulloor (dulloor@gatech.edu) */
+
+#include <string.h>
+#include <stdint.h>
+#include "xg_private.h"
+#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
+
+#ifdef __DOM_NUMA_DEBUG__
+#undef DBGPRINTF
+#define DBGPRINTF(_f, _a...) xc_report(xch, xch->error_handler, XTL_INFO,0, _f , ## _a)
+#endif
+
+#define XC_MAX_NODES 16
+struct xc_node_data {
+ uint32_t node_id;
+ uint64_t size_pages;
+ uint64_t free_pages;
+ xc_cpumask_t cpu_mask; /* node_to_cpumask */
+};
+typedef struct xc_node_data xc_node_data_t;
+
+struct xc_machine_numa_layout {
+ uint64_t size_pages;
+ uint64_t free_pages;
+
+ uint32_t nr_nodes;
+
+ /* Only (nr_nodes*nr_nodes) entries are filled */
+ uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES];
+ /* Only (nr_nodes) entries are filled */
+ xc_node_data_t node_data[XC_MAX_NODES];
+};
+typedef struct xc_machine_numa_layout xc_machine_numa_layout_t;
+
+/* XXX: Move all sanity checks to this function */
+#define XC_DOM_NUMA_MIN_STRIPE 256 /* pages (1MB, assuming 4K pages) */
+xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+ uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config)
+{
+ xc_domain_numa_layout_t *dom_layout;
+
+ if (config->strategy == XC_DOM_NUMA_NONE)
+ {
+ IPRINTF("%s: NUMA memory allocation disabled\n", __FUNCTION__);
+ return 0;
+ }
+ if (!(dom_layout = (xc_domain_numa_layout_t *)malloc(sizeof(*dom_layout))))
+ {
+ ERROR("%s: dom_layout allocation failed\n", __FUNCTION__);
+ return dom_layout;
+ }
+
+ DBGPRINTF("%s: dom_layout allocated\n", __FUNCTION__);
+ memset(dom_layout, 0, sizeof(*dom_layout));
+
+ dom_layout->version = XEN_DOM_NUMA_INTERFACE_VERSION;
+ dom_layout->nr_pages = nr_pages;
+ dom_layout->nr_vnodes = config->nr_nodes;
+
+ /* Internal data */
+ dom_layout->domid = domid;
+ dom_layout->strategy = config->strategy;
+ dom_layout->stripe_size = config->stripe_size;
+ if (dom_layout->stripe_size &&
+ (dom_layout->stripe_size < XC_DOM_NUMA_MIN_STRIPE))
+ {
+ dom_layout->stripe_size = XC_DOM_NUMA_MIN_STRIPE;
+ IPRINTF("%s: Min STRIPE size is %d pages\n",
+ __FUNCTION__, dom_layout->stripe_size);
+ }
+ return dom_layout;
+}
+
+void
+xc_dom_free_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+ DBGPRINTF("%s: dom_layout freed\n", __FUNCTION__);
+ free(dom_layout);
+}
+
+#define XC_DUMP_STR_SZ (8192)
+static void
+xc_dump_dom_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *layout)
+{
+ unsigned int i, j;
+ char *xc_dump_str, *dumpstr;
+ if (!(xc_dump_str = malloc(XC_DUMP_STR_SZ)))
+ {
+ DBGPRINTF("%s : dump_str allocation failed", __FUNCTION__);
+ return;
+ }
+ dumpstr = xc_dump_str;
+ dumpstr += sprintf(dumpstr,
+ "NUMA-LAYOUT(Dom %d) : vcpus(%u), vnodes(%u)",
+ layout->domid, layout->nr_vcpus, layout->nr_vnodes);
+ switch (layout->type)
+ {
+ case XEN_DOM_NUMA_CONFINE:
+ dumpstr += sprintf(dumpstr, ", type(CONFINE)\n");
+ break;
+ case XEN_DOM_NUMA_SPLIT:
+ dumpstr += sprintf(dumpstr, ", type(SPLIT)\n");
+ break;
+ case XEN_DOM_NUMA_STRIPE:
+ dumpstr += sprintf(dumpstr, ", type(STRIPE)\n");
+ break;
+ case XEN_DOM_NUMA_DONTCARE:
+ dumpstr += sprintf(dumpstr, ", type(DONTCARE)\n");
+ break;
+ default:
+ dumpstr += sprintf(dumpstr, ", type(UNDEFINED)\n");
+ }
+ for (i = 0; i < layout->nr_vnodes; i++)
+ {
+ xc_vnode_data_t *vnode_data = &layout->vnode_data[i];
+ dumpstr += sprintf(dumpstr, "vnode[%u]:mnode(%u), node_nr_pages(%x)",
+ vnode_data->vnode_id, vnode_data->mnode_id,
+ vnode_data->nr_pages);
+ if (layout->type == XEN_DOM_NUMA_SPLIT)
+ {
+ char mapstr[128] = "";
+ struct xenctl_cpumap cpumap;
+ xc_cpumap_from_cpumask(&cpumap, &vnode_data->vcpu_mask);
+ xc_cpumap_snprintf(mapstr, sizeof(mapstr), cpumap);
+ dumpstr += sprintf(dumpstr, ", vcpu_mask(%s)", mapstr);
+ }
+ dumpstr += sprintf(dumpstr, "\n");
+ }
+
+ if (layout->type == XEN_DOM_NUMA_CONFINE)
+ goto done;
+ dumpstr += sprintf(dumpstr, "vnode distances :\n");
+ for (i = 0; i < layout->nr_vnodes; i++)
+ dumpstr += sprintf(dumpstr, "\tvnode[%u]", i);
+ for (i = 0; i < layout->nr_vnodes; i++)
+ {
+ dumpstr += sprintf(dumpstr, "\nvnode[%u]", i);
+ for (j = 0; j < layout->nr_vnodes; j++)
+ dumpstr += sprintf(dumpstr, "\t%u",
+ layout->vnode_distance[i*layout->nr_vnodes + j]);
+ dumpstr += sprintf(dumpstr, "\n");
+ }
+done:
+ IPRINTF("%s", xc_dump_str);
+ free(xc_dump_str);
+ return;
+}
+
+
+static int
+xc_get_machine_numa_layout(xc_interface *xch, xc_machine_numa_layout_t *layout)
+{
+ uint32_t i, nr_nodes, nr_cpus;
+ xc_numainfo_t ninfo = { 0 };
+ uint64_t node_memsize[XC_MAX_NODES];
+ uint64_t node_memfree[XC_MAX_NODES];
+ xc_topologyinfo_t tinfo = { 0 };
+ uint32_t cpu_to_node[XC_CPUMASK_NR_CPUS];
+
+ memset(layout, 0, sizeof(*layout));
+ memset(node_memsize, 0, sizeof(uint64_t)*XC_MAX_NODES);
+ memset(node_memfree, 0, sizeof(uint64_t)*XC_MAX_NODES);
+
+ set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+ set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+ /* Read directly into layout's structure */
+ set_xen_guest_handle(ninfo.node_to_node_distance, layout->node_distance);
+ ninfo.max_node_index = XC_MAX_NODES-1;
+ if (xc_numainfo(xch, &ninfo))
+ {
+ ERROR("%s: xc_numainfo failed", __FUNCTION__);
+ return -1;
+ }
+    /* No need to check whether a node is invalid: in that case its
+     * size would be zero and it would never get selected. */
+ nr_nodes = ninfo.max_node_index + 1;
+ if ( nr_nodes > XC_MAX_NODES )
+ nr_nodes = XC_MAX_NODES;
+
+
+ set_xen_guest_handle(tinfo.cpu_to_core, NULL);
+ set_xen_guest_handle(tinfo.cpu_to_socket, NULL);
+ set_xen_guest_handle(tinfo.cpu_to_node, cpu_to_node);
+ tinfo.max_cpu_index = XC_CPUMASK_NR_CPUS-1;
+
+ if (xc_topologyinfo(xch, &tinfo))
+ {
+ ERROR("%s: xc_topologyinfo failed", __FUNCTION__);
+ return -1;
+ }
+
+ nr_cpus = tinfo.max_cpu_index+1;
+ if (nr_cpus > XC_CPUMASK_NR_CPUS)
+ nr_cpus = XC_CPUMASK_NR_CPUS;
+
+ layout->nr_nodes = nr_nodes;
+ for (i=0; i<nr_nodes; i++)
+ {
+ uint64_t size_pages, free_pages;
+ layout->node_data[i].node_id = i;
+ size_pages = (node_memsize[i] >> PAGE_SHIFT);
+ free_pages = (node_memfree[i] >> PAGE_SHIFT);
+ layout->node_data[i].size_pages = size_pages;
+ layout->node_data[i].free_pages = free_pages;
+ layout->size_pages += size_pages;
+ layout->free_pages += free_pages;
+ }
+
+ for (i=0; i<nr_cpus; i++)
+ {
+ struct xenctl_cpumap cpumap;
+ xc_cpumask_t *cpumask;
+
+ if (cpu_to_node[i] == INVALID_TOPOLOGY_ID)
+ continue;
+ cpumask = &(layout->node_data[(cpu_to_node[i])].cpu_mask);
+ xc_cpumap_from_cpumask(&cpumap, cpumask);
+ xc_cpumap_set_cpu(i, cpumap);
+ }
+ return 0;
+}
+
+static int
+xc_get_max_vcpus(xc_interface *xch, uint32_t domid)
+{
+ DECLARE_DOMCTL;
+ domctl.cmd = XEN_DOMCTL_getdomaininfo;
+ domctl.domain = (domid_t)domid;
+ return ((do_domctl(xch, &domctl) < 0)
+ ? 0 : (domctl.u.getdomaininfo.max_vcpu_id+1));
+}
+
+/* The function makes a (greedy) best-fit selection of num_vnodes nodes,
+ * with vnode_pages selected from each. The number of pages selected from
+ * each node is returned in the nodes_pages array.
+ * The best-fit rank is the fraction (in 1024ths) of node memory that
+ * would be occupied if the node were selected.
+ * Returns 0 on success and -1 if selection fails. */
+/* XXX: Node selection needs more research/experience. */
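+/* Worked example of the ranking (hypothetical numbers): a node with
+ * size_pages=1000, free_pages=600 and a request of vnode_pages=200 would,
+ * if selected, be (1000-600+200)=600 pages full, giving a rank of
+ * (600<<10)/1000 = 614 (i.e. ~60% occupied). The fullest node that can
+ * still hold the vnode wins, which packs vnodes tightly. */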
+static int xc_select_best_fit_nodes(
+ xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+ uint32_t num_vnodes, uint64_t vnode_pages, uint64_t *nodes_pages)
+{
+ int i, num_nodes_selected;
+ uint64_t best_fit_rank;
+
+ DBGPRINTF("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+    num_nodes_selected = 0;
+
+    do {
+        int selected_node = INVALID_NODE;
+        best_fit_rank = 0;
+ for (i=0; i<phys_layout->nr_nodes; i++)
+ {
+ xc_node_data_t *node_data;
+ uint64_t node_sizepages, node_freepages;
+ uint64_t node_fit_rank;
+
+ /* Node is already selected */
+ if (nodes_pages[i])
+ continue;
+
+ node_data = &phys_layout->node_data[i];
+ node_sizepages = node_data->size_pages;
+ node_freepages = node_data->free_pages;
+
+ if (node_freepages < vnode_pages)
+ continue;
+
+            node_fit_rank = ((node_sizepages-node_freepages+vnode_pages)
+                                << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (node_fit_rank > best_fit_rank)
+            {
+                best_fit_rank = node_fit_rank;
+                selected_node = i;
+            }
+ }
+
+ /* Nodes could not be selected. Bail out ! */
+ if (selected_node == INVALID_NODE)
+ return -1;
+
+ nodes_pages[selected_node] = vnode_pages;
+ num_nodes_selected++;
+ } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+ return 0;
+}
+
+/* Sort the phys nodes in increasing order of free node memory */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+ int i, j;
+ uint32_t nr_nodes;
+
+ nr_nodes = phys_layout->nr_nodes;
+
+ for (i = 0; i < nr_nodes; i++)
+ {
+ uint64_t i_node_free = phys_layout->node_data[i].free_pages;
+ for (j = i+1; j < nr_nodes; j++)
+ {
+ uint64_t j_node_free = phys_layout->node_data[j].free_pages;
+ if (i_node_free > j_node_free)
+ {
+ xc_node_data_t tmp_node_data;
+ tmp_node_data = phys_layout->node_data[i];
+ phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+                i_node_free = j_node_free;
+            }
+ }
+ }
+
+ return;
+}
+
+/* The function selects nodes in increasing order of free node memory
+ * and fills them up. The physical memory map for such a domain is striped
+ * across all the selected nodes.
+ * The phys_layout node_data structures may be sorted in place, so we
+ * must always use node_data->node_id when indexing the node_distance array.
+ * Returns the number of nodes selected, or -1 on failure. */
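+/* Worked example (hypothetical numbers): with nodes at 100/300/600 free
+ * pages (sorted ascending) and dom_pages=500, the selection takes 100
+ * pages from the first node, 300 from the second and 100 from the third. */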
+static int xc_select_max_fit_nodes(
+ xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+ uint64_t dom_pages, uint64_t *node_pages)
+{
+    int i, nodes_used;
+    uint64_t dom_alloc_pages;
+
+ DBGPRINTF("%s: called\n", __FUNCTION__);
+ xc_sort_nodeload(phys_layout);
+
+    dom_alloc_pages = 0;
+    nodes_used = 0;
+ for (i=0; i<phys_layout->nr_nodes; i++)
+ {
+ xc_node_data_t *node_data;
+ uint64_t node_freepages;
+
+ node_data = &phys_layout->node_data[i];
+
+        /* In max-fit, if we pack the nodes too aggressively, we might
+         * fail on small allocations (from the xen node heaps). That is
+         * why the DEFAULT case does not use the exact_node flag. */
+ node_freepages = node_data->free_pages;
+ if (!node_freepages)
+ continue;
+
+ if (node_freepages > (dom_pages-dom_alloc_pages))
+ node_freepages = (dom_pages-dom_alloc_pages);
+
+        node_pages[i] = node_freepages;
+        dom_alloc_pages += node_freepages;
+        nodes_used++;
+        if (dom_alloc_pages == dom_pages)
+            break;
+ }
+ if (dom_alloc_pages != dom_pages)
+ {
+        ERROR(
+            "%s: Failed to reserve enough memory (had to balloon more?)\n",
+            __FUNCTION__);
+ return -1;
+ }
+    return nodes_used;
+}
+
+static int xc_setup_vnode_vcpu_masks(xc_domain_numa_layout_t *dom_layout)
+{
+ int vcpu;
+ for (vcpu=0; vcpu<dom_layout->nr_vcpus; vcpu++)
+ {
+ struct xenctl_cpumap vcpumap;
+ xc_cpumask_t *vcpumask;
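+        /* vcpus are distributed evenly over vnodes; e.g. with 8 vcpus
+         * and 2 vnodes, vcpus 0-3 land on vnode 0 and 4-7 on vnode 1. */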
+ int vnode = vcpu/(dom_layout->nr_vcpus/dom_layout->nr_vnodes);
+
+ vcpumask = &dom_layout->vnode_data[vnode].vcpu_mask;
+ xc_cpumap_from_cpumask(&vcpumap, vcpumask);
+ xc_cpumap_set_cpu(vcpu, vcpumap);
+ }
+ return 0;
+}
+
+static int xc_setup_vnode_distances(xc_machine_numa_layout_t *phys_layout,
+ xc_domain_numa_layout_t *dom_layout)
+{
+ int vn1, vn2;
+ for (vn1=0; vn1<dom_layout->nr_vnodes; vn1++)
+ {
+ int n1 = dom_layout->vnode_data[vn1].mnode_id;
+ for (vn2=0; vn2<dom_layout->nr_vnodes; vn2++)
+ {
+ int n2 = dom_layout->vnode_data[vn2].mnode_id;
+ dom_layout->vnode_distance[(vn1*dom_layout->nr_vnodes)+vn2] =
+ phys_layout->node_distance[(n1*phys_layout->nr_nodes)+n2];
+
+ }
+ }
+ return 0;
+}
+
+/* We require the vnodes to be aligned to 1GB
+ * SHIFT values for 4K pages */
+#define XC_VNODE_MIN_SHIFT (XEN_MIN_VNODE_SHIFT-PAGE_SHIFT)
+#define XC_VNODE_MIN_SIZE (1UL << XC_VNODE_MIN_SHIFT)
+#define XC_VNODE_MIN_MASK ~(XC_VNODE_MIN_SIZE-1)
+/* Because we are strict with the alignment, we boost the size
+ * to account for pages not seen in the physmap (by 16MB for now). */
+#define XC_VNODE_BOOST_SIZE (4096)
+#define XC_VCPUS_PER_VNODE (1)
+#define XC_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
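+/* Example values, assuming 4K pages and XEN_MIN_VNODE_SHIFT == 30 (1GB):
+ * XC_VNODE_MIN_SHIFT = 30-12 = 18, so XC_VNODE_MIN_SIZE = 2^18 pages (1GB)
+ * and XC_VNODE_BOOST_SIZE = 4096 pages (16MB). */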
+
+static int xc_setup_domain_vnodes(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout,
+ uint64_t *node_pages_selected)
+{
+ int i;
+ uint32_t vnode_id;
+
+ for (i=0, vnode_id=0; i<phys_layout->nr_nodes; i++)
+ {
+ xc_node_data_t *node_data;
+ xc_vnode_data_t *vnode_data;
+
+ if (!node_pages_selected[i])
+ continue;
+
+ node_data = &phys_layout->node_data[i];
+ vnode_data = &dom_layout->vnode_data[vnode_id];
+ vnode_data->vnode_id = vnode_id;
+ vnode_data->nr_pages = node_pages_selected[i];
+ vnode_data->mnode_id = node_data->node_id;
+ vnode_id++;
+ }
+ if (vnode_id != dom_layout->nr_vnodes)
+ {
+ ERROR("%s: Internal Error(vnode count mismatch) (%d/%d) !\n",
+ __FUNCTION__, vnode_id, dom_layout->nr_vnodes);
+ return -1;
+ }
+ /* vnodes are exposed to the guest only for SPLIT. */
+ if (xc_setup_vnode_vcpu_masks(dom_layout) ||
+ (xc_setup_vnode_distances(phys_layout, dom_layout)))
+ {
+ ERROR("%s: vnode setup failed !\n", __FUNCTION__);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xc_select_domain_prep(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ if (!dom_layout->nr_vnodes)
+ {
+ ERROR("%s: VM nr_vnodes configured incorrectly !\n", __FUNCTION__);
+ return -1;
+ }
+
+ if (dom_layout->nr_pages > phys_layout->free_pages)
+ {
+ ERROR(
+ "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+ __FUNCTION__);
+ return -1;
+ }
+
+ if (!(dom_layout->nr_vcpus = xc_get_max_vcpus(xch, dom_layout->domid)))
+ {
+ ERROR("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+ return -1;
+ }
+
+ if (dom_layout->nr_vcpus > XC_CPUMASK_NR_CPUS)
+ {
+ ERROR("%s: Failed - More than %d vcpus!\n",
+ __FUNCTION__, XC_CPUMASK_NR_CPUS);
+ return -1;
+ }
+
+ if (dom_layout->nr_vcpus < dom_layout->nr_vnodes )
+ {
+ ERROR("%s: VM (%d) - more vcpus(%d) than vnodes(%d)!\n",
+ __FUNCTION__, dom_layout->domid, dom_layout->nr_vcpus,
+ dom_layout->nr_vnodes);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xc_select_domain_confine(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ uint64_t *node_pages_selected = 0;
+ int rc;
+
+ DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+ if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+ return -1;
+
+ if (!(node_pages_selected =
+ (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+ {
+ rc = -1;
+ ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+ goto failed;
+ }
+ if ((rc = xc_select_best_fit_nodes(xch, phys_layout, 1,
+ dom_layout->nr_pages, node_pages_selected)))
+ {
+ ERROR("%s: Not enough memory for CONFINE (Had to balloon more ?)\n",
+ __FUNCTION__);
+ goto failed;
+ }
+
+ dom_layout->type = XEN_DOM_NUMA_CONFINE;
+ rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+ node_pages_selected);
+ if (!rc)
+ DBGPRINTF("%s: Selected CONFINE for VM %d\n",
+ __FUNCTION__, dom_layout->domid);
+failed:
+ if (node_pages_selected)
+ free(node_pages_selected);
+ return rc;
+}
+
+/* For the numa guests, we construct a symmetrical topology (wrt the
+ * distribution of vcpus over vnodes).
+ * We require the numa guests to have (2^n) vcpus and (2^k) vnodes.
+ * Each vnode is then assigned 2^(n-k) vcpus, where (n>=k).
+ */
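+/* Example: 8 (2^3) vcpus on 2 (2^1) vnodes gives 2^(3-1) = 4 vcpus per
+ * vnode. */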
+static int xc_select_domain_split(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ uint64_t vnode_nr_pages, *node_pages_selected = 0;
+ int rc;
+
+ DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+ if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+ return -1;
+
+ if (!XC_POWER_OF_2(dom_layout->nr_vcpus))
+ {
+ ERROR("%s: #vcpus != 2^n (disable numa split)\n", __FUNCTION__);
+ return -1;
+ }
+ if (!XC_POWER_OF_2(dom_layout->nr_vnodes))
+ {
+ ERROR("%s: #vnodes != 2^n (disable numa split)\n", __FUNCTION__);
+ return -1;
+ }
+ if (dom_layout->nr_vcpus < (dom_layout->nr_vnodes*XC_VCPUS_PER_VNODE))
+ {
+ ERROR("%s: Failed - Not enough vcpus (%d on %d)!\n",
+ __FUNCTION__, dom_layout->nr_vcpus, dom_layout->nr_vnodes);
+ return -1;
+ }
+
+ vnode_nr_pages =
+ (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)/dom_layout->nr_vnodes;
+ vnode_nr_pages &= XC_VNODE_MIN_MASK;
+ if (vnode_nr_pages < XC_VNODE_MIN_SIZE)
+ {
+ ERROR("%s: vnode_size(%lx)<min(%lx), nr_pages(%lx), nr_vnodes(%d)!\n",
+ __FUNCTION__, vnode_nr_pages, XC_VNODE_MIN_SIZE,
+ dom_layout->nr_pages, dom_layout->nr_vnodes);
+ return -1;
+ }
+ dom_layout->nr_pages = vnode_nr_pages*dom_layout->nr_vnodes;
+
+ if (!(node_pages_selected =
+ (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+ {
+ rc = -1;
+ ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+ goto failed;
+ }
+ if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+ vnode_nr_pages, node_pages_selected)) != 0)
+ {
+ ERROR("%s: Not enough memory for SPLIT (Had to balloon more ?)\n",
+ __FUNCTION__);
+ goto failed;
+ }
+
+ dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+ dom_layout->type = XEN_DOM_NUMA_SPLIT;
+ if ((rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+ node_pages_selected)))
+ goto failed;
+
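+    /* xc_domain_setmaxmem() takes the new limit in KB; shifting the page
+     * count left by (PAGE_SHIFT-10) converts pages to KB. */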
+ if ((rc = xc_domain_setmaxmem(xch, dom_layout->domid,
+ (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)<<(PAGE_SHIFT-10))))
+ goto failed;
+
+ DBGPRINTF("%s: Selected SPLIT for VM %d\n",
+ __FUNCTION__, dom_layout->domid);
+failed:
+ if (node_pages_selected)
+ free(node_pages_selected);
+ return rc;
+}
+
+static int xc_select_domain_stripe(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ uint64_t vnode_nr_pages, *node_pages_selected = 0;
+ int rc;
+
+ DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+ if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+ return -1;
+
+ vnode_nr_pages = dom_layout->nr_pages/dom_layout->nr_vnodes;
+
+ if (!(node_pages_selected =
+ (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+ {
+ rc = -1;
+ ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+ goto failed;
+ }
+ if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+ vnode_nr_pages, node_pages_selected)) != 0)
+ {
+ ERROR("%s: Not enough memory for STRIPE (Had to balloon more ?)\n",
+ __FUNCTION__);
+ goto failed;
+ }
+
+ dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+ dom_layout->type = XEN_DOM_NUMA_STRIPE;
+ rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+ node_pages_selected);
+ if (!rc)
+ DBGPRINTF("%s: Selected STRIPE for VM %d\n",
+ __FUNCTION__, dom_layout->domid);
+failed:
+ if (node_pages_selected)
+ free(node_pages_selected);
+ return rc;
+}
+
+static int xc_select_domain_dontcare(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ uint64_t *node_pages_selected = 0;
+ int rc;
+
+ DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+ if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+ return -1;
+
+ if (!(node_pages_selected =
+ (uint64_t *)calloc(XC_MAX_NODES, sizeof(uint64_t))))
+ {
+ rc = -1;
+ ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+ goto failed;
+ }
+ if ((rc = xc_select_max_fit_nodes(xch, phys_layout, dom_layout->nr_pages,
+ node_pages_selected)) < 0)
+ {
+ ERROR("%s: Not enough memory for CONFINE (Had to balloon more ?)\n",
+ __FUNCTION__);
+ goto failed;
+ }
+
+ dom_layout->type = XEN_DOM_NUMA_DONTCARE;
+ dom_layout->nr_vnodes = rc;
+ rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+ node_pages_selected);
+ if (!rc)
+ DBGPRINTF("%s: Selected DONTCARE for VM %d\n",
+ __FUNCTION__, dom_layout->domid);
+failed:
+ if (node_pages_selected)
+ free(node_pages_selected);
+ return rc;
+}
+
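+/* XXX: Stub - detection of guest NUMA support is not implemented yet,
+ * so the AUTO strategy never attempts a SPLIT. */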
+#define XC_DOM_IS_NUMA_GUEST(n) (0)
+
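+/* AUTO tries the strategies in decreasing order of locality: first
+ * CONFINE (a single node), then SPLIT (a few whole vnodes), then STRIPE,
+ * and finally DONTCARE. */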
+static int xc_select_domain_auto(xc_interface *xch,
+ xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+ int i;
+
+ /* Attempt to confine the VM */
+ DBGPRINTF("%s: Selecting allocation strategy for (VM %d)\n",
+ __FUNCTION__, dom_layout->domid);
+
+ dom_layout->nr_vnodes = 1;
+ if (!xc_select_domain_confine(xch, phys_layout, dom_layout))
+ return 0;
+
+ if (!XC_DOM_IS_NUMA_GUEST(dom_layout))
+ DBGPRINTF("%s: Image doesn't support numa (VM %d)\n",
+ __FUNCTION__, dom_layout->domid);
+ else
+ {
+ /* Attempt to split the VM */
+ for (i = 2; i <= phys_layout->nr_nodes; i<<=1)
+ {
+ dom_layout->nr_vnodes = i;
+ if (!xc_select_domain_split(xch, phys_layout, dom_layout))
+ return 0;
+ }
+ }
+
+ /* Attempt to stripe the VM */
+ for (i = 2; i <= phys_layout->nr_nodes; i++)
+ {
+ dom_layout->nr_vnodes = i;
+ if (!xc_select_domain_stripe(xch, phys_layout, dom_layout))
+ return 0;
+ }
+
+ if (!xc_select_domain_dontcare(xch, phys_layout, dom_layout))
+ return 0;
+
+ ERROR("%s: Failed to allocate memory for the VM (Had to balloon more ?)\n",
+ __FUNCTION__);
+ return -1;
+}
+
+int xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+ int rc;
+ xc_machine_numa_layout_t *phys_layout;
+
+ DBGPRINTF("%s: called (mem_strategy:%d)\n",
+ __FUNCTION__, dom_layout->strategy);
+
+ if (!(phys_layout = malloc(sizeof(*phys_layout))))
+ {
+ ERROR( "%s: phys_layout allocation failed\n", __FUNCTION__);
+ return -1;
+ }
+
+ if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+ {
+ ERROR( "%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+ goto done;
+ }
+
+ switch (dom_layout->strategy)
+ {
+ case XC_DOM_NUMA_AUTO:
+ rc = xc_select_domain_auto(xch, phys_layout, dom_layout);
+ break;
+ case XC_DOM_NUMA_CONFINE:
+        dom_layout->nr_vnodes = 1; /* in case it was misconfigured */
+ rc = xc_select_domain_confine(xch, phys_layout, dom_layout);
+ break;
+ case XC_DOM_NUMA_SPLIT:
+ rc = xc_select_domain_split(xch, phys_layout, dom_layout);
+ break;
+ case XC_DOM_NUMA_STRIPE:
+ rc = xc_select_domain_stripe(xch, phys_layout, dom_layout);
+ break;
+ default:
+ rc = -1;
+ ERROR("%s: Unknown memory allocation strategy (%d)\n",
+ __FUNCTION__, dom_layout->strategy);
+ }
+
+ if (rc)
+ {
+ ERROR("%s: xc_select_domain failed for (%d)\n",
+ __FUNCTION__, dom_layout->strategy);
+ goto done;
+ }
+
+ xc_dump_dom_numa_layout(xch, dom_layout);
+done:
+ free(phys_layout);
+ return rc;
+}
+
+static int
+xc_domain_numa_vcpu_setaffinity(xc_interface *xch, uint32_t domid,
+ int vcpu, struct xenctl_cpumap *cpumap)
+{
+ DECLARE_DOMCTL;
+ int ret = -1;
+
+ domctl.cmd = XEN_DOMCTL_setvcpuaffinity;
+ domctl.domain = (domid_t)domid;
+ domctl.u.vcpuaffinity.vcpu = vcpu;
+ domctl.u.vcpuaffinity.cpumap = *cpumap;
+
+ if ( xc_cpumap_lock_pages(cpumap) != 0 )
+ {
+ PERROR("Could not lock memory for Xen hypercall");
+ goto out;
+ }
+
+ ret = do_domctl(xch, &domctl);
+ xc_cpumap_unlock_pages(cpumap);
+ out:
+ return ret;
+}
+
+static int
+xc_domain_numa_pinvcpus_split(xc_interface *xch,
+ xc_domain_numa_layout_t *dom_layout,
+ xc_machine_numa_layout_t *phys_layout)
+{
+ int vnode;
+
+ for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+ {
+ int vcpu;
+ int mnode = dom_layout->vnode_data[vnode].mnode_id;
+ xc_cpumask_t *node_cpumask =
+ &phys_layout->node_data[mnode].cpu_mask;
+ xc_cpumask_t *vnode_vcpumask =
+ &dom_layout->vnode_data[vnode].vcpu_mask;
+ struct xenctl_cpumap node_cpumap, vnode_vcpumap;
+
+ xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+ xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask);
+ xc_for_each_cpu(vcpu, vnode_vcpumap)
+ {
+ if (xc_domain_numa_vcpu_setaffinity(
+ xch, dom_layout->domid, vcpu, &node_cpumap))
+ {
+ ERROR( "%s:xc_vcpu_setaffinity failed\n", __FUNCTION__);
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+static int
+xc_domain_numa_pinvcpus_stripe(xc_interface *xch,
+ xc_domain_numa_layout_t *dom_layout,
+ xc_machine_numa_layout_t *phys_layout)
+{
+ int vnode, vcpu;
+ xc_cpumask_t stripe_cpumask;
+ struct xenctl_cpumap stripe_cpumap;
+
+ xc_cpumap_from_cpumask(&stripe_cpumap, &stripe_cpumask);
+ xc_cpumap_clearall(stripe_cpumap);
+
+ for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+ {
+ int mnode = dom_layout->vnode_data[vnode].mnode_id;
+ xc_cpumask_t *node_cpumask =
+ &phys_layout->node_data[mnode].cpu_mask;
+ struct xenctl_cpumap node_cpumap;
+
+ xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+ xc_cpumap_or(stripe_cpumap, stripe_cpumap, node_cpumap);
+ }
+
+ for (vcpu = 0; vcpu < dom_layout->nr_vcpus; vcpu++)
+ {
+ if (xc_domain_numa_vcpu_setaffinity(
+ xch, dom_layout->domid, vcpu, &stripe_cpumap))
+ {
+ ERROR( "%s:xc_scpu_getaffinity failed\n", __FUNCTION__);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+ int rc;
+
+ xc_machine_numa_layout_t *phys_layout;
+ if (!(phys_layout = malloc(sizeof(*phys_layout))))
+ {
+ ERROR( "%s: layout allocation failed\n", __FUNCTION__);
+ return -1;
+ }
+
+ if ((rc = xc_get_machine_numa_layout(xch, phys_layout)))
+ {
+ ERROR( "%s: xc_get_machine_numa_layout failed\n",
+ __FUNCTION__);
+ goto done;
+ }
+
+ if ((dom_layout->type == XEN_DOM_NUMA_STRIPE) ||
+ (dom_layout->type == XEN_DOM_NUMA_DONTCARE))
+ rc = xc_domain_numa_pinvcpus_stripe(xch, dom_layout, phys_layout);
+ else
+ rc = xc_domain_numa_pinvcpus_split(xch, dom_layout, phys_layout);
+done:
+ free(phys_layout);
+ return rc;
+}
diff --git a/tools/libxc/xc_dom_numa.h b/tools/libxc/xc_dom_numa.h
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.h
@@ -0,0 +1,73 @@
+#ifndef __XC_DOM_NUMA_H
+#define __XC_DOM_NUMA_H
+
+#include "xenctrl.h"
+#include <xen/dom_numa.h>
+
+#define XC_CPUMASK_NR_CPUS XEN_MAX_VCPUS
+#define XC_MAX_VNODES 8
+
+#define XC_CPUMASK_BITS_PER_BYTE 8
+#define XC_CPUMASK_BITS_TO_BYTES(bits) \
+ (((bits)+XC_CPUMASK_BITS_PER_BYTE-1)/XC_CPUMASK_BITS_PER_BYTE)
+#define XC_CPUMASK_DECLARE_BITMAP(name,bits) \
+ uint8_t name[XC_CPUMASK_BITS_TO_BYTES(bits)]
+
+struct xc_cpumask { XC_CPUMASK_DECLARE_BITMAP(bits, XC_CPUMASK_NR_CPUS); };
+typedef struct xc_cpumask xc_cpumask_t;
+
+/* Construct a xenctl_cpumap structure using buffer from the xc_cpumask
+ * structure */
+#define xc_cpumap_from_cpumask(map, mask) \
+do { \
+ (map)->nr_cpus = XC_CPUMASK_NR_CPUS; \
+ set_xen_guest_handle((map)->bitmap, (mask)->bits); \
+} while (0)
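+/* Usage sketch (assuming a populated xc_cpumask_t 'mask'):
+ *     struct xenctl_cpumap map;
+ *     xc_cpumap_from_cpumask(&map, &mask);
+ * 'map' then aliases the storage in 'mask'; no copy is made. */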
+
+
+struct xc_vnode_data {
+ uint8_t vnode_id;
+ uint8_t mnode_id;
+ uint32_t nr_pages;
+ xc_cpumask_t vcpu_mask; /* vnode_to_vcpumask */
+};
+typedef struct xc_vnode_data xc_vnode_data_t;
+
+struct xc_domain_numa_layout {
+ uint8_t version;
+ uint8_t type;
+
+ uint8_t nr_vcpus;
+ uint8_t nr_vnodes;
+
+ uint32_t nr_pages;
+ /* Only (nr_vnodes) entries are filled */
+ xc_vnode_data_t vnode_data[XC_MAX_VNODES];
+ /* Only (nr_vnodes*nr_vnodes) entries are filled */
+ uint8_t vnode_distance[XC_MAX_VNODES*XC_MAX_VNODES];
+
+ /* For Internal USE only */
+ uint32_t domid;
+ uint16_t strategy;
+ uint16_t stripe_size;
+};
+typedef struct xc_domain_numa_layout xc_domain_numa_layout_t;
+
+extern xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+ uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config);
+extern void xc_dom_free_numa_layout(xc_interface *xch,
+ xc_domain_numa_layout_t *dom_layout);
+
+extern int
+xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+extern int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+
+static inline int xc_domain_nr_vnodes(xc_domain_numa_layout_t * dom_layout)
+{
+ if (!dom_layout || (dom_layout->type != XEN_DOM_NUMA_SPLIT))
+ return 0;
+ return dom_layout->nr_vnodes;
+}
+
+#endif
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel