From: Elena Ufimtseva <ufimtseva@gmail.com>
To: xen-devel@lists.xen.org
Cc: keir@xen.org, Ian.Campbell@citrix.com,
stefano.stabellini@eu.citrix.com, george.dunlap@eu.citrix.com,
msw@linux.com, dario.faggioli@citrix.com, lccycc123@gmail.com,
ian.jackson@eu.citrix.com, JBeulich@suse.com,
Elena Ufimtseva <ufimtseva@gmail.com>
Subject: [PATCH v5 4/8] vnuma topology parsing routines
Date: Tue, 3 Jun 2014 00:53:16 -0400
Message-ID: <1401771200-11448-6-git-send-email-ufimtseva@gmail.com>
In-Reply-To: <1401771200-11448-1-git-send-email-ufimtseva@gmail.com>
Parse the vNUMA topology from the domain config: the number
of vnodes and their memory ranges. If vNUMA is not defined
in the config, initialize vNUMA with a single node and a
default topology.
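
For illustration, a guest config using the options parsed here might
look like this (a sketch only; the exact option syntax is documented
in the xl.cfg patch of this series, and the values are just an example):

    memory = 1024
    vcpus = 4
    vnodes = 2
    vnumamem = [ "512", "512" ]
    vdistance = [ "10", "20" ]
    numa_cpumask = [ "0", "0", "1", "1" ]
    vnuma_vnodemap = [ "0", "1" ]
    vnuma_autoplacement = 1

vnumamem entries are in MBytes and must sum to the domain memory,
numa_cpumask has one entry per vcpu, and vnuma_vnodemap maps each
vnode to a physical node.
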
Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
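Note (not part of the commit message): with the two-value vdistance
form parsed below, e.g. vdistance = [ "10", "20" ] and vnodes = 3,
vdistance_set() fills a symmetric matrix of the shape

    10 20 20
    20 10 20
    20 20 10

i.e. the first value is used on the diagonal (local distance) and the
second everywhere else.
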
tools/libxl/libxl_vnuma.h | 11 ++
tools/libxl/xl_cmdimpl.c | 406 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 417 insertions(+)
create mode 100644 tools/libxl/libxl_vnuma.h
diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
new file mode 100644
index 0000000..f1568ae
--- /dev/null
+++ b/tools/libxl/libxl_vnuma.h
@@ -0,0 +1,11 @@
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#define VNUMA_NO_NODE ~((unsigned int)0)
+
+/*
+ * The minimum vNUMA node size is 64MB, even though current Linux
+ * allows 32MB, leaving some slack. Will be adjusted to match Linux.
+ */
+#define MIN_VNODE_SIZE 64U
+
+#define MAX_VNUMA_NODES ((unsigned int)1 << 10)
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 5195914..59855ed 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
#include "libxl_json.h"
#include "libxlutil.h"
#include "xl.h"
+#include "libxl_vnuma.h"
/* For calls which return an errno on failure */
#define CHK_ERRNOVAL( call ) ({ \
@@ -725,6 +726,403 @@ static void parse_top_level_sdl_options(XLU_Config *config,
xlu_cfg_replace_string (config, "xauthority", &sdl->xauthority, 0);
}
+
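+/*
+ * Return the list item at index i parsed as an unsigned decimal,
+ * or -EINVAL on error.
+ */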
+static int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+ const char *buf;
+ char *ep;
+ unsigned long ul;
+ int rc = -EINVAL;
+ buf = xlu_cfg_get_listitem(list, i);
+ if (!buf)
+ return rc;
+ ul = strtoul(buf, &ep, 10);
+ if (ep == buf)
+ return rc;
+ if (ul >= UINT16_MAX)
+ return rc;
+ return (int)ul;
+}
+
+static void vdistance_set(unsigned int *vdistance,
+ unsigned int nr_vnodes,
+ unsigned int samenode,
+ unsigned int othernode)
+{
+ unsigned int idx, slot;
+ for (idx = 0; idx < nr_vnodes; idx++)
+ for (slot = 0; slot < nr_vnodes; slot++)
+ *(vdistance + slot * nr_vnodes + idx) =
+ idx == slot ? samenode : othernode;
+}
+
+static void vcputovnode_default(unsigned int *cpu_to_node,
+ unsigned int nr_vnodes,
+ unsigned int max_vcpus)
+{
+ unsigned int cpu;
+ for (cpu = 0; cpu < max_vcpus; cpu++)
+ cpu_to_node[cpu] = cpu % nr_vnodes;
+}
+
+/* Split domain memory between vNUMA nodes equally */
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+ unsigned long long vnodemem = 0;
+ unsigned long n;
+ unsigned int i;
+
+ /* In MBytes */
+ if (b_info->nr_nodes == 0)
+ return -1;
+ vnodemem = (b_info->max_memkb >> 10) / b_info->nr_nodes;
+ if (vnodemem < MIN_VNODE_SIZE)
+ return -1;
+ /* remainder in MBytes */
+ n = (b_info->max_memkb >> 10) % b_info->nr_nodes;
+ /* get final sizes in MBytes */
+ for (i = 0; i < (b_info->nr_nodes - 1); i++)
+ b_info->numa_memszs[i] = vnodemem;
+ /* add the remainder to the last node */
+ b_info->numa_memszs[i] = vnodemem + n;
+ return 0;
+}
+
+static void vnode_to_pnode_default(unsigned int *vnode_to_pnode,
+ unsigned int nr_vnodes)
+{
+ unsigned int i;
+ for (i = 0; i < nr_vnodes; i++)
+ vnode_to_pnode[i] = VNUMA_NO_NODE;
+}
+
+/*
+ * init vNUMA to "zero config" with one node and all other
+ * topology parameters set to default.
+ */
+static int vnuma_zero_config(libxl_domain_build_info *b_info)
+{
+ b_info->nr_nodes = 1;
+ /* all memory goes to this one vnode */
+ if (!(b_info->numa_memszs = (uint64_t *)calloc(b_info->nr_nodes,
+ sizeof(*b_info->numa_memszs))))
+ goto bad_vnumazerocfg;
+
+ if (!(b_info->cpu_to_node = (unsigned int *)calloc(b_info->max_vcpus,
+ sizeof(*b_info->cpu_to_node))))
+ goto bad_vnumazerocfg;
+
+ if (!(b_info->distance = (unsigned int *)calloc(b_info->nr_nodes *
+ b_info->nr_nodes, sizeof(*b_info->distance))))
+ goto bad_vnumazerocfg;
+
+ if (!(b_info->vnode_to_pnode = (unsigned int *)calloc(b_info->nr_nodes,
+ sizeof(*b_info->vnode_to_pnode))))
+ goto bad_vnumazerocfg;
+
+ b_info->numa_memszs[0] = b_info->max_memkb >> 10;
+
+ /* all vcpus assigned to this vnode */
+ vcputovnode_default(b_info->cpu_to_node, b_info->nr_nodes,
+ b_info->max_vcpus);
+
+ /* default vdistance is 10 */
+ vdistance_set(b_info->distance, b_info->nr_nodes, 10, 10);
+
+ /* VNUMA_NO_NODE for vnode_to_pnode */
+ vnode_to_pnode_default(b_info->vnode_to_pnode, b_info->nr_nodes);
+
+ /*
+ * The domain will be placed on physical nodes chosen by automatic
+ * NUMA placement; VNUMA_NO_NODE means no specific physical node
+ * is requested.
+ */
+ libxl_defbool_set(&b_info->vnuma_autoplacement, true);
+ return 0;
+
+ bad_vnumazerocfg:
+ return -1;
+}
+
+/* Caller must exit */
+static void free_vnuma_info(libxl_domain_build_info *b_info)
+{
+ free(b_info->numa_memszs);
+ free(b_info->distance);
+ free(b_info->cpu_to_node);
+ free(b_info->vnode_to_pnode);
+ b_info->nr_nodes = 0;
+}
+
+/*
+static int vdistance_parse(char *vdistcfg, unsigned int *vdistance,
+ unsigned int nr_vnodes)
+{
+ char *endptr, *toka, *tokb, *saveptra = NULL, *saveptrb = NULL;
+ unsigned int *vdist_tmp = NULL;
+ int rc = 0;
+ unsigned int i, j, parsed = 0;
+ unsigned long dist;
+
+ rc = -EINVAL;
+ if (vdistance == NULL) {
+ return rc;
+ }
+ vdist_tmp = (unsigned int *)malloc(nr_vnodes * nr_vnodes * sizeof(*vdistance));
+ if (vdist_tmp == NULL)
+ return rc;
+
+ i = j = 0;
+ for (toka = strtok_r(vdistcfg, ",", &saveptra); toka;
+ toka = strtok_r(NULL, ",", &saveptra)) {
+ if ( i >= nr_vnodes )
+ goto vdist_parse_err;
+ for (tokb = strtok_r(toka, " ", &saveptrb); tokb;
+ tokb = strtok_r(NULL, " ", &saveptrb)) {
+ if (j >= nr_vnodes)
+ goto vdist_parse_err;
+ dist = strtol(tokb, &endptr, 10);
+ if (dist > UINT16_MAX || dist < 0)
+ goto vdist_parse_err;
+ if (tokb == endptr)
+ goto vdist_parse_err;
+ *(vdist_tmp + j*nr_vnodes + i) = dist;
+ parsed++;
+ j++;
+ }
+ i++;
+ j = 0;
+ }
+ rc = parsed;
+ memcpy(vdistance, vdist_tmp, nr_vnodes * nr_vnodes * sizeof(*vdistance));
+
+ vdist_parse_err:
+ free(vdist_tmp);
+ return rc;
+}
+*/
+
+static void parse_vnuma_config(XLU_Config *config, libxl_domain_build_info *b_info)
+{
+ XLU_ConfigList *vnumamemcfg;
+ XLU_ConfigList *vdistancecfg, *vnodemap, *vcpumap;
+ int nr_vnuma_regions;
+ int nr_vdist, nr_vnodemap, nr_vcpumap, i;
+ unsigned long long vnuma_memparsed = 0;
+ long l;
+ unsigned long ul;
+ const char *buf;
+
+ if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+ if (l > MAX_VNUMA_NODES) {
+ fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n", MAX_VNUMA_NODES);
+ goto bad_vnuma_config;
+ }
+ b_info->nr_nodes = l;
+
+ xlu_cfg_get_defbool(config, "vnuma_autoplacement", &b_info->vnuma_autoplacement, 0);
+
+ /* For now, only construct vNUMA if every vnode can have at least one vcpu */
+ if (b_info->nr_nodes != 0 && b_info->max_vcpus >= b_info->nr_nodes) {
+ if (!xlu_cfg_get_list(config, "vnumamem",
+ &vnumamemcfg, &nr_vnuma_regions, 0)) {
+
+ if (nr_vnuma_regions != b_info->nr_nodes) {
+ fprintf(stderr, "Number of numa regions (vnumamem = %d) is incorrect (should be %d).\n",
+ nr_vnuma_regions, b_info->nr_nodes);
+ goto bad_vnuma_config;
+ }
+
+ b_info->numa_memszs = calloc(b_info->nr_nodes,
+ sizeof(*b_info->numa_memszs));
+ if (b_info->numa_memszs == NULL) {
+ fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+ goto bad_vnuma_config;
+ }
+
+ char *ep;
+ /*
+ * Parse only nr_nodes entries, even if more or fewer regions
+ * were given; fewer are handled later, extras are discarded.
+ */
+ for (i = 0; i < b_info->nr_nodes; i++) {
+ buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+ if (!buf) {
+ fprintf(stderr,
+ "xl: Unable to get element %d in vnuma memory list.\n", i);
+ break;
+ }
+ ul = strtoul(buf, &ep, 10);
+ if (ep == buf) {
+ fprintf(stderr,
+ "xl: Invalid argument parsing vnumamem: %s.\n", buf);
+ break;
+ }
+
+ /* MIN_VNODE_SIZE (64MB) is the minimum node size; Linux allows 32MB */
+ if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+ fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
+ ul, MIN_VNODE_SIZE, UINT32_MAX);
+ break;
+ }
+
+ /* memory in MBytes */
+ b_info->numa_memszs[i] = ul;
+ }
+
+ /* Sum the parsed vNUMA node memory for verification */
+ for (i = 0; i < nr_vnuma_regions; i++)
+ vnuma_memparsed += b_info->numa_memszs[i];
+
+ /* Does the vnode memory total match the domain memory? */
+ if ((vnuma_memparsed << 10) != (b_info->max_memkb)) {
+ fprintf(stderr, "xl: vnuma memory is not the same as domain memory size.\n");
+ goto bad_vnuma_config;
+ }
+ } else {
+ b_info->numa_memszs = calloc(b_info->nr_nodes,
+ sizeof(*b_info->numa_memszs));
+ if (b_info->numa_memszs == NULL) {
+ fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+ goto bad_vnuma_config;
+ }
+
+ fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
+ fprintf(stderr, "Using default equal vnode memory size %lu Kbytes to cover %lu Kbytes.\n",
+ b_info->max_memkb / b_info->nr_nodes, b_info->max_memkb);
+
+ if (split_vnumamem(b_info) < 0) {
+ fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
+ goto bad_vnuma_config;
+ }
+ }
+
+ b_info->distance = calloc(b_info->nr_nodes * b_info->nr_nodes,
+ sizeof(*b_info->distance));
+ if (b_info->distance == NULL)
+ goto bad_vnuma_config;
+
+ if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
+ int d1, d2;
+ /*
+ * The first value is the local (same-node) distance, the second
+ * is the distance to every other node. This is required for now
+ * to avoid a non-symmetric distance table, which may break recent
+ * kernels.
+ * TODO: better handling of an extended distance table, possibly
+ * OS specific.
+ */
+ d1 = get_list_item_uint(vdistancecfg, 0);
+ d2 = get_list_item_uint(vdistancecfg, 1);
+
+ if (d1 >= 0 && d2 >= 0 && d1 < d2) {
+ vdistance_set(b_info->distance, b_info->nr_nodes, d1, d2);
+ } else {
+ fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
+ goto bad_vnuma_config;
+ }
+
+ } else {
+ fprintf(stderr, "Could not parse vnuma distances.\n");
+ vdistance_set(b_info->distance, b_info->nr_nodes, 10, 20);
+ }
+
+ b_info->cpu_to_node = (unsigned int *)calloc(b_info->max_vcpus,
+ sizeof(*b_info->cpu_to_node));
+ if (b_info->cpu_to_node == NULL)
+ goto bad_vnuma_config;
+
+ if (!xlu_cfg_get_list(config, "numa_cpumask",
+ &vcpumap, &nr_vcpumap, 0)) {
+ if (nr_vcpumap == b_info->max_vcpus) {
+ int vnode;
+ unsigned int covered = 0;
+ for (i = 0; i < nr_vcpumap; i++) {
+ vnode = get_list_item_uint(vcpumap, i);
+ if (vnode >= 0 && vnode < b_info->nr_nodes) {
+ covered++;
+ b_info->cpu_to_node[i] = vnode;
+ }
+ }
+
+ /* Was every vcpu assigned a valid vnode? */
+ if (covered != nr_vcpumap) {
+ fprintf(stderr, "WARNING: Not every vcpu was mapped to a valid vnode in numa_cpumask.\n");
+ goto bad_vnuma_config;
+ }
+ } else {
+ fprintf(stderr, "WARNING: Bad vnuma_vcpumap.\n");
+ goto bad_vnuma_config;
+ }
+ }
+ else
+ vcputovnode_default(b_info->cpu_to_node,
+ b_info->nr_nodes,
+ b_info->max_vcpus);
+
+ /* Is there a mapping to physical NUMA nodes? */
+ b_info->vnode_to_pnode = (unsigned int *)calloc(b_info->nr_nodes,
+ sizeof(*b_info->vnode_to_pnode));
+ if (b_info->vnode_to_pnode == NULL)
+ goto bad_vnuma_config;
+ if (!xlu_cfg_get_list(config, "vnuma_vnodemap",&vnodemap,
+ &nr_vnodemap, 0)) {
+ /*
+ * If not specified or incorrect, it will be defined
+ * later based on the machine architecture, configuration
+ * and memory available when creating the domain.
+ */
+ if (nr_vnodemap == b_info->nr_nodes) {
+ int pnode;
+ unsigned int covered = 0;
+ for (i = 0; i < b_info->nr_nodes; i++) {
+ pnode = get_list_item_uint(vnodemap, i);
+ if (pnode >= 0) {
+ covered++;
+ b_info->vnode_to_pnode[i] = pnode;
+ }
+ }
+
+ /* Was every vnode mapped to a physical node? */
+ if (covered != nr_vnodemap) {
+ fprintf(stderr, "WARNING: Not all vnodes were covered in vnuma_vnodemap.\n");
+
+ if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+ fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+ vnode_to_pnode_default(b_info->vnode_to_pnode, b_info->nr_nodes);
+ } else
+ goto bad_vnuma_config;
+ }
+ } else {
+ fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
+
+ if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+ fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+ vnode_to_pnode_default(b_info->vnode_to_pnode, b_info->nr_nodes);
+ } else
+ goto bad_vnuma_config;
+ }
+ } else {
+ fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
+
+ if (libxl_defbool_val(b_info->vnuma_autoplacement)) {
+ fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+ vnode_to_pnode_default(b_info->vnode_to_pnode, b_info->nr_nodes);
+ } else
+ goto bad_vnuma_config;
+ }
+ }
+ else if (vnuma_zero_config(b_info))
+ goto bad_vnuma_config;
+ }
+ /* If vnuma topology is not defined for domain, init one node */
+ else if (vnuma_zero_config(b_info))
+ goto bad_vnuma_config;
+ return;
+
+ bad_vnuma_config:
+ free_vnuma_info(b_info);
+ exit(1);
+}
+
static void parse_config_data(const char *config_source,
const char *config_data,
int config_len,
@@ -1081,6 +1479,14 @@ static void parse_config_data(const char *config_source,
exit(1);
}
+ libxl_defbool_set(&b_info->vnuma_autoplacement, false);
+
+ /*
+ * If there is no vnuma in config, "zero" vnuma config
+ * will be initialized with one node and other defaults.
+ */
+ parse_vnuma_config(config, b_info);
+
xlu_cfg_replace_string (config, "bootloader", &b_info->u.pv.bootloader, 0);
switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
&b_info->u.pv.bootloader_args, 1))
--
1.7.10.4