From: He Chen <he.chen@linux.intel.com>
To: qemu-devel@nongnu.org
Cc: "Michael S . Tsirkin" <mst@redhat.com>,
Igor Mammedov <imammedo@redhat.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Richard Henderson <rth@twiddle.net>,
Eduardo Habkost <ehabkost@redhat.com>,
Eric Blake <eblake@redhat.com>,
Markus Armbruster <armbru@redhat.com>,
He Chen <he.chen@linux.intel.com>
Subject: [Qemu-devel] [PATCH] x86: Allow to set NUMA distance for different NUMA nodes
Date: Fri, 10 Mar 2017 18:18:17 +0800 [thread overview]
Message-ID: <1489141097-28587-1-git-send-email-he.chen@linux.intel.com> (raw)
Current, QEMU does not provide a clear command to set vNUMA distance for
guest although we already have `-numa` command to set vNUMA nodes.
vNUMA distance makes sense in certain scenario.
But now, if we create a guest that has 4 vNUMA nodes, when we check NUMA
info via `numactl -H`, we will see:
node distance:
node 0 1 2 3
0: 10 20 20 20
1: 20 10 20 20
2: 20 20 10 20
3: 20 20 20 10
Guest kernel regards all local node as distance 10, and all remote node
as distance 20 when there is no SLIT table since QEMU doesn't build it.
It looks like a little strange when you have seen the distance in an
actual physical machine that contains 4 NUMA nodes. My machine shows:
node distance:
node 0 1 2 3
0: 10 21 31 41
1: 21 10 21 31
2: 31 21 10 21
3: 41 31 21 10
This patch is going to add SLIT table support in QEMU, and provides
addtional option `dist` for command `-numa` to allow user set vNUMA
distance by QEMU command.
With this patch, when a user wants to create a guest that contains
several vNUMA nodes and also wants to set distance among those nodes,
the QEMU command would like:
```
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=0,policy=bind,id=node0 \
-numa node,nodeid=0,cpus=0,memdev=node0 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=1,policy=bind,id=node1 \
-numa node,nodeid=1,cpus=1,memdev=node1 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=2,policy=bind,id=node2 \
-numa node,nodeid=2,cpus=2,memdev=node2 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=3,policy=bind,id=node3 \
-numa node,nodeid=3,cpus=3,memdev=node3 \
-numa dist,a=0,b=1,val=21 \
-numa dist,a=0,b=2,val=31 \
-numa dist,a=0,b=3,val=41 \
-numa dist,a=1,b=0,val=21 \
...
```
Thanks,
-He
Signed-off-by: He Chen <he.chen@linux.intel.com>
---
hw/i386/acpi-build.c | 28 ++++++++++++++++++++++++++
include/hw/acpi/acpi-defs.h | 9 +++++++++
include/sysemu/numa.h | 1 +
numa.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
qapi-schema.json | 24 +++++++++++++++++++++--
qemu-options.hx | 5 ++++-
6 files changed, 112 insertions(+), 3 deletions(-)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2073108..7ced37d 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2396,6 +2396,32 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
}
static void
+build_slit(GArray *table_data, BIOSLinker *linker, MachineState *machine)
+{
+ struct AcpiSystemLocalityDistanceTable *slit;
+ uint8_t *entry;
+ int slit_start, slit_data_len, i, j;
+ slit_start = table_data->len;
+
+ slit = acpi_data_push(table_data, sizeof(*slit));
+ slit->nb_localities = nb_numa_nodes;
+
+ slit_data_len = sizeof(uint8_t) * nb_numa_nodes * nb_numa_nodes;
+ entry = acpi_data_push(table_data, slit_data_len);
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ for (j = 0; j < nb_numa_nodes; j++) {
+ entry[i * nb_numa_nodes + j] = numa_info[i].distance[j];
+ }
+ }
+
+ build_header(linker, table_data,
+ (void *)(table_data->data + slit_start),
+ "SLIT",
+ table_data->len - slit_start, 1, NULL, NULL);
+}
+
+static void
build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
{
AcpiTableMcfg *mcfg;
@@ -2678,6 +2704,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
if (pcms->numa_nodes) {
acpi_add_table(table_offsets, tables_blob);
build_srat(tables_blob, tables->linker, machine);
+ acpi_add_table(table_offsets, tables_blob);
+ build_slit(tables_blob, tables->linker, machine);
}
if (acpi_get_mcfg(&mcfg)) {
acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 4cc3630..b183a8f 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -527,6 +527,15 @@ struct AcpiSratProcessorGiccAffinity
typedef struct AcpiSratProcessorGiccAffinity AcpiSratProcessorGiccAffinity;
+/*
+ * SLIT (NUMA distance description) table
+ */
+struct AcpiSystemLocalityDistanceTable {
+ ACPI_TABLE_HEADER_DEF
+ uint64_t nb_localities;
+} QEMU_PACKED;
+typedef struct AcpiSystemLocalityDistanceTable AcpiSystemLocalityDistanceTable;
+
/* PCI fw r3.0 MCFG table. */
/* Subtable */
struct AcpiMcfgAllocation {
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..2f7a941 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -21,6 +21,7 @@ typedef struct node_info {
struct HostMemoryBackend *node_memdev;
bool present;
QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+ uint8_t distance[MAX_NODES];
} NodeInfo;
extern NodeInfo numa_info[MAX_NODES];
diff --git a/numa.c b/numa.c
index e01cb54..897657a 100644
--- a/numa.c
+++ b/numa.c
@@ -50,6 +50,9 @@ static int have_memdevs = -1;
static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
* For all nodes, nodeid < max_numa_nodeid
*/
+static int min_numa_distance = 10;
+static int def_numa_distance = 20;
+static int max_numa_distance = 255;
int nb_numa_nodes;
NodeInfo numa_info[MAX_NODES];
@@ -208,10 +211,33 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
}
+
numa_info[nodenr].present = true;
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
}
+static void numa_distance_parse(NumaDistOptions *dist, QemuOpts *opts, Error **errp)
+{
+ uint8_t a = dist->a;
+ uint8_t b = dist->b;
+ uint8_t val = dist->val;
+
+ if (a >= MAX_NODES || b >= MAX_NODES) {
+ error_setg(errp, "Max number of NUMA nodes reached: %"
+ PRIu16 "", a > b ? a : b);
+ return;
+ }
+
+ if (val > max_numa_distance || val < min_numa_distance) {
+ error_setg(errp,
+ "NUMA distance (%" PRIu8 ") out of range (%d)~(%d)",
+ dist->val, max_numa_distance, min_numa_distance);
+ return;
+ }
+
+ numa_info[a].distance[b] = val;
+}
+
static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
{
NumaOptions *object = NULL;
@@ -235,6 +261,12 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
}
nb_numa_nodes++;
break;
+ case NUMA_OPTIONS_TYPE_DIST:
+ numa_distance_parse(&object->u.dist, opts, &err);
+ if (err) {
+ goto end;
+ }
+ break;
default:
abort();
}
@@ -294,6 +326,21 @@ static void validate_numa_cpus(void)
g_free(seen_cpus);
}
+static void default_numa_distance(void)
+{
+ int i, j;
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ for (j = 0; j < nb_numa_nodes; j++) {
+ if (i == j && numa_info[i].distance[j] != min_numa_distance) {
+ numa_info[i].distance[j] = min_numa_distance;
+ } else if (numa_info[i].distance[j] <= min_numa_distance) {
+ numa_info[i].distance[j] = def_numa_distance;
+ }
+ }
+ }
+}
+
void parse_numa_opts(MachineClass *mc)
{
int i;
@@ -390,6 +437,7 @@ void parse_numa_opts(MachineClass *mc)
}
validate_numa_cpus();
+ default_numa_distance();
} else {
numa_set_mem_node_id(0, ram_size, 0);
}
diff --git a/qapi-schema.json b/qapi-schema.json
index 32b4a4b..2988304 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5647,7 +5647,7 @@
# Since: 2.1
##
{ 'enum': 'NumaOptionsType',
- 'data': [ 'node' ] }
+ 'data': [ 'node', 'dist' ] }
##
# @NumaOptions:
@@ -5660,7 +5660,8 @@
'base': { 'type': 'NumaOptionsType' },
'discriminator': 'type',
'data': {
- 'node': 'NumaNodeOptions' }}
+ 'node': 'NumaNodeOptions',
+ 'dist': 'NumaDistOptions' }}
##
# @NumaNodeOptions:
@@ -5689,6 +5690,25 @@
'*memdev': 'str' }}
##
+# @NumaDistOptions:
+#
+# Set distance between 2 NUMA nodes. (for OptsVisitor)
+#
+# @a: first NUMA node.
+#
+# @b: second NUMA node.
+#
+# @val: NUMA distance between 2 given NUMA nodes.
+#
+# Since: 2.9
+##
+{ 'struct': 'NumaDistOptions',
+ 'data': {
+ 'a': 'uint8',
+ 'b': 'uint8',
+ 'val': 'uint8' }}
+
+##
# @HostMemPolicy:
#
# Host memory policy types
diff --git a/qemu-options.hx b/qemu-options.hx
index 8dd8ee3..0de5cf8 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -139,12 +139,15 @@ ETEXI
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n", QEMU_ARCH_ALL)
+ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
+ "-numa dist,a=firstnode,b=secondnode,val=distance\n", QEMU_ARCH_ALL)
STEXI
@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
+@itemx -numa dist,a=@var{firstnode},b=@var{secondnode},val=@var{distance}
@findex -numa
Define a NUMA node and assign RAM and VCPUs to it.
+Set NUMA distance between 2 NUMA nodes.
@var{firstcpu} and @var{lastcpu} are CPU indexes. Each
@samp{cpus} option represent a contiguous range of CPU indexes
--
2.7.4
next reply other threads:[~2017-03-10 10:18 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-03-10 10:18 He Chen [this message]
2017-03-10 16:34 ` [Qemu-devel] [PATCH] x86: Allow to set NUMA distance for different NUMA nodes Michael S. Tsirkin
2017-03-10 16:37 ` Eric Blake
2017-03-10 16:38 ` Daniel P. Berrange
2017-03-10 16:40 ` Michael S. Tsirkin
2017-03-10 16:45 ` Daniel P. Berrange
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1489141097-28587-1-git-send-email-he.chen@linux.intel.com \
--to=he.chen@linux.intel.com \
--cc=armbru@redhat.com \
--cc=eblake@redhat.com \
--cc=ehabkost@redhat.com \
--cc=imammedo@redhat.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.