From: He Chen <he.chen@linux.intel.com>
To: qemu-devel@nongnu.org
Cc: "Michael S . Tsirkin" <mst@redhat.com>,
Igor Mammedov <imammedo@redhat.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Richard Henderson <rth@twiddle.net>,
Eduardo Habkost <ehabkost@redhat.com>,
Eric Blake <eblake@redhat.com>,
Markus Armbruster <armbru@redhat.com>,
Andrew Jones <drjones@redhat.com>
Subject: [Qemu-devel] [PATCH v5] Allow setting NUMA distance for different NUMA nodes
Date: Thu, 6 Apr 2017 10:18:53 +0800 [thread overview]
Message-ID: <1491445133-6534-1-git-send-email-he.chen@linux.intel.com> (raw)
This patch adds SLIT table support to QEMU and provides an additional
`dist` option for the `-numa` command-line option, which allows the user
to set the vNUMA distances from the QEMU command line.
With this patch, when a user wants to create a guest that contains
several vNUMA nodes and also wants to set the distances among those nodes,
the QEMU command would look like:
```
-numa node,nodeid=0,cpus=0 \
-numa node,nodeid=1,cpus=1 \
-numa node,nodeid=2,cpus=2 \
-numa node,nodeid=3,cpus=3 \
-numa dist,src=0,dst=1,val=21 \
-numa dist,src=0,dst=2,val=31 \
-numa dist,src=0,dst=3,val=41 \
-numa dist,src=1,dst=2,val=21 \
-numa dist,src=1,dst=3,val=31 \
-numa dist,src=2,dst=3,val=21 \
```
Signed-off-by: He Chen <he.chen@linux.intel.com>
---
hw/acpi/aml-build.c | 25 +++++++++
hw/i386/acpi-build.c | 2 +
include/hw/acpi/aml-build.h | 1 +
include/sysemu/numa.h | 1 +
include/sysemu/sysemu.h | 4 ++
numa.c | 121 ++++++++++++++++++++++++++++++++++++++++++++
qapi-schema.json | 30 ++++++++++-
qemu-options.hx | 17 ++++++-
8 files changed, 198 insertions(+), 3 deletions(-)
diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index c6f2032..2c6ab07 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -24,6 +24,7 @@
#include "hw/acpi/aml-build.h"
#include "qemu/bswap.h"
#include "qemu/bitops.h"
+#include "sysemu/numa.h"
static GArray *build_alloc_array(void)
{
@@ -1609,3 +1610,27 @@ void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
numamem->base_addr = cpu_to_le64(base);
numamem->range_length = cpu_to_le64(len);
}
+
+/*
+ * Build the System Locality Distance Information Table
+ * (ACPI spec 5.2.17, Revision 2.0 or later) from numa_info[].distance[].
+ */
+void build_slit(GArray *table_data, BIOSLinker *linker)
+{
+    int slit_start = table_data->len;
+    int from, to;
+
+    acpi_data_push(table_data, sizeof(AcpiTableHeader));
+
+    /* Node count as an 8-byte value, then one byte per (from, to) entry. */
+    build_append_int_noprefix(table_data, nb_numa_nodes, 8);
+    for (from = 0; from < nb_numa_nodes; from++) {
+        const uint8_t *row = numa_info[from].distance;
+
+        for (to = 0; to < nb_numa_nodes; to++) {
+            build_append_int_noprefix(table_data, row[to], 1);
+        }
+    }
+
+    build_header(linker, table_data,
+                 (void *)(table_data->data + slit_start),
+                 "SLIT",
+                 table_data->len - slit_start, 1, NULL, NULL);
+}
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2073108..12730ea 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2678,6 +2678,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
if (pcms->numa_nodes) {
acpi_add_table(table_offsets, tables_blob);
build_srat(tables_blob, tables->linker, machine);
+ acpi_add_table(table_offsets, tables_blob);
+ build_slit(tables_blob, tables->linker);
}
if (acpi_get_mcfg(&mcfg)) {
acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 00c21f1..329a0d0 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -389,4 +389,5 @@ GCC_FMT_ATTR(2, 3);
void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
uint64_t len, int node, MemoryAffinityFlags flags);
+void build_slit(GArray *table_data, BIOSLinker *linker);
#endif
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..2f7a941 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -21,6 +21,7 @@ typedef struct node_info {
struct HostMemoryBackend *node_memdev;
bool present;
QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+ uint8_t distance[MAX_NODES];
} NodeInfo;
extern NodeInfo numa_info[MAX_NODES];
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 576c7ce..6999545 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -169,6 +169,10 @@ extern int mem_prealloc;
#define MAX_NODES 128
#define NUMA_NODE_UNASSIGNED MAX_NODES
+/*
+ * NUMA distance values.  MIN is the distance from a node to itself, DEFAULT
+ * is used between distinct nodes when no '-numa dist' option is given, and
+ * UNREACHABLE marks a node that cannot be reached from another node.
+ * NOTE(review): MAX is presumably the largest value that still denotes a
+ * reachable node -- confirm against the ACPI SLIT definition.
+ */
+#define NUMA_DISTANCE_MIN 10
+#define NUMA_DISTANCE_DEFAULT 20
+#define NUMA_DISTANCE_MAX 254
+#define NUMA_DISTANCE_UNREACHABLE 255
#define MAX_OPTION_ROMS 16
typedef struct QEMUOptionRom {
diff --git a/numa.c b/numa.c
index 6fc2393..838e45a 100644
--- a/numa.c
+++ b/numa.c
@@ -52,6 +52,7 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
*/
int nb_numa_nodes;
NodeInfo numa_info[MAX_NODES];
+static bool have_numa_distance;
void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
{
@@ -212,6 +213,41 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
}
+/*
+ * Handle one '-numa dist,src=..,dst=..,val=..' option.
+ *
+ * @dist: parsed option holding the source node, destination node and value
+ * @opts: not used by this function
+ * @errp: set when the option is rejected
+ *
+ * On success the value is stored in numa_info[src].distance[dst] and
+ * have_numa_distance is set.  The opposite direction is not touched here.
+ */
+static void numa_distance_parse(NumaDistOptions *dist, QemuOpts *opts, Error **errp)
+{
+    uint16_t src = dist->src;
+    uint16_t dst = dist->dst;
+    uint8_t val = dist->val;
+
+    /*
+     * Range-check the node IDs before using them as indexes: numa_info[]
+     * only has MAX_NODES entries, while src and dst may be any uint16 value.
+     */
+    if (src >= MAX_NODES || dst >= MAX_NODES) {
+        error_setg(errp, "Max number of NUMA nodes reached: %"
+                   PRIu16 "", src > dst ? src : dst);
+        return;
+    }
+
+    if (!numa_info[src].present || !numa_info[dst].present) {
+        error_setg(errp, "Source/Destination NUMA node is missing. "
+                   "Please use '-numa node' option to declare it first.");
+        return;
+    }
+
+    /* NUMA_DISTANCE_MIN itself is accepted; only smaller values are not. */
+    if (val < NUMA_DISTANCE_MIN) {
+        error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
+                   "it shouldn't be less than %d.",
+                   val, NUMA_DISTANCE_MIN);
+        return;
+    }
+
+    if (src == dst && val != NUMA_DISTANCE_MIN) {
+        error_setg(errp, "Local distance of node %d should be %d.",
+                   src, NUMA_DISTANCE_MIN);
+        return;
+    }
+
+    numa_info[src].distance[dst] = val;
+    have_numa_distance = true;
+}
+
static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
{
NumaOptions *object = NULL;
@@ -235,6 +271,12 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
}
nb_numa_nodes++;
break;
+ case NUMA_OPTIONS_TYPE_DIST:
+ numa_distance_parse(&object->u.dist, opts, &err);
+ if (err) {
+ goto end;
+ }
+ break;
default:
abort();
}
@@ -294,6 +336,84 @@ static void validate_numa_cpus(void)
g_free(seen_cpus);
}
+/*
+ * Check the distances collected from '-numa dist' options and fill in the
+ * entries that were not given explicitly.
+ *
+ * If no '-numa dist' option was parsed, every present node pair gets
+ * NUMA_DISTANCE_MIN (same node) or NUMA_DISTANCE_DEFAULT (distinct nodes).
+ * Otherwise each pair of present nodes must have a distance in at least one
+ * direction; a missing direction is copied from the given one as long as no
+ * asymmetrical pair has been seen.  Reports an error and exits on an
+ * invalid configuration.
+ */
+static void validate_numa_distance(void)
+{
+ int src, dst, s, d;
+ /* Both flags are sticky: once set they stay set for all later pairs. */
+ bool is_asymmetrical = false;
+ bool opposite_miss = false;
+
+ /* have_numa_distance is only set by a successful '-numa dist' parse. */
+ if (!have_numa_distance) {
+ for (src = 0; src < nb_numa_nodes; src++) {
+ for (dst = 0; dst < nb_numa_nodes; dst++) {
+ if (numa_info[src].present && numa_info[dst].present) {
+ if (src == dst) {
+ numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
+ } else {
+ numa_info[src].distance[dst] = NUMA_DISTANCE_DEFAULT;
+ }
+ }
+ }
+ }
+
+ return;
+ }
+
+ /* dst starts at src, so each unordered pair is visited exactly once. */
+ for (src = 0; src < nb_numa_nodes; src++) {
+ for (dst = src; dst < nb_numa_nodes; dst++) {
+ s = src;
+ d = dst;
+
+ if (numa_info[s].present && numa_info[d].present) {
+ /*
+ * A stored value of 0 means that direction was never given:
+ * the parser rejects anything below NUMA_DISTANCE_MIN.
+ */
+ if (numa_info[s].distance[d] == 0 &&
+ numa_info[d].distance[s] == 0) {
+ if (s == d) {
+ /* Local distance not given: default it. */
+ numa_info[s].distance[d] = NUMA_DISTANCE_MIN;
+ continue;
+ } else {
+ error_report("The distance between node %d and %d is missing, "
+ "please provide all unique node pair's distance.",
+ s, d);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (s == d && numa_info[s].distance[d] != NUMA_DISTANCE_MIN) {
+ error_report("The local distance of node %d should be %d.",
+ s, NUMA_DISTANCE_MIN);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Only the dst -> src direction was given: swap the roles so that
+ * s -> d is always the direction that holds a value.
+ */
+ if (numa_info[s].distance[d] == 0) {
+ s = dst;
+ d = src;
+ }
+
+ if (numa_info[d].distance[s] == 0) {
+ opposite_miss = true;
+ }
+
+ if ((numa_info[d].distance[s] != 0) &&
+ (numa_info[s].distance[d] != numa_info[d].distance[s])) {
+ is_asymmetrical = true;
+ }
+
+ /*
+ * Once any asymmetrical pair has been seen, a pair with a missing
+ * direction (this one or an earlier one) is an error.
+ */
+ if (is_asymmetrical) {
+ if (opposite_miss) {
+ error_report("At least one asymmetrical pair of distance "
+ "is given, please provide all node pairs' "
+ "distance value for both directions.");
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ /* Mirror the given direction into the opposite one. */
+ numa_info[d].distance[s] = numa_info[s].distance[d];
+ }
+ }
+ }
+ }
+
+ return;
+}
+
void parse_numa_opts(MachineClass *mc)
{
int i;
@@ -390,6 +510,7 @@ void parse_numa_opts(MachineClass *mc)
}
validate_numa_cpus();
+ validate_numa_distance();
} else {
numa_set_mem_node_id(0, ram_size, 0);
}
diff --git a/qapi-schema.json b/qapi-schema.json
index 250e4dc..7552777 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5673,10 +5673,14 @@
##
# @NumaOptionsType:
#
+# @node: NUMA nodes configuration
+#
+# @dist: NUMA distance configuration (since 2.10)
+#
# Since: 2.1
##
{ 'enum': 'NumaOptionsType',
- 'data': [ 'node' ] }
+ 'data': [ 'node', 'dist' ] }
##
# @NumaOptions:
@@ -5689,7 +5693,8 @@
'base': { 'type': 'NumaOptionsType' },
'discriminator': 'type',
'data': {
- 'node': 'NumaNodeOptions' }}
+ 'node': 'NumaNodeOptions',
+ 'dist': 'NumaDistOptions' }}
##
# @NumaNodeOptions:
@@ -5718,6 +5723,27 @@
'*memdev': 'str' }}
##
+# @NumaDistOptions:
+#
+# Set the distance between 2 NUMA nodes.
+#
+# @src: source NUMA node.
+#
+# @dst: destination NUMA node.
+#
+# @val: NUMA distance from source node to destination node.
+# When a node is unreachable from another node, set the distance
+# to 255.
+#
+# Since: 2.10
+##
+{ 'struct': 'NumaDistOptions',
+ 'data': {
+ 'src': 'uint16',
+ 'dst': 'uint16',
+ 'val': 'uint8' }}
+
+##
# @HostMemPolicy:
#
# Host memory policy types
diff --git a/qemu-options.hx b/qemu-options.hx
index 99af8ed..2318d85 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -139,12 +139,15 @@ ETEXI
DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
- "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n", QEMU_ARCH_ALL)
+ "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
+ "-numa dist,src=source,dst=destination,val=distance\n", QEMU_ARCH_ALL)
STEXI
@item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
@itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
+@itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
@findex -numa
Define a NUMA node and assign RAM and VCPUs to it.
+Set the NUMA distance from a source node to a destination node.
@var{firstcpu} and @var{lastcpu} are CPU indexes. Each
@samp{cpus} option represent a contiguous range of CPU indexes
@@ -167,6 +170,18 @@ split equally between them.
@samp{mem} and @samp{memdev} are mutually exclusive. Furthermore,
if one node uses @samp{memdev}, all of them have to use it.
+@var{source} and @var{destination} are NUMA node IDs.
+@var{distance} is the NUMA distance from @var{source} to @var{destination}.
+The distance from a node to itself is always 10. If no distance values
+are given for node pairs, then the default distance of 20 is used for each
+pair. If any pair of nodes is given a distance, then all pairs must be
+given distances. When distances are given in only one direction
+for each pair of nodes, the distances in the opposite directions are
+assumed to be the same. If, however, an asymmetrical pair of distances is
+given for even one node pair, then all node pairs must be provided
+distance values for both directions, even when they are symmetrical. When
+a node is unreachable from another node, set the pair's distance to 255.
+
Note that the -@option{numa} option doesn't allocate any of the
specified resources, it just assigns existing resources to NUMA
nodes. This means that one still has to use the @option{-m},
--
2.7.4
next reply other threads:[~2017-04-06 2:19 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-04-06 2:18 He Chen [this message]
2017-04-06 14:56 ` [Qemu-devel] [PATCH v5] Allow setting NUMA distance for different NUMA nodes Andrew Jones
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1491445133-6534-1-git-send-email-he.chen@linux.intel.com \
--to=he.chen@linux.intel.com \
--cc=armbru@redhat.com \
--cc=drjones@redhat.com \
--cc=eblake@redhat.com \
--cc=ehabkost@redhat.com \
--cc=imammedo@redhat.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.