qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Eduardo Habkost <ehabkost@redhat.com>
To: Peter Maydell <peter.maydell@linaro.org>
Cc: qemu-devel@nongnu.org, Laurent Vivier <lvivier@redhat.com>
Subject: [Qemu-devel] [PULL 03/29] numa: equally distribute memory on nodes
Date: Thu, 11 May 2017 16:18:17 -0300	[thread overview]
Message-ID: <20170511191843.13784-4-ehabkost@redhat.com> (raw)
In-Reply-To: <20170511191843.13784-1-ehabkost@redhat.com>

From: Laurent Vivier <lvivier@redhat.com>

When there are more nodes than available memory to put the minimum
allowed memory by node, all the memory is put on the last node.

This is because we put (ram_size / nb_numa_nodes) &
~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this
case the value is 0. This is particularly true with pseries,
as the memory must be aligned to 256MB.

To avoid this problem, this patch uses an error diffusion algorithm [1]
to distribute equally the memory on nodes.

We introduce numa_auto_assign_ram() function in MachineClass
to keep compatibility between machine type versions.
The legacy function is used with pseries-2.9, pc-q35-2.9 and
pc-i440fx-2.9 (and previous), the new one with all others.

Example:

qemu-system-ppc64 -S -nographic  -nodefaults -monitor stdio -m 1G -smp 8 \
                  -numa node -numa node -numa node \
                  -numa node -numa node -numa node

Before:

(qemu) info numa
6 nodes
node 0 cpus: 0 6
node 0 size: 0 MB
node 1 cpus: 1 7
node 1 size: 0 MB
node 2 cpus: 2
node 2 size: 0 MB
node 3 cpus: 3
node 3 size: 0 MB
node 4 cpus: 4
node 4 size: 0 MB
node 5 cpus: 5
node 5 size: 1024 MB

After:
(qemu) info numa
6 nodes
node 0 cpus: 0 6
node 0 size: 0 MB
node 1 cpus: 1 7
node 1 size: 256 MB
node 2 cpus: 2
node 2 size: 0 MB
node 3 cpus: 3
node 3 size: 256 MB
node 4 cpus: 4
node 4 size: 256 MB
node 5 cpus: 5
node 5 size: 256 MB

[1] https://en.wikipedia.org/wiki/Error_diffusion

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20170502162955.1610-2-lvivier@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
[ehabkost: s/ram_size/size/ at numa_default_auto_assign_ram()]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 include/hw/boards.h     |  2 ++
 include/qemu/typedefs.h |  1 +
 include/sysemu/numa.h   |  9 +++++++--
 hw/core/machine.c       |  2 ++
 hw/i386/pc_piix.c       |  2 ++
 hw/i386/pc_q35.c        |  2 ++
 hw/ppc/spapr.c          |  1 +
 numa.c                  | 49 ++++++++++++++++++++++++++++++++++++++-----------
 8 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/include/hw/boards.h b/include/hw/boards.h
index 31d9c72fb0..99458eb859 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -136,6 +136,8 @@ struct MachineClass {
     int minimum_page_bits;
     bool has_hotpluggable_cpus;
     int numa_mem_align_shift;
+    void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes,
+                                 int nb_nodes, ram_addr_t size);
 
     HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
                                            DeviceState *dev);
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index f08d327aec..7d8505730c 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -97,5 +97,6 @@ typedef struct SSIBus SSIBus;
 typedef struct uWireSlave uWireSlave;
 typedef struct VirtIODevice VirtIODevice;
 typedef struct Visitor Visitor;
+typedef struct node_info NodeInfo;
 
 #endif /* QEMU_TYPEDEFS_H */
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 0ea1bc086e..70e56214e5 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -16,14 +16,14 @@ struct numa_addr_range {
     QLIST_ENTRY(numa_addr_range) entry;
 };
 
-typedef struct node_info {
+struct node_info {
     uint64_t node_mem;
     unsigned long *node_cpu;
     struct HostMemoryBackend *node_memdev;
     bool present;
     QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
     uint8_t distance[MAX_NODES];
-} NodeInfo;
+};
 
 extern NodeInfo numa_info[MAX_NODES];
 void parse_numa_opts(MachineClass *mc);
@@ -33,6 +33,11 @@ extern QemuOptsList qemu_numa_opts;
 void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
 void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
 uint32_t numa_get_node(ram_addr_t addr, Error **errp);
+void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
+                                 int nb_nodes, ram_addr_t size);
+void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
+                                  int nb_nodes, ram_addr_t size);
+
 
 /* on success returns node index in numa_info,
  * on failure returns nb_numa_nodes */
diff --git a/hw/core/machine.c b/hw/core/machine.c
index ada9eea483..2482c630c1 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -17,6 +17,7 @@
 #include "qapi/visitor.h"
 #include "hw/sysbus.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/numa.h"
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"
 
@@ -400,6 +401,7 @@ static void machine_class_init(ObjectClass *oc, void *data)
      * On Linux, each node's border has to be 8MB aligned
      */
     mc->numa_mem_align_shift = 23;
+    mc->numa_auto_assign_ram = numa_default_auto_assign_ram;
 
     object_class_property_add_str(oc, "accel",
         machine_get_accel, machine_set_accel, &error_abort);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 9f102aa388..d468b963fb 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -54,6 +54,7 @@
 #endif
 #include "migration/migration.h"
 #include "kvm_i386.h"
+#include "sysemu/numa.h"
 
 #define MAX_IDE_BUS 2
 
@@ -442,6 +443,7 @@ static void pc_i440fx_2_9_machine_options(MachineClass *m)
     pc_i440fx_machine_options(m);
     m->alias = "pc";
     m->is_default = 1;
+    m->numa_auto_assign_ram = numa_legacy_auto_assign_ram;
 }
 
 DEFINE_I440FX_MACHINE(v2_9, "pc-i440fx-2.9", NULL,
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index dd792a8547..66303a78cf 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -47,6 +47,7 @@
 #include "hw/usb.h"
 #include "qemu/error-report.h"
 #include "migration/migration.h"
+#include "sysemu/numa.h"
 
 /* ICH9 AHCI has 6 ports */
 #define MAX_SATA_PORTS     6
@@ -305,6 +306,7 @@ static void pc_q35_2_9_machine_options(MachineClass *m)
 {
     pc_q35_machine_options(m);
     m->alias = "q35";
+    m->numa_auto_assign_ram = numa_legacy_auto_assign_ram;
 }
 
 DEFINE_Q35_MACHINE(v2_9, "pc-q35-2.9", NULL,
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 80d12d005c..bdc31ce56c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3242,6 +3242,7 @@ static void spapr_machine_2_9_class_options(MachineClass *mc)
 {
     spapr_machine_2_10_class_options(mc);
     SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9);
+    mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram;
 }
 
 DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
diff --git a/numa.c b/numa.c
index 2b3fc69915..d753687dec 100644
--- a/numa.c
+++ b/numa.c
@@ -407,6 +407,42 @@ static void complete_init_numa_distance(void)
     }
 }
 
+void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
+                                 int nb_nodes, ram_addr_t size)
+{
+    int i;
+    uint64_t usedmem = 0;
+
+    /* Align each node according to the alignment
+     * requirements of the machine class
+     */
+
+    for (i = 0; i < nb_nodes - 1; i++) {
+        nodes[i].node_mem = (size / nb_nodes) &
+                            ~((1 << mc->numa_mem_align_shift) - 1);
+        usedmem += nodes[i].node_mem;
+    }
+    nodes[i].node_mem = size - usedmem;
+}
+
+void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
+                                  int nb_nodes, ram_addr_t size)
+{
+    int i;
+    uint64_t usedmem = 0, node_mem;
+    uint64_t granularity = size / nb_nodes;
+    uint64_t propagate = 0;
+
+    for (i = 0; i < nb_nodes - 1; i++) {
+        node_mem = (granularity + propagate) &
+                   ~((1 << mc->numa_mem_align_shift) - 1);
+        propagate = granularity + propagate - node_mem;
+        nodes[i].node_mem = node_mem;
+        usedmem += node_mem;
+    }
+    nodes[i].node_mem = size - usedmem;
+}
+
 void parse_numa_opts(MachineClass *mc)
 {
     int i;
@@ -449,17 +485,8 @@ void parse_numa_opts(MachineClass *mc)
             }
         }
         if (i == nb_numa_nodes) {
-            uint64_t usedmem = 0;
-
-            /* Align each node according to the alignment
-             * requirements of the machine class
-             */
-            for (i = 0; i < nb_numa_nodes - 1; i++) {
-                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
-                                        ~((1 << mc->numa_mem_align_shift) - 1);
-                usedmem += numa_info[i].node_mem;
-            }
-            numa_info[i].node_mem = ram_size - usedmem;
+            assert(mc->numa_auto_assign_ram);
+            mc->numa_auto_assign_ram(mc, numa_info, nb_numa_nodes, ram_size);
         }
 
         numa_total = 0;
-- 
2.11.0.259.g40922b1

  parent reply	other threads:[~2017-05-11 19:18 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-11 19:18 [Qemu-devel] [PULL 00/29] x86 and machine queue, 2017-05-11 Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 01/29] i386: rewrite way CPUID index is validated Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 02/29] numa: Allow setting NUMA distance for different NUMA nodes Eduardo Habkost
2017-05-30 10:45   ` Peter Maydell
2017-05-30 14:01     ` Eduardo Habkost
2017-05-30 15:28       ` Eric Blake
2017-05-30 18:10         ` Eduardo Habkost
2017-05-30 18:21           ` Eric Blake
2017-05-30 17:08       ` Peter Maydell
2017-05-30 17:12         ` Daniel P. Berrange
2017-05-11 19:18 ` Eduardo Habkost [this message]
2017-05-11 19:18 ` [Qemu-devel] [PULL 04/29] tests: acpi: extend cphp and memhp testcase with numa distance check Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 05/29] tests: add CPUs to numa node mapping test Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 06/29] hw/arm/virt: extract mp-affinity calculation in separate function Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 07/29] hw/arm/virt: use machine->possible_cpus for storing possible topology info Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 08/29] hw/arm/virt: explicitly allocate cpu_index for cpus Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 09/29] numa: move source of default CPUs to NUMA node mapping into boards Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 10/29] spapr: add node-id property to sPAPR core Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 11/29] pc: add node-id property to CPU Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 12/29] virt-arm: " Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 13/29] numa: add check that board supports cpu_index to node mapping Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 14/29] numa: mirror cpu to node mapping in MachineState::possible_cpus Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 15/29] numa: do default mapping based on possible_cpus instead of node_cpu bitmaps Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 16/29] pc: get numa node mapping from possible_cpus instead of numa_get_node_for_cpu() Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 17/29] spapr: " Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 18/29] virt-arm: " Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 19/29] QMP: include CpuInstanceProperties into query_cpus output output Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 20/29] tests: numa: add case for QMP command query-cpus Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 21/29] numa: remove no longer need numa_post_machine_init() Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 22/29] machine: call machine init from wrapper Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 23/29] numa: use possible_cpus for not mapped CPUs check Eduardo Habkost
2017-05-17  8:07   ` Markus Armbruster
2017-05-17  9:09     ` Igor Mammedov
2017-05-11 19:18 ` [Qemu-devel] [PULL 24/29] numa: remove node_cpu bitmaps as they are no longer used Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 25/29] numa: add '-numa cpu, ...' option for property based node mapping Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 26/29] tests: check -numa node, cpu=props_list usecase Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 27/29] migration/i386: Remove old non-softfloat 64bit FP support Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 28/29] vmstatification: i386 FPReg Eduardo Habkost
2017-05-11 19:18 ` [Qemu-devel] [PULL 29/29] migration/i386: Remove support for pre-0.12 formats Eduardo Habkost
2017-05-15 13:15 ` [Qemu-devel] [PULL 00/29] x86 and machine queue, 2017-05-11 Stefan Hajnoczi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170511191843.13784-4-ehabkost@redhat.com \
    --to=ehabkost@redhat.com \
    --cc=lvivier@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).