[Qemu-devel] [PATCH V17 09/11] NUMA: set guest numa nodes memory policy

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
To: qemu-devel@nongnu.org
Cc: drjones@redhat.com, ehabkost@redhat.com, lersek@redhat.com,
	hutao@cn.fujitsu.com, mtosatti@redhat.com,
	peter.huangpeng@huawei.com, lcapitulino@redhat.com,
	bsd@redhat.com, anthony@codemonkey.ws, y-goto@jp.fujitsu.com,
	pbonzini@redhat.com, afaerber@suse.de, gaowanlong@cn.fujitsu.com
Subject: [Qemu-devel] [PATCH V17 09/11] NUMA: set guest numa nodes memory policy
Date: Wed, 4 Dec 2013 15:58:57 +0800	[thread overview]
Message-ID: <1386143939-19142-10-git-send-email-gaowanlong@cn.fujitsu.com> (raw)
In-Reply-To: <1386143939-19142-1-git-send-email-gaowanlong@cn.fujitsu.com>

Set the guest NUMA nodes' memory policies node by node, using the
mbind(2) system call.
After this patch, we are able to set the guest nodes' memory policies
through QEMU options. This aims to solve the performance issue of
guest memory accesses that cross NUMA nodes.
And as you all know, if PCI passthrough is used, the direct-attached
device uses DMA transfers between the device and the QEMU process.
All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    =>kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a direct-attached device, the page count of every guest page is
raised by one and page migration will not work. AutoNUMA won't work either.

So, we should set the guest nodes memory allocation policies before
the pages are really mapped.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 hw/i386/pc.c          |  9 +++++
 include/exec/memory.h | 15 ++++++++
 numa.c                | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 74c1f16..07553f2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1178,6 +1178,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
                              0, below_4g_mem_size);
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
+    if (memory_region_set_mem_policy(ram_below_4g, 0, below_4g_mem_size, 0)) {
+        fprintf(stderr, "qemu: set below 4g memory policy failed\n");
+        exit(1);
+    }
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
@@ -1185,6 +1189,11 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
                                  below_4g_mem_size, above_4g_mem_size);
         memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+        if (memory_region_set_mem_policy(ram_above_4g, 0, above_4g_mem_size,
+                                     below_4g_mem_size)) {
+            fprintf(stderr, "qemu: set above 4g memory policy failed\n");
+            exit(1);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 480dfbf..33de50a 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -905,6 +905,21 @@ void memory_region_transaction_begin(void);
 void memory_region_transaction_commit(void);
 
 /**
+ * memory_region_set_mem_policy: Set memory policy
+ *
+ * Set the memory policy for an area.  Returns 0 on success, -1 on failure.
+ *
+ * @mr: the MemoryRegion whose RAM the policy is applied to
+ * @start: offset of the area from the start of @mr's RAM
+ * @length: length of the area
+ * @offset: offset of the area in guest RAM, counted across the NUMA nodes
+ */
+int memory_region_set_mem_policy(MemoryRegion *mr,
+                                 ram_addr_t start,
+                                 ram_addr_t length,
+                                 ram_addr_t offset);
+
+/**
  * memory_listener_register: register callbacks to be called when memory
  *                           sections are mapped or unmapped into an address
  *                           space
diff --git a/numa.c b/numa.c
index da4dbbd..43bba42 100644
--- a/numa.c
+++ b/numa.c
@@ -27,6 +27,16 @@
 #include "qapi-visit.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/dealloc-visitor.h"
+#include "exec/memory.h"
+
+#ifdef __linux__
+#include <sys/syscall.h>
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_STATIC_NODES   (1 << 15)
+#endif
+#endif
+
 QemuOptsList qemu_numa_opts = {
     .name = "numa",
     .implied_opt_name = "type",
@@ -228,6 +238,95 @@ void set_numa_nodes(void)
     }
 }
 
+#ifdef __linux__
+/* Translate guest node @nodeid's configured policy into an mbind(2) mode:
+ * the policy value itself, combined with MPOL_F_RELATIVE_NODES or
+ * MPOL_F_STATIC_NODES.  Unknown policy values map to the plain default.
+ */
+static int node_parse_bind_mode(unsigned int nodeid)
+{
+    int policy = numa_info[nodeid].policy;
+
+    switch (policy) {
+    case NUMA_NODE_POLICY_DEFAULT:
+    case NUMA_NODE_POLICY_PREFERRED:
+    case NUMA_NODE_POLICY_MEMBIND:
+    case NUMA_NODE_POLICY_INTERLEAVE:
+        if (numa_info[nodeid].relative) {
+            return policy | MPOL_F_RELATIVE_NODES;
+        }
+        return policy | MPOL_F_STATIC_NODES;
+    default:
+        return NUMA_NODE_POLICY_DEFAULT;
+    }
+}
+
+/* Apply guest node @nodeid's memory policy to @length bytes at @ram_ptr
+ * using mbind(2).  Returns 0 on success, -1 on failure.
+ */
+static int node_set_mem_policy(void *ram_ptr, ram_addr_t length, int nodeid)
+{
+    unsigned long *nodes = numa_info[nodeid].host_mem;
+    unsigned long lastbit = find_last_bit(nodes, MAX_NODES);
+    /* mbind cuts off the last node covered by maxnode, hence "+ 2".  An
+     * empty mask (find_last_bit() == MAX_NODES) is passed as NULL/0 so
+     * the kernel is never told to read more than the MAX_NODES bits. */
+    unsigned long maxnode = lastbit < MAX_NODES ? lastbit + 2 : 0;
+
+    if (syscall(SYS_mbind, ram_ptr, length, node_parse_bind_mode(nodeid),
+                maxnode ? nodes : NULL, maxnode, 0)) {
+        perror("mbind");
+        return -1;
+    }
+    return 0;
+}
+#endif
+
+int memory_region_set_mem_policy(MemoryRegion *mr,
+                                 ram_addr_t start, ram_addr_t length,
+                                 ram_addr_t offset)
+{
+#ifdef __linux__
+    ram_addr_t len = 0; /* end of guest node i in the NUMA layout */
+    uint8_t *ptr;
+    int i;
+
+    /* Find the guest node that contains @offset. */
+    for (i = 0; i < nb_numa_nodes; i++) {
+        len += numa_info[i].node_mem;
+        if (offset < len) {
+            break;
+        }
+    }
+    if (i == nb_numa_nodes) {
+        return -1;
+    }
+
+    /* Bind the area node by node, splitting it at each node boundary. */
+    ptr = memory_region_get_ram_ptr(mr);
+    for (; i < nb_numa_nodes; i++) {
+        ram_addr_t chunk = len - offset < length ? len - offset : length;
+
+        if (node_set_mem_policy(ptr + start, chunk, i)) {
+            return -1;
+        }
+        start += chunk;
+        offset += chunk;
+        length -= chunk;
+        if (length == 0) {
+            return 0;
+        }
+        /* The boundary moves by the size of the *next* node, not this one. */
+        len += i + 1 < nb_numa_nodes ? numa_info[i + 1].node_mem : 0;
+    }
+
+    /* The area extends past the memory of the last guest node. */
+    return -1;
+#else
+    return 0;
+#endif
+}
+
 void set_numa_modes(void)
 {
     CPUState *cpu;
-- 
1.8.5

next prev parent reply	other threads:[~2013-12-04  8:01 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-04  7:58 [Qemu-devel] [PATCH V17 00/11] Add support for binding guest numa nodes to host numa nodes Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 01/11] NUMA: move numa related code to new file numa.c Wanlong Gao
2013-12-10 13:06   ` Eduardo Habkost
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 02/11] NUMA: check if the total numa memory size is equal to ram_size Wanlong Gao
2013-12-10 13:15   ` Eduardo Habkost
2013-12-10 18:03     ` Paolo Bonzini
2013-12-10 19:01       ` Eduardo Habkost
2013-12-11 12:26         ` Daniel P. Berrange
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 03/11] NUMA: Add numa_info structure to contain numa nodes info Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 04/11] NUMA: convert -numa option to use OptsVisitor Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 05/11] NUMA: introduce NumaMemOptions Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 06/11] NUMA: add "-numa mem," options Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 07/11] NUMA: expand MAX_NODES from 64 to 128 Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 08/11] NUMA: parse guest numa nodes memory policy Wanlong Gao
2013-12-04  7:58 ` Wanlong Gao [this message]
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 10/11] NUMA: add qmp command query-numa Wanlong Gao
2013-12-04  7:58 ` [Qemu-devel] [PATCH V17 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa Wanlong Gao
2013-12-06  9:06 ` [Qemu-devel] [PATCH V17 00/11] Add support for binding guest numa nodes to host numa nodes Paolo Bonzini
2013-12-06  9:31   ` Wanlong Gao
2013-12-06  9:48     ` Paolo Bonzini
2013-12-09 18:16       ` Eduardo Habkost
2013-12-09 18:26         ` Paolo Bonzini
2013-12-06  9:06 ` Paolo Bonzini
2013-12-06 18:49   ` Marcelo Tosatti
2013-12-09 17:33     ` Paolo Bonzini
2013-12-09 18:10       ` Marcelo Tosatti
2013-12-09 18:26         ` Paolo Bonzini

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:74c1f16 dfblob:07553f2 dfblob:480dfbf dfblob:33de50a
dfblob:da4dbbd dfblob:43bba42 )
 OR (
bs:"[Qemu-devel] [PATCH V17 09/11] NUMA: set guest numa nodes memory policy" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1386143939-19142-10-git-send-email-gaowanlong@cn.fujitsu.com \
    --to=gaowanlong@cn.fujitsu.com \
    --cc=afaerber@suse.de \
    --cc=anthony@codemonkey.ws \
    --cc=bsd@redhat.com \
    --cc=drjones@redhat.com \
    --cc=ehabkost@redhat.com \
    --cc=hutao@cn.fujitsu.com \
    --cc=lcapitulino@redhat.com \
    --cc=lersek@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=peter.huangpeng@huawei.com \
    --cc=qemu-devel@nongnu.org \
    --cc=y-goto@jp.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).