All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andre Przywara <andre.przywara@amd.com>
To: Keir Fraser <keir.fraser@eu.citrix.com>, xen-devel@lists.xensource.com
Subject: [PATCH 1/2]: hvm: NUMA guest: allocate memory and pin cpus according to guestnodes number (resend)
Date: Fri, 11 Jul 2008 16:11:54 +0200	[thread overview]
Message-ID: <48776A2A.8020407@amd.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 882 bytes --]

This patch introduces a new config file option called guestnodes.
Depending on the specified number, a set of suitable host nodes (those
that have enough memory and are the least used) is selected, and memory
allocation is split evenly across these host nodes. CPU affinity is set
accordingly. A value of 0 (the default) keeps the current behavior.

Reworked to apply against staging 18036.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>

-- 
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: 03_numa_guest_18036.patch --]
[-- Type: text/plain, Size: 19765 bytes --]

diff -r f40c310dca31 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Fri Jul 11 15:53:59 2008 +0200
@@ -18,6 +18,8 @@
 #include "xc_e820.h"
 
 #include <xen/libelf.h>
+
+#include <asm/bitops.h>
 
 #define SUPERPAGE_PFN_SHIFT  9
 #define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
@@ -155,8 +157,173 @@
     return rc;
 }
 
+static int hweight_long (unsigned long value)
+{
+int ret=0;
+
+    while (value>0)
+    {
+        if (value&1) ++ret;
+        value>>=1;
+    }
+    return ret;
+}
+
+static int get_nodemasks (int xc_handle, uint64_t **nodemasks)
+{
+#define MAX_CPU_ID 255
+    xc_physinfo_t physinfo;
+    xc_cpu_to_node_t *cpumap;
+    int nrcpus, i;
+
+    cpumap=(xc_cpu_to_node_t *)malloc(sizeof(xc_cpu_to_node_t)*MAX_CPU_ID);
+    set_xen_guest_handle(physinfo.cpu_to_node, cpumap);
+
+    xc_physinfo (xc_handle,&physinfo);
+    nrcpus = physinfo.threads_per_core * physinfo.cores_per_socket *
+             physinfo.nr_nodes;
+
+    *nodemasks=malloc(sizeof(uint64_t)*physinfo.nr_nodes);
+    memset (*nodemasks,0,sizeof(uint64_t)*physinfo.nr_nodes);
+    for ( i = 0; i < nrcpus; i++ )
+    {
+        (*nodemasks)[cpumap[i]] |= 1 << i;
+    }
+    return nrcpus;
+}
+
+/* Distribute the VCPUs across the given NUMA nodes.
+ * Use xc_vcpu_setaffinity to pin each VCPU to the physical CPUs of its node.
+ */
+static int setup_numa_affinity (int xc_handle, uint32_t dom,
+                                unsigned long nodemask)
+{
+    uint64_t *nodemasks, usemask;
+
+    int nrcpus, i;
+    xc_dominfo_t dominfo;
+    int nrnodes,curnode,vcpusleft;
+
+    nrnodes = hweight_long (nodemask);
+
+    nrcpus = get_nodemasks (xc_handle, &nodemasks);
+
+    if (xc_domain_getinfo (xc_handle, dom, 1, &dominfo) != 1)
+    {
+        ERROR("Unable to get platform info.");
+        return -1;
+    }
+    curnode = -1;
+    vcpusleft = 0;
+    for ( i = 0; i <= dominfo.max_vcpu_id; i++ )
+    {
+        if ( vcpusleft == 0 )
+        {
+            vcpusleft = ( dominfo.max_vcpu_id + 1 ) / nrnodes;
+            if ( ++curnode < ( ( dominfo.max_vcpu_id + 1 ) % nrnodes ) )
+                vcpusleft++;
+            usemask = nodemasks[__ffs(nodemask)];
+            nodemask &= ~(1ULL<<__ffs(nodemask));
+        }
+        xc_vcpu_setaffinity (xc_handle, dom, i, usemask);
+        vcpusleft--;
+    }
+
+    return 0;
+}
+
+static int populate_on_node ( int xc_handle, uint32_t dom,
+                              unsigned long *cur_pages,
+                              unsigned long nr_pages,
+                              int memflags, xen_pfn_t* page_array)
+{
+int rc=0;
+unsigned long i;
+
+    while ( (rc == 0) && (nr_pages > 0 ) )
+    {
+        /* Clip count to maximum 8MB extent. */
+        unsigned long count = nr_pages;
+        if ( count > 2048 )
+            count = 2048;
+
+        /* Clip partial superpage extents to superpage boundaries. */
+        if ( ((*cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+             (count > (-*cur_pages & (SUPERPAGE_NR_PFNS-1))) )
+            count = -*cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
+        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
+                  (count > SUPERPAGE_NR_PFNS) )
+            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
+
+        /* Attempt to allocate superpage extents. */
+        if ( ((count | *cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
+        {
+            long done;
+            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
+            struct xen_memory_reservation sp_req = {
+                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
+                .extent_order = SUPERPAGE_PFN_SHIFT,
+                .mem_flags    = memflags,
+                .domid        = dom
+            };
+            set_xen_guest_handle(sp_req.extent_start, sp_extents);
+            for ( i = 0; i < sp_req.nr_extents; i++ )
+                sp_extents[i] = page_array[*cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
+
+            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
+            if ( done > 0 )
+            {
+                done <<= SUPERPAGE_PFN_SHIFT;
+                *cur_pages += done;
+                count -= done;
+                nr_pages -= done;
+            }
+        }
+
+        /* Fall back to 4kB extents. */
+        if ( count != 0 )
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, memflags,
+                &page_array[*cur_pages]);
+            *cur_pages += count;
+            nr_pages -= count;
+        }
+    }
+    return rc;
+}
+
+static int setup_numa_mem ( int xc_handle, uint32_t dom,
+                            unsigned long *cur_pages, unsigned long nr_pages,
+                            unsigned nodemask, xen_pfn_t *page_array)
+{
+    int i, rc;
+    unsigned long cur_node_pages;
+    unsigned long pages_per_node;
+    int numanodes;
+
+    numanodes = hweight_long (nodemask);
+
+    pages_per_node = ((nr_pages+0xFF)&(~0xFFUL))/numanodes;
+
+    for ( i = 0 ; i < numanodes ; i++ )
+    {
+        if ( i == numanodes - 1 )
+            cur_node_pages = nr_pages - i * pages_per_node;
+        else cur_node_pages = pages_per_node;
+        if ( i == 0 ) cur_node_pages -= *cur_pages;
+
+        rc = populate_on_node (xc_handle, dom, cur_pages, cur_node_pages,
+                               XENMEMF_node (__ffs(nodemask)), page_array);
+        if ( rc != 0 ) return rc;
+
+        nodemask &= ~(1<<__ffs(nodemask));
+    }
+    return 0;
+}
+
 static int setup_guest(int xc_handle,
-                       uint32_t dom, int memsize,
+                       uint32_t dom, int memsize, unsigned long nodemask,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
@@ -169,6 +336,7 @@
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
+    unsigned int memflags;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -217,59 +385,30 @@
      * We allocate pages in batches of no more than 8MB to ensure that
      * we can be preempted and hence dom0 remains responsive.
      */
+
+    if ( nodemask == 0 ) memflags = 0;
+        else memflags = XENMEMF_node (__ffs (nodemask));
+
     rc = xc_domain_memory_populate_physmap(
-        xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
+        xc_handle, dom, 0xa0, 0, memflags, &page_array[0x00]);
     cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        /* Clip count to maximum 8MB extent. */
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
 
-        /* Clip partial superpage extents to superpage boundaries. */
-        if ( ((cur_pages & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-             (count > (-cur_pages & (SUPERPAGE_NR_PFNS-1))) )
-            count = -cur_pages & (SUPERPAGE_NR_PFNS-1); /* clip s.p. tail */
-        else if ( ((count & (SUPERPAGE_NR_PFNS-1)) != 0) &&
-                  (count > SUPERPAGE_NR_PFNS) )
-            count &= ~(SUPERPAGE_NR_PFNS - 1); /* clip non-s.p. tail */
-
-        /* Attempt to allocate superpage extents. */
-        if ( ((count | cur_pages) & (SUPERPAGE_NR_PFNS - 1)) == 0 )
-        {
-            long done;
-            xen_pfn_t sp_extents[2048 >> SUPERPAGE_PFN_SHIFT];
-            struct xen_memory_reservation sp_req = {
-                .nr_extents   = count >> SUPERPAGE_PFN_SHIFT,
-                .extent_order = SUPERPAGE_PFN_SHIFT,
-                .domid        = dom
-            };
-            set_xen_guest_handle(sp_req.extent_start, sp_extents);
-            for ( i = 0; i < sp_req.nr_extents; i++ )
-                sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
-            done = xc_memory_op(xc_handle, XENMEM_populate_physmap, &sp_req);
-            if ( done > 0 )
-            {
-                done <<= SUPERPAGE_PFN_SHIFT;
-                cur_pages += done;
-                count -= done;
-            }
-        }
-
-        /* Fall back to 4kB extents. */
-        if ( count != 0 )
-        {
-            rc = xc_domain_memory_populate_physmap(
-                xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
-            cur_pages += count;
-        }
-    }
+    if ( hweight_long (nodemask) > 1 )
+        rc = setup_numa_mem (xc_handle, dom, &cur_pages, nr_pages,
+                             nodemask, page_array);
+    else
+        rc = populate_on_node (xc_handle, dom, &cur_pages, nr_pages - cur_pages,
+                               memflags, page_array);
 
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
         goto error_out;
+    }
+
+    if ( hweight_long (nodemask) > 1 )
+    {
+        setup_numa_affinity (xc_handle, dom, nodemask);
     }
 
     if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 )
@@ -364,6 +503,7 @@
 static int xc_hvm_build_internal(int xc_handle,
                                  uint32_t domid,
                                  int memsize,
+                                 unsigned long nodemask,
                                  char *image,
                                  unsigned long image_size)
 {
@@ -373,7 +513,7 @@
         return -1;
     }
 
-    return setup_guest(xc_handle, domid, memsize, image, image_size);
+    return setup_guest(xc_handle, domid, memsize, nodemask, image, image_size);
 }
 
 static inline int is_loadable_phdr(Elf32_Phdr *phdr)
@@ -388,6 +528,7 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name)
 {
     char *image;
@@ -398,7 +539,8 @@
          ((image = xc_read_image(image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size);
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
+                                image, image_size);
 
     free(image);
 
@@ -411,6 +553,7 @@
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size)
 {
@@ -433,7 +576,7 @@
         return -1;
     }
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize,
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, nodemask,
                                 img, img_len);
 
     /* xc_inflate_buffer may return the original buffer pointer (for
diff -r f40c310dca31 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xenguest.h	Fri Jul 11 15:53:59 2008 +0200
@@ -128,11 +128,13 @@
 int xc_hvm_build(int xc_handle,
                  uint32_t domid,
                  int memsize,
+                 unsigned long nodemask,
                  const char *image_name);
 
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_buffer,
                      unsigned long image_size);
 
diff -r f40c310dca31 tools/libxc/xg_private.c
--- a/tools/libxc/xg_private.c	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/libxc/xg_private.c	Fri Jul 11 15:53:59 2008 +0200
@@ -177,6 +177,7 @@
     int xc_hvm_build(int xc_handle,
                      uint32_t domid,
                      int memsize,
+                     unsigned long nodemask,
                      const char *image_name)
 {
     errno = ENOSYS;
diff -r f40c310dca31 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c	Fri Jul 11 15:53:59 2008 +0200
@@ -873,16 +873,17 @@
 #endif
     char *image;
     int memsize, vcpus = 1, acpi = 0, apic = 1;
+    unsigned long nodemask;
 
     static char *kwd_list[] = { "domid",
                                 "memsize", "image", "vcpus", "acpi",
-                                "apic", NULL };
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iii", kwd_list,
-                                      &dom, &memsize,
-                                      &image, &vcpus, &acpi, &apic) )
+                                "apic", "nodemask", NULL };
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iiil", kwd_list,
+                                      &dom, &memsize, &image,
+                                      &vcpus, &acpi, &apic, &nodemask) )
         return NULL;
 
-    if ( xc_hvm_build(self->xc_handle, dom, memsize, image) != 0 )
+    if ( xc_hvm_build(self->xc_handle, dom, memsize, nodemask, image) != 0 )
         return pyxc_error_to_exception();
 
 #if !defined(__ia64__)
diff -r f40c310dca31 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/XendConfig.py	Fri Jul 11 15:53:59 2008 +0200
@@ -162,6 +162,7 @@
     'vhpt': int,
     'guest_os_type': str,
     'hap': int,
+    'guestnodes': int,
 }
 
 # Xen API console 'other_config' keys.
@@ -375,6 +376,7 @@
             'other_config': {},
             'platform': {},
             'target': 0,
+            'guestnodes': 0,
         }
         
         return defaults
@@ -570,7 +572,10 @@
             cfg["memory"] = int(sxp.child_value(sxp_cfg, "memory"))
         if sxp.child_value(sxp_cfg, "maxmem") != None:
             cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem"))
-            
+
+        if sxp.child_value(sxp_cfg, "guestnodes") != None:
+            cfg["guestnodes"] = int(sxp.child_value(sxp_cfg, "guestnodes"))
+
         # Convert scheduling parameters to vcpus_params
         if 'vcpus_params' not in cfg:
             cfg['vcpus_params'] = {}
diff -r f40c310dca31 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py	Fri Jul 11 15:53:59 2008 +0200
@@ -2150,7 +2150,7 @@
                     if self.info['cpus'][v]:
                         xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
             else:
-                def find_relaxed_node(node_list):
+                def find_relaxed_node(node_list, numnodes):
                     import sys
                     nr_nodes = info['nr_nodes']
                     if node_list is None:
@@ -2175,21 +2175,36 @@
                             nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
                         else:
                             nodeload[i] = sys.maxint
-                    index = nodeload.index( min(nodeload) )    
-                    return index
+
+                    if numnodes == 0:
+                        return nodeload.index( min(nodeload) )
+                    else:
+                        nodemask = 0
+                        for i in range (0,numnodes):
+                            index = min((n, i) for i, n in enumerate(nodeload))[1]
+                            nodemask = nodemask | (1 << index)
+                            nodeload[index] = sys.maxint
+                        return nodemask
 
                 info = xc.physinfo()
+                nodemask = 0
                 if info['nr_nodes'] > 1:
                     node_memory_list = info['node_to_memory']
                     needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+                    if self.image.guestnodes > 1:
+                        needmem = needmem / self.image.guestnodes
                     candidate_node_list = []
                     for i in range(0, info['nr_nodes']):
                         if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
                             candidate_node_list.append(i)
-                    index = find_relaxed_node(candidate_node_list)
-                    cpumask = info['node_to_cpu'][index]
-                    for v in range(0, self.info['VCPUs_max']):
-                        xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    nodemask = find_relaxed_node(candidate_node_list, 
+                                                 self.image.guestnodes)
+                    if self.image.guestnodes < 1:
+                        cpumask = info['node_to_cpu'][nodemask]
+                        for v in range(0, self.info['VCPUs_max']):
+                            xc.vcpu_setaffinity(self.domid, v, cpumask)
+                    else:
+                        self.image.nodemask = nodemask
 
             # Use architecture- and image-specific calculations to determine
             # the various headrooms necessary, given the raw configured
diff -r f40c310dca31 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xend/image.py	Fri Jul 11 15:53:59 2008 +0200
@@ -127,6 +127,9 @@
             self.cpuid = vmConfig['cpuid'];
         if 'cpuid_check' in vmConfig:
             self.cpuid_check = vmConfig['cpuid_check']
+
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes',0))
+        self.nodemask   = 0
 
     def cleanupBootloading(self):
         if self.bootloader:
@@ -696,6 +699,7 @@
         self.apic = int(vmConfig['platform'].get('apic', 0))
         self.acpi = int(vmConfig['platform'].get('acpi', 0))
         self.guest_os_type = vmConfig['platform'].get('guest_os_type')
+        self.guestnodes = int(vmConfig['platform'].get('guestnodes', 0))
            
 
     # Return a list of cmd line args to the device models based on the
@@ -797,13 +801,16 @@
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
         log.debug("acpi           = %d", self.acpi)
         log.debug("apic           = %d", self.apic)
+        log.debug("guestnodes     = %d", self.guestnodes)
+        log.debug("nodemask       = %d", self.nodemask)
 
         rc = xc.hvm_build(domid          = self.vm.getDomid(),
                           image          = self.loader,
                           memsize        = mem_mb,
                           vcpus          = self.vm.getVCpuCount(),
                           acpi           = self.acpi,
-                          apic           = self.apic)
+                          apic           = self.apic,
+                          nodemask       = self.nodemask)
         rc['notes'] = { 'SUSPEND_CANCEL': 1 }
 
         rc['store_mfn'] = xc.hvm_get_param(self.vm.getDomid(),
diff -r f40c310dca31 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py	Fri Jul 11 12:51:26 2008 +0100
+++ b/tools/python/xen/xm/create.py	Fri Jul 11 15:53:59 2008 +0200
@@ -567,6 +567,10 @@
           fn=set_int, default=None,
           use="""Maximum machine address size""")
 
+gopts.var('guestnodes', val="GUESTNODES",
+          fn=set_int, default=0,
+          use="""Number of NUMA nodes to appear in the guest.""")
+
 def err(msg):
     """Print an error to stderr and exit.
     """
@@ -845,7 +849,8 @@
              'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten',
              'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor',
              'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet',
-             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check']
+             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check',
+             'guestnodes' ]
 
     for a in args:
         if a in vals.__dict__ and vals.__dict__[a] is not None:

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

                 reply	other threads:[~2008-07-11 14:11 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48776A2A.8020407@amd.com \
    --to=andre.przywara@amd.com \
    --cc=keir.fraser@eu.citrix.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.