* [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
@ 2010-04-04 19:30 Dulloor
  2010-04-05 14:16 ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 3+ messages in thread
From: Dulloor @ 2010-04-04 19:30 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 264 bytes --]

This patch uses the virtual (PV) NUMA enlightenment to set up the
nodes (and the NUMA layout) when the guest is compiled with support
for it. The method is the same one used to fake nodes on a non-NUMA
system (NUMA emulation).
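
As an illustration of the method, here is a minimal standalone sketch
(not part of the patch; the two page counts are made-up values) of the
arithmetic xen_numa_emulation() applies below: each vnode's page count
is rounded down to the 64MB granule used by NUMA emulation, and the
last vnode absorbs the remainder up to the end of memory:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define FAKE_NODE_MIN_SIZE (64ULL << 20)	/* 64MB granule */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ULL))

int main(void)
{
	/* hypothetical 2-vnode layout (xen_vnode_data.nr_pages) */
	uint64_t nr_pages[2] = { 0x40000, 0x3f8b3 };
	uint64_t max_addr = (0x40000ULL + 0x3f8b3ULL) << PAGE_SHIFT;
	uint64_t start = 0, end;
	int i;

	for (i = 0; i < 2; i++) {
		uint64_t size = (nr_pages[i] << PAGE_SHIFT)
				& FAKE_NODE_MIN_HASH_MASK;
		end = (i == 1) ? max_addr : start + size;
		printf("vnode[%d]: %#llx - %#llx\n", i,
		       (unsigned long long)start, (unsigned long long)end);
		start = end;
	}
	return 0;
}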

-dulloor

Signed-off-by: Dulloor Rao <dulloor@gatech.edu>

[-- Attachment #2: linux-02-pv-numa-enlightenment.patch --]
[-- Type: text/x-patch, Size: 15300 bytes --]

diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9539998..c47b9fa 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -32,6 +32,7 @@
 	} while (0)
 #elif defined(__x86_64__)
 #define set_xen_guest_handle(hnd, val)	do { (hnd).p = val; } while (0)
+#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 #endif
 #else
 #if defined(__i386__)
@@ -43,6 +44,7 @@
 	} while (0)
 #elif defined(__x86_64__)
 #define set_xen_guest_handle(hnd, val)	do { (hnd) = val; } while (0)
+#define get_xen_guest_handle(val, hnd)  do { val = (hnd); } while (0)
 #endif
 #endif
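
The accessor mirrors set_xen_guest_handle(); note that in the
non-typesafe branch the handle is a bare pointer, so there is no .p
member to dereference. A minimal userspace mock (simplified to the
typesafe x86_64 case; not kernel code) of how the pair is used:

#include <stdio.h>

typedef struct { void *p; } guest_handle_void;	/* mock guest handle */
#define set_xen_guest_handle(hnd, val) do { (hnd).p = (val); } while (0)
#define get_xen_guest_handle(val, hnd) do { (val) = (hnd).p; } while (0)

int main(void)
{
	int payload = 42;
	int *out = NULL;
	guest_handle_void h;

	set_xen_guest_handle(h, &payload);	/* wrap for the hypercall */
	get_xen_guest_handle(out, h);		/* unwrap on the other side */
	printf("%d\n", *out);			/* prints 42 */
	return 0;
}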
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df20f15..3e3f546 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -942,7 +942,8 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	if (acpi_numa > 0)
+		acpi_numa_init();
 #endif
 
 	initmem_init(0, max_pfn);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..3a856dc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,7 +11,11 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/cpumask.h>
 #include <linux/sched.h>
+#include <linux/bitops.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -19,6 +23,8 @@
 #include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/k8.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
-
 		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 		if (num_nodes < 0)
 			return num_nodes;
@@ -522,6 +527,246 @@ out:
 	numa_init_array();
 	return 0;
 }
+
+/************************************************************************/
+#ifdef CONFIG_XEN_NUMA_GUEST
+/* XEN PV GUEST NUMA */
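+/* Domain NUMA layout reported by the hypervisor; filled in once at boot. */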
+struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
+
+static inline void __init
+bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
+{
+	/* We may need to pad the final longword with zeroes. */
+	if (nbits & (BITS_PER_LONG-1))
+		lp[BITS_TO_LONGS(nbits)-1] = 0;
+	memcpy(lp, bp, (nbits+7)/8);
+}
+
+static void __init
+xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
+{
+	unsigned int nr_cpus = min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
+
+	cpumask_clear(cpumask);
+	bitmap_byte_to_long(cpus_addr(*cpumask), xcpumask->bits, nr_cpus);
+}
+
+static void __init
+xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+	unsigned int i, j;
+	char vcpumaskstr[128];
+
+	printk(KERN_INFO "NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
+	       layout->max_vcpus, layout->max_vnodes);
+	switch (layout->type) {
+	case XEN_DOM_NUMA_CONFINED:
+		printk("type(CONFINED)\n");
+		break;
+	case XEN_DOM_NUMA_SPLIT:
+		printk("type(SPLIT)\n");
+		break;
+	case XEN_DOM_NUMA_STRIPED:
+		printk("type(STRIPED)\n");
+		break;
+	default:
+		printk("type(UNDEFINED)\n");
+	}
+
+	for (i = 0; i < layout->max_vnodes; i++) {
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		cpumask_t vcpu_mask;
+
+		xenctl_cpumask_to_cpumask(&vcpu_mask, &vnode_data->vcpu_mask);
+		cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
+		printk(KERN_INFO "vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
+		       vnode_data->vnode_id, vnode_data->mnode_id,
+		       (unsigned long)vnode_data->nr_pages, vcpumaskstr);
+	}
+
+	/* distance matrix: header row, then one row per vnode */
+	printk(KERN_INFO "vnode distances :\n");
+	for (j = 0; j < layout->max_vnodes; j++)
+		printk("\tvnode[%u]", j);
+	printk("\n");
+	for (i = 0; i < layout->max_vnodes; i++) {
+		printk("vnode[%u]", i);
+		for (j = 0; j < layout->max_vnodes; j++)
+			printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+		printk("\n");
+	}
+}
+
+static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
+{
+	/* TODO: construct a SLIT (using layout->vnode_distance) and
+	 * copy it to acpi_slit. */
+}
+
+/* Distribute the vcpus over the vnodes according to their affinity */
+static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
+{
+	int vcpu, vnode;
+
+	printk(KERN_INFO "xen_init_numa_array - cpu_to_node initialization\n");
+
+	for (vnode = 0; vnode < layout->max_vnodes; vnode++) {
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
+		cpumask_t vcpu_mask;
+
+		xenctl_cpumask_to_cpumask(&vcpu_mask, &vnode_data->vcpu_mask);
+
+		for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++) {
+			if (!cpu_isset(vcpu, vcpu_mask))
+				continue;
+			if (early_cpu_to_node(vcpu) != NUMA_NO_NODE) {
+				printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
+				       vcpu, early_cpu_to_node(vcpu));
+				continue;
+			}
+			printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
+			numa_set_node(vcpu, vnode);
+		}
+	}
+}
+
+static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
+			unsigned long start_pfn, unsigned long last_pfn)
+{
+	int num_vnodes, i;
+	u64 node_start_addr, node_end_addr, max_addr;
+
+	printk(KERN_INFO "xen_numa_emulation : max_vnodes(%u), max_vcpus(%u)\n",
+	       layout->max_vnodes, layout->max_vcpus);
+
+	if (layout->type != XEN_DOM_NUMA_SPLIT) {
+		printk(KERN_INFO "xen_numa_emulation : invalid layout type\n");
+		return -1;
+	}
+
+	memset(&nodes, 0, sizeof(nodes));
+
+	num_vnodes = layout->max_vnodes;
+	BUG_ON(num_vnodes > MAX_NUMNODES);
+
+	max_addr = last_pfn << PAGE_SHIFT;
+	node_start_addr = start_pfn << PAGE_SHIFT;
+
+	for (i = 0; i < num_vnodes; i++) {
+		struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+		u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
+
+		node_size &= FAKE_NODE_MIN_HASH_MASK; /* round down to 64MB */
+
+		if (i == (num_vnodes-1))
+			node_end_addr = max_addr;
+		else
+			node_end_addr = node_start_addr + node_size;
+		/* node_start_addr is advanced inside setup_node_range() */
+		if (setup_node_range(i, nodes, &node_start_addr,
+				node_end_addr - node_start_addr, max_addr+1))
+			goto failed;
+	}
+
+	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
+
+	memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "No NUMA hash function found.\n");
+		goto failed;
+	}
+
+	/*
+	 * Vacate any active ranges that may have been registered by SRAT.
+	 * acpi_numa was already set to -1 in xen_arch_setup(), so
+	 * srat_disabled() always returns true and the ACPI nodes are
+	 * never scanned.
+	 */
+	remove_all_active_ranges();
+	BUG_ON(acpi_numa >= 0);
+
+	for_each_node_mask(i, node_possible_map) {
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+					     nodes[i].end >> PAGE_SHIFT);
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
+	xen_init_slit_table(layout);
+	xen_init_numa_array(layout);
+	return 0;
+failed:
+	return -1;
+}
+
+static int __init
+xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
+{
+	int rc;
+	struct xenmem_numa_op memop;
+
+	/* Ask Xen for this domain's layout; it must fit in *pv_layout. */
+	memop.cmd = XENMEM_get_domain_numa_layout;
+	memop.u.dinfo.domid = DOMID_SELF;
+	memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
+	memop.u.dinfo.bufsize = sizeof(*pv_layout);
+	set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
+
+	rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop);
+	if (rc) {
+		printk(KERN_INFO "XEN NUMA GUEST: xen_get_domain_numa_layout failed\n");
+		xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
+		goto done;
+	}
+
+	/* Don't dump (or use) the layout if the interface versions differ. */
+	if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION) {
+		printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
+		xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
+		rc = -1;
+		goto done;
+	}
+	xen_dump_numa_layout(pv_layout);
+done:
+	return rc;
+}
+
+static int __init
+xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
+{
+	int rc;
+
+	if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN)) {
+		printk(KERN_INFO "xen numa emulation disabled\n");
+		return -1;
+	}
+	rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout);
+	if (rc)
+		return rc;
+	return xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
+}
+#else
+static inline int __init
+xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
+{
+	return -1;
+}
+#endif /* CONFIG_XEN_NUMA_GUEST */
+/************************************************************************/
 #endif /* CONFIG_NUMA_EMU */
 
 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
@@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+	if (!xen_pv_numa(start_pfn, last_pfn))
+		return;
+
 	if (cmdline && !numa_emulation(start_pfn, last_pfn))
 		return;
 	nodes_clear(node_possible_map);
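
The xenctl_cpumask handling above assumes the hypervisor hands the
vcpu masks over as a little-endian byte array. A standalone sketch
(with assumed inputs) of what bitmap_byte_to_long() produces:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp,
				int nbits)
{
	/* pad the final longword with zeroes, then copy the bytes */
	if (nbits & (BITS_PER_LONG - 1))
		lp[BITS_TO_LONGS(nbits) - 1] = 0;
	memcpy(lp, bp, (nbits + 7) / 8);
}

int main(void)
{
	uint8_t bytes[8] = { 0x0f };	/* vcpus 0-3 set */
	unsigned long words[BITS_TO_LONGS(64)];

	bitmap_byte_to_long(words, bytes, 64);
	printf("%#lx\n", words[0]);	/* 0xf on little-endian */
	return 0;
}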
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 7675f9b..7cf6a4f 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
        help
          Enable support for passing PCI devices through to
 	 unprivileged domains. (COMPLETELY UNTESTED)
+
+config XEN_NUMA_GUEST
+	bool "Enable support for NUMA-aware Xen domains"
+	depends on XEN && X86_64 && NUMA && NUMA_EMU
+	help
+	  Enable support for NUMA-aware Xen domains. If the memory
+	  for the domain is allocated from several machine memory
+	  nodes, this option makes the domain aware of the resulting
+	  layout through the virtual NUMA enlightenment.
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index df3e84c..2ee9f0b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
 		disable_acpi();
 	}
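+	/*
+	 * PV guests get their NUMA layout from the hypervisor (see
+	 * xen_pv_numa()), so disable ACPI SRAT parsing and hardware
+	 * NUMA probing early.
+	 */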
+	acpi_numa = -1;
+	numa_off = 1;
 #endif
 
 	/* 
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 32ab005..7f5b85e 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
  */
 #define XENMEM_machine_memory_map   10
 
+/* xen guest numa operations */
+#define XENMEM_numa_op           15
+
+#define XEN_DOM_NUMA_INTERFACE_VERSION  0x00000001
+#define XENCTL_NR_CPUS 64
+#define XENCTL_BITS_PER_BYTE 8
+#define XENCTL_BITS_TO_BYTES(bits) \
+    (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
+
+#define XENCTL_DECLARE_BITMAP(name,bits) \
+    uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
+struct xenctl_cpumask { XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
+
+/* Not used in guest */
+#define XENMEM_machine_numa_layout 0x01
+struct xenmem_node_data {
+    uint32_t node_id;
+    uint64_t node_memsize;
+    uint64_t node_memfree;
+    struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
+};
+
+/* NUMA layout for the machine.
+ * Structure has to fit within a page. */
+struct xenmem_machine_numa_layout {
+    uint32_t max_nodes;
+    /* Only (max_nodes*max_nodes) entries are filled */
+    GUEST_HANDLE(uint32_t) node_distance;
+    /* max_nodes entries of xenmem_node_data type */
+    GUEST_HANDLE(void) node_data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
+
+
+#define XENMEM_machine_nodemap  0x02
+struct xenmem_machine_nodemap {
+    /* On call the size of the available buffer */
+    uint32_t bufsize;
+
+    /* memnode map parameters */
+    int32_t shift;
+    uint32_t mapsize;
+    GUEST_HANDLE(void) map;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
+
+/* NUMA layout for the domain at the time of startup.
+ * Structure has to fit within a page. */
+#define XENMEM_set_domain_numa_layout 0x03
+#define XENMEM_get_domain_numa_layout 0x04
+
+#define XEN_MAX_VNODES 8
+
+struct xen_vnode_data {
+    uint32_t vnode_id;
+    uint32_t mnode_id;
+    uint64_t nr_pages;
+    struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
+};
+
+#define XEN_DOM_NUMA_CONFINED       0x01
+#define XEN_DOM_NUMA_SPLIT          0x02
+#define XEN_DOM_NUMA_STRIPED        0x03
+struct xen_domain_numa_layout {
+    uint32_t version;
+    uint32_t type;
+
+    uint32_t max_vcpus;
+    uint32_t max_vnodes;
+
+    /* Only (max_vnodes*max_vnodes) entries are filled */
+    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+    /* Only (max_vnodes) entries are filled */
+    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+
+struct xenmem_domain_numa_layout {
+    domid_t domid;
+    uint32_t version;
+
+    uint32_t bufsize;
+    GUEST_HANDLE(void) buf;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
+
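+/* Argument to XENMEM_numa_op; 'cmd' selects which union member is used. */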
+struct xenmem_numa_op {
+    uint32_t cmd;
+    union {
+        struct xenmem_machine_numa_layout minfo;
+        struct xenmem_machine_nodemap mnodemap;
+        struct xenmem_domain_numa_layout dinfo;
+    } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
 
 /*
  * Prevent the balloon driver from changing the memory reservation
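
Since the comments above require the layout structures to fit within a
page, a quick standalone check (mirroring the definitions above with
fixed-width types; not part of the patch) of the guest-visible size:

#include <stdio.h>
#include <stdint.h>

#define XENCTL_NR_CPUS 64
#define XENCTL_BITS_PER_BYTE 8
#define XENCTL_BITS_TO_BYTES(bits) \
	(((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
#define XENCTL_DECLARE_BITMAP(name, bits) \
	uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
struct xenctl_cpumask { XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };

#define XEN_MAX_VNODES 8
struct xen_vnode_data {
	uint32_t vnode_id, mnode_id;
	uint64_t nr_pages;
	struct xenctl_cpumask vcpu_mask;
};
struct xen_domain_numa_layout {
	uint32_t version, type, max_vcpus, max_vnodes;
	uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
	struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
};

int main(void)
{
	/* 464 bytes on x86_64, comfortably below a 4096-byte page */
	printf("xen_domain_numa_layout: %zu bytes\n",
	       sizeof(struct xen_domain_numa_layout));
	return 0;
}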
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9ffaee0..17cb17d 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -494,6 +494,8 @@ struct dom0_vga_console_info {
 /* These flags are passed in the 'flags' field of start_info_t. */
 #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
 #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_NUMA_DOMAIN   (1<<3)  /* Is the domain NUMA-aware? */
 #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
 
 typedef uint64_t cpumap_t;
@@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
 #define __mk_unsigned_long(x) x ## UL
 #define mk_unsigned_long(x) __mk_unsigned_long(x)
 
+DEFINE_GUEST_HANDLE(uint32_t);
 DEFINE_GUEST_HANDLE(uint64_t);
 
 #else /* __ASSEMBLY__ */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
