* [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
From: Dulloor @ 2010-04-04 19:30 UTC
To: xen-devel
The patch uses the virtual (PV) NUMA enlightenment to set up the nodes
(and the NUMA layout) if the guest is compiled with that support. The
method used is the same as that for faking nodes on a non-NUMA system.
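For illustration only (this is not part of the patch), here is a toy
user-space sketch of the fake-node carving the patch performs: each
vnode's nr_pages is rounded down to 64MB alignment and the last vnode
absorbs the remainder. The constants mirror FAKE_NODE_MIN_SIZE in
arch/x86/mm/numa_64.c; the two-vnode input is made up:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT              12
#define FAKE_NODE_MIN_SIZE      (64ULL << 20)   /* 64MB, as in numa_64.c */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ULL))

int main(void)
{
        /* Pretend the hypervisor reported two vnodes of ~1GB each. */
        uint64_t nr_pages[2] = { 262144, 262145 };
        uint64_t max_addr = (nr_pages[0] + nr_pages[1]) << PAGE_SHIFT;
        uint64_t start = 0, end;
        int i, num_vnodes = 2;

        for (i = 0; i < num_vnodes; i++) {
                uint64_t size = (nr_pages[i] << PAGE_SHIFT)
                                & FAKE_NODE_MIN_HASH_MASK;
                /* The last vnode takes everything up to max_addr. */
                end = (i == num_vnodes - 1) ? max_addr : start + size;
                printf("vnode[%d]: 0x%llx - 0x%llx\n", i,
                       (unsigned long long)start, (unsigned long long)end);
                start = end;
        }
        return 0;
}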
-dulloor
Signed-off-by: Dulloor Rao <dulloor@gatech.edu>
[-- Attachment #2: linux-02-pv-numa-enlightenment.patch --]
[-- Type: text/x-patch, Size: 15300 bytes --]
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9539998..c47b9fa 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -32,6 +32,7 @@
} while (0)
#elif defined(__x86_64__)
#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
#endif
#else
#if defined(__i386__)
@@ -43,6 +44,7 @@
} while (0)
#elif defined(__x86_64__)
#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#define get_xen_guest_handle(val, hnd) do { val = (hnd); } while (0)
#endif
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df20f15..3e3f546 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -942,7 +942,8 @@ void __init setup_arch(char **cmdline_p)
/*
* Parse SRAT to discover nodes.
*/
- acpi_numa_init();
+ if (acpi_numa > 0)
+ acpi_numa_init();
#endif
initmem_init(0, max_pfn);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..3a856dc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,7 +11,11 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
+#include <linux/cpumask.h>
#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -19,6 +23,8 @@
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
*/
if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
long n = simple_strtol(cmdline, NULL, 0);
-
num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
if (num_nodes < 0)
return num_nodes;
@@ -522,6 +527,246 @@ out:
numa_init_array();
return 0;
}
+
+/************************************************************************/
+#ifdef CONFIG_XEN_NUMA_GUEST
+/* XEN PV GUEST NUMA */
+struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
+
+static inline void __init
+bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
+{
+ /* We may need to pad the final longword with zeroes. */
+ if (nbits & (BITS_PER_LONG-1))
+ lp[BITS_TO_LONGS(nbits)-1] = 0;
+ memcpy(lp, bp, (nbits+7)/8);
+}
+
+static void __init
+xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
+{
+ unsigned int nr_cpus;
+ uint8_t *bytemap;
+
+ bytemap = xcpumask->bits;
+
+ nr_cpus =
+ min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
+
+ cpumask_clear(cpumask);
+ bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
+}
+
+static void __init
+xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+ unsigned int i, j;
+ char vcpumaskstr[128];
+ printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
+ layout->max_vcpus, layout->max_vnodes);
+ switch (layout->type)
+ {
+ case XEN_DOM_NUMA_CONFINED:
+ printk("type(CONFINED)\n");
+ break;
+ case XEN_DOM_NUMA_SPLIT:
+ printk("type(SPLIT)\n");
+ break;
+ case XEN_DOM_NUMA_STRIPED:
+ printk("type(STRIPED)\n");
+ break;
+ default:
+ printk("type(UNDEFINED)\n");
+ }
+
+ for (i = 0; i < layout->max_vnodes; i++)
+ {
+ cpumask_t vcpu_mask;
+ struct xenctl_cpumask *xvcpumask;
+ struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+ xvcpumask = &vnode_data->vcpu_mask;
+ xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
+ cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
+ printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
+ vnode_data->vnode_id, vnode_data->mnode_id,
+ (unsigned long)vnode_data->nr_pages, vcpumaskstr);
+ }
+
+ printk("vnode distances :\n");
+ for (i = 0; i < layout->max_vnodes; i++)
+ printk("\tvnode[%u]", i);
+ for (i = 0; i < layout->max_vnodes; i++)
+ {
+ printk("\nvnode[%u]", i);
+ for (j = 0; j < layout->max_vnodes; j++)
+ printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+ printk("\n");
+ }
+ return;
+}
+
+static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
+{
+ /* Construct a slit table (using layout->vnode_distance).
+ * Copy it to acpi_slit. */
+ return;
+}
+
+/* Distribute the vcpus over the vnodes according to their affinity */
+static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
+{
+ int vcpu, vnode;
+
+ printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
+
+
+ for (vnode = 0; vnode < layout->max_vnodes; vnode++)
+ {
+ struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
+ struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
+ cpumask_t vcpu_mask;
+
+ xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
+
+ for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
+ {
+ if (cpu_isset(vcpu, vcpu_mask))
+ {
+ if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
+ {
+ printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
+ vcpu, early_cpu_to_node(vcpu));
+ continue;
+ }
+ printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
+ numa_set_node(vcpu, vnode);
+ }
+ }
+ }
+ return;
+}
+
+static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
+ unsigned long start_pfn, unsigned long last_pfn)
+{
+ int num_vnodes, i;
+ u64 node_start_addr, node_end_addr, max_addr;
+
+ printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
+ layout->max_vnodes, layout->max_vcpus);
+
+ if (layout->type != XEN_DOM_NUMA_SPLIT)
+ {
+ printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
+ return -1;
+ }
+
+ memset(&nodes, 0, sizeof(nodes));
+
+ num_vnodes = layout->max_vnodes;
+ BUG_ON(num_vnodes > MAX_NUMNODES);
+
+ max_addr = last_pfn << PAGE_SHIFT;
+
+ node_start_addr = start_pfn << PAGE_SHIFT;
+ for (i = 0; i < num_vnodes; i++)
+ {
+ struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+ u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
+
+ node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
+
+ if (i == (num_vnodes-1))
+ node_end_addr = max_addr;
+ else
+ node_end_addr = node_start_addr + node_size;
+ /* node_start_addr updated inside the function */
+ if (setup_node_range(i, nodes, &node_start_addr,
+ (node_end_addr-node_start_addr), max_addr+1))
+ goto failed;
+ }
+
+ printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
+
+ memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
+ if (memnode_shift < 0) {
+ printk(KERN_ERR "No NUMA hash function found.\n");
+ goto failed;
+ }
+ /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
+ /*
+ * We need to vacate all active ranges that may have been registered by
+ * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+ * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
+ */
+ remove_all_active_ranges();
+
+ BUG_ON(acpi_numa >= 0);
+ for_each_node_mask(i, node_possible_map) {
+ e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
+ xen_init_slit_table(layout);
+ xen_init_numa_array(layout);
+ return 0;
+failed:
+ return -1;
+}
+
+static int __init
+xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
+{
+ int rc;
+ struct xenmem_numa_op memop;
+ memop.cmd = XENMEM_get_domain_numa_layout;
+ memop.u.dinfo.domid = DOMID_SELF;
+ memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
+ memop.u.dinfo.bufsize = sizeof(*pv_layout);
+ set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
+
+ if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
+ {
+ printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
+ xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
+ goto done;
+ }
+
+ if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
+ {
+ printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
+ xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
+ rc = -1;
+ }
+ xen_dump_numa_layout(pv_layout);
+done:
+ return rc;
+}
+
+static int __init
+xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
+{
+ int rc = 0;
+ if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
+ {
+ rc = -1;
+ printk(KERN_INFO "xen numa emulation disabled\n");
+ goto done;
+ }
+ if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
+ goto done;
+ rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
+done:
+ return rc;
+}
+#else
+static inline int __init
+xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
+{
+ return -1;
+}
+#endif /* CONFIG_XEN_NUMA_GUEST */
+/************************************************************************/
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
@@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
+ if (!xen_pv_numa(start_pfn, last_pfn))
+ return;
+
if (cmdline && !numa_emulation(start_pfn, last_pfn))
return;
nodes_clear(node_possible_map);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 7675f9b..7cf6a4f 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
help
Enable support for passing PCI devices through to
unprivileged domains. (COMPLETELY UNTESTED)
+
+config XEN_NUMA_GUEST
+ bool "Enable support for NUMA aware Xen domains"
+ depends on XEN && X86_64 && NUMA && NUMA_EMU
+ help
+ Enable support for NUMA aware Xen domains. With this
+ option, if the memory for the domain is allocated
+ from different memory nodes, the domain is made aware
+ of this through a Virtual NUMA enlightenment.
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index df3e84c..2ee9f0b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
+ acpi_numa = -1;
+ numa_off = 1;
#endif
/*
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 32ab005..7f5b85e 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
*/
#define XENMEM_machine_memory_map 10
+/* xen guest numa operations */
+#define XENMEM_numa_op 15
+
+#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
+#define XENCTL_NR_CPUS 64
+#define XENCTL_BITS_PER_BYTE 8
+#define XENCTL_BITS_TO_BYTES(bits) \
+ (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
+
+#define XENCTL_DECLARE_BITMAP(name,bits) \
+ uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
+struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
+
+/* Not used in guest */
+#define XENMEM_machine_numa_layout 0x01
+struct xenmem_node_data {
+ uint32_t node_id;
+ uint64_t node_memsize;
+ uint64_t node_memfree;
+ struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
+};
+
+/* NUMA layout for the machine.
+ * Structure has to fit within a page. */
+struct xenmem_machine_numa_layout {
+ uint32_t max_nodes;
+ /* Only (max_nodes*max_nodes) entries are filled */
+ GUEST_HANDLE(uint32_t) node_distance;
+ /* max_vnodes entries of xenmem_node_data type */
+ GUEST_HANDLE(void) node_data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
+
+
+#define XENMEM_machine_nodemap 0x02
+struct xenmem_machine_nodemap {
+ /* On call the size of the available buffer */
+ uint32_t bufsize;
+
+ /* memnode map parameters */
+ int32_t shift;
+ uint32_t mapsize;
+ GUEST_HANDLE(void) map;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
+
+/* NUMA layout for the domain at the time of startup.
+ * Structure has to fit within a page. */
+#define XENMEM_set_domain_numa_layout 0x03
+#define XENMEM_get_domain_numa_layout 0x04
+
+/* NUMA layout for the domain at the time of startup.
+ * Structure has to fit within a page. */
+#define XEN_MAX_VNODES 8
+
+struct xen_vnode_data {
+ uint32_t vnode_id;
+ uint32_t mnode_id;
+ uint64_t nr_pages;
+ struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
+};
+
+#define XEN_DOM_NUMA_CONFINED 0x01
+#define XEN_DOM_NUMA_SPLIT 0x02
+#define XEN_DOM_NUMA_STRIPED 0x03
+struct xen_domain_numa_layout {
+ uint32_t version;
+ uint32_t type;
+
+ uint32_t max_vcpus;
+ uint32_t max_vnodes;
+
+ /* Only (max_vnodes*max_vnodes) entries are filled */
+ uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+ /* Only (max_vnodes) entries are filled */
+ struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+
+struct xenmem_domain_numa_layout {
+ domid_t domid;
+ uint32_t version;
+
+ uint32_t bufsize;
+ GUEST_HANDLE(void) buf;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
+
+struct xenmem_numa_op {
+ uint32_t cmd;
+ union {
+ struct xenmem_machine_numa_layout minfo;
+ struct xenmem_machine_nodemap mnodemap;
+ struct xenmem_domain_numa_layout dinfo;
+ } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
/*
* Prevent the balloon driver from changing the memory reservation
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9ffaee0..17cb17d 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -494,6 +494,8 @@ struct dom0_vga_console_info {
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
+#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */
#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
typedef uint64_t cpumap_t;
@@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
#define __mk_unsigned_long(x) x ## UL
#define mk_unsigned_long(x) __mk_unsigned_long(x)
+DEFINE_GUEST_HANDLE(uint32_t);
DEFINE_GUEST_HANDLE(uint64_t);
#else /* __ASSEMBLY__ */
* Re: [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
From: Konrad Rzeszutek Wilk @ 2010-04-05 14:16 UTC
To: Dulloor; +Cc: xen-devel
It looks as if the comments I provided in
<20100216174900.GB21067@phenom.dumpdata.com> were ignored.
http://article.gmane.org/gmane.comp.emulators.xen.devel/78118
Here I am repeating some of them and adding some new ones.
Please address/answer them.
.. snip..
> - acpi_numa_init();
> + if (acpi_numa > 0)
> + acpi_numa_init();
The style guide is to use tabs, not spaces. This looks out of place.
Why not just do acpi_numa_init() by itself?
> #endif
>
> initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..3a856dc 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,11 @@
> #include <linux/ctype.h>
> #include <linux/module.h>
> #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
> #include <linux/sched.h>
> +#include <linux/bitops.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
>
> #include <asm/e820.h>
> #include <asm/proto.h>
> @@ -19,6 +23,8 @@
> #include <asm/numa.h>
> #include <asm/acpi.h>
> #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/hypercall.h>
>
> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
> EXPORT_SYMBOL(node_data);
> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
> */
> if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
> long n = simple_strtol(cmdline, NULL, 0);
> -
Uhh, why?
> num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
> if (num_nodes < 0)
> return num_nodes;
> @@ -522,6 +527,246 @@ out:
> numa_init_array();
> return 0;
> }
> +
> +/************************************************************************/
> +#ifdef CONFIG_XEN_NUMA_GUEST
Yikes. Can you move all of this code into its own file so that there is
no need to sprinkle this with #ifdefs?
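The usual way to do that is a small header with an inline stub for the
=n case, e.g. (sketch only; the file names are hypothetical):

/* e.g. a new arch/x86/include/asm/xen/numa.h */
#ifdef CONFIG_XEN_NUMA_GUEST
int xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn);
#else
static inline int
xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
{
        return -1;      /* fall back to the other emulation paths */
}
#endif /* CONFIG_XEN_NUMA_GUEST */

Then the implementation can live in, say, arch/x86/xen/numa.c and
numa_64.c just includes the header and calls xen_pv_numa().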
> +/* XEN PV GUEST NUMA */
> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
> +
> +static inline void __init
> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
> +{
> + /* We may need to pad the final longword with zeroes. */
> + if (nbits & (BITS_PER_LONG-1))
> + lp[BITS_TO_LONGS(nbits)-1] = 0;
> + memcpy(lp, bp, (nbits+7)/8);
> +}
> +
> +static void __init
> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
> +{
> + unsigned int nr_cpus;
> + uint8_t *bytemap;
> +
> + bytemap = xcpumask->bits;
> +
> + nr_cpus =
> + min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
> +
> + cpumask_clear(cpumask);
> + bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
> +}
> +
> +static void __init
> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> + unsigned int i, j;
> + char vcpumaskstr[128];
> + printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
> + layout->max_vcpus, layout->max_vnodes);
No good. You need printk(KERN_DEBUG ...).
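I.e. something like:

        printk(KERN_DEBUG "NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
               layout->max_vcpus, layout->max_vnodes);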
> + switch (layout->type)
> + {
> + case XEN_DOM_NUMA_CONFINED:
> + printk("type(CONFINED)\n");
ditto.
> + break;
> + case XEN_DOM_NUMA_SPLIT:
> + printk("type(SPLIT)\n");
ditto.
> + break;
> + case XEN_DOM_NUMA_STRIPED:
> + printk("type(STRIPED)\n");
ditto.
> + break;
> + default:
> + printk("type(UNDEFINED)\n");
ditto.
> + }
> +
> + for (i = 0; i < layout->max_vnodes; i++)
> + {
> + cpumask_t vcpu_mask;
> + struct xenctl_cpumask *xvcpumask;
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> + xvcpumask = &vnode_data->vcpu_mask;
> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
> + cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
> + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
ditto
> + vnode_data->vnode_id, vnode_data->mnode_id,
> + (unsigned long)vnode_data->nr_pages, vcpumaskstr);
> + }
> +
> + printk("vnode distances :\n");
ditto
> + for (i = 0; i < layout->max_vnodes; i++)
> + printk("\tvnode[%u]", i);
ditto
> + for (i = 0; i < layout->max_vnodes; i++)
> + {
> + printk("\nvnode[%u]", i);
ditto
> + for (j = 0; j < layout->max_vnodes; j++)
> + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> + printk("\n");
ditto
> + }
> + return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> + /* Construct a slit table (using layout->vnode_distance).
> + * Copy it to acpi_slit. */
> + return;
> +}
> +
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> + int vcpu, vnode;
> +
> + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
> +
> +
> + for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> + {
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> + struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
> + cpumask_t vcpu_mask;
> +
> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
> +
> + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> + {
> + if (cpu_isset(vcpu, vcpu_mask))
> + {
> + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> + {
> + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
> + vcpu, early_cpu_to_node(vcpu));
> + continue;
> + }
> + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> + numa_set_node(vcpu, vnode);
> + }
> + }
> + }
> + return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> + unsigned long start_pfn, unsigned long last_pfn)
> +{
> + int num_vnodes, i;
> + u64 node_start_addr, node_end_addr, max_addr;
> +
> + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> + layout->max_vnodes, layout->max_vcpus);
> +
> + if (layout->type != XEN_DOM_NUMA_SPLIT)
> + {
> + printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
> + return -1;
> + }
> +
> + memset(&nodes, 0, sizeof(nodes));
> +
> + num_vnodes = layout->max_vnodes;
> + BUG_ON(num_vnodes > MAX_NUMNODES);
> +
> + max_addr = last_pfn << PAGE_SHIFT;
> +
> + node_start_addr = start_pfn << PAGE_SHIFT;
> + for (i = 0; i < num_vnodes; i++)
> + {
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> + if (i == (num_vnodes-1))
> + node_end_addr = max_addr;
> + else
> + node_end_addr = node_start_addr + node_size;
> + /* node_start_addr updated inside the function */
> + if (setup_node_range(i, nodes, &node_start_addr,
> + (node_end_addr-node_start_addr), max_addr+1))
> + goto failed;
> + }
> +
> + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
> +
> + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> + if (memnode_shift < 0) {
> + printk(KERN_ERR "No NUMA hash function found.\n");
> + goto failed;
> + }
> + /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> + /*
> + * We need to vacate all active ranges that may have been registered by
> + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> + * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
> + */
> + remove_all_active_ranges();
> +
> + BUG_ON(acpi_numa >= 0);
> + for_each_node_mask(i, node_possible_map) {
> + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> + nodes[i].end >> PAGE_SHIFT);
> + setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> + }
> + xen_init_slit_table(layout);
> + xen_init_numa_array(layout);
> + return 0;
> +failed:
> + return -1;
> +}
> +
> +static int __init
> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
> +{
> + int rc;
> + struct xenmem_numa_op memop;
> + memop.cmd = XENMEM_get_domain_numa_layout;
> + memop.u.dinfo.domid = DOMID_SELF;
> + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
> + memop.u.dinfo.bufsize = sizeof(*pv_layout);
> + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
> +
> + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
> + {
> + printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> + goto done;
> + }
> +
> + if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
> + {
> + printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> + rc = -1;
> + }
> + xen_dump_numa_layout(pv_layout);
> +done:
> + return rc;
> +}
> +
> +static int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> + int rc = 0;
> + if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
> + {
> + rc = -1;
> + printk(KERN_INFO "xen numa emulation disabled\n");
> + goto done;
The tabs/spaces don't look right. Can you run this patch through
checkpatch.pl?
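For example, from the top of the tree:

        ./scripts/checkpatch.pl linux-02-pv-numa-enlightenment.patch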
> + }
> + if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
> + goto done;
> + rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
> +done:
> + return rc;
> +}
> +#else
> +static inline int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> + return -1;
> +}
> +#endif /* CONFIG_XEN_NUMA_GUEST */
> +/************************************************************************/
> #endif /* CONFIG_NUMA_EMU */
>
> void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> nodes_clear(node_online_map);
>
> #ifdef CONFIG_NUMA_EMU
> + if (!xen_pv_numa(start_pfn, last_pfn))
> + return;
> +
> if (cmdline && !numa_emulation(start_pfn, last_pfn))
> return;
> nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
> index 7675f9b..7cf6a4f 100644
> --- a/arch/x86/xen/Kconfig
> +++ b/arch/x86/xen/Kconfig
> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
> help
> Enable support for passing PCI devices through to
> unprivileged domains. (COMPLETELY UNTESTED)
> +
> +config XEN_NUMA_GUEST
> + bool "Enable support for NUMA aware Xen domains"
> + depends on XEN && X86_64 && NUMA && NUMA_EMU
> + help
> + Enable support for NUMA aware Xen domains. With this
> + option, if the memory for the domain is allocated
> + from different memory nodes, the domain is made aware
> + of this through a Virtual NUMA enlightenment.
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index df3e84c..2ee9f0b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
> printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
> disable_acpi();
> }
> + acpi_numa = -1;
> + numa_off = 1;
> #endif
>
> /*
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 32ab005..7f5b85e 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
> */
> #define XENMEM_machine_memory_map 10
>
> +/* xen guest numa operations */
> +#define XENMEM_numa_op 15
> +
> +#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
> +#define XENCTL_NR_CPUS 64
> +#define XENCTL_BITS_PER_BYTE 8
> +#define XENCTL_BITS_TO_BYTES(bits) \
> + (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
> +
> +#define XENCTL_DECLARE_BITMAP(name,bits) \
> + uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
> +
> +/* Not used in guest */
> +#define XENMEM_machine_numa_layout 0x01
> +struct xenmem_node_data {
> + uint32_t node_id;
> + uint64_t node_memsize;
> + uint64_t node_memfree;
> + struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
> +};
> +
> +/* NUMA layout for the machine.
> + * Structure has to fit within a page. */
> +struct xenmem_machine_numa_layout {
> + uint32_t max_nodes;
> + /* Only (max_nodes*max_nodes) entries are filled */
> + GUEST_HANDLE(uint32_t) node_distance;
> + /* max_vnodes entries of xenmem_node_data type */
> + GUEST_HANDLE(void) node_data;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
> +
> +
> +#define XENMEM_machine_nodemap 0x02
> +struct xenmem_machine_nodemap {
> + /* On call the size of the available buffer */
> + uint32_t bufsize;
> +
> + /* memnode map parameters */
> + int32_t shift;
> + uint32_t mapsize;
> + GUEST_HANDLE(void) map;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XENMEM_set_domain_numa_layout 0x03
> +#define XENMEM_get_domain_numa_layout 0x04
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XEN_MAX_VNODES 8
> +
> +struct xen_vnode_data {
> + uint32_t vnode_id;
> + uint32_t mnode_id;
> + uint64_t nr_pages;
> + struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +
> +#define XEN_DOM_NUMA_CONFINED 0x01
> +#define XEN_DOM_NUMA_SPLIT 0x02
> +#define XEN_DOM_NUMA_STRIPED 0x03
> +struct xen_domain_numa_layout {
> + uint32_t version;
> + uint32_t type;
> +
> + uint32_t max_vcpus;
> + uint32_t max_vnodes;
> +
> + /* Only (max_vnodes*max_vnodes) entries are filled */
> + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> + /* Only (max_vnodes) entries are filled */
> + struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +
> +struct xenmem_domain_numa_layout {
> + domid_t domid;
> + uint32_t version;
> +
> + uint32_t bufsize;
> + GUEST_HANDLE(void) buf;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
> +
> +struct xenmem_numa_op {
> + uint32_t cmd;
> + union {
> + struct xenmem_machine_numa_layout minfo;
> + struct xenmem_machine_nodemap mnodemap;
> + struct xenmem_domain_numa_layout dinfo;
> + } u;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>
> /*
> * Prevent the balloon driver from changing the memory reservation
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 9ffaee0..17cb17d 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
> /* These flags are passed in the 'flags' field of start_info_t. */
> #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
> #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
> +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
> +#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */
> #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
>
> typedef uint64_t cpumap_t;
> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
> #define __mk_unsigned_long(x) x ## UL
> #define mk_unsigned_long(x) __mk_unsigned_long(x)
>
> +DEFINE_GUEST_HANDLE(uint32_t);
> DEFINE_GUEST_HANDLE(uint64_t);
>
> #else /* __ASSEMBLY__ */
* Re: [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
From: Dulloor @ 2010-04-06 5:19 UTC
To: Konrad Rzeszutek Wilk; +Cc: xen-devel
> It looks as if the comments I provided in
> <20100216174900.GB21067@phenom.dumpdata.com> were ignored.
I incorporated a number of your comments, but obviously missed some.
Sorry, that wasn't intentional.
Comments inline.
On Mon, Apr 5, 2010 at 10:16 AM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
>
> It looks as if the comments I provided in
> <20100216174900.GB21067@phenom.dumpdata.com> were ignored.
>
> http://article.gmane.org/gmane.comp.emulators.xen.devel/78118
>
> Here I am repeating some of them and adding some new ones.
> Please address/answer them.
>
> .. snip..
>
>> - acpi_numa_init();
>> + if (acpi_numa > 0)
>> + acpi_numa_init();
>
> The style guide is to use tabs, not spaces. This looks out of place.
I had the darn "expandtab" set in my vimrc. I have run the patch
through checkpatch.pl and will fix the indentation issues.
> Why not just do acpi_numa_init() by itself?
>
With PV kernels, we know well ahead of time that ACPI NUMA is
irrelevant, the same as when booting Linux with noacpi, so I thought it
unnecessary to even parse the ACPI NUMA tables in that case (that is
what the acpi_numa = -1 and numa_off = 1 assignments in
xen_arch_setup() are for).
>
>> #endif
>>
>> initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..3a856dc 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,11 @@
>> #include <linux/ctype.h>
>> #include <linux/module.h>
>> #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>> #include <linux/sched.h>
>> +#include <linux/bitops.h>
>> +#include <xen/interface/xen.h>
>> +#include <xen/interface/memory.h>
>>
>> #include <asm/e820.h>
>> #include <asm/proto.h>
>> @@ -19,6 +23,8 @@
>> #include <asm/numa.h>
>> #include <asm/acpi.h>
>> #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>> +#include <asm/xen/hypercall.h>
>>
>> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>> EXPORT_SYMBOL(node_data);
>> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>> */
>> if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>> long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> Uhh, why?
By mistake! :)
>> num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>> if (num_nodes < 0)
>> return num_nodes;
>> @@ -522,6 +527,246 @@ out:
>> numa_init_array();
>> return 0;
>> }
>> +
>> +/************************************************************************/
>> +#ifdef CONFIG_XEN_NUMA_GUEST
>
> Yikes. Can you move all of this code into its own file so that there is
> no need to sprinkle this with #ifdefs?
I agree with you. I will also take care of the few xen header files I
have added to the file.
>
>> +/* XEN PV GUEST NUMA */
>> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
>> +
>> +static inline void __init
>> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
>> +{
>> + /* We may need to pad the final longword with zeroes. */
>> + if (nbits & (BITS_PER_LONG-1))
>> + lp[BITS_TO_LONGS(nbits)-1] = 0;
>> + memcpy(lp, bp, (nbits+7)/8);
>> +}
>> +
>> +static void __init
>> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
>> +{
>> + unsigned int nr_cpus;
>> + uint8_t *bytemap;
>> +
>> + bytemap = xcpumask->bits;
>> +
>> + nr_cpus =
>> + min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
>> +
>> + cpumask_clear(cpumask);
>> + bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
>> +}
>> +
>> +static void __init
>> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> + unsigned int i, j;
>> + char vcpumaskstr[128];
>> + printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
>> + layout->max_vcpus, layout->max_vnodes);
>
> No good. You need printk(KERN_DEBUG ...).
I missed these in your previous comments. Will take care of them.
>> + switch (layout->type)
>> + {
>> + case XEN_DOM_NUMA_CONFINED:
>> + printk("type(CONFINED)\n");
> ditto.
>> + break;
>> + case XEN_DOM_NUMA_SPLIT:
>> + printk("type(SPLIT)\n");
> ditto.
>> + break;
>> + case XEN_DOM_NUMA_STRIPED:
>> + printk("type(STRIPED)\n");
> ditto.
>> + break;
>> + default:
>> + printk("type(UNDEFINED)\n");
>
> ditto.
>> + }
>> +
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + {
>> + cpumask_t vcpu_mask;
>> + struct xenctl_cpumask *xvcpumask;
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> + xvcpumask = &vnode_data->vcpu_mask;
>> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
>> + cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
>> + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
> ditto
>> + vnode_data->vnode_id, vnode_data->mnode_id,
>> + (unsigned long)vnode_data->nr_pages, vcpumaskstr);
>> + }
>> +
>> + printk("vnode distances :\n");
> ditto
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + printk("\tvnode[%u]", i);
> ditto
>> + for (i = 0; i < layout->max_vnodes; i++)
>> + {
>> + printk("\nvnode[%u]", i);
> ditto
>> + for (j = 0; j < layout->max_vnodes; j++)
>> + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
>> + printk("\n");
> ditto
>> + }
>> + return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
>> +{
>> + /* Construct a slit table (using layout->vnode_distance).
>> + * Copy it to acpi_slit. */
>> + return;
>> +}
>> +
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
>> +{
>> + int vcpu, vnode;
>> +
>> + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>> +
>> +
>> + for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> + {
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> + struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
>> + cpumask_t vcpu_mask;
>> +
>> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
>> +
>> + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> + {
>> + if (cpu_isset(vcpu, vcpu_mask))
>> + {
>> + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> + {
>> + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> + vcpu, early_cpu_to_node(vcpu));
>> + continue;
>> + }
>> + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> + numa_set_node(vcpu, vnode);
>> + }
>> + }
>> + }
>> + return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> + unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + int num_vnodes, i;
>> + u64 node_start_addr, node_end_addr, max_addr;
>> +
>> + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> + layout->max_vnodes, layout->max_vcpus);
>> +
>> + if (layout->type != XEN_DOM_NUMA_SPLIT)
>> + {
>> + printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
>> + return -1;
>> + }
>> +
>> + memset(&nodes, 0, sizeof(nodes));
>> +
>> + num_vnodes = layout->max_vnodes;
>> + BUG_ON(num_vnodes > MAX_NUMNODES);
>> +
>> + max_addr = last_pfn << PAGE_SHIFT;
>> +
>> + node_start_addr = start_pfn << PAGE_SHIFT;
>> + for (i = 0; i < num_vnodes; i++)
>> + {
>> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> + if (i == (num_vnodes-1))
>> + node_end_addr = max_addr;
>> + else
>> + node_end_addr = node_start_addr + node_size;
>> + /* node_start_addr updated inside the function */
>> + if (setup_node_range(i, nodes, &node_start_addr,
>> + (node_end_addr-node_start_addr), max_addr+1))
>> + goto failed;
>> + }
>> +
>> + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>> +
>> + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> + if (memnode_shift < 0) {
>> + printk(KERN_ERR "No NUMA hash function found.\n");
>> + goto failed;
>> + }
>> + /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
>> + /*
>> + * We need to vacate all active ranges that may have been registered by
>> + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> + * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
>> + */
>> + remove_all_active_ranges();
>> +
>> + BUG_ON(acpi_numa >= 0);
>> + for_each_node_mask(i, node_possible_map) {
>> + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> + nodes[i].end >> PAGE_SHIFT);
>> + setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> + }
>> + xen_init_slit_table(layout);
>> + xen_init_numa_array(layout);
>> + return 0;
>> +failed:
>> + return -1;
>> +}
>> +
>> +static int __init
>> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
>> +{
>> + int rc;
>> + struct xenmem_numa_op memop;
>> + memop.cmd = XENMEM_get_domain_numa_layout;
>> + memop.u.dinfo.domid = DOMID_SELF;
>> + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
>> + memop.u.dinfo.bufsize = sizeof(*pv_layout);
>> + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
>> +
>> + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
>> + {
>> + printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
>> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> + goto done;
>> + }
>> +
>> + if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
>> + {
>> + printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
>> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> + rc = -1;
>> + }
>> + xen_dump_numa_layout(pv_layout);
>> +done:
>> + return rc;
>> +}
>> +
>> +static int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + int rc = 0;
>> + if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
>> + {
>> + rc = -1;
>> + printk(KERN_INFO "xen numa emulation disabled\n");
>> + goto done;
>
> The tabs/spaces don't look right. Can you run this patch through
> checkpatch.pl?
Ok.
>
>> + }
>> + if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
>> + goto done;
>> + rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
>> +done:
>> + return rc;
>> +}
>> +#else
>> +static inline int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> + return -1;
>> +}
>> +#endif /* CONFIG_XEN_NUMA_GUEST */
>> +/************************************************************************/
>> #endif /* CONFIG_NUMA_EMU */
>>
>> void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>> nodes_clear(node_online_map);
>>
>> #ifdef CONFIG_NUMA_EMU
>> + if (!xen_pv_numa(start_pfn, last_pfn))
>> + return;
>> +
>> if (cmdline && !numa_emulation(start_pfn, last_pfn))
>> return;
>> nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
>> index 7675f9b..7cf6a4f 100644
>> --- a/arch/x86/xen/Kconfig
>> +++ b/arch/x86/xen/Kconfig
>> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
>> help
>> Enable support for passing PCI devices through to
>> unprivileged domains. (COMPLETELY UNTESTED)
>> +
>> +config XEN_NUMA_GUEST
>> + bool "Enable support for NUMA aware Xen domains"
>> + depends on XEN && X86_64 && NUMA && NUMA_EMU
>> + help
>> + Enable support for NUMA aware Xen domains. With this
>> + option, if the memory for the domain is allocated
>> + from different memory nodes, the domain is made aware
>> + of this through a Virtual NUMA enlightenment.
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index df3e84c..2ee9f0b 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
>> printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>> disable_acpi();
>> }
>> + acpi_numa = -1;
>> + numa_off = 1;
>> #endif
>>
>> /*
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 32ab005..7f5b85e 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
>> */
>> #define XENMEM_machine_memory_map 10
>>
>> +/* xen guest numa operations */
>> +#define XENMEM_numa_op 15
>> +
>> +#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
>> +#define XENCTL_NR_CPUS 64
>> +#define XENCTL_BITS_PER_BYTE 8
>> +#define XENCTL_BITS_TO_BYTES(bits) \
>> + (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
>> +
>> +#define XENCTL_DECLARE_BITMAP(name,bits) \
>> + uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
>> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
>> +
>> +/* Not used in guest */
>> +#define XENMEM_machine_numa_layout 0x01
>> +struct xenmem_node_data {
>> + uint32_t node_id;
>> + uint64_t node_memsize;
>> + uint64_t node_memfree;
>> + struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
>> +};
>> +
>> +/* NUMA layout for the machine.
>> + * Structure has to fit within a page. */
>> +struct xenmem_machine_numa_layout {
>> + uint32_t max_nodes;
>> + /* Only (max_nodes*max_nodes) entries are filled */
>> + GUEST_HANDLE(uint32_t) node_distance;
>> + /* max_vnodes entries of xenmem_node_data type */
>> + GUEST_HANDLE(void) node_data;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
>> +
>> +
>> +#define XENMEM_machine_nodemap 0x02
>> +struct xenmem_machine_nodemap {
>> + /* On call the size of the available buffer */
>> + uint32_t bufsize;
>> +
>> + /* memnode map parameters */
>> + int32_t shift;
>> + uint32_t mapsize;
>> + GUEST_HANDLE(void) map;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XENMEM_set_domain_numa_layout 0x03
>> +#define XENMEM_get_domain_numa_layout 0x04
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XEN_MAX_VNODES 8
>> +
>> +struct xen_vnode_data {
>> + uint32_t vnode_id;
>> + uint32_t mnode_id;
>> + uint64_t nr_pages;
>> + struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +
>> +#define XEN_DOM_NUMA_CONFINED 0x01
>> +#define XEN_DOM_NUMA_SPLIT 0x02
>> +#define XEN_DOM_NUMA_STRIPED 0x03
>> +struct xen_domain_numa_layout {
>> + uint32_t version;
>> + uint32_t type;
>> +
>> + uint32_t max_vcpus;
>> + uint32_t max_vnodes;
>> +
>> + /* Only (max_vnodes*max_vnodes) entries are filled */
>> + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> + /* Only (max_vnodes) entries are filled */
>> + struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +
>> +struct xenmem_domain_numa_layout {
>> + domid_t domid;
>> + uint32_t version;
>> +
>> + uint32_t bufsize;
>> + GUEST_HANDLE(void) buf;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
>> +
>> +struct xenmem_numa_op {
>> + uint32_t cmd;
>> + union {
>> + struct xenmem_machine_numa_layout minfo;
>> + struct xenmem_machine_nodemap mnodemap;
>> + struct xenmem_domain_numa_layout dinfo;
>> + } u;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>>
>> /*
>> * Prevent the balloon driver from changing the memory reservation
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 9ffaee0..17cb17d 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
>> /* These flags are passed in the 'flags' field of start_info_t. */
>> #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
>> #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
>> +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
>> +#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */
>> #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
>>
>> typedef uint64_t cpumap_t;
>> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
>> #define __mk_unsigned_long(x) x ## UL
>> #define mk_unsigned_long(x) __mk_unsigned_long(x)
>>
>> +DEFINE_GUEST_HANDLE(uint32_t);
>> DEFINE_GUEST_HANDLE(uint64_t);
>>
>> #else /* __ASSEMBLY__ */
>