From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: Dulloor <dulloor@gmail.com>
Cc: xen-devel@lists.xensource.com
Subject: Re: [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
Date: Mon, 5 Apr 2010 10:16:48 -0400 [thread overview]
Message-ID: <20100405141648.GC1227@phenom.dumpdata.com> (raw)
In-Reply-To: <k2p940bcfd21004041230wdde55c1fie4647bef37da36b8@mail.gmail.com>
It looks as the comments I provided in
<20100216174900.GB21067@phenom.dumpdata.com> were ignored.
http://article.gmane.org/gmane.comp.emulators.xen.devel/78118
Here I am repeating some of them and adding some new ones.
Please address/answer them.
.. snip..
> - acpi_numa_init();
> + if (acpi_numa > 0)
> + acpi_numa_init();
The style guide is to use tabs, not spaces. It looks out-off-place?
Why not just do acpi_numa_init() by itself?
> #endif
>
> initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..3a856dc 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,11 @@
> #include <linux/ctype.h>
> #include <linux/module.h>
> #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
> #include <linux/sched.h>
> +#include <linux/bitops.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
>
> #include <asm/e820.h>
> #include <asm/proto.h>
> @@ -19,6 +23,8 @@
> #include <asm/numa.h>
> #include <asm/acpi.h>
> #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/hypercall.h>
>
> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
> EXPORT_SYMBOL(node_data);
> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
> */
> if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
> long n = simple_strtol(cmdline, NULL, 0);
> -
Uhh, why?
> num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
> if (num_nodes < 0)
> return num_nodes;
> @@ -522,6 +527,246 @@ out:
> numa_init_array();
> return 0;
> }
> +
> +/************************************************************************/
> +#ifdef CONFIG_XEN_NUMA_GUEST
Yikes. Can you move all of this code in it is own file so that there is
no need to sprinkle this with #ifdefs?
> +/* XEN PV GUEST NUMA */
> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
> +
> +static inline void __init
> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
> +{
> + /* We may need to pad the final longword with zeroes. */
> + if (nbits & (BITS_PER_LONG-1))
> + lp[BITS_TO_LONGS(nbits)-1] = 0;
> + memcpy(lp, bp, (nbits+7)/8);
> +}
> +
> +static void __init
> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
> +{
> + unsigned int nr_cpus;
> + uint8_t *bytemap;
> +
> + bytemap = xcpumask->bits;
> +
> + nr_cpus =
> + min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
> +
> + cpumask_clear(cpumask);
> + bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
> +}
> +
> +static void __init
> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> + unsigned int i, j;
> + char vcpumaskstr[128];
> + printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
> + layout->max_vcpus, layout->max_vnodes);
No good. You need printk(KERN_DEBUG ..
> + switch (layout->type)
> + {
> + case XEN_DOM_NUMA_CONFINED:
> + printk("type(CONFINED)\n");
ditoo.
> + break;
> + case XEN_DOM_NUMA_SPLIT:
> + printk("type(SPLIT)\n");
ditto.
> + break;
> + case XEN_DOM_NUMA_STRIPED:
> + printk("type(STRIPED)\n");
ditto.
> + break;
> + default:
> + printk("type(UNDEFINED)\n");
ditto.
> + }
> +
> + for (i = 0; i < layout->max_vnodes; i++)
> + {
> + cpumask_t vcpu_mask;
> + struct xenctl_cpumask *xvcpumask;
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> + xvcpumask = &vnode_data->vcpu_mask;
> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
> + cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
> + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
ditto
> + vnode_data->vnode_id, vnode_data->mnode_id,
> + (unsigned long)vnode_data->nr_pages, vcpumaskstr);
> + }
> +
> + printk("vnode distances :\n");
ditoo
> + for (i = 0; i < layout->max_vnodes; i++)
> + printk("\tvnode[%u]", i);
ditto
> + for (i = 0; i < layout->max_vnodes; i++)
> + {
> + printk("\nvnode[%u]", i);
ditoo
> + for (j = 0; j < layout->max_vnodes; j++)
> + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> + printk("\n");
ditto
> + }
> + return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> + /* Construct a slit table (using layout->vnode_distance).
> + * Copy it to acpi_slit. */
> + return;
> +}
> +
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> + int vcpu, vnode;
> +
> + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
> +
> +
> + for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> + {
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> + struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
> + cpumask_t vcpu_mask;
> +
> + xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
> +
> + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> + {
> + if (cpu_isset(vcpu, vcpu_mask))
> + {
> + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> + {
> + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
> + vcpu, early_cpu_to_node(vcpu));
> + continue;
> + }
> + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> + numa_set_node(vcpu, vnode);
> + }
> + }
> + }
> + return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> + unsigned long start_pfn, unsigned long last_pfn)
> +{
> + int num_vnodes, i;
> + u64 node_start_addr, node_end_addr, max_addr;
> +
> + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> + layout->max_vnodes, layout->max_vcpus);
> +
> + if (layout->type != XEN_DOM_NUMA_SPLIT)
> + {
> + printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
> + return -1;
> + }
> +
> + memset(&nodes, 0, sizeof(nodes));
> +
> + num_vnodes = layout->max_vnodes;
> + BUG_ON(num_vnodes > MAX_NUMNODES);
> +
> + max_addr = last_pfn << PAGE_SHIFT;
> +
> + node_start_addr = start_pfn << PAGE_SHIFT;
> + for (i = 0; i < num_vnodes; i++)
> + {
> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> + if (i == (num_vnodes-1))
> + node_end_addr = max_addr;
> + else
> + node_end_addr = node_start_addr + node_size;
> + /* node_start_addr updated inside the function */
> + if (setup_node_range(i, nodes, &node_start_addr,
> + (node_end_addr-node_start_addr), max_addr+1))
> + goto failed;
> + }
> +
> + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
> +
> + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> + if (memnode_shift < 0) {
> + printk(KERN_ERR "No NUMA hash function found.\n");
> + goto failed;
> + }
> + /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> + /*
> + * We need to vacate all active ranges that may have been registered by
> + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> + * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
> + */
> + remove_all_active_ranges();
> +
> + BUG_ON(acpi_numa >= 0);
> + for_each_node_mask(i, node_possible_map) {
> + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> + nodes[i].end >> PAGE_SHIFT);
> + setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> + }
> + xen_init_slit_table(layout);
> + xen_init_numa_array(layout);
> + return 0;
> +failed:
> + return -1;
> +}
> +
> +static int __init
> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
> +{
> + int rc;
> + struct xenmem_numa_op memop;
> + memop.cmd = XENMEM_get_domain_numa_layout;
> + memop.u.dinfo.domid = DOMID_SELF;
> + memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
> + memop.u.dinfo.bufsize = sizeof(*pv_layout);
> + set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
> +
> + if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
> + {
> + printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> + goto done;
> + }
> +
> + if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
> + {
> + printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
> + xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> + rc = -1;
> + }
> + xen_dump_numa_layout(pv_layout);
> +done:
> + return rc;
> +}
> +
> +static int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> + int rc = 0;
> + if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
> + {
> + rc = -1;
> + printk(KERN_INFO "xen numa emulation disabled\n");
> + goto done;
The tabs/spaces don't look right. Can you run this patch through
checkpatch.pl?
> + }
> + if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
> + goto done;
> + rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
> +done:
> + return rc;
> +}
> +#else
> +static inline int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> + return -1;
> +}
> +#endif /* CONFIG_XEN_NUMA_GUEST */
> +/************************************************************************/
> #endif /* CONFIG_NUMA_EMU */
>
> void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> nodes_clear(node_online_map);
>
> #ifdef CONFIG_NUMA_EMU
> + if (!xen_pv_numa(start_pfn, last_pfn))
> + return;
> +
> if (cmdline && !numa_emulation(start_pfn, last_pfn))
> return;
> nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
> index 7675f9b..7cf6a4f 100644
> --- a/arch/x86/xen/Kconfig
> +++ b/arch/x86/xen/Kconfig
> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
> help
> Enable support for passing PCI devices through to
> unprivileged domains. (COMPLETELY UNTESTED)
> +
> +config XEN_NUMA_GUEST
> + bool "Enable support NUMA aware Xen domains"
> + depends on XEN && X86_64 && NUMA && NUMA_EMU
> + help
> + Enable support for NUMA aware Xen domains. With this
> + option, if the memory for the domain is allocated
> + from different memory nodes, the domain is made aware
> + of this through a Virtual NUMA enlightenment.
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index df3e84c..2ee9f0b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
> printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
> disable_acpi();
> }
> + acpi_numa = -1;
> + numa_off = 1;
> #endif
>
> /*
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 32ab005..7f5b85e 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
> */
> #define XENMEM_machine_memory_map 10
>
> +/* xen guest numa operations */
> +#define XENMEM_numa_op 15
> +
> +#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
> +#define XENCTL_NR_CPUS 64
> +#define XENCTL_BITS_PER_BYTE 8
> +#define XENCTL_BITS_TO_BYTES(bits) \
> + (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
> +
> +#define XENCTL_DECLARE_BITMAP(name,bits) \
> + uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
> +
> +/* Not used in guest */
> +#define XENMEM_machine_numa_layout 0x01
> +struct xenmem_node_data {
> + uint32_t node_id;
> + uint64_t node_memsize;
> + uint64_t node_memfree;
> + struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
> +};
> +
> +/* NUMA layout for the machine.
> + * Structure has to fit within a page. */
> +struct xenmem_machine_numa_layout {
> + uint32_t max_nodes;
> + /* Only (max_nodes*max_nodes) entries are filled */
> + GUEST_HANDLE(uint32_t) node_distance;
> + /* max_vnodes entries of xenmem_node_data type */
> + GUEST_HANDLE(void) node_data;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
> +
> +
> +#define XENMEM_machine_nodemap 0x02
> +struct xenmem_machine_nodemap {
> + /* On call the size of the available buffer */
> + uint32_t bufsize;
> +
> + /* memnode map parameters */
> + int32_t shift;
> + uint32_t mapsize;
> + GUEST_HANDLE(void) map;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XENMEM_set_domain_numa_layout 0x03
> +#define XENMEM_get_domain_numa_layout 0x04
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XEN_MAX_VNODES 8
> +
> +struct xen_vnode_data {
> + uint32_t vnode_id;
> + uint32_t mnode_id;
> + uint64_t nr_pages;
> + struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +
> +#define XEN_DOM_NUMA_CONFINED 0x01
> +#define XEN_DOM_NUMA_SPLIT 0x02
> +#define XEN_DOM_NUMA_STRIPED 0x03
> +struct xen_domain_numa_layout {
> + uint32_t version;
> + uint32_t type;
> +
> + uint32_t max_vcpus;
> + uint32_t max_vnodes;
> +
> + /* Only (max_vnodes*max_vnodes) entries are filled */
> + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> + /* Only (max_vnodes) entries are filled */
> + struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +
> +struct xenmem_domain_numa_layout {
> + domid_t domid;
> + uint32_t version;
> +
> + uint32_t bufsize;
> + GUEST_HANDLE(void) buf;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
> +
> +struct xenmem_numa_op {
> + uint32_t cmd;
> + union {
> + struct xenmem_machine_numa_layout minfo;
> + struct xenmem_machine_nodemap mnodemap;
> + struct xenmem_domain_numa_layout dinfo;
> + } u;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>
> /*
> * Prevent the balloon driver from changing the memory reservation
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 9ffaee0..17cb17d 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
> /* These flags are passed in the 'flags' field of start_info_t. */
> #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
> #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
> +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
> +#define SIF_NUMA_DOMAIN (1<<3) /* Is the domain NUMA aware ? */
> #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
>
> typedef uint64_t cpumap_t;
> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
> #define __mk_unsigned_long(x) x ## UL
> #define mk_unsigned_long(x) __mk_unsigned_long(x)
>
> +DEFINE_GUEST_HANDLE(uint32_t);
> DEFINE_GUEST_HANDLE(uint64_t);
>
> #else /* __ASSEMBLY__ */
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
next prev parent reply other threads:[~2010-04-05 14:16 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-04-04 19:30 [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes Dulloor
2010-04-05 14:16 ` Konrad Rzeszutek Wilk [this message]
2010-04-06 5:19 ` Dulloor
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100405141648.GC1227@phenom.dumpdata.com \
--to=konrad.wilk@oracle.com \
--cc=dulloor@gmail.com \
--cc=xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.