xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: Dulloor <dulloor@gmail.com>
Cc: xen-devel@lists.xensource.com
Subject: Re: [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
Date: Mon, 5 Apr 2010 10:16:48 -0400	[thread overview]
Message-ID: <20100405141648.GC1227@phenom.dumpdata.com> (raw)
In-Reply-To: <k2p940bcfd21004041230wdde55c1fie4647bef37da36b8@mail.gmail.com>


It looks as if the comments I provided in 
<20100216174900.GB21067@phenom.dumpdata.com> were ignored.

http://article.gmane.org/gmane.comp.emulators.xen.devel/78118

Here I am repeating some of them and adding some new ones.
Please address/answer them.

.. snip..

> -	acpi_numa_init();
> +    if (acpi_numa > 0)
> +	    acpi_numa_init();

The style guide is to use tabs, not spaces. It looks out of place.
Why not just do acpi_numa_init() by itself?


>  #endif
>  
>  	initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..3a856dc 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,11 @@
>  #include <linux/ctype.h>
>  #include <linux/module.h>
>  #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
>  #include <linux/sched.h>
> +#include <linux/bitops.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
>  
>  #include <asm/e820.h>
>  #include <asm/proto.h>
> @@ -19,6 +23,8 @@
>  #include <asm/numa.h>
>  #include <asm/acpi.h>
>  #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>
> +#include <asm/xen/hypercall.h>
>  
>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>  EXPORT_SYMBOL(node_data);
> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>  	 */
>  	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>  		long n = simple_strtol(cmdline, NULL, 0);
> -

Uhh, why?
>  		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>  		if (num_nodes < 0)
>  			return num_nodes;
> @@ -522,6 +527,246 @@ out:
>  	numa_init_array();
>  	return 0;
>  }
> +
> +/************************************************************************/
> +#ifdef CONFIG_XEN_NUMA_GUEST

Yikes. Can you move all of this code into its own file so that there is
no need to sprinkle this with #ifdefs?

> +/* XEN PV GUEST NUMA */
> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
> +
> +static inline void __init
> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
> +{
> +	/* We may need to pad the final longword with zeroes. */
> +	if (nbits & (BITS_PER_LONG-1))
> +		lp[BITS_TO_LONGS(nbits)-1] = 0;
> +	memcpy(lp, bp, (nbits+7)/8);
> +}
> +
> +static void __init
> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
> +{
> +    unsigned int nr_cpus;
> +    uint8_t *bytemap;
> +
> +    bytemap = xcpumask->bits;
> +
> +    nr_cpus =
> +        min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
> +
> +    cpumask_clear(cpumask);
> +    bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
> +}
> +
> +static void __init
> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> +    unsigned int i, j;
> +    char vcpumaskstr[128];
> +    printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
> +                            layout->max_vcpus, layout->max_vnodes);

No good. You need printk(KERN_DEBUG ..
> +    switch (layout->type)
> +    {
> +        case XEN_DOM_NUMA_CONFINED:
> +            printk("type(CONFINED)\n");
ditto.
> +            break;
> +        case XEN_DOM_NUMA_SPLIT:
> +            printk("type(SPLIT)\n");
ditto.
> +            break;
> +        case XEN_DOM_NUMA_STRIPED:
> +            printk("type(STRIPED)\n");
ditto.
> +            break;
> +        default:
> +            printk("type(UNDEFINED)\n");

ditto.
> +    }
> +
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        cpumask_t vcpu_mask;
> +        struct xenctl_cpumask *xvcpumask;
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        xvcpumask = &vnode_data->vcpu_mask;
> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
> +        cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
ditto
> +                vnode_data->vnode_id, vnode_data->mnode_id,
> +                (unsigned long)vnode_data->nr_pages, vcpumaskstr);
> +    }
> +
> +    printk("vnode distances :\n");
ditto
> +    for (i = 0; i < layout->max_vnodes; i++)
> +        printk("\tvnode[%u]", i);
ditto
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        printk("\nvnode[%u]", i);
ditto
> +        for (j = 0; j < layout->max_vnodes; j++)
> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> +        printk("\n");
ditto
> +    }
> +    return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> +    /* Construct a slit table (using layout->vnode_distance).
> +     * Copy it to acpi_slit. */
> +    return;
> +}
> +
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> +	int vcpu, vnode;
> +   
> +	printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
> +
> +
> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> +        struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
> +        cpumask_t vcpu_mask;
> +
> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
> +   
> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> +        {
> +            if (cpu_isset(vcpu, vcpu_mask))
> +            {
> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> +                {
> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", 
> +                                        vcpu, early_cpu_to_node(vcpu)); 
> +                    continue;
> +                }
> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> +		        numa_set_node(vcpu, vnode);
> +            }
> +        }
> +    }
> +    return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> +                            unsigned long start_pfn, unsigned long last_pfn)
> +{
> +	int num_vnodes, i;
> +    u64 node_start_addr, node_end_addr, max_addr;
> +
> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> +                                        layout->max_vnodes, layout->max_vcpus);
> +
> +    if (layout->type != XEN_DOM_NUMA_SPLIT)
> +    {
> +        printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
> +        return -1;
> +    }
> +
> +	memset(&nodes, 0, sizeof(nodes));
> +
> +    num_vnodes = layout->max_vnodes;
> +    BUG_ON(num_vnodes > MAX_NUMNODES);
> +
> +    max_addr = last_pfn << PAGE_SHIFT;
> +
> +    node_start_addr = start_pfn << PAGE_SHIFT;
> +    for (i = 0; i < num_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> +		node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> +		if (i == (num_vnodes-1))
> +			node_end_addr = max_addr;
> +		else
> +            node_end_addr = node_start_addr + node_size;
> +        /* node_start_addr updated inside the function */
> +        if (setup_node_range(i, nodes, &node_start_addr, 
> +                    (node_end_addr-node_start_addr), max_addr+1))
> +            goto failed;
> +    }
> +
> +	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
> +
> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> +    if (memnode_shift < 0) {
> +	    printk(KERN_ERR "No NUMA hash function found.\n");
> +        goto failed;
> +    }
> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> +	/*
> +	 * We need to vacate all active ranges that may have been registered by
> +	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> +	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
> +	 */
> +	remove_all_active_ranges();
> +
> +    BUG_ON(acpi_numa >= 0);
> +	for_each_node_mask(i, node_possible_map) {
> +		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> +						nodes[i].end >> PAGE_SHIFT);
> +		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> +	}
> +    xen_init_slit_table(layout);
> +	xen_init_numa_array(layout);
> +	return 0;
> +failed:
> +    return -1;
> +}
> +
> +static int __init
> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
> +{
> +    int rc;
> +    struct xenmem_numa_op memop;
> +    memop.cmd = XENMEM_get_domain_numa_layout;
> +    memop.u.dinfo.domid = DOMID_SELF;
> +    memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
> +    memop.u.dinfo.bufsize = sizeof(*pv_layout);
> +    set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
> +
> +    if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
> +    {
> +        printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> +        goto done;
> +    }
> +
> +    if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
> +    {
> +        printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
> +        rc = -1;
> +    }
> +    xen_dump_numa_layout(pv_layout);
> +done:
> +    return rc;
> +}
> +
> +static int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> +    int rc = 0;
> +    if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
> +    {
> +        rc = -1;
> +	    printk(KERN_INFO "xen numa emulation disabled\n");
> +        goto done;

The tabs/spaces don't look right. Can you run this patch through
checkpatch.pl?

> +    }
> +    if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
> +        goto done;
> +    rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
> +done:
> +    return rc;
> +}
> +#else
> +static inline int __init
> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
> +{
> +    return -1;
> +}
> +#endif /* CONFIG_XEN_NUMA_GUEST */
> +/************************************************************************/
>  #endif /* CONFIG_NUMA_EMU */
>  
>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>  	nodes_clear(node_online_map);
>  
>  #ifdef CONFIG_NUMA_EMU
> +    if (!xen_pv_numa(start_pfn, last_pfn))
> +        return;
> +
>  	if (cmdline && !numa_emulation(start_pfn, last_pfn))
>  		return;
>  	nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
> index 7675f9b..7cf6a4f 100644
> --- a/arch/x86/xen/Kconfig
> +++ b/arch/x86/xen/Kconfig
> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
>         help
>           Enable support for passing PCI devices through to
>  	 unprivileged domains. (COMPLETELY UNTESTED)
> +
> +config XEN_NUMA_GUEST
> +    bool "Enable support NUMA aware Xen domains"
> +    depends on XEN && X86_64 && NUMA && NUMA_EMU
> +    help
> +        Enable support for NUMA aware Xen domains. With this
> +        option, if the memory for the domain is allocated 
> +        from different memory nodes, the domain is made aware
> +        of this through a Virtual NUMA enlightenment.
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index df3e84c..2ee9f0b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
>  		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>  		disable_acpi();
>  	}
> +    acpi_numa = -1;
> +    numa_off = 1;
>  #endif
>  
>  	/* 
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 32ab005..7f5b85e 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
>   */
>  #define XENMEM_machine_memory_map   10
>  
> +/* xen guest numa operations */
> +#define XENMEM_numa_op           15
> +
> +#define XEN_DOM_NUMA_INTERFACE_VERSION  0x00000001
> +#define XENCTL_NR_CPUS 64
> +#define XENCTL_BITS_PER_BYTE 8
> +#define XENCTL_BITS_TO_BYTES(bits) \
> +    (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
> +
> +#define XENCTL_DECLARE_BITMAP(name,bits) \
> +    uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
> +
> +/* Not used in guest */
> +#define XENMEM_machine_numa_layout 0x01
> +struct xenmem_node_data {
> +    uint32_t node_id;
> +    uint64_t node_memsize;
> +    uint64_t node_memfree;
> +    struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
> +};
> +
> +/* NUMA layout for the machine.
> + * Structure has to fit within a page. */
> +struct xenmem_machine_numa_layout {
> +    uint32_t max_nodes;
> +    /* Only (max_nodes*max_nodes) entries are filled */
> +    GUEST_HANDLE(uint32_t) node_distance;
> +    /* max_vnodes entries of xenmem_node_data type */
> +    GUEST_HANDLE(void) node_data;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
> +
> +
> +#define XENMEM_machine_nodemap  0x02
> +struct xenmem_machine_nodemap {
> +    /* On call the size of the available buffer */
> +    uint32_t bufsize;
> +
> +    /* memnode map parameters */
> +    int32_t shift;
> +    uint32_t mapsize;
> +    GUEST_HANDLE(void) map;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XENMEM_set_domain_numa_layout 0x03
> +#define XENMEM_get_domain_numa_layout 0x04
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +#define XEN_MAX_VNODES 8
> +
> +struct xen_vnode_data {
> +    uint32_t vnode_id;
> +    uint32_t mnode_id;
> +    uint64_t nr_pages;
> +    struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +
> +#define XEN_DOM_NUMA_CONFINED       0x01
> +#define XEN_DOM_NUMA_SPLIT          0x02
> +#define XEN_DOM_NUMA_STRIPED         0x03
> +struct xen_domain_numa_layout {
> +    uint32_t version;
> +    uint32_t type;
> +
> +    uint32_t max_vcpus;
> +    uint32_t max_vnodes;
> +
> +    /* Only (max_vnodes*max_vnodes) entries are filled */
> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> +    /* Only (max_vnodes) entries are filled */
> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +
> +struct xenmem_domain_numa_layout {
> +    domid_t domid;
> +    uint32_t version;
> +
> +    uint32_t bufsize;
> +    GUEST_HANDLE(void) buf;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
> +
> +struct xenmem_numa_op {
> +    uint32_t cmd;
> +    union {
> +        struct xenmem_machine_numa_layout minfo;
> +        struct xenmem_machine_nodemap mnodemap;
> +        struct xenmem_domain_numa_layout dinfo;
> +    } u;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>  
>  /*
>   * Prevent the balloon driver from changing the memory reservation
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 9ffaee0..17cb17d 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
>  /* These flags are passed in the 'flags' field of start_info_t. */
>  #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
>  #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
> +#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
> +#define SIF_NUMA_DOMAIN   (1<<3)  /* Is the domain NUMA aware ? */
>  #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
>  
>  typedef uint64_t cpumap_t;
> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
>  #define __mk_unsigned_long(x) x ## UL
>  #define mk_unsigned_long(x) __mk_unsigned_long(x)
>  
> +DEFINE_GUEST_HANDLE(uint32_t);
>  DEFINE_GUEST_HANDLE(uint64_t);
>  
>  #else /* __ASSEMBLY__ */

> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

  reply	other threads:[~2010-04-05 14:16 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-04 19:30 [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes Dulloor
2010-04-05 14:16 ` Konrad Rzeszutek Wilk [this message]
2010-04-06  5:19   ` Dulloor

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100405141648.GC1227@phenom.dumpdata.com \
    --to=konrad.wilk@oracle.com \
    --cc=dulloor@gmail.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).