Re: [RFC] pv guest numa [RE: Host Numa informtion in dom0]

xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed

From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: Dulloor <dulloor@gmail.com>
Cc: Andre Przywara <andre.przywara@amd.com>,
	Ian Pratt <Ian.Pratt@eu.citrix.com>,
	"xen-devel@lists.xensource.com" <xen-devel@lists.xensource.com>,
	Keir Fraser <keir.fraser@eu.citrix.com>,
	"Nakajima, Jun" <jun.nakajima@intel.com>
Subject: Re: [RFC] pv guest numa [RE: Host Numa informtion in dom0]
Date: Tue, 16 Feb 2010 12:49:00 -0500	[thread overview]
Message-ID: <20100216174900.GB21067@phenom.dumpdata.com> (raw)
In-Reply-To: <940bcfd21002122225i2c79aad2q91ff6432faef3192@mail.gmail.com>

Run this patch through scripts/checkpatch.pl

On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote:
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 86bef0f..7a24070 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
>  	/*
>  	 * Parse SRAT to discover nodes.
>  	 */
> -	acpi_numa_init();
> +    if (acpi_numa > 0)
> +	    acpi_numa_init();

Why? Why can't we just try  acpi_numa_init()?

>  #endif
>  
>  	initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..14fa654 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,9 @@
>  #include <linux/ctype.h>
>  #include <linux/module.h>
>  #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
>  #include <linux/sched.h>
> +#include <xen/interface/xen.h>
>  
>  #include <asm/e820.h>
>  #include <asm/proto.h>
> @@ -19,6 +21,7 @@
>  #include <asm/numa.h>
>  #include <asm/acpi.h>
>  #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>

If one does not set CONFIG_XEN does this compile?
>  
>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>  EXPORT_SYMBOL(node_data);
> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>  	 */
>  	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>  		long n = simple_strtol(cmdline, NULL, 0);
> -

No need for this.
>  		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>  		if (num_nodes < 0)
>  			return num_nodes;
> @@ -522,16 +524,162 @@ out:
>  	numa_init_array();
>  	return 0;
>  }
> +struct xen_domain_numa_layout pv_numa_layout;
> +
> +void dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> +    unsigned int i, j;
> +    char vcpumask[128];
> +    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
> +                            layout->max_vcpus, layout->max_vnodes);

Redo the printk's. They need KERN_DEBUG
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        cpumask_scnprintf(vcpumask, sizeof(vcpumask), 
> +                                ((cpumask_t *)&vnode_data->vcpu_mask));
> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
> +                vnode_data->vnode_id, vnode_data->mnode_id,
> +                (unsigned long)vnode_data->nr_pages, vcpumask);

This one too.
> +    }
> +
> +    printk("vnode distances :\n");

and 
> +    for (i = 0; i < layout->max_vnodes; i++)
> +        printk("\tvnode[%u]", i);
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        printk("\nvnode[%u]", i);

this
> +        for (j = 0; j < layout->max_vnodes; j++)
> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> +        printk("\n");
one to.
> +    }
> +    return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> +    /* Construct a slit table (using layout->vnode_distance).
> +     * Copy it to acpi_slit. */
> +    return;
> +}
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> +	int vcpu, vnode;
> +   
> +	printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");

pr_debug instead?
> +
> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> +        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
> +   
> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> +        {
> +            if (cpu_isset(vcpu, vcpu_mask))
> +            {
> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> +                {
> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", 
> +                                        vcpu, early_cpu_to_node(vcpu)); 
> +                    continue;
> +                }
> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> +		        numa_set_node(vcpu, vnode);
> +            }
> +        }
> +    }
> +    return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> +                            unsigned long start_pfn, unsigned long last_pfn)
> +{
> +	int num_vnodes, i;
> +    u64 node_start_addr, node_end_addr, max_addr;
> +
> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> +                                        layout->max_vnodes, layout->max_vcpus);
> +    dump_numa_layout(layout);
> +	memset(&nodes, 0, sizeof(nodes));
> +
> +    num_vnodes = layout->max_vnodes;
> +    BUG_ON(num_vnodes > MAX_NUMNODES);

Hmm.. Is that really neccessary? What if we just do WARN("some lengthy explanation"),
and bail out and not initialize these structures?
> +
> +    max_addr = last_pfn << PAGE_SHIFT;
> +
> +    node_start_addr = start_pfn << PAGE_SHIFT;
> +    for (i = 0; i < num_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> +		node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> +		if (i == (num_vnodes-1))
> +			node_end_addr = max_addr;
> +		else
> +        {
> +            node_end_addr = node_start_addr + node_size;
> +			while ((node_end_addr - node_start_addr - 
> +                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
> +            {
> +                node_end_addr += FAKE_NODE_MIN_SIZE;
> +				if (node_end_addr > max_addr) {
> +					node_end_addr = max_addr;
> +					break;
> +				}
> +			}
> +        }
> +        /* node_start_addr updated inside the function */
> +        if (setup_node_range(i, nodes, &node_start_addr, 
> +                    (node_end_addr-node_start_addr), max_addr+1))
> +            BUG();
> +    }
> +
> +	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");

Is this neccessary? Can be it be debug?
> +
> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> +    if (memnode_shift < 0) {
> +	    printk(KERN_ERR "No NUMA hash function found.\n");
> +        BUG();

Wow. BUG? What about just bailing out and unset xen_pv_emulation flag?

> +    }
> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> +	/*
> +	 * We need to vacate all active ranges that may have been registered by
> +	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> +	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
> +	 */
> +	remove_all_active_ranges();
> +
> +    BUG_ON(acpi_numa >= 0);
> +	for_each_node_mask(i, node_possible_map) {
> +		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> +						nodes[i].end >> PAGE_SHIFT);
> +		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> +	}
> +    xen_init_slit_table(layout);
> +	xen_init_numa_array(layout);
> +	return 0;
> +}
>  #endif /* CONFIG_NUMA_EMU */
>  
>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>  {
>  	int i;
> +    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
> +
> +    int xen_pv_numa_enabled = numa_layout->max_vnodes;
>  
>  	nodes_clear(node_possible_map);
>  	nodes_clear(node_online_map);
>  
>  #ifdef CONFIG_NUMA_EMU
> +    if (xen_pv_domain() && xen_pv_numa_enabled)
> +    {
> +        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
> +            return;
> +    }
> +
>  	if (cmdline && !numa_emulation(start_pfn, last_pfn))
>  		return;
>  	nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index ecb9b0d..b020555 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -83,6 +83,8 @@ void *xen_initial_gdt;
>   */
>  struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
>  
> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
> +
>  /*
>   * Flag to determine whether vcpu info placement is available on all
>   * VCPUs.  We assume it is to start with, and then set it to zero on
> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
>  	pv_cpu_ops.load_gdt = xen_load_gdt;
>  }
>  
> +extern struct xen_domain_numa_layout pv_numa_layout;
>  /* First C function to be called on Xen boot */
>  asmlinkage void __init xen_start_kernel(void)
>  {
> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
>  		xen_start_info->console.domU.evtchn = 0;
>  	}
>  
> +    {
> +        struct xen_domain_numa_layout *layout = 
> +            (void *)((char *)xen_start_info +
> +                        xen_start_info->numa_layout_info.info_off);
> +        memcpy(&pv_numa_layout, layout, sizeof(*layout));
> +    }

Shouldn't there be a check to see if the Xen actually exports
this structure? Otherwise you might copy garbage in and set
pv_numa_layout values to random values.

>  	xen_raw_console_write("about to get started...\n");
>  
>  	/* Start the world */
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 612f2c9..cb944a2 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
>  		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>  		disable_acpi();
>  	}
> +
> +    acpi_numa = -1;
> +    numa_off = 1;
>  #endif
>  
>  	/* 
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 812ffd5..d4588fa 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -398,6 +398,53 @@ struct shared_info {
>  
>  };
>  
> +#define XEN_NR_CPUS 64
No way to share this with some other variable? Like NR_CPUS?

> +#if defined(__i386__)
> +#define XEN_BITS_PER_LONG 32
> +#define XEN_BYTES_PER_LONG 4

There gotta be some of these defined in other headers.
> +#define XEN_LONG_BYTEORDER 2
> +#elif defined(__x86_64__)
> +#define XEN_BITS_PER_LONG 64
> +#define XEN_BYTES_PER_LONG 8
> +#define XEN_LONG_BYTEORDER 3
> +#endif
> +
> +/* same as cpumask_t - in xen and even Linux (for now) */
> +#define XEN_BITS_TO_LONGS(bits) \
> +    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
> +#define XEN_DECLARE_BITMAP(name,bits) \
> +    unsigned long name[XEN_BITS_TO_LONGS(bits)]

I am pretty sure there are macros for this laready.
> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
> +#ifndef __XEN__

Is that neccessary? typedefs are frowned upon in kernel.
> +typedef struct xen_cpumask xen_cpumask_t;
> +#endif
> +
> +#define XEN_MAX_VNODES 8
> +struct xen_vnode_data {
> +    uint32_t vnode_id;
> +    uint32_t mnode_id;
> +    uint64_t nr_pages;
> +    /* XXX: Can we use this in xen<->domain interfaces ? */
> +    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +#ifndef __XEN__
> +typedef struct xen_vnode_data xen_vnode_data_t;

Ditto.
> +#endif
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +struct xen_domain_numa_layout {
> +    uint32_t max_vcpus;
> +    uint32_t max_vnodes;
> +
> +    /* Only (max_vnodes*max_vnodes) entries are filled */
> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +#ifndef __XEN__
> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;

Ditto.
> +#endif
> +
>  /*
>   * Start-of-day memory layout for the initial domain (DOM0):
>   *  1. The domain is started within contiguous virtual-memory region.
> @@ -449,6 +496,13 @@ struct start_info {
>  	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
>  	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
>  	int8_t cmd_line[MAX_GUEST_CMDLINE];
> +    /* The pfn range here covers both page table and p->m table frames.   */
> +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */

Why is this added in this patch? That doesn't look to be used by your
patch.
> +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */

Ditto.
> +    struct {
> +        uint32_t info_off;  /* Offset of console_info struct.         *
> +        uint32_t info_size; /* Size of console_info struct from start.*/

Wrong comment.
> +    } numa_layout_info;

>  };
>  
>  struct dom0_vga_console_info {

next prev parent reply	other threads:[~2010-02-16 17:49 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-02-13  6:25 [RFC] pv guest numa [RE: Host Numa informtion in dom0] Dulloor
2010-02-16  1:15 ` Dan Magenheimer
2010-02-16 17:49 ` Konrad Rzeszutek Wilk [this message]
2010-02-18 16:24   ` Dulloor

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100216174900.GB21067@phenom.dumpdata.com \
    --to=konrad.wilk@oracle.com \
    --cc=Ian.Pratt@eu.citrix.com \
    --cc=andre.przywara@amd.com \
    --cc=dulloor@gmail.com \
    --cc=jun.nakajima@intel.com \
    --cc=keir.fraser@eu.citrix.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).