xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [RFC] pv guest numa [RE: Host Numa information in dom0]
@ 2010-02-13  6:25 Dulloor
  2010-02-16  1:15 ` Dan Magenheimer
  2010-02-16 17:49 ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 4+ messages in thread
From: Dulloor @ 2010-02-13  6:25 UTC (permalink / raw)
  To: Ian Pratt
  Cc: Andre Przywara, xen-devel@lists.xensource.com, Nakajima, Jun,
	Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 2429 bytes --]

I am attaching (RFC) patches for NUMA-aware pv guests.

* The patch adds hypervisor interfaces to export minimal numa-related
information about the memory of a pv domain, which can then be used to
set up the node ranges, virtual cpu<->node maps, and virtual SLIT
tables in the pv domain.
* The guest-domain also maintains a mapping between its vnodes and
mnodes(actual machine nodes). These mappings can be used in the memory
operations, such as those in ballooning.
* In the patch, dom0 is made numa-aware using these interfaces. Other
domains should be simpler. I am in the process of adding python
interfaces for this. And, this would work with any node selection
policy.
* The patch is tested only for 64-on-64 (on x86_64)

* Along with the following other patches, this could provide a good
solution for numa-aware guests -
- numa-aware ballooning  (previously posted by me on xen-devel)
- Andre's patch for HVM domains (posted by Andre recently)

I am in the process of making other places of dynamic memory
mgmt/operations numa-aware - tmem, memory exchange operations, etc.

Please let me know your comments.

-dulloor

On Thu, Feb 11, 2010 at 10:21 AM, Ian Pratt <Ian.Pratt@eu.citrix.com> wrote:
>> > If guest NUMA is disabled, we just use a single node mask which is the
>> > union of the per-VCPU node masks.
>> >
>> > Where allowed node masks span more than one physical node, we should
>> > allocate memory to the guest's virtual node by pseudo randomly striping
>> > memory allocations (in 2MB chunks) from across the specified physical
>> > nodes. [pseudo random is probably better than round robin]
>>
>> Do we really want to support this? I don't think the allowed node masks
>> should span more than one physical NUMA node. We also need to look at I/O
>> devices as well.
>
> Given that we definitely need this striping code in the case where the guest is non NUMA, I'd be inclined to still allow it to be used even if the guest has multiple NUMA nodes. It could come in handy where there is a hierarchy between physical NUMA nodes, enabling for example striping to be used between a pair of 'close' nodes, while exposing the higher-level topology of sets of the paired nodes to be exposed to the guest.
>
> Ian
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

[-- Attachment #2: guest-numa-linux.patch --]
[-- Type: text/x-diff, Size: 10709 bytes --]

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 86bef0f..7a24070 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+    if (acpi_numa > 0)
+	    acpi_numa_init();
 #endif
 
 	initmem_init(0, max_pfn);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..14fa654 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,7 +11,9 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/cpumask.h>
 #include <linux/sched.h>
+#include <xen/interface/xen.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -19,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/k8.h>
+#include <asm/xen/hypervisor.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
-
 		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 		if (num_nodes < 0)
 			return num_nodes;
@@ -522,16 +524,162 @@ out:
 	numa_init_array();
 	return 0;
 }
+struct xen_domain_numa_layout pv_numa_layout;
+
+void dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+    unsigned int i, j;
+    char vcpumask[128];
+    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
+                            layout->max_vcpus, layout->max_vnodes);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+        cpumask_scnprintf(vcpumask, sizeof(vcpumask), 
+                                ((cpumask_t *)&vnode_data->vcpu_mask));
+        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
+                vnode_data->vnode_id, vnode_data->mnode_id,
+                (unsigned long)vnode_data->nr_pages, vcpumask);
+    }
+
+    printk("vnode distances :\n");
+    for (i = 0; i < layout->max_vnodes; i++)
+        printk("\tvnode[%u]", i);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        printk("\nvnode[%u]", i);
+        for (j = 0; j < layout->max_vnodes; j++)
+            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+        printk("\n");
+    }
+    return;
+}
+
+static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
+{
+    /* Construct a slit table (using layout->vnode_distance).
+     * Copy it to acpi_slit. */
+    return;
+}
+/* Distribute the vcpus over the vnodes according to their affinity */
+static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
+{
+	int vcpu, vnode;
+   
+	printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
+
+    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
+        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
+   
+        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
+        {
+            if (cpu_isset(vcpu, vcpu_mask))
+            {
+                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
+                {
+                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", 
+                                        vcpu, early_cpu_to_node(vcpu)); 
+                    continue;
+                }
+                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
+		        numa_set_node(vcpu, vnode);
+            }
+        }
+    }
+    return;
+}
+
+static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
+                            unsigned long start_pfn, unsigned long last_pfn)
+{
+	int num_vnodes, i;
+    u64 node_start_addr, node_end_addr, max_addr;
+
+    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
+                                        layout->max_vnodes, layout->max_vcpus);
+    dump_numa_layout(layout);
+	memset(&nodes, 0, sizeof(nodes));
+
+    num_vnodes = layout->max_vnodes;
+    BUG_ON(num_vnodes > MAX_NUMNODES);
+
+    max_addr = last_pfn << PAGE_SHIFT;
+
+    node_start_addr = start_pfn << PAGE_SHIFT;
+    for (i = 0; i < num_vnodes; i++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
+
+		node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
+
+		if (i == (num_vnodes-1))
+			node_end_addr = max_addr;
+		else
+        {
+            node_end_addr = node_start_addr + node_size;
+			while ((node_end_addr - node_start_addr - 
+                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
+            {
+                node_end_addr += FAKE_NODE_MIN_SIZE;
+				if (node_end_addr > max_addr) {
+					node_end_addr = max_addr;
+					break;
+				}
+			}
+        }
+        /* node_start_addr updated inside the function */
+        if (setup_node_range(i, nodes, &node_start_addr, 
+                    (node_end_addr-node_start_addr), max_addr+1))
+            BUG();
+    }
+
+	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
+
+    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
+    if (memnode_shift < 0) {
+	    printk(KERN_ERR "No NUMA hash function found.\n");
+        BUG();
+    }
+    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 */
+	remove_all_active_ranges();
+
+    BUG_ON(acpi_numa >= 0);
+	for_each_node_mask(i, node_possible_map) {
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+						nodes[i].end >> PAGE_SHIFT);
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	}
+    xen_init_slit_table(layout);
+	xen_init_numa_array(layout);
+	return 0;
+}
 #endif /* CONFIG_NUMA_EMU */
 
 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 {
 	int i;
+    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
+
+    int xen_pv_numa_enabled = numa_layout->max_vnodes;
 
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+    if (xen_pv_domain() && xen_pv_numa_enabled)
+    {
+        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
+            return;
+    }
+
 	if (cmdline && !numa_emulation(start_pfn, last_pfn))
 		return;
 	nodes_clear(node_possible_map);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ecb9b0d..b020555 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -83,6 +83,8 @@ void *xen_initial_gdt;
  */
 struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 
+struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
+
 /*
  * Flag to determine whether vcpu info placement is available on all
  * VCPUs.  We assume it is to start with, and then set it to zero on
@@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
+extern struct xen_domain_numa_layout pv_numa_layout;
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
 		xen_start_info->console.domU.evtchn = 0;
 	}
 
+    {
+        struct xen_domain_numa_layout *layout = 
+            (void *)((char *)xen_start_info +
+                        xen_start_info->numa_layout_info.info_off);
+        memcpy(&pv_numa_layout, layout, sizeof(*layout));
+    }
 	xen_raw_console_write("about to get started...\n");
 
 	/* Start the world */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 612f2c9..cb944a2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
 		disable_acpi();
 	}
+
+    acpi_numa = -1;
+    numa_off = 1;
 #endif
 
 	/* 
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 812ffd5..d4588fa 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -398,6 +398,53 @@ struct shared_info {
 
 };
 
+#define XEN_NR_CPUS 64
+#if defined(__i386__)
+#define XEN_BITS_PER_LONG 32
+#define XEN_BYTES_PER_LONG 4
+#define XEN_LONG_BYTEORDER 2
+#elif defined(__x86_64__)
+#define XEN_BITS_PER_LONG 64
+#define XEN_BYTES_PER_LONG 8
+#define XEN_LONG_BYTEORDER 3
+#endif
+
+/* same as cpumask_t - in xen and even Linux (for now) */
+#define XEN_BITS_TO_LONGS(bits) \
+    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
+#define XEN_DECLARE_BITMAP(name,bits) \
+    unsigned long name[XEN_BITS_TO_LONGS(bits)]
+struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
+#ifndef __XEN__
+typedef struct xen_cpumask xen_cpumask_t;
+#endif
+
+#define XEN_MAX_VNODES 8
+struct xen_vnode_data {
+    uint32_t vnode_id;
+    uint32_t mnode_id;
+    uint64_t nr_pages;
+    /* XXX: Can we use this in xen<->domain interfaces ? */
+    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
+};
+#ifndef __XEN__
+typedef struct xen_vnode_data xen_vnode_data_t;
+#endif
+
+/* NUMA layout for the domain at the time of startup.
+ * Structure has to fit within a page. */
+struct xen_domain_numa_layout {
+    uint32_t max_vcpus;
+    uint32_t max_vnodes;
+
+    /* Only (max_vnodes*max_vnodes) entries are filled */
+    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+#ifndef __XEN__
+typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
+#endif
+
 /*
  * Start-of-day memory layout for the initial domain (DOM0):
  *  1. The domain is started within contiguous virtual-memory region.
@@ -449,6 +496,13 @@ struct start_info {
 	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
 	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
 	int8_t cmd_line[MAX_GUEST_CMDLINE];
+    /* The pfn range here covers both page table and p->m table frames.   */
+    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
+    struct {
+        uint32_t info_off;  /* Offset of numa_layout_info struct.     */
+        uint32_t info_size; /* Size of numa_layout_info struct.       */
+    } numa_layout_info;
 };
 
 struct dom0_vga_console_info {

[-- Attachment #3: guest-numa-xen.patch --]
[-- Type: text/x-diff, Size: 25827 bytes --]

diff -r c0e32941ee69 tools/include/xen-foreign/reference.size
--- a/tools/include/xen-foreign/reference.size	Wed Nov 25 14:19:50 2009 +0000
+++ b/tools/include/xen-foreign/reference.size	Sat Feb 13 00:55:44 2010 -0500
@@ -1,7 +1,7 @@
 
 structs                   |  x86_32  x86_64    ia64
 
-start_info                |    1112    1168    1168
+start_info                |    1120    1176    1176
 trap_info                 |       8      16       -
 pt_fpreg                  |       -       -      16
 cpu_user_regs             |      68     200       -
diff -r c0e32941ee69 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/arch/x86/domain_build.c	Sat Feb 13 00:55:44 2010 -0500
@@ -31,6 +31,7 @@
 #include <asm/e820.h>
 #include <asm/acpi.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
+#include <xen/nodemask.h>
 
 #include <public/version.h>
 
@@ -122,8 +123,10 @@
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
 
+#define DOM0_BOOT_NODE 0
+#define XEN_MEMF_exact_node(n) (MEMF_node(n) | MEMF_exact_node)
 static struct page_info * __init alloc_chunk(
-    struct domain *d, unsigned long max_pages)
+    struct domain *d, unsigned long max_pages, unsigned int node)
 {
     struct page_info *page;
     unsigned int order;
@@ -136,12 +139,14 @@
     order = get_order_from_pages(max_pages);
     if ( (max_pages & (max_pages-1)) != 0 )
         order--;
-    while ( (page = alloc_domheap_pages(d, order, 0)) == NULL )
+    while ( (page = alloc_domheap_pages(d, order, XEN_MEMF_exact_node(node)))
+                                                                    == NULL )
         if ( order-- == 0 )
-            break;
+                break;
     return page;
 }
 
+
 static unsigned long __init compute_dom0_nr_pages(
 #ifdef __x86_64__
     unsigned long vstart, unsigned long vend, size_t sizeof_long)
@@ -241,6 +246,37 @@
     }
 }
 
+void dump_numa_layout(struct xen_domain_numa_layout *layout)
+{
+    unsigned int i, j;
+    char vcpumask[128];
+    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
+                            layout->max_vcpus, layout->max_vnodes);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
+        cpumask_scnprintf(vcpumask, sizeof(vcpumask), 
+                                *((cpumask_t *)&vnode_data->vcpu_mask));
+        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
+                vnode_data->vnode_id, vnode_data->mnode_id,
+                vnode_data->nr_pages, vcpumask);
+    }
+
+    printk("vnode distances :\n");
+    for (i = 0; i < layout->max_vnodes; i++)
+        printk("\tvnode[%u]", i);
+    for (i = 0; i < layout->max_vnodes; i++)
+    {
+        printk("\nvnode[%u]", i);
+        for (j = 0; j < layout->max_vnodes; j++)
+            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
+        printk("\n");
+    }
+    return;
+}
+
+struct xen_domain_numa_layout dom0_numa_layout;
+
 int __init construct_dom0(
     struct domain *d,
     unsigned long _image_base,
@@ -258,6 +294,7 @@
     unsigned long count;
     struct page_info *page = NULL;
     start_info_t *si;
+    struct xen_domain_numa_layout *numa_layout;
     struct vcpu *v = d->vcpu[0];
     unsigned long long value;
 #if defined(__i386__)
@@ -381,7 +418,7 @@
 #else
     nr_pages = compute_dom0_nr_pages();
 #endif
-
+    
     if ( parms.pae == PAEKERN_extended_cr3 )
             set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
 
@@ -430,7 +467,8 @@
     vstartinfo_start = round_pgup(vphysmap_end);
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
-                        sizeof(struct dom0_vga_console_info));
+                        sizeof(struct dom0_vga_console_info) +
+                        sizeof(struct xen_domain_numa_layout));
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -473,11 +511,12 @@
         vphysmap_start = parms.p2m_base;
         vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
     }
-    page = alloc_domheap_pages(d, order, 0);
+    page = alloc_domheap_pages(d, order, XEN_MEMF_exact_node(DOM0_BOOT_NODE));
 #endif
     if ( page == NULL )
         panic("Not enough RAM for domain 0 allocation.\n");
     alloc_spfn = page_to_mfn(page);
+    /* XXX: What happens in a layout with holes : pfn_pdx_hole_shift != 0 */
     alloc_epfn = alloc_spfn + d->tot_pages;
 
     printk("PHYSICAL MEMORY ARRANGEMENT:\n"
@@ -503,6 +542,47 @@
            _p(v_start), _p(v_end));
     printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
 
+#if 0
+    if (d->numa)
+#endif
+    {
+        int i, j;
+        cpumask_t *vcpumask;
+        numa_layout = &dom0_numa_layout;
+
+        numa_layout->max_vcpus = d->max_vcpus;
+        numa_layout->max_vnodes = num_online_nodes();
+
+        BUG_ON(DOM0_BOOT_NODE);
+#define NR_NODE_PAGES (nr_pages / num_online_nodes())
+        for (i = 0; i < numa_layout->max_vnodes; i++)
+        {
+            /* XXX: Dom0 eventually looks at the same e820 map, so is it 
+             * right to set Dom0's max_pfn to nr_pages ? */
+            struct xen_vnode_data *vnode_data = &numa_layout->vnode_data[i];
+            vnode_data->vnode_id = i;
+            /* We are allocating from all nodes starting at DOM0_BOOT_NODE(0) */
+            vnode_data->mnode_id = i;
+            vnode_data->nr_pages = NR_NODE_PAGES;
+        }
+#undef NR_NODE_PAGES
+        vcpumask = (cpumask_t *)&numa_layout->vnode_data[0].vcpu_mask;
+        /* VCPU0 is placed on DOM0_BOOT_NODE(0) */
+        cpu_set(0, *vcpumask);
+        
+        /* Fill up the vnode<->vnode distances */
+        for (i = 0; i < numa_layout->max_vnodes; i++)
+        {
+            uint32_t imnode = numa_layout->vnode_data[i].mnode_id;
+            for (j = 0; j < numa_layout->max_vnodes; j++)
+            {
+                uint32_t jmnode = numa_layout->vnode_data[j].mnode_id;
+                numa_layout->vnode_distance[(i*XEN_MAX_VNODES) + j] = 
+                                            node_distance(imnode, jmnode);
+            }
+        }
+    }
+
     mpt_alloc = (vpt_start - v_start) +
         (unsigned long)pfn_to_paddr(alloc_spfn);
 
@@ -625,7 +705,7 @@
     }
     else
     {
-        page = alloc_domheap_page(NULL, 0);
+        page = alloc_domheap_page(NULL, XEN_MEMF_exact_node(DOM0_BOOT_NODE));
         if ( !page )
             panic("Not enough RAM for domain 0 PML4.\n");
         page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
@@ -757,8 +837,24 @@
 
     printk("Dom0 has maximum %u VCPUs\n", opt_dom0_max_vcpus);
 
-    for ( i = 1; i < opt_dom0_max_vcpus; i++ )
-        (void)alloc_vcpu(d, i, i % num_online_cpus());
+    for (i = 1; i < d->max_vcpus; i++)
+    {
+        int processor;
+
+        /* Distribute the vcpus (over nodes or cpus) */
+        if((d->is_numa) && (d->max_vcpus >= num_online_nodes()))
+        {
+            int node = i/(d->max_vcpus/num_online_nodes());
+            cpumask_t *vcpumask = 
+                (cpumask_t *)&numa_layout->vnode_data[node].vcpu_mask;
+            cpu_set(i, *vcpumask);
+            processor = first_cpu(node_to_cpumask(node));
+        }
+        else
+            processor = i%num_online_cpus();
+
+        (void)alloc_vcpu(d, i, processor);
+    }
 
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
@@ -805,6 +901,14 @@
     si->pt_base      = vpt_start + 2 * PAGE_SIZE * !!is_pv_32on64_domain(d);
     si->nr_pt_frames = nr_pt_pages;
     si->mfn_list     = vphysmap_start;
+
+    si->numa_layout_info.info_off  = 
+        sizeof(struct start_info) + sizeof(struct dom0_vga_console_info);
+    si->numa_layout_info.info_size = sizeof(struct xen_domain_numa_layout);
+    numa_layout = (struct xen_domain_numa_layout *)
+                    (vstartinfo_start + si->numa_layout_info.info_off);
+    memcpy(numa_layout, &dom0_numa_layout, sizeof(*numa_layout));
+
     snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
@@ -827,7 +931,7 @@
             l4tab = l4start + l4_table_offset(va);
             if ( !l4e_get_intpte(*l4tab) )
             {
-                page = alloc_domheap_page(d, 0);
+                page = alloc_domheap_page(d, XEN_MEMF_exact_node(DOM0_BOOT_NODE));
                 if ( !page )
                     break;
                 /* No mapping, PGC_allocated + page-table page. */
@@ -847,14 +951,15 @@
                      (page = alloc_domheap_pages(d,
                                                  L3_PAGETABLE_SHIFT -
                                                      PAGE_SHIFT,
-                                                 0)) != NULL )
+                                              XEN_MEMF_exact_node(DOM0_BOOT_NODE))) != NULL )
                 {
                     *l3tab = l3e_from_page(page,
                                            L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
                     va += 1UL << L3_PAGETABLE_SHIFT;
                     continue;
                 }
-                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                if ( (page = alloc_domheap_page(d, XEN_MEMF_exact_node(DOM0_BOOT_NODE)))
+                                                                    == NULL )
                     break;
                 else
                 {
@@ -875,14 +980,15 @@
                      (page = alloc_domheap_pages(d,
                                                  L2_PAGETABLE_SHIFT -
                                                      PAGE_SHIFT,
-                                                 0)) != NULL )
+                                        XEN_MEMF_exact_node(DOM0_BOOT_NODE))) != NULL )
                 {
                     *l2tab = l2e_from_page(page,
                                            L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
                     va += 1UL << L2_PAGETABLE_SHIFT;
                     continue;
                 }
-                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                if ( (page = alloc_domheap_page(d, XEN_MEMF_exact_node(DOM0_BOOT_NODE)))
+                                                                    == NULL )
                     break;
                 else
                 {
@@ -897,7 +1003,7 @@
             l1tab = page_to_virt(l2e_get_page(*l2tab));
             l1tab += l1_table_offset(va);
             BUG_ON(l1e_get_intpte(*l1tab));
-            page = alloc_domheap_page(d, 0);
+            page = alloc_domheap_page(d, XEN_MEMF_exact_node(DOM0_BOOT_NODE));
             if ( !page )
                 break;
             *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
@@ -917,6 +1023,7 @@
 #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
         if ( pfn > REVERSE_START )
             mfn = alloc_epfn - (pfn - REVERSE_START);
+#undef REVERSE_START
 #endif
         if ( !is_pv_32on64_domain(d) )
             ((unsigned long *)vphysmap_start)[pfn] = mfn;
@@ -948,27 +1055,45 @@
         }
     }
     BUG_ON(pfn != d->tot_pages);
-    while ( pfn < nr_pages )
+
+{
+    unsigned int vnode;
+    for (vnode = 0; vnode < numa_layout->max_vnodes; vnode++)
     {
-        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
-            panic("Not enough RAM for DOM0 reservation.\n");
-        while ( pfn < d->tot_pages )
+        struct xen_vnode_data *vnode_data = &numa_layout->vnode_data[vnode];
+        unsigned int mnode = vnode_data->mnode_id;
+#define NR_NODE_PAGES (nr_pages / num_online_nodes())
+        unsigned long vnode_end_pfn = (vnode+1)*NR_NODE_PAGES;
+        if (pfn > vnode_end_pfn)
         {
-            mfn = page_to_mfn(page);
+                dump_numa_layout(numa_layout);
+                printk("pfn(%lx) > vnode_end_pfn(%lx)\n", pfn, vnode_end_pfn);
+                panic("pfn(%lx) > vnode_end_pfn(%lx)\n", pfn, vnode_end_pfn);
+        }
+#undef NR_NODE_PAGES 
+        while (pfn < vnode_end_pfn)
+        {
+            if (!(page = alloc_chunk(d, vnode_end_pfn - d->tot_pages, mnode)))
+                    panic("Not enough RAM for DOM0 reservation.\n");
+            while (pfn < d->tot_pages)
+            {
+                mfn = page_to_mfn(page);
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+                if ( !is_pv_32on64_domain(d) )
+                    ((unsigned long *)vphysmap_start)[pfn] = mfn;
+                else
+                    ((unsigned int *)vphysmap_start)[pfn] = mfn;
+                set_gpfn_from_mfn(mfn, pfn);
 #undef pfn
-            page++; pfn++;
-            if (!(pfn & 0xfffff))
-                process_pending_timers();
+                page++; pfn++;
+                if (!(pfn & 0xfffff))
+                    process_pending_timers();
+            }
         }
     }
+}
 
     if ( initrd_len != 0 )
     {
@@ -986,10 +1111,12 @@
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
 
+#if 0
 #if defined(__x86_64__)
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
 #endif
+#endif
 
     /* Return to idle domain's page tables. */
     write_ptbase(current);
diff -r c0e32941ee69 xen/arch/x86/e820.c
--- a/xen/arch/x86/e820.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/arch/x86/e820.c	Sat Feb 13 00:55:44 2010 -0500
@@ -647,3 +647,50 @@
 
     return find_max_pfn();
 }
+
+#define round_down(addr, mask)   ((addr) & ~(mask))
+#define round_up(addr, mask)     ((addr) | (mask))
+
+int __init e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn)
+{
+	unsigned long align_mask = (PAGE_SIZE-1);
+
+	*ei_startpfn = round_up(ei->addr, align_mask) >> PAGE_SHIFT;
+	*ei_endpfn = round_down(ei->addr + ei->size, align_mask) >> PAGE_SHIFT;
+
+	/* Skip map entries smaller than a page */
+	if (*ei_startpfn >= *ei_endpfn)
+		return 0;
+
+	/* Skip if map is outside the node */
+	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+				    *ei_startpfn >= last_pfn)
+		return 0;
+
+	/* Check for overlaps */
+	if (*ei_startpfn < start_pfn)
+		*ei_startpfn = start_pfn;
+	if (*ei_endpfn > last_pfn)
+		*ei_endpfn = last_pfn;
+
+	return 1;
+}
+
+unsigned long __init e820_hole_size(unsigned long start_pfn,
+                                                unsigned long last_pfn)
+{
+	unsigned long ei_startpfn, ei_endpfn, ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, last_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			ram += ei_endpfn - ei_startpfn;
+	}
+	return (last_pfn - start_pfn - ram);
+}
diff -r c0e32941ee69 xen/arch/x86/numa.c
--- a/xen/arch/x86/numa.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/arch/x86/numa.c	Sat Feb 13 00:55:44 2010 -0500
@@ -28,8 +28,10 @@
 
 struct node_data node_data[MAX_NUMNODES];
 
-int memnode_shift;
-u8  memnodemap[NODEMAPSIZE];
+struct memnode memnode = {.mapsize = NODEMAPSIZE};
+#define memnode_shift memnode.shift
+#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
 
 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
 	[0 ... NR_CPUS-1] = NUMA_NO_NODE
@@ -278,6 +280,9 @@
 EXPORT_SYMBOL(memnodemap);
 EXPORT_SYMBOL(node_data);
 
+extern struct xen_domain_numa_layout dom0_numa_layout;
+extern void dump_numa_layout(struct xen_domain_numa_layout *layout);
+
 static void dump_numa(unsigned char key)
 {
 	s_time_t now = NOW();
@@ -289,6 +294,8 @@
 	printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
 		  (u32)(now>>32), (u32)now);
 
+    dump_numa_layout(&dom0_numa_layout);
+
 	for_each_online_node(i) {
 		paddr_t pa = (paddr_t)(NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
 		printk("idx%d -> NODE%d start->%lu size->%lu\n",
diff -r c0e32941ee69 xen/common/compat/xlat.c
--- a/xen/common/compat/xlat.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/common/compat/xlat.c	Sat Feb 13 00:55:44 2010 -0500
@@ -8,6 +8,7 @@
 #include <compat/event_channel.h>
 #include <compat/vcpu.h>
 
+#if 0
 /* In-place translation functons: */
 void xlat_start_info(struct start_info *native,
                      enum XLAT_start_info_console console)
@@ -17,6 +18,7 @@
     BUILD_BUG_ON(sizeof(*native) < sizeof(*compat));
     XLAT_start_info(compat, native);
 }
+#endif
 
 void xlat_vcpu_runstate_info(struct vcpu_runstate_info *native)
 {
diff -r c0e32941ee69 xen/common/domain.c
--- a/xen/common/domain.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/common/domain.c	Sat Feb 13 00:55:44 2010 -0500
@@ -241,7 +241,10 @@
 
     if ( domid == 0 )
     {
+        /* should be opt_dom0_numa */
         d->is_pinned = opt_dom0_vcpus_pin;
+        d->is_numa = 1;
+        d->is_pinned = 1;
         d->disable_migrate = 1;
     }
 
diff -r c0e32941ee69 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/common/page_alloc.c	Sat Feb 13 00:55:44 2010 -0500
@@ -287,11 +287,17 @@
     unsigned int i, j, zone = 0;
     unsigned int num_nodes = num_online_nodes();
     unsigned long request = 1UL << order;
+    unsigned int exact_node_request;
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
 
-    if ( node == NUMA_NO_NODE )
+    if ( node == NUMA_NO_NODE ) {
         node = cpu_to_node(smp_processor_id());
+        exact_node_request = 0;
+    } else {
+        exact_node_request = (memflags & MEMF_exact_node); 
+    }
+    
 
     ASSERT(node >= 0);
     ASSERT(zone_lo <= zone_hi);
@@ -321,6 +327,8 @@
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
+        if (exact_node_request)
+            goto not_found;
         /* Pick next node, wrapping around if needed. */
         node = next_node(node, node_online_map);
         if (node == MAX_NUMNODES)
@@ -335,6 +343,7 @@
         return pg;
     }
 
+not_found:
     /* No suitable memory blocks. Fail the request. */
     spin_unlock(&heap_lock);
     return NULL;
@@ -834,6 +843,11 @@
     return free_pages;
 }
 
+unsigned long avail_node_heap_pages(unsigned int node)
+{
+    return avail_heap_pages(MEMZONE_XEN, NR_ZONES-1, node);
+}
+
 void __init end_boot_allocator(void)
 {
     unsigned int i;
@@ -1188,9 +1202,7 @@
 
 unsigned long avail_domheap_pages(void)
 {
-    return avail_heap_pages(MEMZONE_XEN + 1,
-                            NR_ZONES - 1,
-                            -1);
+    return avail_heap_pages(MEMZONE_XEN + 1, NR_ZONES - 1, -1);
 }
 
 static void pagealloc_info(unsigned char key)
diff -r c0e32941ee69 xen/common/schedule.c
--- a/xen/common/schedule.c	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/common/schedule.c	Sat Feb 13 00:55:44 2010 -0500
@@ -33,6 +33,7 @@
 #include <xen/multicall.h>
 #include <public/sched.h>
 #include <xsm/xsm.h>
+#include <asm/numa.h>
 
 /* opt_sched: scheduler - default to credit */
 static char __initdata opt_sched[10] = "credit";
@@ -150,7 +151,7 @@
     return state.time[RUNSTATE_running];
 }
 
-int sched_init_vcpu(struct vcpu *v, unsigned int processor) 
+int sched_init_vcpu(struct vcpu *v, unsigned int processor)
 {
     struct domain *d = v->domain;
 
@@ -160,7 +161,12 @@
      */
     v->processor = processor;
     if ( is_idle_domain(d) || d->is_pinned )
-        v->cpu_affinity = cpumask_of_cpu(processor);
+    {
+        if (d->is_numa)
+            v->cpu_affinity = node_to_cpumask(cpu_to_node(processor));
+        else
+            v->cpu_affinity = cpumask_of_cpu(processor);
+    }
     else
         cpus_setall(v->cpu_affinity);
 
diff -r c0e32941ee69 xen/include/asm-x86/e820.h
--- a/xen/include/asm-x86/e820.h	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/include/asm-x86/e820.h	Sat Feb 13 00:55:44 2010 -0500
@@ -29,6 +29,11 @@
     struct e820map *e820, uint64_t s, uint64_t e,
     uint32_t orig_type, uint32_t new_type);
 extern unsigned long init_e820(const char *, struct e820entry *, int *);
+extern int e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn, unsigned long last_pfn,
+				  unsigned long *ei_startpfn, unsigned long *ei_endpfn);
+extern unsigned long e820_hole_size(unsigned long start_pfn,
+                                                unsigned long end_pfn);
 extern struct e820map e820;
 
 /* These symbols live in the boot trampoline. */
diff -r c0e32941ee69 xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/include/asm-x86/numa.h	Sat Feb 13 00:55:44 2010 -0500
@@ -12,7 +12,7 @@
 
 #define cpu_to_node(cpu)		(cpu_to_node[cpu])
 #define parent_node(node)		(node)
-#define node_to_first_cpu(node)  (__ffs(node_to_cpumask[node]))
+#define node_to_first_cpu(node)  (ffs(node_to_cpumask[node]))
 #define node_to_cpumask(node)    (node_to_cpumask[node])
 
 struct node { 
@@ -49,8 +49,15 @@
 }
 
 /* Simple perfect hash to map physical addresses to node numbers */
-extern int memnode_shift; 
-extern u8  memnodemap[NODEMAPSIZE]; 
+struct memnode {
+    int shift;
+    unsigned int mapsize;
+    u8  map[NODEMAPSIZE];
+};
+extern struct memnode memnode; 
+#define memnode_shift memnode.shift
+#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
 
 struct node_data {
     unsigned long node_start_pfn;
@@ -69,11 +76,15 @@
 	return nid; 
 } 
 
+int __node_distance(int a, int b);
+
 #define NODE_DATA(nid)		(&(node_data[nid]))
 
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
+#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
 				 NODE_DATA(nid)->node_spanned_pages)
+#define node_distance(a, b) (__node_distance(a, b))
 
 
 #else
diff -r c0e32941ee69 xen/include/public/xen.h
--- a/xen/include/public/xen.h	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/include/public/xen.h	Sat Feb 13 00:55:44 2010 -0500
@@ -519,6 +519,53 @@
 typedef struct shared_info shared_info_t;
 #endif
 
+#define XEN_NR_CPUS 64
+#if defined(__i386__)
+#define XEN_BITS_PER_LONG 32
+#define XEN_BYTES_PER_LONG 4
+#define XEN_LONG_BYTEORDER 2
+#elif defined(__x86_64__)
+#define XEN_BITS_PER_LONG 64
+#define XEN_BYTES_PER_LONG 8
+#define XEN_LONG_BYTEORDER 3
+#endif
+
+/* same as cpumask_t - in xen and even Linux (for now) */
+#define XEN_BITS_TO_LONGS(bits) \
+    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
+#define XEN_DECLARE_BITMAP(name,bits) \
+    unsigned long name[XEN_BITS_TO_LONGS(bits)]
+struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
+#ifndef __XEN__
+typedef struct xen_cpumask xen_cpumask_t;
+#endif
+
+#define XEN_MAX_VNODES 8
+struct xen_vnode_data {
+    uint32_t vnode_id;
+    uint32_t mnode_id;
+    uint64_t nr_pages;
+    /* XXX: Can we use this in xen<->domain interfaces ? */
+    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
+};
+#ifndef __XEN__
+typedef struct xen_vnode_data xen_vnode_data_t;
+#endif
+
+/* NUMA layout for the domain at the time of startup.
+ * Structure has to fit within a page. */
+struct xen_domain_numa_layout {
+    uint32_t max_vcpus;
+    uint32_t max_vnodes;
+
+    /* Only (max_vnodes*max_vnodes) entries are filled */
+    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
+    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
+};
+#ifndef __XEN__
+typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
+#endif
+
 /*
  * Start-of-day memory layout:
  *  1. The domain is started within contiguous virtual-memory region.
@@ -572,6 +619,10 @@
     /* The pfn range here covers both page table and p->m table frames.   */
     unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
     unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
+    struct {
+            uint32_t info_off;  /* Offset of numa_layout_info struct.     */
+            uint32_t info_size; /* Size of numa_layout_info struct.       */
+    } numa_layout_info;
 };
 typedef struct start_info start_info_t;
 
diff -r c0e32941ee69 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/include/xen/mm.h	Sat Feb 13 00:55:44 2010 -0500
@@ -46,6 +46,7 @@
 void init_xenheap_pages(paddr_t ps, paddr_t pe);
 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags);
 void free_xenheap_pages(void *v, unsigned int order);
+unsigned long avail_node_heap_pages(unsigned int node);
 #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
 #define free_xenheap_page(v) (free_xenheap_pages(v,0))
 
@@ -78,6 +79,8 @@
 #define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_tmem        2
 #define  MEMF_tmem        (1U<<_MEMF_tmem)
+#define _MEMF_exact_node  3
+#define  MEMF_exact_node  (1U << _MEMF_exact_node)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
diff -r c0e32941ee69 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Wed Nov 25 14:19:50 2009 +0000
+++ b/xen/include/xen/sched.h	Sat Feb 13 00:55:44 2010 -0500
@@ -226,6 +226,8 @@
     bool_t           is_paused_by_controller;
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
+    /* Domain is numa aware */
+    bool_t           is_numa;
 
     /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
 #if MAX_VIRT_CPUS <= BITS_PER_LONG

[-- Attachment #4: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [RFC] pv guest numa [RE: Host Numa informtion in dom0]
  2010-02-13  6:25 [RFC] pv guest numa [RE: Host Numa informtion in dom0] Dulloor
@ 2010-02-16  1:15 ` Dan Magenheimer
  2010-02-16 17:49 ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 4+ messages in thread
From: Dan Magenheimer @ 2010-02-16  1:15 UTC (permalink / raw)
  To: Dulloor, Ian Pratt
  Cc: Andre Przywara, xen-devel, Keir Fraser, Nakajima, Jun, tmem-devel

Hi Dulloor --

> I am in the process of making other places of dynamic memory
> mgmt/operations numa-aware - tmem, memory exchange operations, etc.

I'd be interested in your thoughts on numa-aware tmem
as well as the other dynamic memory mechanisms in Xen 4.0.

Tmem is special in that it uses primarily full-page copies
from/to tmem-space to/from guest-space so, assuming the
interconnect can pipeline/stream a memcpy, overhead of
off-node memory vs on-node memory should be less
noticeable.  However tmem uses large data structures
(rbtrees and radix-trees) and the lookup process might
benefit from being NUMA-aware.

Also, I will be looking into adding some page-sharing
techniques into tmem in the near future.  This (and the
existing page sharing feature just added to 4.0) may
create some other interesting challenges for NUMA-awareness.

Dan

> -----Original Message-----
> From: Dulloor [mailto:dulloor@gmail.com]
> Sent: Friday, February 12, 2010 11:25 PM
> To: Ian Pratt
> Cc: Andre Przywara; xen-devel@lists.xensource.com; Nakajima, Jun; Keir
> Fraser
> Subject: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in
> dom0]
> 
> I am attaching (RFC) patches for NUMA-aware pv guests.
> 
> * The patch adds hypervisor interfaces to export minimal numa-related
> information about the memory of pv domain, which can then be used to
> setup the node ranges, virtual cpu<->node maps, and virtual slit
> tables in the pv domain.
> * The guest-domain also maintains a mapping between its vnodes and
> mnodes(actual machine nodes). These mappings can be used in the memory
> operations, such as those in ballooning.
> * In the patch, dom0 is made numa-aware using these interfaces. Other
> domains should be simpler. I am in the process of adding python
> interfaces for this. And, this would work with any node selection
> policy.
> * The patch is tested only for 64-on-64 (on x86_64)
> 
> * Along with the following other patches, this could provide a good
> solution for numa-aware guests -
> - numa-aware ballooning  (previously posted by me on xen-devel)
> - Andre's patch for HVM domains (posted by Andre recently)
> 
> I am in the process of making other places of dynamic memory
> mgmt/operations numa-aware - tmem, memory exchange operations, etc.
> 
> Please let know your comments.
> 
> -dulloor
> 
> On Thu, Feb 11, 2010 at 10:21 AM, Ian Pratt <Ian.Pratt@eu.citrix.com>
> wrote:
> >> > If guest NUMA is disabled, we just use a single node mask which is
> the
> >> > union of the per-VCPU node masks.
> >> >
> >> > Where allowed node masks span more than one physical node, we
> should
> >> > allocate memory to the guest's virtual node by pseudo randomly
> striping
> >> > memory allocations (in 2MB chunks) from across the specified
> physical
> >> > nodes. [pseudo random is probably better than round robin]
> >>
> >> Do we really want to support this? I don't think the allowed node
> masks
> >> should span more than one physical NUMA node. We also need to look
> at I/O
> >> devices as well.
> >
> > Given that we definitely need this striping code in the case where
> the guest is non NUMA, I'd be inclined to still allow it to be used
> even if the guest has multiple NUMA nodes. It could come in handy where
> there is a hierarchy between physical NUMA nodes, enabling for example
> striping to be used between a pair of 'close' nodes, while exposing the
> higher-level topology of sets of the paired nodes to be exposed to the
> guest.
> >
> > Ian
> >
> >
> >
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
> >

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] pv guest numa [RE: Host Numa informtion in dom0]
  2010-02-13  6:25 [RFC] pv guest numa [RE: Host Numa informtion in dom0] Dulloor
  2010-02-16  1:15 ` Dan Magenheimer
@ 2010-02-16 17:49 ` Konrad Rzeszutek Wilk
  2010-02-18 16:24   ` Dulloor
  1 sibling, 1 reply; 4+ messages in thread
From: Konrad Rzeszutek Wilk @ 2010-02-16 17:49 UTC (permalink / raw)
  To: Dulloor
  Cc: Andre Przywara, Ian Pratt, xen-devel@lists.xensource.com,
	Keir Fraser, Nakajima, Jun

Run this patch through scripts/checkpatch.pl

On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote:
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 86bef0f..7a24070 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
>  	/*
>  	 * Parse SRAT to discover nodes.
>  	 */
> -	acpi_numa_init();
> +    if (acpi_numa > 0)
> +	    acpi_numa_init();

Why? Why can't we just try  acpi_numa_init()?

>  #endif
>  
>  	initmem_init(0, max_pfn);
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 459913b..14fa654 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -11,7 +11,9 @@
>  #include <linux/ctype.h>
>  #include <linux/module.h>
>  #include <linux/nodemask.h>
> +#include <linux/cpumask.h>
>  #include <linux/sched.h>
> +#include <xen/interface/xen.h>
>  
>  #include <asm/e820.h>
>  #include <asm/proto.h>
> @@ -19,6 +21,7 @@
>  #include <asm/numa.h>
>  #include <asm/acpi.h>
>  #include <asm/k8.h>
> +#include <asm/xen/hypervisor.h>

If one does not set CONFIG_XEN does this compile?
>  
>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>  EXPORT_SYMBOL(node_data);
> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>  	 */
>  	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>  		long n = simple_strtol(cmdline, NULL, 0);
> -

No need for this.
>  		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>  		if (num_nodes < 0)
>  			return num_nodes;
> @@ -522,16 +524,162 @@ out:
>  	numa_init_array();
>  	return 0;
>  }
> +struct xen_domain_numa_layout pv_numa_layout;
> +
> +void dump_numa_layout(struct xen_domain_numa_layout *layout)
> +{
> +    unsigned int i, j;
> +    char vcpumask[128];
> +    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
> +                            layout->max_vcpus, layout->max_vnodes);

Redo the printk's. They need KERN_DEBUG
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        cpumask_scnprintf(vcpumask, sizeof(vcpumask), 
> +                                ((cpumask_t *)&vnode_data->vcpu_mask));
> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", 
> +                vnode_data->vnode_id, vnode_data->mnode_id,
> +                (unsigned long)vnode_data->nr_pages, vcpumask);

This one too.
> +    }
> +
> +    printk("vnode distances :\n");

and 
> +    for (i = 0; i < layout->max_vnodes; i++)
> +        printk("\tvnode[%u]", i);
> +    for (i = 0; i < layout->max_vnodes; i++)
> +    {
> +        printk("\nvnode[%u]", i);

this
> +        for (j = 0; j < layout->max_vnodes; j++)
> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
> +        printk("\n");
one too.
> +    }
> +    return;
> +}
> +
> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
> +{
> +    /* Construct a slit table (using layout->vnode_distance).
> +     * Copy it to acpi_slit. */
> +    return;
> +}
> +/* Distribute the vcpus over the vnodes according to their affinity */
> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
> +{
> +	int vcpu, vnode;
> +   
> +	printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");

pr_debug instead?
> +
> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
> +        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
> +   
> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
> +        {
> +            if (cpu_isset(vcpu, vcpu_mask))
> +            {
> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
> +                {
> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", 
> +                                        vcpu, early_cpu_to_node(vcpu)); 
> +                    continue;
> +                }
> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
> +		        numa_set_node(vcpu, vnode);
> +            }
> +        }
> +    }
> +    return;
> +}
> +
> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
> +                            unsigned long start_pfn, unsigned long last_pfn)
> +{
> +	int num_vnodes, i;
> +    u64 node_start_addr, node_end_addr, max_addr;
> +
> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
> +                                        layout->max_vnodes, layout->max_vcpus);
> +    dump_numa_layout(layout);
> +	memset(&nodes, 0, sizeof(nodes));
> +
> +    num_vnodes = layout->max_vnodes;
> +    BUG_ON(num_vnodes > MAX_NUMNODES);

Hmm.. Is that really necessary? What if we just do WARN("some lengthy explanation"),
and bail out and not initialize these structures?
> +
> +    max_addr = last_pfn << PAGE_SHIFT;
> +
> +    node_start_addr = start_pfn << PAGE_SHIFT;
> +    for (i = 0; i < num_vnodes; i++)
> +    {
> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
> +
> +		node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
> +
> +		if (i == (num_vnodes-1))
> +			node_end_addr = max_addr;
> +		else
> +        {
> +            node_end_addr = node_start_addr + node_size;
> +			while ((node_end_addr - node_start_addr - 
> +                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
> +            {
> +                node_end_addr += FAKE_NODE_MIN_SIZE;
> +				if (node_end_addr > max_addr) {
> +					node_end_addr = max_addr;
> +					break;
> +				}
> +			}
> +        }
> +        /* node_start_addr updated inside the function */
> +        if (setup_node_range(i, nodes, &node_start_addr, 
> +                    (node_end_addr-node_start_addr), max_addr+1))
> +            BUG();
> +    }
> +
> +	printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");

Is this necessary? Can it be debug?
> +
> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
> +    if (memnode_shift < 0) {
> +	    printk(KERN_ERR "No NUMA hash function found.\n");
> +        BUG();

Wow. BUG? What about just bailing out and unset xen_pv_emulation flag?

> +    }
> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
> +	/*
> +	 * We need to vacate all active ranges that may have been registered by
> +	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
> +	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
> +	 */
> +	remove_all_active_ranges();
> +
> +    BUG_ON(acpi_numa >= 0);
> +	for_each_node_mask(i, node_possible_map) {
> +		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
> +						nodes[i].end >> PAGE_SHIFT);
> +		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> +	}
> +    xen_init_slit_table(layout);
> +	xen_init_numa_array(layout);
> +	return 0;
> +}
>  #endif /* CONFIG_NUMA_EMU */
>  
>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>  {
>  	int i;
> +    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
> +
> +    int xen_pv_numa_enabled = numa_layout->max_vnodes;
>  
>  	nodes_clear(node_possible_map);
>  	nodes_clear(node_online_map);
>  
>  #ifdef CONFIG_NUMA_EMU
> +    if (xen_pv_domain() && xen_pv_numa_enabled)
> +    {
> +        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
> +            return;
> +    }
> +
>  	if (cmdline && !numa_emulation(start_pfn, last_pfn))
>  		return;
>  	nodes_clear(node_possible_map);
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index ecb9b0d..b020555 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -83,6 +83,8 @@ void *xen_initial_gdt;
>   */
>  struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
>  
> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
> +
>  /*
>   * Flag to determine whether vcpu info placement is available on all
>   * VCPUs.  We assume it is to start with, and then set it to zero on
> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
>  	pv_cpu_ops.load_gdt = xen_load_gdt;
>  }
>  
> +extern struct xen_domain_numa_layout pv_numa_layout;
>  /* First C function to be called on Xen boot */
>  asmlinkage void __init xen_start_kernel(void)
>  {
> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
>  		xen_start_info->console.domU.evtchn = 0;
>  	}
>  
> +    {
> +        struct xen_domain_numa_layout *layout = 
> +            (void *)((char *)xen_start_info +
> +                        xen_start_info->numa_layout_info.info_off);
> +        memcpy(&pv_numa_layout, layout, sizeof(*layout));
> +    }

Shouldn't there be a check to see whether Xen actually exports
this structure? Otherwise you might copy garbage in and set
pv_numa_layout fields to random values.

>  	xen_raw_console_write("about to get started...\n");
>  
>  	/* Start the world */
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 612f2c9..cb944a2 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
>  		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>  		disable_acpi();
>  	}
> +
> +    acpi_numa = -1;
> +    numa_off = 1;
>  #endif
>  
>  	/* 
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index 812ffd5..d4588fa 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -398,6 +398,53 @@ struct shared_info {
>  
>  };
>  
> +#define XEN_NR_CPUS 64
No way to share this with some other variable? Like NR_CPUS?

> +#if defined(__i386__)
> +#define XEN_BITS_PER_LONG 32
> +#define XEN_BYTES_PER_LONG 4

There gotta be some of these defined in other headers.
> +#define XEN_LONG_BYTEORDER 2
> +#elif defined(__x86_64__)
> +#define XEN_BITS_PER_LONG 64
> +#define XEN_BYTES_PER_LONG 8
> +#define XEN_LONG_BYTEORDER 3
> +#endif
> +
> +/* same as cpumask_t - in xen and even Linux (for now) */
> +#define XEN_BITS_TO_LONGS(bits) \
> +    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
> +#define XEN_DECLARE_BITMAP(name,bits) \
> +    unsigned long name[XEN_BITS_TO_LONGS(bits)]

I am pretty sure there are macros for this already.
> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
> +#ifndef __XEN__

Is that necessary? typedefs are frowned upon in the kernel.
> +typedef struct xen_cpumask xen_cpumask_t;
> +#endif
> +
> +#define XEN_MAX_VNODES 8
> +struct xen_vnode_data {
> +    uint32_t vnode_id;
> +    uint32_t mnode_id;
> +    uint64_t nr_pages;
> +    /* XXX: Can we use this in xen<->domain interfaces ? */
> +    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
> +};
> +#ifndef __XEN__
> +typedef struct xen_vnode_data xen_vnode_data_t;

Ditto.
> +#endif
> +
> +/* NUMA layout for the domain at the time of startup.
> + * Structure has to fit within a page. */
> +struct xen_domain_numa_layout {
> +    uint32_t max_vcpus;
> +    uint32_t max_vnodes;
> +
> +    /* Only (max_vnodes*max_vnodes) entries are filled */
> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
> +};
> +#ifndef __XEN__
> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;

Ditto.
> +#endif
> +
>  /*
>   * Start-of-day memory layout for the initial domain (DOM0):
>   *  1. The domain is started within contiguous virtual-memory region.
> @@ -449,6 +496,13 @@ struct start_info {
>  	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
>  	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
>  	int8_t cmd_line[MAX_GUEST_CMDLINE];
> +    /* The pfn range here covers both page table and p->m table frames.   */
> +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */

Why is this added in this patch? That doesn't look to be used by your
patch.
> +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */

Ditto.
> +    struct {
> +        uint32_t info_off;  /* Offset of console_info struct.         *
> +        uint32_t info_size; /* Size of console_info struct from start.*/

Wrong comment.
> +    } numa_layout_info;

>  };
>  
>  struct dom0_vga_console_info {

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] pv guest numa [RE: Host Numa informtion in dom0]
  2010-02-16 17:49 ` Konrad Rzeszutek Wilk
@ 2010-02-18 16:24   ` Dulloor
  0 siblings, 0 replies; 4+ messages in thread
From: Dulloor @ 2010-02-18 16:24 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Andre Przywara, Ian Pratt, xen-devel@lists.xensource.com,
	Keir Fraser, Nakajima, Jun

Konrad, Thanks for the comments. I will make the changes and send over
the patch.

Any comments on the general approach are welcome too.

-dulloor

On Tue, Feb 16, 2010 at 12:49 PM, Konrad Rzeszutek Wilk
<konrad.wilk@oracle.com> wrote:
> Run this patch through scripts/checkpatch.pl
>
> On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote:
>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>> index 86bef0f..7a24070 100644
>> --- a/arch/x86/kernel/setup.c
>> +++ b/arch/x86/kernel/setup.c
>> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p)
>>       /*
>>        * Parse SRAT to discover nodes.
>>        */
>> -     acpi_numa_init();
>> +    if (acpi_numa > 0)
>> +         acpi_numa_init();
>
> Why? Why can't we just try  acpi_numa_init()?
>
>>  #endif
>>
>>       initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..14fa654 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,9 @@
>>  #include <linux/ctype.h>
>>  #include <linux/module.h>
>>  #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>>  #include <linux/sched.h>
>> +#include <xen/interface/xen.h>
>>
>>  #include <asm/e820.h>
>>  #include <asm/proto.h>
>> @@ -19,6 +21,7 @@
>>  #include <asm/numa.h>
>>  #include <asm/acpi.h>
>>  #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>
> If one does not set CONFIG_XEN does this compile?
>>
>>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>>  EXPORT_SYMBOL(node_data);
>> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>>        */
>>       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>>               long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> No need for this.
>>               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>>               if (num_nodes < 0)
>>                       return num_nodes;
>> @@ -522,16 +524,162 @@ out:
>>       numa_init_array();
>>       return 0;
>>  }
>> +struct xen_domain_numa_layout pv_numa_layout;
>> +
>> +void dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> +    unsigned int i, j;
>> +    char vcpumask[128];
>> +    printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n",
>> +                            layout->max_vcpus, layout->max_vnodes);
>
> Redo the printk's. They need KERN_DEBUG
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        cpumask_scnprintf(vcpumask, sizeof(vcpumask),
>> +                                ((cpumask_t *)&vnode_data->vcpu_mask));
>> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",
>> +                vnode_data->vnode_id, vnode_data->mnode_id,
>> +                (unsigned long)vnode_data->nr_pages, vcpumask);
>
> This one too.
>> +    }
>> +
>> +    printk("vnode distances :\n");
>
> and
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +        printk("\tvnode[%u]", i);
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        printk("\nvnode[%u]", i);
>
> this
>> +        for (j = 0; j < layout->max_vnodes; j++)
>> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]);
>> +        printk("\n");
> one to.
>> +    }
>> +    return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
>> +{
>> +    /* Construct a slit table (using layout->vnode_distance).
>> +     * Copy it to acpi_slit. */
>> +    return;
>> +}
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
>> +{
>> +     int vcpu, vnode;
>> +
>> +     printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>
> pr_debug instead?
>> +
>> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> +        cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask);
>> +
>> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> +        {
>> +            if (cpu_isset(vcpu, vcpu_mask))
>> +            {
>> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> +                {
>> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> +                                        vcpu, early_cpu_to_node(vcpu));
>> +                    continue;
>> +                }
>> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> +                     numa_set_node(vcpu, vnode);
>> +            }
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> +                            unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +     int num_vnodes, i;
>> +    u64 node_start_addr, node_end_addr, max_addr;
>> +
>> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> +                                        layout->max_vnodes, layout->max_vcpus);
>> +    dump_numa_layout(layout);
>> +     memset(&nodes, 0, sizeof(nodes));
>> +
>> +    num_vnodes = layout->max_vnodes;
>> +    BUG_ON(num_vnodes > MAX_NUMNODES);
>
> Hmm.. Is that really neccessary? What if we just do WARN("some lengthy explanation"),
> and bail out and not initialize these structures?
>> +
>> +    max_addr = last_pfn << PAGE_SHIFT;
>> +
>> +    node_start_addr = start_pfn << PAGE_SHIFT;
>> +    for (i = 0; i < num_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> +             node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> +             if (i == (num_vnodes-1))
>> +                     node_end_addr = max_addr;
>> +             else
>> +        {
>> +            node_end_addr = node_start_addr + node_size;
>> +                     while ((node_end_addr - node_start_addr -
>> +                e820_hole_size(node_start_addr, node_end_addr)) < node_size)
>> +            {
>> +                node_end_addr += FAKE_NODE_MIN_SIZE;
>> +                             if (node_end_addr > max_addr) {
>> +                                     node_end_addr = max_addr;
>> +                                     break;
>> +                             }
>> +                     }
>> +        }
>> +        /* node_start_addr updated inside the function */
>> +        if (setup_node_range(i, nodes, &node_start_addr,
>> +                    (node_end_addr-node_start_addr), max_addr+1))
>> +            BUG();
>> +    }
>> +
>> +     printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>
> Is this neccessary? Can be it be debug?
>> +
>> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> +    if (memnode_shift < 0) {
>> +         printk(KERN_ERR "No NUMA hash function found.\n");
>> +        BUG();
>
> Wow. BUG? What about just bailing out and unset xen_pv_emulation flag?
>
>> +    }
>> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
>> +     /*
>> +      * We need to vacate all active ranges that may have been registered by
>> +      * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> +      * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
>> +      */
>> +     remove_all_active_ranges();
>> +
>> +    BUG_ON(acpi_numa >= 0);
>> +     for_each_node_mask(i, node_possible_map) {
>> +             e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> +                                             nodes[i].end >> PAGE_SHIFT);
>> +             setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> +     }
>> +    xen_init_slit_table(layout);
>> +     xen_init_numa_array(layout);
>> +     return 0;
>> +}
>>  #endif /* CONFIG_NUMA_EMU */
>>
>>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>>  {
>>       int i;
>> +    struct xen_domain_numa_layout *numa_layout = &pv_numa_layout;
>> +
>> +    int xen_pv_numa_enabled = numa_layout->max_vnodes;
>>
>>       nodes_clear(node_possible_map);
>>       nodes_clear(node_online_map);
>>
>>  #ifdef CONFIG_NUMA_EMU
>> +    if (xen_pv_domain() && xen_pv_numa_enabled)
>> +    {
>> +        if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn))
>> +            return;
>> +    }
>> +
>>       if (cmdline && !numa_emulation(start_pfn, last_pfn))
>>               return;
>>       nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index ecb9b0d..b020555 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -83,6 +83,8 @@ void *xen_initial_gdt;
>>   */
>>  struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
>>
>> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout;
>> +
>>  /*
>>   * Flag to determine whether vcpu info placement is available on all
>>   * VCPUs.  We assume it is to start with, and then set it to zero on
>> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void)
>>       pv_cpu_ops.load_gdt = xen_load_gdt;
>>  }
>>
>> +extern struct xen_domain_numa_layout pv_numa_layout;
>>  /* First C function to be called on Xen boot */
>>  asmlinkage void __init xen_start_kernel(void)
>>  {
>> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void)
>>               xen_start_info->console.domU.evtchn = 0;
>>       }
>>
>> +    {
>> +        struct xen_domain_numa_layout *layout =
>> +            (void *)((char *)xen_start_info +
>> +                        xen_start_info->numa_layout_info.info_off);
>> +        memcpy(&pv_numa_layout, layout, sizeof(*layout));
>> +    }
>
> Shouldn't there be a check to see if the Xen actually exports
> this structure? Otherwise you might copy garbage in and set
> pv_numa_layout values to random values.
>
>>       xen_raw_console_write("about to get started...\n");
>>
>>       /* Start the world */
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index 612f2c9..cb944a2 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void)
>>               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>>               disable_acpi();
>>       }
>> +
>> +    acpi_numa = -1;
>> +    numa_off = 1;
>>  #endif
>>
>>       /*
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 812ffd5..d4588fa 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -398,6 +398,53 @@ struct shared_info {
>>
>>  };
>>
>> +#define XEN_NR_CPUS 64
> No way to share this with some other variable? Like NR_CPUS?
>
>> +#if defined(__i386__)
>> +#define XEN_BITS_PER_LONG 32
>> +#define XEN_BYTES_PER_LONG 4
>
> There gotta be some of these defined in other headers.
>> +#define XEN_LONG_BYTEORDER 2
>> +#elif defined(__x86_64__)
>> +#define XEN_BITS_PER_LONG 64
>> +#define XEN_BYTES_PER_LONG 8
>> +#define XEN_LONG_BYTEORDER 3
>> +#endif
>> +
>> +/* same as cpumask_t - in xen and even Linux (for now) */
>> +#define XEN_BITS_TO_LONGS(bits) \
>> +    (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG)
>> +#define XEN_DECLARE_BITMAP(name,bits) \
>> +    unsigned long name[XEN_BITS_TO_LONGS(bits)]
>
> I am pretty sure there are macros for this laready.
>> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); };
>> +#ifndef __XEN__
>
> Is that neccessary? typedefs are frowned upon in kernel.
>> +typedef struct xen_cpumask xen_cpumask_t;
>> +#endif
>> +
>> +#define XEN_MAX_VNODES 8
>> +struct xen_vnode_data {
>> +    uint32_t vnode_id;
>> +    uint32_t mnode_id;
>> +    uint64_t nr_pages;
>> +    /* XXX: Can we use this in xen<->domain interfaces ? */
>> +    struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_vnode_data xen_vnode_data_t;
>
> Ditto.
>> +#endif
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +struct xen_domain_numa_layout {
>> +    uint32_t max_vcpus;
>> +    uint32_t max_vnodes;
>> +
>> +    /* Only (max_vnodes*max_vnodes) entries are filled */
>> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +#ifndef __XEN__
>> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t;
>
> Ditto.
>> +#endif
>> +
>>  /*
>>   * Start-of-day memory layout for the initial domain (DOM0):
>>   *  1. The domain is started within contiguous virtual-memory region.
>> @@ -449,6 +496,13 @@ struct start_info {
>>       unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
>>       unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
>>       int8_t cmd_line[MAX_GUEST_CMDLINE];
>> +    /* The pfn range here covers both page table and p->m table frames.   */
>> +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
>
> Why is this added in this patch? That doesn't look to be used by your
> patch.
>> +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
>
> Ditto.
>> +    struct {
>> +        uint32_t info_off;  /* Offset of console_info struct.         */
>> +        uint32_t info_size; /* Size of console_info struct from start.*/
>
> Wrong comment.
>> +    } numa_layout_info;
>
>>  };
>>
>>  struct dom0_vga_console_info {
>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-02-18 16:24 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-13  6:25 [RFC] pv guest numa [RE: Host Numa informtion in dom0] Dulloor
2010-02-16  1:15 ` Dan Magenheimer
2010-02-16 17:49 ` Konrad Rzeszutek Wilk
2010-02-18 16:24   ` Dulloor

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).