All of lore.kernel.org
 help / color / mirror / Atom feed
* about the guest(Redhat6.3) shows white screen, but the suse, ubuntu is ok
@ 2013-07-21 14:41 butine
  2013-07-22 11:32 ` Dario Faggioli
  0 siblings, 1 reply; 3+ messages in thread
From: butine @ 2013-07-21 14:41 UTC (permalink / raw)
  To: dario.faggioli; +Cc: xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 210 bytes --]

Hello Dario,

I have implemented HVM guest NUMA support. After I created a Red Hat 6.3 guest, it showed a white screen when the guest started, but SUSE and Ubuntu guests work fine. Why?

 

Thanks

Regards,
Butine Huang
2013-07-15

[-- Attachment #1.2: Type: text/html, Size: 257 bytes --]

[-- Attachment #2: guest_numa.patch --]
[-- Type: application/octet-stream, Size: 98304 bytes --]

diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h b/tools/firmware/hvmloader/acpi/acpi2_0.h
index 9ea356b..36a813b 100644
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -283,8 +283,68 @@ struct acpi_20_madt {
     uint32_t flags;
 };
 
+/*
+ * System Resource Affinity Table header definition (SRAT) (Version 3.0)
+ * X2APIC_CPU_AFFINITY is defined in version 4.0
+ */
+struct acpi_30_srat {
+	/*
+	 * Fixed-size prefix of the SRAT. construct_srat() in build.c writes the
+	 * CPU and memory affinity sub-tables directly after this structure.
+	 */
+	struct acpi_header header;	/* Common ACPI table header */
+	uint32_t table_revision;	/* Must be value '1' (ACPI_30_SRAT_TABLE_REVISION) */
+	uint32_t reserved[2];		/* Reserved, must be zero */
+};
+#define ACPI_30_SRAT_TABLE_REVISION    0x1
+
+/*
+ * Values for the 'type' byte that starts every SRAT sub-table
+ * (acpi_30_srat_cpu_affinity and acpi_30_srat_mem_affinity below).
+ */
+enum acpi_30_srat_type {
+	ACPI_30_SRAT_TYPE_CPU_AFFINITY = 0,
+	ACPI_30_SRAT_TYPE_MEMORY_AFFINITY = 1,
+	ACPI_30_SRAT_TYPE_RESERVED = 2	/* 2 and greater are reserved */
+};
+
+/*
+ * type(0) : Processor Local APIC/SAPIC Affinity.
+ * The proximity domain is stored in two pieces: proximity_domain_lo (1 byte)
+ * and proximity_domain_hi (3 bytes).
+ */
+struct acpi_30_srat_cpu_affinity {
+    uint8_t type;                   /* ACPI_30_SRAT_TYPE_CPU_AFFINITY */
+    uint8_t length;                 /* Size of this sub-table in bytes */
+    uint8_t proximity_domain_lo;    /* Low byte of the proximity domain */
+    uint8_t apic_id;                /* Local APIC id of the processor */
+    uint32_t flags;                 /* ACPI_30_SRAT_CPU_* flags */
+    uint8_t local_sapic_eid;
+    uint8_t proximity_domain_hi[3]; /* Remaining bytes of the proximity domain */
+    uint32_t reserved;		/* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_CPU_USE_AFFINITY  (1)	/* 00: Use affinity structure */
+
+/*
+ * type(1) : Memory Affinity.
+ * construct_srat_mem_affinity() in build.c fills base_address and size with
+ * byte values (page numbers shifted left by PAGE_SHIFT).
+ */
+
+struct acpi_30_srat_mem_affinity {
+    uint8_t type;               /* ACPI_30_SRAT_TYPE_MEMORY_AFFINITY */
+    uint8_t length;             /* Size of this sub-table in bytes */
+    uint32_t proximity_domain;  /* Proximity domain the range belongs to */
+    uint16_t reserved;		/* Reserved, must be zero */
+    uint64_t base_address;      /* Start of the memory range, in bytes */
+    uint64_t size;              /* Length of the memory range, in bytes */
+    uint32_t reserved1;
+    uint32_t flags;             /* ACPI_30_SRAT_MEM_* flags */
+    uint64_t reserved2;	    /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_MEM_ENABLED       (1)	/* 00: Use affinity structure */
+#define ACPI_30_SRAT_MEM_HOT_PLUGGABLE (1<<1)	/* 01: Mem is hot pluggable */
+#define ACPI_30_SRAT_MEM_NON_VOLATILE  (1<<2)	/* 02: Mem is non-volatile */
 
 /*
+ * System Locality Information Table header definition (SLIT) (Version 1.0)
+ */
+struct acpi_10_slit {
+    struct acpi_header header;  /* Common ACPI table header */
+    uint64_t locality_count;    /* Number of localities (vnodes) described */
+    /*
+     * First byte of the distance matrix. construct_slit() in build.c writes
+     * locality_count * locality_count entries starting here and sizes the
+     * table as sizeof(struct) + count*count - 1.
+     */
+    uint8_t entry[1];
+};
+ 
+/*
  * HPET Description Table
  */
 struct acpi_20_hpet {
@@ -367,6 +427,9 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_3_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
+#define ACPI_1_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
+
 
 /*
  * Table revision numbers.
@@ -379,6 +442,8 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_3_0_SRAT_REVISION 0x01
+#define ACPI_1_0_SLIT_REVISION 0x01
 
 #pragma pack ()
 
diff --git a/tools/firmware/hvmloader/acpi/build.c b/tools/firmware/hvmloader/acpi/build.c
index dc38c73..0403e75 100644
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -149,6 +149,114 @@ static int construct_madt(struct acpi_20_madt *madt)
     return align16(offset);
 }
 
+/*
+ * Emit the SRAT processor affinity entries starting at 'cpu_srat'.
+ *
+ * Entries are grouped by vnode: for each vnode in turn, one entry is written
+ * for every vcpu that the vcpu-to-vnode map assigns to it.
+ *
+ * Returns the number of bytes written.
+ */
+static int
+construct_srat_cpu_affinity(struct acpi_30_srat_cpu_affinity *cpu_srat)
+{
+    struct xen_domain_numa_info *info = &hvm_info->numa_info[0];
+    uint8_t *node_of_vcpu = NUMA_INFO_VCPU_TO_VNODE(info);
+    struct acpi_30_srat_cpu_affinity *entry = cpu_srat;
+    int node, cpu;
+
+    for ( node = 0; node < info->nr_vnodes; node++ )
+    {
+        for ( cpu = 0; cpu < info->nr_vcpus; cpu++ )
+        {
+            /* Skip vcpus that belong to a different vnode. */
+            if ( node_of_vcpu[cpu] != node )
+                continue;
+
+            memset(entry, 0, sizeof(*entry));
+            entry->type                = ACPI_30_SRAT_TYPE_CPU_AFFINITY;
+            entry->length              = sizeof(*entry);
+            entry->proximity_domain_lo = node;
+            entry->apic_id             = LAPIC_ID(cpu);
+            entry->flags               = ACPI_30_SRAT_CPU_USE_AFFINITY;
+            entry++;
+        }
+    }
+
+    /* Byte distance between the write cursor and the start of the area. */
+    return ((uint8_t *)entry - (uint8_t *)cpu_srat);
+}
+
+/*
+ * Emit one SRAT memory affinity entry per vnode starting at 'mem_srat'.
+ * Each entry covers the vnode's [start, end) page range, converted to bytes.
+ *
+ * Returns the number of bytes written.
+ */
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+    struct xen_domain_numa_info *info = &hvm_info->numa_info[0];
+    struct xen_vnode_info *ranges = NUMA_INFO_VNODE_INFO(info);
+    struct acpi_30_srat_mem_affinity *entry = mem_srat;
+    int node = 0;
+
+    while ( node < info->nr_vnodes )
+    {
+        struct xen_vnode_info *range = &ranges[node];
+
+        memset(entry, 0, sizeof(*entry));
+        entry->type             = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+        entry->length           = sizeof(*entry);
+        entry->proximity_domain = node;
+        entry->base_address     = (uint64_t)range->start << PAGE_SHIFT;
+        entry->size             =
+            (uint64_t)(range->end - range->start) << PAGE_SHIFT;
+        entry->flags            = ACPI_30_SRAT_MEM_ENABLED;
+
+        entry++;
+        node++;
+    }
+
+    /* Byte distance between the write cursor and the start of the area. */
+    return ((uint8_t *)entry - (uint8_t *)mem_srat);
+}
+
+/*
+ * Build the SRAT at 'srat': the fixed header, followed by the processor
+ * affinity entries and then the memory affinity entries.
+ *
+ * header.length and the checksum cover the exact table size. The value
+ * returned is rounded up with align16(), as construct_madt() does, so that
+ * the table the caller places after this one does not start at an
+ * unaligned address.
+ */
+static int construct_srat(struct acpi_30_srat *srat)
+{
+    int offset;
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature    = ACPI_3_0_SRAT_SIGNATURE;
+    srat->header.revision     = ACPI_3_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision = ACPI_30_SRAT_TABLE_REVISION;
+    offset = sizeof(*srat);
+
+    /* Sub-tables are appended back to back after the fixed header. */
+    offset += construct_srat_cpu_affinity((struct acpi_30_srat_cpu_affinity *)
+                                                ((uint8_t *)srat + offset));
+
+    offset += construct_srat_mem_affinity((struct acpi_30_srat_mem_affinity *)
+                                                ((uint8_t *)srat + offset));
+
+    /* The checksum must be computed last, over the final length. */
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);
+}
+
+/*
+ * Build the SLIT at 'slit': the header, the locality count and an
+ * nr_vnodes x nr_vnodes matrix of one-byte distances copied from the
+ * domain's NUMA info.
+ *
+ * header.length and the checksum cover the exact table size. The value
+ * returned is rounded up with align16(), as construct_madt() does, so that
+ * the table the caller places after this one does not start at an
+ * unaligned address.
+ */
+static int construct_slit(struct acpi_10_slit *slit)
+{
+    int offset, i, nr_vnodes, nr_entries;
+    struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+    uint8_t *numa_vnode_distance = NUMA_INFO_VNODE_DISTANCE(numa_info);
+
+    memset(slit, 0, sizeof(*slit));
+    slit->header.signature    = ACPI_1_0_SLIT_SIGNATURE;
+    slit->header.revision     = ACPI_1_0_SLIT_REVISION;
+    fixed_strcpy(slit->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(slit->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    slit->header.oem_revision = ACPI_OEM_REVISION;
+    slit->header.creator_id   = ACPI_CREATOR_ID;
+    slit->header.creator_revision = ACPI_CREATOR_REVISION;
+    slit->locality_count = numa_info->nr_vnodes;
+
+    nr_vnodes = numa_info->nr_vnodes;
+    nr_entries = nr_vnodes * nr_vnodes;
+    for ( i = 0; i < nr_entries; i++ )
+        slit->entry[i] = numa_vnode_distance[i];
+
+    /* sizeof(*slit) already accounts for the first matrix byte (entry[1]). */
+    offset = sizeof(*slit) + nr_entries - 1;
+    slit->header.length = offset;
+    set_checksum(slit, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);
+}
+
 static int construct_hpet(struct acpi_20_hpet *hpet)
 {
     int offset;
@@ -177,6 +285,8 @@ static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
+    struct acpi_30_srat *srat;
+    struct acpi_10_slit *slit;
     static const uint16_t tis_signature[] = {0x0001, 0x0001, 0x0001};
     uint16_t *tis_hdr;
     void *lasa;
@@ -189,6 +299,18 @@ static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
         table_ptrs[nr_tables++] = (unsigned long)madt;
     }
 
+	 /* SRAT/SLIT. */
+	 if ( hvm_info->numa_enabled &&
+			 hvm_info->numa_info[0].version == XEN_DOM_NUMA_INTERFACE_VERSION )
+	 {
+		 srat = (struct acpi_30_srat *)&buf[offset];
+		 offset += construct_srat(srat);
+		 table_ptrs[nr_tables++] = (unsigned long)srat;
+		 slit = (struct acpi_10_slit *)&buf[offset];
+		 offset += construct_slit(slit);
+		 table_ptrs[nr_tables++] = (unsigned long)slit;
+	 }
+
     /* HPET. */
     if ( hpet_exists(ACPI_HPET_ADDRESS) )
     {
diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index 9942c3a..eef9b44 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -32,6 +32,8 @@ CTRL_SRCS-y       += xc_mem_access.c
 CTRL_SRCS-y       += xc_memshr.c
 CTRL_SRCS-y       += xc_hcall_buf.c
 CTRL_SRCS-y       += xc_foreign_memory.c
+CTRL_SRCS-y       += xc_cpumap.c
+CTRL_SRCS-y       += xc_dom_numa.c
 CTRL_SRCS-y       += xtl_core.c
 CTRL_SRCS-y       += xtl_logger_stdio.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
diff --git a/tools/libxc/ia64/xc_ia64_hvm_build.c b/tools/libxc/ia64/xc_ia64_hvm_build.c
index 18be616..bc8358e 100644
--- a/tools/libxc/ia64/xc_ia64_hvm_build.c
+++ b/tools/libxc/ia64/xc_ia64_hvm_build.c
@@ -1119,6 +1119,7 @@ int xc_hvm_build_target_mem(xc_interface *xch,
                             uint32_t domid,
                             int memsize,
                             int target,
+                            xc_domain_numa_config_t *numa_config,
                             const char *image_name)
 {
     /* XXX:PoD isn't supported yet */
diff --git a/tools/libxc/xc_cpumap.c b/tools/libxc/xc_cpumap.c
new file mode 100755
index 0000000..66e41fc
--- /dev/null
+++ b/tools/libxc/xc_cpumap.c
@@ -0,0 +1,104 @@
+#include "xc_cpumap.h"
+#include <stdio.h>
+
+/* Author : Lab309 */
+
+/*
+ * Return the index of the first set bit in 'srcp' that is strictly greater
+ * than 'cpu' (pass -1 to start from bit 0), or xc_cpumap_len(srcp) when no
+ * further bit is set.
+ *
+ * The scan never looks at bits at or beyond the map length: the length is a
+ * bit count, so the backing storage is only XC_BITS_TO_BYTES(len) bytes.
+ */
+uint32_t xc_cpumap_next(int cpu, struct xenctl_cpumap *srcp)
+{
+    uint8_t *addr = xc_cpumap_bits(srcp);
+    uint32_t size = xc_cpumap_len(srcp);
+    uint32_t bit;
+
+    for (bit = (uint32_t)(cpu + 1); bit < size; bit++)
+    {
+        if (addr[XC_BITMAP_BYTE(bit)] & XC_BITMAP_BIT_MASK(bit))
+            return bit;
+    }
+
+    return size;
+}
+
+/*
+ * dst = src1 | src2, byte by byte. The number of bytes processed is taken
+ * from the destination map's length.
+ */
+void __xc_cpumap_or(struct xenctl_cpumap *dstp,
+        struct xenctl_cpumap *src1p, struct xenctl_cpumap *src2p)
+{
+    uint8_t *out = xc_cpumap_bits(dstp);
+    uint8_t *lhs = xc_cpumap_bits(src1p);
+    uint8_t *rhs = xc_cpumap_bits(src2p);
+    int remaining = XC_BITS_TO_BYTES(xc_cpumap_len(dstp));
+
+    while (remaining-- > 0)
+    {
+        *out = *lhs | *rhs;
+        out++;
+        lhs++;
+        rhs++;
+    }
+}
+
+/* Number of set bits in the byte 'w'. */
+static inline uint8_t hweight8(uint8_t w)
+{
+    uint8_t count = 0;
+
+    /* Each iteration clears the lowest set bit. */
+    while (w != 0)
+    {
+        w &= (uint8_t)(w - 1);
+        count++;
+    }
+    return count;
+}
+
+/* Count the set bits in every byte that backs the map. */
+int __xc_cpumap_weight(struct xenctl_cpumap *srcp)
+{
+    const uint8_t *cursor = xc_cpumap_bits(srcp);
+    int remaining = XC_BITS_TO_BYTES(xc_cpumap_len(srcp));
+    int total = 0;
+
+    while (remaining-- > 0)
+        total += hweight8(*cursor++);
+
+    return total;
+}
+
+/* xenctl_cpumap print function */
+#define CHUNKSZ	8
+#define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
+
+/*
+ * Format the map as comma-separated hexadecimal chunks of CHUNKSZ bits,
+ * most significant chunk first, into 'buf' (at most 'buflen' bytes including
+ * the terminator).
+ *
+ * Returns the length the full string would have had, like snprintf(); a
+ * value >= buflen means the output was truncated. Returns a negative value
+ * if snprintf() itself fails. 'buf' is always NUL-terminated when
+ * buflen > 0, including for an empty map.
+ */
+int __xc_cpumap_snprintf(char *buf, unsigned int buflen,
+                                        const struct xenctl_cpumap *cpumap)
+{
+    const uint8_t *maskp = xc_cpumap_bits(cpumap);
+    int nmaskbits = xc_cpumap_len(cpumap);
+    int i, word, bit, len = 0;
+    unsigned long val;
+    const char *sep = "";
+    int chunksz;
+    uint8_t chunkmask;
+
+    if (buflen > 0)
+        buf[0] = '\0';
+
+    /* The leading chunk may be narrower than CHUNKSZ bits. */
+    chunksz = nmaskbits & (CHUNKSZ - 1);
+    if (chunksz == 0)
+        chunksz = CHUNKSZ;
+
+    i = roundup_power2(nmaskbits, CHUNKSZ) - CHUNKSZ;
+    for (; i >= 0; i -= CHUNKSZ) {
+        /* Never hand snprintf() a position or size beyond the buffer. */
+        unsigned int used = ((unsigned int)len < buflen) ?
+                                (unsigned int)len : buflen;
+        int n;
+
+        chunkmask = ((1ULL << chunksz) - 1);
+        word = i / XC_BITS_PER_BYTE;
+        bit = i % XC_BITS_PER_BYTE;
+        val = (maskp[word] >> bit) & chunkmask;
+        n = snprintf(buf + used, buflen - used, "%s%0*lx", sep,
+                     (chunksz+3)/4, val);
+        if (n < 0)
+            return n;
+        len += n;
+        chunksz = CHUNKSZ;
+        sep = ",";
+    }
+    return len;
+}
+
+/*
+ * Print the map to stdout as "cpumap:<hex chunks>".
+ * Returns whatever __xc_cpumap_snprintf() returned.
+ */
+int xc_cpumap_printf(const struct xenctl_cpumap *cpumap)
+{
+    /* Start empty so nothing undefined is printed if no chunk is written. */
+    char buffer[1024] = "";
+    int ret = __xc_cpumap_snprintf(buffer, sizeof(buffer), cpumap);
+
+    if (ret >= 0)
+        printf("cpumap:%s\n", buffer);
+    return ret;
+}
+
+/*
+ * Print a xenctl_bitmap by wrapping its storage and element count in a
+ * temporary xenctl_cpumap and delegating to xc_cpumap_printf().
+ */
+int xc_bitmap_printf(const struct xenctl_bitmap *bitmap)
+{
+	/* NOTE(review): positional initialiser - assumes the first two members
+	 * of struct xenctl_cpumap are the bitmap handle and the bit count, in
+	 * that order; confirm against the structure definition. */
+	struct xenctl_cpumap cpumap ={bitmap->bitmap,bitmap->nr_elems};
+	return xc_cpumap_printf(&cpumap);
+}
+
diff --git a/tools/libxc/xc_cpumap.h b/tools/libxc/xc_cpumap.h
new file mode 100755
index 0000000..5cd4cda
--- /dev/null
+++ b/tools/libxc/xc_cpumap.h
@@ -0,0 +1,137 @@
+#ifndef __XENCTL_CPUMAP_H
+#define __XENCTL_CPUMAP_H
+
+#include "xc_private.h"
+#include <stdint.h>
+#include <string.h>
+
+#define XC_BITS_PER_BYTE 8
+#define XC_BITS_TO_BYTES(bits) \
+    (((bits)+XC_BITS_PER_BYTE-1)/XC_BITS_PER_BYTE)
+#define XC_BITMAP_BIT(nr)   (1 << (nr))
+#define XC_BITMAP_BIT_MASK(nr)  (1 << ((nr) % XC_BITS_PER_BYTE))
+#define XC_BITMAP_BYTE(nr)  ((nr) / XC_BITS_PER_BYTE)
+#define XC_BITMAP_BYTE_OFFSET(nr)  ((nr) % XC_BITS_PER_BYTE)
+#define XC_BITMAP_BYTE_MASK (0xFF)
+#define XC_BITMAP_LAST_BYTE_MASK(nbits)                             \
+            (((nbits) % XC_BITS_PER_BYTE) ?                         \
+                       ((1<<((nbits) % XC_BITS_PER_BYTE))-1) :      \
+                                            XC_BITMAP_BYTE_MASK)
+
+#define xc_cpumap_bits(maskp)                                           \
+                    ({  uint8_t *bitmap;                                \
+                        get_xen_guest_handle(bitmap, (maskp)->bitmap);  \
+                        bitmap; })
+#define xc_cpumap_len(maskp) ((maskp)->nr_cpus)
+
+/* For iterating over the cpus set in the cpumap */
+#define xc_for_each_cpu(cpu, mask)				\
+            __xc_for_each_cpu(cpu, &(mask))
+#define __xc_for_each_cpu(cpu, mask)            \
+	for ((cpu) = -1;				            \
+		(cpu) = xc_cpumap_next((cpu), (mask)),	\
+		(cpu) < xc_cpumap_len(mask);)
+extern uint32_t xc_cpumap_next(int n, struct xenctl_cpumap *srcp);
+
+#define xc_cpumap_set_cpu(cpu, dst) __xc_cpumap_set_cpu(cpu, &(dst))
+/* Set the bit for 'cpu' in the map. No range check is performed. */
+static inline void __xc_cpumap_set_cpu(int cpu, struct xenctl_cpumap *dstp)
+{
+    uint8_t *bytes = (uint8_t *)xc_cpumap_bits(dstp);
+
+    bytes[XC_BITMAP_BYTE(cpu)] |= (uint8_t)XC_BITMAP_BIT_MASK(cpu);
+}
+
+#define xc_cpumap_clear_cpu(cpu, dst) __xc_cpumap_clear_cpu(cpu, &(dst))
+/* Clear the bit for 'cpu' in the map. No range check is performed. */
+static inline void __xc_cpumap_clear_cpu(int cpu, struct xenctl_cpumap *dstp)
+{
+    uint8_t *bytes = (uint8_t *)xc_cpumap_bits(dstp);
+    uint8_t keep = (uint8_t)~XC_BITMAP_BIT_MASK(cpu);
+
+    bytes[XC_BITMAP_BYTE(cpu)] &= keep;
+}
+
+#define xc_cpumap_test_cpu(cpu, dst) __xc_cpumap_test_cpu(cpu, &(dst))
+/*
+ * Return non-zero when the bit for 'cpu' is set. The value returned is the
+ * masked byte, not a normalised 0/1.
+ */
+static inline int __xc_cpumap_test_cpu(int cpu, struct xenctl_cpumap *dstp)
+{
+    uint8_t *bytes = (uint8_t *)xc_cpumap_bits(dstp);
+    uint8_t bit = XC_BITMAP_BIT_MASK(cpu);
+
+    return bytes[XC_BITMAP_BYTE(cpu)] & bit;
+}
+
+#define xc_cpumap_setall(dst) __xc_cpumap_setall(&(dst))
+/*
+ * Set every valid bit in the map. Bits in the last byte beyond the map
+ * length are left clear. An empty map (no bits) is left untouched instead
+ * of writing before the start of the storage.
+ */
+static inline void __xc_cpumap_setall(struct xenctl_cpumap *dstp)
+{
+    uint8_t *dp = xc_cpumap_bits(dstp);
+    int nbits = xc_cpumap_len(dstp);
+    size_t nbytes;
+
+    if (nbits <= 0)
+        return;
+
+    nbytes = XC_BITS_TO_BYTES(nbits);
+    /* All full bytes first, then the partial mask for the final byte. */
+    memset(dp, 0xff, nbytes - 1);
+    dp[nbytes-1] = XC_BITMAP_LAST_BYTE_MASK(nbits);
+}
+
+#define xc_cpumap_clearall(dst) __xc_cpumap_clearall(&(dst))
+/*
+ * Clear every bit in the map. Maps that fit in a single byte (8 CPUs or
+ * fewer) are cleared as well.
+ */
+static inline void __xc_cpumap_clearall(struct xenctl_cpumap *dstp)
+{
+    size_t nbytes = XC_BITS_TO_BYTES(xc_cpumap_len(dstp));
+
+    if (nbytes > 0)
+        memset(xc_cpumap_bits(dstp), 0x00, nbytes);
+}
+
+#define xc_cpumap_or(dst, src1, src2) \
+                        __xc_cpumap_or(&(dst), &(src1), &(src2))
+extern void __xc_cpumap_or(struct xenctl_cpumap *dstp,
+        struct xenctl_cpumap *src1p, struct xenctl_cpumap *src2p);
+
+#define xc_cpumap_weight(src) __xc_cpumap_weight(&(src))
+extern int __xc_cpumap_weight(struct xenctl_cpumap *srcp);
+
+#define xc_cpumap_snprintf(buf, len, src) \
+			__xc_cpumap_snprintf((buf), (len), &(src))
+extern int __xc_cpumap_snprintf(char *buf, unsigned int len,
+					        const struct xenctl_cpumap *srcp);
+
+/***********************************************************************/
+/*
+ * mlock() the whole pages that contain [addr, addr + len).
+ * Returns the result of mlock().
+ */
+static inline int lock_pages(void *addr, size_t len)
+{
+      void *page_start = (void *)((unsigned long)addr & PAGE_MASK);
+      unsigned long lead = (unsigned long)addr - (unsigned long)page_start;
+      size_t span = (len + lead + PAGE_SIZE - 1) & PAGE_MASK;
+
+      return mlock(page_start, span);
+}
+
+/*
+ * munlock() the whole pages that contain [addr, addr + len).
+ * errno is preserved across the call so callers' error reporting is not
+ * disturbed.
+ */
+static inline void unlock_pages(void *addr, size_t len)
+{
+    void *page_start = (void *)((unsigned long)addr & PAGE_MASK);
+    unsigned long lead = (unsigned long)addr - (unsigned long)page_start;
+    size_t span = (len + lead + PAGE_SIZE - 1) & PAGE_MASK;
+    int errno_on_entry = errno;
+
+    munlock(page_start, span);
+    errno = errno_on_entry;
+}
+
+/*
+ * Lock the pages backing the map's bitmap storage.
+ * Returns 0 on success and -1 if lock_pages() fails.
+ */
+static inline int
+xc_cpumap_lock_pages(struct xenctl_cpumap *map)
+{
+    uint8_t *storage;
+    uint32_t storage_bytes = XC_BITS_TO_BYTES(map->nr_cpus);
+
+    get_xen_guest_handle(storage, map->bitmap);
+    return lock_pages(storage, storage_bytes) ? -1 : 0;
+}
+
+/* Unlock the pages backing the map's bitmap storage. */
+static inline void
+xc_cpumap_unlock_pages(struct xenctl_cpumap *map)
+{
+    uint8_t *storage;
+    uint32_t storage_bytes;
+
+    storage_bytes = XC_BITS_TO_BYTES(map->nr_cpus);
+    get_xen_guest_handle(storage, map->bitmap);
+    unlock_pages(storage, storage_bytes);
+}
+
+int xc_cpumap_printf(const struct xenctl_cpumap *cpumap);
+
+int xc_bitmap_printf(const struct xenctl_bitmap *bitmap);
+
+#endif /* __XENCTL_CPUMAP_H */
diff --git a/tools/libxc/xc_dom_numa.c b/tools/libxc/xc_dom_numa.c
new file mode 100755
index 0000000..4a346e4
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.c
@@ -0,0 +1,976 @@
+/* XEN Guest NUMA support
+ * Author : Lab309 */
+
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include "xg_private.h"
+#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
+
+#ifdef __DOM_NUMA_DEBUG__
+#undef DBGPRINTF
+#define DBGPRINTF(_f, _a...) xc_report(xch, xch->error_handler, XTL_INFO,0, _f , ## _a)
+#endif
+
+#ifdef set_xen_guest_handle
+#undef set_xen_guest_handle
+#endif
+#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
+
+/*
+ * Print, for every host NUMA node, its memory size and free memory (both
+ * shifted right by 20 bits) and its distance to every node.
+ *
+ * The three hypercall buffers are released on every exit path, and nothing
+ * is printed from them unless allocation and xc_numainfo() both succeeded.
+ */
+void print_numa_info(xc_interface *xch)
+{
+    DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, memsize);
+    DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, memfree);
+    DECLARE_HYPERCALL_BUFFER(uint32_t, node_dists);
+    xc_numainfo_t ninfo;
+    int max_nodes = xc_get_max_nodes(xch);
+    int i, j;
+
+    if (max_nodes <= 0)
+    {
+        printf("numa_info: unable to determine the number of nodes\n");
+        return;
+    }
+
+    memsize = xc_hypercall_buffer_alloc
+        (xch, memsize, sizeof(*memsize) * max_nodes);
+    memfree = xc_hypercall_buffer_alloc
+        (xch, memfree, sizeof(*memfree) * max_nodes);
+    node_dists = xc_hypercall_buffer_alloc
+        (xch, node_dists, sizeof(*node_dists) * max_nodes * max_nodes);
+    if (memsize == NULL || memfree == NULL || node_dists == NULL)
+    {
+        printf("numa_info: hypercall buffer allocation failed\n");
+        goto out;
+    }
+
+    set_xen_guest_handle(ninfo.node_to_memsize, memsize);
+    set_xen_guest_handle(ninfo.node_to_memfree, memfree);
+    set_xen_guest_handle(ninfo.node_to_node_distance, node_dists);
+    ninfo.max_node_index = max_nodes - 1;
+    if (xc_numainfo(xch, &ninfo) != 0)
+    {
+        printf("numa_info: xc_numainfo failed\n");
+        goto out;
+    }
+
+    /* The hypervisor may report fewer nodes than were asked for. */
+    if (ninfo.max_node_index < max_nodes - 1)
+        max_nodes = ninfo.max_node_index + 1;
+
+    printf("numa_info              :\n");
+    printf("node:    memsize    memfree    distances\n");
+
+    for (i = 0; i < max_nodes; i++)
+    {
+        printf("%4d:    %6"PRIu64"     %6"PRIu64"      %d", i,
+               memsize[i] >> 20, memfree[i] >> 20,
+               node_dists[i * max_nodes + 0]);
+        for (j = 1; j < max_nodes; j++)
+            printf(",%d", node_dists[i * max_nodes + j]);
+        printf("\n");
+    }
+
+out:
+    xc_hypercall_buffer_free(xch, memsize);
+    xc_hypercall_buffer_free(xch, memfree);
+    xc_hypercall_buffer_free(xch, node_dists);
+}
+
+/*
+ * Print the machine NUMA layout totals and the node distance matrix.
+ * Row i of the output holds the distances from node i to every node, using
+ * the same row-major indexing (i * nr_nodes + j) as the other users of
+ * distance matrices in this file.
+ */
+void print_machine_layout(xc_machine_numa_layout_t *layout)
+{
+    printf("size_page:%"PRIu64"\n",layout->size_pages);
+    printf("free_pages:%"PRIu64"\n",layout->free_pages);
+    printf("nr_nodes:%u\n",layout->nr_nodes);
+    printf("node_distance:\n");
+    for(int i=0;i<layout->nr_nodes;i++)
+    {
+        for(int j=0;j<layout->nr_nodes;j++)
+            printf("%u ",layout->node_distance[i*layout->nr_nodes+j]);
+        printf("\n");
+    }
+}
+
+/* XXX: Move all sanity checks to this funtion */
+#define XC_DOM_NUMA_MIN_UNIT  256
+
+/*
+ * Map an XC_DOM_NUMA_* strategy value to a printable name.
+ * Any value that is not recognised is reported as "NONE".
+ */
+static char *numa_val_to_str(uint32_t val)
+{
+    char *name = "NONE";
+
+    switch (val)
+    {
+        case XC_DOM_NUMA_AUTO:
+            name = "AUTO";
+            break;
+        case XC_DOM_NUMA_CLUSTER:
+            name = "CLUSTER";
+            break;
+        case XC_DOM_NUMA_GUEST_NUMA:
+            name = "GUEST NUMA";
+            break;
+        case XC_DOM_NUMA_CROSS:
+            name = "CROSS";
+            break;
+        default:
+            break;
+    }
+    return name;
+}
+
+/*
+ * Print the fields of a domain NUMA layout and its vnode distance matrix.
+ * A NULL layout is reported and nothing else is printed.
+ * Row i of the matrix holds the distances from vnode i to every vnode
+ * (row-major indexing, i * nr_vnodes + j).
+ */
+void print_layout(xc_domain_numa_layout_t *layout)
+{
+    if(layout==NULL)
+    {
+        printf("Error:Empty Point!\n");
+        return;
+    }
+    printf("version:%u\n",layout->version);
+    printf("type:%u\n",layout->type);
+    printf("nr_vcpus:%u\n",layout->nr_vcpus);
+    printf("nr_vnodes:%u\n",layout->nr_vnodes);
+    printf("nr_pages:%u\n",layout->nr_pages);
+    printf("domid:%u\n",layout->domid);
+    printf("strategy:%s\n",numa_val_to_str(layout->strategy));
+    printf("unit_size:%u\n",layout->unit_size);
+    printf("node_distance:\n");
+    for(int i=0;i<layout->nr_vnodes;i++)
+    {
+        for(int j=0;j<layout->nr_vnodes;j++)
+            printf("%u ",(uint32_t)layout->vnode_distance[i*layout->nr_vnodes+j]);
+        printf("\n");
+    }
+}
+
+/*
+ * Allocate and initialise a domain NUMA layout from 'config'.
+ *
+ * Returns NULL when the configured strategy is XC_DOM_NUMA_NONE or when the
+ * allocation fails. A non-zero unit size below XC_DOM_NUMA_MIN_UNIT is
+ * raised to that minimum. The caller releases the result with
+ * xc_dom_free_numa_layout().
+ */
+xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch,
+        uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config)
+{
+    xc_domain_numa_layout_t *layout;
+
+    if (config->strategy == XC_DOM_NUMA_NONE)
+    {
+        IPRINTF("%s: NUMA memory allocation disabled\n", __FUNCTION__);
+        return NULL;
+    }
+
+    layout = malloc(sizeof(*layout));
+    if (layout == NULL)
+    {
+        ERROR("%s: dom_layout allocation failed\n", __FUNCTION__);
+        return NULL;
+    }
+
+    DBGPRINTF("%s: dom_layout allocated\n", __FUNCTION__);
+    memset(layout, 0, sizeof(*layout));
+
+    /* Fields taken from the caller and the configuration. */
+    layout->version   = XEN_DOM_NUMA_INTERFACE_VERSION;
+    layout->nr_pages  = nr_pages;
+    layout->nr_vnodes = config->nr_nodes;
+
+    /* Internal data */
+    layout->domid     = domid;
+    layout->strategy  = config->strategy;
+    layout->unit_size = config->unit_size;
+
+    if (layout->unit_size != 0 && layout->unit_size < XC_DOM_NUMA_MIN_UNIT)
+    {
+        layout->unit_size = XC_DOM_NUMA_MIN_UNIT;
+        IPRINTF("%s: Min cross unit size is %d pages\n",
+                                        __FUNCTION__, layout->unit_size);
+    }
+
+    return layout;
+}
+
+/*
+ * Release a layout obtained from xc_dom_alloc_numa_layout().
+ * The pointer is passed straight to free().
+ */
+void
+xc_dom_free_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    /* xch is referenced only through the logging macro. */
+    DBGPRINTF("%s: dom_layout freed\n", __FUNCTION__);
+    free(dom_layout);
+}
+
+#define XC_DUMP_STR_SZ  (8192)
+static void
+xc_dump_dom_numa_layout(xc_interface *xch, xc_domain_numa_layout_t *layout)
+{
+    unsigned int i, j;
+    char *xc_dump_str, *dumpstr;
+    if (!(xc_dump_str = malloc(XC_DUMP_STR_SZ)))
+    {
+        DBGPRINTF("%s : dump_str allocation failed", __FUNCTION__);
+        return;
+    }
+    dumpstr = xc_dump_str;
+    dumpstr += sprintf(dumpstr, 
+                        "NUMA-LAYOUT(Dom %d) : vcpus(%u), vnodes(%u)",
+                        layout->domid, layout->nr_vcpus, layout->nr_vnodes);
+    switch (layout->type)
+    {
+        case XEN_DOM_NUMA_CLUSTER:
+            dumpstr += sprintf(dumpstr, ", type(CLUSTER)\n");
+            break;
+        case XEN_DOM_NUMA_GUSET_NUMA:
+            dumpstr += sprintf(dumpstr, ", type(GUSET_NUMA)\n");
+            break;
+        case XEN_DOM_NUMA_CROSS:
+            dumpstr += sprintf(dumpstr, ", type(CROSS)\n");
+            break;
+        case XEN_DOM_NUMA_DONTCARE:
+            dumpstr += sprintf(dumpstr, ", type(DONTCARE)\n");
+            break;
+        default:
+            dumpstr += sprintf(dumpstr, ", type(UNDEFINED)\n");
+    }
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        xc_vnode_data_t *vnode_data = &layout->vnode_data[i];
+        dumpstr += sprintf(dumpstr, "vnode[%u]:mnode(%u), node_nr_pages(%x)", 
+                vnode_data->vnode_id, vnode_data->mnode_id,
+                vnode_data->nr_pages);
+        if (layout->type == XEN_DOM_NUMA_GUSET_NUMA)
+        {
+            char mapstr[128] = "";
+            struct xenctl_cpumap cpumap;
+            xc_cpumap_from_cpumask(&cpumap, &vnode_data->vcpu_mask);
+            xc_cpumap_snprintf(mapstr, sizeof(mapstr), cpumap);
+            dumpstr += sprintf(dumpstr, ", vcpu_mask(%s)", mapstr);
+        }
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+
+    if (layout->type == XEN_DOM_NUMA_CLUSTER)
+        goto done;
+    dumpstr += sprintf(dumpstr, "vnode distances :\n");
+    for (i = 0; i < layout->nr_vnodes; i++)
+        dumpstr += sprintf(dumpstr, "\tvnode[%u]", i);
+    for (i = 0; i < layout->nr_vnodes; i++)
+    {
+        dumpstr += sprintf(dumpstr, "\nvnode[%u]", i);
+        for (j = 0; j < layout->nr_vnodes; j++)
+            dumpstr += sprintf(dumpstr, "\t%u",
+                            layout->vnode_distance[i*layout->nr_vnodes + j]);
+        dumpstr += sprintf(dumpstr, "\n");
+    }
+done:
+    IPRINTF("%s", xc_dump_str);
+    free(xc_dump_str);
+    return;
+}
+
+
+/*
+ * Fill 'layout' with the host NUMA topology: per-node size and free page
+ * counts, the totals, the node distance matrix and, for every node, the
+ * mask of physical CPUs that belong to it.
+ *
+ * Returns 0 on success and -1 if xc_numainfo() or xc_topologyinfo() fails.
+ */
+int xc_get_machine_numa_layout(xc_interface *xch, xc_machine_numa_layout_t *layout)
+{
+    uint32_t i, nr_nodes, nr_cpus;
+    xc_numainfo_t ninfo = { 0 };
+    uint64_t node_memsize[XC_MAX_NODES];
+    uint64_t node_memfree[XC_MAX_NODES];
+    xc_topologyinfo_t tinfo = { 0 };
+    uint32_t cpu_to_node[XC_CPUMASK_NR_CPUS];
+
+    memset(layout, 0, sizeof(*layout));
+    memset(node_memsize, 0, sizeof(uint64_t)*XC_MAX_NODES);
+    memset(node_memfree, 0, sizeof(uint64_t)*XC_MAX_NODES);
+
+    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+    /* Read directly into layout's structure */
+    set_xen_guest_handle(ninfo.node_to_node_distance, layout->node_distance);
+    ninfo.max_node_index = XC_MAX_NODES-1;
+    if (xc_numainfo(xch, &ninfo))
+    {
+        ERROR("%s: xc_numainfo failed", __FUNCTION__);
+        return -1;
+    }
+    /* No need to check if a node is invalid, as in that case
+     * the size would be zero and it would never get selected*/
+    nr_nodes = ninfo.max_node_index + 1;
+    if ( nr_nodes > XC_MAX_NODES )
+        nr_nodes = XC_MAX_NODES;
+
+    /* Only the cpu-to-node map is requested from the topology call. */
+    set_xen_guest_handle(tinfo.cpu_to_core, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_socket, NULL);
+    set_xen_guest_handle(tinfo.cpu_to_node, cpu_to_node);
+    tinfo.max_cpu_index = XC_CPUMASK_NR_CPUS-1;
+
+    if (xc_topologyinfo(xch, &tinfo))
+    {
+        ERROR("%s: xc_topologyinfo failed", __FUNCTION__);
+        return -1;
+    }
+
+    nr_cpus = tinfo.max_cpu_index+1;
+    if (nr_cpus > XC_CPUMASK_NR_CPUS)
+        nr_cpus = XC_CPUMASK_NR_CPUS;
+
+    layout->nr_nodes = nr_nodes;
+    for (i=0; i<nr_nodes; i++)
+    {
+        uint64_t size_pages, free_pages;
+        layout->node_data[i].node_id = i;
+        /* The hypervisor reports bytes; the layout stores pages. */
+        size_pages = (node_memsize[i] >> PAGE_SHIFT);
+        free_pages = (node_memfree[i] >> PAGE_SHIFT);
+        layout->node_data[i].size_pages = size_pages;
+        layout->node_data[i].free_pages = free_pages;
+        layout->size_pages += size_pages;
+        layout->free_pages += free_pages;
+    }
+
+    for (i=0; i<nr_cpus; i++)
+    {
+        struct xenctl_cpumap cpumap;
+        xc_cpumask_t *cpumask;
+
+        if (cpu_to_node[i] == INVALID_TOPOLOGY_ID)
+            continue;
+        /* Never index node_data[] with a node that was not reported. */
+        if (cpu_to_node[i] >= nr_nodes)
+            continue;
+        cpumask = &(layout->node_data[(cpu_to_node[i])].cpu_mask);
+        xc_cpumap_from_cpumask(&cpumap, cpumask);
+        xc_cpumap_set_cpu(i, cpumap);
+    }
+    return 0;
+}
+
+/*
+ * Return max_vcpu_id + 1 as reported by XEN_DOMCTL_getdomaininfo for
+ * 'domid', or 0 if the domctl fails.
+ */
+static int
+xc_get_max_vcpus(xc_interface *xch, uint32_t domid)
+{
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)domid;
+
+    if (do_domctl(xch, &domctl) < 0)
+        return 0;
+
+    return domctl.u.getdomaininfo.max_vcpu_id + 1;
+}
+
+/* The function makes a (greedy) best fit selection of num_vnodes of
+ * vnode_size each. The number of pages selected from each node are returned
+ * in the node_pages_selected array.
+ * The best_fit ranking is based on the fraction(up to 1024 parts) of node
+ * memory occupied, if the node is selected: in every round the unselected
+ * node with enough free memory and the highest rank is taken.
+ * Returns 0 on success and -1 if selection fails. */
+/* XXX: Node selection needs more research/experience. */
+static int xc_select_best_fit_nodes(
+        xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+        uint32_t num_vnodes, uint64_t vnode_pages, uint64_t *nodes_pages)
+{
+    int i;
+    uint32_t num_nodes_selected = 0;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+#define INVALID_NODE (~0)
+#define NODE_FIT_RANK_SHIFT (10)
+
+    do {
+        int selected_node = INVALID_NODE;
+        /* Best rank seen in this round; restarts for every vnode. */
+        uint64_t best_fit_rank = 0;
+
+        for (i=0; i<phys_layout->nr_nodes; i++)
+        {
+            xc_node_data_t *node_data;
+            uint64_t node_sizepages, node_freepages, node_usedpages;
+            uint64_t node_fit_rank;
+
+            /* Node is already selected */
+            if (nodes_pages[i])
+                continue;
+
+            node_data = &phys_layout->node_data[i];
+            node_sizepages = node_data->size_pages;
+            node_freepages = node_data->free_pages;
+
+            /* Skip empty nodes (no division by zero) and nodes too small. */
+            if (node_sizepages == 0 || node_freepages < vnode_pages)
+                continue;
+
+            /* Pages in use once the vnode is placed here; no underflow. */
+            node_usedpages = (node_freepages < node_sizepages)
+                                ? (node_sizepages - node_freepages) : 0;
+            node_fit_rank = ((node_usedpages + vnode_pages)
+                                    << NODE_FIT_RANK_SHIFT) / node_sizepages;
+
+            if (selected_node == INVALID_NODE ||
+                node_fit_rank > best_fit_rank)
+            {
+                selected_node = i;
+                best_fit_rank = node_fit_rank;
+            }
+        }
+
+        /* Nodes could not be selected. Bail out ! */
+        if (selected_node == INVALID_NODE)
+            return -1;
+
+        nodes_pages[selected_node] = vnode_pages;
+        num_nodes_selected++;
+    } while(num_nodes_selected < num_vnodes);
+#undef NODE_FIT_RANK_SHIFT
+#undef INVALID_NODE
+    return 0;
+}
+
+/* Sort the phys nodes in place in the increasing order of free node memory
+ * (the order xc_select_max_fit_nodes() relies on).
+ * The free page counts are read from the array on every comparison, so a
+ * swap is always compared against the entry currently held in slot i. */
+static void xc_sort_nodeload(xc_machine_numa_layout_t *phys_layout)
+{
+    uint32_t i, j;
+    uint32_t nr_nodes = phys_layout->nr_nodes;
+
+    for (i = 0; i + 1 < nr_nodes; i++)
+    {
+        for (j = i+1; j < nr_nodes; j++)
+        {
+            if (phys_layout->node_data[i].free_pages >
+                                    phys_layout->node_data[j].free_pages)
+            {
+                xc_node_data_t tmp_node_data;
+                tmp_node_data = phys_layout->node_data[i];
+                phys_layout->node_data[i] = phys_layout->node_data[j];
+                phys_layout->node_data[j] = tmp_node_data;
+            }
+        }
+    }
+
+    return;
+}
+
+/* The function selects the nodes in the increasing order of free node memory,
+ * and fills them. The physical memory map for such a domain is distributed
+ * across all the selected nodes.
+ * The phys_layout node_data structures could be sorted inplace. So, we
+ * should always use node_data->node_id while using the node_distance array.
+ * node_pages[] is indexed by position in the sorted node_data array.
+ * Returns the number of nodes that received a non-zero share of the
+ * domain's pages, or -1 if the free memory cannot cover dom_pages. */
+static int xc_select_max_fit_nodes(
+        xc_interface *xch, xc_machine_numa_layout_t *phys_layout,
+                                    uint64_t dom_pages, uint64_t *node_pages)
+{
+    int i, nr_selected = 0;
+    uint64_t dom_alloc_pages = 0;
+
+    DBGPRINTF("%s: called\n", __FUNCTION__);
+    xc_sort_nodeload(phys_layout);
+
+    for (i=0; i<phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data = &phys_layout->node_data[i];
+        uint64_t remaining = dom_pages - dom_alloc_pages;
+        uint64_t node_freepages;
+
+        /* In max-fit, if we try to pack the nodes too aggressively
+         * we might fail on any small allocation (from xen node heaps).
+         * That's why, with DEFAULT, we don't use exact_node flag. */
+        node_freepages = node_data->free_pages;
+        if (!node_freepages)
+            continue;
+
+        /* Take no more than what the domain still needs. */
+        if (node_freepages > remaining)
+            node_freepages = remaining;
+
+        node_pages[i] = node_freepages;
+        dom_alloc_pages += node_freepages;
+        if (node_freepages)
+            nr_selected++;
+    }
+    if (dom_alloc_pages != dom_pages)
+    {
+        ERROR(
+                "%s: Failed to allocate memory. Maybe had to balloon more\n",
+                __FUNCTION__);
+        return -1;
+    }
+    return nr_selected;
+}
+
+/* Distribute the domain's vcpus over its vnodes by setting each vcpu's bit
+ * in the vcpu_mask of the vnode it is assigned to. vcpus are handed out in
+ * contiguous runs: vnode 0 gets the lowest-numbered vcpus, and so on.
+ * Always returns 0. */
+static int xc_setup_vnode_vcpu_masks(xc_domain_numa_layout_t *dom_layout)
+{
+    int vcpu;
+    int nr_vcpus = dom_layout->nr_vcpus;
+    int nr_vnodes = dom_layout->nr_vnodes;
+
+    for (vcpu = 0; vcpu < nr_vcpus; vcpu++)
+    {
+        struct xenctl_cpumap vcpumap;
+        xc_cpumask_t *vcpumask;
+        /* Scale rather than divide by (nr_vcpus/nr_vnodes): the result is
+         * the same whenever nr_vnodes divides nr_vcpus, but it is always
+         * below nr_vnodes for uneven splits and cannot divide by zero
+         * when there are more vnodes than vcpus. */
+        int vnode = (vcpu * nr_vnodes) / nr_vcpus;
+
+        vcpumask = &dom_layout->vnode_data[vnode].vcpu_mask;
+        xc_cpumap_from_cpumask(&vcpumap, vcpumask);
+        xc_cpumap_set_cpu(vcpu, vcpumap);
+    }
+    return 0;
+}
+
+/* Build the domain's vnode distance matrix: the distance between two vnodes
+ * is the machine distance between the nodes recorded in their mnode_id.
+ * Both matrices are stored row-major. Always returns 0. */
+static int xc_setup_vnode_distances(xc_machine_numa_layout_t *phys_layout,
+                                        xc_domain_numa_layout_t *dom_layout)
+{
+    int row, col;
+
+    for (row = 0; row < dom_layout->nr_vnodes; row++)
+    {
+        int from_node = dom_layout->vnode_data[row].mnode_id;
+
+        for (col = 0; col < dom_layout->nr_vnodes; col++)
+        {
+            int to_node = dom_layout->vnode_data[col].mnode_id;
+            int dst_idx = (row*dom_layout->nr_vnodes)+col;
+
+            dom_layout->vnode_distance[dst_idx] =
+                phys_layout->node_distance[
+                        (from_node*phys_layout->nr_nodes)+to_node];
+        }
+    }
+    return 0;
+}
+
+/* vnode sizes are rounded down to a multiple of XC_VNODE_MIN_SIZE pages
+ * (intended as 1GB alignment). The shift is expressed in pages:
+ * XEN_MIN_VNODE_SHIFT is reduced by PAGE_SHIFT (4K pages). */
+#define XC_VNODE_MIN_SHIFT   (XEN_MIN_VNODE_SHIFT-PAGE_SHIFT)
+#define XC_VNODE_MIN_SIZE   (1UL << XC_VNODE_MIN_SHIFT)
+/* Fully parenthesized so the mask stays a single operand wherever it is
+ * expanded. */
+#define XC_VNODE_MIN_MASK (~(XC_VNODE_MIN_SIZE-1))
+/* Because we are strict with the alignment, we boost the size
+ * to account for the pages not seen in physmap (4096 pages, i.e. 16MB with
+ * 4K pages, for now). */
+#define XC_VNODE_BOOST_SIZE (4096)
+/* Minimum number of vcpus every vnode must get. */
+#define XC_VCPUS_PER_VNODE (1)
+/* True when x has at most one bit set; note that 0 also passes. */
+#define XC_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
+
+/* Turn the per-node page selection into the domain's vnode description.
+ * node_pages_selected[i] is the number of pages chosen from
+ * phys_layout->node_data[i]; every non-zero entry becomes one vnode, in
+ * array order, recording the page count and the machine node id.
+ * The number of vnodes produced must equal dom_layout->nr_vnodes.
+ * Afterwards the vcpu masks and the vnode distance matrix are filled in.
+ * Returns 0 on success, -1 on any inconsistency. */
+static int xc_setup_domain_vnodes(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout,
+    uint64_t *node_pages_selected)
+{
+    uint32_t i;
+    uint32_t vnode_id = 0;
+
+    for (i = 0; i < phys_layout->nr_nodes; i++)
+    {
+        xc_node_data_t *node_data;
+        xc_vnode_data_t *vnode_data;
+
+        if (!node_pages_selected[i])
+            continue;
+
+        /* vnode_data[] only has XC_MAX_VNODES slots while up to
+         * XC_MAX_NODES nodes may have been selected: refuse rather than
+         * write past the end of the array. */
+        if (vnode_id >= XC_MAX_VNODES)
+        {
+            ERROR("%s: Too many nodes selected (max %d vnodes) !\n",
+                                __FUNCTION__, XC_MAX_VNODES);
+            return -1;
+        }
+
+        node_data = &phys_layout->node_data[i];
+        vnode_data = &dom_layout->vnode_data[vnode_id];
+        vnode_data->vnode_id = vnode_id;
+        vnode_data->nr_pages = node_pages_selected[i];
+        /* node_data may have been sorted, so take the real node id. */
+        vnode_data->mnode_id = node_data->node_id;
+        vnode_id++;
+    }
+    if (vnode_id != dom_layout->nr_vnodes)
+    {
+        ERROR("%s: Internal Error(vnode count mismatch) (%d/%d) !\n", 
+                                __FUNCTION__, vnode_id, dom_layout->nr_vnodes);
+        return -1;
+    }
+    /* vnodes are exposed to the guest only for GUEST NUMA. */
+    if (xc_setup_vnode_vcpu_masks(dom_layout) || 
+            (xc_setup_vnode_distances(phys_layout, dom_layout)))
+    {
+        ERROR("%s: vnode setup failed !\n", __FUNCTION__);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Common sanity checks run before any placement strategy.
+ * Verifies that nr_vnodes is non-zero, that the machine has at least
+ * nr_pages free pages, and fetches the domain's vcpu count into
+ * dom_layout->nr_vcpus, which must not exceed XC_CPUMASK_NR_CPUS and must
+ * be at least nr_vnodes.
+ * Returns 0 when all checks pass, -1 otherwise. */
+static int xc_select_domain_prep(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    int max_vcpus;
+
+    if (!dom_layout->nr_vnodes)
+    {
+        ERROR("%s: VM nr_vnodes configured incorrectly !\n", __FUNCTION__);
+        return -1; 
+    }
+
+    if (dom_layout->nr_pages > phys_layout->free_pages)
+    {
+        ERROR(
+            "%s: Not enough memory for pv (unlikely after balloon checks)\n",
+                __FUNCTION__);
+        return -1;
+    }
+
+    /* Check the count in a wide local first: nr_vcpus is only 8 bits wide,
+     * so an error value or an oversized count must be rejected before it
+     * is narrowed into the layout.
+     * NOTE(review): assumes xc_get_max_vcpus reports failure as 0 or a
+     * negative value - confirm against its definition. */
+    max_vcpus = xc_get_max_vcpus(xch, dom_layout->domid);
+    if (max_vcpus <= 0)
+    {
+        ERROR("%s: xc_get_max_vcpus failed !\n", __FUNCTION__);
+        return -1; 
+    }
+
+    if (max_vcpus > XC_CPUMASK_NR_CPUS)
+    {
+        ERROR("%s: Failed - More than %d vcpus!\n",
+                                            __FUNCTION__,  XC_CPUMASK_NR_CPUS);
+        return -1; 
+    }
+    dom_layout->nr_vcpus = max_vcpus;
+
+    /* Every vnode needs at least one vcpu. */
+    if (dom_layout->nr_vcpus < dom_layout->nr_vnodes )
+    {
+        ERROR("%s: VM (%d) - fewer vcpus(%d) than vnodes(%d)!\n",
+                __FUNCTION__, dom_layout->domid, dom_layout->nr_vcpus,
+                dom_layout->nr_vnodes);
+        return -1; 
+    }
+
+    return 0;
+}
+
+/* CLUSTER strategy: ask xc_select_best_fit_nodes for a single node able to
+ * hold all of the domain's pages and describe the domain as one vnode on it.
+ * Returns 0 on success, non-zero on failure. */
+static int xc_select_domain_cluster(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *pages_per_node;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if (xc_select_domain_prep(xch, phys_layout, dom_layout) != 0)
+        return -1;
+
+    /* One zeroed slot per possible machine node. */
+    pages_per_node = calloc(XC_MAX_NODES, sizeof(uint64_t));
+    if (pages_per_node == NULL)
+    {
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    rc = xc_select_best_fit_nodes(xch, phys_layout, 1,
+                                  dom_layout->nr_pages, pages_per_node);
+    if (rc != 0)
+    {
+        ERROR("%s: Not enough memory for CLUSTER (Had to balloon more ?)\n",
+                                                            __FUNCTION__);
+    }
+    else
+    {
+        dom_layout->type = XEN_DOM_NUMA_CLUSTER;
+        rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+                                                    pages_per_node);
+        if (rc == 0)
+            DBGPRINTF("%s: Selected CLUSTER for VM %d\n", 
+                                    __FUNCTION__, dom_layout->domid);
+    }
+
+    free(pages_per_node);
+    return rc;
+}
+
+/* For the numa guests, we construct a symmetrical topology (wrt the 
+ * distribution of vcpus over vnodes).
+ * We require the numa guests to have (2^n) vcpus and (2^k) vnodes.
+ * Each vnode is then assigned 2^(n-k) vcpus, where (n>=k).
+ *
+ * The per-vnode size is (nr_pages + XC_VNODE_BOOST_SIZE) / nr_vnodes rounded
+ * down to a multiple of XC_VNODE_MIN_SIZE. Once nodes have been found,
+ * dom_layout->nr_pages is replaced by nr_vnodes times that size and the
+ * domain's maxmem is set accordingly; nr_pages is left untouched if node
+ * selection fails.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int xc_select_domain_guest_numa(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t vnode_nr_pages, *node_pages_selected = NULL;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!XC_POWER_OF_2(dom_layout->nr_vcpus))
+    {
+        ERROR("%s: #vcpus != 2^n (disable guest numa)\n", __FUNCTION__);
+        return -1;
+    }
+    if (!XC_POWER_OF_2(dom_layout->nr_vnodes))
+    {
+        ERROR("%s: #vnodes != 2^n (disable guest numa)\n", __FUNCTION__);
+        return -1;
+    }
+    if (dom_layout->nr_vcpus < (dom_layout->nr_vnodes*XC_VCPUS_PER_VNODE))
+    {
+        ERROR("%s: Failed - Not enough vcpus (%d on %d)!\n",
+                __FUNCTION__, dom_layout->nr_vcpus, dom_layout->nr_vnodes);
+        return -1; 
+    }
+
+    vnode_nr_pages = 
+        (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)/dom_layout->nr_vnodes;
+    vnode_nr_pages &= XC_VNODE_MIN_MASK;
+    if (vnode_nr_pages < XC_VNODE_MIN_SIZE)
+    {
+        /* Arguments are converted explicitly so each one matches its %lx
+         * conversion regardless of the platform's integer widths. */
+        ERROR("%s: vnode_size(%lx)<min(%lx), nr_pages(%lx), nr_vnodes(%d)!\n",
+                __FUNCTION__, (unsigned long)vnode_nr_pages,
+                (unsigned long)XC_VNODE_MIN_SIZE,
+                (unsigned long)dom_layout->nr_pages, dom_layout->nr_vnodes);
+        return -1; 
+    }
+
+    if (!(node_pages_selected = calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes, 
+                    vnode_nr_pages, node_pages_selected)) != 0)
+    {
+        ERROR("%s: Not enough memory for GUSET_NUMA (Had to balloon more ?)\n",
+                                                            __FUNCTION__);
+        goto failed;
+    }
+
+    /* Commit the rounded size only now that placement is known to work, so
+     * a failed attempt does not alter the size seen by a later strategy. */
+    dom_layout->nr_pages = dom_layout->nr_vnodes*vnode_nr_pages;
+    dom_layout->type = XEN_DOM_NUMA_GUSET_NUMA;
+    if ((rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout, 
+                                                    node_pages_selected)))
+        goto failed;
+
+    /* The shift by (PAGE_SHIFT-10) turns the page count into KiB. */
+    if ((rc = xc_domain_setmaxmem(xch, dom_layout->domid, 
+            (dom_layout->nr_pages+XC_VNODE_BOOST_SIZE)<<(PAGE_SHIFT-10))))
+        goto failed;
+
+    DBGPRINTF("%s: Selected GUEST_NUMA for VM %d\n", 
+                                    __FUNCTION__, dom_layout->domid);
+failed:
+    free(node_pages_selected);
+    return rc;
+}
+
+/* CROSS strategy: split the domain into nr_vnodes equally sized pieces of
+ * nr_pages/nr_vnodes pages and let xc_select_best_fit_nodes place them.
+ * On successful placement dom_layout->nr_pages becomes nr_vnodes times the
+ * piece size (any division remainder is dropped).
+ * Returns 0 on success, non-zero on failure. */
+static int xc_select_domain_cross(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t pages_per_vnode;
+    uint64_t *pages_per_node;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if (xc_select_domain_prep(xch, phys_layout, dom_layout) != 0)
+        return -1;
+
+    pages_per_vnode = dom_layout->nr_pages/dom_layout->nr_vnodes;
+
+    /* One zeroed slot per possible machine node. */
+    pages_per_node = calloc(XC_MAX_NODES, sizeof(uint64_t));
+    if (pages_per_node == NULL)
+    {
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    rc = xc_select_best_fit_nodes(xch, phys_layout, dom_layout->nr_vnodes,
+                                  pages_per_vnode, pages_per_node);
+    if (rc != 0)
+    {
+        ERROR("%s: Not enough memory for CROSS (Had to balloon more ?)\n",
+                                                            __FUNCTION__);
+    }
+    else
+    {
+        dom_layout->nr_pages = dom_layout->nr_vnodes*pages_per_vnode;
+        dom_layout->type = XEN_DOM_NUMA_CROSS;
+        rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout,
+                                                    pages_per_node);
+        if (rc == 0)
+            DBGPRINTF("%s: Selected CROSS for VM %d\n", 
+                                    __FUNCTION__, dom_layout->domid);
+    }
+
+    free(pages_per_node);
+    return rc;
+}
+
+/* DONTCARE strategy: spread the domain's pages over as many nodes as
+ * xc_select_max_fit_nodes needs, without any constraint on the split.
+ * dom_layout->nr_vnodes is overwritten with the number of nodes used.
+ * Returns 0 on success, non-zero on failure. */
+static int xc_select_domain_dontcare(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint64_t *node_pages_selected = NULL;
+    int rc;
+
+    DBGPRINTF("%s: Called for VM %d\n", __FUNCTION__, dom_layout->domid);
+    if ((rc = xc_select_domain_prep(xch, phys_layout, dom_layout)))
+        return -1;
+
+    if (!(node_pages_selected = calloc(XC_MAX_NODES, sizeof(uint64_t))))
+    {
+        rc = -1;
+        ERROR("%s: node_pages allocation failed\n", __FUNCTION__);
+        goto failed;
+    }
+    if ((rc = xc_select_max_fit_nodes(xch, phys_layout, dom_layout->nr_pages, 
+                                    node_pages_selected)) < 0)
+    {
+        ERROR("%s: Not enough memory for DONTCARE (Had to balloon more ?)\n",
+                                                            __FUNCTION__);
+        goto failed;
+    }
+    /* rc is the number of nodes used; the layout can describe at most
+     * XC_MAX_VNODES of them. */
+    if (rc > XC_MAX_VNODES)
+    {
+        ERROR("%s: DONTCARE needs %d nodes, only %d vnodes supported\n",
+                                    __FUNCTION__, rc, XC_MAX_VNODES);
+        rc = -1;
+        goto failed;
+    }
+
+    dom_layout->type = XEN_DOM_NUMA_DONTCARE;
+    dom_layout->nr_vnodes = rc;
+    rc = xc_setup_domain_vnodes(xch, phys_layout, dom_layout, 
+                                                    node_pages_selected);
+    if (!rc)
+        DBGPRINTF("%s: Selected DONTCARE for VM %d\n", 
+                                    __FUNCTION__, dom_layout->domid);
+failed:
+    free(node_pages_selected);
+    return rc;
+}
+
+/* Whether the guest image can make use of a virtual NUMA topology.
+ * Hard-wired to "no", which keeps the GUEST_NUMA attempt below disabled. */
+#define XC_DOM_IS_NUMA_GUEST(n) (0)
+
+/* AUTO strategy: try the placement strategies from most to least
+ * constrained and keep the first that succeeds:
+ *   1. CLUSTER    - whole domain on one node;
+ *   2. GUEST_NUMA - 2, 4, 8, ... vnodes (only for NUMA-aware guests);
+ *   3. CROSS      - 2, 3, 4, ... equally sized vnodes;
+ *   4. DONTCARE   - any split that fits.
+ * The number of vnodes tried never exceeds the number of machine nodes nor
+ * XC_MAX_VNODES, the capacity of dom_layout->vnode_data.
+ * Returns 0 on success, -1 if no strategy could place the domain. */
+static int xc_select_domain_auto(xc_interface *xch,
+    xc_machine_numa_layout_t *phys_layout, xc_domain_numa_layout_t *dom_layout)
+{
+    uint32_t i;
+    uint32_t max_vnodes = phys_layout->nr_nodes;
+
+    if (max_vnodes > XC_MAX_VNODES)
+        max_vnodes = XC_MAX_VNODES;
+
+    /* Attempt to cluster the VM */
+    DBGPRINTF("%s: Selecting allocation strategy for (VM %d)\n", 
+                                    __FUNCTION__, dom_layout->domid);
+
+    dom_layout->nr_vnodes = 1;
+    if (!xc_select_domain_cluster(xch, phys_layout, dom_layout))
+        return 0;
+
+    if (!XC_DOM_IS_NUMA_GUEST(dom_layout))
+        DBGPRINTF("%s: Image doesn't support numa (VM %d)\n", 
+                                    __FUNCTION__, dom_layout->domid);
+    else
+    {
+        /* Attempt to show guest numa to the VM */
+        for (i = 2; i <= max_vnodes; i<<=1)
+        {
+            dom_layout->nr_vnodes = i;
+            if (!xc_select_domain_guest_numa(xch, phys_layout, dom_layout))
+                return 0;
+        }
+    }
+
+    /* Attempt to make the VM span several nodes */
+    for (i = 2; i <= max_vnodes; i++)
+    {
+        dom_layout->nr_vnodes = i;
+        if (!xc_select_domain_cross(xch, phys_layout, dom_layout))
+            return 0;
+    }
+
+    if (!xc_select_domain_dontcare(xch, phys_layout, dom_layout))
+        return 0;
+
+    ERROR("%s: Failed to allocate memory for the VM (Had to balloon more ?)\n",
+                                                            __FUNCTION__);
+    return -1;
+}
+
+/* Entry point for NUMA placement of a domain.
+ * Fetches the machine NUMA layout, runs the placement routine selected by
+ * dom_layout->strategy and, on success, dumps the resulting domain layout.
+ * Returns 0 on success, non-zero on failure. */
+int xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    xc_machine_numa_layout_t *machine;
+    int rc;
+
+    DBGPRINTF("%s: called (mem_strategy:%d)\n",
+                                    __FUNCTION__, dom_layout->strategy);
+
+    machine = malloc(sizeof(*machine));
+    if (machine == NULL)
+    {
+        ERROR( "%s: phys_layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    rc = xc_get_machine_numa_layout(xch, machine);
+    if (rc != 0)
+    {
+        ERROR( "%s: xc_get_machine_numa_layout failed\n", __FUNCTION__);
+        free(machine);
+        return rc;
+    }
+
+    switch (dom_layout->strategy)
+    {
+    case XC_DOM_NUMA_AUTO:
+        rc = xc_select_domain_auto(xch, machine, dom_layout);
+        break;
+    case XC_DOM_NUMA_CLUSTER:
+        /* In case configured bad */
+        dom_layout->nr_vnodes = 1;
+        rc = xc_select_domain_cluster(xch, machine, dom_layout);
+        break;
+    case XC_DOM_NUMA_GUEST_NUMA:
+        rc = xc_select_domain_guest_numa(xch, machine, dom_layout);
+        break;
+    case XC_DOM_NUMA_CROSS:
+        rc = xc_select_domain_cross(xch, machine, dom_layout);
+        break;
+    default:
+        rc = -1;
+        ERROR("%s: Unknown memory allocation strategy (%d)\n",
+                                __FUNCTION__, dom_layout->strategy);
+    }
+
+    if (rc != 0)
+        ERROR("%s: xc_select_domain failed for (%d)\n", 
+                __FUNCTION__, dom_layout->strategy);
+    else
+        xc_dump_dom_numa_layout(xch, dom_layout);
+
+    free(machine);
+    return rc;
+}
+
+/* Issue XEN_DOMCTL_setvcpuaffinity for one vcpu of domain domid, using
+ * cpumap as the affinity map. The map is locked around the hypercall.
+ * Returns the do_domctl result, or -1 if the map could not be locked. */
+static int
+xc_domain_numa_vcpu_setaffinity(xc_interface *xch, uint32_t domid,
+                                int vcpu, struct xenctl_cpumap *cpumap)
+{
+    DECLARE_DOMCTL;
+    int rc;
+
+    domctl.cmd = XEN_DOMCTL_setvcpuaffinity;
+    domctl.domain = (domid_t)domid;
+    domctl.u.vcpuaffinity.vcpu = vcpu;
+    domctl.u.vcpuaffinity.cpumap.bitmap = cpumap->bitmap;
+    domctl.u.vcpuaffinity.cpumap.nr_elems = cpumap->nr_cpus;
+
+    if ( xc_cpumap_lock_pages(cpumap) != 0 )
+    {
+        PERROR("Could not lock memory for Xen hypercall");
+        return -1;
+    }
+
+    rc = do_domctl(xch, &domctl);
+    xc_cpumap_unlock_pages(cpumap);
+
+    return rc;
+}
+
+/* Per-vnode pinning: every vcpu set in a vnode's vcpu_mask gets the
+ * cpu_mask of that vnode's machine node as its affinity.
+ * Returns 0 on success, -1 as soon as one affinity call fails.
+ * NOTE(review): mnode_id is used directly as an index into
+ * phys_layout->node_data - this presumes node_data[] is ordered by node id
+ * in the layout passed in; confirm against the caller. */
+static int
+xc_domain_numa_pinvcpus_guest_numa(xc_interface *xch,
+                                xc_domain_numa_layout_t *dom_layout,
+                                xc_machine_numa_layout_t *phys_layout)
+{
+    int vn;
+
+    for (vn = 0; vn < dom_layout->nr_vnodes; vn++)
+    {
+        xc_vnode_data_t *vdata = &dom_layout->vnode_data[vn];
+        int machine_node = vdata->mnode_id;
+        struct xenctl_cpumap pcpus, vcpus;
+        int v;
+
+        xc_cpumap_from_cpumask(&pcpus,
+                    &phys_layout->node_data[machine_node].cpu_mask);
+        xc_cpumap_from_cpumask(&vcpus, &vdata->vcpu_mask);
+
+        xc_for_each_cpu(v, vcpus)
+        {
+            if (xc_domain_numa_vcpu_setaffinity(
+                        xch, dom_layout->domid, v, &pcpus) != 0)
+            {
+                ERROR( "%s:xc_vcpu_setaffinity failed\n", __FUNCTION__);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Domain-wide pinning: build the union of the cpu_masks of all machine
+ * nodes backing the domain's vnodes and use it as the affinity of every
+ * vcpu of the domain.
+ * Returns 0 on success, -1 as soon as one affinity call fails.
+ * NOTE(review): mnode_id is used directly as an index into
+ * phys_layout->node_data - this presumes node_data[] is ordered by node id
+ * in the layout passed in; confirm against the caller. */
+static int
+xc_domain_numa_pinvcpus_cross(xc_interface *xch,
+                                xc_domain_numa_layout_t *dom_layout,
+                                xc_machine_numa_layout_t *phys_layout)
+{
+    int vnode, vcpu;
+    xc_cpumask_t cross_cpumask;
+    struct xenctl_cpumap cross_cpumap;
+
+    /* cross_cpumap is backed by cross_cpumask; clear it before OR-ing the
+     * node masks in, the local mask starts out uninitialized. */
+    xc_cpumap_from_cpumask(&cross_cpumap, &cross_cpumask);
+    xc_cpumap_clearall(cross_cpumap);
+
+    for (vnode = 0; vnode < dom_layout->nr_vnodes; vnode++)
+    {
+        int mnode = dom_layout->vnode_data[vnode].mnode_id;
+        xc_cpumask_t *node_cpumask =
+                    &phys_layout->node_data[mnode].cpu_mask;
+        struct xenctl_cpumap node_cpumap;
+
+        xc_cpumap_from_cpumask(&node_cpumap, node_cpumask);
+        xc_cpumap_or(cross_cpumap, cross_cpumap, node_cpumap);
+    }
+
+    for (vcpu = 0; vcpu < dom_layout->nr_vcpus; vcpu++)
+    {
+        if (xc_domain_numa_vcpu_setaffinity(
+                    xch, dom_layout->domid, vcpu, &cross_cpumap)) 
+        {
+            /* Same wording as the per-vnode pinning path. */
+            ERROR( "%s:xc_vcpu_setaffinity failed\n", __FUNCTION__);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Pin the vcpus of a placed domain according to its layout type.
+ * CROSS and DONTCARE layouts give every vcpu the union of the cpus of all
+ * nodes used; every other type pins each vnode's vcpus to that vnode's node.
+ * Returns 0 on success, non-zero on failure. */
+int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout)
+{
+    xc_machine_numa_layout_t *machine;
+    int rc;
+
+    machine = malloc(sizeof(*machine));
+    if (machine == NULL)
+    {
+        ERROR( "%s: layout allocation failed\n", __FUNCTION__);
+        return -1;
+    }
+
+    rc = xc_get_machine_numa_layout(xch, machine);
+    if (rc != 0)
+    {
+        ERROR( "%s: xc_get_machine_numa_layout failed\n",
+                                                            __FUNCTION__);
+    }
+    else
+    {
+        switch (dom_layout->type)
+        {
+        case XEN_DOM_NUMA_CROSS:
+        case XEN_DOM_NUMA_DONTCARE:
+            rc = xc_domain_numa_pinvcpus_cross(xch, dom_layout, machine);
+            break;
+        default:
+            rc = xc_domain_numa_pinvcpus_guest_numa(xch, dom_layout, machine);
+            break;
+        }
+    }
+
+    free(machine);
+    return rc;
+}
+
+#undef set_xen_guest_handle
diff --git a/tools/libxc/xc_dom_numa.h b/tools/libxc/xc_dom_numa.h
new file mode 100755
index 0000000..764dcba
--- /dev/null
+++ b/tools/libxc/xc_dom_numa.h
@@ -0,0 +1,115 @@
+#ifndef __XC_DOM_NUMA_H
+#define __XC_DOM_NUMA_H
+
+#include "xenctrl.h"
+#include <xen/dom_numa.h>
+
+/* Number of bits in an xc_cpumask_t. */
+#define XC_CPUMASK_NR_CPUS XEN_MAX_VCPUS
+/* Capacity of the per-domain vnode arrays in xc_domain_numa_layout. */
+#define XC_MAX_VNODES 8
+
+#define XC_CPUMASK_BITS_PER_BYTE 8
+/* Number of bytes needed to hold 'bits' bits, rounded up. */
+#define XC_CPUMASK_BITS_TO_BYTES(bits) \
+    (((bits)+XC_CPUMASK_BITS_PER_BYTE-1)/XC_CPUMASK_BITS_PER_BYTE)
+/* Declares a byte array called 'name' large enough for 'bits' bits. */
+#define XC_CPUMASK_DECLARE_BITMAP(name,bits) \
+    uint8_t name[XC_CPUMASK_BITS_TO_BYTES(bits)]
+
+/* Fixed-size bitmap of XC_CPUMASK_NR_CPUS bits, stored as bytes in 'bits'.
+ * Used both for sets of vcpus and for sets of physical cpus. */
+struct xc_cpumask{ XC_CPUMASK_DECLARE_BITMAP(bits, XC_CPUMASK_NR_CPUS); };
+typedef struct xc_cpumask xc_cpumask_t;
+
+/* Initialise the xenctl_cpumap pointed to by 'map' so that it refers to the
+ * storage of the xc_cpumask pointed to by 'mask': nr_cpus is set to
+ * XC_CPUMASK_NR_CPUS and the bitmap handle is pointed at mask->bits.
+ * No bits are copied, so changes made through the map land in the mask.
+ * Expands to set_xen_guest_handle(), which must be defined where the macro
+ * is used. */
+#define xc_cpumap_from_cpumask(map, mask)               \
+do {                                                    \
+    (map)->nr_cpus = XC_CPUMASK_NR_CPUS;                    \
+    set_xen_guest_handle((map)->bitmap, (mask)->bits);  \
+}while(0)
+
+
+/* Description of one virtual node of a domain. */
+struct xc_vnode_data {
+    uint8_t vnode_id;       /* index of this vnode within the domain */
+    uint8_t mnode_id;       /* id of the machine node backing this vnode */
+    uint32_t nr_pages;      /* number of pages placed on this vnode */
+    xc_cpumask_t vcpu_mask; /* vnode_to_vcpumask */
+};
+typedef struct xc_vnode_data xc_vnode_data_t;
+
+/* NUMA layout of a domain: how its memory and vcpus are split into vnodes
+ * and which machine node backs each vnode. */
+struct xc_domain_numa_layout {
+    uint8_t version;
+    /* One of the XEN_DOM_NUMA_* layout types; set by the placement routine
+     * that succeeded. */
+    uint8_t type;
+
+    uint8_t nr_vcpus;
+    uint8_t nr_vnodes;
+
+    /* Total number of pages of the domain; placement may round it. */
+    uint32_t nr_pages;
+    /* Only (nr_vnodes) entries are filled */
+    xc_vnode_data_t vnode_data[XC_MAX_VNODES];
+    /* Only (nr_vnodes*nr_vnodes) entries are filled */
+    uint8_t vnode_distance[XC_MAX_VNODES*XC_MAX_VNODES];
+
+    /* For Internal USE only */
+    uint32_t domid;
+    /* One of the XC_DOM_NUMA_* placement strategies requested. */
+    uint16_t strategy;
+    uint16_t unit_size;
+};
+typedef struct xc_domain_numa_layout xc_domain_numa_layout_t;
+
+/* Capacity of the per-machine node arrays in xc_machine_numa_layout. */
+#define XC_MAX_NODES 16
+/* Description of one machine (physical) NUMA node. */
+struct xc_node_data {
+    /* Id of the node; entries may be reordered, so this is not
+     * necessarily the entry's array index. */
+    uint32_t node_id;
+    uint64_t size_pages;   /* total pages on the node */
+    uint64_t free_pages;   /* currently free pages on the node */
+    xc_cpumask_t cpu_mask; /* node_to_cpumask */
+};
+typedef struct xc_node_data xc_node_data_t;
+
+/* NUMA layout of the machine: totals, inter-node distances and per-node
+ * data. */
+struct xc_machine_numa_layout {
+    uint64_t size_pages;   /* total pages of the machine */
+    uint64_t free_pages;   /* currently free pages of the machine */
+
+    uint32_t nr_nodes;
+
+    /* Only (nr_nodes*nr_nodes) entries are filled */
+    /* Row-major; entry [a*nr_nodes + b] is read as the distance between
+     * nodes a and b. */
+    uint32_t node_distance[XC_MAX_NODES*XC_MAX_NODES];
+    /* Only (nr_nodes) entries are filled */
+    xc_node_data_t node_data[XC_MAX_NODES];
+};
+typedef struct xc_machine_numa_layout xc_machine_numa_layout_t;
+
+/* Print the expression text of 'layout' followed by the domain layout it
+ * evaluates to.
+ * NOTE(review): expands to a bare { } block, so "DEBUG_LAYOUT(x);" inside an
+ * unbraced if/else would not parse as one statement - check call sites
+ * before relying on that. */
+#define DEBUG_LAYOUT(layout) \
+{\
+	DEBUG_PRINT("layout(%s):\n",#layout);\
+	print_layout(layout);\
+}
+
+/* Print the contents of a domain NUMA layout. */
+void print_layout(xc_domain_numa_layout_t *layout);
+
+/* Same as DEBUG_LAYOUT, for a machine NUMA layout. */
+#define DEBUG_M_LAYOUT(layout) \
+{\
+	DEBUG_PRINT("layout(%s):\n",#layout);\
+	print_machine_layout(layout);\
+}
+	
+/* Print the contents of a machine NUMA layout. */
+void print_machine_layout(xc_machine_numa_layout_t *layout);
+
+/* Allocation and release of a domain NUMA layout. */
+extern xc_domain_numa_layout_t * xc_dom_alloc_numa_layout(xc_interface *xch, 
+        uint32_t domid, uint64_t nr_pages, xc_domain_numa_config_t *config);
+extern void xc_dom_free_numa_layout(xc_interface *xch, 
+                                        xc_domain_numa_layout_t *dom_layout);
+
+/* Choose machine nodes for the domain according to dom_layout->strategy and
+ * fill in the vnode description. Returns 0 on success, non-zero on
+ * failure. */
+extern int 
+xc_setup_numa_domain(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+/* Set the vcpu affinities of the domain according to the layout type and
+ * the machine nodes recorded in dom_layout. Returns 0 on success, non-zero
+ * on failure. */
+extern int
+xc_domain_numa_pinvcpus(xc_interface *xch, xc_domain_numa_layout_t *dom_layout);
+
+/* Number of vnodes the guest gets to see: nr_vnodes for a GUEST_NUMA
+ * layout, 0 for any other layout type or when no layout is given. */
+static inline int xc_domain_nr_vnodes(xc_domain_numa_layout_t * dom_layout)
+{
+    if (dom_layout && (dom_layout->type == XEN_DOM_NUMA_GUSET_NUMA))
+        return dom_layout->nr_vnodes;
+
+    return 0;
+}
+
+int xc_get_machine_numa_layout(xc_interface *xch, xc_machine_numa_layout_t *layout);
+
+void print_numa_info(xc_interface *xch);
+
+#endif
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
index d619f88..aff7f08 100644
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -24,6 +24,8 @@
 
 #include "xg_private.h"
 #include "xc_private.h"
+#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
 
 #include <xen/foreign/x86_32.h>
 #include <xen/foreign/x86_64.h>
@@ -46,7 +48,67 @@
 #define NR_SPECIAL_PAGES     5
 #define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
 
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+/* Replace any earlier definition of set_xen_guest_handle with a plain
+ * assignment to the handle's .p member; xc_cpumap_from_cpumask() expands to
+ * this macro. */
+#ifdef set_xen_guest_handle
+#undef set_xen_guest_handle
+#endif
+#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
+
+/* Fill the NUMA section of the hvm_info table from the domain layout.
+ * Copies the header fields, lays the vnodes out back to back in guest
+ * page-frame space (skipping the hole below 4G), records which vnode each
+ * vcpu belongs to and copies the vnode distance matrix.
+ * Returns NUMA_INFO_SIZE() of the filled structure. */
+static int build_hvm_numa_info(struct hvm_info_table *hvm_info, 
+                                        xc_domain_numa_layout_t *dlayout)
+{
+    struct xen_domain_numa_info *ninfo = &hvm_info->numa_info[0];
+    struct xen_vnode_info *vnode_tbl;
+    uint8_t *vcpu_to_vnode, *distance_tbl;
+    uint64_t pg_cursor = 0;
+    int vn, k;
+
+    /* The header must be filled before the sub-tables are located. */
+    ninfo->version = dlayout->version;
+    ninfo->type = dlayout->type;
+    ninfo->nr_vcpus = dlayout->nr_vcpus;
+    ninfo->nr_vnodes = dlayout->nr_vnodes;
+
+    vnode_tbl = NUMA_INFO_VNODE_INFO(ninfo);
+    vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
+    distance_tbl = NUMA_INFO_VNODE_DISTANCE(ninfo);
+
+    /* Start with every vcpu unassigned. */
+    for (k = 0; k < ninfo->nr_vcpus; k++)
+        vcpu_to_vnode[k] = XEN_INVALID_NODE;
+
+    for (vn = 0; vn < dlayout->nr_vnodes; vn++)
+    {
+        xc_vnode_data_t *src = &dlayout->vnode_data[vn];
+        struct xen_vnode_info *dst = &vnode_tbl[vn];
+        struct xenctl_cpumap vcpus;
+        uint64_t pg_end = pg_cursor + src->nr_pages;
+
+        dst->mnode_id = src->mnode_id;
+
+        /* Account for hole in the memory map: a vnode that starts below
+         * the end of low memory and reaches it is stretched over the
+         * hole. */
+        if ( (pg_cursor < hvm_info->low_mem_pgend) && 
+                            (pg_end >= hvm_info->low_mem_pgend) )
+            pg_end += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;
+
+        dst->start = pg_cursor;
+        dst->end = pg_end;
+        pg_cursor = pg_end;
+
+        xc_cpumap_from_cpumask(&vcpus, &src->vcpu_mask);
+        xc_for_each_cpu(k, vcpus)
+            vcpu_to_vnode[k] = vn;
+    }
+
+    for (vn = 0; vn < ninfo->nr_vnodes; vn++)
+    {
+        for (k = 0; k < ninfo->nr_vnodes; k++)
+        {
+            distance_tbl[(vn*ninfo->nr_vnodes)+k] =
+                    dlayout->vnode_distance[(vn*ninfo->nr_vnodes)+k];
+        }
+    }
+
+    return NUMA_INFO_SIZE(ninfo);
+}
+
+static void build_hvm_info(void *hvm_info_page, uint64_t mem_size, 
+                                        xc_domain_numa_layout_t *dom_layout)
 {
     struct hvm_info_table *hvm_info = (struct hvm_info_table *)
         (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
@@ -77,6 +139,12 @@ static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
     hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
     hvm_info->reserved_mem_pgstart = special_pfn(0);
 
+	 if ( dom_layout && ( dom_layout->type == XEN_DOM_NUMA_GUSET_NUMA))
+	 {
+		 hvm_info->numa_enabled = 1;
+		hvm_info->length += build_hvm_numa_info(hvm_info, dom_layout);
+	 }
+
     /* Finish with the checksum. */
     for ( i = 0, sum = 0; i < hvm_info->length; i++ )
         sum += ((uint8_t *)hvm_info)[i];
@@ -131,206 +199,295 @@ static int check_mmio_hole(uint64_t start, uint64_t memsize)
         return 1;
 }
 
-static int setup_guest(xc_interface *xch,
-                       uint32_t dom, int memsize, int target,
-                       char *image, unsigned long image_size)
+#define INVALID_NODE (~0)
+static int __setup_guest_memory(xc_interface *xch, uint32_t dom,
+                        unsigned long nr_pages, unsigned long target_pages,
+                        unsigned long cur_pages, xen_pfn_t *page_array,
+                        int vga_hole, int node, int exact_node)
 {
-    xen_pfn_t *page_array = NULL;
-    unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long target_pages = (unsigned long)target << (20 - PAGE_SHIFT);
-    unsigned long entry_eip, cur_pages, cur_pfn;
-    void *hvm_info_page;
-    uint32_t *ident_pt;
-    struct elf_binary elf;
-    uint64_t v_start, v_end;
-    int rc;
-    xen_capabilities_info_t caps;
-    unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, 
-        stat_1gb_pages = 0;
+    unsigned long i, cur_pfn, rc = 0;
+    unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, stat_1gb_pages = 0;
+    unsigned int mem_flags = 0, node_flags = 0;
     int pod_mode = 0;
 
-    /* An HVM guest must be initialised with at least 2MB memory. */
-    if ( memsize < 2 || target < 2 )
-        goto error_out;
-
-    if ( memsize > target )
+    if ( nr_pages > target_pages )
+    {
         pod_mode = 1;
+        mem_flags |= XENMEMF_populate_on_demand;
+    }
+    if ( node != INVALID_NODE )
+        node_flags = exact_node ? XENMEMF_exact_node(node) : XENMEMF_node(node);
+    mem_flags |= node_flags;
+    /*
+     * Populate guest pages [cur_pages, nr_pages) of page_array, tagging the
+     * requests with XENMEMF_node(node) or, when exact_node is set, with
+     * XENMEMF_exact_node(node).  With vga_hole set, the first 0xa0 pages
+     * are populated up front (4kB extents, order 0, node flags only) and
+     * the VGA hole 0xA0000-0xC0000 is skipped.  Returns 0 on success, -1.
+     *
+     * We attempt to allocate 1GB pages if possible, fall back on 2MB pages
+     * if that fails, and eventually use 4KB pages if both fail.  Under 2MB
+     * mode we allocate in batches of no more than 8MB so that we can be
+     * preempted and dom0 remains responsive. */
+    if ( vga_hole )
+    {
+        rc = xc_domain_populate_physmap_exact(
+            xch, dom, 0xa0, 0, node_flags, &page_array[0x00]);
+        cur_pages = 0xc0;
+        stat_normal_pages = 0xc0;
+    }
+    while ( (rc == 0) && (nr_pages > cur_pages) )
+    {
+        /* Clip count to maximum 1GB extent. */
+        unsigned long count = nr_pages - cur_pages;
+        unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
+
+        if ( count > max_pages )
+            count = max_pages;
+
+        cur_pfn = page_array[cur_pages];
+
+        /* Take care the corner cases of super page tails */
+        if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+             (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
+            count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
+        else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+                  (count > SUPERPAGE_1GB_NR_PFNS) )
+            count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
+
+        /* Attempt to allocate 1GB super page. Because in each pass we only
+         * allocate at most 1GB, we don't have to clip super page boundaries.
+         */
+        if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
+             /* Check if there exists MMIO hole in the 1GB memory range */
+             !check_mmio_hole(cur_pfn << PAGE_SHIFT,
+                              SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT) )
+        {
+            long done;
+            unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
+            xen_pfn_t sp_extents[nr_extents];
+
+            for ( i = 0; i < nr_extents; i++ )
+                sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
+
+            done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_1GB_SHIFT,
+                                              mem_flags, sp_extents);
+
+            if ( done > 0 )
+            {
+                stat_1gb_pages += done;
+                done <<= SUPERPAGE_1GB_SHIFT;
+                cur_pages += done;
+                count -= done;
+            }
+        }
+
+        if ( count != 0 )
+        {
+            /* Clip count to maximum 8MB extent. */
+            max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
+            if ( count > max_pages )
+                count = max_pages;
+
+            /* Clip partial superpage extents to superpage boundaries. */
+            if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+                 (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
+                count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
+            else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+                      (count > SUPERPAGE_2MB_NR_PFNS) )
+                count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
+
+            /* Attempt to allocate superpage extents. */
+            if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
+            {
+                long done;
+                unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
+                xen_pfn_t sp_extents[nr_extents];
+
+                for ( i = 0; i < nr_extents; i++ )
+                    sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+
+                done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_2MB_SHIFT,
+                                                  mem_flags, sp_extents);
+
+                if ( done > 0 )
+                {
+                    stat_2mb_pages += done;
+                    done <<= SUPERPAGE_2MB_SHIFT;
+                    cur_pages += done;
+                    count -= done;
+                }
+            }
+        }
+
+        /* Fall back to 4kB extents. */
+        if ( count != 0 )
+        {
+            rc = xc_domain_populate_physmap_exact(
+                xch, dom, count, 0, node_flags, &page_array[cur_pages]);
+            cur_pages += count;
+            stat_normal_pages += count;
+        }
+    }
+
+    /* Subtract 0x20 from target_pages for the VGA "hole".  Xen will
+     * adjust the PoD cache size so that domain tot_pages will be
+     * target_pages - 0x20 after this call. */
+    if ( (rc == 0) && pod_mode )
+        rc = xc_domain_set_pod_target(xch, dom, target_pages - 0x20,
+                                      NULL, NULL, NULL);
+
+    if ( rc != 0 )
+    {
+        PERROR("Could not allocate memory for HVM guest.");
+        goto error_out;
+    }
+
+    IPRINTF("PHYSICAL MEMORY ALLOCATION (NODE %d):\n"
+            "  4KB PAGES: 0x%016lx\n"
+            "  2MB PAGES: 0x%016lx\n"
+            "  1GB PAGES: 0x%016lx\n",
+            node, stat_normal_pages, stat_2mb_pages, stat_1gb_pages);
+
+    return 0;
+
+error_out:
+    /* page_array belongs to the caller, which frees it on every path;
+     * releasing it here as well would make that a double free. */
+    return -1;
+
+}
 
-    memset(&elf, 0, sizeof(elf));
-    if ( elf_init(&elf, image, image_size) != 0 )
-        goto error_out;
-    elf_parse_binary(&elf);
-    v_start = 0;
-    v_end = (unsigned long long)memsize << 20;
-
-    if ( xc_version(xch, XENVER_capabilities, &caps) != 0 )
+static int setup_guest_numa_cross(xc_interface *xch,
+                xc_domain_numa_layout_t *dom_layout, xen_pfn_t *page_array)
+{
+    int vnode, rc=0;
+    unsigned long cur_pages, nr_pages;
+    /* Deal unit_size-page chunks round-robin; a private copy counts down */
+    xc_domain_numa_layout_t *layout;
+    if (!(layout = malloc(sizeof(*layout))))
     {
-        PERROR("Could not get Xen capabilities");
-        goto error_out;
+        PERROR("%s : Failed malloc.", __FUNCTION__);
+        return -1;
     }
+    memcpy(layout, dom_layout, sizeof(*layout));
 
-    if ( (elf.pstart & (PAGE_SIZE - 1)) != 0 )
+    for (vnode=0, cur_pages=0, nr_pages=0;
+                            cur_pages<layout->nr_pages && !rc; vnode++)
     {
-        PERROR("Guest OS must load to a page boundary.");
-        goto error_out;
+        unsigned long allocsz;
+        xc_vnode_data_t *vnode_data;
+        /* The loop increment may step past the last vnode: wrap first. */
+        if (vnode >= layout->nr_vnodes)
+            vnode = 0;
+        while (!layout->vnode_data[vnode].nr_pages)
+            if (++vnode >= layout->nr_vnodes)
+                vnode = 0;
+        vnode_data = &layout->vnode_data[vnode];
+        allocsz = layout->unit_size;
+        if (allocsz > vnode_data->nr_pages)
+            allocsz = vnode_data->nr_pages;
+        /* NOTE(review): first chunk assumes allocsz >= 0xc0 - confirm */
+        nr_pages = cur_pages + allocsz;
+        rc = __setup_guest_memory(xch, layout->domid, nr_pages, nr_pages,
+                cur_pages, page_array, !cur_pages, vnode_data->mnode_id, 1);
+        vnode_data->nr_pages -= allocsz;
+        cur_pages = nr_pages;
     }
+    free(layout);
+    return rc;
+}
 
-    IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n"
-            "  Loader:        %016"PRIx64"->%016"PRIx64"\n"
-            "  TOTAL:         %016"PRIx64"->%016"PRIx64"\n"
-            "  ENTRY ADDRESS: %016"PRIx64"\n",
-            elf.pstart, elf.pend,
-            v_start, v_end,
-            elf_uval(&elf, elf.ehdr, e_entry));
+static int setup_guest_numa_memory(xc_interface *xch, 
+                xc_domain_numa_layout_t *dom_layout, xen_pfn_t *page_array)
+{
+    int vnode, rc;
+    unsigned long cur_pages, nr_pages;
 
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    if ((rc = xc_setup_numa_domain(xch, dom_layout)))
+        goto setup_done;
+
+    if (dom_layout->type == XEN_DOM_NUMA_CROSS)
     {
-        PERROR("Could not allocate memory.");
-        goto error_out;
+        rc = setup_guest_numa_cross(xch, dom_layout, page_array);
+        goto setup_done;
     }
 
-    for ( i = 0; i < nr_pages; i++ )
-        page_array[i] = i;
-    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
-        page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
-
-    /*
-     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     *
-     * We attempt to allocate 1GB pages if possible. It falls back on 2MB
-     * pages if 1GB allocation fails. 4KB pages will be used eventually if
-     * both fail.
-     * 
-     * Under 2MB mode, we allocate pages in batches of no more than 8MB to 
-     * ensure that we can be preempted and hence dom0 remains responsive.
-     */
-    rc = xc_domain_populate_physmap_exact(
-        xch, dom, 0xa0, 0, 0, &page_array[0x00]);
-    cur_pages = 0xc0;
-    stat_normal_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
+    /* XXX: pod is turned off with NUMA allocation for now */
+    for (vnode=0, cur_pages=0, nr_pages=0; 
+                            vnode<dom_layout->nr_vnodes && !rc; vnode++)
     {
-        /* Clip count to maximum 1GB extent. */
-        unsigned long count = nr_pages - cur_pages;
-        unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
-
-        if ( count > max_pages )
-            count = max_pages;
-
-        cur_pfn = page_array[cur_pages];
-
-        /* Take care the corner cases of super page tails */
-        if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
-             (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
-            count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
-        else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
-                  (count > SUPERPAGE_1GB_NR_PFNS) )
-            count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
-
-        /* Attemp to allocate 1GB super page. Because in each pass we only
-         * allocate at most 1GB, we don't have to clip super page boundaries.
-         */
-        if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
-             /* Check if there exists MMIO hole in the 1GB memory range */
-             !check_mmio_hole(cur_pfn << PAGE_SHIFT,
-                              SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT) )
-        {
-            long done;
-            unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
-            xen_pfn_t sp_extents[nr_extents];
-
-            for ( i = 0; i < nr_extents; i++ )
-                sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
-
-            done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_1GB_SHIFT,
-                                              pod_mode ? XENMEMF_populate_on_demand : 0,
-                                              sp_extents);
-
-            if ( done > 0 )
-            {
-                stat_1gb_pages += done;
-                done <<= SUPERPAGE_1GB_SHIFT;
-                cur_pages += done;
-                count -= done;
-            }
-        }
+        xc_vnode_data_t *vnode_data = &dom_layout->vnode_data[vnode];
 
-        if ( count != 0 )
-        {
-            /* Clip count to maximum 8MB extent. */
-            max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
-            if ( count > max_pages )
-                count = max_pages;
-            
-            /* Clip partial superpage extents to superpage boundaries. */
-            if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
-                 (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
-                count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
-            else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
-                      (count > SUPERPAGE_2MB_NR_PFNS) )
-                count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
-
-            /* Attempt to allocate superpage extents. */
-            if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
-            {
-                long done;
-                unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
-                xen_pfn_t sp_extents[nr_extents];
-
-                for ( i = 0; i < nr_extents; i++ )
-                    sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
-
-                done = xc_domain_populate_physmap(xch, dom, nr_extents, SUPERPAGE_2MB_SHIFT,
-                                                  pod_mode ? XENMEMF_populate_on_demand : 0,
-                                                  sp_extents);
-
-                if ( done > 0 )
-                {
-                    stat_2mb_pages += done;
-                    done <<= SUPERPAGE_2MB_SHIFT;
-                    cur_pages += done;
-                    count -= done;
-                }
-            }
-        }
-
-        /* Fall back to 4kB extents. */
-        if ( count != 0 )
-        {
-            rc = xc_domain_populate_physmap_exact(
-                xch, dom, count, 0, 0, &page_array[cur_pages]);
-            cur_pages += count;
-            stat_normal_pages += count;
-        }
+        nr_pages = cur_pages + vnode_data->nr_pages;
+        rc = __setup_guest_memory(xch, dom_layout->domid, nr_pages, nr_pages,
+                    cur_pages, page_array, (vnode == 0), vnode_data->mnode_id, 
+                    (dom_layout->type != XEN_DOM_NUMA_DONTCARE));
+        cur_pages = nr_pages;
     }
+setup_done:
+    if (!rc)
+        rc = xc_domain_numa_pinvcpus(xch, dom_layout);
+    if (!rc)
+        rc = xc_domain_disable_migrate(xch, dom_layout->domid);
+    return rc;
+}
 
-    /* Subtract 0x20 from target_pages for the VGA "hole".  Xen will
-     * adjust the PoD cache size so that domain tot_pages will be
-     * target_pages - 0x20 after this call. */
-    if ( pod_mode )
-        rc = xc_domain_set_pod_target(xch, dom, target_pages - 0x20,
-                                      NULL, NULL, NULL);
+static int setup_guest_nonnuma_memory(xc_interface *xch, uint32_t domid,
+                unsigned long nr_pages, unsigned long target_pages, 
+                xen_pfn_t *page_array)
+{
+    return __setup_guest_memory(xch, domid, nr_pages, target_pages, 0, 
+            page_array, 1, INVALID_NODE, 0);
+}
+
+static int setup_guest_memory(xc_interface *xch, uint32_t dom,
+                            xc_domain_numa_layout_t *dom_layout,
+                            unsigned long nr_pages, unsigned long target_pages,
+                            struct elf_binary *elf)
+{
+    xen_pfn_t *page_array = NULL;
+    unsigned long i;
+    int rc;
 
-    if ( rc != 0 )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
     {
-        PERROR("Could not allocate memory for HVM guest.");
-        goto error_out;
+        rc = -1;
+        PERROR("Could not allocate memory.");
+        goto out;
     }
 
-    IPRINTF("PHYSICAL MEMORY ALLOCATION:\n"
-            "  4KB PAGES: 0x%016lx\n"
-            "  2MB PAGES: 0x%016lx\n"
-            "  1GB PAGES: 0x%016lx\n",
-            stat_normal_pages, stat_2mb_pages, stat_1gb_pages);
-    
-    if ( loadelfimage(xch, &elf, dom, page_array) != 0 )
-        goto error_out;
+    for ( i = 0; i < nr_pages; i++ )
+        page_array[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
+        page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    if ( dom_layout )
+        rc = setup_guest_numa_memory(xch, dom_layout, page_array);
+    else
+        rc = setup_guest_nonnuma_memory(xch, dom, 
+                                nr_pages, target_pages, page_array);
+    if ( rc )
+        goto out;
+    rc = loadelfimage(xch, elf, dom, page_array);
+out:
+    if ( page_array )
+        free(page_array);
+    return rc;
+}
 
-    if ( (hvm_info_page = xc_map_foreign_range(
+static int setup_guest_special_pages(xc_interface *xch, uint32_t dom, 
+                    uint64_t memsize, xc_domain_numa_layout_t *dom_layout)
+{
+    void *hvm_info_page;
+    uint32_t *ident_pt;
+    unsigned long i;
+	int rc=0;
+	if ( (hvm_info_page = xc_map_foreign_range(
               xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
               HVM_INFO_PFN)) == NULL )
         goto error_out;
-    build_hvm_info(hvm_info_page, v_end);
+    build_hvm_info(hvm_info_page, memsize, dom_layout);
     munmap(hvm_info_page, PAGE_SIZE);
 
     /* Allocate and clear special pages. */
@@ -370,6 +527,62 @@ static int setup_guest(xc_interface *xch,
     munmap(ident_pt, PAGE_SIZE);
     xc_set_hvm_param(xch, dom, HVM_PARAM_IDENT_PT,
                      special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
+	return 0;
+	error_out:
+    	return -1;
+}
+
+static int setup_guest(xc_interface *xch,
+                       uint32_t dom, int memsize, int target,
+                       xc_domain_numa_layout_t  *dom_layout,
+                       char *image, unsigned long image_size)
+{
+    unsigned long entry_eip;
+    struct elf_binary elf;
+    uint64_t v_start, v_end;
+    int rc;
+    xen_capabilities_info_t caps;
+
+    /* An HVM guest must be initialised with at least 2MB memory. */
+    if ( memsize < 2 || target < 2 )
+        goto error_out;
+
+    memset(&elf, 0, sizeof(elf));
+    if ( elf_init(&elf, image, image_size) != 0 )
+        goto error_out;
+    elf_parse_binary(&elf);
+    v_start = 0;
+    v_end = (unsigned long long)memsize << 20;
+
+    if ( xc_version(xch, XENVER_capabilities, &caps) != 0 )
+    {
+        PERROR("Could not get Xen capabilities");
+        goto error_out;
+    }
+
+    if ( (elf.pstart & (PAGE_SIZE - 1)) != 0 )
+    {
+        PERROR("Guest OS must load to a page boundary.");
+        goto error_out;
+    }
+
+    IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n"
+            "  Loader:        %016"PRIx64"->%016"PRIx64"\n"
+            "  TOTAL:         %016"PRIx64"->%016"PRIx64"\n"
+            "  ENTRY ADDRESS: %016"PRIx64"\n",
+            elf.pstart, elf.pend,
+            v_start, v_end,
+            elf_uval(&elf, elf.ehdr, e_entry));
+
+    rc = setup_guest_memory(xch, dom, dom_layout, 
+                    (unsigned long)memsize << (20 - PAGE_SHIFT),
+                    (unsigned long)target << (20 - PAGE_SHIFT), &elf);
+    if ( rc < 0 )
+        goto error_out;
+
+    rc = setup_guest_special_pages(xch, dom, v_end, dom_layout);
+    if ( rc < 0 )
+        goto error_out;
 
     /* Insert JMP <rel32> instruction at address 0x0 to reach entry point. */
     entry_eip = elf_uval(&elf, elf.ehdr, e_entry);
@@ -384,11 +597,9 @@ static int setup_guest(xc_interface *xch,
         munmap(page0, PAGE_SIZE);
     }
 
-    free(page_array);
     return 0;
 
  error_out:
-    free(page_array);
     return -1;
 }
 
@@ -396,16 +607,27 @@ static int xc_hvm_build_internal(xc_interface *xch,
                                  uint32_t domid,
                                  int memsize,
                                  int target,
+                                 xc_domain_numa_config_t *numa_config,
                                  char *image,
                                  unsigned long image_size)
 {
+	int rc;
+	xc_domain_numa_layout_t  *dom_layout = 0;
+
     if ( (image == NULL) || (image_size == 0) )
     {
         ERROR("Image required");
         return -1;
     }
 
-    return setup_guest(xch, domid, memsize, target, image, image_size);
+	 if ( numa_config )
+		 dom_layout = xc_dom_alloc_numa_layout(xch, domid, 
+						 (uint64_t)memsize << (20 - PAGE_SHIFT), numa_config);
+	 rc = setup_guest(xch, domid, memsize, target, dom_layout,
+														 image, image_size);
+	 if ( dom_layout )
+		 xc_dom_free_numa_layout(xch, dom_layout);
+	 return rc;
 }
 
 /* xc_hvm_build:
@@ -424,7 +646,7 @@ int xc_hvm_build(xc_interface *xch,
          ((image = xc_read_image(xch, image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, image, image_size);
+    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, NULL, image, image_size);
 
     free(image);
 
@@ -442,6 +664,7 @@ int xc_hvm_build_target_mem(xc_interface *xch,
                            uint32_t domid,
                            int memsize,
                            int target,
+                           xc_domain_numa_config_t *numa_config,
                            const char *image_name)
 {
     char *image;
@@ -452,7 +675,7 @@ int xc_hvm_build_target_mem(xc_interface *xch,
          ((image = xc_read_image(xch, image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xch, domid, memsize, target, image, image_size);
+    sts = xc_hvm_build_internal(xch, domid, memsize, target, numa_config, image, image_size);
 
     free(image);
 
@@ -487,7 +710,7 @@ int xc_hvm_build_mem(xc_interface *xch,
         return -1;
     }
 
-    sts = xc_hvm_build_internal(xch, domid, memsize, memsize,
+    sts = xc_hvm_build_internal(xch, domid, memsize, memsize, NULL,
                                 img, img_len);
 
     /* xc_inflate_buffer may return the original buffer pointer (for
@@ -499,6 +722,8 @@ int xc_hvm_build_mem(xc_interface *xch,
     return sts;
 }
 
+#undef set_xen_guest_handle
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c
index 09c8f23..2dbba3c 100644
--- a/tools/libxc/xc_private.c
+++ b/tools/libxc/xc_private.c
@@ -47,6 +47,19 @@
  *  0 - on success
  * -1 - on error
  */
+
+void print_xch(xc_interface *xch)
+{   /* Debug aid: dump error text and hypercall-buffer counters to stdout. */
+    printf("\ncurrently_progress_reporting=%s\n", xch->currently_progress_reporting ? xch->currently_progress_reporting : "(null)"); /* %s must not be given NULL */
+    printf("last_error=%s\n",xch->last_error.message);
+    printf("hypercall_buffer_cache_nr=%d\n",xch->hypercall_buffer_cache_nr);
+    printf("hypercall_buffer_total_allocations=%d\n",xch->hypercall_buffer_total_allocations);
+    printf("hypercall_buffer_maximum_allocations=%d\n",xch->hypercall_buffer_maximum_allocations);
+    printf("hypercall_buffer_cache_hits=%d\n",xch->hypercall_buffer_cache_hits);
+    printf("hypercall_buffer_cache_misses=%d\n",xch->hypercall_buffer_cache_misses);
+    printf("hypercall_buffer_cache_toobig=%d\n",xch->hypercall_buffer_cache_toobig);
+}
+
 static int xc_osdep_get_info(xc_interface *xch, xc_osdep_info_t *info)
 {
     int rc = -1;
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index 3687561..53902ec 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -102,6 +102,7 @@ struct xc_interface_core {
     xc_osdep_handle  ops_handle; /* opaque data for xc_osdep_ops */
 };
 
+void print_xch(xc_interface *xch);
 void xc_report_error(xc_interface *xch, int code, const char *fmt, ...);
 void xc_reportv(xc_interface *xch, xentoollog_logger *lg, xentoollog_level,
                 int code, const char *fmt, va_list args)
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index dc2561e..2833d59 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -122,7 +122,6 @@
 
 #define DEBUG_MARK() DEBUG_PRINT("mark(line:%d)\n",__LINE__)
 
-
 typedef struct xc_interface_core xc_interface;
 typedef struct xc_interface_core xc_evtchn;
 typedef struct xc_interface_core xc_gnttab;
@@ -414,6 +413,24 @@ typedef union
     start_info_t s;
 } start_info_any_t;
 
+/**
+ * struct xc_domain_numa_config : Carries information required for NUMA memory
+ * allocation for the guests. The XC_DOM_NUMA_* values below name the strategy.
+ */
+#define XC_DOM_NUMA_AUTO     	0  /* Let the allocator choose */
+#define XC_DOM_NUMA_CLUSTER 	1
+#define XC_DOM_NUMA_GUEST_NUMA  2
+#define XC_DOM_NUMA_CROSS   	3
+#define XC_DOM_NUMA_NONE     	4
+
+#define XC_DOM_NUMA_DEF_UNIT_SIZE 32    /* in 4K pages */  
+
+typedef struct xc_domain_numa_config
+{
+    uint32_t strategy;      /* One of XC_DOM_NUMA_*; libxl initialises it to NONE */
+    uint32_t nr_nodes;      /* For GUEST_NUMA/CROSS */
+    uint32_t unit_size;   /* For CROSS only; presumably in 4K pages - TODO confirm */
+} xc_domain_numa_config_t;
 
 int xc_domain_create(xc_interface *xch,
                      uint32_t ssidref,
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
index 9ed0ea4..a87ca6f 100644
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -173,6 +173,7 @@ int xc_hvm_build_target_mem(xc_interface *xch,
                             uint32_t domid,
                             int memsize,
                             int target,
+							xc_domain_numa_config_t *numa_config,
                             const char *image_name);
 
 int xc_hvm_build_mem(xc_interface *xch,
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 0dc6319..a527152 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -136,6 +136,8 @@
 
 #define LIBXL_DTOR_POISON 0xa5
 
+typedef xc_domain_numa_config_t libxl_domain_numa_config;
+
 typedef uint8_t libxl_mac[6];
 
 typedef char **libxl_string_list;
diff --git a/tools/libxl/libxl.idl b/tools/libxl/libxl.idl
index 0c777d7..11f036c 100644
--- a/tools/libxl/libxl.idl
+++ b/tools/libxl/libxl.idl
@@ -6,6 +6,7 @@
 libxl_ctx = Builtin("ctx")
 libxl_uuid = Builtin("uuid")
 libxl_mac = Builtin("mac")
+xc_domain_numa_config = Builtin("domain_numa_config");
 libxl_cpumap = Builtin("cpumap", destructor_fn="libxl_cpumap_destroy", passby=PASS_BY_REFERENCE)
 libxl_nodemap = Builtin("nodemap", destructor_fn="libxl_nodemap_destroy", passby=PASS_BY_REFERENCE)
 libxl_cpuarray = Builtin("cpuarray", destructor_fn="libxl_cpuarray_destroy", passby=PASS_BY_REFERENCE)
@@ -102,6 +103,7 @@ libxl_domain_build_info = Struct("domain_build_info",[
     ("disable_migrate", bool),
     ("kernel",          libxl_file_reference),
     ("cpuid",           libxl_cpuid_policy_list),
+    ("numa_config",		xc_domain_numa_config),
     ("hvm",             integer),
     ("u", KeyedUnion(None, "hvm",
                 [("hvm", "%s", Struct(None,
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 22e6006..face10d 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -103,6 +103,7 @@ void libxl_init_build_info(libxl_domain_build_info *b_info, libxl_domain_create_
     } else {
         b_info->u.pv.slack_memkb = 8 * 1024;
     }
+		b_info->numa_config.strategy = XC_DOM_NUMA_NONE;
 }
 
 void libxl_init_dm_info(libxl_device_model_info *dm_info,
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 0413dfd..8046ef6 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -901,6 +901,7 @@ int libxl__build_hvm(libxl_ctx *ctx, uint32_t domid,
         domid,
         (info->max_memkb - info->video_memkb) / 1024,
         (info->target_memkb - info->video_memkb) / 1024,
+        &info->numa_config,
         libxl__abs_path(&gc, (char *)info->kernel.path,
                        libxl_xenfirmwaredir_path()));
     if (ret) {
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 083efc9..006dc7b 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -339,6 +339,37 @@ static void dolog(const char *file, int line, const char *func, char *fmt, ...)
         libxl_write_exactly(NULL, logfile, s, rc, NULL, NULL);
 }
 
+static char *numa_val_to_str(uint32_t val)
+{
+    /* Printable name of an XC_DOM_NUMA_* strategy; any value not listed
+     * below reads as "NONE". */
+    char *name = "NONE";
+
+    if (val == XC_DOM_NUMA_AUTO)
+        name = "AUTO";
+    else if (val == XC_DOM_NUMA_CLUSTER)
+        name = "CLUSTER";
+    else if (val == XC_DOM_NUMA_GUEST_NUMA)
+        name = "GUEST NUMA";
+    else if (val == XC_DOM_NUMA_CROSS)
+        name = "CROSS";
+    return name;
+}
+
+static uint32_t numa_str_to_val(const char *str)
+{
+    uint32_t val = XC_DOM_NUMA_NONE;    /* unrecognised names map to NONE */
+    if (strcasecmp(str, "AUTO") == 0)
+        val = XC_DOM_NUMA_AUTO;
+    else if (strcasecmp(str, "CLUSTER") == 0)
+        val = XC_DOM_NUMA_CLUSTER;
+    else if (strcasecmp(str, "CROSS") == 0)
+        val = XC_DOM_NUMA_CROSS;
+    else if (!strcasecmp(str, "GUEST_NUMA") || !strcasecmp(str, "GUEST NUMA") || !strcasecmp(str, "GUESTNUMA"))
+        val = XC_DOM_NUMA_GUEST_NUMA;
+    return val;
+}
+
 static void printf_info(int domid,
                         libxl_domain_config *d_config,
                         libxl_device_model_info *dm_info)
@@ -383,6 +414,10 @@ static void printf_info(int domid,
     printf("\t(max_memkb %d)\n", b_info->max_memkb);
     printf("\t(target_memkb %d)\n", b_info->target_memkb);
     printf("\t(nomigrate %d)\n", b_info->disable_migrate);
+    printf("\t(numa_strategy %s)\n", 
+                            numa_val_to_str(b_info->numa_config.strategy));
+    printf("\t(numa_nodes %d)\n", b_info->numa_config.nr_nodes);
+    printf("\t(unit_size %d)\n", b_info->numa_config.unit_size);
 
     if (!c_info->hvm && b_info->u.pv.bootloader) {
         printf("\t(bootloader %s)\n", b_info->u.pv.bootloader);
@@ -1276,6 +1311,8 @@ static void parse_config_data(const char *configfile_filename_report,
         fprintf(stderr, "Illegal pool specified\n");
         exit(1);
     }
+	
+    libxl_init_build_info(b_info, c_info);
 
     /* the following is the actual config parsing with overriding values in the structures */
     if (!xlu_cfg_get_long (config, "vcpus", &l)) {
@@ -1465,6 +1502,14 @@ static void parse_config_data(const char *configfile_filename_report,
     if (!xlu_cfg_get_long (config, "videoram", &l))
         b_info->video_memkb = l * 1024;
 
+    if (!xlu_cfg_get_string (config, "strategy", &buf)) {
+        b_info->numa_config.strategy = numa_str_to_val(buf);
+        if (!xlu_cfg_get_long (config, "vnodes", &l))
+            b_info->numa_config.nr_nodes = l;
+        if (!xlu_cfg_get_long (config, "uintsz", &l))
+            b_info->numa_config.unit_size = l;
+    }
+
     xlu_cfg_replace_string (config, "kernel", &b_info->kernel.path);
 
     if (!xlu_cfg_get_long (config, "gfx_passthru", &l))
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 2600b90..4820288 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -981,7 +981,7 @@ static PyObject *pyxc_hvm_build(XcObject *self,
         target = memsize;
 
     if ( xc_hvm_build_target_mem(self->xc_handle, dom, memsize,
-                                 target, image) != 0 )
+                                 target, NULL, image) != 0 )
         return pyxc_error_to_exception(self->xc_handle);
 
 #if !defined(__ia64__)
diff --git a/tools/python/xen/lowlevel/xl/xl.c b/tools/python/xen/lowlevel/xl/xl.c
index 1b55937..746f5dc 100644
--- a/tools/python/xen/lowlevel/xl/xl.c
+++ b/tools/python/xen/lowlevel/xl/xl.c
@@ -398,6 +398,16 @@ PyObject *attrib__struct_in_addr_get(struct in_addr *pptr)
     return NULL;
 }
 
+int attrib__libxl_domain_numa_config_set(PyObject *v, libxl_domain_numa_config *pptr)
+{
+	return 0;
+}
+
+PyObject *attrib__libxl_domain_numa_config_get(libxl_domain_numa_config *pptr)
+{
+	return NULL;
+}
+
 typedef struct {
     PyObject_HEAD;
     libxl_ctx ctx;
diff --git a/xen/include/public/arch-x86/dom_numa.h b/xen/include/public/arch-x86/dom_numa.h
new file mode 100755
index 0000000..c377ac9
--- /dev/null
+++ b/xen/include/public/arch-x86/dom_numa.h
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * dom_numa.h
+ *
+ * Guest NUMA common structures.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Author : Lab309
+ */
+#ifndef __XEN_PUBLIC_DOM_NUMA_X86_H__
+#define __XEN_PUBLIC_DOM_NUMA_X86_H__
+
+#define XEN_MAX_VCPUS 128 /* HVM_MAX_VCPUS (hvm_info_table.h) is defined as this value */
+
+/* vnodes are 1GB-aligned (2^30 bytes) */
+#define XEN_MIN_VNODE_SHIFT (30)
+#define XEN_INVALID_NODE (0xFF) /* NOTE(review): presumably marks an unassigned node id - confirm */
+
+struct xen_vnode_info { /* one entry per vnode: backing node plus page range */
+    uint8_t mnode_id; 	/* physical node this vnode is allocated from */
+    uint32_t start; 	/* start of the vnode range (unit: pages) */
+    uint32_t end; 		/* end of the vnode range (unit: pages); NOTE(review): inclusive vs. exclusive is not stated - confirm */
+};
+
+/* version : value for the 'version' field of struct xen_domain_numa_info */
+#define XEN_DOM_NUMA_INTERFACE_VERSION  0x01
+
+/* type : On NUMA platforms, the VM memory can be distributed across
+ * nodes in different ways; these are the values for the 'type' field.
+ */
+#define XEN_DOM_NUMA_CLUSTER   		0x01 /* Non-NUMA VM confined to a single node */
+#define XEN_DOM_NUMA_GUSET_NUMA    	0x02 /* NUMA VM: guest NUMA topology spans nodes. NOTE(review): "GUSET" looks like a typo for "GUEST" */
+#define XEN_DOM_NUMA_CROSS		    0x03 /* Non-NUMA VM distributed across nodes */
+#define XEN_DOM_NUMA_DONTCARE   	0x04 /* Ad-hoc allocation */
+
+/* xen_domain_numa_info : 
+ * For PV VMs, this is the NUMA enlightenment structure.
+ * For HVMs, this structure is shared with the domain builder (hvmloader).
+ * Size of data[] depends on nr_vnodes and nr_vcpus.
+ */
+
+/* Accessors for the variable-length data[] area: vnode_info[], then
+ * vcpu_to_vnode[], then vnode_distance[]. nr_vcpus and nr_vnodes must be set
+ * in the xen_domain_numa_info structure before using these macros. */
+#define NUMA_INFO_SIZE(pinfo)                                           \
+            (sizeof(*(pinfo))                                           \
+                + (pinfo)->nr_vnodes*sizeof(struct xen_vnode_info)      \
+                + (pinfo)->nr_vcpus*sizeof(uint8_t)                     \
+                + (pinfo)->nr_vnodes*(pinfo)->nr_vnodes*sizeof(uint8_t))
+
+#define NUMA_INFO_VNODE_INFO(pinfo)                                     \
+            ((struct xen_vnode_info *)((uint8_t *)(pinfo) + sizeof(*(pinfo))))
+
+#define NUMA_INFO_VCPU_TO_VNODE(pinfo)                                  \
+            ((uint8_t *)NUMA_INFO_VNODE_INFO(pinfo)                     \
+                + (pinfo)->nr_vnodes*sizeof(struct xen_vnode_info))
+
+#define NUMA_INFO_VNODE_DISTANCE(pinfo)                                 \
+            ((uint8_t *)NUMA_INFO_VCPU_TO_VNODE(pinfo)                  \
+                + (pinfo)->nr_vcpus*sizeof(uint8_t))
+
+struct xen_domain_numa_info {
+    uint8_t version;    /* Interface version */
+    uint8_t type;       /* VM memory allocation scheme (one of the XEN_DOM_NUMA_* type values) */
+
+    uint8_t nr_vcpus;   /* number of vcpu_to_vnode[] entries in data[] */
+    uint8_t nr_vnodes;  /* number of vnode_info[] entries in data[] */
+    /* data[] holds the following arrays, back to back, in this order:
+     * //Only (nr_vnodes) entries are filled, each sizeof(struct xen_vnode_info)
+     * struct xen_vnode_info vnode_info[nr_vnodes];
+     * //Only (nr_vcpus) entries are filled, each sizeof(uint8_t)
+     * uint8_t vcpu_to_vnode[nr_vcpus];
+     * //Only (nr_vnodes*nr_vnodes) entries are filled, each sizeof(uint8_t)
+     * uint8_t vnode_distance[nr_vnodes*nr_vnodes];
+     */
+	uint8_t data[0];    /* variable-length payload, laid out as listed above */
+};
+
+#endif
diff --git a/xen/include/public/dom_numa.h b/xen/include/public/dom_numa.h
new file mode 100755
index 0000000..d9750c0
--- /dev/null
+++ b/xen/include/public/dom_numa.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * dom_numa.h
+ *
+ * Guest NUMA common structures.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Author : Lab309
+ */
+
+#ifndef __XEN_PUBLIC_DOM_NUMA_H
+#define __XEN_PUBLIC_DOM_NUMA_H
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "./arch-x86/dom_numa.h"
+#else
+#error "unsupported architecture"
+#endif
+
+
+#endif
diff --git a/xen/include/public/hvm/hvm_info_table.h b/xen/include/public/hvm/hvm_info_table.h
index bdb5995..6c3d4cc 100644
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -25,12 +25,14 @@
 #ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
 #define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
 
+#include "../dom_numa.h"
+
 #define HVM_INFO_PFN         0x09F
 #define HVM_INFO_OFFSET      0x800
 #define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
 
 /* Maximum we can support with current vLAPIC ID mapping. */
-#define HVM_MAX_VCPUS        128
+#define HVM_MAX_VCPUS        XEN_MAX_VCPUS
 
 struct hvm_info_table {
     char        signature[8]; /* "HVM INFO" */
@@ -70,6 +72,12 @@ struct hvm_info_table {
 
     /* Bitmap of which CPUs are online at boot time. */
     uint8_t     vcpu_online[(HVM_MAX_VCPUS + 7)/8];
+
+    /* Domain NUMA memory distribution. The size of this structure should be
+     * obtained using the macro NUMA_INFO_SIZE(numa_info) from dom_numa.h.
+     */
+    uint8_t numa_enabled; /* numa_info is populated only if numa_enabled != 0 */
+    struct xen_domain_numa_info numa_info[0];
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index e8f0532..eee1e84 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -713,6 +713,7 @@ struct xenctl_bitmap {
     XEN_GUEST_HANDLE_64(uint8) bitmap;
     uint32_t nr_elems;
 };
+
 #endif
 
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2013-07-23  6:26 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-21 14:41 about the guest(Redhat6.3) shows white screen, but the suse, ubuntu is ok butine
2013-07-22 11:32 ` Dario Faggioli
2013-07-23  6:26   ` Matt Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.