* [XEN][vNUMA][PATCH 8/9] Construct SRAT/SLIT for NUMA HVM
From: Dulloor @ 2010-07-02 23:55 UTC
To: xen-devel
Construct SRAT/SLIT tables for HVM NUMA.
-dulloor
Signed-off-by: Dulloor <dulloor@gmail.com>
[-- Attachment #2: xen-08-numa-acpi-tables.patch --]
diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h b/tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -283,6 +283,66 @@ struct acpi_20_madt {
uint32_t flags;
};
+/*
+ * System Resource Affinity Table header definition (SRAT) (Version 3.0)
+ * X2APIC_CPU_AFFINITY is defined in version 4.0
+ */
+struct acpi_30_srat {
+ struct acpi_header header; /* Common ACPI table header */
+ uint32_t table_revision; /* Must be value '1' */
+ uint32_t reserved[2]; /* Reserved, must be zero */
+};
+#define ACPI_30_SRAT_TABLE_REVISION 0x1
+
+/* Values for type (in SRAT subtables) */
+enum acpi_30_srat_type {
+ ACPI_30_SRAT_TYPE_CPU_AFFINITY = 0,
+ ACPI_30_SRAT_TYPE_MEMORY_AFFINITY = 1,
+ ACPI_30_SRAT_TYPE_RESERVED = 2 /* 2 and greater are reserved */
+};
+
+/* type(0) : Processor Local APIC/SAPIC Affinity */
+struct acpi_30_srat_cpu_affinity {
+ uint8_t type;
+ uint8_t length;
+ uint8_t proximity_domain_lo;
+ uint8_t apic_id;
+ uint32_t flags;
+ uint8_t local_sapic_eid;
+ uint8_t proximity_domain_hi[3];
+ uint32_t reserved; /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_CPU_USE_AFFINITY (1) /* 00: Use affinity structure */
+
+/* 1: Memory Affinity */
+
+struct acpi_30_srat_mem_affinity {
+ uint8_t type;
+ uint8_t length;
+ uint32_t proximity_domain;
+ uint16_t reserved; /* Reserved, must be zero */
+ uint64_t base_address;
+ uint64_t size;
+ uint32_t reserved1;
+ uint32_t flags;
+ uint64_t reserved2; /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_MEM_ENABLED (1) /* 00: Memory region enabled */
+#define ACPI_30_SRAT_MEM_HOT_PLUGGABLE (1<<1) /* 01: Mem is hot pluggable */
+#define ACPI_30_SRAT_MEM_NON_VOLATILE (1<<2) /* 02: Mem is non-volatile */
+
+/*
+ * System Locality Information Table header definition (SLIT) (Version 1.0)
+ */
+struct acpi_10_slit {
+ struct acpi_header header;
+ uint64_t locality_count;
+ uint8_t entry[1];
+};
/*
* HPET Description Table
@@ -367,6 +427,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
#define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
#define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_3_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
+#define ACPI_1_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
/*
* Table revision numbers.
@@ -379,6 +441,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_TCPA_REVISION 0x02
#define ACPI_2_0_HPET_REVISION 0x01
#define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_3_0_SRAT_REVISION 0x01
+#define ACPI_1_0_SLIT_REVISION 0x01
#pragma pack ()
diff --git a/tools/firmware/hvmloader/acpi/build.c b/tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -149,6 +149,114 @@ static int construct_madt(struct acpi_20
return align16(offset);
}
+static int
+construct_srat_cpu_affinity(struct acpi_30_srat_cpu_affinity *cpu_srat)
+{
+ struct acpi_30_srat_cpu_affinity *cpu_srat_iter;
+ int vnode, vcpu;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info;
+ for ( vnode = 0, cpu_srat_iter = cpu_srat;
+ vnode < numa_info->nr_vnodes; vnode++ )
+ {
+ struct xen_vnode_info *vnode_info = &numa_info->vnode_info[vnode];
+ for ( vcpu = 0 ; vcpu < HVM_MAX_VCPUS; vcpu++ )
+ {
+ if (test_bit(vcpu, XEN_CPUMASK_BITMAP(&vnode_info->vcpu_mask)))
+ {
+ memset(cpu_srat_iter, 0, sizeof(*cpu_srat_iter));
+ cpu_srat_iter->type = ACPI_30_SRAT_TYPE_CPU_AFFINITY;
+ cpu_srat_iter->length = sizeof(*cpu_srat);
+ cpu_srat_iter->proximity_domain_lo = vnode;
+ cpu_srat_iter->apic_id = LAPIC_ID(vcpu);
+ cpu_srat_iter->flags = ACPI_30_SRAT_CPU_USE_AFFINITY;
+ cpu_srat_iter++;
+ }
+ }
+ }
+ /* return length of the sub-table */
+ return ((uint8_t *)cpu_srat_iter-(uint8_t *)cpu_srat);
+}
+
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+ struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
+ int vnode;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info;
+ uint64_t base_address;
+
+ for ( vnode = 0, base_address = 0; vnode < numa_info->nr_vnodes; vnode++ )
+ {
+ uint64_t size;
+ struct xen_vnode_info *vnode_info = &numa_info->vnode_info[vnode];
+ memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
+ mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+ mem_srat_iter->length = sizeof(*mem_srat_iter);
+ mem_srat_iter->proximity_domain = vnode;
+ mem_srat_iter->base_address = base_address;
+ size = ((uint64_t)vnode_info->nr_pages << PAGE_SHIFT);
+ mem_srat_iter->size = size;
+ mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
+ base_address += size;
+ mem_srat_iter++;
+ }
+ /* return length of the sub-table */
+ return ((uint8_t *)mem_srat_iter-(uint8_t *)mem_srat);
+}
+
+static int construct_srat(struct acpi_30_srat *srat)
+{
+ int offset;
+
+ memset(srat, 0, sizeof(*srat));
+ srat->header.signature = ACPI_3_0_SRAT_SIGNATURE;
+ srat->header.revision = ACPI_3_0_SRAT_REVISION;
+ fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ srat->header.oem_revision = ACPI_OEM_REVISION;
+ srat->header.creator_id = ACPI_CREATOR_ID;
+ srat->header.creator_revision = ACPI_CREATOR_REVISION;
+ srat->table_revision = ACPI_30_SRAT_TABLE_REVISION;
+ offset = sizeof(*srat);
+
+ offset += construct_srat_cpu_affinity((struct acpi_30_srat_cpu_affinity *)
+ ((uint8_t *)srat + offset));
+
+ offset += construct_srat_mem_affinity((struct acpi_30_srat_mem_affinity *)
+ ((uint8_t *)srat + offset));
+
+ srat->header.length = offset;
+ set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+ return offset;
+}
+
+static int construct_slit(struct acpi_10_slit *slit)
+{
+ int offset, i, nr_vnodes;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info;
+
+ memset(slit, 0, sizeof(*slit));
+ slit->header.signature = ACPI_1_0_SLIT_SIGNATURE;
+ slit->header.revision = ACPI_1_0_SLIT_REVISION;
+ fixed_strcpy(slit->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(slit->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ slit->header.oem_revision = ACPI_OEM_REVISION;
+ slit->header.creator_id = ACPI_CREATOR_ID;
+ slit->header.creator_revision = ACPI_CREATOR_REVISION;
+ slit->locality_count = numa_info->nr_vnodes;
+
+ nr_vnodes = numa_info->nr_vnodes;
+ for (i=0; i<(nr_vnodes*nr_vnodes); i++)
+ slit->entry[i] = numa_info->vnode_distance[i];
+
+ offset = sizeof(*slit)+(nr_vnodes*nr_vnodes)-1;
+ slit->header.length = offset;
+ set_checksum(slit, offsetof(struct acpi_header, checksum), offset);
+
+ return offset;
+}
+
static int construct_hpet(struct acpi_20_hpet *hpet)
{
int offset;
@@ -177,6 +285,8 @@ static int construct_secondary_tables(ui
struct acpi_20_madt *madt;
struct acpi_20_hpet *hpet;
struct acpi_20_tcpa *tcpa;
+ struct acpi_30_srat *srat;
+ struct acpi_10_slit *slit;
static const uint16_t tis_signature[] = {0x0001, 0x0001, 0x0001};
uint16_t *tis_hdr;
void *lasa;
@@ -189,6 +299,17 @@ static int construct_secondary_tables(ui
table_ptrs[nr_tables++] = (unsigned long)madt;
}
+ /* SRAT/SLIT. */
+ if ( hvm_info->numa_info.version == XEN_DOM_NUMA_INTERFACE_VERSION )
+ {
+ srat = (struct acpi_30_srat *)&buf[offset];
+ offset += construct_srat(srat);
+ table_ptrs[nr_tables++] = (unsigned long)srat;
+ slit = (struct acpi_10_slit *)&buf[offset];
+ offset += construct_slit(slit);
+ table_ptrs[nr_tables++] = (unsigned long)slit;
+ }
+
/* HPET. */
if ( hpet_exists(ACPI_HPET_ADDRESS) )
{
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -33,7 +33,43 @@
#define NR_SPECIAL_PAGES 5
#define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+static void build_hvm_numa_info(struct hvm_info_table *hvm_info,
+ xc_domain_numa_layout_t *dom_layout)
+{
+ int vnode;
+ uint64_t vnode_pgstart;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info;
+
+ numa_info->version = dom_layout->version;
+ numa_info->type = dom_layout->type;
+ numa_info->nr_vcpus = dom_layout->nr_vcpus;
+ numa_info->nr_vnodes = dom_layout->nr_vnodes;
+ /* high_mem_pgend is 32-bit, so we should be fine too */
+ numa_info->nr_pages = dom_layout->nr_pages;
+
+ for (vnode=0, vnode_pgstart=0; vnode<dom_layout->nr_vnodes; vnode++)
+ {
+ xc_vnode_data_t *vnode_data = &dom_layout->vnode_data[vnode];
+ struct xen_vnode_info *vnode_info = &numa_info->vnode_info[vnode];
+ uint64_t vnode_pgend;
+
+ memcpy(vnode_info, vnode_data, sizeof(*vnode_info));
+ vnode_pgend = vnode_pgstart + vnode_info->nr_pages;
+ /* Account for hole in the memory map */
+ if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
+ (vnode_pgend >= hvm_info->low_mem_pgend) )
+ vnode_pgend += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;
+
+ vnode_info->nr_pages = vnode_pgend - vnode_pgstart;
+ vnode_pgstart += vnode_info->nr_pages;
+ }
+ memcpy(numa_info->vnode_distance, dom_layout->vnode_distance,
+ sizeof(numa_info->vnode_distance));
+ return;
+}
+
+static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
+ xc_domain_numa_layout_t *dom_layout)
{
struct hvm_info_table *hvm_info = (struct hvm_info_table *)
(((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
@@ -63,6 +99,9 @@ static void build_hvm_info(void *hvm_inf
hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
hvm_info->reserved_mem_pgstart = special_pfn(0);
+ if ( dom_layout && ( dom_layout->type == XEN_DOM_NUMA_SPLIT ))
+ build_hvm_numa_info(hvm_info, dom_layout);
+
/* Finish with the checksum. */
for ( i = 0, sum = 0; i < hvm_info->length; i++ )
sum += ((uint8_t *)hvm_info)[i];
@@ -411,8 +450,8 @@ out:
return rc;
}
-static int
-setup_guest_special_pages(xc_interface *xch, uint32_t dom, uint64_t memsize)
+static int setup_guest_special_pages(xc_interface *xch, uint32_t dom,
+ uint64_t memsize, xc_domain_numa_layout_t *dom_layout)
{
void *hvm_info_page;
struct xen_add_to_physmap xatp;
@@ -424,7 +463,7 @@ setup_guest_special_pages(xc_interface *
xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
HVM_INFO_PFN)) == NULL )
goto error_out;
- build_hvm_info(hvm_info_page, memsize);
+ build_hvm_info(hvm_info_page, memsize, dom_layout);
munmap(hvm_info_page, PAGE_SIZE);
/* Map and initialise shared_info page. */
@@ -532,7 +571,7 @@ static int setup_guest(xc_interface *xch
if ( rc < 0 )
goto error_out;
- rc = setup_guest_special_pages(xch, dom, v_end);
+ rc = setup_guest_special_pages(xch, dom, v_end, dom_layout);
if ( rc < 0 )
goto error_out;
diff --git a/xen/include/public/hvm/hvm_info_table.h b/xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -25,12 +25,15 @@
#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+#include "../dom_numa.h"
+
#define HVM_INFO_PFN 0x09F
#define HVM_INFO_OFFSET 0x800
#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
/* Maximum we can support with current vLAPIC ID mapping. */
-#define HVM_MAX_VCPUS 128
+#define HVM_MAX_VCPUS XEN_MAX_VCPUS
+#define HVM_MAX_VNODES XEN_MAX_VNODES
struct hvm_info_table {
char signature[8]; /* "HVM INFO" */
@@ -70,6 +73,9 @@ struct hvm_info_table {
/* Bitmap of which CPUs are online at boot time. */
uint8_t vcpu_online[(HVM_MAX_VCPUS + 7)/8];
+
+ /* Domain NUMA memory distribution */
+ struct xen_domain_numa_info numa_info;
};
#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
* [vNUMA v2][PATCH 7/8] Construct SRAT/SLIT for NUMA HVM
From: Dulloor @ 2010-08-01 22:05 UTC
To: xen-devel
Construct SRAT/SLIT tables for HVM NUMA.
-dulloor
Signed-off-by: Dulloor <dulloor@gmail.com>
[-- Attachment #2: xen-07-numa-acpi-tables.patch --]
vNUMA: Build ACPI NUMA tables for HVMs
diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h b/tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -283,6 +283,66 @@ struct acpi_20_madt {
uint32_t flags;
};
+/*
+ * System Resource Affinity Table header definition (SRAT) (Version 3.0)
+ * X2APIC_CPU_AFFINITY is defined in version 4.0
+ */
+struct acpi_30_srat {
+ struct acpi_header header; /* Common ACPI table header */
+ uint32_t table_revision; /* Must be value '1' */
+ uint32_t reserved[2]; /* Reserved, must be zero */
+};
+#define ACPI_30_SRAT_TABLE_REVISION 0x1
+
+/* Values for type (in SRAT subtables) */
+enum acpi_30_srat_type {
+ ACPI_30_SRAT_TYPE_CPU_AFFINITY = 0,
+ ACPI_30_SRAT_TYPE_MEMORY_AFFINITY = 1,
+ ACPI_30_SRAT_TYPE_RESERVED = 2 /* 2 and greater are reserved */
+};
+
+/* type(0) : Processor Local APIC/SAPIC Affinity */
+struct acpi_30_srat_cpu_affinity {
+ uint8_t type;
+ uint8_t length;
+ uint8_t proximity_domain_lo;
+ uint8_t apic_id;
+ uint32_t flags;
+ uint8_t local_sapic_eid;
+ uint8_t proximity_domain_hi[3];
+ uint32_t reserved; /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_CPU_USE_AFFINITY (1) /* 00: Use affinity structure */
+
+/* 1: Memory Affinity */
+
+struct acpi_30_srat_mem_affinity {
+ uint8_t type;
+ uint8_t length;
+ uint32_t proximity_domain;
+ uint16_t reserved; /* Reserved, must be zero */
+ uint64_t base_address;
+ uint64_t size;
+ uint32_t reserved1;
+ uint32_t flags;
+ uint64_t reserved2; /* Reserved, must be zero */
+};
+
+/* Flags */
+#define ACPI_30_SRAT_MEM_ENABLED (1) /* 00: Memory region enabled */
+#define ACPI_30_SRAT_MEM_HOT_PLUGGABLE (1<<1) /* 01: Mem is hot pluggable */
+#define ACPI_30_SRAT_MEM_NON_VOLATILE (1<<2) /* 02: Mem is non-volatile */
+
+/*
+ * System Locality Information Table header definition (SLIT) (Version 1.0)
+ */
+struct acpi_10_slit {
+ struct acpi_header header;
+ uint64_t locality_count;
+ uint8_t entry[1];
+};
/*
* HPET Description Table
@@ -367,6 +427,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
#define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
#define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_3_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
+#define ACPI_1_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
/*
* Table revision numbers.
@@ -379,6 +441,8 @@ struct acpi_20_madt_intsrcovr {
#define ACPI_2_0_TCPA_REVISION 0x02
#define ACPI_2_0_HPET_REVISION 0x01
#define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_3_0_SRAT_REVISION 0x01
+#define ACPI_1_0_SLIT_REVISION 0x01
#pragma pack ()
diff --git a/tools/firmware/hvmloader/acpi/build.c b/tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -149,6 +149,114 @@ static int construct_madt(struct acpi_20
return align16(offset);
}
+static int
+construct_srat_cpu_affinity(struct acpi_30_srat_cpu_affinity *cpu_srat)
+{
+ struct acpi_30_srat_cpu_affinity *cpu_srat_iter;
+ int vnode, vcpu;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+ uint8_t *numa_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(numa_info);
+
+ for ( vnode = 0, cpu_srat_iter = cpu_srat;
+ vnode < numa_info->nr_vnodes; vnode++ )
+ {
+ for ( vcpu = 0 ; vcpu < numa_info->nr_vcpus; vcpu++ )
+ {
+ if (numa_vcpu_to_vnode[vcpu] == vnode)
+ {
+ memset(cpu_srat_iter, 0, sizeof(*cpu_srat_iter));
+ cpu_srat_iter->type = ACPI_30_SRAT_TYPE_CPU_AFFINITY;
+ cpu_srat_iter->length = sizeof(*cpu_srat);
+ cpu_srat_iter->proximity_domain_lo = vnode;
+ cpu_srat_iter->apic_id = LAPIC_ID(vcpu);
+ cpu_srat_iter->flags = ACPI_30_SRAT_CPU_USE_AFFINITY;
+ cpu_srat_iter++;
+ }
+ }
+ }
+ /* return length of the sub-table */
+ return ((uint8_t *)cpu_srat_iter-(uint8_t *)cpu_srat);
+}
+
+static int
+construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
+{
+ int vnode;
+ struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+ struct xen_vnode_info *numa_vnode_info = NUMA_INFO_VNODE_INFO(numa_info);
+
+ for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
+ {
+ struct xen_vnode_info *vnode_info = &numa_vnode_info[vnode];
+ memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
+ mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
+ mem_srat_iter->length = sizeof(*mem_srat_iter);
+ mem_srat_iter->proximity_domain = vnode;
+ mem_srat_iter->base_address = (uint64_t)vnode_info->start << PAGE_SHIFT;
+ mem_srat_iter->size =
+ (uint64_t)(vnode_info->end - vnode_info->start) << PAGE_SHIFT;
+ mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
+ mem_srat_iter++;
+ }
+ /* return length of the sub-table */
+ return ((uint8_t *)mem_srat_iter-(uint8_t *)mem_srat);
+}
+
+static int construct_srat(struct acpi_30_srat *srat)
+{
+ int offset;
+
+ memset(srat, 0, sizeof(*srat));
+ srat->header.signature = ACPI_3_0_SRAT_SIGNATURE;
+ srat->header.revision = ACPI_3_0_SRAT_REVISION;
+ fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ srat->header.oem_revision = ACPI_OEM_REVISION;
+ srat->header.creator_id = ACPI_CREATOR_ID;
+ srat->header.creator_revision = ACPI_CREATOR_REVISION;
+ srat->table_revision = ACPI_30_SRAT_TABLE_REVISION;
+ offset = sizeof(*srat);
+
+ offset += construct_srat_cpu_affinity((struct acpi_30_srat_cpu_affinity *)
+ ((uint8_t *)srat + offset));
+
+ offset += construct_srat_mem_affinity((struct acpi_30_srat_mem_affinity *)
+ ((uint8_t *)srat + offset));
+
+ srat->header.length = offset;
+ set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+ return offset;
+}
+
+static int construct_slit(struct acpi_10_slit *slit)
+{
+ int offset, i, nr_vnodes;
+ struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
+ uint8_t *numa_vnode_distance = NUMA_INFO_VNODE_DISTANCE(numa_info);
+
+ memset(slit, 0, sizeof(*slit));
+ slit->header.signature = ACPI_1_0_SLIT_SIGNATURE;
+ slit->header.revision = ACPI_1_0_SLIT_REVISION;
+ fixed_strcpy(slit->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(slit->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ slit->header.oem_revision = ACPI_OEM_REVISION;
+ slit->header.creator_id = ACPI_CREATOR_ID;
+ slit->header.creator_revision = ACPI_CREATOR_REVISION;
+ slit->locality_count = numa_info->nr_vnodes;
+
+ nr_vnodes = numa_info->nr_vnodes;
+ for (i=0; i<(nr_vnodes*nr_vnodes); i++)
+ slit->entry[i] = numa_vnode_distance[i];
+
+ offset = sizeof(*slit)+(nr_vnodes*nr_vnodes)-1;
+ slit->header.length = offset;
+ set_checksum(slit, offsetof(struct acpi_header, checksum), offset);
+
+ return offset;
+}
+
static int construct_hpet(struct acpi_20_hpet *hpet)
{
int offset;
@@ -177,6 +285,8 @@ static int construct_secondary_tables(ui
struct acpi_20_madt *madt;
struct acpi_20_hpet *hpet;
struct acpi_20_tcpa *tcpa;
+ struct acpi_30_srat *srat;
+ struct acpi_10_slit *slit;
static const uint16_t tis_signature[] = {0x0001, 0x0001, 0x0001};
uint16_t *tis_hdr;
void *lasa;
@@ -189,6 +299,18 @@ static int construct_secondary_tables(ui
table_ptrs[nr_tables++] = (unsigned long)madt;
}
+ /* SRAT/SLIT. */
+ if ( hvm_info->numa_enabled &&
+ hvm_info->numa_info[0].version == XEN_DOM_NUMA_INTERFACE_VERSION )
+ {
+ srat = (struct acpi_30_srat *)&buf[offset];
+ offset += construct_srat(srat);
+ table_ptrs[nr_tables++] = (unsigned long)srat;
+ slit = (struct acpi_10_slit *)&buf[offset];
+ offset += construct_slit(slit);
+ table_ptrs[nr_tables++] = (unsigned long)slit;
+ }
+
/* HPET. */
if ( hpet_exists(ACPI_HPET_ADDRESS) )
{
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -11,6 +11,7 @@
#include "xg_private.h"
#include "xc_private.h"
#include "xc_dom_numa.h"
+#include "xc_cpumap.h"
#include <xen/foreign/x86_32.h>
#include <xen/foreign/x86_64.h>
@@ -32,7 +33,62 @@
#define NR_SPECIAL_PAGES 4
#define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
-static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
+static int build_hvm_numa_info(struct hvm_info_table *hvm_info,
+ xc_domain_numa_layout_t *dlayout)
+{
+ int i, j;
+ uint64_t vnode_pgstart;
+ struct xen_domain_numa_info *ninfo;
+ struct xen_vnode_info *ninfo_vnode_info;
+ uint8_t *ninfo_vcpu_to_vnode, *ninfo_vnode_distance;
+
+ ninfo = &hvm_info->numa_info[0];
+ ninfo->version = dlayout->version;
+ ninfo->type = dlayout->type;
+ ninfo->nr_vcpus = dlayout->nr_vcpus;
+ ninfo->nr_vnodes = dlayout->nr_vnodes;
+
+ ninfo_vnode_info = NUMA_INFO_VNODE_INFO(ninfo);
+ ninfo_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
+ ninfo_vnode_distance = NUMA_INFO_VNODE_DISTANCE(ninfo);
+
+ for (i=0; i<ninfo->nr_vcpus; i++)
+ ninfo_vcpu_to_vnode[i] = XEN_INVALID_NODE;
+
+ for (i=0, vnode_pgstart=0; i<dlayout->nr_vnodes; i++)
+ {
+ uint64_t vnode_pgend;
+ struct xenctl_cpumap vnode_vcpumap;
+ xc_vnode_data_t *vnode_data = &dlayout->vnode_data[i];
+ xc_cpumask_t *vnode_vcpumask = &vnode_data->vcpu_mask;
+ struct xen_vnode_info *vnode_info = &ninfo_vnode_info[i];
+
+ vnode_info->mnode_id = vnode_data->mnode_id;
+ vnode_pgend = vnode_pgstart + vnode_data->nr_pages;
+ /* Account for hole in the memory map */
+ if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
+ (vnode_pgend >= hvm_info->low_mem_pgend) )
+ vnode_pgend += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;
+
+ vnode_info->start = vnode_pgstart;
+ vnode_info->end = vnode_pgend;
+ vnode_pgstart = vnode_pgend;
+
+ xc_cpumap_from_cpumask(&vnode_vcpumap, vnode_vcpumask);
+ xc_for_each_cpu(j, vnode_vcpumap)
+ ninfo_vcpu_to_vnode[j] = i;
+ }
+
+ for (i=0; i<ninfo->nr_vnodes; i++)
+ for (j=0; j<ninfo->nr_vnodes; j++)
+ ninfo_vnode_distance[(i*ninfo->nr_vnodes)+j] =
+ dlayout->vnode_distance[(i*ninfo->nr_vnodes)+j];
+
+ return NUMA_INFO_SIZE(ninfo);
+}
+
+static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
+ xc_domain_numa_layout_t *dom_layout)
{
struct hvm_info_table *hvm_info = (struct hvm_info_table *)
(((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
@@ -62,6 +118,12 @@ static void build_hvm_info(void *hvm_inf
hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
hvm_info->reserved_mem_pgstart = special_pfn(0);
+ if ( dom_layout && ( dom_layout->type == XEN_DOM_NUMA_SPLIT ))
+ {
+ hvm_info->numa_enabled = 1;
+ hvm_info->length += build_hvm_numa_info(hvm_info, dom_layout);
+ }
+
/* Finish with the checksum. */
for ( i = 0, sum = 0; i < hvm_info->length; i++ )
sum += ((uint8_t *)hvm_info)[i];
@@ -408,8 +470,8 @@ out:
return rc;
}
-static int
-setup_guest_special_pages(xc_interface *xch, uint32_t dom, uint64_t memsize)
+static int setup_guest_special_pages(xc_interface *xch, uint32_t dom,
+ uint64_t memsize, xc_domain_numa_layout_t *dom_layout)
{
void *hvm_info_page;
uint32_t *ident_pt;
@@ -419,7 +481,7 @@ setup_guest_special_pages(xc_interface *
xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
HVM_INFO_PFN)) == NULL )
goto error_out;
- build_hvm_info(hvm_info_page, memsize);
+ build_hvm_info(hvm_info_page, memsize, dom_layout);
munmap(hvm_info_page, PAGE_SIZE);
/* Allocate and clear special pages. */
@@ -509,7 +571,7 @@ static int setup_guest(xc_interface *xch
if ( rc < 0 )
goto error_out;
- rc = setup_guest_special_pages(xch, dom, v_end);
+ rc = setup_guest_special_pages(xch, dom, v_end, dom_layout);
if ( rc < 0 )
goto error_out;
diff --git a/xen/include/public/hvm/hvm_info_table.h b/xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -25,12 +25,14 @@
#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+#include "../dom_numa.h"
+
#define HVM_INFO_PFN 0x09F
#define HVM_INFO_OFFSET 0x800
#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
/* Maximum we can support with current vLAPIC ID mapping. */
-#define HVM_MAX_VCPUS 128
+#define HVM_MAX_VCPUS XEN_MAX_VCPUS
struct hvm_info_table {
char signature[8]; /* "HVM INFO" */
@@ -70,6 +72,12 @@ struct hvm_info_table {
/* Bitmap of which CPUs are online at boot time. */
uint8_t vcpu_online[(HVM_MAX_VCPUS + 7)/8];
+
+ /* Domain NUMA memory distribution. Size of this structure should be
+ * obtained using the macro XEN_DOMAIN_NUMA_INFO_SIZE(numa_info).
+ */
+ uint8_t numa_enabled; /* numa_info is populated only if numa_enabled != 0 */
+ struct xen_domain_numa_info numa_info[0];
};
#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
* Re: [vNUMA v2][PATCH 7/8] Construct SRAT/SLIT for NUMA HVM
From: Andre Przywara @ 2010-08-13 15:47 UTC
To: Dulloor; +Cc: xen-devel@lists.xensource.com
Dulloor wrote:
> Construct SRAT/SLIT tables for HVM NUMA.
>
> --- a/tools/firmware/hvmloader/acpi/acpi2_0.h
> +++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
>
> +struct acpi_30_srat_mem_affinity {
> + uint8_t type;
> + uint8_t length;
> + uint32_t proximity_domain;
> + uint16_t reserved; /* Reserved, must be zero */
> + uint64_t base_address;
> + uint64_t size;
As the ACPI spec only talks about 32-bit members here, I'd rather see
that reflected in the structure (a possible layout is sketched after the
quoted structure below). In experiments I saw strange bugs due to the
fact that hvmloader is a self-contained 32-bit binary, which lacks some
runtime 64-bit support (such as 64-by-32-bit division). Besides that, it
would make the whole code endianness-aware, though that is probably of
limited use for today's Xen ;-)
> + uint32_t reserved1;
> + uint32_t flags;
> + uint64_t reserved2; /* Reserved, must be zero */
> +};
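For illustration only (not from the posted patch): a sketch of the same
40-byte memory-affinity structure with 32-bit members, roughly following
the Base Address Low/High and Length Low/High split described in the
ACPI spec. The field names are made up for this sketch; on a
little-endian guest the layout is binary-identical to the structure
quoted above, but hvmloader never needs 64-bit arithmetic to fill it in:

    /* Sketch only: 32-bit low/high halves instead of 64-bit fields. */
    struct acpi_30_srat_mem_affinity_32 {
        uint8_t  type;
        uint8_t  length;
        uint32_t proximity_domain;
        uint16_t reserved;          /* Reserved, must be zero */
        uint32_t base_address_lo;   /* low 32 bits of the base address */
        uint32_t base_address_hi;   /* high 32 bits of the base address */
        uint32_t size_lo;           /* low 32 bits of the region length */
        uint32_t size_hi;           /* high 32 bits of the region length */
        uint32_t reserved1;
        uint32_t flags;
        uint32_t reserved2[2];      /* Reserved, must be zero */
    };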
> --- a/tools/firmware/hvmloader/acpi/build.c
> +++ b/tools/firmware/hvmloader/acpi/build.c
> ...
> +static int
> +construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
> +{
> + int vnode;
> + struct acpi_30_srat_mem_affinity *mem_srat_iter = mem_srat;
> + struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
> + struct xen_vnode_info *numa_vnode_info = NUMA_INFO_VNODE_INFO(numa_info);
> +
> + for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
> + {
> + struct xen_vnode_info *vnode_info = &numa_vnode_info[vnode];
> + memset(mem_srat_iter, 0, sizeof(*mem_srat_iter));
> + mem_srat_iter->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
> + mem_srat_iter->length = sizeof(*mem_srat_iter);
> + mem_srat_iter->proximity_domain = vnode;
> + mem_srat_iter->base_address = (uint64_t)vnode_info->start << PAGE_SHIFT;
> + mem_srat_iter->size =
> + (uint64_t)(vnode_info->end - vnode_info->start) << PAGE_SHIFT;
> + mem_srat_iter->flags = ACPI_30_SRAT_MEM_ENABLED;
> + mem_srat_iter++;
> + }
> + /* return length of the sub-table */
> + return ((uint8_t *)mem_srat_iter-(uint8_t *)mem_srat);
> +}
This approach will lead to possible problems. Although you do account
for the holes in libxc, the SRAT lacks them, so one node can end up
reporting more memory than it actually has (inflated by the size of the
PCI hole). Linux seems to cope with this, but I'd rather see the PCI
memory left out of the SRAT. Strictly speaking, this memory-mapped
region does not need to belong to that particular node at all; it should
be attributed to the processor containing the I/O link. I don't think we
should model that in Xen, though; it would probably be over-engineered
and wouldn't apply to the guest anyway (in the case of PV I/O).
I have already ported my older code over to this; it works better and
matches the SRAT tables I have seen on real machines (although AMD-only
ones).
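A sketch of what leaving the hole out of the SRAT might look like. This
is not code from the thread; it assumes the v2 hvm_info layout, where
the vnode that crosses the hole already has its [start, end) range
stretched past 4 GB by the libxc accounting, and the caller would have
to reserve room for the one extra entry the split can produce:

    static struct acpi_30_srat_mem_affinity *
    srat_mem_entry(struct acpi_30_srat_mem_affinity *it, uint32_t vnode,
                   uint64_t base, uint64_t size)
    {
        memset(it, 0, sizeof(*it));
        it->type = ACPI_30_SRAT_TYPE_MEMORY_AFFINITY;
        it->length = sizeof(*it);
        it->proximity_domain = vnode;
        it->base_address = base;
        it->size = size;
        it->flags = ACPI_30_SRAT_MEM_ENABLED;
        return it + 1;
    }

    static int
    construct_srat_mem_affinity(struct acpi_30_srat_mem_affinity *mem_srat)
    {
        int vnode;
        struct acpi_30_srat_mem_affinity *it = mem_srat;
        struct xen_domain_numa_info *numa_info = &hvm_info->numa_info[0];
        struct xen_vnode_info *vi = NUMA_INFO_VNODE_INFO(numa_info);

        for ( vnode = 0; vnode < numa_info->nr_vnodes; vnode++ )
        {
            uint64_t base = (uint64_t)vi[vnode].start << PAGE_SHIFT;
            uint64_t end  = (uint64_t)vi[vnode].end << PAGE_SHIFT;

            if ( base < HVM_BELOW_4G_RAM_END && end > (1ull << 32) )
            {
                /* Node straddles the PCI hole: emit the below-hole part
                 * and the above-4GB part as separate entries, leaving
                 * the hole itself uncovered. */
                it = srat_mem_entry(it, vnode, base,
                                    HVM_BELOW_4G_RAM_END - base);
                it = srat_mem_entry(it, vnode, 1ull << 32,
                                    end - (1ull << 32));
            }
            else
                it = srat_mem_entry(it, vnode, base, end - base);
        }
        /* return length of the sub-table */
        return ((uint8_t *)it - (uint8_t *)mem_srat);
    }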
> --- a/tools/libxc/xc_hvm_build.c
> +++ b/tools/libxc/xc_hvm_build.c
> @@ -11,6 +11,7 @@
> #include "xg_private.h"
> #include "xc_private.h"
> #include "xc_dom_numa.h"
> +#include "xc_cpumap.h"
>
> #include <xen/foreign/x86_32.h>
> #include <xen/foreign/x86_64.h>
> @@ -32,7 +33,62 @@
> #define NR_SPECIAL_PAGES 4
> #define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
>
> -static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
> +static int build_hvm_numa_info(struct hvm_info_table *hvm_info,
> + xc_domain_numa_layout_t *dlayout)
> +{
> + int i, j;
> + uint64_t vnode_pgstart;
> + struct xen_domain_numa_info *ninfo;
> + struct xen_vnode_info *ninfo_vnode_info;
> + uint8_t *ninfo_vcpu_to_vnode, *ninfo_vnode_distance;
> +
> + ninfo = &hvm_info->numa_info[0];
> + ninfo->version = dlayout->version;
> + ninfo->type = dlayout->type;
> + ninfo->nr_vcpus = dlayout->nr_vcpus;
> + ninfo->nr_vnodes = dlayout->nr_vnodes;
> +
> + ninfo_vnode_info = NUMA_INFO_VNODE_INFO(ninfo);
> + ninfo_vcpu_to_vnode = NUMA_INFO_VCPU_TO_VNODE(ninfo);
> + ninfo_vnode_distance = NUMA_INFO_VNODE_DISTANCE(ninfo);
> +
> + for (i=0; i<ninfo->nr_vcpus; i++)
> + ninfo_vcpu_to_vnode[i] = XEN_INVALID_NODE;
> +
> + for (i=0, vnode_pgstart=0; i<dlayout->nr_vnodes; i++)
> + {
> + uint64_t vnode_pgend;
> + struct xenctl_cpumap vnode_vcpumap;
> + xc_vnode_data_t *vnode_data = &dlayout->vnode_data[i];
> + xc_cpumask_t *vnode_vcpumask = &vnode_data->vcpu_mask;
> + struct xen_vnode_info *vnode_info = &ninfo_vnode_info[i];
> +
> + vnode_info->mnode_id = vnode_data->mnode_id;
> + vnode_pgend = vnode_pgstart + vnode_data->nr_pages;
> + /* Account for hole in the memory map */
> + if ( (vnode_pgstart < hvm_info->low_mem_pgend) &&
> + (vnode_pgend >= hvm_info->low_mem_pgend) )
I think this is wrong. On guests with less than 4 GB of memory, this
leads to the last node containing more memory than it has, even though
its range does not touch the PCI hole at all. It should look like this:
+ if ( (vnode_pgstart < HVM_BELOW_4G_RAM_END) &&
+ (vnode_pgend >= HVM_BELOW_4G_RAM_END) )
> + vnode_pgend += ((1ull<<32) - HVM_BELOW_4G_RAM_END)>>PAGE_SHIFT;
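To make the units explicit (everything in page frames, with
HVM_BELOW_4G_RAM_END taken as a byte address, as its use in the shifted
expression above suggests), the hole accounting could be written as a
small helper. This is only an illustrative sketch, not code from the
thread:

    /* Shift a vnode's end pfn past the PCI hole only if the vnode's
     * range actually crosses the start of the hole. */
    static uint64_t account_pci_hole(uint64_t vnode_pgstart,
                                     uint64_t vnode_pgend)
    {
        uint64_t hole_start = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT;
        uint64_t hole_pages =
            ((1ull << 32) - HVM_BELOW_4G_RAM_END) >> PAGE_SHIFT;

        if ( vnode_pgstart < hole_start && vnode_pgend >= hole_start )
            vnode_pgend += hole_pages;
        return vnode_pgend;
    }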
Regards,
Andre.
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448-3567-12