All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/3] v2: KVM-userspace: generate a SRAT table to describe the guests NUMA topology
@ 2008-12-05 13:34 Andre Przywara
  0 siblings, 0 replies; only message in thread
From: Andre Przywara @ 2008-12-05 13:34 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Daniel P. Berrange

[-- Attachment #1: Type: text/plain, Size: 788 bytes --]

Based on the NUMA topology passed via the QEMU firmware configuration
interface, the BIOS code generates an SRAT (System Resource Affinity
Table) that describes which (V)CPUs and which parts of memory are
assigned to each node. The guest OS then reads this table and will
hopefully honor it.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>

-- 
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: kvmnuma_sratbios.patch --]
[-- Type: text/x-patch, Size: 9092 bytes --]

commit 24fce48662f201903bca101e90ccca386428e764
Author: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Date:   Fri Dec 5 14:18:16 2008 +0100

    generate appropriate SRAT ACPI table

diff --git a/bios/rombios32.c b/bios/rombios32.c
index 3c9a2d7..878690d 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -455,12 +455,30 @@ void wrmsr_smp(uint32_t index, uint64_t val)
     p->ecx = 0;
 }
 
+static inline uint16_t le16_to_cpu(uint16_t x)
+{
+    return x; /* returned unchanged: no byte swap is performed */
+}
+
+static inline uint32_t le32_to_cpu(uint32_t x)
+{
+    return x; /* returned unchanged: no byte swap is performed */
+}
+
+static inline uint64_t le64_to_cpu(uint64_t x)
+{
+    return x; /* returned unchanged: no byte swap is performed */
+}
+
 #ifdef BX_QEMU
 #define QEMU_CFG_CTL_PORT 0x510
 #define QEMU_CFG_DATA_PORT 0x511
 #define QEMU_CFG_SIGNATURE  0x00
 #define QEMU_CFG_ID         0x01
 #define QEMU_CFG_UUID       0x02
+#define QEMU_CFG_NUMA_NODES 0x07
+#define QEMU_CFG_NUMA_VCPUS 0x08
+#define QEMU_CFG_NUMA_MEM   0x09
 
 int qemu_cfg_port;
 
@@ -488,6 +506,23 @@ void qemu_cfg_read(uint8_t *buf, int len)
     while (len--)
         *(buf++) = inb(QEMU_CFG_DATA_PORT);
 }
+
+uint32_t qemu_cfg_get32 (void)
+{
+    uint32_t raw;
+    /* Read the next 4 bytes from the QEMU config data port as one value. */
+    qemu_cfg_read((uint8_t *)&raw, sizeof(raw));
+    return le32_to_cpu(raw);
+}
+
+uint64_t qemu_cfg_get64 (void)
+{
+    uint64_t raw;
+    /* Read the next 8 bytes from the QEMU config data port as one value. */
+    qemu_cfg_read((uint8_t *)&raw, sizeof(raw));
+    return le64_to_cpu(raw);
+}
+
 #endif
 
 void uuid_probe(void)
@@ -502,6 +537,18 @@ void uuid_probe(void)
     memset(bios_uuid, 0, 16);
 }
 
+int get_numa_nodes(void)
+{
+    uint16_t count = 0; /* NUMA node count; stays 0 unless QEMU supplies one */
+#ifdef BX_QEMU
+    if (qemu_cfg_port != 0) {
+        qemu_cfg_select(QEMU_CFG_NUMA_NODES);
+        qemu_cfg_read((uint8_t *)&count, sizeof(count));
+    }
+#endif
+    return le16_to_cpu(count);
+}
+
 void cpu_probe(void)
 {
     uint32_t eax, ebx, ecx, edx;
@@ -1232,7 +1279,7 @@ struct rsdp_descriptor         /* Root System Descriptor Pointer */
 struct rsdt_descriptor_rev1
 {
 	ACPI_TABLE_HEADER_DEF                           /* ACPI common table header */
-	uint32_t                             table_offset_entry [2]; /* Array of pointers to other */
+	uint32_t                             table_offset_entry [3]; /* Array of pointers to other */
 			 /* ACPI tables */
 };
 
@@ -1350,6 +1397,9 @@ struct multiple_apic_table
 #define APIC_XRUPT_SOURCE       8
 #define APIC_RESERVED           9           /* 9 and greater are reserved */
 
+#define SRAT_PROCESSOR          0
+#define SRAT_MEMORY             1
+
 /*
  * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
  */
@@ -1357,6 +1407,40 @@ struct multiple_apic_table
 	uint8_t                              type; \
 	uint8_t                              length;
 
+/*
+ * SRAT (NUMA topology description) table
+ */
+struct system_resource_affinity_table  /* fixed part; affinity entries follow it */
+{
+    ACPI_TABLE_HEADER_DEF       /* ACPI common table header */
+    uint32_t    reserved1;      /* acpi_bios_init stores 1 here */
+    uint32_t    reserved2[2];   /* left zero by the memset in acpi_bios_init */
+};
+
+struct srat_processor_affinity  /* acpi_bios_init emits one entry per CPU */
+{
+APIC_HEADER_DEF                 /* type (SRAT_PROCESSOR) and length bytes */
+	uint8_t     proximity_lo;       /* NUMA node assigned to this CPU */
+	uint8_t     local_apic_id;      /* filled with the CPU index */
+	uint32_t    flags;              /* acpi_bios_init writes 1 for each CPU */
+	uint8_t     local_sapic_eid;    /* written as 0 */
+	uint8_t     proximity_hi[3];    /* cleared to 0 */
+	uint32_t    reserved;           /* left zero by the table-wide memset */
+};
+
+struct srat_memory_affinity     /* filled in by acpi_build_srat_memory */
+{
+	APIC_HEADER_DEF                 /* type (SRAT_MEMORY) and length bytes */
+	uint8_t     proximity[4];       /* byte 0 holds the node, bytes 1-3 are 0 */
+	uint16_t    reserved1;          /* not written by acpi_build_srat_memory */
+	uint32_t    base_addr_low,base_addr_high;   /* 64-bit base, split in halves */
+	uint32_t    length_low,length_high;         /* 64-bit length, split in halves */
+	uint32_t    reserved2;          /* not written by acpi_build_srat_memory */
+	uint32_t    flags;              /* 1 if the range is enabled, else 0 */
+	uint32_t    reserved3[2];       /* not written by acpi_build_srat_memory */
+};
+	
+
 /* Sub-structures for MADT */
 
 struct madt_processor_apic
@@ -1411,6 +1495,26 @@ static int acpi_checksum(const uint8_t *data, int len)
     return (-sum) & 0xff;
 }
 
+/* Fill nodes[0..63] with each VCPU's NUMA node, one 64-bit CPU mask per node. */
+static void read_config_numa_vcpus (uint32_t *nodes, int numnodes)
+{
+#ifdef BX_QEMU
+    uint64_t cpumask;
+    int node, cpu;
+#endif
+    /* VCPUs absent from every mask (or a non-QEMU build) default to node 0 */
+    memset(nodes, 0, 64 * sizeof(nodes[0]));
+#ifdef BX_QEMU
+    qemu_cfg_select (QEMU_CFG_NUMA_VCPUS);
+    for (node = 0; node < numnodes; node++) {
+        cpumask = qemu_cfg_get64();
+        for (cpu = 0; cpu < 64 && cpumask != 0; cpu++, cpumask >>= 1)
+            if (cpumask & 1)
+                nodes[cpu] = node;
+    }
+#endif
+}
+
 static void acpi_build_table_header(struct acpi_table_header *h,
                                     char *sig, int len, uint8_t rev)
 {
@@ -1435,6 +1539,21 @@ static void acpi_build_table_header(struct acpi_table_header *h,
     h->checksum = acpi_checksum((void *)h, len);
 }
 
+/* Fill one SRAT memory affinity entry: len bytes starting at base, on node. */
+static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
+    uint64_t base, uint64_t len, int node, int enabled)
+{
+    numamem->type = SRAT_MEMORY;
+    numamem->length = sizeof(*numamem);
+    memset (numamem->proximity, 0 ,4);
+    numamem->proximity[0] = node;               /* node goes in the first byte */
+    numamem->flags = cpu_to_le32(!!enabled);    /* 1 = enabled, 0 = disabled */
+    numamem->base_addr_low = cpu_to_le32(base & 0xFFFFFFFF);   /* low half */
+    numamem->base_addr_high = cpu_to_le32(base >> 32);         /* high half */
+    numamem->length_low = cpu_to_le32(len & 0xFFFFFFFF);
+    numamem->length_high = cpu_to_le32(len >> 32);
+}
+
 /* base_addr must be a multiple of 4KB */
 void acpi_bios_init(void)
 {
@@ -1443,10 +1562,12 @@ void acpi_bios_init(void)
     struct fadt_descriptor_rev1 *fadt;
     struct facs_descriptor_rev1 *facs;
     struct multiple_apic_table *madt;
+    struct system_resource_affinity_table *srat;
     uint8_t *dsdt;
     uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr;
     uint32_t acpi_tables_size, madt_addr, madt_size;
-    int i;
+    uint32_t srat_addr, srat_size;
+    int i, numanodes;
 
     /* reserve memory space for tables */
 #ifdef BX_USE_EBDA_TABLES
@@ -1478,6 +1599,21 @@ void acpi_bios_init(void)
     dsdt = (void *)(addr);
     addr += sizeof(AmlCode);
 
+    numanodes = get_numa_nodes();
+    if (numanodes > 0) {
+        addr = (addr + 7) & ~7;
+        srat_addr = addr;
+        srat_size = sizeof(*srat) +
+            sizeof(struct srat_processor_affinity) * smp_cpus +
+            sizeof(struct srat_memory_affinity) * (numanodes + 2);
+        srat = (void *)(addr);
+        addr += srat_size;
+    } else {
+        srat_addr = addr;
+        srat = (void*)(addr);
+        srat_size = 0;
+    }
+
     addr = (addr + 7) & ~7;
     madt_addr = addr;
     madt_size = sizeof(*madt) +
@@ -1507,8 +1643,10 @@ void acpi_bios_init(void)
     memset(rsdt, 0, sizeof(*rsdt));
     rsdt->table_offset_entry[0] = cpu_to_le32(fadt_addr);
     rsdt->table_offset_entry[1] = cpu_to_le32(madt_addr);
-    acpi_build_table_header((struct acpi_table_header *)rsdt,
-                            "RSDT", sizeof(*rsdt), 1);
+    if (numanodes > 0)
+        rsdt->table_offset_entry[2] = cpu_to_le32(srat_addr);
+    acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+        sizeof(*rsdt) - (numanodes > 0? 0: sizeof(uint32_t)), 1);
 
     /* FADT */
     memset(fadt, 0, sizeof(*fadt));
@@ -1590,6 +1728,67 @@ void acpi_bios_init(void)
         acpi_build_table_header((struct acpi_table_header *)madt,
                                 "APIC", madt_size, 1);
     }
+
+    /* SRAT */
+    if (numanodes > 0) {
+        struct srat_processor_affinity *core;
+        struct srat_memory_affinity *numamem;
+        int slots;
+        uint64_t mem_len, mem_base, next_base = 0;
+        uint32_t nodes[64];
+
+        memset (srat, 0 , srat_size);
+        srat->reserved1=1;
+
+        read_config_numa_vcpus (nodes, numanodes);
+        core = (void*)(srat + 1);
+        for (i = 0; i < smp_cpus; ++i) {
+            core->type = SRAT_PROCESSOR;
+            core->length = sizeof(*core);
+            core->local_apic_id = i;
+            core->proximity_lo = nodes[i];
+            memset (core->proximity_hi, 0, 3);
+            core->local_sapic_eid = 0;
+            if (i < smp_cpus)
+                core->flags = cpu_to_le32(1);
+            else
+                core->flags = 0;
+            core++;
+        }
+        /* the memory map is a bit tricky, it contains at least one hole
+           from 640k-1M and possibly another one from 3.5G-4G. */
+        numamem = (void*)core; slots = 0;
+        qemu_cfg_select (QEMU_CFG_NUMA_MEM);
+        acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
+        next_base = 1024 * 1024; numamem++;slots++;
+        for (i = 1; i < numanodes + 1; ++i) {
+            mem_base = next_base;
+            mem_len = qemu_cfg_get64();
+            if (i == 1) mem_len -= 1024 * 1024;
+            next_base = mem_base + mem_len;
+
+            /* Cut out the PCI hole */
+            if (mem_base <= ram_size && next_base > ram_size) {
+                mem_len -= next_base - ram_size;
+                if (mem_len > 0) {
+                    acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+                    numamem++; slots++;
+                }
+                mem_base = 1ULL << 32;
+                mem_len = next_base - ram_size;
+                next_base += (1ULL << 32) - ram_size;
+            }
+            acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+            numamem++; slots++;
+        }
+        for (; slots < numanodes + 2; slots++) {
+            acpi_build_srat_memory(numamem, 0, 0, 0, 0);
+            numamem++;
+        }
+
+        acpi_build_table_header((struct acpi_table_header *)srat,
+                                "SRAT", srat_size, 1);
+    }
 }
 
 /* SMBIOS entry point -- must be written to a 16-bit aligned address

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2008-12-05 13:34 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-05 13:34 [PATCH 3/3] v2: KVM-userspace: generate a SRAT table to describe the guests NUMA topology Andre Przywara

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.