From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andre Przywara Subject: [PATCH 3/3] v2: KVM-userspace: generate a SRAT table to describe the guests NUMA topology Date: Fri, 5 Dec 2008 14:34:12 +0100 Message-ID: <49392DD4.4070806@amd.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------020605070709030601010104" Cc: kvm@vger.kernel.org, "Daniel P. Berrange" To: Avi Kivity Return-path: Received: from outbound-dub.frontbridge.com ([213.199.154.16]:35677 "EHLO IE1EHSOBE005.bigfish.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752275AbYLENeI (ORCPT ); Fri, 5 Dec 2008 08:34:08 -0500 Sender: kvm-owner@vger.kernel.org List-ID: --------------020605070709030601010104 Content-Type: text/plain; charset="ISO-8859-1"; format=flowed Content-Transfer-Encoding: 7bit According to the NUMA topology passed via the QEMU firmware configuration interface, the BIOS code generates an SRAT (System Resource Affinity Table) to describe which (V)CPUs and which parts of memory are assigned to each node. This will then be read and hopefully honored by the guest OS. Signed-off-by: Andre Przywara -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 277-84917 ----to satisfy European Law for business letters: AMD Saxony Limited Liability Company & Co. KG, Wilschdorfer Landstr. 101, 01109 Dresden, Germany Register Court Dresden: HRA 4896, General Partner authorized to represent: AMD Saxony LLC (Wilmington, Delaware, US) General Manager of AMD Saxony LLC: Dr. Hans-R. 
Deppe, Thomas McCoy --------------020605070709030601010104 Content-Type: text/x-patch; name="kvmnuma_sratbios.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="kvmnuma_sratbios.patch" commit 24fce48662f201903bca101e90ccca386428e764 Author: Andre Przywara Date: Fri Dec 5 14:18:16 2008 +0100 generate appropriate SRAT ACPI table diff --git a/bios/rombios32.c b/bios/rombios32.c index 3c9a2d7..878690d 100755 --- a/bios/rombios32.c +++ b/bios/rombios32.c @@ -455,12 +455,30 @@ void wrmsr_smp(uint32_t index, uint64_t val) p->ecx = 0; } +static inline uint16_t le16_to_cpu(uint16_t x) +{ + return x; +} + +static inline uint32_t le32_to_cpu(uint32_t x) +{ + return x; +} + +static inline uint64_t le64_to_cpu(uint64_t x) +{ + return x; +} + #ifdef BX_QEMU #define QEMU_CFG_CTL_PORT 0x510 #define QEMU_CFG_DATA_PORT 0x511 #define QEMU_CFG_SIGNATURE 0x00 #define QEMU_CFG_ID 0x01 #define QEMU_CFG_UUID 0x02 +#define QEMU_CFG_NUMA_NODES 0x07 +#define QEMU_CFG_NUMA_VCPUS 0x08 +#define QEMU_CFG_NUMA_MEM 0x09 int qemu_cfg_port; @@ -488,6 +506,23 @@ void qemu_cfg_read(uint8_t *buf, int len) while (len--) *(buf++) = inb(QEMU_CFG_DATA_PORT); } + +uint32_t qemu_cfg_get32 (void) +{ +uint32_t ret; + + qemu_cfg_read ((uint8_t*)&ret, 4); + return le32_to_cpu (ret); +} + +uint64_t qemu_cfg_get64 (void) +{ +uint64_t ret; + + qemu_cfg_read ((uint8_t*)&ret, 8); + return le64_to_cpu (ret); +} + #endif void uuid_probe(void) @@ -502,6 +537,18 @@ void uuid_probe(void) memset(bios_uuid, 0, 16); } +int get_numa_nodes(void) +{ +uint16_t nodes = 0; +#ifdef BX_QEMU + if(qemu_cfg_port) { + qemu_cfg_select(QEMU_CFG_NUMA_NODES); + qemu_cfg_read((uint8_t*)&nodes, 2); + } +#endif + return le16_to_cpu(nodes); +} + void cpu_probe(void) { uint32_t eax, ebx, ecx, edx; @@ -1232,7 +1279,7 @@ struct rsdp_descriptor /* Root System Descriptor Pointer */ struct rsdt_descriptor_rev1 { ACPI_TABLE_HEADER_DEF /* ACPI common table header */ - uint32_t table_offset_entry [2]; /* Array of pointers to 
other */ + uint32_t table_offset_entry [3]; /* Array of pointers to other */ /* ACPI tables */ }; @@ -1350,6 +1397,9 @@ struct multiple_apic_table #define APIC_XRUPT_SOURCE 8 #define APIC_RESERVED 9 /* 9 and greater are reserved */ +#define SRAT_PROCESSOR 0 +#define SRAT_MEMORY 1 + /* * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE) */ @@ -1357,6 +1407,40 @@ struct multiple_apic_table uint8_t type; \ uint8_t length; +/* + * SRAT (NUMA topology description) table + */ +struct system_resource_affinity_table +{ + ACPI_TABLE_HEADER_DEF + uint32_t reserved1; + uint32_t reserved2[2]; +}; + +struct srat_processor_affinity +{ +APIC_HEADER_DEF + uint8_t proximity_lo; + uint8_t local_apic_id; + uint32_t flags; + uint8_t local_sapic_eid; + uint8_t proximity_hi[3]; + uint32_t reserved; +}; + +struct srat_memory_affinity +{ + APIC_HEADER_DEF + uint8_t proximity[4]; + uint16_t reserved1; + uint32_t base_addr_low,base_addr_high; + uint32_t length_low,length_high; + uint32_t reserved2; + uint32_t flags; + uint32_t reserved3[2]; +}; + + /* Sub-structures for MADT */ struct madt_processor_apic @@ -1411,6 +1495,26 @@ static int acpi_checksum(const uint8_t *data, int len) return (-sum) & 0xff; } +static void read_config_numa_vcpus (uint32_t *nodes, int numnodes) +{ +#ifdef BX_QEMU +uint64_t cpumask; +int node,cpu; + + qemu_cfg_select (QEMU_CFG_NUMA_VCPUS); + for (node = 0; node < numnodes; node++) { + cpumask = qemu_cfg_get64(); + for (cpu = 0; cpu < 64; cpu++) { + if (cpumask == 0) break; + if (cpumask & 1) nodes[cpu]=node; + cpumask >>= 1; + } + } +#endif + return; + +} + static void acpi_build_table_header(struct acpi_table_header *h, char *sig, int len, uint8_t rev) { @@ -1435,6 +1539,21 @@ static void acpi_build_table_header(struct acpi_table_header *h, h->checksum = acpi_checksum((void *)h, len); } +static void acpi_build_srat_memory(struct srat_memory_affinity *numamem, + uint64_t base, uint64_t len, int node, int enabled) +{ + numamem->type = SRAT_MEMORY; + 
numamem->length = sizeof(*numamem); + memset (numamem->proximity, 0 ,4); + numamem->proximity[0] = node; + numamem->flags = cpu_to_le32(!!enabled); + numamem->base_addr_low = base & 0xFFFFFFFF; + numamem->base_addr_high = base >> 32; + numamem->length_low = len & 0xFFFFFFFF; + numamem->length_high = len >> 32; + return; +} + /* base_addr must be a multiple of 4KB */ void acpi_bios_init(void) { @@ -1443,10 +1562,12 @@ void acpi_bios_init(void) struct fadt_descriptor_rev1 *fadt; struct facs_descriptor_rev1 *facs; struct multiple_apic_table *madt; + struct system_resource_affinity_table *srat; uint8_t *dsdt; uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr; uint32_t acpi_tables_size, madt_addr, madt_size; - int i; + uint32_t srat_addr, srat_size; + int i, numanodes; /* reserve memory space for tables */ #ifdef BX_USE_EBDA_TABLES @@ -1478,6 +1599,21 @@ void acpi_bios_init(void) dsdt = (void *)(addr); addr += sizeof(AmlCode); + numanodes = get_numa_nodes(); + if (numanodes > 0) { + addr = (addr + 7) & ~7; + srat_addr = addr; + srat_size = sizeof(*srat) + + sizeof(struct srat_processor_affinity) * smp_cpus + + sizeof(struct srat_memory_affinity) * (numanodes + 2); + srat = (void *)(addr); + addr += srat_size; + } else { + srat_addr = addr; + srat = (void*)(addr); + srat_size = 0; + } + addr = (addr + 7) & ~7; madt_addr = addr; madt_size = sizeof(*madt) + @@ -1507,8 +1643,10 @@ void acpi_bios_init(void) memset(rsdt, 0, sizeof(*rsdt)); rsdt->table_offset_entry[0] = cpu_to_le32(fadt_addr); rsdt->table_offset_entry[1] = cpu_to_le32(madt_addr); - acpi_build_table_header((struct acpi_table_header *)rsdt, - "RSDT", sizeof(*rsdt), 1); + if (numanodes > 0) + rsdt->table_offset_entry[2] = cpu_to_le32(srat_addr); + acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT", + sizeof(*rsdt) - (numanodes > 0? 
0: sizeof(uint32_t)), 1); /* FADT */ memset(fadt, 0, sizeof(*fadt)); @@ -1590,6 +1728,67 @@ void acpi_bios_init(void) acpi_build_table_header((struct acpi_table_header *)madt, "APIC", madt_size, 1); } + + /* SRAT */ + if (numanodes > 0) { + struct srat_processor_affinity *core; + struct srat_memory_affinity *numamem; + int slots; + uint64_t mem_len, mem_base, next_base = 0; + uint32_t nodes[64]; + + memset (srat, 0 , srat_size); + srat->reserved1=1; + + read_config_numa_vcpus (nodes, numanodes); + core = (void*)(srat + 1); + for (i = 0; i < smp_cpus; ++i) { + core->type = SRAT_PROCESSOR; + core->length = sizeof(*core); + core->local_apic_id = i; + core->proximity_lo = nodes[i]; + memset (core->proximity_hi, 0, 3); + core->local_sapic_eid = 0; + if (i < smp_cpus) + core->flags = cpu_to_le32(1); + else + core->flags = 0; + core++; + } + /* the memory map is a bit tricky, it contains at least one hole + from 640k-1M and possibly another one from 3.5G-4G. */ + numamem = (void*)core; slots = 0; + qemu_cfg_select (QEMU_CFG_NUMA_MEM); + acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1); + next_base = 1024 * 1024; numamem++;slots++; + for (i = 1; i < numanodes + 1; ++i) { + mem_base = next_base; + mem_len = qemu_cfg_get64(); + if (i == 1) mem_len -= 1024 * 1024; + next_base = mem_base + mem_len; + + /* Cut out the PCI hole */ + if (mem_base <= ram_size && next_base > ram_size) { + mem_len -= next_base - ram_size; + if (mem_len > 0) { + acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1); + numamem++; slots++; + } + mem_base = 1ULL << 32; + mem_len = next_base - ram_size; + next_base += (1ULL << 32) - ram_size; + } + acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1); + numamem++; slots++; + } + for (; slots < numanodes + 2; slots++) { + acpi_build_srat_memory(numamem, 0, 0, 0, 0); + numamem++; + } + + acpi_build_table_header((struct acpi_table_header *)srat, + "SRAT", srat_size, 1); + } } /* SMBIOS entry point -- must be written to a 16-bit aligned address 
--------------020605070709030601010104--