* [PATCH 3/3] v2: KVM-userspace: generate a SRAT table to describe the guests NUMA topology
@ 2008-12-05 13:34 Andre Przywara
0 siblings, 0 replies; only message in thread
From: Andre Przywara @ 2008-12-05 13:34 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Daniel P. Berrange
[-- Attachment #1: Type: text/plain, Size: 788 bytes --]
According to the NUMA topology passed via the QEMU firmware
configuration interface, the BIOS code generates an SRAT (System Resource
Affinity Table) describing which (V)CPUs and which parts of memory are
assigned to each node. The guest OS will then read and, hopefully, honor
this table.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
[-- Attachment #2: kvmnuma_sratbios.patch --]
[-- Type: text/x-patch, Size: 9092 bytes --]
commit 24fce48662f201903bca101e90ccca386428e764
Author: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Date: Fri Dec 5 14:18:16 2008 +0100
generate appropriate SRAT ACPI table
diff --git a/bios/rombios32.c b/bios/rombios32.c
index 3c9a2d7..878690d 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -455,12 +455,30 @@ void wrmsr_smp(uint32_t index, uint64_t val)
p->ecx = 0;
}
+/*
+ * Convert a 16-bit little-endian value to CPU byte order.
+ * The value is handed back unchanged.
+ */
+static inline uint16_t le16_to_cpu(uint16_t x)
+{
+    return x;
+}
+
+/*
+ * Convert a 32-bit little-endian value to CPU byte order.
+ * The value is handed back unchanged.
+ */
+static inline uint32_t le32_to_cpu(uint32_t x)
+{
+    return x;
+}
+
+/*
+ * Convert a 64-bit little-endian value to CPU byte order.
+ * The value is handed back unchanged.
+ */
+static inline uint64_t le64_to_cpu(uint64_t x)
+{
+    return x;
+}
+
#ifdef BX_QEMU
#define QEMU_CFG_CTL_PORT 0x510
#define QEMU_CFG_DATA_PORT 0x511
#define QEMU_CFG_SIGNATURE 0x00
#define QEMU_CFG_ID 0x01
#define QEMU_CFG_UUID 0x02
+#define QEMU_CFG_NUMA_NODES 0x07
+#define QEMU_CFG_NUMA_VCPUS 0x08
+#define QEMU_CFG_NUMA_MEM 0x09
int qemu_cfg_port;
@@ -488,6 +506,23 @@ void qemu_cfg_read(uint8_t *buf, int len)
while (len--)
*(buf++) = inb(QEMU_CFG_DATA_PORT);
}
+
+/*
+ * Read the next four bytes from the QEMU configuration data port and
+ * return them as a 32-bit value converted from little-endian to CPU
+ * byte order.
+ */
+uint32_t qemu_cfg_get32 (void)
+{
+    uint32_t raw;
+
+    qemu_cfg_read((uint8_t *)&raw, sizeof(raw));
+    return le32_to_cpu(raw);
+}
+
+/*
+ * Read the next eight bytes from the QEMU configuration data port and
+ * return them as a 64-bit value converted from little-endian to CPU
+ * byte order.
+ */
+uint64_t qemu_cfg_get64 (void)
+{
+    uint64_t raw;
+
+    qemu_cfg_read((uint8_t *)&raw, sizeof(raw));
+    return le64_to_cpu(raw);
+}
+
#endif
void uuid_probe(void)
@@ -502,6 +537,18 @@ void uuid_probe(void)
memset(bios_uuid, 0, 16);
}
+/*
+ * Return the number of NUMA nodes read from the QEMU_CFG_NUMA_NODES
+ * configuration item.  The result is 0 when the BIOS is built without
+ * BX_QEMU or when qemu_cfg_port is zero.
+ */
+int get_numa_nodes(void)
+{
+    uint16_t raw_count = 0;
+
+#ifdef BX_QEMU
+    if (qemu_cfg_port != 0) {
+        qemu_cfg_select(QEMU_CFG_NUMA_NODES);
+        qemu_cfg_read((uint8_t *)&raw_count, sizeof(raw_count));
+    }
+#endif
+    return le16_to_cpu(raw_count);
+}
+
void cpu_probe(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -1232,7 +1279,7 @@ struct rsdp_descriptor /* Root System Descriptor Pointer */
struct rsdt_descriptor_rev1
{
ACPI_TABLE_HEADER_DEF /* ACPI common table header */
- uint32_t table_offset_entry [2]; /* Array of pointers to other */
+ uint32_t table_offset_entry [3]; /* Array of pointers to other */
/* ACPI tables */
};
@@ -1350,6 +1397,9 @@ struct multiple_apic_table
#define APIC_XRUPT_SOURCE 8
#define APIC_RESERVED 9 /* 9 and greater are reserved */
+#define SRAT_PROCESSOR 0
+#define SRAT_MEMORY 1
+
/*
* MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
*/
@@ -1357,6 +1407,40 @@ struct multiple_apic_table
uint8_t type; \
uint8_t length;
+/*
+ * SRAT (NUMA topology description) table
+ */
+struct system_resource_affinity_table /* fixed head of the SRAT; acpi_bios_init() places the affinity entries directly behind it */
+{
+ ACPI_TABLE_HEADER_DEF /* ACPI common table header */
+ uint32_t reserved1; /* acpi_bios_init() stores 1 here */
+ uint32_t reserved2[2]; /* left zero by the memset in acpi_bios_init() */
+};
+
+struct srat_processor_affinity /* one SRAT entry per CPU, written by acpi_bios_init() */
+{
+APIC_HEADER_DEF /* supplies the type and length fields; type is set to SRAT_PROCESSOR */
+ uint8_t proximity_lo; /* receives the CPU's NUMA node number */
+ uint8_t local_apic_id; /* receives the CPU index */
+ uint32_t flags; /* written as 1 for every listed CPU; presumably the "enabled" bit -- confirm against the ACPI spec */
+ uint8_t local_sapic_eid; /* always written as 0 */
+ uint8_t proximity_hi[3]; /* always zeroed; presumably the upper proximity domain bytes -- confirm */
+ uint32_t reserved;
+};
+
+/* One SRAT entry describing a memory range; see acpi_build_srat_memory(). */
+struct srat_memory_affinity
+{
+    APIC_HEADER_DEF             /* supplies the type and length fields */
+    uint8_t  proximity[4];      /* byte 0 receives the node number */
+    uint16_t reserved1;
+    uint32_t base_addr_low;     /* bits 0-31 of the range start */
+    uint32_t base_addr_high;    /* bits 32-63 of the range start */
+    uint32_t length_low;        /* bits 0-31 of the range length */
+    uint32_t length_high;       /* bits 32-63 of the range length */
+    uint32_t reserved2;
+    uint32_t flags;             /* bit 0 is set for an enabled entry */
+    uint32_t reserved3[2];
+};
+
+
/* Sub-structures for MADT */
struct madt_processor_apic
@@ -1411,6 +1495,26 @@ static int acpi_checksum(const uint8_t *data, int len)
return (-sum) & 0xff;
}
+/*
+ * Fill nodes[] with the NUMA node number of each VCPU.
+ *
+ * nodes:    array with room for 64 entries, indexed by VCPU number
+ * numnodes: number of per-node CPU masks to read
+ *
+ * Every entry is first reset to node 0, so a VCPU that appears in no mask
+ * (or any VCPU when built without BX_QEMU) never keeps whatever happened
+ * to be in the caller's buffer.  With BX_QEMU, one 64-bit mask per node is
+ * then read from QEMU_CFG_NUMA_VCPUS; bit N set in a node's mask assigns
+ * VCPU N to that node.  If a VCPU is set in several masks, the last node
+ * read wins.
+ */
+static void read_config_numa_vcpus (uint32_t *nodes, int numnodes)
+{
+    int cpu;
+#ifdef BX_QEMU
+    uint64_t cpumask;
+    int node;
+#endif
+
+    /* default: everything belongs to node 0 */
+    for (cpu = 0; cpu < 64; cpu++)
+        nodes[cpu] = 0;
+
+#ifdef BX_QEMU
+    qemu_cfg_select (QEMU_CFG_NUMA_VCPUS);
+    for (node = 0; node < numnodes; node++) {
+        cpumask = qemu_cfg_get64();
+        /* stop scanning as soon as no set bits remain in the mask */
+        for (cpu = 0; cpu < 64 && cpumask != 0; cpu++, cpumask >>= 1) {
+            if (cpumask & 1)
+                nodes[cpu] = node;
+        }
+    }
+#endif
+}
+
static void acpi_build_table_header(struct acpi_table_header *h,
char *sig, int len, uint8_t rev)
{
@@ -1435,6 +1539,21 @@ static void acpi_build_table_header(struct acpi_table_header *h,
h->checksum = acpi_checksum((void *)h, len);
}
+/*
+ * Fill in one SRAT memory affinity entry.
+ *
+ * numamem: entry to write
+ * base:    start address of the memory range
+ * len:     length of the memory range
+ * node:    proximity domain (NUMA node) the range belongs to
+ * enabled: non-zero sets bit 0 of the flags field, zero clears it
+ *
+ * All multi-byte fields are stored little-endian, matching the handling
+ * of the flags field.  The reserved fields are not touched; the caller is
+ * expected to have zeroed the entry (acpi_bios_init() memsets the whole
+ * table before building it).
+ */
+static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
+                                   uint64_t base, uint64_t len, int node, int enabled)
+{
+    uint32_t domain = (uint32_t)node;
+
+    numamem->type = SRAT_MEMORY;
+    numamem->length = sizeof(*numamem);
+
+    /* the proximity field is four bytes wide: store the whole domain
+       number, least significant byte first, not just its low byte */
+    numamem->proximity[0] = domain & 0xFF;
+    numamem->proximity[1] = (domain >> 8) & 0xFF;
+    numamem->proximity[2] = (domain >> 16) & 0xFF;
+    numamem->proximity[3] = (domain >> 24) & 0xFF;
+
+    numamem->flags = cpu_to_le32(!!enabled);
+    numamem->base_addr_low = cpu_to_le32((uint32_t)(base & 0xFFFFFFFF));
+    numamem->base_addr_high = cpu_to_le32((uint32_t)(base >> 32));
+    numamem->length_low = cpu_to_le32((uint32_t)(len & 0xFFFFFFFF));
+    numamem->length_high = cpu_to_le32((uint32_t)(len >> 32));
+}
+
/* base_addr must be a multiple of 4KB */
void acpi_bios_init(void)
{
@@ -1443,10 +1562,12 @@ void acpi_bios_init(void)
struct fadt_descriptor_rev1 *fadt;
struct facs_descriptor_rev1 *facs;
struct multiple_apic_table *madt;
+ struct system_resource_affinity_table *srat;
uint8_t *dsdt;
uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr;
uint32_t acpi_tables_size, madt_addr, madt_size;
- int i;
+ uint32_t srat_addr, srat_size;
+ int i, numanodes;
/* reserve memory space for tables */
#ifdef BX_USE_EBDA_TABLES
@@ -1478,6 +1599,21 @@ void acpi_bios_init(void)
dsdt = (void *)(addr);
addr += sizeof(AmlCode);
+ numanodes = get_numa_nodes();
+ if (numanodes > 0) {
+ addr = (addr + 7) & ~7;
+ srat_addr = addr;
+ srat_size = sizeof(*srat) +
+ sizeof(struct srat_processor_affinity) * smp_cpus +
+ sizeof(struct srat_memory_affinity) * (numanodes + 2);
+ srat = (void *)(addr);
+ addr += srat_size;
+ } else {
+ srat_addr = addr;
+ srat = (void*)(addr);
+ srat_size = 0;
+ }
+
addr = (addr + 7) & ~7;
madt_addr = addr;
madt_size = sizeof(*madt) +
@@ -1507,8 +1643,10 @@ void acpi_bios_init(void)
memset(rsdt, 0, sizeof(*rsdt));
rsdt->table_offset_entry[0] = cpu_to_le32(fadt_addr);
rsdt->table_offset_entry[1] = cpu_to_le32(madt_addr);
- acpi_build_table_header((struct acpi_table_header *)rsdt,
- "RSDT", sizeof(*rsdt), 1);
+ if (numanodes > 0)
+ rsdt->table_offset_entry[2] = cpu_to_le32(srat_addr);
+ acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+ sizeof(*rsdt) - (numanodes > 0? 0: sizeof(uint32_t)), 1);
/* FADT */
memset(fadt, 0, sizeof(*fadt));
@@ -1590,6 +1728,67 @@ void acpi_bios_init(void)
acpi_build_table_header((struct acpi_table_header *)madt,
"APIC", madt_size, 1);
}
+
+ /* SRAT */
+ if (numanodes > 0) {
+ struct srat_processor_affinity *core;
+ struct srat_memory_affinity *numamem;
+ int slots;
+ uint64_t mem_len, mem_base, next_base = 0;
+ uint32_t nodes[64];
+
+ memset (srat, 0 , srat_size);
+ srat->reserved1=1;
+
+ read_config_numa_vcpus (nodes, numanodes);
+ core = (void*)(srat + 1);
+ for (i = 0; i < smp_cpus; ++i) {
+ core->type = SRAT_PROCESSOR;
+ core->length = sizeof(*core);
+ core->local_apic_id = i;
+ core->proximity_lo = nodes[i];
+ memset (core->proximity_hi, 0, 3);
+ core->local_sapic_eid = 0;
+ if (i < smp_cpus)
+ core->flags = cpu_to_le32(1);
+ else
+ core->flags = 0;
+ core++;
+ }
+ /* the memory map is a bit tricky, it contains at least one hole
+ from 640k-1M and possibly another one from 3.5G-4G. */
+ numamem = (void*)core; slots = 0;
+ qemu_cfg_select (QEMU_CFG_NUMA_MEM);
+ acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
+ next_base = 1024 * 1024; numamem++;slots++;
+ for (i = 1; i < numanodes + 1; ++i) {
+ mem_base = next_base;
+ mem_len = qemu_cfg_get64();
+ if (i == 1) mem_len -= 1024 * 1024;
+ next_base = mem_base + mem_len;
+
+ /* Cut out the PCI hole */
+ if (mem_base <= ram_size && next_base > ram_size) {
+ mem_len -= next_base - ram_size;
+ if (mem_len > 0) {
+ acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+ numamem++; slots++;
+ }
+ mem_base = 1ULL << 32;
+ mem_len = next_base - ram_size;
+ next_base += (1ULL << 32) - ram_size;
+ }
+ acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+ numamem++; slots++;
+ }
+ for (; slots < numanodes + 2; slots++) {
+ acpi_build_srat_memory(numamem, 0, 0, 0, 0);
+ numamem++;
+ }
+
+ acpi_build_table_header((struct acpi_table_header *)srat,
+ "SRAT", srat_size, 1);
+ }
}
/* SMBIOS entry point -- must be written to a 16-bit aligned address
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2008-12-05 13:34 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-05 13:34 [PATCH 3/3] v2: KVM-userspace: generate a SRAT table to describe the guests NUMA topology Andre Przywara
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.