* [PATCH 3/3] KVM-userspace: generate a SRAT table to describe the guests NUMA topology
@ 2008-11-27 22:28 Andre Przywara
0 siblings, 0 replies; only message in thread
From: Andre Przywara @ 2008-11-27 22:28 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm
[-- Attachment #1: Type: text/plain, Size: 773 bytes --]
Based on the number of nodes passed in the CMOS RAM (offset 0x3e),
the BIOS code generates an SRAT (System Resource Affinity Table) to
describe which (V)CPUs and which parts of memory are assigned to each
node. This will then be read and hopefully honored by the guest OS.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
[-- Attachment #2: kvmnuma_sratbios.patch --]
[-- Type: text/plain, Size: 7467 bytes --]
diff --git a/bios/rombios32.c b/bios/rombios32.c
index 3c9a2d7..c0bf08f 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -1232,7 +1232,7 @@ struct rsdp_descriptor /* Root System Descriptor Pointer */
struct rsdt_descriptor_rev1
{
ACPI_TABLE_HEADER_DEF /* ACPI common table header */
- uint32_t table_offset_entry [2]; /* Array of pointers to other */
+ uint32_t table_offset_entry [3]; /* Array of pointers to other */
/* ACPI tables */
};
@@ -1350,6 +1350,9 @@ struct multiple_apic_table
#define APIC_XRUPT_SOURCE 8
#define APIC_RESERVED 9 /* 9 and greater are reserved */
+#define SRAT_PROCESSOR 0 /* 'type' value written into struct srat_processor_affinity entries */
+#define SRAT_MEMORY 1 /* 'type' value written into struct srat_memory_affinity entries */
+
/*
* MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
*/
@@ -1357,6 +1360,40 @@ struct multiple_apic_table
uint8_t type; \
uint8_t length;
+/*
+ * SRAT (System Resource Affinity Table) header for the guest's NUMA topology;
+ * acpi_bios_init() writes the processor and memory affinity entries right after it. */
+struct system_resource_affinity_table
+{
+ ACPI_TABLE_HEADER_DEF /* ACPI common table header */
+ uint32_t reserved1; /* acpi_bios_init() stores 1 here */
+ uint32_t reserved2[2]; /* stays zero from the memset of the whole table */
+};
+
+struct srat_processor_affinity /* one SRAT entry per (V)CPU, assigning it to a NUMA node */
+{
+APIC_HEADER_DEF /* provides 'type' (set to SRAT_PROCESSOR) and 'length' (set to sizeof this struct) */
+ uint8_t proximity_lo; /* node number of the CPU; acpi_bios_init() uses cpu index % numanodes */
+ uint8_t local_apic_id; /* acpi_bios_init() uses the cpu index as the APIC id */
+ uint32_t flags; /* written as cpu_to_le32(1) for every CPU entry */
+ uint8_t local_sapic_eid; /* always written as 0 */
+ uint8_t proximity_hi[3]; /* always zeroed; presumably the upper bytes of the node number -- confirm against the ACPI spec */
+ uint32_t reserved; /* stays zero from the memset of the whole table */
+};
+
+struct srat_memory_affinity /* one SRAT entry per memory range, assigning it to a NUMA node */
+{
+ APIC_HEADER_DEF /* provides 'type' (set to SRAT_MEMORY) and 'length' (set to sizeof this struct) */
+ uint8_t proximity[4]; /* byte 0 receives the node number, the other bytes are zeroed */
+ uint16_t reserved1; /* stays zero from the memset of the whole table */
+ uint32_t base_addr_low,base_addr_high; /* low and high 32 bits of the 64-bit range base address */
+ uint32_t length_low,length_high; /* low and high 32 bits of the 64-bit range length */
+ uint32_t reserved2; /* stays zero from the memset of the whole table */
+ uint32_t flags; /* cpu_to_le32(1) for populated ranges, 0 for the unused padding entries */
+ uint32_t reserved3[2]; /* stays zero from the memset of the whole table */
+};
+
+
/* Sub-structures for MADT */
struct madt_processor_apic
@@ -1443,10 +1480,12 @@ void acpi_bios_init(void)
struct fadt_descriptor_rev1 *fadt;
struct facs_descriptor_rev1 *facs;
struct multiple_apic_table *madt;
+ struct system_resource_affinity_table *srat;
uint8_t *dsdt;
uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr;
uint32_t acpi_tables_size, madt_addr, madt_size;
- int i;
+ uint32_t srat_addr, srat_size;
+ int i, numanodes;
/* reserve memory space for tables */
#ifdef BX_USE_EBDA_TABLES
@@ -1478,6 +1517,21 @@ void acpi_bios_init(void)
dsdt = (void *)(addr);
addr += sizeof(AmlCode);
+ numanodes = cmos_readb (0x3E);
+ if (numanodes > 0) {
+ addr = (addr + 7) & ~7;
+ srat_addr = addr;
+ srat_size = sizeof(*srat) +
+ sizeof(struct srat_processor_affinity) * smp_cpus +
+ sizeof(struct srat_memory_affinity) * (numanodes + 2);
+ srat = (void *)(addr);
+ addr += srat_size;
+ } else {
+ srat_addr = addr;
+ srat = (void*)(addr);
+ srat_size = 0;
+ }
+
addr = (addr + 7) & ~7;
madt_addr = addr;
madt_size = sizeof(*madt) +
@@ -1507,8 +1561,10 @@ void acpi_bios_init(void)
memset(rsdt, 0, sizeof(*rsdt));
rsdt->table_offset_entry[0] = cpu_to_le32(fadt_addr);
rsdt->table_offset_entry[1] = cpu_to_le32(madt_addr);
- acpi_build_table_header((struct acpi_table_header *)rsdt,
- "RSDT", sizeof(*rsdt), 1);
+ if (numanodes > 0)
+ rsdt->table_offset_entry[2] = cpu_to_le32(srat_addr);
+ acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+ sizeof(*rsdt) - (numanodes > 0? 0: sizeof(uint32_t)), 1);
/* FADT */
memset(fadt, 0, sizeof(*fadt));
@@ -1590,6 +1646,92 @@ void acpi_bios_init(void)
acpi_build_table_header((struct acpi_table_header *)madt,
"APIC", madt_size, 1);
}
+
+ /* SRAT */
+ if (numanodes > 0) {
+ struct srat_processor_affinity *core;
+ struct srat_memory_affinity *numamem;
+ int nodenr = 0, slots;
+ unsigned long numa_chunk_size;
+ uint64_t mem_len, mem_base, next_base = 0;
+
+ if (ram_end == ram_size) {
+ numa_chunk_size = (ram_size / numanodes) >> 20;
+ } else {
+ numa_chunk_size = (1ULL << 32) - ram_size;
+ numa_chunk_size = (ram_end - numa_chunk_size) >> 20;
+ numa_chunk_size /= numanodes;
+ }
+
+ memset (srat, 0 , srat_size);
+ srat->reserved1=1;
+ core = (void*)(srat + 1);
+ for (i = 0; i < smp_cpus; ++i) {
+ core->type = SRAT_PROCESSOR;
+ core->length = sizeof(*core);
+ core->local_apic_id = i;
+ core->proximity_lo = i % numanodes;
+ memset (core->proximity_hi, 0, 3);
+ core->local_sapic_eid = 0;
+ if (i < smp_cpus)
+ core->flags = cpu_to_le32(1);
+ else
+ core->flags = 0;
+ core++;
+ }
+ numamem = (void*)core; slots = 0;
+ for (i = 0; i < numanodes + 1; ++i) {
+ numamem->type = SRAT_MEMORY;
+ numamem->length = sizeof(*numamem);
+ memset (numamem->proximity, 0 ,4);
+ numamem->proximity[0] = nodenr;
+ mem_base = next_base;
+ mem_len = (uint64_t)numa_chunk_size << 20;
+ if (i == 1) mem_len -= 1024 * 1024;
+ if (i == 0) {
+ mem_len = 640 * 1024;
+ next_base = 1024 * 1024;
+ } else next_base = mem_base + mem_len;
+
+ numamem->flags = cpu_to_le32(1);
+ numamem->base_addr_low = mem_base & 0xFFFFFFFF;
+ numamem->base_addr_high = mem_base >> 32;
+
+ /* Cut out the PCI hole */
+
+ if (mem_base <= ram_size && next_base > ram_size && i > 0) {
+ mem_len -= next_base - ram_size;
+ if (mem_len > 0) {
+ numamem->length_low = mem_len & 0xFFFFFFFF;
+ numamem->length_high = mem_len >> 32;
+ numamem++; slots++;
+ numamem->type = SRAT_MEMORY;
+ numamem->length = sizeof(*numamem);
+ memset (numamem->proximity, 0 ,4);
+ numamem->proximity[0] = nodenr;
+ }
+ numamem->base_addr_low = 0;
+ numamem->base_addr_high = 1;
+ numamem->flags = cpu_to_le32(1);
+ mem_len = next_base - ram_size;
+ next_base += (1ULL << 32) - ram_size;
+ }
+ numamem->length_low = mem_len & 0xFFFFFFFF;
+ numamem->length_high = mem_len >> 32;
+ numamem++; slots++;
+ if (i != 0) nodenr++;
+ }
+ for (; slots < numanodes + 2; slots++) {
+ numamem->type = SRAT_MEMORY;
+ numamem->length = sizeof(*numamem);
+ memset (numamem->proximity, 0 ,4);
+ numamem->flags = 0;
+ numamem++;
+ }
+
+ acpi_build_table_header((struct acpi_table_header *)srat,
+ "SRAT", srat_size, 1);
+ }
}
/* SMBIOS entry point -- must be written to a 16-bit aligned address
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 7d296c4..e5c8b6e 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -268,6 +268,8 @@ static void cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
rtc_set_memory(s, 0x34, val);
rtc_set_memory(s, 0x35, val >> 8);
+ rtc_set_memory(s, 0x3e, numnumanodes);
+
/* set the number of CPU */
rtc_set_memory(s, 0x5f, smp_cpus - 1);
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2008-11-27 22:28 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2008-11-27 22:28 [PATCH 3/3] KVM-userspace: generate a SRAT table to describe the guests NUMA topology Andre Przywara
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.