From: Andre Przywara <andre.przywara@amd.com>
To: Keir Fraser <keir.fraser@eu.citrix.com>, xen-devel@lists.xensource.com
Subject: [PATCH 2/2] hvm: NUMA guest: inject NUMA topology into the guest (resend)
Date: Fri, 11 Jul 2008 16:13:27 +0200 [thread overview]
Message-ID: <48776A87.7030401@amd.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 692 bytes --]
This patch extends the hvm_info_table to store the number of guest NUMA
nodes and creates a matching ACPI SRAT table that describes the NUMA
topology of the guest.
Rediffed to apply against staging 18036.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
[-- Attachment #2: 04_numa_guest_18036.patch --]
[-- Type: text/plain, Size: 10234 bytes --]
diff -r f70a956b987f tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h Fri Jul 11 16:02:39 2008 +0200
@@ -356,6 +356,61 @@
};
/*
+ * System Resource Affinity Table header definition (SRAT).
+ *
+ * Fixed-size leading part of the table. construct_srat() in build.c
+ * writes the processor and memory affinity entries directly behind this
+ * structure and stores the resulting total size in header.length.
+ */
+struct acpi_20_srat {
+ struct acpi_header header; /* common ACPI table header (signature, length, checksum, ...) */
+ uint32_t table_revision; /* set to ACPI_SRAT_TABLE_REVISION by construct_srat() */
+ uint32_t reserved2[2]; /* left zero by the memset() in construct_srat() */
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1 /* value stored in acpi_20_srat.table_revision */
+
+/*
+ * System Resource Affinity Table structure types.
+ * Stored in the leading type byte of each entry that follows the
+ * acpi_20_srat header.
+ */
+#define ACPI_PROCESSOR_AFFIN 0x00 /* entry is a struct acpi_20_srat_processor */
+#define ACPI_MEMORY_AFFIN 0x01 /* entry is a struct acpi_20_srat_memory */
+
+struct acpi_20_srat_processor { /* construct_srat() emits one per VCPU; fields add up to 16 bytes */
+ uint8_t type; /* ACPI_PROCESSOR_AFFIN */
+ uint8_t length; /* size of this entry in bytes */
+ uint8_t domain; /* NUMA node the processor belongs to */
+ uint8_t apic_id; /* local APIC ID of the processor */
+ uint32_t flags; /* ACPI_LOCAL_APIC_AFFIN_* bits */
+ uint8_t sapic_id; /* written as 0 by construct_srat() */
+ uint8_t domain_hi[3]; /* presumably the upper bytes of the node number; left zero - TODO confirm */
+ uint32_t reserved;
+};
+
+/*
+ * Local APIC Affinity Flags. All other bits are reserved and must be 0.
+ * Used in acpi_20_srat_processor.flags.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory { /* one memory range bound to a NUMA node; fields add up to 40 bytes */
+ uint8_t type; /* ACPI_MEMORY_AFFIN */
+ uint8_t length; /* size of this entry in bytes (not of the memory range) */
+ uint8_t domain; /* NUMA node owning the range */
+ uint8_t domain_hi[3]; /* this is ACPI 3.0, reserved in 2.0 */
+ uint16_t reserved;
+ uint32_t base_address_lo; /* start of the range, bits 0-31 */
+ uint32_t base_address_hi; /* start of the range, bits 32-63 */
+ uint32_t length_lo; /* size of the range in bytes, bits 0-31 */
+ uint32_t length_hi; /* size of the range in bytes, bits 32-63 */
+ uint32_t reserved2;
+ uint32_t flags; /* ACPI_MEM_AFFIN_* bits */
+ uint32_t reserved3[2];
+};
+
+/*
+ * Memory Affinity Flags. All other bits are reserved and must be 0.
+ * Used in acpi_20_srat_memory.flags; construct_srat() only sets
+ * ACPI_MEM_AFFIN_ENABLED.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2) /* this is ACPI 3.0 */
+
+/*
* Table Signatures.
*/
#define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@
#define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
#define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
#define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
/*
* Table revision numbers.
@@ -378,6 +434,7 @@
#define ACPI_2_0_TCPA_REVISION 0x02
#define ACPI_2_0_HPET_REVISION 0x01
#define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
#pragma pack ()
diff -r f70a956b987f tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/build.c Fri Jul 11 16:02:39 2008 +0200
@@ -20,6 +20,9 @@
#include "ssdt_tpm.h"
#include "../config.h"
#include "../util.h"
+#include "../e820.h"
+
+#define ONEMB 0x100000
#define align16(sz) (((sz) + 15) & ~15)
#define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d))
@@ -45,6 +48,140 @@
p = table;
p[checksum_offset] = -sum;
+}
+/*
+ * Map a VCPU index to a guest NUMA node index.
+ *
+ * The nr_vcpus VCPUs are split into get_numanodes() consecutive groups:
+ * with div = nr_vcpus / nodes and mod = nr_vcpus % nodes, the first mod
+ * nodes take div + 1 VCPUs each and the remaining nodes take div each.
+ * Example: 5 VCPUs on 2 nodes gives VCPUs 0-2 -> node 0, 3-4 -> node 1.
+ *
+ * get_numanodes() is used as a divisor and must not return 0; the SRAT
+ * is only built when construct_secondary_tables() sees a value > 0.
+ * If nr_vcpus is smaller than the node count, div is 0, but every
+ * vcpu_id below nr_vcpus is then handled by the first return.
+ */
+static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus)
+{
+int div,mod; /* VCPUs per node (rounded down) and the leftover VCPUs */
+
+ div=nr_vcpus / get_numanodes();
+ mod=nr_vcpus % get_numanodes();
+
+ if ( vcpu_id < mod * (div + 1)) return vcpu_id / (div + 1); /* inside one of the larger groups */
+ return ( ( vcpu_id - (mod * (div + 1)) ) / div ) + mod; /* skip the larger groups, then count in steps of div */
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+/*
+ * Estimate the amount of guest memory in bytes from the E820 map.
+ *
+ * Only two map entries contribute, selected by their exact start address:
+ *  - the entry starting at 1MB: its size plus 1MB plus three pages,
+ *  - the entry starting at 4GB: its size.
+ * The entry type is not looked at. Returns 0 if neither entry exists.
+ *
+ * NOTE(review): the three extra pages presumably make up for pages that
+ * are kept out of the low RAM entry when the map is built - confirm
+ * against the code that creates the E820 map.
+ */
+static uint64_t guessmemsize (void)
+{
+ uint64_t ret = 0;
+ struct e820entry *map = HVM_E820;
+ int i;
+
+ for ( i = 0; i < *HVM_E820_NR ; i++)
+ {
+ if (map[i].addr == ONEMB ) /* memory starting at 1MB */
+ ret+=map[i].size + PAGE_SIZE * 3 + ONEMB; /* + ONEMB: count the first megabyte as well */
+ if (map[i].addr == (1ULL << 32)) /* memory starting at 4GB */
+ ret+=map[i].size;
+ }
+ return ret;
+}
+/*
+ * Build the ACPI SRAT for the guest at srat.
+ *
+ * Written in this order:
+ *  - the struct acpi_20_srat header,
+ *  - one processor affinity entry per VCPU (get_vcpu_nr() entries),
+ *  - one memory affinity entry on node 0 for each E820_RAM map entry seen
+ *    before the first RAM entry that starts at or above 1MB,
+ *  - one memory affinity entry per NUMA node (get_numanodes() entries).
+ *
+ * header.length and the checksum cover exactly the bytes written; the
+ * return value is that size rounded up to a multiple of 16. The size of
+ * the destination buffer is not checked here, so the caller has to
+ * provide enough room behind srat.
+ */
+int construct_srat(struct acpi_20_srat *srat)
+{
+ struct acpi_20_srat_processor *processor;
+ struct acpi_20_srat_memory *memory;
+ struct e820entry *map = HVM_E820;
+ int i, offset = 0; /* offset: number of table bytes produced so far */
+ uint64_t hvm_node_mem; /* memory share of one node in bytes, a multiple of 1MB */
+
+ memset(srat, 0, sizeof(*srat));
+ srat->header.signature = ACPI_2_0_SRAT_SIGNATURE;
+ srat->header.revision = ACPI_2_0_SRAT_REVISION;
+ fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+ fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+ srat->header.oem_revision = ACPI_OEM_REVISION;
+ srat->header.creator_id = ACPI_CREATOR_ID;
+ srat->header.creator_revision = ACPI_CREATOR_REVISION;
+ srat->table_revision = ACPI_SRAT_TABLE_REVISION;
+ offset += sizeof(*srat);
+
+ processor = (struct acpi_20_srat_processor *)(srat + 1); /* entries start right behind the header */
+ for ( i = 0; i < get_vcpu_nr(); i++ )
+ {
+ memset(processor, 0, sizeof(*processor));
+ processor->type = ACPI_PROCESSOR_AFFIN;
+ processor->length = sizeof(*processor);
+ processor->domain = vcpu_to_numa_node (i, get_vcpu_nr());
+ processor->apic_id = LAPIC_ID(i);
+ processor->flags = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+ processor->sapic_id= 0;
+ offset += sizeof(*processor);
+ processor++;
+ }
+
+ /*
+ * Equally distribute the memory on all NUMA nodes. Round up the size
+ * of available memory to whole megabytes, as (at least) Linux cannot cope
+ * with uneven NUMA node boundaries. The remaining part of memory will be
+ * assigned to the last NUMA node. The mapping of the first MB is copied
+ * from the E820 map and assigned to node 0.
+ *
+ * NOTE(review): node start addresses are computed as i * hvm_node_mem,
+ * i.e. as if guest memory were one contiguous range from address 0,
+ * although guessmemsize() also counts the map entry that starts at 4GB.
+ * Confirm this is intended for guests with memory above 4GB.
+ */
+ hvm_node_mem = guessmemsize()+ONEMB-1; /* round up ... */
+ hvm_node_mem = hvm_node_mem >> 20; /* ... and convert to megabytes */
+ /*
+ * 64bit/32bit does not work because of missing libgcc, so the megabyte
+ * count is narrowed to 32 bits before the division.
+ */
+ hvm_node_mem = (uint32_t)hvm_node_mem / get_numanodes();
+ hvm_node_mem = hvm_node_mem << 20; /* back to bytes */
+
+ memory = (struct acpi_20_srat_memory *)(processor); /* memory entries follow the last processor entry */
+ for ( i = 0; i < *HVM_E820_NR; i++ )
+ {
+ if ( map[i].type != E820_RAM ) continue;
+ if ( map[i].addr >= ONEMB ) break; /* memory from 1MB upwards is described per node below */
+
+ memset(memory, 0, sizeof(*memory));
+ memory->type = ACPI_MEMORY_AFFIN;
+ memory->length = sizeof(*memory);
+ memory->domain = 0;
+ memory->base_address_lo = map[i].addr & 0xFFFFFFFFL;
+ memory->base_address_hi = map[i].addr >> 32;
+ memory->length_lo = map[i].size & 0xFFFFFFFFL;
+ memory->length_hi = map[i].size >> 32;
+ memory->flags = ACPI_MEM_AFFIN_ENABLED;
+
+ offset += sizeof(*memory);
+ memory++;
+ }
+
+ for ( i = 0; i < get_numanodes(); i++ )
+ {
+ memset(memory, 0, sizeof(*memory));
+ memory->type = ACPI_MEMORY_AFFIN;
+ memory->length = sizeof(*memory);
+ memory->domain = i;
+ if ( i == 0 )
+ { /* node 0: starts at 1MB; also taken when there is only one node */
+ memory->base_address_lo = ONEMB;
+ memory->base_address_hi = 0;
+ memory->length_lo = ( hvm_node_mem - ONEMB ) & 0xFFFFFFFFL;
+ memory->length_hi = ( hvm_node_mem - ONEMB ) >> 32;
+ } else
+ if ( i == get_numanodes()-1 )
+ { /* last node: gets what is left of the unrounded total */
+ memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+ memory->base_address_hi = (i * hvm_node_mem) >> 32;
+ memory->length_lo = (guessmemsize()-hvm_node_mem*i) & 0xFFFFFFFFL;
+ memory->length_hi = (guessmemsize()-hvm_node_mem*i) >> 32;
+ } else
+ { /* nodes in between: exactly one per-node share */
+ memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL;
+ memory->base_address_hi = (i * hvm_node_mem) >> 32;
+ memory->length_lo = hvm_node_mem & 0xFFFFFFFFL;
+ memory->length_hi = hvm_node_mem >> 32;
+ }
+ memory->flags = ACPI_MEM_AFFIN_ENABLED;
+ offset += sizeof(*memory);
+ memory++;
+ }
+
+ srat->header.length = offset; /* exact size, without the alignment padding */
+ set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+ return align16(offset); /* space consumed in the caller's buffer */
}
static int uart_exists(uint16_t uart_base)
@@ -188,6 +325,7 @@
static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
{
int offset = 0, nr_tables = 0;
+ struct acpi_20_srat *srat;
struct acpi_20_madt *madt;
struct acpi_20_hpet *hpet;
struct acpi_20_tcpa *tcpa;
@@ -200,6 +338,14 @@
madt = (struct acpi_20_madt *)&buf[offset];
offset += construct_madt(madt);
table_ptrs[nr_tables++] = (unsigned long)madt;
+ }
+
+ /* SRAT. */
+ if ( get_numanodes() > 0 )
+ {
+ srat = (struct acpi_20_srat *)&buf[offset];
+ offset += construct_srat(srat);
+ table_ptrs[nr_tables++] = (unsigned long)srat;
}
/* HPET. */
diff -r f70a956b987f tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.c Fri Jul 11 16:02:39 2008 +0200
@@ -594,6 +594,12 @@
return (t ? t->nr_vcpus : 1);
}
+int get_numanodes(void)
+{ /* Return the numanodes field of the HVM info table (a uint32_t, returned as int). */
+ struct hvm_info_table *t = get_hvm_info_table();
+ return (t ? t->numanodes : 1); /* no info table: report a single node */
+}
+
int get_acpi_enabled(void)
{
struct hvm_info_table *t = get_hvm_info_table();
diff -r f70a956b987f tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.h Fri Jul 11 16:02:39 2008 +0200
@@ -104,6 +104,7 @@
/* HVM-builder info. */
int get_vcpu_nr(void);
+int get_numanodes(void);
int get_acpi_enabled(void);
int get_apic_mode(void);
diff -r f70a956b987f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:11 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Jul 11 16:02:39 2008 +0200
@@ -861,6 +861,18 @@
#endif /* __i386__ || __x86_64__ */
+static unsigned hweight_long (unsigned long value)
+{ /* Return the number of 1 bits in value (its Hamming weight). */
+int ret=0;
+
+ while (value>0) /* stops as soon as no set bits remain */
+ {
+ if (value&1) ++ret; /* count the lowest bit if it is set ... */
+ value>>=1; /* ... then shift it out */
+ }
+ return ret;
+}
+
static PyObject *pyxc_hvm_build(XcObject *self,
PyObject *args,
PyObject *kwds)
@@ -900,6 +912,7 @@
va_hvm->acpi_enabled = acpi;
va_hvm->apic_mode = apic;
va_hvm->nr_vcpus = vcpus;
+ va_hvm->numanodes = hweight_long(nodemask);
for ( i = 0, sum = 0; i < va_hvm->length; i++ )
sum += ((uint8_t *)va_hvm)[i];
va_hvm->checksum = -sum;
diff -r f70a956b987f xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h Fri Jul 11 16:02:11 2008 +0200
+++ b/xen/include/public/hvm/hvm_info_table.h Fri Jul 11 16:02:39 2008 +0200
@@ -36,6 +36,7 @@
uint8_t acpi_enabled;
uint8_t apic_mode;
uint32_t nr_vcpus;
+ uint32_t numanodes;
};
#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
reply other threads:[~2008-07-11 14:13 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=48776A87.7030401@amd.com \
--to=andre.przywara@amd.com \
--cc=keir.fraser@eu.citrix.com \
--cc=xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.