From: "Jan Beulich" <jbeulich@novell.com>
To: xen-devel@lists.xensource.com
Subject: [PATCH, RFC] x86: make the GDT per-CPU
Date: Wed, 10 Sep 2008 15:35:40 +0100 [thread overview]
Message-ID: <48C7F75C.76E4.0078.0@novell.com> (raw)
The major issue with supporting a significantly larger number of physical
CPUs appears to be the use of per-CPU GDT entries - at present, x86-64
could support only up to 126 CPUs (with code changes to also use the
top-most GDT page, that would be 254). Rather than taking incremental
steps here, this patch converts the GDT itself to be per-CPU, so that
limitations in that respect go away entirely.
There's one particular part of it that I'm not very happy with, but have
had no better idea so far: In the general case, it is now necessary to
reload the GDT twice during context switch. Hence I'd appreciate ideas
on how to avoid this and stay with a single reload.
The patch contains several debug items (which are marked as such), so it
is in no case intended to go in as-is.
Jan
Index: 2008-09-01/xen/arch/x86/boot/wakeup.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/wakeup.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/wakeup.S 2008-09-09 10:44:30.000000000 +0200
@@ -168,7 +168,7 @@ wakeup_32:
.word 0,0,0
lgdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .quad gdt_table - FIRST_RESERVED_GDT_BYTE
+ .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
wakeup_64:
lgdt lgdt_descr(%rip)
Index: 2008-09-01/xen/arch/x86/boot/x86_32.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/x86_32.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_32.S 2008-09-09 14:45:58.000000000 +0200
@@ -78,7 +78,7 @@ idt_descr:
.word 0
gdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .long gdt_table - FIRST_RESERVED_GDT_BYTE
+ .long boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
.align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
#define GUEST_DESC(d) \
.long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff, \
((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00cf9a000000ffff /* 0xe008 ring 0 4.00GB code at 0x0 */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
GUEST_DESC(0x00c0b200) /* 0xe021 ring 1 3.xxGB data at 0x0 */
GUEST_DESC(0x00c0fa00) /* 0xe02b ring 3 3.xxGB code at 0x0 */
GUEST_DESC(0x00c0f200) /* 0xe033 ring 3 3.xxGB data at 0x0 */
+ .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE,0
Index: 2008-09-01/xen/arch/x86/boot/x86_64.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/x86_64.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_64.S 2008-09-09 14:45:08.000000000 +0200
@@ -85,7 +85,7 @@ multiboot_ptr:
.word 0
gdt_descr:
.word LAST_RESERVED_GDT_BYTE
- .quad gdt_table - FIRST_RESERVED_GDT_BYTE
+ .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
.word 0,0,0
idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
.quad cpu0_stack
.align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 data */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
.quad 0x00cff2000000ffff /* 0xe02b ring 3 data */
.quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */
.quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */
+ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE, 0
/* NB. Even rings != 0 get access to the full 4Gb, as only the */
/* (compatibility) machine->physical mapping table lives there. */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 data */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
.quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */
.quad 0x00cff2000000ffff /* 0xe033 ring 3 data */
.quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */
+ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+ .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */
.align PAGE_SIZE, 0
Index: 2008-09-01/xen/arch/x86/cpu/common.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/cpu/common.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/cpu/common.c 2008-09-10 16:09:18.000000000 +0200
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
if (cpu_has_pat)
wrmsrl(MSR_IA32_CR_PAT, host_pat);
+ /* Install correct page table. */
+ write_ptbase(current);
+
*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
*(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current);
asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
#define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
#undef CD
-
- /* Install correct page table. */
- write_ptbase(current);
}
#ifdef CONFIG_HOTPLUG_CPU
Index: 2008-09-01/xen/arch/x86/domain.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/domain.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain.c 2008-09-10 15:10:05.000000000 +0200
@@ -205,7 +205,6 @@ static inline int may_switch_mode(struct
int switch_native(struct domain *d)
{
- l1_pgentry_t gdt_l1e;
unsigned int vcpuid;
if ( d == NULL )
@@ -217,12 +216,8 @@ int switch_native(struct domain *d)
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- /* switch gdt */
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
{
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
if (d->vcpu[vcpuid])
release_compat_l4(d->vcpu[vcpuid]);
}
@@ -232,7 +227,6 @@ int switch_native(struct domain *d)
int switch_compat(struct domain *d)
{
- l1_pgentry_t gdt_l1e;
unsigned int vcpuid;
if ( d == NULL )
@@ -244,15 +238,11 @@ int switch_compat(struct domain *d)
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
- /* switch gdt */
- gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
{
if ( (d->vcpu[vcpuid] != NULL) &&
(setup_compat_l4(d->vcpu[vcpuid]) != 0) )
goto undo_and_fail;
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
}
domain_set_alloc_bitsize(d);
@@ -261,13 +251,10 @@ int switch_compat(struct domain *d)
undo_and_fail:
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
while ( vcpuid-- != 0 )
{
if ( d->vcpu[vcpuid] != NULL )
release_compat_l4(d->vcpu[vcpuid]);
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
}
return -ENOMEM;
}
@@ -315,7 +302,13 @@ int vcpu_initialise(struct vcpu *v)
if ( is_idle_domain(d) )
{
v->arch.schedule_tail = continue_idle_domain;
- v->arch.cr3 = __pa(idle_pg_table);
+ if ( v->vcpu_id )
+ v->arch.cr3 = d->vcpu[0]->arch.cr3;
+ else if ( !*idle_vcpu )
+ v->arch.cr3 = __pa(idle_pg_table);
+ else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+ return -ENOMEM;
+else printk("new idle domain: CR3=%lx\n", v->arch.cr3);//temp
}
v->arch.guest_context.ctrlreg[4] =
@@ -342,8 +335,7 @@ int arch_domain_create(struct domain *d,
#ifdef __x86_64__
struct page_info *pg;
#endif
- l1_pgentry_t gdt_l1e;
- int i, vcpuid, pdpt_order, paging_initialised = 0;
+ int i, pdpt_order, paging_initialised = 0;
int rc = -ENOMEM;
d->arch.hvm_domain.hap_enabled =
@@ -362,18 +354,6 @@ int arch_domain_create(struct domain *d,
goto fail;
memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
- /*
- * Map Xen segments into every VCPU's GDT, irrespective of whether every
- * VCPU will actually be used. This avoids an NMI race during context
- * switch: if we take an interrupt after switching CR3 but before switching
- * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
- * try to load CS from an invalid table.
- */
- gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
- for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
- d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
#if defined(__i386__)
mapcache_domain_init(d);
@@ -1183,12 +1163,26 @@ static void paravirt_ctxt_switch_to(stru
}
}
+static void check_cpu(unsigned int cpu, int line) {//temp
+ unsigned int _cpu;
+ asm("lsll %1, %0" : "=r" (_cpu) : "rm" (PER_CPU_GDT_ENTRY << 3));
+ if(_cpu != cpu) {
+ struct desc_ptr gdt_desc;
+ asm("sgdt %0" : "=m" (gdt_desc));
+ printk("CPU#%u: wrong GDT (%lx->%u) at #%d\n", cpu, gdt_desc.base, _cpu, line);
+ show_page_walk(gdt_desc.base + FIRST_RESERVED_GDT_BYTE);
+ }
+}
+
static void __context_switch(void)
{
struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
- unsigned int cpu = smp_processor_id();
+ unsigned int i, cpu = smp_processor_id();
struct vcpu *p = per_cpu(curr_vcpu, cpu);
struct vcpu *n = current;
+ struct desc_struct *gdt;
+ struct page_info *page;
+ struct desc_ptr gdt_desc;
ASSERT(p != n);
ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1214,14 +1208,35 @@ static void __context_switch(void)
cpu_set(cpu, n->domain->domain_dirty_cpumask);
cpu_set(cpu, n->vcpu_dirty_cpumask);
+check_cpu(cpu, __LINE__);//temp
+ gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+ per_cpu(compat_gdt_table, cpu);
+ page = virt_to_page(gdt);
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ {
+ n->domain->arch.mm_perdomain_pt
+ [(n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE + i]
+ = l1e_from_page(page + i, __PAGE_HYPERVISOR);
+ }
+
+check_cpu(cpu, __LINE__);//temp
+ if ( p->vcpu_id != n->vcpu_id )
+ {
+ gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+ gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
+ }
+
write_ptbase(n);
+check_cpu(cpu, __LINE__);//temp
if ( p->vcpu_id != n->vcpu_id )
{
- char gdt_load[10];
- *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
- *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(n);
- asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+ gdt_desc.base = GDT_VIRT_START(n);
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
}
if ( p->domain != n->domain )
@@ -1257,6 +1272,7 @@ void context_switch(struct vcpu *prev, s
if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
{
+check_cpu(cpu, __LINE__);//temp
local_irq_enable();
}
else
@@ -1272,8 +1288,6 @@ void context_switch(struct vcpu *prev, s
uint64_t efer = read_efer();
if ( !(efer & EFER_SCE) )
write_efer(efer | EFER_SCE);
- flush_tlb_one_local(GDT_VIRT_START(next) +
- FIRST_RESERVED_GDT_BYTE);
}
#endif
Index: 2008-09-01/xen/arch/x86/domain_build.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/domain_build.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain_build.c 2008-09-09 11:01:01.000000000 +0200
@@ -313,24 +313,11 @@ int __init construct_dom0(
#if defined(__x86_64__)
if ( compat32 )
{
- l1_pgentry_t gdt_l1e;
-
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
if ( nr_pages != (unsigned int)nr_pages )
nr_pages = UINT_MAX;
-
- /*
- * Map compatibility Xen segments into every VCPU's GDT. See
- * arch_domain_create() for further comments.
- */
- gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
- PAGE_HYPERVISOR);
- for ( i = 0; i < MAX_VIRT_CPUS; i++ )
- d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
- flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
}
#endif
Index: 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-09 14:09:36.000000000 +0200
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
__vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
- __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+ __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
__vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
__vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
Index: 2008-09-01/xen/arch/x86/setup.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/setup.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/setup.c 2008-09-10 15:23:25.000000000 +0200
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
extern void vesa_init(void);
extern void vesa_mtrr_init(void);
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+ = boot_cpu_compat_gdt_table;
+#endif
+
struct tss_struct init_tss[NR_CPUS];
char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
static void __init init_idle_domain(void)
{
struct domain *idle_domain;
+ unsigned int i;
/* Domain creation requires that scheduler structures are initialised. */
scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
idle_vcpu[0] = this_cpu(curr_vcpu) = current;
setup_idle_pagetable();
+
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+ l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+ __PAGE_HYPERVISOR);
+
}
static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
parse_video_info();
set_current((struct vcpu *)0xfffff000); /* debug sanity */
- idle_vcpu[0] = current;
set_processor_id(0); /* needed early, for smp_processor_id() */
if ( cpu_has_efer )
rdmsrl(MSR_EFER, this_cpu(efer));
Index: 2008-09-01/xen/arch/x86/smpboot.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/smpboot.c 2008-09-10 12:34:16.000000000 +0200
+++ 2008-09-01/xen/arch/x86/smpboot.c 2008-09-10 13:43:56.000000000 +0200
@@ -835,10 +835,15 @@ static int __devinit do_boot_cpu(int api
*/
{
unsigned long boot_error;
+ unsigned int i;
int timeout;
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
struct vcpu *v;
+ struct desc_struct *gdt;
+#ifdef __x86_64__
+ struct page_info *page;
+#endif
/*
* Save current MTRR state in case it was changed since early boot
@@ -864,6 +869,38 @@ static int __devinit do_boot_cpu(int api
/* Debug build: detect stack overflow by setting up a guard page. */
memguard_guard_stack(stack_start.esp);
+ gdt = per_cpu(gdt_table, cpu);
+ if (gdt == boot_cpu_gdt_table) {
+ i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+ page = alloc_domheap_pages(NULL, i,
+ MEMF_node(cpu_to_node(cpu)));
+ per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+ memcpy(gdt, boot_cpu_compat_gdt_table,
+ NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+ page = alloc_domheap_pages(NULL, i,
+ MEMF_node(cpu_to_node(cpu)));
+ per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+ per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+ memcpy(gdt, boot_cpu_gdt_table,
+ NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+ BUILD_BUG_ON(NR_CPUS > 0x10000);
+ gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+printk("CPU#%d: GDT@%p[%p]\n", cpu, gdt, per_cpu(compat_gdt_table, cpu));//temp
+ }
+
+ for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ v->domain->arch.mm_perdomain_pt
+ [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE + i]
+ = l1e_from_page(virt_to_page(gdt) + i,
+ __PAGE_HYPERVISOR);
+
/*
* This grunge runs the startup process for
* the targeted processor.
Index: 2008-09-01/xen/arch/x86/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/traps.c 2008-09-09 16:15:21.000000000 +0200
@@ -2692,6 +2692,13 @@ asmlinkage void do_general_protection(st
return;
}
+if(regs->error_code) {//temp
+ struct desc_ptr gdt_desc;
+ asm("sgdt %0" : "=m" (gdt_desc));
+ printk("CPU[%u] GDT@%lx [%lx,%x]\n", smp_processor_id(), GDT_VIRT_START(v), gdt_desc.base, gdt_desc.limit);
+ show_page_walk(GDT_VIRT_START(v) + regs->error_code);
+}
+
#if defined(__i386__)
if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) &&
(regs->error_code == 0) &&
@@ -2961,13 +2968,13 @@ void set_intr_gate(unsigned int n, void
void set_tss_desc(unsigned int n, void *addr)
{
_set_tssldt_desc(
- gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+ per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
(unsigned long)addr,
offsetof(struct tss_struct, __cacheline_filler) - 1,
9);
#ifdef CONFIG_COMPAT
_set_tssldt_desc(
- compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+ per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
(unsigned long)addr,
offsetof(struct tss_struct, __cacheline_filler) - 1,
11);
Index: 2008-09-01/xen/arch/x86/x86_32/mm.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/mm.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/mm.c 2008-09-10 16:04:08.000000000 +0200
@@ -135,6 +135,30 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+ unsigned int i;
+ struct domain *d = v->domain;
+ l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+ l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+ if ( !l2_table )
+ return 0;
+
+ memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+ l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+ l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+ copy_page(l2_table, idle_pg_table_l2 +
+ l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+ for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+ l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
+
+ return __pa(l3_table);
+}
+
void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
{
int i;
@@ -189,7 +213,7 @@ void __init subarch_init_memory(void)
{
/* Guest kernel runs in ring 0, not ring 1. */
struct desc_struct *d;
- d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+ d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
d[0].b &= ~_SEGMENT_DPL;
d[1].b &= ~_SEGMENT_DPL;
}
Index: 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/supervisor_mode_kernel.S 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S 2008-09-09 13:57:13.000000000 +0200
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
# %gs:%esi now points to the guest stack before the
# interrupt/exception occured.
- /*
- * Reverse the __TSS macro, giving us the CPU number.
- * The TSS for this cpu is at init_tss + ( cpu * 128 ).
- */
- str %ecx
- shrl $3,%ecx # Calculate GDT index for TSS.
- subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu.
- shll $6,%ecx # Each TSS entry is 0x80 bytes
- addl $init_tss,%ecx # but we have 2*cpu from above.
+ movl $PER_CPU_GDT_ENTRY*8,%ecx
+ lsll %ecx,%ecx
+ shll $7,%ecx # Each TSS entry is 0x80 bytes
+ addl $init_tss,%ecx
# Load Xen stack from TSS.
movw TSS_ss0(%ecx),%ax
Index: 2008-09-01/xen/arch/x86/x86_32/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/traps.c 2008-09-09 14:48:33.000000000 +0200
@@ -197,13 +197,15 @@ static unsigned char doublefault_stack[D
asmlinkage void do_double_fault(void)
{
- struct tss_struct *tss = &doublefault_tss;
- unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+ struct tss_struct *tss;
+ unsigned int cpu;
watchdog_disable();
console_force_unlock();
+ asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
/* Find information saved during fault and dump it to the console. */
tss = &init_tss[cpu];
printk("*** DOUBLE FAULT ***\n");
@@ -328,7 +330,7 @@ void __devinit subarch_percpu_traps_init
tss->eflags = 2;
tss->bitmap = IOBMP_INVALID_OFFSET;
_set_tssldt_desc(
- gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+ boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
(unsigned long)tss, 235, 9);
set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
Index: 2008-09-01/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_64/mm.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/mm.c 2008-09-10 15:51:37.000000000 +0200
@@ -21,6 +21,7 @@
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
+#include <xen/numa.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
@@ -209,6 +210,24 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct page_info *page = alloc_domheap_page(NULL,
+ MEMF_node(vcpu_to_node(v)));
+ l4_pgentry_t *l4_table = page_to_virt(page);
+
+ if ( !page )
+ return 0;
+
+ copy_page(l4_table, idle_pg_table);
+ l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+ l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+ __PAGE_HYPERVISOR);
+
+ return __pa(l4_table);
+}
+
void __init zap_low_mappings(void)
{
BUG_ON(num_online_cpus() != 1);
Index: 2008-09-01/xen/arch/x86/x86_64/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_64/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/traps.c 2008-09-09 14:49:19.000000000 +0200
@@ -217,15 +217,14 @@ void show_page_walk(unsigned long addr)
asmlinkage void double_fault(void);
asmlinkage void do_double_fault(struct cpu_user_regs *regs)
{
- unsigned int cpu, tr;
-
- asm volatile ( "str %0" : "=r" (tr) );
- cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+ unsigned int cpu;
watchdog_disable();
console_force_unlock();
+ asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
/* Find information saved during fault and dump it to the console. */
printk("*** DOUBLE FAULT ***\n");
print_xen_info();
Index: 2008-09-01/xen/common/domain.c
===================================================================
--- 2008-09-01.orig/xen/common/domain.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/common/domain.c 2008-09-10 08:29:58.000000000 +0200
@@ -172,7 +172,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
{
struct domain *d;
struct vcpu *v;
- unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+ unsigned int vcpu_id = cpu_id % 2;//temp MAX_VIRT_CPUS;
if ( (v = idle_vcpu[cpu_id]) != NULL )
return v;
Index: 2008-09-01/xen/include/asm-x86/desc.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/desc.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/desc.h 2008-09-09 14:32:27.000000000 +0200
@@ -34,11 +34,9 @@
#define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS
#define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
#elif defined(__i386__)
@@ -51,17 +49,15 @@
#define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
#endif
#ifndef __ASSEMBLY__
-#define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n) __asm__ __volatile__ ("ltr %%ax" : : "a" (TSS_ENTRY<<3) )
#if defined(__x86_64__)
#define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {
#endif
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+ unsigned short limit;
+ unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
#ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
#else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
#endif
extern void set_intr_gate(unsigned int irq, void * addr);
Index: 2008-09-01/xen/include/asm-x86/ldt.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/ldt.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/ldt.h 2008-09-09 14:13:41.000000000 +0200
@@ -6,7 +6,6 @@
static inline void load_LDT(struct vcpu *v)
{
- unsigned int cpu;
struct desc_struct *desc;
unsigned long ents;
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu
}
else
{
- cpu = smp_processor_id();
- desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
- + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+ desc = (!is_pv_32on64_vcpu(v)
+ ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+ + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
_set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
- __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+ __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
}
}
Index: 2008-09-01/xen/include/asm-x86/page.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/page.h 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/page.h 2008-09-10 09:06:02.000000000 +0200
@@ -278,6 +278,7 @@ extern unsigned int m2p_compat_vstart;
#endif
void paging_init(void);
void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
#endif /* !defined(__ASSEMBLY__) */
#define _PAGE_PRESENT 0x001U
next reply other threads:[~2008-09-10 14:35 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-09-10 14:35 Jan Beulich [this message]
2008-09-11 10:54 ` [PATCH, RFC] x86: make the GDT per-CPU Keir Fraser
2008-09-11 12:28 ` Jan Beulich
2008-09-11 12:35 ` Jan Beulich
2008-09-11 12:42 ` Jan Beulich
2008-09-11 13:15 ` Keir Fraser
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=48C7F75C.76E4.0078.0@novell.com \
--to=jbeulich@novell.com \
--cc=xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.