* [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
@ 2008-12-25 20:58 Avi Kivity
2008-12-25 20:58 ` [PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST) Avi Kivity
` (4 more replies)
0 siblings, 5 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-25 20:58 UTC (permalink / raw)
To: Ingo Molnar; +Cc: H. Peter Anvin, kvm, linux-kernel
The interrupt stack table (IST) mechanism is the only thing preventing
kvm from deferring saving and reloading of some significant state. It
is also somewhat complicated.
Remove it by switching the special exceptions to use the normal irqstack.
Avi Kivity (3):
x86: drop the use of the tss interrupt stack table (IST)
x86: Remove pda.irqcount
x86: Switch critical exceptions and NMI to irqstack
arch/x86/include/asm/desc.h | 12 -----
arch/x86/include/asm/page_64.h | 7 ---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/include/asm/processor.h | 11 ----
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 35 --------------
arch/x86/kernel/dumpstack_64.c | 96 --------------------------------------
arch/x86/kernel/entry_64.S | 49 ++++++++-----------
arch/x86/kernel/traps.c | 12 ++--
9 files changed, 27 insertions(+), 198 deletions(-)
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST)
2008-12-25 20:58 [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Avi Kivity
@ 2008-12-25 20:58 ` Avi Kivity
2008-12-25 20:58 ` [PATCH 2/3] x86: Remove pda.irqcount Avi Kivity
` (3 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-25 20:58 UTC (permalink / raw)
To: Ingo Molnar; +Cc: H. Peter Anvin, kvm, linux-kernel
The IST is the only thing that requires a valid TSS while running in
kernel mode. Dropping its use unlocks an optimization opportunity for
kvm: if we don't need a valid TSS while in kernel mode we can defer the
use of the VMLOAD/VMSAVE instructions until the next context switch,
reducing the executions of these costly instructions by a nice factor.
Kernel reliability should also be improved since interrupt paths are
simplified.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/include/asm/desc.h | 12 -----
arch/x86/include/asm/page_64.h | 7 ---
arch/x86/include/asm/processor.h | 11 ----
arch/x86/kernel/cpu/common.c | 34 -------------
arch/x86/kernel/dumpstack_64.c | 96 --------------------------------------
arch/x86/kernel/entry_64.S | 17 ++-----
arch/x86/kernel/traps.c | 12 ++--
7 files changed, 10 insertions(+), 179 deletions(-)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e6b82b1..0465c75 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -369,18 +369,6 @@ static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
_set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
}
-static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
-}
-
-static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
-{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
-}
-
#else
/*
* GET_DESC_BASE reads the descriptor base of the specified segment.
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29..7c89095 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -16,13 +16,6 @@
#define IRQSTACK_ORDER 2
#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
-#define STACKFAULT_STACK 1
-#define DOUBLEFAULT_STACK 2
-#define NMI_STACK 3
-#define DEBUG_STACK 4
-#define MCE_STACK 5
-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
-
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e3..4ef899c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -273,13 +273,6 @@ struct tss_struct {
DECLARE_PER_CPU(struct tss_struct, init_tss);
-/*
- * Save the original ist values for checking stack pointers during debugging
- */
-struct orig_ist {
- unsigned long ist[7];
-};
-
#define MXCSR_DEFAULT 0x1f80
struct i387_fsave_struct {
@@ -372,10 +365,6 @@ union thread_xstate {
struct xsave_struct xsave;
};
-#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(struct orig_ist, orig_ist);
-#endif
-
extern void print_cpu_info(struct cpuinfo_x86 *);
extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0..8563c51 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -903,9 +903,6 @@ void __cpuinit pda_init(int cpu)
}
}
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
-
extern asmlinkage void ignore_sysret(void);
/* May not be marked __init: used by software suspend */
@@ -931,12 +928,6 @@ void syscall_init(void)
unsigned long kernel_eflags;
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
#else
/* Make sure %fs is initialized properly in idle threads */
@@ -960,17 +951,13 @@ void __cpuinit cpu_init(void)
{
int cpu = stack_smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
unsigned long v;
- char *estacks = NULL;
struct task_struct *me;
int i;
/* CPU 0 is initialised in head64.c */
if (cpu != 0)
pda_init(cpu);
- else
- estacks = boot_exception_stacks;
me = current;
@@ -1000,27 +987,6 @@ void __cpuinit cpu_init(void)
if (cpu != 0 && x2apic)
enable_x2apic();
- /*
- * set up and load the per-CPU TSS
- */
- if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- }
- }
-
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
/*
* <= is required because the CPU will access up to
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7..a1be9af 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -31,81 +31,6 @@ void printk_address(unsigned long address, int reliable)
reliable ? "" : "? ", (void *) address);
}
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
-{
- static char ids[][8] = {
- [DEBUG_STACK - 1] = "#DB",
- [NMI_STACK - 1] = "NMI",
- [DOUBLEFAULT_STACK - 1] = "#DF",
- [STACKFAULT_STACK - 1] = "#SS",
- [MCE_STACK - 1] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
-#endif
- };
- unsigned k;
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = ids[k];
- return (unsigned long *)end;
- }
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = ids[j];
- return (unsigned long *)end;
- }
-#endif
- }
- return NULL;
-}
-
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -164,7 +89,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
{
const unsigned cpu = get_cpu();
unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
- unsigned used = 0;
struct thread_info *tinfo;
if (!task)
@@ -196,26 +120,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
*/
tinfo = task_thread_info(task);
for (;;) {
- char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
-
- if (estack_end) {
- if (ops->stack(data, id) < 0)
- break;
-
- bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
if (irqstack_end) {
unsigned long *irqstack;
irqstack = irqstack_end -
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332..8c882e1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -899,7 +899,7 @@ END(spurious_interrupt)
/* error code is on the stack already */
/* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0, irqtrace=1
+ .macro paranoidentry sym, irqtrace=1
SAVE_ALL
cld
movl $1,%ebx
@@ -910,22 +910,13 @@ END(spurious_interrupt)
SWAPGS
xorl %ebx,%ebx
1:
- .if \ist
- movq %gs:pda_data_offset, %rbp
- .endif
.if \irqtrace
TRACE_IRQS_OFF
.endif
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
movq $-1,ORIG_RAX(%rsp)
- .if \ist
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
call \sym
- .if \ist
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
DISABLE_INTERRUPTS(CLBR_NONE)
.if \irqtrace
TRACE_IRQS_OFF
@@ -1226,7 +1217,7 @@ KPROBE_ENTRY(debug)
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_debug, DEBUG_STACK
+ paranoidentry do_debug
paranoidexit
KPROBE_END(debug)
@@ -1236,7 +1227,7 @@ KPROBE_ENTRY(nmi)
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1
CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_nmi, 0, 0
+ paranoidentry do_nmi, 0
#ifdef CONFIG_TRACE_IRQFLAGS
paranoidexit 0
#else
@@ -1250,7 +1241,7 @@ KPROBE_ENTRY(int3)
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_int3, DEBUG_STACK
+ paranoidentry do_int3
jmp paranoid_exit1
CFI_ENDPROC
KPROBE_END(int3)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 04d242a..c8f71ae 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -962,10 +962,10 @@ void __init trap_init(void)
#endif
set_intr_gate(0, &divide_error);
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
+ set_intr_gate(1, &debug);
+ set_intr_gate(2, &nmi);
/* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+ set_system_intr_gate(3, &int3);
/* int4 can be called from all */
set_system_intr_gate(4, &overflow);
set_intr_gate(5, &bounds);
@@ -974,19 +974,19 @@ void __init trap_init(void)
#ifdef CONFIG_X86_32
set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+ set_intr_gate(8, &double_fault);
#endif
set_intr_gate(9, &coprocessor_segment_overrun);
set_intr_gate(10, &invalid_TSS);
set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(12, &stack_segment);
set_intr_gate(13, &general_protection);
set_intr_gate(14, &page_fault);
set_intr_gate(15, &spurious_interrupt_bug);
set_intr_gate(16, &coprocessor_error);
set_intr_gate(17, &alignment_check);
#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
+ set_intr_gate(18, &machine_check);
#endif
set_intr_gate(19, &simd_coprocessor_error);
--
1.6.0.6
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 2/3] x86: Remove pda.irqcount
2008-12-25 20:58 [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Avi Kivity
2008-12-25 20:58 ` [PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST) Avi Kivity
@ 2008-12-25 20:58 ` Avi Kivity
2008-12-25 20:58 ` [PATCH 3/3] x86: Switch critical exceptions and NMI to irqstack Avi Kivity
` (2 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-25 20:58 UTC (permalink / raw)
To: Ingo Molnar; +Cc: H. Peter Anvin, kvm, linux-kernel
pda.irqcount is used to test whether we need to switch to an irqstack or not.
We can do without it, however, by testing %rsp directly: if it's already
within the irqstack range we don't need to switch stacks.
This makes switching the nmi handler to use the irqstack easier.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 1 -
arch/x86/kernel/entry_64.S | 29 +++++++++++++----------------
4 files changed, 14 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..2099610 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -14,7 +14,7 @@ struct x8664_pda {
address */
unsigned long kernelstack; /* 16 top of kernel stack for current */
unsigned long oldrsp; /* 24 user rsp for system call */
- int irqcount; /* 32 Irq nesting counter. Starts -1 */
+ int unused; /* 32 for rent */
unsigned int cpunumber; /* 36 Logical CPU number */
#ifdef CONFIG_CC_STACKPROTECTOR
unsigned long stack_canary; /* 40 stack canary value */
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d..779d010 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -50,7 +50,6 @@ int main(void)
ENTRY(kernelstack);
ENTRY(oldrsp);
ENTRY(pcurrent);
- ENTRY(irqcount);
ENTRY(cpunumber);
ENTRY(irqstackptr);
ENTRY(data_offset);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8563c51..6313d03 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -877,7 +877,6 @@ void __cpuinit pda_init(int cpu)
mb();
pda->cpunumber = cpu;
- pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
pda->active_mm = &init_mm;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8c882e1..245fecd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -627,6 +627,15 @@ END(stub_rt_sigreturn)
vector already pushed) */
#define XCPT_FRAME _frame ORIG_RAX
+ .macro enter_irqstack scratch
+ mov %gs:pda_irqstackptr, \scratch
+ sub %rsp, \scratch
+ cmp $IRQSTACKSIZE-64, \scratch
+ jbe 1234f
+ mov %gs:pda_irqstackptr, %rsp
+1234:
+ .endm
+
/*
* Interrupt entry/exit.
*
@@ -655,14 +664,7 @@ END(stub_rt_sigreturn)
testl $3,CS(%rdi)
je 1f
SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
+1: enter_irqstack %rax
/*
* We entered an interrupt context - irqs are off:
*/
@@ -677,7 +679,6 @@ ENTRY(common_interrupt)
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- decl %gs:pda_irqcount
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
@@ -1325,14 +1326,12 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ enter_irqstack %rax
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
ret
CFI_ENDPROC
ENDPROC(call_softirq)
@@ -1369,15 +1368,13 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
CFI_DEFAULT_STACK
-11: incl %gs:pda_irqcount
- movq %rsp,%rbp
+11: movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ enter_irqstack %rax
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
--
1.6.0.6
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 3/3] x86: Switch critical exceptions and NMI to irqstack
2008-12-25 20:58 [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Avi Kivity
2008-12-25 20:58 ` [PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST) Avi Kivity
2008-12-25 20:58 ` [PATCH 2/3] x86: Remove pda.irqcount Avi Kivity
@ 2008-12-25 20:58 ` Avi Kivity
2008-12-26 9:15 ` [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Ingo Molnar
[not found] ` <87vdt5vfxc.fsf@basil.nowhere.org>
4 siblings, 0 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-25 20:58 UTC (permalink / raw)
To: Ingo Molnar; +Cc: H. Peter Anvin, kvm, linux-kernel
With the special exception stacks gone, the irqstack is a much safer place
than the regular task stacks.
Signed-off-by: Avi Kivity <avi@redhat.com>
---
arch/x86/kernel/entry_64.S | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 245fecd..8f40593 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -917,7 +917,10 @@ END(spurious_interrupt)
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
movq $-1,ORIG_RAX(%rsp)
+ mov %rsp, %rbp
+ enter_irqstack %rax
call \sym
+ mov %rbp, %rsp
DISABLE_INTERRUPTS(CLBR_NONE)
.if \irqtrace
TRACE_IRQS_OFF
--
1.6.0.6
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-25 20:58 [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Avi Kivity
` (2 preceding siblings ...)
2008-12-25 20:58 ` [PATCH 3/3] x86: Switch critical exceptions and NMI to irqstack Avi Kivity
@ 2008-12-26 9:15 ` Ingo Molnar
[not found] ` <87vdt5vfxc.fsf@basil.nowhere.org>
4 siblings, 0 replies; 13+ messages in thread
From: Ingo Molnar @ 2008-12-26 9:15 UTC (permalink / raw)
To: Avi Kivity; +Cc: H. Peter Anvin, kvm, linux-kernel
* Avi Kivity <avi@redhat.com> wrote:
> The interrupt stack table (IST) mechanism is the only thing preventing
> kvm from deferring saving and reloading of some significant state. It
> is also somewhat complicated.
>
> Remove it by switching the special exceptions to use the normal irqstack.
>
> Avi Kivity (3):
> x86: drop the use of the tss interrupt stack table (IST)
> x86: Remove pda.irqcount
> x86: Switch critical exceptions and NMI to irqstack
>
> arch/x86/include/asm/desc.h | 12 -----
> arch/x86/include/asm/page_64.h | 7 ---
> arch/x86/include/asm/pda.h | 2 +-
> arch/x86/include/asm/processor.h | 11 ----
> arch/x86/kernel/asm-offsets_64.c | 1 -
> arch/x86/kernel/cpu/common.c | 35 --------------
> arch/x86/kernel/dumpstack_64.c | 96 --------------------------------------
> arch/x86/kernel/entry_64.S | 49 ++++++++-----------
> arch/x86/kernel/traps.c | 12 ++--
> 9 files changed, 27 insertions(+), 198 deletions(-)
looks good. Please base your work on the tip/master tree, we have a ton of
pending (and conflicting) changes in the lowlevel assembly area:
http://people.redhat.com/mingo/tip.git/README
Ingo
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
[not found] ` <20081228131605.GC496@one.firstfloor.org>
@ 2008-12-28 14:09 ` Avi Kivity
2008-12-28 19:08 ` Andi Kleen
0 siblings, 1 reply; 13+ messages in thread
From: Avi Kivity @ 2008-12-28 14:09 UTC (permalink / raw)
To: Andi Kleen; +Cc: x86, Ingo Molnar, linux-kernel, KVM list
(restoring cc list)
Andi Kleen wrote:
> One of the other problems: NMIs and MCEs have the same problem with SYSCALL
>
This one however looks unsolvable. Userspace can point %rsp into
arbitrary memory, issue a syscall, and hope for an nmi. Since we're in
cpl 0 and are not using IST, the processor will not switch stacks, and
the nmi stack frame will corrupt any memory the user chooses to point to.
Even without a malicious user, %rsp could legitimately point at unmapped
memory.
I don't see how syscall could work on i386, and indeed:
> vdso32.so-$(VDSO32-y) += int80
> vdso32.so-$(CONFIG_COMPAT) += syscall
> vdso32.so-$(VDSO32-y) += sysenter
It's disabled. Is that the reason?
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 19:08 ` Andi Kleen
@ 2008-12-28 19:07 ` Avi Kivity
2008-12-28 19:19 ` Avi Kivity
2008-12-28 19:30 ` Andi Kleen
0 siblings, 2 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-28 19:07 UTC (permalink / raw)
To: Andi Kleen, Ingo Molnar; +Cc: x86, linux-kernel, KVM list
Andi Kleen wrote:
> On Sun, Dec 28, 2008 at 04:09:26PM +0200, Avi Kivity wrote:
>
>> I don't see how syscall could work on i386, and indeed:
>>
>
> i386 has task gates which support unconditional stack switching. But there
> are no 64bit task gates, just ISTs.
>
>
i386 is not that interesting to me (and probably task switching couldn't
be made to work well with guest state in TR).
> BTW I think there are more similar problems in your patch too.
>
One fatal problem is enough -- I don't think that patch can be made to
work. Pity since it did clean up some stuff.
I would like however to speed up kvm. Here's a plan:
1. Add per-cpu IDT
2. When switching to the guest TR (and other state), switch off IST use
in the current IDT
3. When switching away from the kvm task, restore the IST entries
per-cpu IDT would cost around 4K per cpu. I propose to make it
kconfigurable, and have kvm select it.
Ingo, does this sound workable? It increases complexity rather than
decreasing it as the previous solution, but I don't see any way to drop
the use of IST as SYSCALL cannot work without IST if NMIs are enabled.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 14:09 ` Avi Kivity
@ 2008-12-28 19:08 ` Andi Kleen
2008-12-28 19:07 ` Avi Kivity
0 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2008-12-28 19:08 UTC (permalink / raw)
To: Avi Kivity; +Cc: Andi Kleen, x86, Ingo Molnar, linux-kernel, KVM list
On Sun, Dec 28, 2008 at 04:09:26PM +0200, Avi Kivity wrote:
> I don't see how syscall could work on i386, and indeed:
i386 has task gates which support unconditional stack switching. But there
are no 64bit task gates, just ISTs.
BTW I think there are more similar problems in your patch too.
>
> >vdso32.so-$(VDSO32-y) += int80
> >vdso32.so-$(CONFIG_COMPAT) += syscall
> >vdso32.so-$(VDSO32-y) += sysenter
>
> It's disabled. Is that the reason?
No. All interesting 32bit CPUs have SYSENTER; the only one who has SYSCALL
but no SYSENTER is the K6, but it has a weird early variant of SYSCALL with
more problems which was never worth supporting.
-Andi
--
ak@linux.intel.com
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 19:07 ` Avi Kivity
@ 2008-12-28 19:19 ` Avi Kivity
2008-12-28 20:08 ` Avi Kivity
2008-12-28 19:30 ` Andi Kleen
1 sibling, 1 reply; 13+ messages in thread
From: Avi Kivity @ 2008-12-28 19:19 UTC (permalink / raw)
To: Andi Kleen, Ingo Molnar; +Cc: x86, linux-kernel, KVM list
Avi Kivity wrote:
> 1. Add per-cpu IDT
Or we could have just two IDTs - one with IST and one without. I
clocked LIDT at 58 cycles (and we need two per heavyweight switch), so
it's not that wonderful.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 19:07 ` Avi Kivity
2008-12-28 19:19 ` Avi Kivity
@ 2008-12-28 19:30 ` Andi Kleen
1 sibling, 0 replies; 13+ messages in thread
From: Andi Kleen @ 2008-12-28 19:30 UTC (permalink / raw)
To: Avi Kivity; +Cc: Andi Kleen, Ingo Molnar, x86, linux-kernel, KVM list
> One fatal problem is enough -- I don't think that patch can be made to
> work. Pity since it did clean up some stuff.
Not sure that was true anyways.
> I would like however to speed up kvm. Here's a plan:
>
> 1. Add per-cpu IDT
You don't need that, do you? Just two sets.
> 2. When switching to the guest TR (and other state), switch off IST use
> in the current IDT
> 3. When switching away from the kvm task, restore the IST entries
>
> per-cpu IDT would cost around 4K per cpu. I propose to make it
> kconfigurable, and have kvm select it.
If anything please make it runtime switchable and disable it on Intel
CPUs.
-Andi
--
ak@linux.intel.com
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 19:19 ` Avi Kivity
@ 2008-12-28 20:08 ` Avi Kivity
2008-12-28 20:34 ` Andi Kleen
0 siblings, 1 reply; 13+ messages in thread
From: Avi Kivity @ 2008-12-28 20:08 UTC (permalink / raw)
To: Andi Kleen, Ingo Molnar; +Cc: x86, linux-kernel, KVM list
Avi Kivity wrote:
> Avi Kivity wrote:
>> 1. Add per-cpu IDT
>
> Or we could have just two IDTs - one with IST and one without. I
> clocked LIDT at 58 cycles (and we need two per heavyweight switch), so
> it's not that wonderful.
This makes the whole thing unworthwhile. The vmload/vmsave pair costs
only 200 cycles (I should have started with this), and 120 cycles on the
heavyweight path plus complexity are not worth 200 cycles on the
lightweight path.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 20:34 ` Andi Kleen
@ 2008-12-28 20:28 ` Avi Kivity
0 siblings, 0 replies; 13+ messages in thread
From: Avi Kivity @ 2008-12-28 20:28 UTC (permalink / raw)
To: Andi Kleen; +Cc: Ingo Molnar, x86, linux-kernel, KVM list
Andi Kleen wrote:
>> This makes the whole thing unworthwhile. The vmload/vmsave pair costs
>> only 200 cycles (I should have started with this), and 120 cycles on the
>> heavyweight path plus complexity are not worth 200 cycles on the
>> lightweight path.
>>
>
> Actually to switch ISTs you need to change the TSS, not the IDT.
> But I suppose that won't be any faster.
>
I can't touch the TSS (that's the starting point of the exercise). The
plan was to have a copy of the IDT with all IST pointers zeroed out (or
to have per-cpu IDT and zero out the IST pointers when entering guest
mode, restoring them on context switch).
It's not worth it though.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
2008-12-28 20:08 ` Avi Kivity
@ 2008-12-28 20:34 ` Andi Kleen
2008-12-28 20:28 ` Avi Kivity
0 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2008-12-28 20:34 UTC (permalink / raw)
To: Avi Kivity; +Cc: Andi Kleen, Ingo Molnar, x86, linux-kernel, KVM list
On Sun, Dec 28, 2008 at 10:08:35PM +0200, Avi Kivity wrote:
> Avi Kivity wrote:
> >Avi Kivity wrote:
> >>1. Add per-cpu IDT
> >
> >Or we could have just two IDTs - one with IST and one without. I
> >clocked LIDT at 58 cycles (and we need two per heavyweight switch), so
> >it's not that wonderful.
>
> This makes the whole thing unworthwhile. The vmload/vmsave pair costs
> only 200 cycles (I should have started with this), and 120 cycles on the
> heavyweight path plus complexity are not worth 200 cycles on the
> lightweight path.
Actually to switch ISTs you need to change the TSS, not the IDT.
But I suppose that won't be any faster.
-Andi
--
ak@linux.intel.com
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2008-12-28 20:28 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-25 20:58 [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Avi Kivity
2008-12-25 20:58 ` [PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST) Avi Kivity
2008-12-25 20:58 ` [PATCH 2/3] x86: Remove pda.irqcount Avi Kivity
2008-12-25 20:58 ` [PATCH 3/3] x86: Switch critical exceptions and NMI to irqstack Avi Kivity
2008-12-26 9:15 ` [PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel Ingo Molnar
[not found] ` <87vdt5vfxc.fsf@basil.nowhere.org>
[not found] ` <4956A0B1.1060908@redhat.com>
[not found] ` <20081227224029.GB496@one.firstfloor.org>
[not found] ` <49573FE7.9090802@redhat.com>
[not found] ` <20081228131605.GC496@one.firstfloor.org>
2008-12-28 14:09 ` Avi Kivity
2008-12-28 19:08 ` Andi Kleen
2008-12-28 19:07 ` Avi Kivity
2008-12-28 19:19 ` Avi Kivity
2008-12-28 20:08 ` Avi Kivity
2008-12-28 20:34 ` Andi Kleen
2008-12-28 20:28 ` Avi Kivity
2008-12-28 19:30 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox