diff -urN linux-2.6.8-pcsp/arch/i386/kernel/entry.S linux-2.6.8-stacks/arch/i386/kernel/entry.S --- linux-2.6.8-pcsp/arch/i386/kernel/entry.S 2004-06-10 13:28:35.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/kernel/entry.S 2004-09-22 15:39:50.000000000 +0400 @@ -70,9 +70,16 @@ CF_MASK = 0x00000001 TF_MASK = 0x00000100 IF_MASK = 0x00000200 -DF_MASK = 0x00000400 +DF_MASK = 0x00000400 +IOPL1_MASK = 0x00001000 +IOPL2_MASK = 0x00002000 +IOPL_MASK = 0x00003000 NT_MASK = 0x00004000 +RF_MASK = 0x00010000 VM_MASK = 0x00020000 +AC_MASK = 0x00040000 +VIF_MASK = 0x00080000 +VIP_MASK = 0x00100000 #ifdef CONFIG_PREEMPT #define preempt_stop cli @@ -81,6 +88,49 @@ #define resume_kernel restore_all #endif +/* + * First, reserve the 0x400 bytes on stack for the NMI + * handler - it uses the same stack. + * Second, push the additional iret frame to switch to + * the Ring-1 trampoline. The correct value of ESP + * higher word is pushed here and popped on Ring-1. + * espfix_trampoline is working on a "small" stack. + * We patch its base in GDT here to allow the Ring-1 code + * to access the right place on stack. This is preempt-unsafe + * and therefore must be done with the interrupts disabled. + * To allow Ring-1 code to re-enable interrupts, we make + * sure it will be executing with at least IOPL1. + */ +#define NMI_STACK_RESERVE 0x400 +#define MAKE_ESPFIX \ + cli; \ + subl $NMI_STACK_RESERVE, %esp; \ + .rept 5; \ + pushl NMI_STACK_RESERVE+0x10(%esp); \ + .endr; \ + pushl %eax; \ + movl %esp, %eax; \ + pushl %ebp; \ + GET_THREAD_INFO(%ebp); \ + movl TI_cpu(%ebp), %ebp; \ + shll $GDT_SIZE_SHIFT, %ebp; \ + addl $cpu_gdt_table, %ebp; \ + movw %ax, ((GDT_ENTRY_ESPFIX_SS << GDT_ENTRY_SHIFT) + 2)(%ebp); \ + shrl $16, %eax; \ + movb %al, ((GDT_ENTRY_ESPFIX_SS << GDT_ENTRY_SHIFT) + 4)(%ebp); \ + movb %ah, ((GDT_ENTRY_ESPFIX_SS << GDT_ENTRY_SHIFT) + 7)(%ebp); \ + popl %ebp; \ + popl %eax; \ + pushw 14(%esp); \ + pushw $4; \ + pushl $__ESPFIX_SS; \ + pushl $0; \ + pushl 20(%esp); \ + andl $~(IF_MASK | TF_MASK | RF_MASK | NT_MASK | AC_MASK), (%esp); \ + orl $IOPL1_MASK, (%esp); \ + pushl $__ESPFIX_CS; \ + pushl $espfix_trampoline; + #define SAVE_ALL \ cld; \ pushl %es; \ @@ -122,9 +172,24 @@ .previous +/* If returning to Ring-3, not to V86, and with + * the small stack, try to fix the higher word of + * ESP, as the CPU won't restore it from stack. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu happy. */ #define RESTORE_ALL \ - RESTORE_REGS \ - addl $4, %esp; \ + movl EFLAGS(%esp), %eax; \ + movb CS(%esp), %al; \ + andl $(VM_MASK | 2), %eax; \ + cmpl $2, %eax; \ + jne 3f; \ + larl OLDSS(%esp), %eax; \ + testl $0x00400000, %eax; \ +3: RESTORE_REGS \ + leal 4(%esp), %esp; \ + jnz 1f; \ + MAKE_ESPFIX \ 1: iret; \ .section .fixup,"ax"; \ 2: sti; \ @@ -174,6 +239,23 @@ jmp do_lcall +ENTRY(espfix_trampoline) + popl %esp +espfix_past_esp: +1: iret +.section .fixup,"ax" +2: sti + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + pushl $11 + call do_exit +.previous +.section __ex_table,"a" + .align 4 + .long 1b,2b +.previous + ENTRY(ret_from_fork) pushl %eax call schedule_tail @@ -196,7 +278,7 @@ GET_THREAD_INFO(%ebp) movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax + testl $(VM_MASK | 2), %eax jz resume_kernel # returning to kernel or vm86-space ENTRY(resume_userspace) cli # make sure we don't miss an interrupt @@ -504,7 +586,12 @@ ENTRY(nmi) cmpl $sysenter_entry,(%esp) je nmi_stack_fixup - pushl %eax + cmpl $espfix_past_esp,(%esp) + jne 1f + /* restart the "popl %esp" */ + decl (%esp) + subw $4, 12(%esp) +1: pushl %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. diff -urN linux-2.6.8-pcsp/arch/i386/kernel/head.S linux-2.6.8-stacks/arch/i386/kernel/head.S --- linux-2.6.8-pcsp/arch/i386/kernel/head.S 2004-08-10 11:02:36.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/kernel/head.S 2004-09-21 14:58:16.000000000 +0400 @@ -514,8 +514,8 @@ .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0xc8 APM DS data */ - .quad 0x0000000000000000 /* 0xd0 - unused */ - .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x00cfba000000ffff /* 0xd0 - ESPfix CS */ + .quad 0x0000b2000000ffff /* 0xd8 - ESPfix SS */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ diff -urN linux-2.6.8-pcsp/arch/i386/kernel/process.c linux-2.6.8-stacks/arch/i386/kernel/process.c --- linux-2.6.8-pcsp/arch/i386/kernel/process.c 2004-06-21 09:19:07.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/kernel/process.c 2004-09-21 10:45:55.000000000 +0400 @@ -225,7 +225,7 @@ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (regs->xcs & 3) + if (regs->xcs & 2) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s)\n",regs->eflags, print_tainted(),UTS_RELEASE); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", diff -urN linux-2.6.8-pcsp/arch/i386/kernel/signal.c linux-2.6.8-stacks/arch/i386/kernel/signal.c --- linux-2.6.8-pcsp/arch/i386/kernel/signal.c 2004-08-10 11:02:36.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/kernel/signal.c 2004-09-21 10:48:38.000000000 +0400 @@ -558,7 +558,7 @@ * kernel mode. Just return without doing anything * if so. */ - if ((regs->xcs & 3) != 3) + if (!(regs->xcs & 2)) return 1; if (current->flags & PF_FREEZE) { diff -urN linux-2.6.8-pcsp/arch/i386/kernel/traps.c linux-2.6.8-stacks/arch/i386/kernel/traps.c --- linux-2.6.8-pcsp/arch/i386/kernel/traps.c 2004-08-10 11:02:36.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/kernel/traps.c 2004-09-21 10:47:37.000000000 +0400 @@ -210,7 +210,7 @@ esp = (unsigned long) (®s->esp); ss = __KERNEL_DS; - if (regs->xcs & 3) { + if (regs->xcs & 2) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -264,7 +264,7 @@ char c; unsigned long eip; - if (regs->xcs & 3) + if (regs->xcs & 2) goto no_bug; /* Not in kernel */ eip = regs->eip; @@ -335,7 +335,7 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs)) die(str, regs, err); } @@ -357,7 +357,7 @@ goto trap_signal; } - if (!(regs->xcs & 3)) + if (!(regs->xcs & 2)) goto kernel_trap; trap_signal: { @@ -437,7 +437,7 @@ if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!(regs->xcs & 2)) goto gp_in_kernel; current->thread.error_code = error_code; @@ -614,7 +614,7 @@ * allowing programs to debug themselves without the ptrace() * interface. */ - if ((regs->xcs & 3) == 0) + if ((regs->xcs & 2) == 0) goto clear_TF_reenable; if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; @@ -630,7 +630,7 @@ /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. */ - info.si_addr = ((regs->xcs & 3) == 0) ? (void *)tsk->thread.eip : + info.si_addr = ((regs->xcs & 2) == 0) ? (void *)tsk->thread.eip : (void *)regs->eip; force_sig_info(SIGTRAP, &info, tsk); diff -urN linux-2.6.8-pcsp/arch/i386/mm/extable.c linux-2.6.8-stacks/arch/i386/mm/extable.c --- linux-2.6.8-pcsp/arch/i386/mm/extable.c 2004-06-09 15:44:19.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/mm/extable.c 2004-09-21 17:17:05.000000000 +0400 @@ -26,6 +26,11 @@ } #endif + if (unlikely(regs->xcs == __ESPFIX_CS)) + { + regs->xcs = __KERNEL_CS; + } + fixup = search_exception_tables(regs->eip); if (fixup) { regs->eip = fixup->fixup; diff -urN linux-2.6.8-pcsp/arch/i386/mm/fault.c linux-2.6.8-stacks/arch/i386/mm/fault.c --- linux-2.6.8-pcsp/arch/i386/mm/fault.c 2004-08-10 11:02:37.000000000 +0400 +++ linux-2.6.8-stacks/arch/i386/mm/fault.c 2004-09-21 16:03:28.000000000 +0400 @@ -77,7 +77,7 @@ u32 seg_ar, seg_limit, base, *desc; /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; /* Unlikely, but must come before segment checks. */ if (unlikely((regs->eflags & VM_MASK) != 0)) diff -urN linux-2.6.8-pcsp/include/asm-i386/segment.h linux-2.6.8-stacks/include/asm-i386/segment.h --- linux-2.6.8-pcsp/include/asm-i386/segment.h 2004-01-09 09:59:19.000000000 +0300 +++ linux-2.6.8-stacks/include/asm-i386/segment.h 2004-09-21 14:57:44.000000000 +0400 @@ -38,8 +38,8 @@ * 24 - APM BIOS support * 25 - APM BIOS support * - * 26 - unused - * 27 - unused + * 26 - ESPfix Ring-1 trampoline CS + * 27 - ESPfix Ring-1 small SS * 28 - unused * 29 - unused * 30 - unused @@ -71,14 +71,21 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#define GDT_ENTRY_ESPFIX_CS (GDT_ENTRY_KERNEL_BASE + 14) +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 15) +#define __ESPFIX_CS (GDT_ENTRY_ESPFIX_CS * 8 + 1) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8 + 1) + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* * The GDT has 32 entries */ -#define GDT_ENTRIES 32 - -#define GDT_SIZE (GDT_ENTRIES * 8) +#define GDT_ENTRIES_SHIFT 5 +#define GDT_ENTRIES (1 << GDT_ENTRIES_SHIFT) +#define GDT_ENTRY_SHIFT 3 +#define GDT_SIZE_SHIFT (GDT_ENTRIES_SHIFT + GDT_ENTRY_SHIFT) +#define GDT_SIZE (1 << GDT_SIZE_SHIFT) /* Simple and small GDT entries for booting only */