[PATCH REPOST^3] Run IST traps from user mode preemptive on process stack

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
@ 2008-05-02  9:19 Andi Kleen
  2008-05-06 12:31 ` Thomas Gleixner
  0 siblings, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2008-05-02  9:19 UTC (permalink / raw)
  To: tglx, mingo, linux-kernel, jkosina, zdenek.kabelac

[Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
so far, but still not applied. Please apply.]

Run IST traps from user mode preemptive on process stack

x86-64 has a few exceptions which run on special architecture 
supported IST exception stacks: these are nmi, double fault, stack fault,
int 3, debug.

Previously they would check for a scheduling event on returning
if the original CPU state was user mode and then switch to a 
process stack to schedule.

But the actual trap handler would still run on the IST stack
with preemption disabled.

This patch changes these traps instead to always switch 
to the process stack when the trap originated from user mode.
For kernel traps it keeps running non preemptive on the IST stack
because that is much safer (e.g. to still get nmi watchdog events
out even when the process stack is corrupted) 

Then the actual trap handlers can run with preemption enabled
or schedule as needed (e.g. to take locks) 

This fixes a regression I added earlier with print_vma_addr()
executing down() in these trap handlers from user space.

Strictly the change would have been only needed for debug 
and int3, but since they share this code as macros it was 
cleanest to just change all.

Cc: jkosina@suse.cz
Cc: zdenek.kabelac@gmail.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <andi@firstfloor.org>

---
 arch/x86/kernel/entry_64.S |   17 ++++++++++-------
 arch/x86/kernel/traps_64.c |   17 ++++++++++-------
 2 files changed, 20 insertions(+), 14 deletions(-)

Index: linux/arch/x86/kernel/entry_64.S
===================================================================
--- linux.orig/arch/x86/kernel/entry_64.S
+++ linux/arch/x86/kernel/entry_64.S
@@ -771,12 +771,18 @@ END(spurious_interrupt)
 	.if \ist
 	movq	%gs:pda_data_offset, %rbp
 	.endif
-	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi
 	movq $-1,ORIG_RAX(%rsp)
 	.if \ist
 	subq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
 	.endif
+	testl $3,CS(%rsp)
+	jz 2f					/* in kernel? stay on exception stack for now */
+	movq %rsp,%rdi
+	call sync_regs				/* Move all state over to process stack */
+	movq %rax,%rsp				/* switch stack to process stack */
+2:
+	movq %rsp,%rdi
 	call \sym
 	.if \ist
 	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
@@ -802,16 +808,16 @@ END(spurious_interrupt)
 	.macro paranoidexit trace=1
 	/* ebx:	no swapgs flag */
 paranoid_exit\trace:
-	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_restore\trace
 	testl $3,CS(%rsp)
 	jnz   paranoid_userspace\trace
 paranoid_swapgs\trace:
+	testl %ebx,%ebx 			/* swapgs needed? */
+	jnz 1f
 	.if \trace
 	TRACE_IRQS_IRETQ 0
 	.endif
 	SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
+1:
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_userspace\trace:
@@ -819,9 +825,6 @@ paranoid_userspace\trace:
 	movl threadinfo_flags(%rcx),%ebx
 	andl $_TIF_WORK_MASK,%ebx
 	jz paranoid_swapgs\trace
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
 	testl $_TIF_NEED_RESCHED,%ebx
 	jnz paranoid_schedule\trace
 	movl %ebx,%edx			/* arg3: thread flags */
Index: linux/arch/x86/kernel/traps_64.c
===================================================================
--- linux.orig/arch/x86/kernel/traps_64.c
+++ linux/arch/x86/kernel/traps_64.c
@@ -84,7 +84,8 @@ static inline void conditional_sti(struc
 
 static inline void preempt_conditional_sti(struct pt_regs *regs)
 {
-	inc_preempt_count();
+	if (!user_mode(regs))
+		inc_preempt_count();
 	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();
 }
@@ -95,7 +96,8 @@ static inline void preempt_conditional_c
 		local_irq_disable();
 	/* Make sure to not schedule here because we could be running
 	   on an exception stack. */
-	dec_preempt_count();
+	if (!user_mode(regs))
+		dec_preempt_count();
 }
 
 int kstack_depth_to_print = 12;
@@ -855,7 +857,7 @@ asmlinkage __kprobes void default_do_nmi
 		io_check_error(reason, regs);
 }
 
-/* runs on IST stack. */
+/* May run on IST stack. */
 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
 {
 	trace_hardirqs_fixup();
@@ -868,9 +870,10 @@ asmlinkage void __kprobes do_int3(struct
 	preempt_conditional_cli(regs);
 }
 
-/* Help handler running on IST stack to switch back to user stack
-   for scheduling or signal handling. The actual stack switch is done in
-   entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack.
+ * The actual stack switch is done in entry.S
+ */
 asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
 	struct pt_regs *regs = eregs;
@@ -889,7 +892,7 @@ asmlinkage __kprobes struct pt_regs *syn
 	return regs;
 }
 
-/* runs on IST stack. */
+/* May run on IST stack. */
 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 				   unsigned long error_code)
 {

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
  2008-05-02  9:19 [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack Andi Kleen
@ 2008-05-06 12:31 ` Thomas Gleixner
  2008-05-06 13:03   ` Andi Kleen
  0 siblings, 1 reply; 6+ messages in thread
From: Thomas Gleixner @ 2008-05-06 12:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: mingo, linux-kernel, jkosina, zdenek.kabelac

On Fri, 2 May 2008, Andi Kleen wrote:
> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
> so far, but still not applied. Please apply.]

The print_vma_addr() bug was introduced by commit 03252919 (by you) in
the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.

Do you know about any other breakage in that code? If not, why do you
claim that we ignored a bug fix?

> Run IST traps from user mode preemptive on process stack
> 
> x86-64 has a few exceptions which run on special architecture 
> supported IST exception stacks: these are nmi, double fault, stack fault,
> int 3, debug.

+ MCE

> Previously they would check for a scheduling event on returning
> if the original CPU state was user mode and then switch to a 
> process stack to schedule.
>
> But the actual trap handler would still run on the IST stack
> with preemption disabled.
> 
> This patch changes these traps instead to always switch 
> to the process stack when the trap originated from user mode.
> For kernel traps it keeps running non preemptive on the IST stack
> because that is much safer (e.g. to still get nmi watchdog events
> out even when the process stack is corrupted) 
> 
> Then the actual trap handlers can run with preemption enabled
> or schedule as needed (e.g. to take locks) 
> 
> This fixes a regression I added earlier with print_vma_addr()
> executing down() in these trap handlers from user space.

Aside of the fact, that it has been fixed long time ago in a simple
and non dangerous way already, your "fix" adds a far more serious
regression:

> --- linux.orig/arch/x86/kernel/entry_64.S
> +++ linux/arch/x86/kernel/entry_64.S
> @@ -771,12 +771,18 @@ END(spurious_interrupt)
>  	.if \ist
>  	movq	%gs:pda_data_offset, %rbp
>  	.endif
> -	movq %rsp,%rdi
>  	movq ORIG_RAX(%rsp),%rsi
>  	movq $-1,ORIG_RAX(%rsp)
>  	.if \ist
>  	subq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
>  	.endif
> +	testl $3,CS(%rsp)
> +	jz 2f					/* in kernel? stay on exception stack for now */
> +	movq %rsp,%rdi
> +	call sync_regs				/* Move all state over to process stack */
> +	movq %rax,%rsp				/* switch stack to process stack */
> +2:
> +	movq %rsp,%rdi
>  	call \sym
>  	.if \ist
>  	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)

This introduces two security bugs in one go:

1.) The IST stack pointer is elevated unconditionaly and with your
change we can now schedule away in the handler. Then another task on
this same CPU triggers the same trap and elevates it again. This can
nest multiple times and there is no protection against an IST stack
overflow at all!

This is probably a nice fat user-triggerable root hole in fact!
Exploitable because user-space has control over the pt_regs data,
so by nesting enough such overflows a critical kernel data structure
can probably be written with near-arbitrary content.

At minimum you've introuduced a nasty DoS hole that would almost never
trigger during normal loads - it would probably result in extremely hard
to debug memory corruption in data structures that are near the IST stacks.

2.) The IST stack pointer is unbalanced in the other direction as well:
it is incremented on CPUn then the handler is scheduled away and migrated
to CPUm. After return from the handler the IST stacks of both CPUs are
corrupted. Exploitable by unprivileged user-space code as well.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
  2008-05-06 12:31 ` Thomas Gleixner
@ 2008-05-06 13:03   ` Andi Kleen
  2008-05-06 14:34     ` Ingo Molnar
                       ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Andi Kleen @ 2008-05-06 13:03 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: mingo, linux-kernel, jkosina, zdenek.kabelac

Thomas Gleixner <tglx@linutronix.de> writes:

> On Fri, 2 May 2008, Andi Kleen wrote:
>> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
>> so far, but still not applied. Please apply.]
>
> The print_vma_addr() bug was introduced by commit 03252919 (by you) in
> the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.

Well it was worked around, not properly fixed. This patch fixes it properly.
The problem of the original workaround is that it wouldn't print the vma
now in many cases because it couldn't take the semaphore.

The workaround was right back then because it was shortly before 
the release, but it was always a ward that needed fixing properly.

I believe it was a good idea anyways because there were always 
some possible problems with not being able to sleep in these 
exception handlers.

>
> This introduces two security bugs in one go:
>
> 1.) The IST stack pointer is elevated unconditionaly and with your
> change we can now schedule away in the handler. 

Yes, but that's fine.

> this same CPU triggers the same trap and elevates it again.

That's not possible generally. None of these interrupts can
nest in a normal kernel.

Do you refer to the DEBUG_STACK ist add/dec? That is not needed
for anything in tree to my knowledge. 

> 2.) The IST stack pointer is unbalanced in the other direction as well:
> it is incremented on CPUn then the handler is scheduled away and migrated
> to CPUm. After return from the handler the IST stacks of both CPUs are
> corrupted. Exploitable by unprivileged user-space code as well.

The IST is restored again after the handler. You're right that strictly
wouldn't be needed and could be avoided, but i don't think it's exploitable
in any ways.

Regarding user controlled pt_regs: I think you're forgetting that
x86-64 doesn't have vm86 mode and that we can always trust pt_regs
segment indexes. On i386 you would be right, but not here.

-Andi

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
  2008-05-06 13:03   ` Andi Kleen
@ 2008-05-06 14:34     ` Ingo Molnar
  2008-05-06 14:39     ` Ingo Molnar
  2008-05-06 14:41     ` Ingo Molnar
  2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:34 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac,
	Arjan van de Ven

* Andi Kleen <andi@firstfloor.org> wrote:

> > This introduces two security bugs in one go:
> >
> > 1.) The IST stack pointer is elevated unconditionaly and with your 
> > change we can now schedule away in the handler.
> 
> Yes, but that's fine.

no, your patch is not fine at all. It introduces a security hole, as 
Thomas pointed it out to you.

The IST pointer in the _TSS_ can go out of bounds by your patch! That 
corruption can be controlled by malicious userspace. Read the analysis 
from Thomas.

That percpu_tss.ist[] location is not just a random pointer. It is the 
TSS that is read by the CPU on every trap. It is a security-critical 
structure and every modification to it has to be treated very, very 
carefully. If it goes out of bounds that leads to very nasty problems.

This is a pretty bad security bug, and your reply shows ignorance about 
the code your patch is modifying - _after_ Thomas has pointed it out to 
you.

> > this same CPU triggers the same trap and elevates it again.
> 
> That's not possible generally. None of these interrupts can nest in a 
> normal kernel.

read the analysis from Thomas. Here is a sample buggy sequence:

  task #1 runs, does int3, goes on DEBUG_STACK IST stack,
      we do -= 4096 of the tss.ist[DEBUG_STACK-1] pointer, handler schedules
  task #2 runs, does int3, goes on DEBUG_STACK IST stack, handler schedules
      we do -= 4096 of the tss.ist[DEBUG_STACK-1] pointer, handler schedules
  task #3 runs, does int3, goes on DEBUG_STACK IST

  => poof, stack underflow, memory corruption of nearby data structures,
     because the DEBUG_STACK is only 8KB...

this can be triggered in almost arbitrary depth, from user-space.

> Do you refer to the DEBUG_STACK ist add/dec? That is not needed for 
> anything in tree to my knowledge.

Wrong. The pointer the subq/addq instructions modify are in fact used by 
_EVERY SINGLE_ IST trap sequence that the 64-bit kernel executes (!).

This is the modification that the paranoidentry macro does to the TSS in 
entry_64.S:

  subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
  ...
  addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)

that modifies the TSS _directly_. The TSS is directly read by the CPU on 
every trap entry. The percpu_tss.ist[] array of pointers is then used by 
the CPU for every single IST trap. (hw-debug, NMI, int3, double-fault, 
stacksegment-overflow, MCE)

This is nothing new or unexpected, it is basic x86-64 IST architectural 
behavior implemented in the CPU.

The bug you introduce is that if the handler schedules away (it blocks 
or gets preempted on CONFIG_PREEMPT), it would keep the per-CPU IST 
offset for that IST type decreased by 4096 => if done enough times the 
pointer goes out of bounds and it's kaboom.

> > 2.) The IST stack pointer is unbalanced in the other direction as 
> > well: it is incremented on CPUn then the handler is scheduled away 
> > and migrated to CPUm. After return from the handler the IST stacks 
> > of both CPUs are corrupted. Exploitable by unprivileged user-space 
> > code as well.
> 
> The IST is restored again after the handler. [...]

yes but it is restored on the wrong CPU, after the task has scheduled 
and migrated - moving the IST pointer in the TSS out of bounds, so the 
next IST trap (which user-space can trigger arbitrarily - just generate 
a stream of INT3 breakpoints) will corrupt memory!

> [...] You're right that strictly wouldn't be needed and could be 
> avoided, but i don't think it's exploitable in any ways.
> 
> Regarding user controlled pt_regs: I think you're forgetting that 
> x86-64 doesn't have vm86 mode and that we can always trust pt_regs 
> segment indexes. On i386 you would be right, but not here.

Thomas is not forgetting anything.

Thomas is doing security analysis about how the hole in your patch can 
be exploited: the wrong IST offsets are used to corrupt nearby data 
structures - a malicious nonprivileged user task can modify a good 
portion of the ~64 bytes of pt_regs at the end of every page near the 
IST stack address (and randomly corrupt some bytes below that) - just by 
virtue of generating an int3 (or hw-debug) trap with prepared register 
contents.

You first need to understand the hole you are introducing to understand 
the finer aspects of his security analysis though...

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
  2008-05-06 13:03   ` Andi Kleen
  2008-05-06 14:34     ` Ingo Molnar
@ 2008-05-06 14:39     ` Ingo Molnar
  2008-05-06 14:41     ` Ingo Molnar
  2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:39 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac

* Andi Kleen <andi@firstfloor.org> wrote:

[...]
> Well it was worked around, not properly fixed. This patch fixes it 
> properly. The problem of the original workaround is that it wouldn't 
> print the vma now in many cases because it couldn't take the 
> semaphore.

huh? While this issue is dwarfed by the security hole your patch 
introduces, you miss the whole point about debug printouts in case of 
traps.

In practice we dont need to print out _anything_ from int3 traps (even 
if they were unexpected) - user-space very much knows it has set a 
breakpoint.

What we are interested in are the segmentation faults for example. Those 
do get printed out correctly as segmentation faults do not go via IST 
traps, they go via the normal process stack.

Furthermore, we _do_ print out the fault location even for int3 if we 
are not preemptible. An example i just triggered on latest -git:

  int3[2789] trap int3 ip:4004cd sp:7fff27501c50 error:0

And we do print out the vma information too in other, much more 
interesting trap types such as unresolved page faults:

  segfault[2652]: segfault at 0 ip 400471 sp 7fff05d42480 error 6 in segfault[400000+1000]

So what we do worst-case is that we do not do a find_vma() and we dont 
print out the vma. Not a big deal at all for an int3 or a hw-breakpoint 
trap ...

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
  2008-05-06 13:03   ` Andi Kleen
  2008-05-06 14:34     ` Ingo Molnar
  2008-05-06 14:39     ` Ingo Molnar
@ 2008-05-06 14:41     ` Ingo Molnar
  2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac

* Andi Kleen <andi@firstfloor.org> wrote:

> Thomas Gleixner <tglx@linutronix.de> writes:
> 
> > On Fri, 2 May 2008, Andi Kleen wrote:
> >> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
> >> so far, but still not applied. Please apply.]
> >
> > The print_vma_addr() bug was introduced by commit 03252919 (by you) in
> > the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.

hm, why did you cut out and ignore this question from Thomas:

> > Do you know about any other breakage in that code? If not, why do 
> > you claim that we ignored a bug fix?

First you are putting pressure on us by innuendo that we somehow did not 
react to an unfixed x86 bug, supposedly starting in v2.6.23 and still 
not fixed in v2.6.26 as we speak. This is a pretty serious accusation!

But, if one looks closer, your accusation does not hold up to scrutiny: 
it turns out that it is not in v2.6.23 but v2.6.25, that it got fixed 
within 2 weeks between v2.6.25-rc1 and v2.6.25-rc2, that it is not an 
unfixed bug that we ignored but at most about a small detail in a debug 
feature introduced by _you_.

Why are you doing this with us? First you are accusing us and then you 
are ignoring straight questions that would clear us from your unfair 
accusation. Why?

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-05-06 14:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-02  9:19 [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack Andi Kleen
2008-05-06 12:31 ` Thomas Gleixner
2008-05-06 13:03   ` Andi Kleen
2008-05-06 14:34     ` Ingo Molnar
2008-05-06 14:39     ` Ingo Molnar
2008-05-06 14:41     ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox