* [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
@ 2008-05-02 9:19 Andi Kleen
2008-05-06 12:31 ` Thomas Gleixner
0 siblings, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2008-05-02 9:19 UTC (permalink / raw)
To: tglx, mingo, linux-kernel, jkosina, zdenek.kabelac
[Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
so far, but still not applied. Please apply.]
Run IST traps from user mode preemptive on process stack
x86-64 has a few exceptions which run on special architecture
supported IST exception stacks: these are nmi, double fault, stack fault,
int 3, debug.
Previously they would check for a scheduling event on returning
if the original CPU state was user mode and then switch to a
process stack to schedule.
But the actual trap handler would still run on the IST stack
with preemption disabled.
This patch changes these traps instead to always switch
to the process stack when the trap originated from user mode.
For kernel traps it keeps running non preemptive on the IST stack
because that is much safer (e.g. to still get nmi watchdog events
out even when the process stack is corrupted)
Then the actual trap handlers can run with preemption enabled
or schedule as needed (e.g. to take locks)
This fixes a regression I added earlier with print_vma_addr()
executing down() in these trap handlers from user space.
Strictly the change would have been only needed for debug
and int3, but since they share this code as macros it was
cleanest to just change all.
Cc: jkosina@suse.cz
Cc: zdenek.kabelac@gmail.com
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <andi@firstfloor.org>
---
arch/x86/kernel/entry_64.S | 17 ++++++++++-------
arch/x86/kernel/traps_64.c | 17 ++++++++++-------
2 files changed, 20 insertions(+), 14 deletions(-)
Index: linux/arch/x86/kernel/entry_64.S
===================================================================
--- linux.orig/arch/x86/kernel/entry_64.S
+++ linux/arch/x86/kernel/entry_64.S
@@ -771,12 +771,18 @@ END(spurious_interrupt)
.if \ist
movq %gs:pda_data_offset, %rbp
.endif
- movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
movq $-1,ORIG_RAX(%rsp)
.if \ist
subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
.endif
+ testl $3,CS(%rsp)
+ jz 2f /* in kernel? stay on exception stack for now */
+ movq %rsp,%rdi
+ call sync_regs /* Move all state over to process stack */
+ movq %rax,%rsp /* switch stack to process stack */
+2:
+ movq %rsp,%rdi
call \sym
.if \ist
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
@@ -802,16 +808,16 @@ END(spurious_interrupt)
.macro paranoidexit trace=1
/* ebx: no swapgs flag */
paranoid_exit\trace:
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore\trace
testl $3,CS(%rsp)
jnz paranoid_userspace\trace
paranoid_swapgs\trace:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz 1f
.if \trace
TRACE_IRQS_IRETQ 0
.endif
SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
+1:
RESTORE_ALL 8
jmp irq_return
paranoid_userspace\trace:
@@ -819,9 +825,6 @@ paranoid_userspace\trace:
movl threadinfo_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs\trace
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
jnz paranoid_schedule\trace
movl %ebx,%edx /* arg3: thread flags */
Index: linux/arch/x86/kernel/traps_64.c
===================================================================
--- linux.orig/arch/x86/kernel/traps_64.c
+++ linux/arch/x86/kernel/traps_64.c
@@ -84,7 +84,8 @@ static inline void conditional_sti(struc
static inline void preempt_conditional_sti(struct pt_regs *regs)
{
- inc_preempt_count();
+ if (!user_mode(regs))
+ inc_preempt_count();
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
@@ -95,7 +96,8 @@ static inline void preempt_conditional_c
local_irq_disable();
/* Make sure to not schedule here because we could be running
on an exception stack. */
- dec_preempt_count();
+ if (!user_mode(regs))
+ dec_preempt_count();
}
int kstack_depth_to_print = 12;
@@ -855,7 +857,7 @@ asmlinkage __kprobes void default_do_nmi
io_check_error(reason, regs);
}
-/* runs on IST stack. */
+/* May run on IST stack. */
asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
{
trace_hardirqs_fixup();
@@ -868,9 +870,10 @@ asmlinkage void __kprobes do_int3(struct
preempt_conditional_cli(regs);
}
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack.
+ * The actual stack switch is done in entry.S
+ */
asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
@@ -889,7 +892,7 @@ asmlinkage __kprobes struct pt_regs *syn
return regs;
}
-/* runs on IST stack. */
+/* May run on IST stack. */
asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
{
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
2008-05-02 9:19 [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack Andi Kleen
@ 2008-05-06 12:31 ` Thomas Gleixner
2008-05-06 13:03 ` Andi Kleen
0 siblings, 1 reply; 6+ messages in thread
From: Thomas Gleixner @ 2008-05-06 12:31 UTC (permalink / raw)
To: Andi Kleen; +Cc: mingo, linux-kernel, jkosina, zdenek.kabelac
On Fri, 2 May 2008, Andi Kleen wrote:
> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
> so far, but still not applied. Please apply.]
The print_vma_addr() bug was introduced by commit 03252919 (by you) in
the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.
Do you know about any other breakage in that code? If not, why do you
claim that we ignored a bug fix?
> Run IST traps from user mode preemptive on process stack
>
> x86-64 has a few exceptions which run on special architecture
> supported IST exception stacks: these are nmi, double fault, stack fault,
> int 3, debug.
+ MCE
> Previously they would check for a scheduling event on returning
> if the original CPU state was user mode and then switch to a
> process stack to schedule.
>
> But the actual trap handler would still run on the IST stack
> with preemption disabled.
>
> This patch changes these traps instead to always switch
> to the process stack when the trap originated from user mode.
> For kernel traps it keeps running non preemptive on the IST stack
> because that is much safer (e.g. to still get nmi watchdog events
> out even when the process stack is corrupted)
>
> Then the actual trap handlers can run with preemption enabled
> or schedule as needed (e.g. to take locks)
>
> This fixes a regression I added earlier with print_vma_addr()
> executing down() in these trap handlers from user space.
Aside of the fact, that it has been fixed long time ago in a simple
and non dangerous way already, your "fix" adds a far more serious
regression:
> --- linux.orig/arch/x86/kernel/entry_64.S
> +++ linux/arch/x86/kernel/entry_64.S
> @@ -771,12 +771,18 @@ END(spurious_interrupt)
> .if \ist
> movq %gs:pda_data_offset, %rbp
> .endif
> - movq %rsp,%rdi
> movq ORIG_RAX(%rsp),%rsi
> movq $-1,ORIG_RAX(%rsp)
> .if \ist
> subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
> .endif
> + testl $3,CS(%rsp)
> + jz 2f /* in kernel? stay on exception stack for now */
> + movq %rsp,%rdi
> + call sync_regs /* Move all state over to process stack */
> + movq %rax,%rsp /* switch stack to process stack */
> +2:
> + movq %rsp,%rdi
> call \sym
> .if \ist
> addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
This introduces two security bugs in one go:
1.) The IST stack pointer is elevated unconditionaly and with your
change we can now schedule away in the handler. Then another task on
this same CPU triggers the same trap and elevates it again. This can
nest multiple times and there is no protection against an IST stack
overflow at all!
This is probably a nice fat user-triggerable root hole in fact!
Exploitable because user-space has control over the pt_regs data,
so by nesting enough such overflows a critical kernel data structure
can probably be written with near-arbitrary content.
At minimum you've introuduced a nasty DoS hole that would almost never
trigger during normal loads - it would probably result in extremely hard
to debug memory corruption in data structures that are near the IST stacks.
2.) The IST stack pointer is unbalanced in the other direction as well:
it is incremented on CPUn then the handler is scheduled away and migrated
to CPUm. After return from the handler the IST stacks of both CPUs are
corrupted. Exploitable by unprivileged user-space code as well.
Thanks,
tglx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
2008-05-06 12:31 ` Thomas Gleixner
@ 2008-05-06 13:03 ` Andi Kleen
2008-05-06 14:34 ` Ingo Molnar
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Andi Kleen @ 2008-05-06 13:03 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: mingo, linux-kernel, jkosina, zdenek.kabelac
Thomas Gleixner <tglx@linutronix.de> writes:
> On Fri, 2 May 2008, Andi Kleen wrote:
>> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
>> so far, but still not applied. Please apply.]
>
> The print_vma_addr() bug was introduced by commit 03252919 (by you) in
> the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.
Well it was worked around, not properly fixed. This patch fixes it properly.
The problem of the original workaround is that it wouldn't print the vma
now in many cases because it couldn't take the semaphore.
The workaround was right back then because it was shortly before
the release, but it was always a ward that needed fixing properly.
I believe it was a good idea anyways because there were always
some possible problems with not being able to sleep in these
exception handlers.
>
> This introduces two security bugs in one go:
>
> 1.) The IST stack pointer is elevated unconditionaly and with your
> change we can now schedule away in the handler.
Yes, but that's fine.
> this same CPU triggers the same trap and elevates it again.
That's not possible generally. None of these interrupts can
nest in a normal kernel.
Do you refer to the DEBUG_STACK ist add/dec? That is not needed
for anything in tree to my knowledge.
> 2.) The IST stack pointer is unbalanced in the other direction as well:
> it is incremented on CPUn then the handler is scheduled away and migrated
> to CPUm. After return from the handler the IST stacks of both CPUs are
> corrupted. Exploitable by unprivileged user-space code as well.
The IST is restored again after the handler. You're right that strictly
wouldn't be needed and could be avoided, but i don't think it's exploitable
in any ways.
Regarding user controlled pt_regs: I think you're forgetting that
x86-64 doesn't have vm86 mode and that we can always trust pt_regs
segment indexes. On i386 you would be right, but not here.
-Andi
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
2008-05-06 13:03 ` Andi Kleen
@ 2008-05-06 14:34 ` Ingo Molnar
2008-05-06 14:39 ` Ingo Molnar
2008-05-06 14:41 ` Ingo Molnar
2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:34 UTC (permalink / raw)
To: Andi Kleen
Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac,
Arjan van de Ven
* Andi Kleen <andi@firstfloor.org> wrote:
> > This introduces two security bugs in one go:
> >
> > 1.) The IST stack pointer is elevated unconditionaly and with your
> > change we can now schedule away in the handler.
>
> Yes, but that's fine.
no, your patch is not fine at all. It introduces a security hole, as
Thomas pointed it out to you.
The IST pointer in the _TSS_ can go out of bounds by your patch! That
corruption can be controlled by malicious userspace. Read the analysis
from Thomas.
That percpu_tss.ist[] location is not just a random pointer. It is the
TSS that is read by the CPU on every trap. It is a security-critical
structure and every modification to it has to be treated very, very
carefully. If it goes out of bounds that leads to very nasty problems.
This is a pretty bad security bug, and your reply shows ignorance about
the code your patch is modifying - _after_ Thomas has pointed it out to
you.
> > this same CPU triggers the same trap and elevates it again.
>
> That's not possible generally. None of these interrupts can nest in a
> normal kernel.
read the analysis from Thomas. Here is a sample buggy sequence:
task #1 runs, does int3, goes on DEBUG_STACK IST stack,
we do -= 4096 of the tss.ist[DEBUG_STACK-1] pointer, handler schedules
task #2 runs, does int3, goes on DEBUG_STACK IST stack, handler schedules
we do -= 4096 of the tss.ist[DEBUG_STACK-1] pointer, handler schedules
task #3 runs, does int3, goes on DEBUG_STACK IST
=> poof, stack underflow, memory corruption of nearby data structures,
because the DEBUG_STACK is only 8KB...
this can be triggered in almost arbitrary depth, from user-space.
> Do you refer to the DEBUG_STACK ist add/dec? That is not needed for
> anything in tree to my knowledge.
Wrong. The pointer the subq/addq instructions modify are in fact used by
_EVERY SINGLE_ IST trap sequence that the 64-bit kernel executes (!).
This is the modification that the paranoidentry macro does to the TSS in
entry_64.S:
subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
...
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
that modifies the TSS _directly_. The TSS is directly read by the CPU on
every trap entry. The percpu_tss.ist[] array of pointers is then used by
the CPU for every single IST trap. (hw-debug, NMI, int3, double-fault,
stacksegment-overflow, MCE)
This is nothing new or unexpected, it is basic x86-64 IST architectural
behavior implemented in the CPU.
The bug you introduce is that if the handler schedules away (it blocks
or gets preempted on CONFIG_PREEMPT), it would keep the per-CPU IST
offset for that IST type decreased by 4096 => if done enough times the
pointer goes out of bounds and it's kaboom.
> > 2.) The IST stack pointer is unbalanced in the other direction as
> > well: it is incremented on CPUn then the handler is scheduled away
> > and migrated to CPUm. After return from the handler the IST stacks
> > of both CPUs are corrupted. Exploitable by unprivileged user-space
> > code as well.
>
> The IST is restored again after the handler. [...]
yes but it is restored on the wrong CPU, after the task has scheduled
and migrated - moving the IST pointer in the TSS out of bounds, so the
next IST trap (which user-space can trigger arbitrarily - just generate
a stream of INT3 breakpoints) will corrupt memory!
> [...] You're right that strictly wouldn't be needed and could be
> avoided, but i don't think it's exploitable in any ways.
>
> Regarding user controlled pt_regs: I think you're forgetting that
> x86-64 doesn't have vm86 mode and that we can always trust pt_regs
> segment indexes. On i386 you would be right, but not here.
Thomas is not forgetting anything.
Thomas is doing security analysis about how the hole in your patch can
be exploited: the wrong IST offsets are used to corrupt nearby data
structures - a malicious nonprivileged user task can modify a good
portion of the ~64 bytes of pt_regs at the end of every page near the
IST stack address (and randomly corrupt some bytes below that) - just by
virtue of generating an int3 (or hw-debug) trap with prepared register
contents.
You first need to understand the hole you are introducing to understand
the finer aspects of his security analysis though...
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
2008-05-06 13:03 ` Andi Kleen
2008-05-06 14:34 ` Ingo Molnar
@ 2008-05-06 14:39 ` Ingo Molnar
2008-05-06 14:41 ` Ingo Molnar
2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:39 UTC (permalink / raw)
To: Andi Kleen; +Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac
* Andi Kleen <andi@firstfloor.org> wrote:
[...]
> Well it was worked around, not properly fixed. This patch fixes it
> properly. The problem of the original workaround is that it wouldn't
> print the vma now in many cases because it couldn't take the
> semaphore.
huh? While this issue is dwarfed by the security hole your patch
introduces, you miss the whole point about debug printouts in case of
traps.
In practice we dont need to print out _anything_ from int3 traps (even
if they were unexpected) - user-space very much knows it has set a
breakpoint.
What we are interested in are the segmentation faults for example. Those
do get printed out correctly as segmentation faults do not go via IST
traps, they go via the normal process stack.
Furthermore, we _do_ print out the fault location even for int3 if we
are not preemptible. An example i just triggered on latest -git:
int3[2789] trap int3 ip:4004cd sp:7fff27501c50 error:0
And we do print out the vma information too in other, much more
interesting trap types such as unresolved page faults:
segfault[2652]: segfault at 0 ip 400471 sp 7fff05d42480 error 6 in segfault[400000+1000]
So what we do worst-case is that we do not do a find_vma() and we dont
print out the vma. Not a big deal at all for an int3 or a hw-breakpoint
trap ...
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack
2008-05-06 13:03 ` Andi Kleen
2008-05-06 14:34 ` Ingo Molnar
2008-05-06 14:39 ` Ingo Molnar
@ 2008-05-06 14:41 ` Ingo Molnar
2 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-05-06 14:41 UTC (permalink / raw)
To: Andi Kleen; +Cc: Thomas Gleixner, linux-kernel, jkosina, zdenek.kabelac
* Andi Kleen <andi@firstfloor.org> wrote:
> Thomas Gleixner <tglx@linutronix.de> writes:
>
> > On Fri, 2 May 2008, Andi Kleen wrote:
> >> [Fixes a bug originally introduced in 2.6.23 I think. Reposted many times
> >> so far, but still not applied. Please apply.]
> >
> > The print_vma_addr() bug was introduced by commit 03252919 (by you) in
> > the 2.6.25 merge window and we fixed it before 25-rc2 via commit e8bff74a.
hm, why did you cut out and ignore this question from Thomas:
> > Do you know about any other breakage in that code? If not, why do
> > you claim that we ignored a bug fix?
First you are putting pressure on us by innuendo that we somehow did not
react to an unfixed x86 bug, supposedly starting in v2.6.23 and still
not fixed in v2.6.26 as we speak. This is a pretty serious accusation!
But, if one looks closer, your accusation does not hold up to scrutiny:
it turns out that it is not in v2.6.23 but v2.6.25, that it got fixed
within 2 weeks between v2.6.25-rc1 and v2.6.25-rc2, that it is not an
unfixed bug that we ignored but at most about a small detail in a debug
feature introduced by _you_.
Why are you doing this with us? First you are accusing us and then you
are ignoring straight questions that would clear us from your unfair
accusation. Why?
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2008-05-06 14:41 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-02 9:19 [PATCH REPOST^3] Run IST traps from user mode preemptive on process stack Andi Kleen
2008-05-06 12:31 ` Thomas Gleixner
2008-05-06 13:03 ` Andi Kleen
2008-05-06 14:34 ` Ingo Molnar
2008-05-06 14:39 ` Ingo Molnar
2008-05-06 14:41 ` Ingo Molnar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox