doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-18 23:01 Linux v2.5.62 --- spontaneous reboots Chris Wedgwood
@ 2003-02-19 23:35 ` Linus Torvalds
  2003-02-20  2:22   ` Zwane Mwaikambo
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2003-02-19 23:35 UTC (permalink / raw)
  To: Chris Wedgwood; +Cc: Kernel Mailing List, Martin J. Bligh, Ingo Molnar


Ok, I wrote up this doublefault task-gate handler which has gotten some
very very minimal testing, and which is probably totally buggered on SMP
machines etc, but which has caught at least one double-fault on one of my
test-machines (which I forced to double-fault by making %esp contain an
invalid value in kernel mode).

If the reboot is due to a triple-fault, this may give out some debugging 
information and then lock up hard instead of rebooting.

Change the "ptr_ok()" to match your hardware (or just make it do

	#define ptr_ok(x) (1)

since I only really wrote it that way due to debugging the damn thing).

Anyway, this patch should apply pretty directly on top of 2.5.62, and if 
you run UP it might even work. So apply this, and try to crash the 
machine, and see if it spits out any interesting information.

NOTE NOTE NOTE! When the double-fault happens, the machine as-is will be 
COMPLETELY DEAD! Don't try to access "current" or anything like that, 
since the stack is scrogged. That's why it gets the state by actually 
reading the current value of gdt, and following it to the TSS structure.

If this approach works, we can try to make the doublefault handling less 
prone to lock up the machine (ie kill the offending task and continue),  
but in the meantime at least it should avoid having things like stack 
errors result in triple faults and reboots.

Improvements welcome (and boy was this a bitch to debug).

		Linus

-----
===== arch/i386/kernel/Makefile 1.35 vs edited =====
--- 1.35/arch/i386/kernel/Makefile	Tue Feb 18 18:59:01 2003
+++ edited/arch/i386/kernel/Makefile	Wed Feb 19 11:56:49 2003
@@ -6,7 +6,8 @@
 
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
 		ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
-		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
+		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
+		doublefault.o
 
 obj-y				+= cpu/
 obj-y				+= timers/
===== arch/i386/kernel/head.S 1.24 vs edited =====
--- 1.24/arch/i386/kernel/head.S	Tue Feb 18 18:58:53 2003
+++ edited/arch/i386/kernel/head.S	Wed Feb 19 11:56:50 2003
@@ -476,6 +476,13 @@
 	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
 	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
 
+	.quad 0x0000000000000000	/* 0xd0 - unused */
+	.quad 0x0000000000000000	/* 0xd8 - unused */
+	.quad 0x0000000000000000	/* 0xe0 - unused */
+	.quad 0x0000000000000000	/* 0xe8 - unused */
+	.quad 0x0000000000000000	/* 0xf0 - unused */
+	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
+
 #if CONFIG_SMP
 	.fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */
 #endif
===== arch/i386/kernel/traps.c 1.44 vs edited =====
--- 1.44/arch/i386/kernel/traps.c	Sat Feb 15 19:30:17 2003
+++ edited/arch/i386/kernel/traps.c	Wed Feb 19 11:56:50 2003
@@ -775,7 +775,7 @@
 }
 #endif
 
-#define _set_gate(gate_addr,type,dpl,addr) \
+#define _set_gate(gate_addr,type,dpl,addr,seg) \
 do { \
   int __d0, __d1; \
   __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
@@ -785,7 +785,7 @@
 	:"=m" (*((long *) (gate_addr))), \
 	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
 	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
-	 "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \
+	 "3" ((char *) (addr)),"2" ((seg) << 16)); \
 } while (0)
 
 
@@ -797,22 +797,27 @@
  */
 void set_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,14,0,addr);
+	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
 }
 
 static void __init set_trap_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,0,addr);
+	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
 }
 
 static void __init set_system_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,3,addr);
+	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
 }
 
 static void __init set_call_gate(void *a, void *addr)
 {
-	_set_gate(a,12,3,addr);
+	_set_gate(a,12,3,addr,__KERNEL_CS);
+}
+
+static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
 }
 
 
@@ -843,7 +848,7 @@
 	set_system_gate(5,&bounds);
 	set_trap_gate(6,&invalid_op);
 	set_trap_gate(7,&device_not_available);
-	set_trap_gate(8,&double_fault);
+	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
 	set_trap_gate(9,&coprocessor_segment_overrun);
 	set_trap_gate(10,&invalid_TSS);
 	set_trap_gate(11,&segment_not_present);
===== arch/i386/kernel/cpu/common.c 1.17 vs edited =====
--- 1.17/arch/i386/kernel/cpu/common.c	Sat Dec 28 09:17:17 2002
+++ edited/arch/i386/kernel/cpu/common.c	Wed Feb 19 11:56:50 2003
@@ -490,6 +490,10 @@
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+	cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff;
+
 	/* Clear %fs and %gs. */
 	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
 
===== include/asm-i386/desc.h 1.12 vs edited =====
--- 1.12/include/asm-i386/desc.h	Sat Dec 28 09:18:49 2002
+++ edited/include/asm-i386/desc.h	Wed Feb 19 11:56:51 2003
@@ -42,10 +42,12 @@
 	"rorl $16,%%eax" \
 	: "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type))
 
-static inline void set_tss_desc(unsigned int cpu, void *addr)
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
 {
-	_set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (int)addr, 235, 0x89);
+	_set_tssldt_desc(&cpu_gdt_table[cpu][entry], (int)addr, 235, 0x89);
 }
+
+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
 
 static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
 {
===== include/asm-i386/processor.h 1.39 vs edited =====
--- 1.39/include/asm-i386/processor.h	Fri Feb 14 18:24:10 2003
+++ edited/include/asm-i386/processor.h	Wed Feb 19 11:56:51 2003
@@ -83,6 +83,7 @@
 extern struct cpuinfo_x86 boot_cpu_data;
 extern struct cpuinfo_x86 new_cpu_data;
 extern struct tss_struct init_tss[NR_CPUS];
+extern struct tss_struct doublefault_tss;
 
 #ifdef CONFIG_SMP
 extern struct cpuinfo_x86 cpu_data[];
===== include/asm-i386/segment.h 1.5 vs edited =====
--- 1.5/include/asm-i386/segment.h	Sat Dec 28 09:18:49 2002
+++ edited/include/asm-i386/segment.h	Wed Feb 19 11:56:52 2003
@@ -37,6 +37,13 @@
  *  23 - APM BIOS support
  *  24 - APM BIOS support
  *  25 - APM BIOS support 
+ *
+ *  26 - unused
+ *  27 - unused
+ *  28 - unused
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
  */
 #define GDT_ENTRY_TLS_ENTRIES	3
 #define GDT_ENTRY_TLS_MIN	6
@@ -64,10 +71,12 @@
 #define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 6)
 #define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE + 11)
 
+#define GDT_ENTRY_DOUBLEFAULT_TSS	31
+
 /*
- * The GDT has 25 entries but we pad it to cacheline boundary:
+ * The GDT has 32 entries
  */
-#define GDT_ENTRIES 28
+#define GDT_ENTRIES 32
 
 #define GDT_SIZE (GDT_ENTRIES * 8)
 
--- /dev/null	2002-08-30 16:31:37.000000000 -0700
+++ ./arch/i386/kernel/doublefault.c	2003-02-19 15:26:44.000000000 -0800
@@ -0,0 +1,65 @@
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+#define DOUBLEFAULT_STACKSIZE (1024)
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000)
+
+static void doublefault_fn(void)
+{
+	struct Xgt_desc_struct gdt_desc = {0, 0};
+	unsigned long gdt, tss;
+
+	__asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory");
+	gdt = gdt_desc.address;
+
+	printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+	if (ptr_ok(gdt)) {
+		gdt += GDT_ENTRY_TSS << 3;
+		tss = *(u16 *)(gdt+2);
+		tss += *(u8 *)(gdt+4) << 16;
+		tss += *(u8 *)(gdt+7) << 24;
+		printk("double fault, tss at %08lx\n", tss);
+
+		if (ptr_ok(tss)) {
+			struct tss_struct *t = (struct tss_struct *)tss;
+
+			printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
+
+			printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+				t->eax, t->ebx, t->ecx, t->edx);
+			printk("esi = %08lx, edi = %08lx\n",
+				t->esi, t->edi);
+		}
+	}
+
+	for (;;) /* nothing */;
+}
+
+struct tss_struct doublefault_tss __cacheline_aligned = {
+	.esp0		= STACK_START,
+	.ss0		= __KERNEL_DS,
+	.ldt		= 0,
+	.bitmap		= INVALID_IO_BITMAP_OFFSET,
+	.io_bitmap	= { [0 ... IO_BITMAP_SIZE ] = ~0 },
+
+	.eip		= (unsigned long) doublefault_fn,
+	.eflags		= 0x00000082,
+	.esp		= STACK_START,
+	.es		= __USER_DS,
+	.cs		= __KERNEL_CS,
+	.ss		= __KERNEL_DS,
+	.ds		= __USER_DS,
+
+	.__cr3		= __pa(swapper_pg_dir)
+};


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-19 23:35 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
@ 2003-02-20  2:22   ` Zwane Mwaikambo
  2003-02-20  2:26     ` William Lee Irwin III
  2003-02-20  4:52     ` Linus Torvalds
  0 siblings, 2 replies; 52+ messages in thread
From: Zwane Mwaikambo @ 2003-02-20  2:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Chris Wedgwood, Kernel Mailing List, Martin J. Bligh, Ingo Molnar,
	William Lee Irwin III

Thanks!
	Here is a triple fault case (2.5.62-pgcl) and since i'm not a Real 
Man i had to use a simulator ;) Unfortunately i can't unwind the stack.

Freeing unused kernel memory: 100k freed
double fault, gdt at c0268020 [255 bytes]
double fault, tss at c027d800
eip = c01181c4, esp = f7f9bf90
eax = c0003dfc, ebx = ffffffff, ecx = 0000007b, edx = f7f9c04c
esi = 00000003, edi = c01181b0

0xc01181c4 <do_page_fault+20>:  mov    %eax,0xc(%esp,1)

(0) [0x001139e4] 0060:c01139e4 (t doublefault_fn+c4): jmp c0113ae4  ; ebfe

eax            0x1f             31
ecx            0xc027d800       -1071130624
edx            0xc027d800       -1071130624
ebx            0xc027d800       -1071130624
esp            0xc029f7ec       0xc029f7ec
ebp            0x0              0x0
esi            0xffffffff       -1
edi            0x0              0
eip            0xc01139e4       0xc01139e4
eflags         0x4082           16514
cs             0x60             96
ss             0x68             104
ds             0x7b             123
es             0x7b             123
fs             0x0              0
gs             0x0              0

CR0=0x8005003b
    PG=paging=1
    CD=cache disable=0
    NW=not write through=0
    AM=alignment mask=1
    WP=write protect=1
    NE=numeric error=1
    ET=extension type=1
    TS=task switched=1
    EM=FPU emulation=0
    MP=monitor coprocessor=1
    PE=protection enable=1
CR2=page fault linear address=0xf7f9bf8c
CR3=0x00101000
    PCD=page-level cache disable=0
    PWT=page-level writes transparent=0
CR4=0x000000b0
    VME=virtual-8086 mode extensions=0
    PVI=protected-mode virtual interrupts=0
    TSD=time stamp disable=0
    DE=debugging extensions=0
    PSE=page size extensions=1
    PAE=physical address extension=1
    MCE=machine check enable=0
    PGE=page global enable=1
    PCE=performance-monitor counter enable=0
    OXFXSR=OS support for FXSAVE/FXRSTOR=0
    OSXMMEXCPT=OS support for unmasked SIMD FP exceptions=0

Global Descriptor Table (0xc0268020):
GDT[0x00]=??? descriptor hi=00000000, lo=00000000
GDT[0x01]=??? descriptor hi=00000000, lo=00000000
GDT[0x02]=??? descriptor hi=00000000, lo=00000000
GDT[0x03]=??? descriptor hi=00000000, lo=00000000
GDT[0x04]=??? descriptor hi=00000000, lo=00000000
GDT[0x05]=??? descriptor hi=00000000, lo=00000000
GDT[0x06]=??? descriptor hi=00000000, lo=00000000
GDT[0x07]=??? descriptor hi=00000000, lo=00000000
GDT[0x08]=??? descriptor hi=00000000, lo=00000000
GDT[0x09]=??? descriptor hi=00000000, lo=00000000
GDT[0x0a]=??? descriptor hi=00000000, lo=00000000
GDT[0x0b]=??? descriptor hi=00000000, lo=00000000
GDT[0x0c]=Code segment, linearaddr=00000000, len=fffff * 4Kbytes, Execute/Read, 32-bit addrs
GDT[0x0d]=Data segment, linearaddr=00000000, len=fffff * 4Kbytes, Read/Write, Accessed
GDT[0x0e]=Code segment, linearaddr=00000000, len=fffff * 4Kbytes, Execute/Read, 32-bit addrs
GDT[0x0f]=Data segment, linearaddr=00000000, len=fffff * 4Kbytes, Read/Write, Accessed
GDT[0x10]=32-Bit TSS (Busy) at c027d800, length 0x000eb
GDT[0x11]=LDT
GDT[0x12]=Code segment, linearaddr=00000000, len=00000 * 4Kbytes, Execute/Read, 32-bit addrs
GDT[0x13]=Code segment, linearaddr=00000000, len=00000 * 4Kbytes, Execute/Read, 16-bit addrs
GDT[0x14]=Data segment, linearaddr=00000000, len=00000 * 4Kbytes, Read/Write
GDT[0x15]=Data segment, linearaddr=00000000, len=00000 * 4Kbytes, Read/Write
GDT[0x16]=Data segment, linearaddr=00000000, len=00000 * 4Kbytes, Read/Write
GDT[0x17]=Code segment, linearaddr=00000000, len=00000 bytes, Execute/Read, 32-bit addrs
GDT[0x18]=Code segment, linearaddr=00000000, len=00000 bytes, Execute/Read, 16-bit addrs
GDT[0x19]=Data segment, linearaddr=00000000, len=00000 bytes, Read/Write
GDT[0x1a]=??? descriptor hi=00000000, lo=00000000
GDT[0x1b]=??? descriptor hi=00000000, lo=00000000
GDT[0x1c]=??? descriptor hi=00000000, lo=00000000
GDT[0x1d]=??? descriptor hi=00000000, lo=00000000
GDT[0x1e]=??? descriptor hi=00000000, lo=00000000
GDT[0x1f]=32-Bit TSS (Busy) at c027f500, length 0x000eb


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  2:22   ` Zwane Mwaikambo
@ 2003-02-20  2:26     ` William Lee Irwin III
  2003-02-20  2:55       ` Zwane Mwaikambo
  2003-02-20  4:52     ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20  2:26 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linus Torvalds, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, Ingo Molnar

On Wed, Feb 19, 2003 at 09:22:42PM -0500, Zwane Mwaikambo wrote:
> 	Here is a triple fault case (2.5.62-pgcl) and since i'm not a Real 
> Man i had to use a simulator ;) Unfortunately i can't unwind the stack.
> 
> CR2=page fault linear address=0xf7f9bf8c
> CR3=0x00101000
>     PCD=page-level cache disable=0
>     PWT=page-level writes transparent=0

Looks like either a pagetable or physmap/vmalloc/fixmap screwup.
What do the bootlogs have for those things?


-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  2:26     ` William Lee Irwin III
@ 2003-02-20  2:55       ` Zwane Mwaikambo
  2003-02-20  3:15         ` William Lee Irwin III
  0 siblings, 1 reply; 52+ messages in thread
From: Zwane Mwaikambo @ 2003-02-20  2:55 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Linus Torvalds, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, Ingo Molnar

On Wed, 19 Feb 2003, William Lee Irwin III wrote:

> On Wed, Feb 19, 2003 at 09:22:42PM -0500, Zwane Mwaikambo wrote:
> > 	Here is a triple fault case (2.5.62-pgcl) and since i'm not a Real 
> > Man i had to use a simulator ;) Unfortunately i can't unwind the stack.
> > 
> > CR2=page fault linear address=0xf7f9bf8c
> > CR3=0x00101000
> >     PCD=page-level cache disable=0
> >     PWT=page-level writes transparent=0
> 
> Looks like either a pagetable or physmap/vmalloc/fixmap screwup.
> What do the bootlogs have for those things?

Verified there were no overlapping regions. If you really really really 
want them i can put in some printks

	Zwane
-- 
function.linuxpower.ca

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  2:55       ` Zwane Mwaikambo
@ 2003-02-20  3:15         ` William Lee Irwin III
  0 siblings, 0 replies; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20  3:15 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Linus Torvalds, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, Ingo Molnar

On Wed, 19 Feb 2003, William Lee Irwin III wrote:
>> Looks like either a pagetable or physmap/vmalloc/fixmap screwup.
>> What do the bootlogs have for those things?

On Wed, Feb 19, 2003 at 09:55:47PM -0500, Zwane Mwaikambo wrote:
> Verified there were no overlapping regions. If you really really really 
> want them i can put in some printks

The printk's should have come in with the pgcl patch. Did you keep the
bootlogs? I'm looking for rounding errors in my pagetable init stuff
to see if we're trying to use memory beyond the edge of a 2MB region
we didn't bother mapping or something but that only matters for phys
mappings and so on. If you hit vmallocspace or fixmapspace it's an
entirely different question. There are also small "holes"...

So it'd be very handy to figure out which of the three spaces the
address that turned up in %cr2 was supposed to be in. I can probably
guess a little better if you told me your PAGE_MMUSHIFT value also.

-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  2:22   ` Zwane Mwaikambo
  2003-02-20  2:26     ` William Lee Irwin III
@ 2003-02-20  4:52     ` Linus Torvalds
  2003-02-20  5:07       ` William Lee Irwin III
                         ` (2 more replies)
  1 sibling, 3 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20  4:52 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Chris Wedgwood, Kernel Mailing List, Martin J. Bligh, Ingo Molnar,
	William Lee Irwin III

On Wed, 19 Feb 2003, Zwane Mwaikambo wrote:
>
> 	Here is a triple fault case (2.5.62-pgcl) and since i'm not a Real 
> Man i had to use a simulator ;) Unfortunately i can't unwind the stack.

Well, the reason you can't unwind the stack is the same reason you got the 
double fault: the stack pointer is crap.

> Freeing unused kernel memory: 100k freed
> double fault, gdt at c0268020 [255 bytes]
> double fault, tss at c027d800
> eip = c01181c4, esp = f7f9bf90
> eax = c0003dfc, ebx = ffffffff, ecx = 0000007b, edx = f7f9c04c
> esi = 00000003, edi = c01181b0

Whee. So the double-fault patch actually ends up being useful? It didn't 
help with Chris' problem, but hey, if it helps with something else..

Anyway, that %esp is crap, which also explains this:

> 0xc01181c4 <do_page_fault+20>:  mov    %eax,0xc(%esp,1)

Took a page fault because 0xc(%esp) wasn't there, and the page fault 
couldn't write the fault trace to the stack (same reason), so you got a 
double fault.

Anyway, it's hard to try to re-create any state from the above. Very few 
clues about why the stack pointer is so messed up, but _usually_ a messed 
up stack pointer is because the stack itself got hammered, and then the 
stack pointer gets corrupted when somebody restores it off the stack (ie 
the normal 

	movl %ebp,%esp
	popl %ebp
	ret

kind of epilogue thing).

You could try to make the double-fault handler print out more information, 
suggested starting point something like the following: the stack pointer 
is corrupted, but we know what the original top-of-stack was (esp0), so we 
could print out part of that stack to get a guess about what it was doing 
when it all went south..

			Linus

------

===== arch/i386/kernel/doublefault.c 1.1 vs edited =====
--- 1.1/arch/i386/kernel/doublefault.c	Wed Feb 19 17:48:55 2003
+++ edited/arch/i386/kernel/doublefault.c	Wed Feb 19 20:50:47 2003
@@ -33,13 +33,26 @@

 		if (ptr_ok(tss)) {
 			struct tss_struct *t = (struct tss_struct *)tss;
+			unsigned long esp0 = t->esp0;

 			printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);

 			printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
 				t->eax, t->ebx, t->ecx, t->edx);
-			printk("esi = %08lx, edi = %08lx\n",
-				t->esi, t->edi);
+			printk("esi = %08lx, edi = %08lx, %ebp = %08lx\n",
+				t->esi, t->edi, t->ebp);
+
+			/*
+			 * We could print out the stack contents here: esp0
+			 * is the beginning of the stack, we could print out
+			 * all the code points we can find underneath it or
+			 * something.. 
+			 */
+		
+			/* This might be a point to try to kill the process and clean up */
+			t->esp = esp0;
+			t->eip = (unsigned long) do_exit;
+			asm volatile("iret");
 		}
 	}

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  4:52     ` Linus Torvalds
@ 2003-02-20  5:07       ` William Lee Irwin III
  2003-02-20  6:05       ` Zwane Mwaikambo
  2003-02-20 11:46       ` Ingo Molnar
  2 siblings, 0 replies; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20  5:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, Ingo Molnar

On Wed, Feb 19, 2003 at 08:52:46PM -0800, Linus Torvalds wrote:
> Whee. So the double-fault patch actually ends up being useful? It didn't 
> help with Chris' problem, but hey, if it helps with something else..
> Anyway, that %esp is crap, which also explains this:
>> 0xc01181c4 <do_page_fault+20>:  mov    %eax,0xc(%esp,1)
> Took a page fault because 0xc(%esp) wasn't there, and the page fault 
> couldn't write the fault trace to the stack (same reason), so you got a 
> double fault.

Not sure where he got his %esp, but I extracted the following:

<zwane> MAXMEM=0x33e00000
<zwane> vmalloc: start = 0xf3e1f000, end = 0xfbe21000
<zwane> fixaddr: start = 0xfbe23000, end = 0xfffff000

which means somehow %esp landed in an unmapped tidbit in the middle of
vmallocspace that isn't even mapped. I highly suspect rounding
errors of mine since I squished vmallocspace, fixmapspace, and the
physical mapping so close together they might share L3 pagetables, i.e.
they're separated by 2*MMUPAGE_SIZE instead of customary 8MB or so.


-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  4:52     ` Linus Torvalds
  2003-02-20  5:07       ` William Lee Irwin III
@ 2003-02-20  6:05       ` Zwane Mwaikambo
  2003-02-20 11:46       ` Ingo Molnar
  2 siblings, 0 replies; 52+ messages in thread
From: Zwane Mwaikambo @ 2003-02-20  6:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Chris Wedgwood, Kernel Mailing List, Martin J. Bligh, Ingo Molnar,
	William Lee Irwin III

On Wed, 19 Feb 2003, Linus Torvalds wrote:

> +			printk("esi = %08lx, edi = %08lx, %ebp = %08lx\n",
> +				t->esi, t->edi, t->ebp);

Too much AT&T for you ;) '%ebp'

> +			 * We could print out the stack contents here: esp0
> +			 * is the beginning of the stack, we could print out
> +			 * all the code points we can find underneath it or
> +			 * something.. 
> +			 */

Simulator managed to dump stack for me, nothing interesting though

  > +		
> +			/* This might be a point to try to kill the process and clean up */
> +			t->esp = esp0;
> +			t->eip = (unsigned long) do_exit;
> +			asm volatile("iret");
>  		}
>  	}
>  
> 
> 

Here is what i managed to fish out from the sim, not a real call trace, 
i just piped the stack contents through ksymoops.

Trace; c02b97ec <doublefault_stack+fec/1000>
Trace; c02b97ee <doublefault_stack+fee/1000>
Trace; c02b97f0 <doublefault_stack+ff0/1000>
Trace; c02b97f2 <doublefault_stack+ff2/1000>
Trace; c02b97f4 <doublefault_stack+ff4/1000>
Trace; c02b97f6 <doublefault_stack+ff6/1000>
Trace; c02b97f8 <doublefault_stack+ff8/1000>
Trace; c02b97fa <doublefault_stack+ffa/1000>
Trace; c02b97fc <doublefault_stack+ffc/1000>
Trace; c02b97fe <doublefault_stack+ffe/1000>
Trace; c02b9800 <use_tsc+0/4>
Trace; c02b9802 <use_tsc+2/4>
Trace; c02b9804 <delay_at_last_interrupt+0/4>
Trace; c02b9806 <delay_at_last_interrupt+2/4>
Trace; c02b9808 <last_tsc_low+0/4>
Trace; c02b980a <last_tsc_low+2/4>
Trace; c02b980c <fast_gettimeoffset_quotient+0/4>
Trace; c02b980e <fast_gettimeoffset_quotient+2/4>
Trace; c02b9810 <pm_power_off+0/4>
Trace; c02b9812 <pm_power_off+2/4>
Trace; c02b9814 <no_idt+0/8>
Trace; c02b9816 <no_idt+2/8>
Trace; c02b9818 <no_idt+4/8>
Trace; c02b981a <no_idt+6/8>
Trace; c02b981c <reboot_mode+0/4>
Trace; c02b981e <reboot_mode+2/4>
Trace; c02b9820 <reboot_thru_bios+0/4>
Trace; c02b9822 <reboot_thru_bios+2/4>
Trace; c02b9824 <flush_cpumask+0/4>
Trace; c02b9826 <flush_cpumask+2/4>
Trace; c02b9828 <flush_mm+0/4>
Trace; c02b982a <flush_mm+2/4>
Trace; c02b982c <flush_va+0/4>
Trace; c02b982e <flush_va+2/4>
Trace; c02b9830 <call_data+0/8>
Trace; c02b9832 <call_data+2/8>
Trace; c02b9834 <call_data+4/8>
Trace; c02b9836 <call_data+6/8>
Trace; c02b9838 <cacheflush_time+0/8>
Trace; c02b983a <cacheflush_time+2/8>
Trace; c02b983c <cacheflush_time+4/8>
Trace; c02b983e <cacheflush_time+6/8>
Trace; c02b9840 <cpu_online_map+0/4>
Trace; c02b9842 <cpu_online_map+2/4>
Trace; c02b9844 <cpu_callout_map+0/4>
Trace; c02b9846 <cpu_callout_map+2/4>
Trace; c02b9848 <smp_threads_ready+0/4>
Trace; c02b984a <smp_threads_ready+2/4>
Trace; c02b984c <cache_decay_ticks+0/4>
Trace; c02b984e <cache_decay_ticks+2/4>
Trace; c02b9850 <phys_proc_id+0/4>
Trace; c02b9852 <phys_proc_id+2/4>
Trace; c02b9854 <cpu_callin_map+0/4>
Trace; c02b9856 <cpu_callin_map+2/4>
Trace; c02b9858 <smp_commenced_mask+0/4>
Trace; c02b985a <smp_commenced_mask+2/4>
Trace; c02b985c <trampoline_base+0/4>
Trace; c02b985e <trampoline_base+2/4>
Trace; c02b9860 <tsc_values+0/8>
Trace; c02b9862 <tsc_values+2/8>
Trace; c02b9864 <tsc_values+4/8>
Trace; c02b9866 <tsc_values+6/8>
Trace; c02b9868 <init_deasserted+0/4>
Trace; c02b986a <init_deasserted+2/4>

-- 
function.linuxpower.ca

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20  4:52     ` Linus Torvalds
  2003-02-20  5:07       ` William Lee Irwin III
  2003-02-20  6:05       ` Zwane Mwaikambo
@ 2003-02-20 11:46       ` Ingo Molnar
  2003-02-20 12:12         ` William Lee Irwin III
                           ` (2 more replies)
  2 siblings, 3 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 11:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


i think i managed to trigger a potentially useful oops, with BK-curr:

Unable to handle kernel paging request at virtual address 6b6b6b8b
 printing eip:
c011944b
*pde = 00000000
Oops: 0002
CPU:    0
EIP:    0060:[<c011944b>]    Not tainted
EFLAGS: 00010046
EIP is at do_page_fault+0x7b/0x4e4
eax: 6b6b6b8b   ebx: 6b6b6b6b   ecx: 0000002b   edx: c02dd6ac
esi: 6b6b6b8b   edi: ca095320   ebp: ca092170   esp: ca0920c8
ds: 007b   es: 007b   ss: 0068
Process start-threads (pid: 21685, threadinfo=ca090000 task=ca094ce0)
Stack: c02dd6ac 0000002b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b8b 6b6b6b6b 6b6b6b6b
       6b6b6b6b 00030001 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b
       6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b
Call Trace:

 [tons of pagefault recursion]

 [<c01193d0>] do_page_fault+0x0/0x4e4
 [<c010a691>] error_code+0x2d/0x38
 [<c011944b>] do_page_fault+0x7b/0x4e4
 [<c01193d0>] do_page_fault+0x0/0x4e4
 [<c010a691>] error_code+0x2d/0x38
 [<c011944b>] do_page_fault+0x7b/0x4e4
 [<c01294f8>] do_timer+0xc8/0xd0
 [<c013330c>] rcu_process_callbacks+0x17c/0x1b0
 [<c011b4bf>] scheduler_tick+0x3ff/0x410
 [<c0125113>] tasklet_action+0x73/0xc0
 [<c01193d0>] do_page_fault+0x0/0x4e4
 [<c010a691>] error_code+0x2d/0x38
 [<c011b598>] schedule+0xb8/0x3d0
 [<c01219fd>] release_task+0x17d/0x200
 [<c011e70f>] mmput+0x1f/0xc0
 [<c0122cad>] do_exit+0x31d/0x3b0
 [<c010b328>] do_nmi+0x58/0x60
 [<c012a93e>] __dequeue_signal+0x6e/0xb0
 [<c0122ef0>] do_group_exit+0x110/0x140
 [<c012a9ae>] dequeue_signal+0x2e/0x60
 [<c012c2b1>] get_signal_to_deliver+0x2b1/0x440
 [<c01099a2>] do_signal+0xb2/0xf0
 [<c01296c4>] schedule_timeout+0x74/0xc0
 [<c012c4f9>] sigprocmask+0x89/0x140
 [<c0129640>] process_timeout+0x0/0x10
 [<c012c62d>] sys_rt_sigprocmask+0x7d/0x1a0
 [<c0129944>] sys_nanosleep+0x154/0x180
 [<c0109a3b>] do_notify_resume+0x5b/0x60
 [<c0109c72>] work_notifysig+0x13/0x15



^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 11:46       ` Ingo Molnar
@ 2003-02-20 12:12         ` William Lee Irwin III
  2003-02-20 12:33           ` Ingo Molnar
  2003-02-20 14:00         ` Zwane Mwaikambo
  2003-02-20 15:43         ` Linus Torvalds
  2 siblings, 1 reply; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20 12:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Zwane Mwaikambo, Chris Wedgwood,
	Kernel Mailing List, Martin J. Bligh

On Thu, Feb 20, 2003 at 12:46:51PM +0100, Ingo Molnar wrote:
> i think i managed to trigger a potentially useful oops, with BK-curr:
> Stack: c02dd6ac 0000002b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b8b 6b6b6b6b 6b6b6b6b
>        6b6b6b6b 00030001 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b
>        6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b

Looks like some kind of serious use-after-free slab issue. IF is clear,
so we aren't under spin_lock_irq(&rq->lock) on the initial fault. It
might be interesting to find a way to trap it earlier. Reproducible?
If so, how?


-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 12:12         ` William Lee Irwin III
@ 2003-02-20 12:33           ` Ingo Molnar
  2003-02-20 14:03             ` Zwane Mwaikambo
  0 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 12:33 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Linus Torvalds, Zwane Mwaikambo, Chris Wedgwood,
	Kernel Mailing List, Martin J. Bligh


i had some other stuff in my tree as well, which could be the culprit. The
crash looked unrelated though. (procfs optimizations for the threaded
case.)

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 11:46       ` Ingo Molnar
  2003-02-20 12:12         ` William Lee Irwin III
@ 2003-02-20 14:00         ` Zwane Mwaikambo
  2003-02-20 15:43         ` Linus Torvalds
  2 siblings, 0 replies; 52+ messages in thread
From: Zwane Mwaikambo @ 2003-02-20 14:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Thu, 20 Feb 2003, Ingo Molnar wrote:

> 
> i think i managed to trigger a potentially useful oops, with BK-curr:
> 
> Unable to handle kernel paging request at virtual address 6b6b6b8b
>  printing eip:
> c011944b
> *pde = 00000000
> Oops: 0002
> CPU:    0
> EIP:    0060:[<c011944b>]    Not tainted
> EFLAGS: 00010046
> EIP is at do_page_fault+0x7b/0x4e4
> eax: 6b6b6b8b   ebx: 6b6b6b6b   ecx: 0000002b   edx: c02dd6ac
> esi: 6b6b6b8b   edi: ca095320   ebp: ca092170   esp: ca0920c8
> ds: 007b   es: 007b   ss: 0068
> Process start-threads (pid: 21685, threadinfo=ca090000 task=ca094ce0)
> Stack: c02dd6ac 0000002b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b8b 6b6b6b6b 6b6b6b6b
>        6b6b6b6b 00030001 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b
>        6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b
> Call Trace:

I've seen this with 2.5.62, it's here;

00407434086i[CPU0 ] task_switch: bad LDT segment at c0121a00
00407434086i[CPU0 ] task switch: posting exception 10 after commit point
00407434086p[CPU0 ] >>PANIC<< can_push(): SS invalidated.
00407434086i[SYS  ] Last time is 1045745354
00407434086i[XGUI ] Exit.
00407434086i[CPU0 ] protected mode
00407434086i[CPU0 ] CS.d_b = 32 bit
00407434086i[CPU0 ] SS.d_b = 32 bit
00407434086i[CPU0 ] | EAX=f7ffd6b4  EBX=ffffffff  ECX=0000007b  
EDX=f7f9c048
00407434086i[CPU0 ] | ESP=c02b97dc  EBP=00000001  ESI=00000000  
EDI=c0118250
00407434086i[CPU0 ] | IOPL=0 NV UP DI NG NZ NA PO NC
00407434086i[CPU0 ] | SEG selector     base    limit G D
00407434086i[CPU0 ] | SEG sltr(index|ti|rpl)     base    limit G D
00407434086i[CPU0 ] |  DS:007b( 000f| 0|  3) 00000000 000fffff 1 1
00407434086i[CPU0 ] |  ES:007b( 000f| 0|  3) 00000000 000fffff 1 1
00407434086i[CPU0 ] |  FS:0000( 0000| 0|  0) 00000000 000fffff 1 1
00407434086i[CPU0 ] |  GS:0000( 0000| 0|  0) 00000000 000fffff 1 1
00407434086i[CPU0 ] |  SS:0068( 000d| 0|  0) 00000000 000fffff 1 1
00407434086i[CPU0 ] |  CS:0060( 000c| 0|  0) 00000000 000fffff 1 1
00407434086i[CPU0 ] | EIP=c0121a00 (c0121a00)
00407434086i[CPU0 ] | CR0=0x8005003b CR1=0x00000000 CR2=0xf7f9bf88
00407434086i[CPU0 ] | CR3=0x00000000 CR4=0x000000b0
00407434086i[CPU0 ] >> 55
00407434086i[CPU0 ] >> : push EBP

(gdb) disassemble 0xc0121a00
Dump of assembler code for function do_exit:
0xc0121a00 <do_exit>:   push   %ebp
0xc0121a01 <do_exit+1>: push   %edi
0xc0121a02 <do_exit+2>: push   %esi
0xc0121a03 <do_exit+3>: push   %ebx

-- 
function.linuxpower.ca

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 12:33           ` Ingo Molnar
@ 2003-02-20 14:03             ` Zwane Mwaikambo
  0 siblings, 0 replies; 52+ messages in thread
From: Zwane Mwaikambo @ 2003-02-20 14:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: William Lee Irwin III, Linus Torvalds, Chris Wedgwood,
	Kernel Mailing List, Martin J. Bligh

On Thu, 20 Feb 2003, Ingo Molnar wrote:

> 
> i had some other stuff in my tree as well, which could be the culprit. The
> crash looked unrelated though. (procfs optimizations for the threaded
> case.)

I can provide more debug information when i get back from work later.

Cheers,
	Zwane
-- 
function.linuxpower.ca

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 11:46       ` Ingo Molnar
  2003-02-20 12:12         ` William Lee Irwin III
  2003-02-20 14:00         ` Zwane Mwaikambo
@ 2003-02-20 15:43         ` Linus Torvalds
  2003-02-20 15:52           ` Ingo Molnar
                             ` (3 more replies)
  2 siblings, 4 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 15:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Thu, 20 Feb 2003, Ingo Molnar wrote:
> 
> i think i managed to trigger a potentially useful oops, with BK-curr:

Ok, this is definitely a stack overflow:

> EIP is at do_page_fault+0x7b/0x4e4
> eax: 6b6b6b8b   ebx: 6b6b6b6b   ecx: 0000002b   edx: c02dd6ac
> esi: 6b6b6b8b   edi: ca095320   ebp: ca092170   esp: ca0920c8
> ds: 007b   es: 007b   ss: 0068
> Process start-threads (pid: 21685, threadinfo=ca090000 task=ca094ce0)

Note the "threadinfo=ca090000" and "esp: ca0920c8".

If the threadinfo isn't on the same double-page as the stack, then you're 
screwed, and you've just overwritten the _real_ threadinfo, and the stack 
is probably screwed. In fact, any recursion on do_page_fault() is 
_probably_ due to the fact that you overwrote thread-info.

This could explain Chris' problems too - my doublefault thing won't help
much if recursion on the stack has clobbered a lot of kernel state (and
the doublefault will likely happen only after enough state is clobbered 
that even the doublefault handling might have trouble).

>  [tons of pagefault recursion]
> 
>  [<c01193d0>] do_page_fault+0x0/0x4e4
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011944b>] do_page_fault+0x7b/0x4e4
>  [<c01193d0>] do_page_fault+0x0/0x4e4
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011944b>] do_page_fault+0x7b/0x4e4
>  [<c01294f8>] do_timer+0xc8/0xd0
>  [<c013330c>] rcu_process_callbacks+0x17c/0x1b0
>  [<c011b4bf>] scheduler_tick+0x3ff/0x410
>  [<c0125113>] tasklet_action+0x73/0xc0
>  [<c01193d0>] do_page_fault+0x0/0x4e4
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b598>] schedule+0xb8/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0
>  [<c010b328>] do_nmi+0x58/0x60
>  [<c012a93e>] __dequeue_signal+0x6e/0xb0
>  [<c0122ef0>] do_group_exit+0x110/0x140
>  [<c012a9ae>] dequeue_signal+0x2e/0x60
>  [<c012c2b1>] get_signal_to_deliver+0x2b1/0x440
>  [<c01099a2>] do_signal+0xb2/0xf0
>  [<c01296c4>] schedule_timeout+0x74/0xc0
>  [<c012c4f9>] sigprocmask+0x89/0x140
>  [<c0129640>] process_timeout+0x0/0x10
>  [<c012c62d>] sys_rt_sigprocmask+0x7d/0x1a0
>  [<c0129944>] sys_nanosleep+0x154/0x180
>  [<c0109a3b>] do_notify_resume+0x5b/0x60
>  [<c0109c72>] work_notifysig+0x13/0x15

I bet the doublefaults are on "tsk->mm" accesses (specifically, 
"tsk->mm->mmap_sem", which should be the first of them).

That easily happens if "tsk" is crud (either because recursion has already 
overwritten it, _or_ because %esp has recursed so far down that the 
"current()" logic ends up hitting the next page).

The stack doesn't look _that_ deep to me, but if some of these functions
have a large local frame, then that would certainly do it.. At a guess, it
looks like a fairly deep "schedule()" coupled with deep RCU processing.

And that RCU path is reasonably new. The infrastructure was put in 2.5.43, 
which might explain Chris' case too ("somewhere before 2.5.51").

Does anybody have an up-to-date "use -gp and a special 'mcount()' 
function to check stack depth" patch? The CONFIG_DEBUG_STACKOVERFLOW thing 
is quite possibly too stupid to find things like this (it only finds 
interrupts that overflow the stack, not deep call sequences).

Guys: you could try to enable CONFIG_DEBUG_STACKOVERFLOW, and then perhaps 
make it a bit more aggressive (right now it does:

                if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {

and I'd suggest changing it to something more like

		/* Have we used up more than half the stack? */
		if (unlikely(esp < 4096)) {

and add a "for (;;)" after doing the dump_stack() because otherwise the 
machine may reboot before you get anywhere.

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 15:43         ` Linus Torvalds
@ 2003-02-20 15:52           ` Ingo Molnar
  2003-02-20 16:11           ` Martin J. Bligh
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 15:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


another datapoint: on SMP i can get various types of backtraces, on UP
it's the spontaneous reboot that triggers.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 15:43         ` Linus Torvalds
  2003-02-20 15:52           ` Ingo Molnar
@ 2003-02-20 16:11           ` Martin J. Bligh
  2003-02-20 16:54             ` Linus Torvalds
  2003-02-20 23:09             ` Chris Wedgwood
  2003-02-20 16:44           ` Ingo Molnar
  2003-02-20 20:13           ` Chris Wedgwood
  3 siblings, 2 replies; 52+ messages in thread
From: Martin J. Bligh @ 2003-02-20 16:11 UTC (permalink / raw)
  To: Linus Torvalds, Ingo Molnar, Dave Hansen
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	William Lee Irwin III

[-- Attachment #1: Type: text/plain, Size: 1140 bytes --]

> Does anybody have an up-to-date "use -gp and a special 'mcount()' 
> function to check stack depth" patch? The CONFIG_DEBUG_STACKOVERFLOW thing 
> is quite possibly too stupid to find things like this (it only finds 
> interrupts that overflow the stack, not deep call sequences).
> 
> Guys: you could try to enable CONFIG_DEBUG_STACKOVERFLOW, and then perhaps 
> make it a bit more aggressive (rigth now it does:
> 
>                 if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {
> 
> and I'd suggest changing it to something more like
> 
> 		/* Have we used up more than half the stack? */
> 		if (unlikely(esp < 4096)) {
> 
> and add a "for (;;)" after doing the dump_stack() because otherwise the 
> machine may reboot before you get anywhere.

There are patches in -mjb from Dave Hansen / Ben LaHaise to detect stack
overflow included with the stuff for the 4K stacks patch (intended for 
scaling to large numbers of tasks). I've split them out attached, should 
apply to mainline reasonably easily.

M.

PS. Linus, I think the attachments will work for you as they're text/plain,
if not, I'll resend them all inline.

[-- Attachment #2: 220-thread_info_cleanup --]
[-- Type: text/plain, Size: 4328 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 211-shpte/arch/i386/kernel/entry.S 220-thread_info_cleanup/arch/i386/kernel/entry.S
--- 211-shpte/arch/i386/kernel/entry.S	Sun Feb 16 15:10:13 2003
+++ 220-thread_info_cleanup/arch/i386/kernel/entry.S	Mon Feb 17 10:57:56 2003
@@ -155,7 +155,7 @@ do_lcall:
 	movl %eax,EFLAGS(%ebp)	#
 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
 	movl %ecx,CS(%ebp)	#
-	andl $-8192, %ebp	# GET_THREAD_INFO
+	GET_THREAD_INFO_WITH_ESP(%ebp)  # GET_THREAD_INFO
 	movl TI_EXEC_DOMAIN(%ebp), %edx	# Get the execution domain
 	call *4(%edx)		# Call the lcall7 handler for the domain
 	addl $4, %esp
diff -urpN -X /home/fletch/.diff.exclude 211-shpte/arch/i386/kernel/head.S 220-thread_info_cleanup/arch/i386/kernel/head.S
--- 211-shpte/arch/i386/kernel/head.S	Thu Jan  2 22:04:58 2003
+++ 220-thread_info_cleanup/arch/i386/kernel/head.S	Mon Feb 17 10:57:56 2003
@@ -16,6 +16,7 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/cache.h>
+#include <asm/thread_info.h>
 
 #define OLD_CL_MAGIC_ADDR	0x90020
 #define OLD_CL_MAGIC		0xA33F
@@ -309,7 +310,7 @@ rp_sidt:
 	ret
 
 ENTRY(stack_start)
-	.long init_thread_union+8192
+	.long init_thread_union+THREAD_SIZE
 	.long __BOOT_DS
 
 /* This is the default interrupt "handler" :-) */
diff -urpN -X /home/fletch/.diff.exclude 211-shpte/include/asm-i386/page.h 220-thread_info_cleanup/include/asm-i386/page.h
--- 211-shpte/include/asm-i386/page.h	Sun Feb 16 13:18:59 2003
+++ 220-thread_info_cleanup/include/asm-i386/page.h	Mon Feb 17 10:57:56 2003
@@ -3,7 +3,11 @@
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
-#define PAGE_SIZE	(1UL << PAGE_SHIFT)
+#ifndef __ASSEMBLY__
+#define PAGE_SIZE      (1UL << PAGE_SHIFT)
+#else
+#define PAGE_SIZE      (1 << PAGE_SHIFT)
+#endif
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
diff -urpN -X /home/fletch/.diff.exclude 211-shpte/include/asm-i386/thread_info.h 220-thread_info_cleanup/include/asm-i386/thread_info.h
--- 211-shpte/include/asm-i386/thread_info.h	Thu Jan  9 19:16:11 2003
+++ 220-thread_info_cleanup/include/asm-i386/thread_info.h	Mon Feb 17 10:57:56 2003
@@ -9,6 +9,7 @@
 
 #ifdef __KERNEL__
 
+#include <asm/page.h>
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
 #endif
@@ -57,11 +58,14 @@ struct thread_info {
  *
  * preempt_count needs to be 1 initially, until the scheduler is functional.
  */
+#define THREAD_ORDER 1 
+#define INIT_THREAD_SIZE       THREAD_SIZE
+
 #ifndef __ASSEMBLY__
 
 #define INIT_THREAD_INFO(tsk)			\
 {						\
-	.task		= &tsk,			\
+	.task		= &tsk,         	\
 	.exec_domain	= &default_exec_domain,	\
 	.flags		= 0,			\
 	.cpu		= 0,			\
@@ -75,30 +79,36 @@ struct thread_info {
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
+/* thread information allocation */
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
+#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
+#define get_thread_info(ti) get_task_struct((ti)->task)
+#define put_thread_info(ti) put_task_struct((ti)->task)
+
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
 {
 	struct thread_info *ti;
-	__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL));
+	__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
 	return ti;
 }
 
-/* thread information allocation */
-#define THREAD_SIZE (2*PAGE_SIZE)
-#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1))
-#define free_thread_info(ti) free_pages((unsigned long) (ti), 1)
-#define get_thread_info(ti) get_task_struct((ti)->task)
-#define put_thread_info(ti) put_task_struct((ti)->task)
-
 #else /* !__ASSEMBLY__ */
 
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+
 /* how to get the thread information struct from ASM */
 #define GET_THREAD_INFO(reg) \
-	movl $-8192, reg; \
+	movl $-THREAD_SIZE, reg; \
 	andl %esp, reg
 
-#endif
+/* use this one if reg already contains %esp */
+#define GET_THREAD_INFO_WITH_ESP(reg) \
+	andl $-THREAD_SIZE, reg
 
+#endif
+	 
 /*
  * thread information flags
  * - these are process state flags that various assembly files may need to access

[-- Attachment #3: 221-interrupt_stacks --]
[-- Type: text/plain, Size: 13839 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/Kconfig 221-interrupt_stacks/arch/i386/Kconfig
--- 220-thread_info_cleanup/arch/i386/Kconfig	Mon Feb 17 10:55:52 2003
+++ 221-interrupt_stacks/arch/i386/Kconfig	Mon Feb 17 10:57:57 2003
@@ -374,6 +374,11 @@ config X86_SSE2
 	depends on MK8 || MPENTIUM4
 	default y
 
+config X86_CMOV
+	bool
+	depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE
+	default y
+
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/apic.c 221-interrupt_stacks/arch/i386/kernel/apic.c
--- 220-thread_info_cleanup/arch/i386/kernel/apic.c	Sat Feb 15 16:11:40 2003
+++ 221-interrupt_stacks/arch/i386/kernel/apic.c	Mon Feb 17 10:57:57 2003
@@ -1040,7 +1040,8 @@ inline void smp_local_timer_interrupt(st
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
 
-void smp_apic_timer_interrupt(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs)
 {
 	int cpu = smp_processor_id();
 
@@ -1060,14 +1061,16 @@ void smp_apic_timer_interrupt(struct pt_
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
 	irq_enter();
-	smp_local_timer_interrupt(&regs);
+	smp_local_timer_interrupt(regs);
 	irq_exit();
+	return regs;
 }
 
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-asmlinkage void smp_spurious_interrupt(void)
+struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs)
 {
 	unsigned long v;
 
@@ -1085,13 +1088,15 @@ asmlinkage void smp_spurious_interrupt(v
 	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
 			smp_processor_id());
 	irq_exit();
+	return regs;
 }
 
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
 
-asmlinkage void smp_error_interrupt(void)
+struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_error_interrupt(struct pt_regs* regs)
 {
 	unsigned long v, v1;
 
@@ -1116,6 +1121,7 @@ asmlinkage void smp_error_interrupt(void
 	printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n",
 	        smp_processor_id(), v , v1);
 	irq_exit();
+	return regs;
 }
 
 /*
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/cpu/mcheck/p4.c 221-interrupt_stacks/arch/i386/kernel/cpu/mcheck/p4.c
--- 220-thread_info_cleanup/arch/i386/kernel/cpu/mcheck/p4.c	Thu Jan  2 22:04:58 2003
+++ 221-interrupt_stacks/arch/i386/kernel/cpu/mcheck/p4.c	Mon Feb 17 10:57:57 2003
@@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru
 /* Thermal interrupt handler for this CPU setup */
 static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
 
-asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs));
+struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs)
 {
 	irq_enter();
 	vendor_thermal_interrupt(&regs);
 	irq_exit();
+	return regs;
 }
 
 /* P4/Xeon Thermal regulation detect and init */
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/entry.S 221-interrupt_stacks/arch/i386/kernel/entry.S
--- 220-thread_info_cleanup/arch/i386/kernel/entry.S	Mon Feb 17 10:57:56 2003
+++ 221-interrupt_stacks/arch/i386/kernel/entry.S	Mon Feb 17 10:57:57 2003
@@ -388,17 +388,78 @@ ENTRY(irq_entries_start)
 vector=vector+1
 .endr
 
+
+# lets play optimizing compiler...
+#ifdef CONFIG_X86_CMOV
+#define COND_MOVE	cmovnz %esi,%esp;
+#else
+#define COND_MOVE	\
+	jz 1f;		\
+	mov %esi,%esp;	\
+1:
+#endif
+
+# These macros will switch you to, and from a per-cpu interrupt stack
+# They take the pt_regs arg and move it from the normal place on the 
+# stack to %eax.  Any handler function can retrieve it using regparm(1). 
+# The handlers are expected to return the stack to switch back to in 
+# the same register. 
+#
+# This means that the irq handlers need to return their arg
+#
+# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi
+# old stack gets put in %eax
+
+.macro SWITCH_TO_IRQSTACK 
+	GET_THREAD_INFO(%ebx);
+	movl TI_IRQ_STACK(%ebx),%ecx;
+	movl TI_TASK(%ebx),%edx;
+	movl %esp,%eax;
+
+	# %ecx+THREAD_SIZE is next stack -4 keeps us in the right one
+	leal (THREAD_SIZE-4)(%ecx),%esi; 
+
+	# is there a valid irq_stack?
+	testl %ecx,%ecx;
+	COND_MOVE;
+
+	# update the task pointer in the irq stack
+	GET_THREAD_INFO(%esi);
+	movl %edx,TI_TASK(%esi);
+
+	# update the preempt count in the irq stack
+	movl TI_PRE_COUNT(%ebx),%ecx;
+	movl %ecx,TI_PRE_COUNT(%esi);
+.endm
+
+# copy flags from the irq stack back into the task's thread_info
+# %esi is saved over the irq handler call and contains the irq stack's
+#      thread_info pointer
+# %eax was returned from the handler, as described above
+# %ebx contains the original thread_info pointer
+
+.macro RESTORE_FROM_IRQSTACK 
+	movl %eax,%esp;
+	movl TI_FLAGS(%esi),%eax;
+	movl $0,TI_FLAGS(%esi);
+	LOCK orl %eax,TI_FLAGS(%ebx);
+.endm
+
 	ALIGN
 common_interrupt:
 	SAVE_ALL
+	SWITCH_TO_IRQSTACK
 	call do_IRQ
+	RESTORE_FROM_IRQSTACK
 	jmp ret_from_intr
 
 #define BUILD_INTERRUPT(name, nr)	\
 ENTRY(name)				\
 	pushl $nr-256;			\
 	SAVE_ALL			\
-	call smp_/**/name;	\
+	SWITCH_TO_IRQSTACK;		\
+	call smp_/**/name;		\
+	RESTORE_FROM_IRQSTACK;		\
 	jmp ret_from_intr;
 
 /* The include is where all of the SMP etc. interrupts come from */
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/init_task.c 221-interrupt_stacks/arch/i386/kernel/init_task.c
--- 220-thread_info_cleanup/arch/i386/kernel/init_task.c	Thu Feb 13 11:08:02 2003
+++ 221-interrupt_stacks/arch/i386/kernel/init_task.c	Mon Feb 17 10:57:57 2003
@@ -14,6 +14,10 @@ static struct signal_struct init_signals
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
 
+union thread_union init_irq_union
+	__attribute__((__section__(".data.init_task")));
+
+
 /*
  * Initial thread structure.
  *
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/irq.c 221-interrupt_stacks/arch/i386/kernel/irq.c
--- 220-thread_info_cleanup/arch/i386/kernel/irq.c	Thu Feb 13 11:08:02 2003
+++ 221-interrupt_stacks/arch/i386/kernel/irq.c	Mon Feb 17 10:57:57 2003
@@ -311,7 +311,8 @@ void enable_irq(unsigned int irq)
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-asmlinkage unsigned int do_IRQ(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs));
+struct pt_regs * do_IRQ(struct pt_regs *regs)
 {	
 	/* 
 	 * We ack quickly, we don't want the irq controller
@@ -323,7 +324,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 * 0 return value means that this irq is already being
 	 * handled by some other CPU. (or is disabled)
 	 */
-	int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code  */
+	int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code  */
 	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
@@ -388,7 +389,7 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 */
 	for (;;) {
 		spin_unlock(&desc->lock);
-		handle_IRQ_event(irq, &regs, action);
+		handle_IRQ_event(irq, regs, action);
 		spin_lock(&desc->lock);
 		
 		if (likely(!(desc->status & IRQ_PENDING)))
@@ -407,7 +408,7 @@ out:
 
 	irq_exit();
 
-	return 1;
+	return regs;
 }
 
 /**
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/process.c 221-interrupt_stacks/arch/i386/kernel/process.c
--- 220-thread_info_cleanup/arch/i386/kernel/process.c	Thu Feb 13 11:08:02 2003
+++ 221-interrupt_stacks/arch/i386/kernel/process.c	Mon Feb 17 10:57:57 2003
@@ -432,6 +432,7 @@ void __switch_to(struct task_struct *pre
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
+	next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack;
 	unlazy_fpu(prev_p);
 
 	/*
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/smp.c 221-interrupt_stacks/arch/i386/kernel/smp.c
--- 220-thread_info_cleanup/arch/i386/kernel/smp.c	Sun Feb 16 13:22:10 2003
+++ 221-interrupt_stacks/arch/i386/kernel/smp.c	Mon Feb 17 10:57:57 2003
@@ -305,7 +305,8 @@ static inline void leave_mm (unsigned lo
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs));
+struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs)
 {
 	unsigned long cpu;
 
@@ -336,6 +337,7 @@ asmlinkage void smp_invalidate_interrupt
 
 out:
 	put_cpu_no_resched();
+	return regs;
 }
 
 static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
@@ -598,12 +600,15 @@ void smp_send_stop(void)
  * all the work is done automatically when
  * we return from the interrupt.
  */
-asmlinkage void smp_reschedule_interrupt(void)
+struct pt_regs * IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *regs));
+struct pt_regs * smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
+	return regs;
 }
 
-asmlinkage void smp_call_function_interrupt(struct pt_regs regs)
+struct pt_regs * IRQHANDLER(smp_call_function_interrupt(struct pt_regs *regs));
+struct pt_regs * smp_call_function_interrupt(struct pt_regs *regs)
 {
 	void (*func) (void *info, struct pt_regs *) = (void (*)(void *, struct pt_regs*))call_data->func;
 	void *info = call_data->info;
@@ -627,5 +632,6 @@ asmlinkage void smp_call_function_interr
 		mb();
 		atomic_inc(&call_data->finished);
 	}
+	return regs;
 }
 
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/arch/i386/kernel/smpboot.c 221-interrupt_stacks/arch/i386/kernel/smpboot.c
--- 220-thread_info_cleanup/arch/i386/kernel/smpboot.c	Sun Feb 16 13:18:39 2003
+++ 221-interrupt_stacks/arch/i386/kernel/smpboot.c	Mon Feb 17 10:57:57 2003
@@ -71,6 +71,11 @@ static unsigned long smp_commenced_mask;
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
+/* Per CPU interrupt stacks */
+extern union thread_union init_irq_union;
+union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned =
+	{ &init_irq_union, };
+
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
@@ -770,6 +775,28 @@ wakeup_secondary_cpu(int phys_apicid, un
 }
 #endif	/* WAKE_SECONDARY_VIA_INIT */
 
+static void __init setup_irq_stack(struct task_struct *p, int cpu)
+{
+	unsigned long stk;
+
+	stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER);
+	if (!stk)
+		panic("I can't seem to allocate my irq stack.  Oh well, giving up.");
+
+	irq_stacks[cpu] = (void *)stk;
+	memset(irq_stacks[cpu], 0, THREAD_SIZE);
+	irq_stacks[cpu]->thread_info.cpu = cpu;
+	irq_stacks[cpu]->thread_info.preempt_count = 1;
+					/* interrupts are not preemptable */
+	p->thread_info->irq_stack = &irq_stacks[cpu]->thread_info;
+
+	/* If we want to make the irq stack more than one unit
+	 * deep, we can chain then off of the irq_stack pointer
+	 * here.
+	 */
+}
+
+
 extern unsigned long cpu_initialized;
 
 static int __init do_boot_cpu(int apicid)
@@ -793,6 +820,8 @@ static int __init do_boot_cpu(int apicid
 	idle = fork_by_hand();
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
+
+	setup_irq_stack(idle, cpu);
 
 	/*
 	 * We remove it from the pidhash and the runqueue
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/include/asm-i386/linkage.h 221-interrupt_stacks/include/asm-i386/linkage.h
--- 220-thread_info_cleanup/include/asm-i386/linkage.h	Sun Nov 17 20:29:46 2002
+++ 221-interrupt_stacks/include/asm-i386/linkage.h	Mon Feb 17 10:57:57 2003
@@ -3,6 +3,7 @@
 
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 #define FASTCALL(x)	x __attribute__((regparm(3)))
+#define IRQHANDLER(x)	x __attribute__((regparm(1)))
 
 #ifdef CONFIG_X86_ALIGNMENT_16
 #define __ALIGN .align 16,0x90
diff -urpN -X /home/fletch/.diff.exclude 220-thread_info_cleanup/include/asm-i386/thread_info.h 221-interrupt_stacks/include/asm-i386/thread_info.h
--- 220-thread_info_cleanup/include/asm-i386/thread_info.h	Mon Feb 17 10:57:56 2003
+++ 221-interrupt_stacks/include/asm-i386/thread_info.h	Mon Feb 17 10:57:57 2003
@@ -30,9 +30,11 @@ struct thread_info {
 	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	/* thread address space:
+						   0 for interrupts: illegal
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
+	struct thread_info	*irq_stack;	/* pointer to cpu irq stack */
 	struct restart_block    restart_block;
 
 	__u8			supervisor_stack[0];
@@ -47,7 +49,8 @@ struct thread_info {
 #define TI_CPU		0x0000000C
 #define TI_PRE_COUNT	0x00000010
 #define TI_ADDR_LIMIT	0x00000014
-#define TI_RESTART_BLOCK 0x0000018
+#define TI_IRQ_STACK	0x00000018
+#define TI_RESTART_BLOCK 0x0000022
 
 #endif
 
@@ -63,17 +66,18 @@ struct thread_info {
 
 #ifndef __ASSEMBLY__
 
-#define INIT_THREAD_INFO(tsk)			\
-{						\
-	.task		= &tsk,         	\
-	.exec_domain	= &default_exec_domain,	\
-	.flags		= 0,			\
-	.cpu		= 0,			\
-	.preempt_count	= 1,			\
-	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
+#define INIT_THREAD_INFO(tsk)				\
+{							\
+	.task		= &tsk,         		\
+	.exec_domain	= &default_exec_domain,		\
+	.flags		= 0,				\
+	.cpu		= 0,				\
+	.preempt_count	= 1,				\
+	.addr_limit	= KERNEL_DS,			\
+	.irq_stack	= &init_irq_union.thread_info,	\
+	.restart_block = {				\
+		.fn = do_no_restart_syscall,		\
+	}						\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)

[-- Attachment #4: 222-stack_usage_check --]
[-- Type: text/plain, Size: 6600 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/Kconfig 222-stack_usage_check/arch/i386/Kconfig
--- 221-interrupt_stacks/arch/i386/Kconfig	Mon Feb 17 10:57:57 2003
+++ 222-stack_usage_check/arch/i386/Kconfig	Mon Feb 17 10:57:57 2003
@@ -1764,6 +1764,25 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config X86_STACK_CHECK
+	bool "Detect stack overflows"
+	depends on FRAME_POINTER
+	help
+	  Say Y here to have the kernel attempt to detect when the per-task
+	  kernel stack overflows.  This is much more robust checking than
+	  the above overflow check, which will only occasionally detect
+	  an overflow.  The level of guarantee here is much greater.
+	
+	  Some older versions of gcc don't handle the -p option correctly.  
+	  Kernprof is affected by the same problem, which is described here:
+	  http://oss.sgi.com/projects/kernprof/faq.html#Q9
+	
+	  Basically, if you get oopses in __free_pages_ok during boot when
+	  you have this turned on, you need to fix gcc.  The Redhat 2.96 
+	  version and gcc-3.x seem to work.  
+	
+	  If not debugging a stack overflow problem, say N
+
 config X86_EXTRA_IRQS
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/Makefile 222-stack_usage_check/arch/i386/Makefile
--- 221-interrupt_stacks/arch/i386/Makefile	Sun Feb 16 13:18:58 2003
+++ 222-stack_usage_check/arch/i386/Makefile	Mon Feb 17 10:57:57 2003
@@ -76,6 +76,10 @@ mcore-$(CONFIG_X86_SUMMIT)  := mach-defa
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS += -p
+endif
+
 head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 
 libs-y 					+= arch/i386/lib/
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/boot/compressed/misc.c 222-stack_usage_check/arch/i386/boot/compressed/misc.c
--- 221-interrupt_stacks/arch/i386/boot/compressed/misc.c	Thu Jan  2 22:04:58 2003
+++ 222-stack_usage_check/arch/i386/boot/compressed/misc.c	Mon Feb 17 10:57:57 2003
@@ -377,3 +377,7 @@ asmlinkage int decompress_kernel(struct 
 	if (high_loaded) close_output_buffer_if_we_run_high(mv);
 	return high_loaded;
 }
+
+/* We don't actually check for stack overflows this early. */
+__asm__(".globl mcount ; mcount: ret\n");
+
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/kernel/entry.S 222-stack_usage_check/arch/i386/kernel/entry.S
--- 221-interrupt_stacks/arch/i386/kernel/entry.S	Mon Feb 17 10:57:57 2003
+++ 222-stack_usage_check/arch/i386/kernel/entry.S	Mon Feb 17 10:57:57 2003
@@ -640,6 +640,61 @@ ENTRY(spurious_interrupt_bug)
 	pushl $do_spurious_interrupt_bug
 	jmp error_code
 
+
+#ifdef CONFIG_X86_STACK_CHECK
+.data
+	.globl	stack_overflowed
+stack_overflowed:
+	.long	0
+.text
+
+ENTRY(mcount)
+	push %eax
+	movl $(THREAD_SIZE - 1),%eax
+	andl %esp,%eax
+	cmpl $STACK_WARN,%eax	/* more than half the stack is used*/
+	jle 1f
+2:
+	popl %eax
+	ret
+1:	
+	lock;   btsl    $0,stack_overflowed
+	jc      2b
+	
+	# switch to overflow stack
+	movl	%esp,%eax
+	movl	$(stack_overflow_stack + THREAD_SIZE - 4),%esp
+
+	pushf
+	cli
+	pushl	%eax
+
+	# push eip then esp of error for stack_overflow_panic
+	pushl	4(%eax)
+	pushl	%eax
+
+	# update the task pointer and cpu in the overflow stack's thread_info.
+	GET_THREAD_INFO_WITH_ESP(%eax)
+	movl	TI_TASK(%eax),%ebx
+	movl	%ebx,stack_overflow_stack+TI_TASK
+	movl	TI_CPU(%eax),%ebx
+	movl	%ebx,stack_overflow_stack+TI_CPU
+
+	call	stack_overflow
+
+	# pop off call arguments
+	addl	$8,%esp 
+
+	popl	%eax
+	popf
+	movl	%eax,%esp
+	popl	%eax
+	movl	$0,stack_overflowed
+	ret
+
+#warning stack check enabled
+#endif
+
 .data
 ENTRY(sys_call_table)
 	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/kernel/i386_ksyms.c 222-stack_usage_check/arch/i386/kernel/i386_ksyms.c
--- 221-interrupt_stacks/arch/i386/kernel/i386_ksyms.c	Sun Feb 16 15:10:06 2003
+++ 222-stack_usage_check/arch/i386/kernel/i386_ksyms.c	Mon Feb 17 10:57:57 2003
@@ -228,3 +228,8 @@ EXPORT_SYMBOL(kmap_atomic_to_page);
 EXPORT_SYMBOL(edd);
 EXPORT_SYMBOL(eddnr);
 #endif
+
+#ifdef CONFIG_X86_STACK_CHECK
+extern void mcount(void);
+EXPORT_SYMBOL(mcount);
+#endif
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/kernel/init_task.c 222-stack_usage_check/arch/i386/kernel/init_task.c
--- 221-interrupt_stacks/arch/i386/kernel/init_task.c	Mon Feb 17 10:57:57 2003
+++ 222-stack_usage_check/arch/i386/kernel/init_task.c	Mon Feb 17 10:57:57 2003
@@ -17,6 +17,10 @@ struct mm_struct init_mm = INIT_MM(init_
 union thread_union init_irq_union
 	__attribute__((__section__(".data.init_task")));
 
+#ifdef CONFIG_X86_STACK_CHECK
+union thread_union stack_overflow_stack
+	__attribute__((__section__(".data.init_task")));
+#endif
 
 /*
  * Initial thread structure.
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/arch/i386/kernel/process.c 222-stack_usage_check/arch/i386/kernel/process.c
--- 221-interrupt_stacks/arch/i386/kernel/process.c	Mon Feb 17 10:57:57 2003
+++ 222-stack_usage_check/arch/i386/kernel/process.c	Mon Feb 17 10:57:57 2003
@@ -159,7 +159,25 @@ static int __init idle_setup (char *str)
 
 __setup("idle=", idle_setup);
 
-void show_regs(struct pt_regs * regs)
+void stack_overflow(unsigned long esp, unsigned long eip)
+{
+	int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC);
+
+	printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%x %d %d\n", 
+		esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing );
+	
+	if (panicing)
+		print_symbol("stack overflow from %s\n", eip);
+	else
+		print_symbol("excessive stack use from %s\n", eip);
+	printk("esp: %p\n", (void*)esp);
+	show_trace((void*)esp);
+	
+	if (panicing)
+		panic("stack overflow\n");
+}
+
+asmlinkage void show_regs(struct pt_regs * regs)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
 
diff -urpN -X /home/fletch/.diff.exclude 221-interrupt_stacks/include/asm-i386/thread_info.h 222-stack_usage_check/include/asm-i386/thread_info.h
--- 221-interrupt_stacks/include/asm-i386/thread_info.h	Mon Feb 17 10:57:57 2003
+++ 222-stack_usage_check/include/asm-i386/thread_info.h	Mon Feb 17 10:57:57 2003
@@ -63,6 +63,8 @@ struct thread_info {
  */
 #define THREAD_ORDER 1 
 #define INIT_THREAD_SIZE       THREAD_SIZE
+#define STACK_PANIC		0x200ul
+#define STACK_WARN		((THREAD_SIZE)>>1)
 
 #ifndef __ASSEMBLY__
 

[-- Attachment #5: 223-4k_stacks --]
[-- Type: text/plain, Size: 1664 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 222-stack_usage_check/arch/i386/Kconfig 223-4k_stacks/arch/i386/Kconfig
--- 222-stack_usage_check/arch/i386/Kconfig	Mon Feb 17 10:57:57 2003
+++ 223-4k_stacks/arch/i386/Kconfig	Mon Feb 17 10:57:58 2003
@@ -742,6 +742,16 @@ config SHAREPTE
 	  level of the page table between address spaces that are sharing data
 	  pages.
 
+config 4K_STACK
+	bool "Use smaller 4k per-task stacks"
+	help
+	  This option will shrink the kernel's per-task stack from 8k to
+	  4k.  This will greatly increase your chance of overflowing it.
+	  But, if you use the per-cpu interrupt stacks as well, your chances
+	  go way down.  Also try the CONFIG_X86_STACK_CHECK overflow
+	  detection.  It is much more reliable than the currently in-kernel
+	  version.
+
 config MATH_EMULATION
 	bool "Math emulation"
 	---help---
diff -urpN -X /home/fletch/.diff.exclude 222-stack_usage_check/include/asm-i386/thread_info.h 223-4k_stacks/include/asm-i386/thread_info.h
--- 222-stack_usage_check/include/asm-i386/thread_info.h	Mon Feb 17 10:57:57 2003
+++ 223-4k_stacks/include/asm-i386/thread_info.h	Mon Feb 17 10:57:58 2003
@@ -61,10 +61,16 @@ struct thread_info {
  *
  * preempt_count needs to be 1 initially, until the scheduler is functional.
  */
-#define THREAD_ORDER 1 
+#ifdef CONFIG_4K_STACK
+#define THREAD_ORDER 0
+#define STACK_WARN		0x200
+#define STACK_PANIC		0x100
+#else
+#define THREAD_ORDER 1
+#define STACK_WARN              ((THREAD_SIZE)>>1)
+#define STACK_PANIC             0x100
+#endif
 #define INIT_THREAD_SIZE       THREAD_SIZE
-#define STACK_PANIC		0x200ul
-#define STACK_WARN		((THREAD_SIZE)>>1)
 
 #ifndef __ASSEMBLY__
 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 15:43         ` Linus Torvalds
  2003-02-20 15:52           ` Ingo Molnar
  2003-02-20 16:11           ` Martin J. Bligh
@ 2003-02-20 16:44           ` Ingo Molnar
  2003-02-20 20:13           ` Chris Wedgwood
  3 siblings, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 16:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:

> Ok, this is definitely a stack overflow:

> Does anybody have an up-to-date "use -gp and a special 'mcount()'
> function to check stack depth" patch? The CONFIG_DEBUG_STACKOVERFLOW
> thing is quite possibly too stupid to find things like this (it only
> finds interrupts that overflow the stack, not deep call sequences).

i had CONFIG_DEBUG_STACKOVERFLOW on, but i'll make it more aggressive. It's
fairly easy to reproduce the oops. (at least it was when i was trying to
avoid them :-)

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:11           ` Martin J. Bligh
@ 2003-02-20 16:54             ` Linus Torvalds
  2003-02-20 17:24               ` Jeff Garzik
                                 ` (3 more replies)
  2003-02-20 23:09             ` Chris Wedgwood
  1 sibling, 4 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 16:54 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Ingo Molnar, Dave Hansen, Zwane Mwaikambo, Chris Wedgwood,
	Kernel Mailing List, William Lee Irwin III


On Thu, 20 Feb 2003, Martin J. Bligh wrote:
> 
> There are patches in -mjb from Dave Hansen / Ben LaHaise to detect stack
> overflow included with the stuff for the 4K stacks patch (intended for 
> scaling to large numbers of tasks). I've split them out attached, should 
> apply to mainline reasonably easily.

Ok, the 4kB stack definitely won't work in real life, but that's because 
we have some hopelessly bad stack users in the kernel. But the debugging 
part would be good to try (in fact, it might be a good idea to keep the 
8kB stack, but with rather anal debugging. Just the "mcount" part should 
do that).

A sorted list of bad stack users (more than 256 bytes) in my default build
follows. Anybody can create their own with something like

	objdump -d linux/vmlinux |
		grep 'sub.*$0x...,.*esp' |
		awk '{ print $9,$1 }' |
		sort > bigstack

and a script to look up the addresses.

That ide_unregister() thing uses up >2kB in just one call! And there are 
several in the 1.5kB range too, with a long list of ~500 byte offenders.

Yeah, and this assumes we don't have alloca() users or other dynamic 
stack allocators (non-constant-size automatic arrays). I hope we don't 
have that kind of crap anywhere..

			Linus

-----
0xc02ae062 <ide_unregister+8>:				sub    $0x8c4,%esp
0xc010535d <huft_build+9>:				sub    $0x5b0,%esp
0xc0326a53 <snd_pcm_oss_change_params+6>:		sub    $0x590,%esp
0xc0106156 <inflate_dynamic+6>:				sub    $0x554,%esp
0xc0176150 <elf_core_dump+13>:				sub    $0x4b4,%esp
0xc0105fb8 <inflate_fixed+7>:				sub    $0x4ac,%esp
0xc035935e <pci_sanity_check+6>:			sub    $0x398,%esp
0xc035986d <pcibios_fixup_peer_bridges+5>:		sub    $0x394,%esp
0xc0334b85 <snd_pcm_hw_params_old_user+8>:		sub    $0x37c,%esp
0xc0334a97 <snd_pcm_hw_refine_old_user+8>:		sub    $0x37c,%esp
0xc02fbc74 <cb_alloc+6>:				sub    $0x32c,%esp
0xc0211b2a <pci_do_scan_bus+14>:			sub    $0x314,%esp
0xc034be58 <snd_seq_midisynth_register_port+12>:	sub    $0x2f0,%esp
0xc0264406 <extract_entropy+6>:				sub    $0x2d8,%esp
0xc02fcdde <ds_ioctl+3>:				sub    $0x2c8,%esp
0xc01dbd6b <udf_load_pvoldesc+6>:			sub    $0x2bc,%esp
0xc0329c6e <snd_pcm_oss_proc_write+6>:			sub    $0x298,%esp
0xc02a218f <pcnet_config+6>:				sub    $0x294,%esp
0xc01c8457 <nlmclnt_proc+14>:				sub    $0x294,%esp
0xc0327ecc <snd_pcm_oss_get_formats+12>:		sub    $0x290,%esp
0xc01d781f <udf_add_entry+6>:				sub    $0x290,%esp
0xc01c8e56 <nlmclnt_reclaim+18>:			sub    $0x280,%esp
0xc0330802 <snd_pcm_hw_params_user+8>:			sub    $0x27c,%esp
0xc03304af <snd_pcm_hw_refine_user+8>:			sub    $0x27c,%esp
0xc01ea4c9 <reiserfs_rename+13>:			sub    $0x27c,%esp
0xc029b57c <e100_ethtool_eeprom+10>:			sub    $0x260,%esp
0xc020a9df <semctl_main+12>:				sub    $0x25c,%esp
0xc0267205 <do_kdgkb_ioctl+24>:				sub    $0x244,%esp
0xc01d0ac8 <do_udf_readdir+6>:				sub    $0x240,%esp
0xc01e137a <udf_get_filename+3>:			sub    $0x23c,%esp
0xc01bd38c <find_exported_dentry+8>:			sub    $0x234,%esp
0xc01a5fa4 <fat_readdirx+15>:				sub    $0x230,%esp
0xc01fe813 <reiserfs_delete_solid_item+6>:		sub    $0x22c,%esp
0xc031f24d <snd_iprintf+3>:				sub    $0x21c,%esp
0xc02b4d6f <cdrom_read_intr+8>:				sub    $0x21c,%esp
0xc024adfb <pnp_printf+3>:				sub    $0x218,%esp
0xc02b4cac <cdrom_buffer_sectors+11>:			sub    $0x210,%esp
0xc01ebf96 <reiserfs_get_block+8>:			sub    $0x210,%esp
0xc020b2f0 <sys_semtimedop+3>:				sub    $0x208,%esp
0xc01fe58d <reiserfs_delete_item+12>:			sub    $0x208,%esp
0xc0529e98 <snd_seq_oss_create_client+12>:		sub    $0x204,%esp
0xc038efed <tcp_check_req+6>:				sub    $0x1f8,%esp
0xc038b462 <tcp_v4_conn_request+6>:			sub    $0x1f8,%esp
0xc01fef81 <reiserfs_cut_from_item+6>:			sub    $0x1f8,%esp
0xc038df7f <tcp_timewait_state_process+8>:		sub    $0x1e4,%esp
0xc0325539 <snd_mixer_oss_build_input+3>:		sub    $0x1e0,%esp
0xc01d9328 <udf_symlink+13>:				sub    $0x1cc,%esp
0xc01ffb15 <reiserfs_insert_item+6>:			sub    $0x1c4,%esp
0xc01ffa03 <reiserfs_paste_into_item+6>:		sub    $0x1c4,%esp
0xc01c43b6 <svc_export_parse+3>:			sub    $0x1c4,%esp
0xc02f6770 <pcmcia_validate_cis+3>:			sub    $0x1c0,%esp
0xc052a2c7 <snd_seq_system_client_init+24>:		sub    $0x1bc,%esp
0xc03511c9 <snd_intel8x0_mixer+13>:			sub    $0x1bc,%esp
0xc01a54f8 <fat_search_long+6>:				sub    $0x1b4,%esp
0xc052a0a1 <snd_seq_oss_midi_lookup_ports+9>:		sub    $0x1ac,%esp
0xc02e99f5 <sg_ioctl+6>:				sub    $0x19c,%esp
0xc0320fb0 <snd_ctl_card_info+12>:			sub    $0x198,%esp
0xc0171860 <ep_send_events+8>:				sub    $0x198,%esp
0xc0155ad4 <blkdev_get+11>:				sub    $0x194,%esp
0xc01b3bea <nfs_symlink+6>:				sub    $0x18c,%esp
0xc01b2699 <nfs_readdir+9>:				sub    $0x18c,%esp
0xc01b347d <nfs_mknod+6>:				sub    $0x17c,%esp
0xc01d71e3 <udf_find_entry+6>:				sub    $0x178,%esp
0xc01b333d <nfs_create+6>:				sub    $0x178,%esp
0xc01b35ca <nfs_mkdir+6>:				sub    $0x174,%esp
0xc02873a3 <radeon_cp_vertex2+3>:			sub    $0x16c,%esp
0xc01583a5 <do_execve+3>:				sub    $0x158,%esp
0xc033e177 <snd_seq_oss_ioctl+3>:			sub    $0x154,%esp
0xc02f13d9 <mmc_ioctl+3>:				sub    $0x154,%esp
0xc017d267 <elf_kcore_store_hdr+6>:			sub    $0x150,%esp
0xc01f048d <reiserfs_readdir+6>:			sub    $0x148,%esp
0xc01b28aa <nfs_lookup_revalidate+11>:			sub    $0x148,%esp
0xc036d0e8 <rt_cache_seq_show+6>:			sub    $0x144,%esp
0xc01d4115 <udf_fill_inode+6>:				sub    $0x144,%esp
0xc032fec8 <snd_pcm_info_user+3>:			sub    $0x140,%esp
0xc0286167 <radeon_cp_clear+3>:				sub    $0x13c,%esp
0xc019608f <journal_commit_transaction+6>:		sub    $0x13c,%esp
0xc0174db5 <load_elf_binary+20>:			sub    $0x13c,%esp
0xc03b5ba4 <ip_map_parse+3>:				sub    $0x138,%esp
0xc035c698 <sys_sendmsg+8>:				sub    $0x134,%esp
0xc02f66fe <read_tuple+3>:				sub    $0x134,%esp
0xc01b2ed9 <nfs_lookup+6>:				sub    $0x134,%esp
0xc0172105 <aout_core_dump+21>:				sub    $0x134,%esp
0xc02df535 <ahc_linux_proc_info+11>:			sub    $0x130,%esp
0xc02d8097 <ahc_linux_info+16>:				sub    $0x130,%esp
0xc034d3db <snd_rawmidi_info_select_user+3>:		sub    $0x12c,%esp
0xc032e77a <snd_pcm_proc_info_read+4>:			sub    $0x12c,%esp
0xc0308874 <proc_getdriver+3>:				sub    $0x12c,%esp
0xc01d4c5c <udf_update_inode+6>:			sub    $0x12c,%esp
0xc034d2a5 <snd_rawmidi_info_user+3>:			sub    $0x128,%esp
0xc01e148f <udf_put_filename+3>:			sub    $0x128,%esp
0xc01d9c88 <udf_rename+6>:				sub    $0x128,%esp
0xc0325433 <snd_mixer_oss_build_test+3>:		sub    $0x124,%esp
0xc0321351 <snd_ctl_elem_info+11>:			sub    $0x124,%esp
0xc02f4c26 <verify_cis_cache+6>:			sub    $0x124,%esp
0xc0242307 <acpi_pci_bind+32>:				sub    $0x124,%esp
0xc01e8aff <reiserfs_add_entry+11>:			sub    $0x124,%esp
0xc01cc029 <nlmsvc_proc_granted_msg+3>:			sub    $0x124,%esp
0xc01cbfab <nlmsvc_proc_unlock_msg+3>:			sub    $0x124,%esp
0xc01cbf2d <nlmsvc_proc_cancel_msg+3>:			sub    $0x124,%esp
0xc01cbeaf <nlmsvc_proc_lock_msg+3>:			sub    $0x124,%esp
0xc01cbe31 <nlmsvc_proc_test_msg+3>:			sub    $0x124,%esp
0xc017c649 <meminfo_read_proc+15>:			sub    $0x124,%esp
0xc016a6b5 <setxattr+8>:				sub    $0x124,%esp
0xc01e37ef <autofs4_expire_run+12>:			sub    $0x120,%esp
0xc0198244 <log_do_checkpoint+6>:			sub    $0x120,%esp
0xc016a969 <getxattr+3>:				sub    $0x120,%esp
0xc0257e97 <parport_pc_probe_port+12>:			sub    $0x11c,%esp
0xc024263a <acpi_pci_bind_root+32>:			sub    $0x11c,%esp
0xc01e3118 <autofs4_notify_daemon+12>:			sub    $0x11c,%esp
0xc035c92a <sys_recvmsg+3>:				sub    $0x118,%esp
0xc031c91e <i8042_interrupt+8>:				sub    $0x118,%esp
0xc02ee068 <sg_proc_hoststrs_info+6>:			sub    $0x118,%esp
0xc02551f0 <do_autoprobe+3>:				sub    $0x118,%esp
0xc0241aab <acpi_pci_irq_add_prt+20>:			sub    $0x118,%esp
0xc016adc3 <removexattr+3>:				sub    $0x118,%esp
0xc02deecd <copy_info+3>:				sub    $0x114,%esp
0xc02c05c8 <scsi_request_sense+6>:			sub    $0x114,%esp
0xc020c619 <sys_shmctl+3>:				sub    $0x114,%esp
0xc0203c55 <reiserfs_breada+6>:				sub    $0x114,%esp
0xc012a88b <sys_reboot+10>:				sub    $0x114,%esp
0xc052aeea <pirq_peer_trick+13>:			sub    $0x110,%esp
0xc01a059f <ext2_get_parent+3>:				sub    $0x110,%esp
0xc01719cf <ep_events_transfer+11>:			sub    $0x110,%esp
0xc02efd9b <dvd_read_bca+3>:				sub    $0x10c,%esp
0xc02550e0 <do_active_device+8>:			sub    $0x10c,%esp
0xc01d2ab5 <inode_getblk+6>:				sub    $0x10c,%esp
0xc01898b5 <ext3_get_parent+12>:			sub    $0x10c,%esp
0xc024839d <acpi_bus_match+8>:				sub    $0x108,%esp
0xc029ac87 <e100_do_ethtool_ioctl+10>:			sub    $0x100,%esp
0xc01beba6 <write_filehandle+3>:			sub    $0x100,%esp


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:54             ` Linus Torvalds
@ 2003-02-20 17:24               ` Jeff Garzik
  2003-02-20 21:21               ` Alan Cox
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 52+ messages in thread
From: Jeff Garzik @ 2003-02-20 17:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Martin J. Bligh, Ingo Molnar, Dave Hansen, Zwane Mwaikambo,
	Chris Wedgwood, Kernel Mailing List, William Lee Irwin III

On Thu, Feb 20, 2003 at 08:54:55AM -0800, Linus Torvalds wrote:
> A sorted list of bad stack users (more than 256 bytes) in my default build
> follows. Anybody can create their own with something like
[...]

Yum.  Thanks for this list (and means to reproduce)...

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
       [not found] <Pine.LNX.4.44.0302201830580.474-100000@localhost.localdomain>
@ 2003-02-20 18:01 ` Linus Torvalds
  2003-02-20 18:23   ` Linus Torvalds
  2003-02-20 19:00   ` Ingo Molnar
  0 siblings, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 18:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Ingo Molnar wrote:
> 
> a true heisenbug. I cannot reproduce it anymore. Anyway, from the serial
> console i collected 3 instances of crashes - whatever it's worth.

Pretty much every single time, release_task() has been there on the
backtrace.

In fact, I bet you this code in do_exit() is the cause:

        preempt_disable();

        if (tsk->exit_signal == -1)
***             release_task(tsk);	***

        schedule();

Note how "release_task()" will be releasing the stack that the process is 
running on right now. And the reason it doesn't crash _every_ time is 
simply that you need to have:

 - another memory allocation that picks up that page and fills it with
   something else in order to get a corrupted stack
 - and something delays schedule() so that you have time to race _and_ you 
   need the stack. Which is why most of the oopses have an interrupt come 
   in inside schedule (see the "common_interrupt()" thing in the traces).

In other words, I think we need to have schedule_tail() do the 
release_task(), otherwise we'd release it too early while the task 
structure (and the stack) are both still in use.

You owe me a patch.

			Linus

---

>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c010bc28>] handle_IRQ_event+0x38/0x60
>  [<c010bf6b>] do_IRQ+0x14b/0x1e0
>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c010bf6b>] do_IRQ+0x14b/0x1e0
>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0


>  [<c010bf6b>] do_IRQ+0x14b/0x1e0
>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c011e06c>] __put_task_struct+0x7c/0x90
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c010bc28>] handle_IRQ_event+0x38/0x60
>  [<c010bf6b>] do_IRQ+0x14b/0x1e0
>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0

>  [<c010bc28>] handle_IRQ_event+0x38/0x60
>  [<c010bf6b>] do_IRQ+0x14b/0x1e0
>  [<c010a594>] common_interrupt+0x18/0x20
>  [<c010a691>] error_code+0x2d/0x38
>  [<c011b881>] schedule+0x3a1/0x3d0
>  [<c01219fd>] release_task+0x17d/0x200
>  [<c011e70f>] mmput+0x1f/0xc0
>  [<c0122cad>] do_exit+0x31d/0x3b0




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 18:01 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
@ 2003-02-20 18:23   ` Linus Torvalds
  2003-02-20 19:36     ` Ingo Molnar
  2003-02-20 19:00   ` Ingo Molnar
  1 sibling, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 18:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:
> 
> In other words, I think we need to have schedule_tail() do the 
> release_task(), otherwise we'd release it too early while the task 
> structure (and the stack) are both still in use.

Well, it's not "schedule_tail()" any more, since that is no longer called 
by the normal schedule end-path.

Test suggestion:

 - remove the 

        if (tsk->exit_signal == -1)
                release_task(tsk);

   from kernel/exit.c

 - make "finish_switch()" something like

	/*
	 * Wrapper to be used in place of finish_arch_switch(): completes the
	 * arch-specific part of the switch and then, if the task we switched
	 * away from is a zombie with exit_signal == -1, releases it here
	 * instead of in do_exit().
	 */
	static void inline finish_switch(struct runqueue *rq, struct task_struct *prev)
	{
		finish_arch_switch(rq, prev);
		/* Self-reaping zombie: release it here rather than from do_exit(). */
		if ((prev->state & TASK_ZOMBIE) && (prev->exit_signal == -1))
			release_task(prev);
	}

 - make all of "kernel/sched.c" use "finish_switch()" instead of 
   "finish_arch_switch()" (ie replace it in both schedule_tail() and the
   end of schedule() itself).

At some point we can think about trying to speed up that test for 
release_task(), ie add some extra task-state or something that is set in 
kernel/exit.c so that we don't slow down the task switching unnecessarily.

How does this sound?

Also, for debugging, how about this simple (but expensive) debugging thing
that only works without HIGHMEM (and is obviously whitespace-damaged due
to indenting it):

	--- 1.148/mm/page_alloc.c	Wed Feb  5 20:05:13 2003
	+++ edited/mm/page_alloc.c	Thu Feb 20 10:22:42 2003
	@@ -685,6 +685,7 @@
	 void __free_pages(struct page *page, unsigned int order)
	 {
	 	if (!PageReserved(page) && put_page_testzero(page)) {
	+		memset(page_address(page), 0x01, PAGE_SIZE << order);
	 		if (order == 0)
	 			free_hot_page(page);
	 		else

which should show the effects of a buggy "release_task()" much more 
consistently.

Ehh?

		Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 18:01 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
  2003-02-20 18:23   ` Linus Torvalds
@ 2003-02-20 19:00   ` Ingo Molnar
  1 sibling, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 19:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:

> > a true heisenbug. I cannot reproduce it anymore. Anyway, from the serial
> > console i collected 3 instances of crashes - whatever it's worth.
> 
> Pretty much every single time, release_task() has been there on the
> backtrace.
> 
> In fact, I bet you this code in do_exit() is the cause:
> 
>         preempt_disable();
> 
>         if (tsk->exit_signal == -1)
> ***             release_task(tsk);	***
> 
>         schedule();
> 
> Note how "release_task()" will be releasing the stack that the process
> is running on right now. [...]

but, release_task() is a delayed thing for exactly this reason. It fills
out the per-CPU task_cache but does not free the task.

the release_task() + schedule() must be atomic though - ie. we must not be
preempted anytime inbetween [because that other task could free the
task_cache] - but i wasnt running with CONFIG_PREEMPT, so i cannot see how
it could happen.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 18:23   ` Linus Torvalds
@ 2003-02-20 19:36     ` Ingo Molnar
  2003-02-20 19:53       ` Ingo Molnar
  0 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 19:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

hm, i think i can see the SMP race.

the last put_task_struct() can also be done by procfs - and nothing keeps
it from freeing the task in __put_task_struct(), while the task struct is
after its final put_task_struct(), but before the switch_to().

this does not explain the UP crash though.

	Ingo

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 19:36     ` Ingo Molnar
@ 2003-02-20 19:53       ` Ingo Molnar
  2003-02-20 19:57         ` Ingo Molnar
  0 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 19:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Ingo Molnar wrote:

> hm, i think i can see the SMP race.
> 
> the last put_task_struct() can also be done by procfs - and nothing
> keeps it from freeing the task in __put_task_struct(), while the task
> struct is after its final put_task_struct(), but before the switch_to().

this race is correctly solved by moving the wait_task_inactive() from
release_task() into the tsk != current branch of __put_task_struct().

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 19:53       ` Ingo Molnar
@ 2003-02-20 19:57         ` Ingo Molnar
  2003-02-20 20:14           ` Ingo Molnar
  2003-02-20 20:17           ` Linus Torvalds
  0 siblings, 2 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 19:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


ie. something like:

(untested yet.)

--- linux/kernel/exit.c.orig2	2003-02-20 21:55:56.000000000 +0100
+++ linux/kernel/exit.c	2003-02-20 21:56:02.000000000 +0100
@@ -66,9 +66,6 @@
  
 	BUG_ON(p->state < TASK_ZOMBIE);
  
-	if (p != current)
-		wait_task_inactive(p);
-
 	atomic_dec(&p->user->processes);
 	security_task_free(p);
 	free_uid(p->user);
--- linux/kernel/fork.c.orig2	2003-02-20 21:55:59.000000000 +0100
+++ linux/kernel/fork.c	2003-02-20 21:57:07.000000000 +0100
@@ -75,6 +75,8 @@
 void __put_task_struct(struct task_struct *tsk)
 {
 	if (tsk != current) {
+	        if (tsk != current)
+			wait_task_inactive(tsk);
 		free_thread_info(tsk->thread_info);
 		kmem_cache_free(task_struct_cachep,tsk);
 	} else {


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 15:43         ` Linus Torvalds
                             ` (2 preceding siblings ...)
  2003-02-20 16:44           ` Ingo Molnar
@ 2003-02-20 20:13           ` Chris Wedgwood
  3 siblings, 0 replies; 52+ messages in thread
From: Chris Wedgwood @ 2003-02-20 20:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Zwane Mwaikambo, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Thu, Feb 20, 2003 at 07:43:16AM -0800, Linus Torvalds wrote:

> This could explain Chris' problems too - my doublefault thing won't
> help much if recursion on the stack has clobbered a lot of kernel
> state (and the doublefault will likely happen only after enough
> state is clobbered that even the doublefault handling might have
> trouble).

An overflow *might* explain why

  - it never happens under 2.4.x

  - for some configurations of 2.5.x it never seems to happen either

  - for some configurations of 2.5.x it does happen, but it's very
    nebulous as to which options are required to make this happen;
    very few options seem stable, many options crash quickly, and
    in between it lasts for what might be slightly longer periods of
    time

Now, one thing I'm using that many people may not be is XFS, ACLs &
quota.  Since IRIX has almost infinite memory available in
kernel-space, I should check to make sure XFS isn't sucking too much
stack space somewhere...  it could be that it is, and depending on the
right magic internal XFS state and when an interrupt arrives or
similar, something goes splat.

I have the stack checking on, but as observed it may not suffice.  I
wonder if 16k stacks are possible for testing?



  --cw


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 19:57         ` Ingo Molnar
@ 2003-02-20 20:14           ` Ingo Molnar
  2003-02-20 20:17           ` Linus Torvalds
  1 sibling, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 20:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Ingo Molnar wrote:

> ie. something like:
> 
> (untested yet.)

tested it - works fine, but i was unable to reproduce the crash in the
past couple of hours, so this datapoint is of little value ATM.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 19:57         ` Ingo Molnar
  2003-02-20 20:14           ` Ingo Molnar
@ 2003-02-20 20:17           ` Linus Torvalds
  2003-02-20 20:50             ` Andrew Morton
  2003-02-20 22:00             ` Ingo Molnar
  1 sibling, 2 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 20:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Thu, 20 Feb 2003, Ingo Molnar wrote:
> 
> ie. something like:

Well, please remove the double test for task inequality.

I like the patch conceptually, HOWEVER, I'm not sure it's correct. The 
thing is, moving the wait_task_inactive() to __put_task_struct() means 
that we will be doing the "release_task()" teardown while the task is 
still potentially active on another CPU.

In particular, we'll be freeing the security stuff and the signals while 
the process may still be active in the scheduler on another CPU. This can 
be dangerous, ie doing things like calling "free_uid()" on a process that 
is still running means that suddenly you have issues like not being able 
to trust "current->user" from interrupts. We may not care right now, but 
it's still wrong (imagine us doing per-user time accounting - which makes 
a _lot_ of sense).

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 21:21               ` Alan Cox
@ 2003-02-20 20:20                 ` Linus Torvalds
  2003-02-20 20:23                 ` Martin J. Bligh
  1 sibling, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 20:20 UTC (permalink / raw)
  To: Alan Cox
  Cc: Martin J. Bligh, Ingo Molnar, Dave Hansen, Zwane Mwaikambo,
	Chris Wedgwood, Linux Kernel Mailing List, William Lee Irwin III


On 20 Feb 2003, Alan Cox wrote:
> On Thu, 2003-02-20 at 16:54, Linus Torvalds wrote:
> > Ok, the 4kB stack definitely won't work in real life, but that's because 
> > we have some hopelessly bad stack users in the kernel. But the debugging 
> > part would be good to try (in fact, it might be a good idea to keep the 
> > 8kB stack, but with rather anal debugging. Just the "mcount" part should 
> > do that).
> 
> You also need IRQ stacks to get down to 4K. The wrong pattern of ten
> different IRQ handlers using a mere 200 bytes each will eventually
> happen and eventually kill you otherwise.

Martin's patch set included the per-IRQ stacks, so that part should be ok. 
However, since even a single function will overflow the stack depth test 
of "half the stack", I'm just saying that right now the 4kB stacks 
obviously shouldn't be used for overflow testing (and the 8kB stack 
version right now is way too permissive).

> > That ide_unregister() thing uses up >2kB in just one call! And there are 
> > several in the 1.5kB range too, with a long list of ~500 byte offenders.
> 
> ide_unregister is a really stupid one. It's copying a struct mostly to
> restore fields it shouldn't be restoring but should be setting in the 
> allocator. I hadn't realised quite how bad it was. Added to the ide
> shitlist

Well, ide_unregister() was only the worst of a fairly large bunch of crap. 

Although I guess nobody is really surprised.

		Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 21:21               ` Alan Cox
  2003-02-20 20:20                 ` Linus Torvalds
@ 2003-02-20 20:23                 ` Martin J. Bligh
  2003-02-20 20:42                   ` William Lee Irwin III
  1 sibling, 1 reply; 52+ messages in thread
From: Martin J. Bligh @ 2003-02-20 20:23 UTC (permalink / raw)
  To: Alan Cox, Linus Torvalds
  Cc: Ingo Molnar, Dave Hansen, Zwane Mwaikambo, Chris Wedgwood,
	Linux Kernel Mailing List, William Lee Irwin III

>> Ok, the 4kB stack definitely won't work in real life, but that's because 
>> we have some hopelessly bad stack users in the kernel. But the debugging 
>> part would be good to try (in fact, it might be a good idea to keep the 
>> 8kB stack, but with rather anal debugging. Just the "mcount" part should 
>> do that).
> 
> You also need IRQ stacks to get down to 4K. The wrong pattern of ten
> different IRQ handlers using a mere 200 bytes each will eventually
> happen and eventually kill you otherwise.

That's in Dave's patchset, and 4K stacks is a config option for now.

M.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 20:23                 ` Martin J. Bligh
@ 2003-02-20 20:42                   ` William Lee Irwin III
  2003-02-20 20:51                     ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20 20:42 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Alan Cox, Linus Torvalds, Ingo Molnar, Dave Hansen,
	Zwane Mwaikambo, Chris Wedgwood, Linux Kernel Mailing List

At some point in the past, _A_ wrote:
>> You also need IRQ stacks to get down to 4K. The wrong pattern of ten
>> different IRQ handlers using a mere 200 bytes each will eventually
>> happen and eventually kill you otherwise.

On Thu, Feb 20, 2003 at 12:23:49PM -0800, Martin J. Bligh wrote:
> That's in Dave's patchset, and 4K stacks is a config option for now.

You might want to grab aeb's fully non-recursive pathwalking if
you really want to cut back the stack to 4KB, as well as fixing
whatever stackblasting drivers are about.


-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 20:17           ` Linus Torvalds
@ 2003-02-20 20:50             ` Andrew Morton
  2003-02-20 22:04               ` Ingo Molnar
  2003-02-20 22:00             ` Ingo Molnar
  1 sibling, 1 reply; 52+ messages in thread
From: Andrew Morton @ 2003-02-20 20:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: mingo, zwane, cw, linux-kernel, mbligh, wli

Linus Torvalds <torvalds@transmeta.com> wrote:
>
> wait_task_inactive()

There are two other bugs in this exact area.  I received the below from Bill
Irwin and Rick Lindsley yesterday.  Can someone take this off my hands?


Fixes two deadlocks in the scheduler exit path:

1: We're calling mmdrop() under spin_lock_irq(&rq->lock).  But mmdrop
   calls vfree(), which calls smp_call_function().  

   It is not legal to call smp_call_function() with irq's off.  Because
   another CPU may be running smp_call_function() against _this_ CPU, which
   deadlocks.

   So the patch arranges for mmdrop() to not be called under
   spin_lock_irq(&rq->lock).

2: We are leaving local interrupts disabled coming out of exit_notify(). 
   But we are about to call wait_task_inactive() which spins, waiting for
   another CPU to end a task.  If that CPU has issued smp_call_function() to
   this CPU, deadlock.

   So the patch enables interrupts again before returning from exit_notify().

   Also, exit_notify() returns with preemption disabled, so there is no
   need to perform another preempt_disable() in do_exit().


 exit.c  |   17 +++++++++++------
 sched.c |   43 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 11 deletions(-)

diff -puN kernel/exit.c~wli-mem-leak-fix kernel/exit.c
--- 25/kernel/exit.c~wli-mem-leak-fix	2003-02-20 03:10:08.000000000 -0800
+++ 25-akpm/kernel/exit.c	2003-02-20 03:10:35.000000000 -0800
@@ -674,13 +674,19 @@ static void exit_notify(struct task_stru
 
 	tsk->state = TASK_ZOMBIE;
 	/*
-	 * No need to unlock IRQs, we'll schedule() immediately
-	 * anyway. In the preemption case this also makes it
-	 * impossible for the task to get runnable again (thus
-	 * the "_raw_" unlock - to make sure we don't try to
-	 * preempt here).
+	 * In the preemption case it must be impossible for the task
+	 * to get runnable again, so use "_raw_" unlock to keep
+	 * preempt_count elevated until we schedule().
+	 *
+	 * To avoid deadlock on SMP, interrupts must be unmasked.  If we
+	 * don't, subsequently called functions (e.g, wait_task_inactive()
+	 * via release_task()) will spin, with interrupt flags
+	 * unwittingly blocked, until the other task sleeps.  That task
+	 * may itself be waiting for smp_call_function() to answer and
+	 * complete, and with interrupts blocked that will never happen.
 	 */
 	_raw_write_unlock(&tasklist_lock);
+	local_irq_enable();
 }
 
 NORET_TYPE void do_exit(long code)
@@ -727,7 +733,6 @@ NORET_TYPE void do_exit(long code)
 
 	tsk->exit_code = code;
 	exit_notify(tsk);
-	preempt_disable();
 
 	if (tsk->exit_signal == -1)
 		release_task(tsk);
diff -puN kernel/sched.c~wli-mem-leak-fix kernel/sched.c
--- 25/kernel/sched.c~wli-mem-leak-fix	2003-02-20 03:10:08.000000000 -0800
+++ 25-akpm/kernel/sched.c	2003-02-20 03:10:08.000000000 -0800
@@ -152,6 +152,7 @@ struct runqueue {
 	unsigned long nr_running, nr_switches, expired_timestamp,
 			nr_uninterruptible;
 	task_t *curr, *idle;
+	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
 #ifdef CONFIG_NUMA
@@ -388,7 +389,10 @@ static inline void resched_task(task_t *
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time.
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
  */
 void wait_task_inactive(task_t * p)
 {
@@ -558,10 +562,22 @@ void sched_exit(task_t * p)
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
  */
 asmlinkage void schedule_tail(task_t *prev)
 {
-	finish_arch_switch(this_rq(), prev);
+	runqueue_t *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+
+	rq->prev_mm = NULL;
+	finish_arch_switch(rq, prev);
+	if (mm)
+		mmdrop(mm);
+
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
 }
@@ -570,7 +588,7 @@ asmlinkage void schedule_tail(task_t *pr
  * context_switch - switch to the new MM and the new
  * thread's register state.
  */
-static inline task_t * context_switch(task_t *prev, task_t *next)
+static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
 {
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
@@ -584,7 +602,8 @@ static inline task_t * context_switch(ta
 
 	if (unlikely(!prev->mm)) {
 		prev->active_mm = NULL;
-		mmdrop(oldmm);
+		WARN_ON(rq->prev_mm);
+		rq->prev_mm = oldmm;
 	}
 
 	/* Here we just switch the register state and the stack. */
@@ -1223,14 +1242,28 @@ switch_tasks:
 	RCU_qsctr(prev->thread_info->cpu)++;
 
 	if (likely(prev != next)) {
+		struct mm_struct *prev_mm;
 		rq->nr_switches++;
 		rq->curr = next;
 	
 		prepare_arch_switch(rq, next);
-		prev = context_switch(prev, next);
+		prev = context_switch(rq, prev, next);
 		barrier();
 		rq = this_rq();
+		prev_mm = rq->prev_mm;
+		rq->prev_mm = NULL;
+
+		/*
+		 * It's extremely important to drop the runqueue lock
+		 * before mmdrop(): on i386, destroy_context(), called
+		 * by mmdrop(), can potentially vfree() LDT's. This may
+		 * generate interrupts to processors spinning (with
+		 * interrupts blocked) on the runqueue lock we're holding.
+		 */
 		finish_arch_switch(rq, prev);
+
+		if (prev_mm)
+			mmdrop(prev_mm);
 	} else
 		spin_unlock_irq(&rq->lock);
 

_


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 20:42                   ` William Lee Irwin III
@ 2003-02-20 20:51                     ` Linus Torvalds
  0 siblings, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 20:51 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Martin J. Bligh, Alan Cox, Ingo Molnar, Dave Hansen,
	Zwane Mwaikambo, Chris Wedgwood, Linux Kernel Mailing List

On Thu, 20 Feb 2003, William Lee Irwin III wrote:
> 
> You might want to grab aeb's fully non-recursive pathwalking if
> you really want to cut back the stack to 4KB, as well as fixing
> whatever stackblasting drivers are about.

The path walking should really not be an issue. Each level of a symlink
takes something like 64 bytes of stack on x86 (I checked it some time ago,
maybe it's changed a bit), since the actual recursive part is very shallow
indeed.

And since we don't recurse deeper than 5 levels anyway, the symlink 
recursion ends up not being a real problem compared to a lot of other 
code (never mind the single functions with hundreds of bytes of stack 
space: just regular function calls 5 levels deep is quite normal).

That fs recursion was not the problem even back in the days when the max
stack depth was <3kB (4kB allocation, 1kB task_struct). It used to be 8
levels deep or something, it was changed to 5 not because we ran out on
x86, but because of those stupid sparc register windows (causing much
bigger minimum function stack requirements than on x86).

			Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:54             ` Linus Torvalds
  2003-02-20 17:24               ` Jeff Garzik
@ 2003-02-20 21:21               ` Alan Cox
  2003-02-20 20:20                 ` Linus Torvalds
  2003-02-20 20:23                 ` Martin J. Bligh
  2003-02-27 18:50               ` Randy.Dunlap
  2003-02-27 23:32               ` Randy.Dunlap
  3 siblings, 2 replies; 52+ messages in thread
From: Alan Cox @ 2003-02-20 21:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Martin J. Bligh, Ingo Molnar, Dave Hansen, Zwane Mwaikambo,
	Chris Wedgwood, Linux Kernel Mailing List, William Lee Irwin III

On Thu, 2003-02-20 at 16:54, Linus Torvalds wrote:
> Ok, the 4kB stack definitely won't work in real life, but that's because 
> we have some hopelessly bad stack users in the kernel. But the debugging 
> part would be good to try (in fact, it might be a good idea to keep the 
> 8kB stack, but with rather anal debugging. Just the "mcount" part should 
> do that).

You also need IRQ stacks to get down to 4K. The wrong pattern of ten
different IRQ handlers using a mere 200 bytes each will eventually
happen and eventually kill you otherwise.

> That ide_unregister() thing uses up >2kB in just one call! And there are 
> several in the 1.5kB range too, with a long list of ~500 byte offenders.

ide_unregister is a really stupid one. It's copying a struct mostly to
restore fields it shouldn't be restoring but should be setting in the 
allocator. I hadn't realised quite how bad it was. Added to the ide
shitlist



^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 20:17           ` Linus Torvalds
  2003-02-20 20:50             ` Andrew Morton
@ 2003-02-20 22:00             ` Ingo Molnar
  2003-02-20 22:32               ` Linus Torvalds
  1 sibling, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 22:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:

> > ie. something like:
> 
> Well, please remove the double test for task inequality.

ok.

> I like the patch conceptually, HOWEVER, I'm not sure it's correct. The
> thing is, moving the wait_task_inactive() to __put_task_struct() means
> that we will be doing the "release_task()" teardown while the task is
> still potentially active on another CPU.
> 
> In particular, we'll be freeing the security stuff and the signals while
> the process may still be active in the scheduler on another CPU. This
> can be dangerous, ie doing things like calling "free_uid()" on a process
> that is still running means that suddenly you have issues like not being
> able to trust "current->user" from interrupts. We may not care right
> now, but it's still wrong (imagine us doing per-user time accounting -
> which makes a _lot_ of sense).

well, we can do the wait_task_inactive() in both cases - in
release_task(), and in __put_task_struct(). [in the release_task() path
that will just be a nop]. This further simplifies the patch.

	Ingo

--- kernel/fork.c.orig
+++ kernel/fork.c
@@ -75,6 +75,7 @@
 void __put_task_struct(struct task_struct *tsk)
 {
 	if (tsk != current) {
+		wait_task_inactive(tsk);
 		free_thread_info(tsk->thread_info);
 		kmem_cache_free(task_struct_cachep,tsk);
 	} else {


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 20:50             ` Andrew Morton
@ 2003-02-20 22:04               ` Ingo Molnar
  2003-02-20 22:42                 ` William Lee Irwin III
  0 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 22:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, zwane, cw, linux-kernel, mbligh, wli


On Thu, 20 Feb 2003, Andrew Morton wrote:

> Fixes two deadlocks in the scheduler exit path:
> 
> 1: We're calling mmdrop() under spin_lock_irq(&rq->lock).  But mmdrop
>    calls vfree(), which calls smp_call_function().  

this has been fixed in the -F3 scheduler patch.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:00             ` Ingo Molnar
@ 2003-02-20 22:32               ` Linus Torvalds
  2003-02-20 22:40                 ` Linus Torvalds
                                   ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 22:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Ingo Molnar wrote:
> 
> well, we can do the wait_task_inactive() in both cases - in
> release_task(), and in __put_task_struct(). [in the release_task() path
> that will just be a nop]. This further simplifies the patch.

I think the _real_ simplification is to just have the task switch do this 
in the tail:

	if (prev->state & TASK_DEAD)
		put_task_struct(prev);

suddenly we don't have any issues at all with possibly freeing stuff 
before its time, since we're guaranteed to keep the process around until 
we've properly scheduled out of it.

Suggested patch (against current BK, which has the finish_task_switch() 
cleanups I mentioned earlier) appended. No special cases, no subtlety with 
__put_task_struct() caches, no nothing.

		Linus

-----
===== kernel/exit.c 1.97 vs edited =====
--- 1.97/kernel/exit.c	Thu Feb 20 03:10:35 2003
+++ edited/kernel/exit.c	Thu Feb 20 14:28:39 2003
@@ -103,7 +103,6 @@
 		dput(proc_dentry);
 	}
 	release_thread(p);
-	put_task_struct(p);
 }
 
 /* we are using it only for SMP init */
===== kernel/sched.c 1.160 vs edited =====
--- 1.160/kernel/sched.c	Thu Feb 20 05:42:54 2003
+++ edited/kernel/sched.c	Thu Feb 20 14:27:23 2003
@@ -581,6 +581,8 @@
 	finish_arch_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
+	if (prev->state & TASK_DEAD)
+		put_task_struct(prev);
 }
 
 /**


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:32               ` Linus Torvalds
@ 2003-02-20 22:40                 ` Linus Torvalds
  2003-02-20 22:45                   ` Linus Torvalds
  2003-02-20 22:57                 ` John Levon
  2003-02-20 23:21                 ` Ingo Molnar
  2 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 22:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:
> 
> Suggested patch (against current BK, which has the finish_task_switch() 
> cleanups I mentioned earlier) appended. No special cases, no subtlety with 
> __put_task_struct() caches, no nothing.

Yeah, don't bother to tell me it doesn't work. We need the task pointer to
include information on _both_ "I'm still using it" (the task itself) _and_
the "I'm waiting for it" case. So it's not just a matter of moving the
put_task() thing around, it needs to get the accounting right..

		Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:04               ` Ingo Molnar
@ 2003-02-20 22:42                 ` William Lee Irwin III
  2003-02-21  7:05                   ` Ingo Molnar
  0 siblings, 1 reply; 52+ messages in thread
From: William Lee Irwin III @ 2003-02-20 22:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Linus Torvalds, zwane, cw, linux-kernel, mbligh

On Thu, 20 Feb 2003, Andrew Morton wrote:
>> Fixes two deadlocks in the scheduler exit path:
>> 1: We're calling mmdrop() under spin_lock_irq(&rq->lock).  But mmdrop
>>    calls vfree(), which calls smp_call_function().  

On Thu, Feb 20, 2003 at 11:04:41PM +0100, Ingo Molnar wrote:
> this has been fixed in the -F3 scheduler patch.

Not quite. It leaks mm's because schedule_tail() isn't cleaning
up rq->prev_mm.


-- wli

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:40                 ` Linus Torvalds
@ 2003-02-20 22:45                   ` Linus Torvalds
  0 siblings, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 22:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:
> 
> Yeah, don't bother to tell me it doesn't work. We need the task pointer to
> include information on _both_ "I'm still using it" (the task itself) _and_
> the "I'm waiting for it" case. So it's not just a matter of moving the
> put_task() thing around, it needs to get the accounting right..

And the way to get the accounting right (I think) is actually truly 
trivial: we should initialize the task count to _two_ at process creation 
time, since we have two users (the parent who will do the wait, and our 
own usage).

This should mean that we'd actually have the process count right, and 
wouldn't need the games we play right now. Ie the patch should be 
something like the appended (which again is totally untested, it might 
easily have serious problems, that's not really the point. The point is 
that reference counting is the only sane memory management policy, and we 
did it wrong).

		Linus

---
===== kernel/fork.c 1.106 vs edited =====
--- 1.106/kernel/fork.c	Tue Feb 18 13:54:44 2003
+++ edited/kernel/fork.c	Thu Feb 20 14:42:25 2003
@@ -217,7 +217,9 @@
 	*tsk = *orig;
 	tsk->thread_info = ti;
 	ti->task = tsk;
-	atomic_set(&tsk->usage,1);
+
+	/* One for us, one for whoever does the "release_task()" (usually parent) */
+	atomic_set(&tsk->usage,2);
 	return tsk;
 }
 
===== kernel/sched.c 1.160 vs edited =====
--- 1.160/kernel/sched.c	Thu Feb 20 05:42:54 2003
+++ edited/kernel/sched.c	Thu Feb 20 14:27:23 2003
@@ -581,6 +581,8 @@
 	finish_arch_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
+	if (prev->state & TASK_DEAD)
+		put_task_struct(prev);
 }
 
 /**


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:32               ` Linus Torvalds
  2003-02-20 22:40                 ` Linus Torvalds
@ 2003-02-20 22:57                 ` John Levon
  2003-02-20 23:21                 ` Ingo Molnar
  2 siblings, 0 replies; 52+ messages in thread
From: John Levon @ 2003-02-20 22:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Thu, Feb 20, 2003 at 02:32:02PM -0800, Linus Torvalds wrote:

> I think the _real_ simplification is to just have the task switch do this 
> in the tail:
> 
> 	if (prev->state & TASK_DEAD)
> 		put_task_struct(prev);
> 
> suddenly we don't have any issues at all with possibly freeing stuff 
> before its time, since we're guaranteed to keep the process around until 
> we've properly scheduled out of it.

Side note ... if there's a sleepable context in which oprofile can
synchronise its buffers (i.e. after the task can possible run on a CPU
again, and before the task_struct itself is freed/reused), that would be
very handy.

Currently we're masking out any samples when PF_EXITING is set for
current(), which is obviously less than ideal.

Would this be such a spot ? Basically somewhere that profile_exit_task
can sit.

regards
john


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:11           ` Martin J. Bligh
  2003-02-20 16:54             ` Linus Torvalds
@ 2003-02-20 23:09             ` Chris Wedgwood
  1 sibling, 0 replies; 52+ messages in thread
From: Chris Wedgwood @ 2003-02-20 23:09 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Linus Torvalds, Ingo Molnar, Dave Hansen, Zwane Mwaikambo,
	Kernel Mailing List, William Lee Irwin III

On Thu, Feb 20, 2003 at 08:11:31AM -0800, Martin J. Bligh wrote:

> There are patches in -mjb from Dave Hansen / Ben LaHaise to detect
> stack overflow included with the stuff for the 4K stacks patch
> (intended for scaling to large numbers of tasks). I've split them
> out attached, should apply to mainline reasonably easily.

I tried with these patches and also wli's sched deadlock fix to see if
that helps.

Sadly not,  I can still easily reproduce a reboot.


  --cw

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:32               ` Linus Torvalds
  2003-02-20 22:40                 ` Linus Torvalds
  2003-02-20 22:57                 ` John Levon
@ 2003-02-20 23:21                 ` Ingo Molnar
  2003-02-20 23:36                   ` Linus Torvalds
  2 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-20 23:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:

> > well, we can do the wait_task_inactive() in both cases - in
> > release_task(), and in __put_task_struct(). [in the release_task() path
> > that will just be a nop]. This further simplifies the patch.
> 
> I think the _real_ simplification is to just have the task switch do
> this in the tail:

if possible i'd avoid putting more overhead into the scheduler - it's
clearly more performance-sensitive than the task create/exit path.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 23:21                 ` Ingo Molnar
@ 2003-02-20 23:36                   ` Linus Torvalds
  2003-02-21  7:00                     ` Ingo Molnar
  0 siblings, 1 reply; 52+ messages in thread
From: Linus Torvalds @ 2003-02-20 23:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Fri, 21 Feb 2003, Ingo Molnar wrote:
> 
> if possible i'd avoid putting more overhead into the scheduler - it's
> clearly more performance-sensitive than the task create/exit path.

This is a single non-serializing bit test, and if it means that the task 
counters are _right_, that's definitely the right thing to do.

		Linus


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 23:36                   ` Linus Torvalds
@ 2003-02-21  7:00                     ` Ingo Molnar
  2003-02-21 15:05                       ` Linus Torvalds
  0 siblings, 1 reply; 52+ messages in thread
From: Ingo Molnar @ 2003-02-21  7:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III


On Thu, 20 Feb 2003, Linus Torvalds wrote:

> > if possible i'd avoid putting more overhead into the scheduler - it's
> > clearly more performance-sensitive than the task create/exit path.
> 
> This is a single non-serializing bit test, and if it means that the task
> counters are _right_, that's definitely the right thing to do.

ok. Plus the wait_task_inactive() stuff was always a bit volatile. Now we
could in fact remove it from release_task(), right?

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 22:42                 ` William Lee Irwin III
@ 2003-02-21  7:05                   ` Ingo Molnar
  0 siblings, 0 replies; 52+ messages in thread
From: Ingo Molnar @ 2003-02-21  7:05 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Andrew Morton, Linus Torvalds, zwane, cw, linux-kernel, mbligh


On Thu, 20 Feb 2003, William Lee Irwin III wrote:

> >> 1: We're calling mmdrop() under spin_lock_irq(&rq->lock).  But mmdrop
> >>    calls vfree(), which calls smp_call_function().  
> 
> On Thu, Feb 20, 2003 at 11:04:41PM +0100, Ingo Molnar wrote:
> > this has been fixed in the -F3 scheduler patch.
> 
> Not quite. It leaks mm's because schedule_tail() isn't cleaning
> up rq->prev_mm.

hm, this i think was a forward-porting oversight. Anyway, now the separate
patch is in, and it's better that way, the fix was unrelated to the main
things -F3 does.

	Ingo


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-21  7:00                     ` Ingo Molnar
@ 2003-02-21 15:05                       ` Linus Torvalds
  0 siblings, 0 replies; 52+ messages in thread
From: Linus Torvalds @ 2003-02-21 15:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Chris Wedgwood, Kernel Mailing List,
	Martin J. Bligh, William Lee Irwin III

On Fri, 21 Feb 2003, Ingo Molnar wrote:
> > 
> > This is a single non-serializing bit test, and if it means that the task
> > counters are _right_, that's definitely the right thing to do.
> 
> ok. Plus the wait_task_inactive() stuff was always a bit volatile. Now we
> could in fact remove it from release_task(), right?

Yes, except for the same concerns I had about your patch moving it.

That part could be cleanly solved by just moving a lot of the tear-down
of the "struct task_struct" entirely into "__put_task_struct()" (which now
can never be called with "current == tsk"), ie if we do the "free_uid()"
_there_, then I think we can remove the wait_task_inactive() entirely from 
the wait path.

		Linus

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:54             ` Linus Torvalds
  2003-02-20 17:24               ` Jeff Garzik
  2003-02-20 21:21               ` Alan Cox
@ 2003-02-27 18:50               ` Randy.Dunlap
  2003-02-27 19:39                 ` Muli Ben-Yehuda
  2003-03-02  6:12                 ` Keith Owens
  2003-02-27 23:32               ` Randy.Dunlap
  3 siblings, 2 replies; 52+ messages in thread
From: Randy.Dunlap @ 2003-02-27 18:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: mbligh, zwane, cw, linux-kernel

On Thu, 20 Feb 2003 08:54:55 -0800 (PST)
Linus Torvalds <torvalds@transmeta.com> wrote:

| On Thu, 20 Feb 2003, Martin J. Bligh wrote:
| > 
| > There are patches in -mjb from Dave Hansen / Ben LaHaise to detect stack
| > overflow included with the stuff for the 4K stacks patch (intended for 
| > scaling to large numbers of tasks). I've split them out attached, should 
| > apply to mainline reasonably easily.
| 
| Ok, the 4kB stack definitely won't work in real life, but that's because 
| we have some hopelessly bad stack users in the kernel. But the debugging 
| part would be good to try (in fact, it might be a good idea to keep the 
| 8kB stack, but with rather anal debugging. Just the "mcount" part should 
| do that).
| 
| A sorted list of bad stack users (more than 256 bytes) in my default build
| follows. Anybody can create their own with something like
| 
| 	objdump -d linux/vmlinux |
| 		grep 'sub.*$0x...,.*esp' |
| 		awk '{ print $9,$1 }' |
| 		sort > bigstack
| 
| and a script to look up the addresses.
| 
| That ide_unregister() thing uses up >2kB in just one call! And there are 
| several in the 1.5kB range too, with a long list of ~500 byte offenders.
| 
| Yeah, and this assumes we don't have alloca() users or other dynamic 
| stack allocators (non-constant-size automatic arrays). I hope we don't 
| have that kind of crap anywhere..

I don't get a nice listing from this script like you did.
Example of mine is below.  Do I just have a tools issue?

Thanks,
--
~Randy



$0x424,%esp c01f6bc0:
$0x490,%esp c0106010:
$0x4ac,%esp c016aec3:
$0x540,%esp c01061a6:
$0x5ac,%esp c010533e:
$0x798,%esp c02528b8:
$0x924,%esp c02484fb:

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-27 18:50               ` Randy.Dunlap
@ 2003-02-27 19:39                 ` Muli Ben-Yehuda
  2003-02-27 19:47                   ` Randy.Dunlap
  2003-03-02  6:12                 ` Keith Owens
  1 sibling, 1 reply; 52+ messages in thread
From: Muli Ben-Yehuda @ 2003-02-27 19:39 UTC (permalink / raw)
  To: Randy.Dunlap; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 873 bytes --]

On Thu, Feb 27, 2003 at 10:50:56AM -0800, Randy.Dunlap wrote:
> On Thu, 20 Feb 2003 08:54:55 -0800 (PST)
> Linus Torvalds <torvalds@transmeta.com> wrote:

[snipped] 

> | A sorted list of bad stack users (more than 256 bytes) in my default build
> | follows. Anybody can create their own with something like
> | 
> | 	objdump -d linux/vmlinux |
> | 		grep 'sub.*$0x...,.*esp' |
> | 		awk '{ print $9,$1 }' |
> | 		sort > bigstack
> | 
> | and a script to look up the addresses.

[snipped] 

> I don't get a nice listing from this script like you did.
> Example of mine is below.  Do I just have a tools issue?

See the part where Linus said "...and a script to look up the
addresses.". You can use 'ksymoops -v vmlinux -m System.map --no-ksyms
--no-lsmod -A 0xcodebabe' to translate address to symbol. 
-- 
Muli Ben-Yehuda
http://www.mulix.org


[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-27 19:39                 ` Muli Ben-Yehuda
@ 2003-02-27 19:47                   ` Randy.Dunlap
  0 siblings, 0 replies; 52+ messages in thread
From: Randy.Dunlap @ 2003-02-27 19:47 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: linux-kernel

On Thu, 27 Feb 2003 21:39:44 +0200
Muli Ben-Yehuda <mulix@mulix.org> wrote:

| On Thu, Feb 27, 2003 at 10:50:56AM -0800, Randy.Dunlap wrote:
| > On Thu, 20 Feb 2003 08:54:55 -0800 (PST)
| > Linus Torvalds <torvalds@transmeta.com> wrote:
| 
| [snipped] 
| 
| > | A sorted list of bad stack users (more than 256 bytes) in my default build
| > | follows. Anybody can create their own with something like
| > | 
| > | 	objdump -d linux/vmlinux |
| > | 		grep 'sub.*$0x...,.*esp' |
| > | 		awk '{ print $9,$1 }' |
| > | 		sort > bigstack
| > | 
| > | and a script to look up the addresses.
| 
| [snipped] 
| 
| > I don't get a nice listing from this script like you did.
| > Example of mine is below.  Do I just have a tools issue?
| 
| See the part where Linus said "...and a script to look up the
| addresses.". You can use 'ksymoops -v vmlinux -m System.map --no-ksyms
| --no-lsmod -A 0xcodebabe' to translate address to symbol. 

Yes, sorry about skimming over that.
And yes, I'm familiar with that option of ksymoops.*  :)

--
~Randy


*: since it's based on
   http://www.osdl.org/archive/rddunlap/scripts/ksysmap

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-20 16:54             ` Linus Torvalds
                                 ` (2 preceding siblings ...)
  2003-02-27 18:50               ` Randy.Dunlap
@ 2003-02-27 23:32               ` Randy.Dunlap
  3 siblings, 0 replies; 52+ messages in thread
From: Randy.Dunlap @ 2003-02-27 23:32 UTC (permalink / raw)
  To: linux-kernel

| A sorted list of bad stack users (more than 256 bytes) in my default build
| follows. Anybody can create their own with something like
| 
| 	objdump -d linux/vmlinux |
| 		grep 'sub.*$0x...,.*esp' |
| 		awk '{ print $9,$1 }' |
| 		sort > bigstack
| 
| and a script to look up the addresses.
| 
| That ide_unregister() thing uses up >2kB in just one call! And there are 
| several in the 1.5kB range too, with a long list of ~500 byte offenders.
| 
| Yeah, and this assumes we don't have alloca() users or other dynamic 
| stack allocators (non-constant-size automatic arrays). I hope we don't 
| have that kind of crap anywhere..

Keith Owens did such a script over 1 year ago.  It's available from
  http://kernelnewbies.org/scripts/check-stack.sh
It also identifies (flags) dynamic stack allocation.
(course, I can't read Keith's as well as I can Linus's)

--
~Randy

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
  2003-02-27 18:50               ` Randy.Dunlap
  2003-02-27 19:39                 ` Muli Ben-Yehuda
@ 2003-03-02  6:12                 ` Keith Owens
  1 sibling, 0 replies; 52+ messages in thread
From: Keith Owens @ 2003-03-02  6:12 UTC (permalink / raw)
  To: linux-kernel

Linus Torvalds <torvalds@transmeta.com> wrote:
> A sorted list of bad stack users (more than 256 bytes) in my default build
> follows. Anybody can create their own with something like
> 
> 	objdump -d linux/vmlinux |
> 		grep 'sub.*$0x...,.*esp' |
> 		awk '{ print $9,$1 }' |
> 		sort > bigstack
> 
> and a script to look up the addresses.
> 
> Yeah, and this assumes we don't have alloca() users or other dynamic 
> stack allocators (non-constant-size automatic arrays). I hope we don't 
> have that kind of crap anywhere..

We do.

kernel.stack identifies big offenders, dynamic stacks and tells you
which procedure is at fault.  This must be at least the fifth time I
have published this script.

#!/bin/bash
#
#	Disassemble a compiled ix86 kernel with objdump and print large local
#	stack usage.  All command line arguments are passed straight through
#	to objdump ("$@"), so invoke it with the object to inspect, e.g. the
#	vmlinux image.
#
#	Output is one line per finding, passed through plain 'sort':
#	  <hex stack size> <procedure name>         for fixed-size frames
#	  Dynamic <start address> <procedure> <insn> for register-sized frames
#
#	The sed program works as follows.
#
#	/>:/{s/[<>:]*//g; h; }   On lines that contain '>:' (headings like
#	c0100000 <_stext>:), remove <, > and : and hold the line.  Identifies
#	the procedure and its start address.
#
#	/subl\?.*\$0x[^,][^,][^,].*,%esp/{    Select lines containing
#	subl\?...0x...,%esp but only if there are at least 3 digits between 0x and
#	,%esp.  These are local stacks of at least 0x100 bytes.
#
#	s/.*\$0x\([^,]*\).*/\1/;   Extract just the stack adjustment
#	/^[89a-f].......$/d;   Ignore lines with 8 digit offsets that are
#	negative.  Some compilers adjust the stack on exit, seems to be related
#	to goto statements
#	G;   Append the held line (procedure and start address).
#	s/\(.*\)\n.* \(.*\)/\1 \2/;  Remove the newline and procedure start
#	address.  Leaves just stack size and procedure name.
#	p; };   Print stack size and procedure name.
#
#	/subl\?.*%.*,%esp/{   Selects adjustment of %esp by register, dynamic
#	arrays on stack.
#	G;   Append the held line (procedure and start address).
#	s/\(.*\)\n\(.*\)/Dynamic \2 \1/;   Reformat to "Dynamic", procedure
#	start address, procedure name and the instruction that adjusts the
#	stack, including its offset within the proc.
#	p; };   Print the dynamic line.
#
#	sed runs with -n, so only the lines explicitly printed by 'p' above
#	reach sort; everything else in the disassembly is discarded.
#
#	NOTE(review): '\?' and '\n' inside a basic regular expression look
#	like GNU sed extensions -- confirm before running with another sed.
#
#	Leading spaces in the sed string are required.
#
objdump --disassemble "$@" | \
sed -ne '/>:/{s/[<>:]*//g; h; }
 /subl\?.*\$0x[^,][^,][^,].*,%esp/{
 s/.*\$0x\([^,]*\).*/\1/; /^[89a-f].......$/d; G; s/\(.*\)\n.* \(.*\)/\1 \2/; p; };
 /subl\?.*%.*,%esp/{ G; s/\(.*\)\n\(.*\)/Dynamic \2 \1/; p; }; ' | \
sort


^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2003-03-02  6:02 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <Pine.LNX.4.44.0302201830580.474-100000@localhost.localdomain>
2003-02-20 18:01 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
2003-02-20 18:23   ` Linus Torvalds
2003-02-20 19:36     ` Ingo Molnar
2003-02-20 19:53       ` Ingo Molnar
2003-02-20 19:57         ` Ingo Molnar
2003-02-20 20:14           ` Ingo Molnar
2003-02-20 20:17           ` Linus Torvalds
2003-02-20 20:50             ` Andrew Morton
2003-02-20 22:04               ` Ingo Molnar
2003-02-20 22:42                 ` William Lee Irwin III
2003-02-21  7:05                   ` Ingo Molnar
2003-02-20 22:00             ` Ingo Molnar
2003-02-20 22:32               ` Linus Torvalds
2003-02-20 22:40                 ` Linus Torvalds
2003-02-20 22:45                   ` Linus Torvalds
2003-02-20 22:57                 ` John Levon
2003-02-20 23:21                 ` Ingo Molnar
2003-02-20 23:36                   ` Linus Torvalds
2003-02-21  7:00                     ` Ingo Molnar
2003-02-21 15:05                       ` Linus Torvalds
2003-02-20 19:00   ` Ingo Molnar
2003-02-18 23:01 Linux v2.5.62 --- spontaneous reboots Chris Wedgwood
2003-02-19 23:35 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
2003-02-20  2:22   ` Zwane Mwaikambo
2003-02-20  2:26     ` William Lee Irwin III
2003-02-20  2:55       ` Zwane Mwaikambo
2003-02-20  3:15         ` William Lee Irwin III
2003-02-20  4:52     ` Linus Torvalds
2003-02-20  5:07       ` William Lee Irwin III
2003-02-20  6:05       ` Zwane Mwaikambo
2003-02-20 11:46       ` Ingo Molnar
2003-02-20 12:12         ` William Lee Irwin III
2003-02-20 12:33           ` Ingo Molnar
2003-02-20 14:03             ` Zwane Mwaikambo
2003-02-20 14:00         ` Zwane Mwaikambo
2003-02-20 15:43         ` Linus Torvalds
2003-02-20 15:52           ` Ingo Molnar
2003-02-20 16:11           ` Martin J. Bligh
2003-02-20 16:54             ` Linus Torvalds
2003-02-20 17:24               ` Jeff Garzik
2003-02-20 21:21               ` Alan Cox
2003-02-20 20:20                 ` Linus Torvalds
2003-02-20 20:23                 ` Martin J. Bligh
2003-02-20 20:42                   ` William Lee Irwin III
2003-02-20 20:51                     ` Linus Torvalds
2003-02-27 18:50               ` Randy.Dunlap
2003-02-27 19:39                 ` Muli Ben-Yehuda
2003-02-27 19:47                   ` Randy.Dunlap
2003-03-02  6:12                 ` Keith Owens
2003-02-27 23:32               ` Randy.Dunlap
2003-02-20 23:09             ` Chris Wedgwood
2003-02-20 16:44           ` Ingo Molnar
2003-02-20 20:13           ` Chris Wedgwood

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox