public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups)
@ 2006-02-21 17:51 Stas Sergeev
  2006-02-22  1:31 ` Andrew Morton
  2006-02-22  1:44 ` Andrew Morton
  0 siblings, 2 replies; 4+ messages in thread
From: Stas Sergeev @ 2006-02-21 17:51 UTC (permalink / raw)
  To: Linux kernel; +Cc: Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 2411 bytes --]

Hi.

The history is that -mm kernels have not worked for me
for a few months already. It started with crashes
somewhere after starting init, and for the
last month - no boot at all, just
"Uncompressing... OK, booting kernel", and silence.
The early console didn't work either.
With the latest releases this degraded into an infinite
stream of the "Unknown interrupt or fault" messages.
So today my patience ran out and I started to think how
can I collect at least some info for the bug-report.
Attached is a patch that allows gathering some valuable
debug info on the problem by making the early console more
usable. I can't properly test the patch, as the kernel
still doesn't boot, so I'll explain it in detail in the
hope that someone else can justify the intrusive changes.


arch_hooks.h: added prototypes for setup_early_printk()
and early_printk().

head.S: added "hlt" to the dummy fault handler. This is
necessary because otherwise the fault retriggers infinitely,
causing an infinite stream of "Unknown interrupt or fault"
messages, which scrolls away the useful info. I don't know
if this is a safe change.

setup.c: killed wrong setup_early_printk() prototype.
Moved setup_early_printk() a bit earlier, as it was not
"early enough" to cover the bug I was fighting with.

early_printk.c: made it to start printing from the bottom
of the screen, otherwise the messages interfere with the
ones of the boot-loader, so you can't read them.

main.c: moved smp_prepare_boot_cpu() call earlier. This
was necessary because otherwise printk() can't print:
it checks cpu_online(), which returns false. This change
is consistent with the UP case, where the boot CPU is
"online" from the very beginning, AFAICS. But again, I am
not entirely sure whether this is safe.


OK, so with that patch I was hoping to collect some debug
info. It turned out, though, that the main.c change also
fixes the problem itself. The lockup was happening in
__alloc_bootmem_core(): the "if (!size)" check was succeeding,
and the BUG() was triggering. After my main.c change this no
longer happens, but I don't know where the problem was.

I still can't boot the kernel because of this
http://www.uwsg.iu.edu/hypermail/linux/kernel/0602.2/1244.html
but at least I know that with the attached patch, the boot
process goes much further.

Just in case the patch is going to be applied:
Signed-off-by: Stas Sergeev <stsp@aknet.ru>


[-- Attachment #2: bugearly1-16-rc4-mm1.diff --]
[-- Type: text/x-patch, Size: 3083 bytes --]

--- a/include/asm-i386/arch_hooks.h	2004-01-09 09:59:45.000000000 +0300
+++ b/include/asm-i386/arch_hooks.h	2006-02-21 12:07:40.000000000 +0300
@@ -24,4 +24,7 @@
 extern void time_init_hook(void);
 extern void mca_nmi_hook(void);
 
+extern int setup_early_printk(char *); 
+extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
+
 #endif
--- a/arch/i386/kernel/head.S	2006-02-15 12:02:33.000000000 +0300
+++ b/arch/i386/kernel/head.S	2006-02-21 11:39:05.000000000 +0300
@@ -410,6 +410,7 @@
 	popl %ecx
 	popl %eax
 #endif
+	hlt	/* no way out, iret will just retrigger the fault */
 	iret
 
 /*
@@ -439,7 +440,7 @@
 ready:	.byte 0
 
 int_msg:
-	.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+	.asciz "Unknown interrupt or fault at EIP %p %p %p, press Reset\n"
 
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
--- a/arch/i386/kernel/setup.c	2006-02-21 10:35:36.000000000 +0300
+++ b/arch/i386/kernel/setup.c	2006-02-21 12:09:52.000000000 +0300
@@ -1447,6 +1447,16 @@
 
 	parse_cmdline_early(cmdline_p);
 
+#ifdef CONFIG_EARLY_PRINTK
+	{
+		char *s = strstr(*cmdline_p, "earlyprintk=");
+		if (s) {
+			setup_early_printk(strchr(s, '=') + 1);
+			printk("early console enabled\n");
+		}
+	}
+#endif
+
 	max_low_pfn = setup_memory();
 
 	/*
@@ -1471,19 +1481,6 @@
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
-#ifdef CONFIG_EARLY_PRINTK
-	{
-		char *s = strstr(*cmdline_p, "earlyprintk=");
-		if (s) {
-			extern void setup_early_printk(char *);
-
-			setup_early_printk(strchr(s, '=') + 1);
-			printk("early console enabled\n");
-		}
-	}
-#endif
-
-
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
--- a/arch/x86_64/kernel/early_printk.c	2006-02-21 10:35:40.000000000 +0300
+++ b/arch/x86_64/kernel/early_printk.c	2006-02-21 12:19:10.000000000 +0300
@@ -21,7 +21,7 @@
 #define MAX_XPOS	max_xpos
 
 static int max_ypos = 25, max_xpos = 80;
-static int current_ypos = 1, current_xpos = 0;
+static int current_ypos = 25, current_xpos = 0;
 
 static void early_vga_write(struct console *con, const char *str, unsigned n)
 {
@@ -244,6 +244,7 @@
 	           && SCREEN_INFO.orig_video_isVGA == 1) {
 		max_xpos = SCREEN_INFO.orig_video_cols;
 		max_ypos = SCREEN_INFO.orig_video_lines;
+		current_ypos = max_ypos;
 		early_console = &early_vga_console;
  	} else if (!strncmp(buf, "simnow", 6)) {
  		simnow_init(buf + 6);
--- a/init/main.c	2006-02-21 10:36:04.000000000 +0300
+++ b/init/main.c	2006-02-21 15:33:18.000000000 +0300
@@ -449,6 +449,7 @@
  * enable them
  */
 	lock_kernel();
+	smp_prepare_boot_cpu();
 	page_address_init();
 	printk(KERN_NOTICE);
 	printk(linux_banner);
@@ -456,12 +457,6 @@
 	setup_per_cpu_areas();
 
 	/*
-	 * Mark the boot cpu "online" so that it can call console drivers in
-	 * printk() and can access its per-cpu storage.
-	 */
-	smp_prepare_boot_cpu();
-
-	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
 	 * time - but meanwhile we still have a functioning scheduler.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups)
  2006-02-21 17:51 [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups) Stas Sergeev
@ 2006-02-22  1:31 ` Andrew Morton
  2006-02-22  1:44 ` Andrew Morton
  1 sibling, 0 replies; 4+ messages in thread
From: Andrew Morton @ 2006-02-22  1:31 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: linux-kernel

Stas Sergeev <stsp@aknet.ru> wrote:
>
> Hi.
> 
> The history is that -mm kernels do not work for me
> for a few months already. The things started from
> crashing somewhere after starting init, and for the
> last month - no boot at all, just
> "Uncompressing... OK, booting kernel", and silence.
> Early console didn't work too.
> With the latest releases this degraded into an infinite
> stream of the "Unknown interrupt or fault" messages.
> So today my patience ran out and I started to think how
> can I collect at least some info for the bug-report.
> Attached is the patch that allows to gather some valueable
> debug info on the problem by making an early console more
> useable. I can't properly test the patch, as the kernel
> still doesn't boot, so I'll explain it in details in a
> hope someone else can justify the intrusive changes.
> 

It's unusual that the failure has been only in -mm, and for so long.  That
would indicate that we have a problem in a for-mm-only patch.

And yet your patch applies OK to 2.6.16-rc4, so it's not obvious how this
got in there.

Did you never perform a bisection as per
http://www.zip.com.au/~akpm/linux/patches/stuff/bisecting-mm-trees.txt,
to find out which patch in -mm was the offender?

> arch_hooks.h: added prototypes for setup_early_printk()
> and early_printk().
> 
> head.S: added "hlt" to the dummy fault handler. This is
> necessary because otherwise the fault retriggers infinitely,
> causing the infinite stream of an "Unknown interrupt or fault"
> messages, which scrolls away the usefull info. I don't know
> if this is a safe change.
> 
> setup.c: killed wrong setup_early_printk() prototype.
> Moved setup_early_printk() a bit earlier, as it was not
> "early enough" to cover the bug I was fighting with.
> 
> early_printk.c: made it to start printing from the bottom
> of the screen, otherwise the messages interfere with the
> ones of the boot-loader, so you can't read them.
> 
> main.c: moved smp_prepare_boot_cpu() call earlier. This
> was necessary because otherwise printk() can't print
> It checks cpu_online(), which returns false. This change
> is consistent with the UP case, where's the boot CPU is
> "online" from the very beginning, AFAICS. But again, I am
> not entirely sure whether this is safe.
> 

They all sound like good stuff - I'll take a look, thanks.

> OK, so with that patch I was hoping to collect some debug
> info. It turned out though, that the main.c change also
> fixes the problem itself. The lockup was happening in an
> __alloc_bootmem_core(): the "if (!size)" check was succeeding,
> and the BUG() was triggering. After my main.c change this no
> longer happens, but I don't know where the problem was.
> 
> I still can't boot the kernel because of this
> http://www.uwsg.iu.edu/hypermail/linux/kernel/0602.2/1244.html
> but at least I know that with the attached patch, the boot
> process goes much further.
> 

Sorry, this was hot-fixed.  See
ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.16-rc4/2.6.16-rc4-mm1/hot-fixes/.
 You'll want revert-register-sysfs-device-for-lp-devices.patch.  May as
well apply the others, too..

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups)
  2006-02-21 17:51 [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups) Stas Sergeev
  2006-02-22  1:31 ` Andrew Morton
@ 2006-02-22  1:44 ` Andrew Morton
  2006-02-22 15:51   ` Stas Sergeev
  1 sibling, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2006-02-22  1:44 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: linux-kernel

Stas Sergeev <stsp@aknet.ru> wrote:
>
> main.c: moved smp_prepare_boot_cpu() call earlier. This
>  was necessary because otherwise printk() can't print
>  It checks cpu_online(), which returns false. This change
>  is consistent with the UP case, where's the boot CPU is
>  "online" from the very beginning, AFAICS. But again, I am
>  not entirely sure whether this is safe.
> 

Yeah, this is scary.  Early boot is fragile and complex and architectures
might not expect to run smp_prepare_boot_cpu() before setup_arch().

umm, actually it's wrong.  i386's smp_prepare_boot_cpu() diddles with
per-cpu memory, and that's not initialised at that stage.  See the call to
setup_per_cpu_areas() a few lines later.

So I'll drop that hunk.  How important is it in practice?

If it's purely to make printk print something then perhaps we can do
something expedient like:

#ifdef CONFIG_SMP
	cpu_set(smp_processor_id(), cpu_online_map);	/* comment */
#endif

right there in start_kernel()?

(That assumes that smp_processor_id() works at that stage.  Surely that's
true).

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups)
  2006-02-22  1:44 ` Andrew Morton
@ 2006-02-22 15:51   ` Stas Sergeev
  0 siblings, 0 replies; 4+ messages in thread
From: Stas Sergeev @ 2006-02-22 15:51 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1320 bytes --]

Hello.

Andrew Morton wrote:
> umm, actually it's wrong.  i386's smp_prepare_boot_cpu() diddles with
> per-cpu memory, and that's not initialised at that stage.  See the call to
> setup_per_cpu_areas() a few lines later.
> So I'll drop that hunk.  How important is it in practice?
It was important because it used to fix both the printk and
(completely accidentally!) the boot problem itself.

> #ifdef CONFIG_SMP
> 	cpu_set(smp_processor_id(), cpu_online_map);	/* comment */
> #endif
I don't even think #ifdef is needed. Having that for the UP
case may be useless, yet looks consistent to me.

> right there in start_kernel()?
This is enough for printk but not for the boot lockup.
The attached patch, however, is enough. And it should be
correct, as it is consistent with the UP case.

> (That assumes that smp_processor_id() works at that stage.  Surely that's
> true).
Looking into the arch-specific code, I can see that some
arches evaluate the boot-cpu number by some other means,
not by the smp_processor_id(). Still I am pretty sure the
patch won't hurt them.

With this patch and with the hotfixes, I've got the -mm
kernel working, thanks.

----

Register the boot-cpu in the cpu maps earlier to allow the
early printk to work, and to fix an obscure deadlock at boot.

Signed-off-by: Stas Sergeev <stsp@aknet.ru>


[-- Attachment #2: smpb.diff --]
[-- Type: text/x-patch, Size: 981 bytes --]

--- a/init/main.c	2006-02-21 10:36:04.000000000 +0300
+++ b/init/main.c	2006-02-22 11:30:01.000000000 +0300
@@ -440,6 +440,15 @@
  *	Activate the first processor.
  */
 
+static void boot_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
+	cpu_set(cpu, cpu_online_map);
+	cpu_set(cpu, cpu_present_map);
+	cpu_set(cpu, cpu_possible_map);
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -449,17 +458,13 @@
  * enable them
  */
 	lock_kernel();
+	boot_cpu_init();
 	page_address_init();
 	printk(KERN_NOTICE);
 	printk(linux_banner);
 	setup_arch(&command_line);
 	setup_per_cpu_areas();
-
-	/*
-	 * Mark the boot cpu "online" so that it can call console drivers in
-	 * printk() and can access its per-cpu storage.
-	 */
-	smp_prepare_boot_cpu();
+	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2006-02-22 15:51 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-02-21 17:51 [patch] Re: 2.6.16-rc4-mm1 (bugs and lockups) Stas Sergeev
2006-02-22  1:31 ` Andrew Morton
2006-02-22  1:44 ` Andrew Morton
2006-02-22 15:51   ` Stas Sergeev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox