public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kdump: add a missing notifier before crashing
@ 2006-06-15 11:16 Akiyama, Nobuyuki
  2006-06-16  6:28 ` [Fastboot] " Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Akiyama, Nobuyuki @ 2006-06-15 11:16 UTC (permalink / raw)
  To: fastboot, linux-kernel

Hi all,

The attached patch adds a missing notifier before crashing.
This patch has been reworked to follow the earlier discussions.
The change is that calling the notifier is now optional.
Please refer to the following thread for details:

http://lists.osdl.org/pipermail/fastboot/2006-May/003018.html

Description:
We don't have a simple, lightweight way to learn that the kernel has died.
The panic notifier is not called if kdump is activated
because crash_kexec() does not return, and there is no mechanism to
notify of a crash before crashing by SysRq-c.
Although notify_die() exists, the function depends on architecture.
If notify_die() is added in panic and SysRq respectively like
existing implementation, the code will be ugly.
I think that adding a generic hook in crash_kexec() is better to
simplify the code.

This new notifier is useful, especially for a clustering system.
On a mission-critical system, failover needs to start within a few
milliseconds. The notifier could be called from the 2nd kernel, but that
is of no use because booting it takes on the order of seconds.

The attached patch is against 2.6.17-rc6-git5.
I tested on i386-box.

Thanks,

Akiyama, Nobuyuki

Signed-off-by: Akiyama, Nobuyuki <akiyama.nobuyuk@jp.fujitsu.com>
---

 Documentation/filesystems/proc.txt |   11 +++++++++++
 arch/i386/kernel/traps.c           |    4 ++--
 arch/powerpc/kernel/traps.c        |    2 +-
 arch/x86_64/kernel/traps.c         |    4 ++--
 drivers/char/sysrq.c               |    2 +-
 include/linux/kexec.h              |   12 ++++++++++--
 include/linux/sysctl.h             |    1 +
 kernel/kexec.c                     |   18 +++++++++++++++++-
 kernel/panic.c                     |    2 +-
 kernel/sysctl.c                    |   11 +++++++++++
 10 files changed, 57 insertions(+), 10 deletions(-)

diff -Nurp linux-2.6.17-rc6-git5/Documentation/filesystems/proc.txt linux-2.6.17-rc6-git5.mod/Documentation/filesystems/proc.txt
--- linux-2.6.17-rc6-git5/Documentation/filesystems/proc.txt	2006-06-14 16:25:26.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/Documentation/filesystems/proc.txt	2006-06-15 14:48:59.000000000 +0900
@@ -1130,6 +1130,17 @@ If a system hangs up, try pressing the N
    And NMI watchdog will be disabled when the value in this file is set to
    non-zero.
 
+kdump_safe
+----------
+
+The value in this file affects behavior of a notifier before kdump. When the
+value is non-zero(default), the notifier is not called before crashing. If the
+notifier is expected to be called before crashing, set zero.
+
+[NOTE]
+   The notifier may be hung and kdump may be stalled because the notifier is
+   usually called under panic state. The value of this file should be decided
+   by the policy of system usage.  
 
 2.4 /proc/sys/vm - The virtual memory subsystem
 -----------------------------------------------
diff -Nurp linux-2.6.17-rc6-git5/arch/i386/kernel/traps.c linux-2.6.17-rc6-git5.mod/arch/i386/kernel/traps.c
--- linux-2.6.17-rc6-git5/arch/i386/kernel/traps.c	2006-06-14 16:25:26.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/arch/i386/kernel/traps.c	2006-06-14 16:02:46.000000000 +0900
@@ -414,7 +414,7 @@ void die(const char * str, struct pt_reg
 		return;
 
 	if (kexec_should_crash(current))
-		crash_kexec(regs);
+		crash_kexec(CRASH_ON_DIE, regs, NULL);
 
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
@@ -665,7 +665,7 @@ void die_nmi (struct pt_regs *regs, cons
 	*/
 	if (!user_mode_vm(regs)) {
 		current->thread.trap_no = 2;
-		crash_kexec(regs);
+		crash_kexec(CRASH_ON_DIE, regs, NULL);
 	}
 
 	do_exit(SIGSEGV);
diff -Nurp linux-2.6.17-rc6-git5/arch/powerpc/kernel/traps.c linux-2.6.17-rc6-git5.mod/arch/powerpc/kernel/traps.c
--- linux-2.6.17-rc6-git5/arch/powerpc/kernel/traps.c	2006-06-14 16:25:27.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/arch/powerpc/kernel/traps.c	2006-06-14 16:02:46.000000000 +0900
@@ -132,7 +132,7 @@ int die(const char *str, struct pt_regs 
 	if (!crash_dump_start && kexec_should_crash(current)) {
 		crash_dump_start = 1;
 		spin_unlock_irq(&die_lock);
-		crash_kexec(regs);
+		crash_kexec(CRASH_ON_DIE, regs, NULL);
 		/* NOTREACHED */
 	}
 	spin_unlock_irq(&die_lock);
diff -Nurp linux-2.6.17-rc6-git5/arch/x86_64/kernel/traps.c linux-2.6.17-rc6-git5.mod/arch/x86_64/kernel/traps.c
--- linux-2.6.17-rc6-git5/arch/x86_64/kernel/traps.c	2006-06-14 16:25:27.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/arch/x86_64/kernel/traps.c	2006-06-14 16:02:46.000000000 +0900
@@ -445,7 +445,7 @@ void __kprobes __die(const char * str, s
 	printk_address(regs->rip); 
 	printk(" RSP <%016lx>\n", regs->rsp); 
 	if (kexec_should_crash(current))
-		crash_kexec(regs);
+		crash_kexec(CRASH_ON_DIE, regs, NULL);
 }
 
 void die(const char * str, struct pt_regs * regs, long err)
@@ -469,7 +469,7 @@ void __kprobes die_nmi(char *str, struct
 	printk(str, safe_smp_processor_id());
 	show_registers(regs);
 	if (kexec_should_crash(current))
-		crash_kexec(regs);
+		crash_kexec(CRASH_ON_DIE, regs, NULL);
 	if (panic_on_timeout || panic_on_oops)
 		panic("nmi watchdog");
 	printk("console shuts up ...\n");
diff -Nurp linux-2.6.17-rc6-git5/drivers/char/sysrq.c linux-2.6.17-rc6-git5.mod/drivers/char/sysrq.c
--- linux-2.6.17-rc6-git5/drivers/char/sysrq.c	2006-06-14 16:25:28.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/drivers/char/sysrq.c	2006-06-14 16:02:46.000000000 +0900
@@ -99,7 +99,7 @@ static struct sysrq_key_op sysrq_unraw_o
 static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs,
 				struct tty_struct *tty)
 {
-	crash_kexec(pt_regs);
+	crash_kexec(CRASH_ON_SYSRQ, pt_regs, NULL);
 }
 static struct sysrq_key_op sysrq_crashdump_op = {
 	.handler	= sysrq_handle_crashdump,
diff -Nurp linux-2.6.17-rc6-git5/include/linux/kexec.h linux-2.6.17-rc6-git5.mod/include/linux/kexec.h
--- linux-2.6.17-rc6-git5/include/linux/kexec.h	2006-03-20 14:53:29.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/include/linux/kexec.h	2006-06-14 16:02:46.000000000 +0900
@@ -1,12 +1,18 @@
 #ifndef LINUX_KEXEC_H
 #define LINUX_KEXEC_H
 
+/* crash type for notifier */
+#define CRASH_ON_PANIC		1
+#define CRASH_ON_DIE		2
+#define CRASH_ON_SYSRQ		3
+
 #ifdef CONFIG_KEXEC
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
 #include <linux/ioport.h>
+#include <linux/notifier.h>
 #include <asm/kexec.h>
 
 /* Verify architecture specific macros are defined */
@@ -103,9 +109,11 @@ extern asmlinkage long compat_sys_kexec_
 #endif
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
-extern void crash_kexec(struct pt_regs *);
+extern void crash_kexec(int, struct pt_regs *, void *);
 int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
+extern struct raw_notifier_head crash_notifier_list;
+extern int kdump_safe;
 
 #define KEXEC_ON_CRASH  0x00000001
 #define KEXEC_ARCH_MASK 0xffff0000
@@ -133,7 +141,7 @@ extern note_buf_t *crash_notes;
 #else /* !CONFIG_KEXEC */
 struct pt_regs;
 struct task_struct;
-static inline void crash_kexec(struct pt_regs *regs) { }
+static inline void crash_kexec(int type, struct pt_regs *regs, void *v) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #endif /* CONFIG_KEXEC */
 #endif /* LINUX_KEXEC_H */
diff -Nurp linux-2.6.17-rc6-git5/include/linux/sysctl.h linux-2.6.17-rc6-git5.mod/include/linux/sysctl.h
--- linux-2.6.17-rc6-git5/include/linux/sysctl.h	2006-06-14 16:25:35.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/include/linux/sysctl.h	2006-06-14 16:02:46.000000000 +0900
@@ -148,6 +148,7 @@ enum
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
+	KERN_KDUMP_SAFE=73,	/* int: crash notifier flag */
 };
 
 
diff -Nurp linux-2.6.17-rc6-git5/kernel/kexec.c linux-2.6.17-rc6-git5.mod/kernel/kexec.c
--- linux-2.6.17-rc6-git5/kernel/kexec.c	2006-03-20 14:53:29.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/kernel/kexec.c	2006-06-14 16:02:46.000000000 +0900
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -27,6 +29,11 @@
 #include <asm/system.h>
 #include <asm/semaphore.h>
 
+
+RAW_NOTIFIER_HEAD(crash_notifier_list);
+EXPORT_SYMBOL(crash_notifier_list);
+int kdump_safe = 1;
+
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t* crash_notes;
 
@@ -1040,7 +1047,15 @@ asmlinkage long compat_sys_kexec_load(un
 }
 #endif
 
-void crash_kexec(struct pt_regs *regs)
+static inline void notify_crash(int type, void *v)
+{
+#ifdef CONFIG_SYSCTL
+	if (!kdump_safe)
+		raw_notifier_call_chain(&crash_notifier_list, type, v);
+#endif
+}
+
+void crash_kexec(int type, struct pt_regs *regs, void *v)
 {
 	struct kimage *image;
 	int locked;
@@ -1061,6 +1076,7 @@ void crash_kexec(struct pt_regs *regs)
 			struct pt_regs fixed_regs;
 			crash_setup_regs(&fixed_regs, regs);
 			machine_crash_shutdown(&fixed_regs);
+			notify_crash(type, v);
 			machine_kexec(image);
 		}
 		xchg(&kexec_lock, 0);
diff -Nurp linux-2.6.17-rc6-git5/kernel/panic.c linux-2.6.17-rc6-git5.mod/kernel/panic.c
--- linux-2.6.17-rc6-git5/kernel/panic.c	2006-06-14 16:25:35.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/kernel/panic.c	2006-06-14 16:02:46.000000000 +0900
@@ -85,7 +85,7 @@ NORET_TYPE void panic(const char * fmt, 
 	 * everything else.
 	 * Do we want to call this before we try to display a message?
 	 */
-	crash_kexec(NULL);
+	crash_kexec(CRASH_ON_PANIC, NULL, buf);
 
 #ifdef CONFIG_SMP
 	/*
diff -Nurp linux-2.6.17-rc6-git5/kernel/sysctl.c linux-2.6.17-rc6-git5.mod/kernel/sysctl.c
--- linux-2.6.17-rc6-git5/kernel/sysctl.c	2006-06-14 16:25:35.000000000 +0900
+++ linux-2.6.17-rc6-git5.mod/kernel/sysctl.c	2006-06-14 16:02:46.000000000 +0900
@@ -46,6 +46,7 @@
 #include <linux/syscalls.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
+#include <linux/kexec.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -683,6 +684,16 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#if defined(CONFIG_KEXEC)
+	{
+		.ctl_name       = KERN_KDUMP_SAFE,
+		.procname       = "kdump_safe",
+		.data           = &kdump_safe,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-15 11:16 [PATCH] kdump: add a missing notifier before crashing Akiyama, Nobuyuki
@ 2006-06-16  6:28 ` Eric W. Biederman
  2006-06-16 12:15   ` Akiyama, Nobuyuki
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-16  6:28 UTC (permalink / raw)
  To: Akiyama, Nobuyuki; +Cc: fastboot, linux-kernel

"Akiyama, Nobuyuki" <akiyama.nobuyuk@jp.fujitsu.com> writes:

> Hi all,
>
> The attached patch adds a missing notifier before crashing.
> This patch is remade to follow the former discussions.
> The change is that a notifier calling becomes optional.
> Please refer to the following thread for details:
>
> http://lists.osdl.org/pipermail/fastboot/2006-May/003018.html
>
> Description:
> We don't have a simple and light weight way to know the kernel dies.
> The panic notifier does not be called if kdump is activated
> because crash_kexec() does not return, and there is no mechanism to
> notify of a crash before crashing by SysRq-c.
> Although notify_die() exists, the function depends on architecture.
> If notify_die() is added in panic and SysRq respectively like
> existing implementation, the code will be ugly.
> I think that adding a generic hook in crash_kexec() is better to
> simplify the code.
>
> This new notifier is useful, especially for a clustering system.
> On a mission critical system, failover need to start within a few
> milli-second. The notifier could be called on 2nd kernel, but it is
> no use because it takes the time of second order to boot up.
>
> The attached patch is against 2.6.17-rc6-git5.
> I tested on i386-box.

Please give a concrete example of a failure mode where this allows
you to meet your timing constraint.

I have yet to be convinced that this actually solves a real world
problem.

What is the cost of the notifier you wish to implement?
What is your guarantee that the system won't have wasted seconds
detecting it can't allocate memory or other cases?

If we go this route the notifier should not be exported to modules.
Only the most scrutinized of code paths should ever set this,
and code like that should never be a module.

The patchset that adds the notifier call needs to include the notifier
so people can look and see how sane this is.

So far what I have seen are hand waving arguments that failures
that can never happen must be detected and reported within
milliseconds to another machine in an unspecified manner.  Your kernel
startup times are asserted to be too large to do this from the next
kernel, but the code to do so is sufficiently complicated you can't do
this in the kexec code stub that runs before it starts your next
kernel.

I am sympathetic but this interface seems to set expectations that
we can do the impossible, and it still appears unnecessary to me.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-16  6:28 ` [Fastboot] " Eric W. Biederman
@ 2006-06-16 12:15   ` Akiyama, Nobuyuki
  2006-06-16 16:37     ` Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Akiyama, Nobuyuki @ 2006-06-16 12:15 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: fastboot, linux-kernel

On Fri, 16 Jun 2006 00:28:08 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> Please give a concrete example of a failure mode where this allows
> you to meet your timing constraint.
> 
> I have yet to be convinced that this actually solves a real world
> problem.
> 
> What is the cost of the notifier you wish to implement?
> What is your guarantee that the system won't have wasted seconds
> detecting it can't allocate memory or other cases?
> 
> If we go this route the notifier should not be exported to modules.
> Only the most scrutinized of code paths should ever set this,
> and code like that should never be a module.
> 
> The patchset that adds the notifier call needs to include the notifier
> so people can look and see how sane this is.
> 
> So far what I have seen are hand waving arguments that failures
> that can never happen must be detected and reported within
> milliseconds to another machine in an unspecified manner.  Your kernel
> startup times are asserted to be to large to do this from the next
> kernel, but the code to do so is sufficiently complicated you can't do
> this in the kexec code stub that runs before it starts your next
> kernel.
> 
> I am sympathetic but this interface seems to set expectations that
> we can the impossible, and it still appears unnecessary to me.

As I mentioned many times, this notifier is very effective in the
clustering system. Actually the Red Hat's kernel has a notifier
at the same point like this patch. I know a real system that failover
of DB server is immediately done by using this notifier.
It takes much cost to keep consistency of transaction processing
on DB system. Therefore, to shorten down time, it is very important
to immediately know that the system dies. In a mission critical system,
millisecond order or less is demanded.
The processing of the notifier is to make a SCSI adaptor power off to
stop writing in the shared disk completely and then notify to standby-node.
But as you point out, things will not necessarily follow this scenario.
For instance, if the kernel hangs, the failure can be detected only
by heart-beat. In this case the detection time becomes longer.

Anyway this notifier is very effective and important in the actual world. 
Another example is as follows:

http://lists.osdl.org/pipermail/fastboot/2006-June/003028.html

Thanks,

Akiyama, Nobuyuki


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-16 12:15   ` Akiyama, Nobuyuki
@ 2006-06-16 16:37     ` Eric W. Biederman
  2006-06-19  7:30       ` Akiyama, Nobuyuki
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-16 16:37 UTC (permalink / raw)
  To: Akiyama, Nobuyuki; +Cc: fastboot, linux-kernel

"Akiyama, Nobuyuki" <akiyama.nobuyuk@jp.fujitsu.com> writes:

>> I am sympathetic but this interface seems to set expectations that
>> we can the impossible, and it still appears unnecessary to me.
>
> As I mentioned many times, this notifier is very effective in the
> clustering system. Actually the Red Hat's kernel has a notifier
> at the same point like this patch.

Ok.  Then maybe someone at redhat can help you convince me.
Why is the appropriate redhat person not sending this patch
upstream?

> I know a real system that failover
> of DB server is immediately done by using this notifier.

Agreed the code can be used.  I'm asking if this makes sense,
and if it is reliable, and if it typically meets the deadline.

> It takes much cost to keep consistency of transaction processing
> on DB system. Therefore, to shorten down time, it is very important
> to immediately know that the system dies. In a mission critical system,
> millisecond order or less is demanded.

I'm not arguing against your requirements.  I'm arguing that I don't
see how this allows you to meet your requirements when you can't
with the current kernel code.

If you are an existing user of a panic notifier I can see how
this removes the need to convert code, because the technique
does not change.  Unfortunately this is the only advantage I see
to this patch.

> The processing of the notifier is to make a SCSI adaptor power off to
> stop writing in the shared disk completely and then notify to standby-node.

The kernel has called panic, so no new SCSI operations are executed.
I'm not saying don't notify your standby-node.

> But as you think, it is sure not to necessarily become this scenario.
> For instance, if the kernel hangs, the failure can be detected only
> by heart-beat. In this case the detection time becomes longer.
>
> Anyway this notifier is very effective and important in the actual world. 
> Another example is as follows:

Please walk me through a real world kernel failure, and show me how
your millisecond requirement is met.

In the example please answer:
- What causes the kernel to call panic?
- From the real failure to the kernel calling panic how long
  does it take?
- What actions does the notifier take to tell the other kernel
  it is dead.
- Why do we think the kernel taking that action will be reliable?
- From the point where we call panic() how long does it take until
  the kdump kernel is active?


> Anyway this notifier is very effective and important in the actual world. 
> Another example is as follows:
> 
> http://lists.osdl.org/pipermail/fastboot/2006-June/003028.html

Hmm.  So if I read this correctly all you need to execute is a single
outb instruction?

This is part of one of my biggest confusions: what do you need to
do to notify the other node that you have died?

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-16 16:37     ` Eric W. Biederman
@ 2006-06-19  7:30       ` Akiyama, Nobuyuki
  2006-06-19 12:47         ` Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Akiyama, Nobuyuki @ 2006-06-19  7:30 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: fastboot, linux-kernel

On Fri, 16 Jun 2006 10:37:05 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> > The processing of the notifier is to make a SCSI adaptor power off to
> > stop writing in the shared disk completely and then notify to standby-node.
> 
> The kernel has called panic no new SCSI operations were execute.
> I'm not saying don't notify your standby-node

As you say, the kernel does not do anything about SCSI operations.
But many SCSI adaptors flush their cache after a few seconds pass
after a SCSI write command is invoked, especially RAID cards.
To completely stop writing immediately, we should make the adaptor
power off.

> Please walk me through a real world kernel failure, and show me how
> your millisecond requirement is met.
> 
> In the example please answer:
> - What causes the kernel to call panic?
> - From the real failure to the kernel calling panic how long
>   does it take?

For instance, if a file system inconsistency is detected,
it takes few time until invoking panic.
I have seen various kernel failure so far and these will
unfortunately occur.

> - What actions does the notifier take to tell the other kernel
>   it is dead.

The operation is only writing to BMC a few times to use IPMI
interface. That operation using outb is very simple.

> - Why do we think the kernel taking that action will be reliable?

I agree the notifier may spoil reliability as compared with doing
nothing. It depends on quality of the notifier processing.
But I think the one is needed because it is more effective.

> - From the point where we call panic() how long does it take until
>   the kdump kernel is active?

On my box it takes about one second or so, but on an actual enterprise
system which has many disks (hundreds or more) it takes longer.

Thanks,

--
Akiyama, Nobuyuki


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19  7:30       ` Akiyama, Nobuyuki
@ 2006-06-19 12:47         ` Eric W. Biederman
  2006-06-19 13:28           ` Preben Traerup
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-19 12:47 UTC (permalink / raw)
  To: Akiyama, Nobuyuki; +Cc: fastboot, linux-kernel

"Akiyama, Nobuyuki" <akiyama.nobuyuk@jp.fujitsu.com> writes:

> On Fri, 16 Jun 2006 10:37:05 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> > The processing of the notifier is to make a SCSI adaptor power off to
>> > stop writing in the shared disk completely and then notify to standby-node.
>> 
>> The kernel has called panic no new SCSI operations were execute.
>> I'm not saying don't notify your standby-node
>
> As you say, the kernel does not do anything about SCSI operations.
> But many SCSI adaptors flush their cache after a few seconds pass
> after a SCSI write command is invoked, especially RAID cards.
> To completely stop writing immediately, we should make the adaptor
> power off.

Yes.  Although I don't have a clue what big scsi has to do with
telco systems.

>> Please walk me through a real world kernel failure, and show me how
>> your millisecond requirement is met.
>> 
>> In the example please answer:
>> - What causes the kernel to call panic?
>> - From the real failure to the kernel calling panic how long
>>   does it take?
>
> For instance, if a file system inconsistency is detected,
> it takes few time until invoking panic.

What is a few time?

> I have seen various kernel failure so far and these will
> unfortunately occur.

Yes, kernel failures will occur; people and hardware are imperfect.
But they should be quite rare on the telco gear you were talking
about.

>> - What actions does the notifier take to tell the other kernel
>>   it is dead.
>
> The operation is only writing to BMC a few times to use IPMI
> interface. That operation using outb is very simple.

Ok.  A simple outb to the BMC through the IPMI interface.

>> - Why do we think the kernel taking that action will be reliable?
>
> I agree the notifier may spoil reliability as compared with doing
> nothing. It depends on quality of the notifier processing.
> But I think the one is needed because it is more effective.

It depends very much on what you are doing.  We have C code that
runs before the dump kernel is started.  It would be absolutely
trivial to modify that C code to tell the IPMI controller that
something has happened.  That operation can happen then after
it has checked a checksum of itself.  

>> - From the point where we call panic() how long does it take until
>>   the kdump kernel is active?
>
> On my box it takes about one second or so, but on a actual enterprise
> system which have many disks(hundreds or more) it becomes more.

Certainly.  But a system with hundreds of disks isn't the system
with a millisecond response time limit.  In general you don't need
to initialize all of your disks just to take a crash dump so even
without optimizing the kernel the kernel things are slow.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 12:47         ` Eric W. Biederman
@ 2006-06-19 13:28           ` Preben Traerup
  2006-06-19 16:49             ` Eric W. Biederman
  2006-06-20  3:39             ` Akiyama, Nobuyuki
  0 siblings, 2 replies; 15+ messages in thread
From: Preben Traerup @ 2006-06-19 13:28 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Akiyama, Nobuyuki, fastboot, linux-kernel

Eric W. Biederman wrote:

>"Akiyama, Nobuyuki" <akiyama.nobuyuk@jp.fujitsu.com> writes:
>
>  
>
>>On Fri, 16 Jun 2006 10:37:05 -0600
>>ebiederm@xmission.com (Eric W. Biederman) wrote:
>>
>>    
>>
>>>>The processing of the notifier is to make a SCSI adaptor power off to
>>>>stop writing in the shared disk completely and then notify to standby-node.
>>>>        
>>>>
>>>The kernel has called panic no new SCSI operations were execute.
>>>I'm not saying don't notify your standby-node
>>>      
>>>
>>As you say, the kernel does not do anything about SCSI operations.
>>But many SCSI adaptors flush their cache after a few seconds pass
>>after a SCSI write command is invoked, especially RAID cards.
>>To completely stop writing immediately, we should make the adaptor
>>power off.
>>    
>>
>
>Yes.  Although I don't have a clue what big scsi has to do with a
>telco systems.
>  
>
Strictly speaking for myself: Nothing.

Mr. Akiyama Nobuyuk gave an example from his environment which is cluster systems.
I was the one saying we in our Telco systems could use this feature too.

The only thing Mr. Akiyama Nobuyuk and I have in common is we both would like to do
something before crash dumping, simply because the less mess we will have to cleanup
afterwards in the system taking over, the better.

Mr. Akiyama Nobuyuk operates on SCSI devices to avoid filesystem corruptions.
My usage would be more like notifying external management to get traffic 
redirected to server systems taking over.

./Preben




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 13:28           ` Preben Traerup
@ 2006-06-19 16:49             ` Eric W. Biederman
  2006-06-19 17:07               ` Vivek Goyal
  2006-06-20  3:39             ` Akiyama, Nobuyuki
  1 sibling, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-19 16:49 UTC (permalink / raw)
  To: Preben Traerup; +Cc: Akiyama, Nobuyuki, fastboot, linux-kernel

Preben Traerup <Preben.Trarup@ericsson.com> writes:

> Strictly speaking for myself: Nothing.
>
> Mr. Akiyama Nobuyuk gave an example from his environment which is cluster
> systems.
> I was the one saying we in our Telco systems could use this feature too.
>
> The only thing Mr. Akiyama Nobuyuk and I have in common is we both would like to
> do
> something before crash dumping, simply because the less mess we will have to
> cleanup
> afterwards in the system taking over, the better.
>
> Mr. Akiyama Nobuyuk operates on SCSI devices to avoid filesystem corruptions.
> My usage would be more like notifying external management to get traffic
> redirected to server systems taking over.

Ok. That resolves some of my confusion.

After thinking this over, here is my position.

There may be cases where it is warranted to add a call during crash_kexec.
I have seen no evidence that the cases where we want something happening
in crash_kexec are going to be at all common. It is my opinion anything
added to the crash_kexec path needs a case by case review.

Therefore if something needs to happen in the crash kexec path it
should be a direct function call.  No pointers and no hooks.  Just call
the function.

Patches that add an explicit function call allow for case by case review
and convey the message that you really don't want to do that, and that we
are really dealing with an exceptional circumstance.

Does this sound like a reasonable position?

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 16:49             ` Eric W. Biederman
@ 2006-06-19 17:07               ` Vivek Goyal
  2006-06-19 17:50                 ` Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Vivek Goyal @ 2006-06-19 17:07 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Preben Traerup, fastboot, linux-kernel

On Mon, Jun 19, 2006 at 10:49:32AM -0600, Eric W. Biederman wrote:
> Preben Traerup <Preben.Trarup@ericsson.com> writes:
> 
> > Strictly speaking for myself: Nothing.
> >
> > Mr. Akiyama Nobuyuk gave an example from his environment which is cluster
> > systems.
> > I was the one saying we in our Telco systems could use this feature too.
> >
> > The only thing Mr. Akiyama Nobuyuk and I have in common is we both would like to
> > do
> > something before crash dumping, simply because the less mess we will have to
> > cleanup
> > afterwards in the system taking over, the better.
> >
> > Mr. Akiyama Nobuyuk operates on SCSI devices to avoid filesystem corruptions.
> > My usage would be more like notifying external management to get traffic
> > redirected to server systems taking over.
> 
> Ok. That resolves some of my confusion.
> 
> After think this over here is my position.
> 
> There may be cases where it is warranted to add a call during crash_kexec.
> I have seen no evidence that the cases where we want something happening
> in crash_kexec are going to be at all common. It is my opinion anything
> added to the crash_kexec path needs a case by case review.
> 
> Therefore if something is needs to happen in the crash kexec path it
> should be a direct function call.  No pointers and no hooks.  Just call
> the function.
> 
> Patches that and add an explicit function call allow for case by case review
> and convey the message that you really don't want to do that, and that we
> are really dealing with an exceptional circumstance.
> 
> Does this sound like a reasonable position?

Sounds like trouble for modules. I am assuming that code to power down the
scsi disks/controller will be part of the driver, which is generally built
as a module and also assuming that powering down the disks is a valid
requirement after the crash.

After introducing an option to disable/enable crash notifiers from user
space, I think the responsibility now lies with the user. If he chooses to
enable the notifiers, he understands that there is a chance that we never
boot into the next kernel and get lost in between.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 17:07               ` Vivek Goyal
@ 2006-06-19 17:50                 ` Eric W. Biederman
  2006-06-19 18:19                   ` Vivek Goyal
  2006-06-20  3:46                   ` Akiyama, Nobuyuki
  0 siblings, 2 replies; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-19 17:50 UTC (permalink / raw)
  To: vgoyal; +Cc: Preben Traerup, fastboot, linux-kernel

Vivek Goyal <vgoyal@in.ibm.com> writes:

> On Mon, Jun 19, 2006 at 10:49:32AM -0600, Eric W. Biederman wrote:
>
> Sounds like trouble for modules. I am assuming that code to power down the
> scsi disks/controller will be part of the driver, which is generally built
> as a module and also assuming that powering down the disks is a valid
> requirement after the crash.

I'm assuming if anything is important and critical enough to be in a crash
notifier it can be built into the kernel.

> After introducing an option to disable/enable crash notifiers from user
> space I think the responsibility now lies with the user. If he chooses to enable
> the notifiers, he understands that there are chances that we never boot
> into the next kernel and get lost in between. 

At the moment this is a lot of infrastructure for a vaguely defined
case that I have yet to see defined.

One of the reasons for using kexec for this kind of activity was precisely
because it doesn't do any of this when the kernel is known to be
broken.

Having notifiers and being able to disable them is designing for an
unspecified case.  We need to concentrate on the fundamentals here.
Do any of these crash notifiers make sense?

If, after the notifiers are well understood, it makes sense to
add a general framework etc, then I'm for it.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 17:50                 ` Eric W. Biederman
@ 2006-06-19 18:19                   ` Vivek Goyal
  2006-06-19 18:45                     ` Eric W. Biederman
  2006-06-20  3:46                   ` Akiyama, Nobuyuki
  1 sibling, 1 reply; 15+ messages in thread
From: Vivek Goyal @ 2006-06-19 18:19 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Preben Traerup, fastboot, linux-kernel

On Mon, Jun 19, 2006 at 11:50:24AM -0600, Eric W. Biederman wrote:
> Vivek Goyal <vgoyal@in.ibm.com> writes:
> 
> > On Mon, Jun 19, 2006 at 10:49:32AM -0600, Eric W. Biederman wrote:
> >
> > Sounds like trouble for modules. I am assuming that code to power down the
> > scsi disks/controller will be part of the driver, which is generally built
> > as a module and also assuming that powering down the disks is a valid
> > requirement after the crash.
> 
> I'm assuming if anything is important and critical enough to be in a crash
> notifier it can be built into the kernel.
> 
> > After introducing an option to disable/enable crash notifiers from user
> > space I think the responsibility now lies with the user. If he chooses to enable
> > the notifiers, he understands that there are chances that we never boot
> > into the next kernel and get lost in between. 
> 
> At the moment this is a lot of infrastructure for a vaguely defined
> case that I have yet to see defined.
> 
> One of the reasons for using kexec for this kind of activity was precisely
> because it doesn't do any of this when the kernel is known to be
> broken.
> 
> Having notifiers and being able to disable them is designing for an
> unspecified case.  We need to concentrate on the fundamentals here.
> Do any of these crash notifiers make sense?
> 

Agreed. That makes sense. Probably folks who want this functionality
should also post the code which they would like to run from inside the
notifiers so that requirement is understood more clearly.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 18:19                   ` Vivek Goyal
@ 2006-06-19 18:45                     ` Eric W. Biederman
  0 siblings, 0 replies; 15+ messages in thread
From: Eric W. Biederman @ 2006-06-19 18:45 UTC (permalink / raw)
  To: vgoyal; +Cc: Preben Traerup, fastboot, linux-kernel

Vivek Goyal <vgoyal@in.ibm.com> writes:

> On Mon, Jun 19, 2006 at 11:50:24AM -0600, Eric W. Biederman wrote:
>> 
>> Having notifiers and being able to disable them is designing for an
>> unspecified case.  We need to concentrate on the fundamentals here.
>> Do any of these crash notifiers make sense?
>> 
>
> Agreed. That makes sense. Probably folks who want this functionality
> should also post the code which they would like to run from inside the
> notifiers so that requirement is understood more clearly.

Which is why I will be happy to see patches that call the functions directly.
Without the notification layer.

If they make sense I will even be happy to see those patches go upstream.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 13:28           ` Preben Traerup
  2006-06-19 16:49             ` Eric W. Biederman
@ 2006-06-20  3:39             ` Akiyama, Nobuyuki
  1 sibling, 0 replies; 15+ messages in thread
From: Akiyama, Nobuyuki @ 2006-06-20  3:39 UTC (permalink / raw)
  To: Preben Traerup; +Cc: ebiederm, fastboot, linux-kernel

On Mon, 19 Jun 2006 15:28:23 +0200
Preben Traerup <Preben.Trarup@ericsson.com> wrote:

> Strictly speaking for myself: Nothing.
> 
> Mr. Akiyama Nobuyuk gave an example from his environment which is cluster systems.
> I was the one saying we in our Telco systems could use this feature too.
> 
> The only thing Mr. Akiyama Nobuyuk and I have in common is we both would like to do
> something before crash dumping, simply because the less mess we will have to cleanup
> afterwards in the system taking over, the better.
> 
> Mr. Akiyama Nobuyuk operates on SCSI devices to avoid filesystem corruptions.
> My usage would be more like notifying external management to get traffic 
> redirected to server systems taking over.
> 

Thanks, Preben.
I appreciate your good interpretation;-)

Thanks,
--
Akiyama, Nobuyuki


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-06-19 17:50                 ` Eric W. Biederman
  2006-06-19 18:19                   ` Vivek Goyal
@ 2006-06-20  3:46                   ` Akiyama, Nobuyuki
  1 sibling, 0 replies; 15+ messages in thread
From: Akiyama, Nobuyuki @ 2006-06-20  3:46 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: vgoyal, Preben.Trarup, fastboot, linux-kernel

On Mon, 19 Jun 2006 11:50:24 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> Vivek Goyal <vgoyal@in.ibm.com> writes:
> 
> > On Mon, Jun 19, 2006 at 10:49:32AM -0600, Eric W. Biederman wrote:
> >
> > Sounds like trouble for modules. I am assuming that code to power down the
> > scsi disks/controller will be part of the driver, which is generally built
> > as a module and also assuming that powering down the disks is a valid
> > requirement after the crash.
> 
> I'm assuming if anything is important and critical enough to be in a crash
> notifier it can be built into the kernel.

No. On enterprise systems, commercial distributions are generally
used and we usually cannot modify the kernel code.
That is the reason why I need a generic add-on hook.
I have given an actual usage, and so has Preben. I think it is difficult
to define the precise characteristics of the notifier at the desk.

Thanks,
--
Akiyama, Nobuyuki


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Fastboot] [PATCH] kdump: add a missing notifier before crashing
  2006-07-04 12:33 Akiyama, Nobuyuki
@ 2006-07-05 16:14 ` Eric W. Biederman
  0 siblings, 0 replies; 15+ messages in thread
From: Eric W. Biederman @ 2006-07-05 16:14 UTC (permalink / raw)
  To: Akiyama, Nobuyuki; +Cc: fastboot, linux-kernel

"Akiyama, Nobuyuki" <akiyama.nobuyuk@jp.fujitsu.com> writes:

> Hi all,
>
> The attached patch adds a missing notifier before crashing.
NAK

It's not missing.  It should not exist.

> This patch is remade for 2.6.17-git22.
> I tested this patch on a i386-box.
>
> Please refer to the previous discussions for details:
> http://lists.osdl.org/pipermail/fastboot/2006-May/003018.html
> http://lists.osdl.org/pipermail/fastboot/2006-June/003113.html
>
> Description:
> We don't have a simple and light weight way to know the
> kernel dies. The panic notifier is not called if kdump
> is activated because crash_kexec() does not return,
> and there is no mechanism to notify of a crash before
> crashing by SysRq-c.
> Although notify_die() exists, the function depends on
> architecture. If notify_die() is added in panic and SysRq
> respectively like existing implementation, the code will be
> very ugly. I think that adding a generic hook in crash_kexec()
> is better to simplify the code.
>
> For example, the clustering system can take advantage of this
> notifier. On a mission critical system, failover needs to start
> within a few milliseconds. The notifier could be called on
> the 2nd kernel, but it is of no use because it takes on the
> order of seconds to boot up.
>
> On an actual system, the notifier turns off HBA's power to
> stop accessing shared disk, and then notifies standby node
> that the current node died. 

And again NAK.
Just call the stupid HBA routine directly if this is necessary.
The call can compile to nothing when your HBA is not compiled in.

This is completely unacceptable until we see the code that you are
calling.

If we do export your notifier list it needs to be at least a
GPL only export as this is very much in the guts of the kernel.

As written this seriously destabilizes the kexec on panic support.

I will be happy to have a solution to this problem but not this solution.
Especially not without an in-kernel user.

Nacked-by: Eric Biederman <ebiederm@xmission.com>

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2006-07-05 16:14 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-15 11:16 [PATCH] kdump: add a missing notifier before crashing Akiyama, Nobuyuki
2006-06-16  6:28 ` [Fastboot] " Eric W. Biederman
2006-06-16 12:15   ` Akiyama, Nobuyuki
2006-06-16 16:37     ` Eric W. Biederman
2006-06-19  7:30       ` Akiyama, Nobuyuki
2006-06-19 12:47         ` Eric W. Biederman
2006-06-19 13:28           ` Preben Traerup
2006-06-19 16:49             ` Eric W. Biederman
2006-06-19 17:07               ` Vivek Goyal
2006-06-19 17:50                 ` Eric W. Biederman
2006-06-19 18:19                   ` Vivek Goyal
2006-06-19 18:45                     ` Eric W. Biederman
2006-06-20  3:46                   ` Akiyama, Nobuyuki
2006-06-20  3:39             ` Akiyama, Nobuyuki
  -- strict thread matches above, loose matches on Subject: below --
2006-07-04 12:33 Akiyama, Nobuyuki
2006-07-05 16:14 ` [Fastboot] " Eric W. Biederman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox