linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
@ 2010-12-22 23:35 Seiji Aguchi
  2010-12-23  0:29 ` Greg KH
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Seiji Aguchi @ 2010-12-22 23:35 UTC (permalink / raw)
  To: rdunlap@xenotime.net, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, x86@kernel.org, ebiederm@xmission.com,
	andi@firstfloor.org, akpm@linuxfoundation.org,
	eugeneteo@kernel.org, kees.cook@canonical.com,
	drosenberg@vsecurity.com, ying.huang@intel.com,
	len.brown@intel.com, seto.hidetoshi@jp.fujitsu.com,
	paulmck@linux.vnet.ibm.com, gregkh@suse.de, davem@davemloft.net,
	hadi@cyberus.ca, hawk@comx.dk, opurdila@ixiacom.com,
	hidave.darkstar@gmail.com, dzickus@redhat.com,
	eric.dumazet@gmail.com, ext-andriy.shevchenko@nokia.com,
	tj@kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, kexec@lists.infradead.org,
	linux-mm@kvack.org, dle-develop@lists.sourceforge.net
  Cc: Satoru Moriya, Seiji Aguchi

Hi,

[Purpose]
Kexec may trigger additional hardware errors and multiply the damage
if it runs after an MCE has occurred, because kexec performs
hardware-related operations such as the following:
  - Sending NMIs to CPUs.
  - Initializing hardware during the boot process of the second kernel.
  - Accessing memory and dumping it to disk.

So, I propose adding a new option controlling kexec behaviour when an
MCE has occurred.
This patch prevents unnecessary hardware errors and avoids expanding
the damage.

[Patch Description]
I added a sysctl option, kernel.kexec_on_mce, controlling kexec behaviour
when an MCE has occurred.

 - Permission
   - 0644
 - Value(default is "1")
   - non-zero: Kexec is enabled regardless of MCE.
   - 0: Kexec is disabled when MCE occurred.

Matrix of kernel.kexec_on_mce value, MCE and kexec behaviour

--------------------------------------------------
kernel.kexec_on_mce| MCE          | kexec behaviour
--------------------------------------------------
non-zero           | occurred     | enabled
                   -------------------------------
                   | not occurred | enabled
--------------------------------------------------
0                  | occurred     | disabled
                   |------------------------------
                   | not occurred | enabled
--------------------------------------------------

Any comments and suggestions are welcome.

Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>

---
 Documentation/sysctl/kernel.txt  |   12 ++++++++++++
 arch/x86/include/asm/mce.h       |    2 ++
 arch/x86/kernel/cpu/mcheck/mce.c |    4 ++++
 include/linux/sysctl.h           |    1 +
 kernel/kexec.c                   |    7 +++++++
 kernel/sysctl.c                  |   12 ++++++++++++
 kernel/sysctl_binary.c           |    1 +
 mm/memory-failure.c              |    9 +++++++++
 8 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 209e158..ce3240e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kexec_on_mce                [ X86 only ]
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -261,6 +262,17 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kexec_on_mce: (X86 only)
+
+Controls the kexec behaviour when MCE occurred.
+Default value is 1.
+
+0: Kexec is disabled when MCE occurred.
+non-zero: Kexec is enabled regardless of MCE.
+
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13c..062dabd 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -123,6 +123,8 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
 
 extern int mce_disabled;
 extern int mce_p5_enabled;
+extern int kexec_on_mce;
+extern int mce_flag;
 
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a35b72..edbaf77 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -85,6 +85,8 @@ static int			mce_dont_log_ce		__read_mostly;
 int				mce_cmci_disabled	__read_mostly;
 int				mce_ignore_ce		__read_mostly;
 int				mce_ser			__read_mostly;
+int				kexec_on_mce = 1;
+int				mce_flag;
 
 struct mce_bank                *mce_banks		__read_mostly;
 
@@ -944,6 +946,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	percpu_inc(mce_exception_count);
 
+	mce_flag = 1;
+
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
 		goto out;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb6..0ebe708 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
 	KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_KEXEC_ON_MCE=77, /* int: whether we will dump memory on mce */
 };
 
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045b..3e5c41a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -39,6 +39,7 @@
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/sections.h>
+#include <asm/mce.h>
 
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
@@ -1074,6 +1075,12 @@ void crash_kexec(struct pt_regs *regs)
 	 * of memory the xchg(&kexec_crash_image) would be
 	 * sufficient.  But since I reuse the memory...
 	 */
+#ifdef CONFIG_X86_MCE
+	if (!kexec_on_mce && mce_flag) {
+		printk(KERN_WARNING "Kexec is disabled because MCE occurred\n");
+		return;
+	}
+#endif
 	if (mutex_trylock(&kexec_mutex)) {
 		if (kexec_crash_image) {
 			struct pt_regs fixed_regs;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa15..3a64cd6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,9 @@
 #include <linux/nmi.h>
 #endif
 
+#ifdef CONFIG_X86_MCE
+#include <asm/mce.h>
+#endif
 
 #if defined(CONFIG_SYSCTL)
 
@@ -963,6 +966,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#if defined(CONFIG_X86_MCE)
+	{
+		.procname	= "kexec_on_mce",
+		.data		= &kexec_on_mce,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..a25f971 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -138,6 +138,7 @@ static const struct bin_table bin_kern_table[] = {
 	{ CTL_INT,	KERN_MAX_LOCK_DEPTH,		"max_lock_depth" },
 	{ CTL_INT,	KERN_NMI_WATCHDOG,		"nmi_watchdog" },
 	{ CTL_INT,	KERN_PANIC_ON_NMI,		"panic_on_unrecovered_nmi" },
+	{ CTL_INT,	KERN_KEXEC_ON_MCE,		"kexec_on_mce" },
 	{}
 };
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c0..3ec075a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,11 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+
+#ifdef CONFIG_X86_MCE
+#include <asm/mce.h>
+#endif
+
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -949,6 +954,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	int res;
 	unsigned int nr_pages;
 
+#ifdef CONFIG_X86_MCE
+	mce_flag = 1;
+#endif
+
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
 
-- 
1.7.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-22 23:35 [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred Seiji Aguchi
@ 2010-12-23  0:29 ` Greg KH
  2010-12-23  7:43 ` Andi Kleen
  2010-12-27  1:56 ` Hidetoshi Seto
  2 siblings, 0 replies; 11+ messages in thread
From: Greg KH @ 2010-12-23  0:29 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: rdunlap@xenotime.net, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, x86@kernel.org, ebiederm@xmission.com,
	andi@firstfloor.org, akpm@linuxfoundation.org,
	eugeneteo@kernel.org, kees.cook@canonical.com,
	drosenberg@vsecurity.com, ying.huang@intel.com,
	len.brown@intel.com, seto.hidetoshi@jp.fujitsu.com,
	paulmck@linux.vnet.ibm.com, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

On Wed, Dec 22, 2010 at 06:35:40PM -0500, Seiji Aguchi wrote:
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -81,6 +81,9 @@
>  #include <linux/nmi.h>
>  #endif
>  
> +#ifdef CONFIG_X86_MCE
> +#include <asm/mce.h>
> +#endif

Please don't put ifdefs in .c files, you do that a lot for this option.
Just make it work on all platforms and then you will not need the
#ifdef.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-22 23:35 [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred Seiji Aguchi
  2010-12-23  0:29 ` Greg KH
@ 2010-12-23  7:43 ` Andi Kleen
  2010-12-23  9:18   ` Borislav Petkov
  2010-12-27  1:56 ` Hidetoshi Seto
  2 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2010-12-23  7:43 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: rdunlap@xenotime.net, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, x86@kernel.org, ebiederm@xmission.com,
	andi@firstfloor.org, akpm@linuxfoundation.org,
	eugeneteo@kernel.org, kees.cook@canonical.com,
	drosenberg@vsecurity.com, ying.huang@intel.com,
	len.brown@intel.com, seto.hidetoshi@jp.fujitsu.com,
	paulmck@linux.vnet.ibm.com, gregkh@suse.de, davem@davemloft.net,
	hadi@cyberus.ca, hawk@comx.dk, opurdila@ixiacom.com,
	hidave.darkstar@gmail.com, dzickus@redhat.com,
	eric.dumazet@gmail.com, ext-andriy.shevchenko@nokia.com,
	tj@kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, kexec@lists.infradead.org,
	linux-mm@kvack.org, dle-develop@lists.sourceforge.net,
	Satoru Moriya



>   - Accessing to memory and dumping it to disks.

A better solution for this is

http://git.kernel.org/?p=linux/kernel/git/ak/linux-mce-2.6.git;a=commitdiff;h=fe61906edce9e70d02481a77a617ba1397573dce
and
http://git.kernel.org/?p=linux/kernel/git/ak/linux-mce-2.6.git;a=commit;h=cb58f049ae6709ddbab71be199390dc6852018cd

I'm not a big friend of sysctls for things like this -- either the behaviour
makes sense and should be default or not.

-Andi


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-23  7:43 ` Andi Kleen
@ 2010-12-23  9:18   ` Borislav Petkov
  2010-12-23 17:31     ` Seiji Aguchi
  0 siblings, 1 reply; 11+ messages in thread
From: Borislav Petkov @ 2010-12-23  9:18 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Seiji Aguchi, rdunlap@xenotime.net, tglx@linutronix.de,
	mingo@redhat.com, hpa@zytor.com, x86@kernel.org,
	ebiederm@xmission.com, akpm@linuxfoundation.org,
	eugeneteo@kernel.org, kees.cook@canonical.com,
	drosenberg@vsecurity.com, ying.huang@intel.com,
	len.brown@intel.com, seto.hidetoshi@jp.fujitsu.com,
	paulmck@linux.vnet.ibm.com, gregkh@suse.de, davem@davemloft.net,
	hadi@cyberus.ca, hawk@comx.dk, opurdila@ixiacom.com,
	hidave.darkstar@gmail.com, dzickus@redhat.com,
	eric.dumazet@gmail.com, ext-andriy.shevchenko@nokia.com,
	tj@kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, kexec@lists.infradead.org,
	linux-mm@kvack.org, dle-develop@lists.sourceforge.net,
	Satoru Moriya

On Thu, Dec 23, 2010 at 08:43:39AM +0100, Andi Kleen wrote:
> 
> 
> >   - Accessing to memory and dumping it to disks.
> 
> A better solution for this is
> 
> http://git.kernel.org/?p=linux/kernel/git/ak/linux-mce-2.6.git;a=commitdiff;h=fe61906edce9e70d02481a77a617ba1397573dce
> and
> http://git.kernel.org/?p=linux/kernel/git/ak/linux-mce-2.6.git;a=commit;h=cb58f049ae6709ddbab71be199390dc6852018cd
> 
> I'm not a big friend of sysctls for things like this -- either the behaviour
> makes sense and should be default or not.

This doesn't add up. AFAICT, you're disabling MCE reporting for crash
dumps and the original patch's intention was to control whether kexec
should run after a machine check. And I agree with Greg that this
shouldn't be configurable but instead on by default - if you get a
critical error and you cannot guarantee a stable system anymore, kexec
shouldn't start at all. That simple.

Thanks.

-- 
Regards/Gruss,
    Boris.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-23  9:18   ` Borislav Petkov
@ 2010-12-23 17:31     ` Seiji Aguchi
  2010-12-23 19:56       ` Eric W. Biederman
  0 siblings, 1 reply; 11+ messages in thread
From: Seiji Aguchi @ 2010-12-23 17:31 UTC (permalink / raw)
  To: Borislav Petkov, Andi Kleen
  Cc: rdunlap@xenotime.net, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, x86@kernel.org, ebiederm@xmission.com,
	akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

Hi,

I agree with Borislav that kexec shouldn't start at all because we can't guarantee 
a stable system anymore when MCE is reported.

On the other hand, I understand there are people like Andi who want to start kexec 
even if MCE occurred.

That is why I propose adding a new option controlling kexec behaviour when MCE occurred.

I am not attached to using a sysctl.
I suggest adding a new boot parameter instead of a sysctl, because users
can't change their configuration once the boot parameter is set.

I will resend the patch if it is acceptable.

Regards,

Seiji

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-23 17:31     ` Seiji Aguchi
@ 2010-12-23 19:56       ` Eric W. Biederman
  2010-12-25 14:56         ` Seiji Aguchi
  0 siblings, 1 reply; 11+ messages in thread
From: Eric W. Biederman @ 2010-12-23 19:56 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: Borislav Petkov, Andi Kleen, rdunlap@xenotime.net,
	tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
	x86@kernel.org, akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

Seiji Aguchi <seiji.aguchi@hds.com> writes:

> Hi,
>
> I agree with Borislav that kexec shouldn't start at all because we can't guarantee 
> a stable system anymore when MCE is reported.

In the case of kexec on panic we can never guarantee a stable system.
But the odds are much better of executing non-corrupt code  and of
telling people you had a hardware error if you go through the kexec
on panic process.

If I read Andi's patch correctly he was suggesting to not allow any more
mces to be reported on that path.


> On the other hand, I understand there are people like Andi who want to start kexec 
> even if MCE occurred.
>
> That is why I propose adding a new option controlling kexec behaviour
> when MCE occurred.

What do you gain by not doing the kexec on panic, when you have the
system configured to take one?  We already have the big policy knobs
to enable or disable this kind of behavior.

> I don't stick to "sysctl".

I think adding a sysctl in this path or any unnecessary code will make
things less reliable.

Last time this happened to me (about a week ago), the kexec on panic
from an ECC-reported memory error worked just fine.  That is, in the
real world it seems to work.

So what is the problem you are trying to avoid, and why can't we do
something in the kernels initialization path to avoid initializing
when there is a problem?

Eric

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-23 19:56       ` Eric W. Biederman
@ 2010-12-25 14:56         ` Seiji Aguchi
  2010-12-25 17:19           ` Eric W. Biederman
  0 siblings, 1 reply; 11+ messages in thread
From: Seiji Aguchi @ 2010-12-25 14:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Borislav Petkov, Andi Kleen, rdunlap@xenotime.net,
	tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
	x86@kernel.org, akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

Hi,

Thank you for giving your comments.

>So what is the problem you are trying to avoid, and why can't we do
>something in the kernels initialization path to avoid initializing
>when there is a problem?

Kdump gets a dump disk identifier based on information from memory.

So, kdump may receive a wrong identifier when it starts after an MCE
has occurred, because an MCE is raised for memory, cache, and TLB errors.

In the worst case, kdump will overwrite user data if it recognizes a 
disk saving user data as a dump disk.

Kdump shouldn't write any data to disk when information from the
hardware is not credible, because preserving user data is always the
first priority.

Seiji

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-25 14:56         ` Seiji Aguchi
@ 2010-12-25 17:19           ` Eric W. Biederman
  2010-12-25 18:33             ` H. Peter Anvin
  0 siblings, 1 reply; 11+ messages in thread
From: Eric W. Biederman @ 2010-12-25 17:19 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: Borislav Petkov, Andi Kleen, rdunlap@xenotime.net,
	tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
	x86@kernel.org, akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

Seiji Aguchi <seiji.aguchi@hds.com> writes:

> Hi,
>
> Thank you for giving your comments.
>
>>So what is the problem you are trying to avoid, and why can't we do
>>something in the kernels initialization path to avoid initializing
>>when there is a problem?
>
> Kdump gets a dump disk identifier based on information from memory.
>
> So, kdump may receive wrong identifier when it starts after MCE 
> occurred, because MCE is reported by memory, cache, and TLB errors
>
> In the worst case, kdump will overwrite user data if it recognizes a 
> disk saving user data as a dump disk.

This is absurdly unlikely: there is a sha256 checksum verified over the
kdump kernel before it starts booting.  If you have very broken
memory it is possible, but it is absurdly unlikely that the machine will
even boot if you are having enough uncorrectable memory errors
an hour to get past the sha256 checksum and then be corrupt.

> Kdump shouldn't write any data to disk when information from
> hardware is incredible because saving user data is always first 
> priority.

Which is what is already implemented.

It looks to me like you are jumping at shadows, and adding
complexity to the kernel with no gain, and significant cost.


Eric

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-25 17:19           ` Eric W. Biederman
@ 2010-12-25 18:33             ` H. Peter Anvin
  2010-12-25 21:40               ` Eric W. Biederman
  0 siblings, 1 reply; 11+ messages in thread
From: H. Peter Anvin @ 2010-12-25 18:33 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Seiji Aguchi, Borislav Petkov, Andi Kleen, rdunlap@xenotime.net,
	tglx@linutronix.de, mingo@redhat.com, x86@kernel.org,
	akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

On 12/25/2010 09:19 AM, Eric W. Biederman wrote:
>>
>> So, kdump may receive wrong identifier when it starts after MCE 
>> occurred, because MCE is reported by memory, cache, and TLB errors
>>
>> In the worst case, kdump will overwrite user data if it recognizes a 
>> disk saving user data as a dump disk.
> 
> Absurdly unlikely there is a sha256 checksum verified over the
> kdump kernel before it starts booting.  If you have very broken
> memory it is possible, but absurdly unlikely that the machine will
> even boot if you are having enough uncorrectable memory errors
> an hour to get past the sha256 checksum and then be corruppt.
> 

That wouldn't be the likely scenario (passing a sha256 checksum with the
wrong data due to a random event will never happen for all the computers
on Earth before the Sun destroys the planet).  However, in a
failing-memory scenario, the much more likely scenario is that kdump
starts up, verifies the signature, and *then* has corruption causing it
to write to the wrong disk or whatnot.  This is inherent in any scheme
that allows writing to hard media after a failure (as opposed to, say,
dumping to the network.)

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-25 18:33             ` H. Peter Anvin
@ 2010-12-25 21:40               ` Eric W. Biederman
  0 siblings, 0 replies; 11+ messages in thread
From: Eric W. Biederman @ 2010-12-25 21:40 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Seiji Aguchi, Borislav Petkov, Andi Kleen, rdunlap@xenotime.net,
	tglx@linutronix.de, mingo@redhat.com, x86@kernel.org,
	akpm@linuxfoundation.org, eugeneteo@kernel.org,
	kees.cook@canonical.com, drosenberg@vsecurity.com,
	ying.huang@intel.com, len.brown@intel.com,
	seto.hidetoshi@jp.fujitsu.com, paulmck@linux.vnet.ibm.com,
	gregkh@suse.de, davem@davemloft.net, hadi@cyberus.ca,
	hawk@comx.dk, opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

"H. Peter Anvin" <hpa@zytor.com> writes:

> On 12/25/2010 09:19 AM, Eric W. Biederman wrote:
>>>
>>> So, kdump may receive wrong identifier when it starts after MCE 
>>> occurred, because MCE is reported by memory, cache, and TLB errors
>>>
>>> In the worst case, kdump will overwrite user data if it recognizes a 
>>> disk saving user data as a dump disk.
>> 
>> Absurdly unlikely there is a sha256 checksum verified over the
>> kdump kernel before it starts booting.  If you have very broken
>> memory it is possible, but absurdly unlikely that the machine will
>> even boot if you are having enough uncorrectable memory errors
>> an hour to get past the sha256 checksum and then be corruppt.
>> 
>
> That wouldn't be the likely scenario (passing a sha256 checksum with the
> wrong data due to a random event will never happen for all the computers
> on Earth before the Sun destroys the planet).  However, in a
> failing-memory scenario, the much more likely scenario is that kdump
> starts up, verifies the signature, and *then* has corruption causing it
> to write to the wrong disk or whatnot.  This is inherent in any scheme
> that allows writing to hard media after a failure (as opposed to, say,
> dumping to the network.)

Then kdump kernel should also panic if we detect an uncorrected ECC
error.  So this doesn't appear to open any new holes for disk corruption.

kexec on panic can also be used for taking crash dumps over the
network.  What happens with the data is totally defined by userspace
code in an initrd.

Which is why extra policy knobs should be where they can be used.

Eric

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred
  2010-12-22 23:35 [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred Seiji Aguchi
  2010-12-23  0:29 ` Greg KH
  2010-12-23  7:43 ` Andi Kleen
@ 2010-12-27  1:56 ` Hidetoshi Seto
  2 siblings, 0 replies; 11+ messages in thread
From: Hidetoshi Seto @ 2010-12-27  1:56 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: rdunlap@xenotime.net, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, x86@kernel.org, ebiederm@xmission.com,
	andi@firstfloor.org, akpm@linuxfoundation.org,
	eugeneteo@kernel.org, kees.cook@canonical.com,
	drosenberg@vsecurity.com, ying.huang@intel.com,
	len.brown@intel.com, paulmck@linux.vnet.ibm.com, gregkh@suse.de,
	davem@davemloft.net, hadi@cyberus.ca, hawk@comx.dk,
	opurdila@ixiacom.com, hidave.darkstar@gmail.com,
	dzickus@redhat.com, eric.dumazet@gmail.com,
	ext-andriy.shevchenko@nokia.com, tj@kernel.org,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	dle-develop@lists.sourceforge.net, Satoru Moriya

(2010/12/23 8:35), Seiji Aguchi wrote:
> Hi,
> 
> [Purpose]
> Kexec may trigger additional hardware errors and multiply the damage 
> if it works after MCE occurred because there are some hardware-related 
> operations in kexec as follows.
>   - Sending NMI to cpus
>   - Initializing hardware during boot process of second kernel.
>   - Accessing to memory and dumping it to disks.
> 
> So, I propose adding a new option controlling kexec behaviour when MCE 
> occurred.
> This patch prevents unnecessary hardware errors and avoid expanding 
> the damage.
> 
> [Patch Description]
> I added a sysctl option ,kernel.kexec_on_mce, controlling kexec behaviour 
> when MCE occurred.
> 
>  - Permission
>    - 0644
>  - Value(default is "1")
>    - non-zero: Kexec is enabled regardless of MCE.
>    - 0: Kexec is disabled when MCE occurred.
> 
> Matrix of kernel.kexec_on_mce value, MCE and kexec behaviour
> 
> --------------------------------------------------
> kernel.kexec_on_mce| MCE          | kexec behaviour
> --------------------------------------------------
> non-zero           | occurred     | enabled
>                    -------------------------------
>                    | not occurred | enabled
> --------------------------------------------------
> 0                  | occurred     | disabled
>                    |------------------------------
>                    | not occurred | enabled
> --------------------------------------------------
> 
> Any comments and suggestions are welcome.

This reminds me of a quite similar patch that I've made a long time ago
but haven't posted.

Following is what I found still in a branch of my private git tree.
I guess it cannot be applied without rebase, but I think the description
of my patch could give you some different point of view etc.
Feel free to use this debris to improve yours.


Thanks,
H.Seto

<*__NOTE_THIS_PATCH_IS_NOT_READY_TO_APPLY__*>
=====
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Fri, 10 Jul 2009 15:55:42 +0900
Subject: [PATCH] kdump, sysctl: kdump_on_safe

This patch adds a sysctl, kdump_on_safe, to limit kdump to run only
in safe situations.

Quote from document in this patch:
 > kdump_on_safe:
 >
 > When the system experiences panic, kdump will be triggered if
 > crash kernel is configured.  However the kdump might fail if
 > the panic was caused by fatal error, such as hardware error
 > reported by machine check exception.  It should be rare case,
 > but in the worst case, it will result in data corruption and/or
 > fatal damage on the hardware.
 >
 > If this flag is 1, it prevents kdump from running on such
 > unstable system situation.  Default is 0.

This can be a useful option if your hardware can provide a good error
report (in the SEL etc.) and/or the kernel can provide enough other data
for error investigation (console log, mcelog on x86 etc.), and you'd like
to reduce downtime by skipping kdump in such a situation.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
 Documentation/sysctl/kernel.txt  |   15 +++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c |    3 +++
 include/linux/kexec.h            |    3 +++
 kernel/kexec.c                   |    8 ++++++++
 kernel/sysctl.c                  |   13 +++++++++++++
 5 files changed, 42 insertions(+), 0 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 3894eaa..9d66ab9 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kdump_on_safe               [ kexec ]
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -247,6 +248,20 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kdump_on_safe:
+
+When the system experiences panic, kdump will be triggered if
+crash kernel is configured.  However the kdump might fail if
+the panic was caused by fatal error, such as hardware error
+reported by machine check exception.  It should be rare case,
+but in the worst case, it will result in data corruption and/or
+fatal damage on the hardware.
+
+If this flag is 1, it prevents kdump from running on such
+unstable system situation.  Default is 0.
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3e2ab18..c93bb38 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -23,6 +23,7 @@
 #include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
+#include <linux/kexec.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
@@ -291,6 +292,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	int cpu;
 
 	if (!fake_panic) {
+		set_kdump_might_fail();
+
 		/*
 		 * Make sure only one CPU runs in machine check panic
 		 */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 03e8e8d..41e9ab0 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -209,10 +209,13 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 
+extern int kdump_might_fail;
+static inline void set_kdump_might_fail(void) { kdump_might_fail = 1; }
 #else /* !CONFIG_KEXEC */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+static inline void set_kdump_might_fail(void) { }
 #endif /* CONFIG_KEXEC */
 #endif /* LINUX_KEXEC_H */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8a..182c2f3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,6 +40,9 @@
 #include <asm/system.h>
 #include <asm/sections.h>
 
+int kdump_on_safe;
+int kdump_might_fail;
+
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
@@ -1064,6 +1067,11 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 
 void crash_kexec(struct pt_regs *regs)
 {
+	if (kdump_on_safe && kdump_might_fail) {
+		printk(KERN_EMERG "kexec cancelled due to unstable system.\n");
+		return;
+	}
+
 	/* Take the kexec_mutex here to prevent sys_kexec_load
 	 * running on one cpu from replacing the crash kernel
 	 * we are using after a panic on a different cpu.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f..8564e5c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -156,6 +156,10 @@ extern int unaligned_dump_stack;
 
 extern struct ratelimit_state printk_ratelimit_state;
 
+#ifdef CONFIG_KEXEC
+extern int kdump_on_safe;
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -926,6 +930,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_KEXEC
+	{
+		.procname	= "kdump_on_safe",
+		.data		= &kdump_on_safe,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
1.7.3.2
</*__NOTE_THIS_PATCH_IS_NOT_READY_TO_APPLY__*>



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-12-27  2:03 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-22 23:35 [RFC][PATCH] Add a sysctl option controlling kexec when MCE occurred Seiji Aguchi
2010-12-23  0:29 ` Greg KH
2010-12-23  7:43 ` Andi Kleen
2010-12-23  9:18   ` Borislav Petkov
2010-12-23 17:31     ` Seiji Aguchi
2010-12-23 19:56       ` Eric W. Biederman
2010-12-25 14:56         ` Seiji Aguchi
2010-12-25 17:19           ` Eric W. Biederman
2010-12-25 18:33             ` H. Peter Anvin
2010-12-25 21:40               ` Eric W. Biederman
2010-12-27  1:56 ` Hidetoshi Seto

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).