[PATCH RFC] NMI Re-introduce un[set]_nmi

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
@ 2008-09-04 13:07 Prarit Bhargava
  2008-09-04 13:37 ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Prarit Bhargava @ 2008-09-04 13:07 UTC (permalink / raw)
  To: linux-kernel, arozansk, dzickus, Thomas.Mingarelli, ak, mingo
  Cc: Prarit Bhargava

Andi and Ingo,

This patch is an RFC for the following changes.  If I get a positive review,
changes to the HP Watchdog timer (currently in the kernel) will also be
submitted along with this patch.

Thanks,

P.

The drivers/watchdog/hpwdt.c driver requires that all NMIs are processed
by the driver.  Currently the driver does not do this.  The first
step is to implement code to allow the default NMI handler to be replaced
by a custom NMI handler.

This patch re-implements a global set and unset NMI callback so that the
default NMI handler can be replaced with a custom NMI handler.  This
functionality was removed from the kernel a while ago and now must be
reintroduced into the kernel.

An existing unknown NMI callback already exists within the code.  This call
has been renamed to the more descriptive do_unknown_nmi_callback.

Patch was tested on x86 (32 and 64 bit) on Intel and AMD hardware.

Acked-by: Aris Rozanski <arozansk@redhat.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Prarit Bhargava <prarit@redhat.com>

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2..917b351 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -510,7 +510,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 
 #endif /* CONFIG_SYSCTL */
 
-int do_nmi_callback(struct pt_regs *regs, int cpu)
+int do_unknown_nmi_callback(struct pt_regs *regs, int cpu)
 {
 #ifdef CONFIG_SYSCTL
 	if (unknown_nmi_panic)
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 03df8e4..f19e414 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -796,7 +796,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		 */
 		if (nmi_watchdog_tick(regs, reason))
 			return;
-		if (!do_nmi_callback(regs, cpu))
+		if (!do_unknown_nmi_callback(regs, cpu))
 			unknown_nmi_error(reason, regs);
 #else
 		unknown_nmi_error(reason, regs);
@@ -819,17 +819,61 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 	reassert_nmi();
 }
 
+static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+{
+	return 0;
+}
+
+static nmi_callback_t nmi_callback = dummy_nmi_callback;
+static int nmi_cb_status = 0;
+
+int set_nmi_callback(nmi_callback_t callback)
+{
+	if (nmi_callback != dummy_nmi_callback) {
+		printk(KERN_WARNING
+		       "WARNING: Only one NMI callback can be registered at a "
+		       "time.\n");
+		return -EBUSY;
+	}
+
+	nmi_cb_status = atomic_read(&nmi_active);
+	if (nmi_cb_status) {
+		if (nmi_watchdog == NMI_LOCAL_APIC) {
+			disable_lapic_nmi_watchdog();
+			vmalloc_sync_all();
+		} else {
+			printk(KERN_WARNING
+			       "WARNING: NMI watchdog can only be set on "
+			       "systems with status lapic NMI.\n");
+			return -ENODEV;
+		}
+	}
+	rcu_assign_pointer(nmi_callback, callback);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_nmi_callback);
+
+void unset_nmi_callback(void)
+{
+	if (nmi_cb_status && (nmi_watchdog == NMI_LOCAL_APIC))
+		enable_lapic_nmi_watchdog();
+	nmi_cb_status = 0;
+	nmi_callback = dummy_nmi_callback;
+}
+
+EXPORT_SYMBOL_GPL(unset_nmi_callback);
 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
 {
 	int cpu;
 
 	nmi_enter();
 
-	cpu = smp_processor_id();
+	cpu = safe_smp_processor_id();
 
 	++nmi_count(cpu);
 
-	if (!ignore_nmis)
+	if (!ignore_nmis && !rcu_dereference(nmi_callback)(regs,cpu))
 		default_do_nmi(regs);
 
 	nmi_exit();
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 513caac..0f3bd27 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -815,7 +815,7 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		 */
 		if (nmi_watchdog_tick(regs, reason))
 			return;
-		if (!do_nmi_callback(regs, cpu))
+		if (!do_unknown_nmi_callback(regs, cpu))
 			unknown_nmi_error(reason, regs);
 
 		return;
@@ -830,14 +830,62 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		io_check_error(reason, regs);
 }
 
+static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+{
+	return 0;
+}
+
+static nmi_callback_t nmi_callback = dummy_nmi_callback;
+static int nmi_cb_status = 0;
+
+int set_nmi_callback(nmi_callback_t callback)
+{
+	if (nmi_callback != dummy_nmi_callback) {
+		printk(KERN_WARNING
+		       "WARNING: Only one NMI callback can be registered at a "
+		       "time.\n");
+		return -EBUSY;
+	}
+
+	nmi_cb_status = atomic_read(&nmi_active);
+	if (nmi_cb_status) {
+		if (nmi_watchdog == NMI_LOCAL_APIC) {
+			disable_lapic_nmi_watchdog();
+			vmalloc_sync_all();
+		} else {
+			printk(KERN_WARNING
+			       "WARNING: NMI watchdog can only be set on "
+			       "systems with status lapic NMI.\n");
+			return -ENODEV;
+		}
+	}
+	rcu_assign_pointer(nmi_callback, callback);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_nmi_callback);
+
+void unset_nmi_callback(void)
+{
+	if (nmi_cb_status && (nmi_watchdog == NMI_LOCAL_APIC))
+		enable_lapic_nmi_watchdog();
+	nmi_cb_status = 0;
+	nmi_callback = dummy_nmi_callback;
+}
+EXPORT_SYMBOL_GPL(unset_nmi_callback);
+
 asmlinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	int cpu;
+
 	nmi_enter();
 
+	cpu = safe_smp_processor_id();
+
 	add_pda(__nmi_count, 1);
 
-	if (!ignore_nmis)
+	if (!ignore_nmis && !rcu_dereference(nmi_callback)(regs,cpu))
 		default_do_nmi(regs);
 
 	nmi_exit();
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 21f8d02..3c3901a 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -7,13 +7,30 @@
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
 
+typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
+
+/**
+ * set_nmi_callback
+ *
+ * Set a global handler for an NMI. Only one handler may be
+ * set. Return 1 if the NMI was handled.
+ */
+int set_nmi_callback(nmi_callback_t callback);
+
+/**
+ * unset_nmi_callback
+ *
+ * Remove the global handler previously set.
+ */
+void unset_nmi_callback(void);
+
 /**
- * do_nmi_callback
+ * do_unknown_nmi_callback
  *
  * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
+ * if the unknown handler exists and was handled successfully.
  */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+int do_unknown_nmi_callback(struct pt_regs *regs, int cpu);
 
 #ifdef CONFIG_X86_64
 extern void default_do_nmi(struct pt_regs *);

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 13:07 [PATCH RFC] NMI Re-introduce un[set]_nmi_callback Prarit Bhargava
@ 2008-09-04 13:37 ` Peter Zijlstra
  2008-09-04 14:29   ` Prarit Bhargava
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2008-09-04 13:37 UTC (permalink / raw)
  To: Prarit Bhargava
  Cc: linux-kernel, arozansk, dzickus, Thomas.Mingarelli, ak, mingo

On Thu, 2008-09-04 at 09:07 -0400, Prarit Bhargava wrote:
> Andi and Ingo,
> 
> This patch is an RFC for the following changes.  If I get a positive review,
> changes to the HP Watchdog timer (currently in the kernel) will also be
> submitted along with this patch.
> 
> Thanks,
> 
> P.
> 
> The drivers/watchdog/hpwdt.c driver requires that all NMIs are processed
> by the driver.  Currently the driver does not do this.  The first
> step is to implement code to allow the default NMI handler to be replaced
> by a custom NMI handler.

Why is the DIE_NMIWATCHDOG notifier not sufficient for this driver?


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 13:37 ` Peter Zijlstra
@ 2008-09-04 14:29   ` Prarit Bhargava
  2008-09-04 14:49     ` aris
  2008-09-04 14:56     ` Ingo Molnar
  0 siblings, 2 replies; 29+ messages in thread
From: Prarit Bhargava @ 2008-09-04 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, arozansk, dzickus, Thomas.Mingarelli, ak, mingo

>
> Why is the DIE_NMIWATCHDOG notifier not sufficient for this driver?
>
>   

Peter -- good question.  The HP systems with this HW will use the hpwdt 
driver in place of the default nmi watchdog.  When the HW detects a 
problem, the HW will generate a single NMI that the driver will handle.  
The driver doesn't want the NMI to be rejected due to a reason code.  
I'm sure that Thomas Mingarelli, who is cc'd, can provide further details.

 From our quick conversation as well, you raised an interesting point 
about oprofile, kgdb, and other subsystems that use the NMI notifier 
chains -- they may be impacted by the NMI callback.

Don (dzickus) or Aris, do you have any thoughts on how to get around the 
second issue?  We could check to see if anything is registered on the 
notifier chain and the fail to register the callback.

P.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 14:29   ` Prarit Bhargava
@ 2008-09-04 14:49     ` aris
  2008-09-04 14:56     ` Ingo Molnar
  1 sibling, 0 replies; 29+ messages in thread
From: aris @ 2008-09-04 14:49 UTC (permalink / raw)
  To: Prarit Bhargava
  Cc: Peter Zijlstra, linux-kernel, dzickus, Thomas.Mingarelli, ak,
	mingo

> Peter -- good question.  The HP systems with this HW will use the hpwdt  
> driver in place of the default nmi watchdog.  When the HW detects a  
> problem, the HW will generate a single NMI that the driver will handle.   
> The driver doesn't want the NMI to be rejected due to a reason code.   
> I'm sure that Thomas Mingarelli, who is cc'd, can provide further 
> details.
it's not the first time this is asked. I think it's needed for some kernel
debuggers as well: make sure a function is called before anything else
when a NMI happens. something that the notifier chain can't do.

> From our quick conversation as well, you raised an interesting point  
> about oprofile, kgdb, and other subsystems that use the NMI notifier  
> chains -- they may be impacted by the NMI callback.
>
> Don (dzickus) or Aris, do you have any thoughts on how to get around the  
> second issue?  We could check to see if anything is registered on the  
> notifier chain and the fail to register the callback.
or call the notifier chain in case it indentifies it's a unexpected NMI?

-- 
Aristeu


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 14:29   ` Prarit Bhargava
  2008-09-04 14:49     ` aris
@ 2008-09-04 14:56     ` Ingo Molnar
  2008-09-04 15:12       ` H. Peter Anvin
  2008-09-04 15:52       ` Andi Kleen
  1 sibling, 2 replies; 29+ messages in thread
From: Ingo Molnar @ 2008-09-04 14:56 UTC (permalink / raw)
  To: Prarit Bhargava
  Cc: Peter Zijlstra, linux-kernel, arozansk, dzickus,
	Thomas.Mingarelli, ak, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki


* Prarit Bhargava <prarit@redhat.com> wrote:

>> Why is the DIE_NMIWATCHDOG notifier not sufficient for this driver?
>
> Peter -- good question.  The HP systems with this HW will use the 
> hpwdt driver in place of the default nmi watchdog.  When the HW 
> detects a problem, the HW will generate a single NMI that the driver 
> will handle.  The driver doesn't want the NMI to be rejected due to a 
> reason code.  I'm sure that Thomas Mingarelli, who is cc'd, can 
> provide further details.
>
> From our quick conversation as well, you raised an interesting point 
> about oprofile, kgdb, and other subsystems that use the NMI notifier 
> chains -- they may be impacted by the NMI callback.
>
> Don (dzickus) or Aris, do you have any thoughts on how to get around 
> the second issue?  We could check to see if anything is registered on 
> the notifier chain and the fail to register the callback.

i'd much rather attack this general problem from this angle:

  static inline unsigned char get_nmi_reason(void)
  {
          return inb(0x61);
  }

that port 61H read is both arcane (on modern chipsets) and broken on 
multiple levels. It's racy and SMP unsafe to begin with, if there's any 
mixture of intentional cross-CPU or CPU self-generated NMIs mixed with 
chipset generated NMIs.

One possible approach would be to get rid of it, and to perhaps register 
a low-priority die notifier on systems where we know port 61 
reads+writes to be safe and desired. Modern systems will emit MCEs in 
most cases anyway, not NMIs.

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 14:56     ` Ingo Molnar
@ 2008-09-04 15:12       ` H. Peter Anvin
  2008-09-04 15:18         ` Ingo Molnar
  2008-09-04 15:52       ` Andi Kleen
  1 sibling, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2008-09-04 15:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Prarit Bhargava, Peter Zijlstra, linux-kernel, arozansk, dzickus,
	Thomas.Mingarelli, ak, Alan Cox, Thomas Gleixner,
	Maciej W. Rozycki

Ingo Molnar wrote:
> 
> i'd much rather attack this general problem from this angle:
> 
>   static inline unsigned char get_nmi_reason(void)
>   {
>           return inb(0x61);
>   }
> 
> that port 61H read is both arcane (on modern chipsets) and broken on 
> multiple levels. It's racy and SMP unsafe to begin with, if there's any 
> mixture of intentional cross-CPU or CPU self-generated NMIs mixed with 
> chipset generated NMIs.
> 
> One possible approach would be to get rid of it, and to perhaps register 
> a low-priority die notifier on systems where we know port 61 
> reads+writes to be safe and desired. Modern systems will emit MCEs in 
> most cases anyway, not NMIs.
> 

I believe we should still do it, but as the lowest priority "nothing 
else claimed this".  It reflects a system error and not all systems will 
generate #MC instead of NMI for all system errors.

Pretty much what you're saying above.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 15:12       ` H. Peter Anvin
@ 2008-09-04 15:18         ` Ingo Molnar
  0 siblings, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2008-09-04 15:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Prarit Bhargava, Peter Zijlstra, linux-kernel, arozansk, dzickus,
	Thomas.Mingarelli, ak, Alan Cox, Thomas Gleixner,
	Maciej W. Rozycki


* H. Peter Anvin <hpa@zytor.com> wrote:

> Ingo Molnar wrote:
>>
>> i'd much rather attack this general problem from this angle:
>>
>>   static inline unsigned char get_nmi_reason(void)
>>   {
>>           return inb(0x61);
>>   }
>>
>> that port 61H read is both arcane (on modern chipsets) and broken on  
>> multiple levels. It's racy and SMP unsafe to begin with, if there's any 
>> mixture of intentional cross-CPU or CPU self-generated NMIs mixed with  
>> chipset generated NMIs.
>>
>> One possible approach would be to get rid of it, and to perhaps 
>> register a low-priority die notifier on systems where we know port 61  
>> reads+writes to be safe and desired. Modern systems will emit MCEs in  
>> most cases anyway, not NMIs.
>>
>
> I believe we should still do it, but as the lowest priority "nothing 
> else claimed this".  It reflects a system error and not all systems 
> will generate #MC instead of NMI for all system errors.
>
> Pretty much what you're saying above.

ok. One potential additional concern i can think of is multi-source 
NMIs: NMIs, when generated by some sort of hardware are edge triggered 
most of the time, so if in the notifier chain a notifier decides "this 
was for me, return now", we will lose an event.

So i think we should iterate through all notifier entries and call them 
(even if all of them indicate that they handled it), and determine 
whether at least one notifier handled something. If nothing handled the 
NMI _then_ do the port 61H logic as a final fall-back thing.

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 14:56     ` Ingo Molnar
  2008-09-04 15:12       ` H. Peter Anvin
@ 2008-09-04 15:52       ` Andi Kleen
  2008-09-04 17:20         ` Don Zickus
  1 sibling, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 15:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Prarit Bhargava, Peter Zijlstra, linux-kernel, arozansk, dzickus,
	Thomas.Mingarelli, ak, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

Ingo Molnar <mingo@elte.hu> writes:
>
> i'd much rather attack this general problem from this angle:
>
>   static inline unsigned char get_nmi_reason(void)
>   {
>           return inb(0x61);
>   }
>
> that port 61H read is both arcane (on modern chipsets) and broken on 
> multiple levels.

Yes it is. I did some datasheet reading recently and unfortunately
there is no really standardized better way. So the only replacement
would be to have chipset specific NMI drivers that know 
the particular registers of the chipset.

 It's racy and SMP unsafe to begin with, if there's any 
> mixture of intentional cross-CPU or CPU self-generated NMIs mixed with 
> chipset generated NMIs.
>
> One possible approach would be to get rid of it, and to perhaps register 

Removing the IO port accesses by default would be a good idea
I agree. They are hardly useful for anything on modern systems.

But you still need some way to catch the chipset NMIs
and give some indication of the problem.

The way so far was to ask all the other sources (software NMIs
in memory flags, perfmon IPIs check perf ctrs, etc.) first 
and if it's none of them assume it's a chipset NMI
(or NMI button NMI if the sysctl is set).

Then if there's a chipset specific NMI driver it could
also check if the chipset raised it. That would be a possible
solution for HP -- they would need to implement such a driver
for their systems with the special watchdog.

Yes that's racy but the poor hardware support doesn't unfortunately 
leave much wiggling room to do better.

> a low-priority die notifier on systems where we know port 61 
> reads+writes to be safe and desired. Modern systems will emit MCEs in 
> most cases anyway, not NMIs.

The chipsets will still trigger NMIs (depending on their
configuration) -- e.g. on some PCI or internal errors -- they cannot
trigger MCEs directly.  Fortunately it's being replaced with PCI-AER
on PCI-Express, but PCI-X which doesn't do that is still very common
and shipping.

BTW the NMI handlers are also racy, it's not safe 
to call printk in a NMI handler. They really should be taught
to start using mce_log()

-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 15:52       ` Andi Kleen
@ 2008-09-04 17:20         ` Don Zickus
  2008-09-04 17:52           ` Andi Kleen
  0 siblings, 1 reply; 29+ messages in thread
From: Don Zickus @ 2008-09-04 17:20 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Prarit Bhargava, Peter Zijlstra, linux-kernel,
	arozansk, Thomas.Mingarelli, ak, Alan Cox, H. Peter Anvin,
	Thomas Gleixner, Maciej W. Rozycki

On Thu, Sep 04, 2008 at 05:52:17PM +0200, Andi Kleen wrote:
> Then if there's a chipset specific NMI driver it could
> also check if the chipset raised it. That would be a possible
> solution for HP -- they would need to implement such a driver
> for their systems with the special watchdog.

The thing with HP's special watchdog timer is that it does _not_ have a
chipset specific NMI it is trying to catch.  HP is going on the assumption
that _all_ NMIs are /bad/ and they want to catch _every_ NMI, log it, and
reboot the system.

Now obviously NMIs from kgdb and oprofile are not the ones a system should
panic on but this breaks HP's assumptions.

So that is part of the problem.  How do you become a catch-all for NMIs in
a system, to process as you wish, but ignore all the 'safe' NMIs?

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 17:20         ` Don Zickus
@ 2008-09-04 17:52           ` Andi Kleen
  2008-09-04 18:26             ` Don Zickus
  0 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 17:52 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Ingo Molnar, Prarit Bhargava, Peter Zijlstra,
	linux-kernel, arozansk, Thomas.Mingarelli, ak, Alan Cox,
	H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

On Thu, Sep 04, 2008 at 01:20:52PM -0400, Don Zickus wrote:
> On Thu, Sep 04, 2008 at 05:52:17PM +0200, Andi Kleen wrote:
> > Then if there's a chipset specific NMI driver it could
> > also check if the chipset raised it. That would be a possible
> > solution for HP -- they would need to implement such a driver
> > for their systems with the special watchdog.
> 
> The thing with HP's special watchdog timer is that it does _not_ have a
> chipset specific NMI it is trying to catch.  HP is going on the assumption
> that _all_ NMIs are /bad/ and they want to catch _every_ NMI, log it, and
> reboot the system.

That's my point. If you have drivers which can identify all other
NMIs then the left over NMIs must come from that watchdog driver.
So they just need drivers which can do that for their chipsets.

It's not race free, but that's simply not possible with the x86
NMI architecture.

Better would be probably to just configure the watchdog 
to reboot the system directly on its own. Most other watchdogs
I'm aware of do that. That's more reliable anyways because the system
might be wedged enough to not be able to process NMIs anymore.

> 
> Now obviously NMIs from kgdb and oprofile are not the ones a system should
> panic on but this breaks HP's assumptions.
> 
> So that is part of the problem.  How do you become a catch-all for NMIs in
> a system, to process as you wish, but ignore all the 'safe' NMIs?

To be fully reliable: you need a new NMI architecture or move the event
somewhere else.
To be reasonable reliable (assuming NMis are not very frequent): you
need drivers for all NMI sources that can identify them.

-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 17:52           ` Andi Kleen
@ 2008-09-04 18:26             ` Don Zickus
  2008-09-04 18:47               ` Andi Kleen
  2008-09-04 19:08               ` Vivek Goyal
  0 siblings, 2 replies; 29+ messages in thread
From: Don Zickus @ 2008-09-04 18:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Prarit Bhargava, Peter Zijlstra, linux-kernel,
	arozansk, Thomas.Mingarelli, ak, Alan Cox, H. Peter Anvin,
	Thomas Gleixner, Maciej W. Rozycki

On Thu, Sep 04, 2008 at 07:52:31PM +0200, Andi Kleen wrote:
> On Thu, Sep 04, 2008 at 01:20:52PM -0400, Don Zickus wrote:
> > On Thu, Sep 04, 2008 at 05:52:17PM +0200, Andi Kleen wrote:
> > > Then if there's a chipset specific NMI driver it could
> > > also check if the chipset raised it. That would be a possible
> > > solution for HP -- they would need to implement such a driver
> > > for their systems with the special watchdog.
> > 
> > The thing with HP's special watchdog timer is that it does _not_ have a
> > chipset specific NMI it is trying to catch.  HP is going on the assumption
> > that _all_ NMIs are /bad/ and they want to catch _every_ NMI, log it, and
> > reboot the system.
> 
> That's my point. If you have drivers which can identify all other
> NMIs then the left over NMIs must come from that watchdog driver.
> So they just need drivers which can do that for their chipsets.

Except their chipsets are _not_ producing NMIs.  They just want to
supercede all the other NMI handlers.  For example if an EDAC NMI came in,
they don't want the EDAC handler to try and recover from it, HP just wants
their NMI watchdog to grab the NMI, log it and reboot.

> 
> It's not race free, but that's simply not possible with the x86
> NMI architecture.

I agree.

> 
> Better would be probably to just configure the watchdog 
> to reboot the system directly on its own. Most other watchdogs
> I'm aware of do that. That's more reliable anyways because the system
> might be wedged enough to not be able to process NMIs anymore.

The trick is they want to log it in a special way (BIOS or NVRAM or
something I forget) before rebooting.

> 
> > 
> > Now obviously NMIs from kgdb and oprofile are not the ones a system should
> > panic on but this breaks HP's assumptions.
> > 
> > So that is part of the problem.  How do you become a catch-all for NMIs in
> > a system, to process as you wish, but ignore all the 'safe' NMIs?
> 
> To be fully reliable: you need a new NMI architecture or move the event
> somewhere else.
> To be reasonable reliable (assuming NMis are not very frequent): you
> need drivers for all NMI sources that can identify them.

Yeah I know.  Originally I thought this would be easy, just replace the
default handler.  But once the mention of kgdb and oprofile using the NMIs
came up, I realized we are almost back to square one. :-(

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 18:26             ` Don Zickus
@ 2008-09-04 18:47               ` Andi Kleen
  2008-09-04 19:08               ` Vivek Goyal
  1 sibling, 0 replies; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 18:47 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Ingo Molnar, Prarit Bhargava, Peter Zijlstra,
	linux-kernel, arozansk, Thomas.Mingarelli, ak, Alan Cox,
	H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

On Thu, Sep 04, 2008 at 02:26:37PM -0400, Don Zickus wrote:
> On Thu, Sep 04, 2008 at 07:52:31PM +0200, Andi Kleen wrote:
> > On Thu, Sep 04, 2008 at 01:20:52PM -0400, Don Zickus wrote:
> > > On Thu, Sep 04, 2008 at 05:52:17PM +0200, Andi Kleen wrote:
> > > > Then if there's a chipset specific NMI driver it could
> > > > also check if the chipset raised it. That would be a possible
> > > > solution for HP -- they would need to implement such a driver
> > > > for their systems with the special watchdog.
> > > 
> > > The thing with HP's special watchdog timer is that it does _not_ have a
> > > chipset specific NMI it is trying to catch.  HP is going on the assumption
> > > that _all_ NMIs are /bad/ and they want to catch _every_ NMI, log it, and
> > > reboot the system.
> > 
> > That's my point. If you have drivers which can identify all other
> > NMIs then the left over NMIs must come from that watchdog driver.
> > So they just need drivers which can do that for their chipsets.
> 
> Except their chipsets are _not_ producing NMIs.  They just want to

They will produce NMIs when the suitable error conditions are true.
That is why the fallback is assuming a chipset problem.

So the only reliable way to find out if the event really came from
the misdesigned watchdog (whoever designed it didn't understand
NMIs I would say) you have to check the chipset (and all other
sources). Hopefully they are all better designed and can
actually tell you if they triggered or not.

Also there's the issue that sometimes people want
the fallback to be the NMI button.

> > It's not race free, but that's simply not possible with the x86
> > NMI architecture.
> 
> I agree.
> 
> > 
> > Better would be probably to just configure the watchdog 
> > to reboot the system directly on its own. Most other watchdogs
> > I'm aware of do that. That's more reliable anyways because the system
> > might be wedged enough to not be able to process NMIs anymore.
> 
> The trick is they want to log it in a special way (BIOS or NVRAM or
> something I forget) before rebooting.

Then why tell the OS? It should be an SMI then.


-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 18:26             ` Don Zickus
  2008-09-04 18:47               ` Andi Kleen
@ 2008-09-04 19:08               ` Vivek Goyal
  2008-09-04 20:00                 ` Andi Kleen
  1 sibling, 1 reply; 29+ messages in thread
From: Vivek Goyal @ 2008-09-04 19:08 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Ingo Molnar, Prarit Bhargava, Peter Zijlstra,
	linux-kernel, arozansk, Thomas.Mingarelli, ak, Alan Cox,
	H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

On Thu, Sep 04, 2008 at 02:26:37PM -0400, Don Zickus wrote:
> On Thu, Sep 04, 2008 at 07:52:31PM +0200, Andi Kleen wrote:
> > On Thu, Sep 04, 2008 at 01:20:52PM -0400, Don Zickus wrote:
> > > On Thu, Sep 04, 2008 at 05:52:17PM +0200, Andi Kleen wrote:
> > > > Then if there's a chipset specific NMI driver it could
> > > > also check if the chipset raised it. That would be a possible
> > > > solution for HP -- they would need to implement such a driver
> > > > for their systems with the special watchdog.
> > > 
> > > The thing with HP's special watchdog timer is that it does _not_ have a
> > > chipset specific NMI it is trying to catch.  HP is going on the assumption
> > > that _all_ NMIs are /bad/ and they want to catch _every_ NMI, log it, and
> > > reboot the system.
> > 
> > That's my point. If you have drivers which can identify all other
> > NMIs then the left over NMIs must come from that watchdog driver.
> > So they just need drivers which can do that for their chipsets.
> 
> Except their chipsets are _not_ producing NMIs.  They just want to
> supercede all the other NMI handlers.  For example if an EDAC NMI came in,
> they don't want the EDAC handler to try and recover from it, HP just wants
> their NMI watchdog to grab the NMI, log it and reboot.
> 
> > 
> > It's not race free, but that's simply not possible with the x86
> > NMI architecture.
> 
> I agree.
> 
> > 
> > Better would be probably to just configure the watchdog 
> > to reboot the system directly on its own. Most other watchdogs
> > I'm aware of do that. That's more reliable anyways because the system
> > might be wedged enough to not be able to process NMIs anymore.
> 
> The trick is they want to log it in a special way (BIOS or NVRAM or
> something I forget) before rebooting.
> 
> > 
> > > 
> > > Now obviously NMIs from kgdb and oprofile are not the ones a system should
> > > panic on but this breaks HP's assumptions.
> > > 
> > > So that is part of the problem.  How do you become a catch-all for NMIs in
> > > a system, to process as you wish, but ignore all the 'safe' NMIs?
> > 
> > To be fully reliable: you need a new NMI architecture or move the event
> > somewhere else.
> > To be reasonable reliable (assuming NMis are not very frequent): you
> > need drivers for all NMI sources that can identify them.
> 
> Yeah I know.  Originally I thought this would be easy, just replace the
> default handler.  But once the mention of kgdb and oprofile using the NMIs
> came up, I realized we are almost back to square one. :-(
> 

Add "kdump" to the list. It will also be broken if we decide to let one
driver hijack the NMI handler.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 19:08               ` Vivek Goyal
@ 2008-09-04 20:00                 ` Andi Kleen
  2008-09-04 20:01                   ` Mingarelli, Thomas
  2008-09-05  9:33                   ` Ingo Molnar
  0 siblings, 2 replies; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 20:00 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Don Zickus, Andi Kleen, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel, arozansk, Thomas.Mingarelli, ak,
	Alan Cox, H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

> Add "kdump" to the list. It will also be broken if we decide to let one
> driver hijack the NMI handler.

kdump is a special case, similar to the NMI button panic mode. It should 
be always only active when the user configured it. When the user configured
it should be always the fallback and override any other drivers.

But watchdog is a special case. I assume the watchdog will just log
(and do the work that a SMI should be doing) but then continue 
the chain so that kdump can dump on a watchdog timeout.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:00                 ` Andi Kleen
@ 2008-09-04 20:01                   ` Mingarelli, Thomas
  2008-09-04 20:19                     ` Andi Kleen
  2008-09-04 20:57                     ` Vivek Goyal
  2008-09-05  9:33                   ` Ingo Molnar
  1 sibling, 2 replies; 29+ messages in thread
From: Mingarelli, Thomas @ 2008-09-04 20:01 UTC (permalink / raw)
  To: Andi Kleen, Vivek Goyal
  Cc: Don Zickus, Ingo Molnar, Prarit Bhargava, Peter Zijlstra,
	linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

Exactly.

The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.

Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.

Yes, kdump will work under this scenario because we stop the watchdog timer. This is a user configurable setting.

Tom

-----Original Message-----
From: Andi Kleen [mailto:andi@firstfloor.org]
Sent: Thursday, September 04, 2008 3:01 PM
To: Vivek Goyal
Cc: Don Zickus; Andi Kleen; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; Mingarelli, Thomas; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback

> Add "kdump" to the list. It will also be broken if we decide to let one
> driver hijack the NMI handler.

kdump is a special case, similar to the NMI button panic mode. It should
be always only active when the user configured it. When the user configured
it should be always the fallback and override any other drivers.

But watchdog is a special case. I assume the watchdog will just log
(and do the work that a SMI should be doing) but then continue
the chain so that kdump can dump on a watchdog timeout.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:01                   ` Mingarelli, Thomas
@ 2008-09-04 20:19                     ` Andi Kleen
  2008-09-04 20:21                       ` Mingarelli, Thomas
  2008-09-04 20:57                     ` Vivek Goyal
  1 sibling, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 20:19 UTC (permalink / raw)
  To: Mingarelli, Thomas
  Cc: Andi Kleen, Vivek Goyal, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 08:01:31PM +0000, Mingarelli, Thomas wrote:
> Exactly.
> 
> The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.
> 
> Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.

The BIOS tells you about the NMI reason and tells you if the
watchdog didn't fire? 

If yes that's great. You can be a good NMI citizen then. 
Just check if the NMI came from your watchdog and if not return
NOTIFY_DONE

-Andi


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:19                     ` Andi Kleen
@ 2008-09-04 20:21                       ` Mingarelli, Thomas
  2008-09-04 20:53                         ` Andi Kleen
  0 siblings, 1 reply; 29+ messages in thread
From: Mingarelli, Thomas @ 2008-09-04 20:21 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Vivek Goyal, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

The BIOS does the actual logging of the cause of the NMI. What kind of NMI:

PCI Bus Parity error
Double bit memory error
.
.
.
And so on.

The watchdog is a separate part of the driver. It can be enabled or not; most of our customers will want the NMI sourcing capability of the driver.
With Prarit's patch we no longer need to worry about the watchdog timer firing. However, yes that was troublesome before his patch. We could not distinguish between a REAL NMI and a watchdog timer tick.

The BIOS does not come into play until the hpwdt nmi handler gets called.

Tom

-----Original Message-----
From: Andi Kleen [mailto:andi@firstfloor.org]
Sent: Thursday, September 04, 2008 3:19 PM
To: Mingarelli, Thomas
Cc: Andi Kleen; Vivek Goyal; Don Zickus; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback

On Thu, Sep 04, 2008 at 08:01:31PM +0000, Mingarelli, Thomas wrote:
> Exactly.
>
> The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.
>
> Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.

The BIOS tells you about the NMI reason and tells you if the
watchdog didn't fire?

If yes that's great. You can be a good NMI citizen then.
Just check if the NMI came from your watchdog and if not return
NOTIFY_DONE

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:21                       ` Mingarelli, Thomas
@ 2008-09-04 20:53                         ` Andi Kleen
  2008-09-04 21:22                           ` Don Zickus
  0 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2008-09-04 20:53 UTC (permalink / raw)
  To: Mingarelli, Thomas
  Cc: Andi Kleen, Vivek Goyal, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 08:21:40PM +0000, Mingarelli, Thomas wrote:
> The BIOS does the actual logging of the cause of the NMI. What kind of NMI:
> 
> PCI Bus Parity error
> Double bit memory error
> .
> .
> .
> And so on.
> 
> The watchdog is a separate part of the driver. It can be enabled or not; most of our customers will want the NMI sourcing capability of the driver.
> With Prarit's patch we no longer need to worry about the watchdog timer firing. However, yes that was troublesome before his patch. We could not distinguish between a REAL NMI and a watchdog timer tick.
> 
> The BIOS does not come into play until the hpwdt nmi handler gets called.

If you use the die chain (as you should) you'll get notified 
of the NMis, but your handler has to decide if the NMI is for
you or not. The way the chain works is that it asks everyone
(in priority order) and the first one who says "it's for me"
will get it.

So if your handler can decide "This is an NMI that came from
a source i know about" it can be a proper[1] NMI cititzen.

Otherwise it will be hard to make it coexist nicely.

Then if the BIOS tells you the real cause you should also take
over the final handler anyways because as it was pointed out earlier
the Linux default fallback handler is crap (it made sense on a 
IBM PC-AT but not today). If you can ask the BIOS for the real
reason you could printk that instead and everyone will be more
happy. That would be one of those "NMI chipset drivers" I talked
about earlier.

That probably should be a separate driver because it's orthogonal to 
the watchdog. 

 But it should only take 
the default handler when kdump is not active. 

-Andi

[1] proper defined as in "it's still racy, but on low volume
NMIs it will hopefully DTRT"

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:01                   ` Mingarelli, Thomas
  2008-09-04 20:19                     ` Andi Kleen
@ 2008-09-04 20:57                     ` Vivek Goyal
  2008-09-04 21:05                       ` Mingarelli, Thomas
  1 sibling, 1 reply; 29+ messages in thread
From: Vivek Goyal @ 2008-09-04 20:57 UTC (permalink / raw)
  To: Mingarelli, Thomas
  Cc: Andi Kleen, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 08:01:31PM +0000, Mingarelli, Thomas wrote:
> Exactly.
> 
> The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.
> 
> Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.
> 
> Yes, kdump will work under this scenario because we stop the watchdog timer. This is a user configurable setting.
> 
> 

Sorry I did not get it. Few questions.

- So you want to capture every NMI and then do something. So what's the
  harm in registering on die chain and look for both DIE_NMI_IPI and
  DIE_NMI events and take appropriate action? Depending on reason code,
  one or other will be called. If I read the code correctly, you will get
  to see every NMI on that cpu irrespective of the reason and then you can
  take the action accordingly.

- How would kdump continue to work above driver hijacks the nmi callback.
  You will disable watchdog, log message and call panic(). panic() will
  lead to kdump and kdump will send NMI IPI to reset of the cpus in the
  system to save their state and halt these. The moment other cpus get
  NMI IPI, above driver will hijack that NMI also and nobody gets a chance
  to run? So kdump will not work?

  Am I missing something?

Thanks
Vivek
  

 
> Tom
> 
> -----Original Message-----
> From: Andi Kleen [mailto:andi@firstfloor.org]
> Sent: Thursday, September 04, 2008 3:01 PM
> To: Vivek Goyal
> Cc: Don Zickus; Andi Kleen; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; Mingarelli, Thomas; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
> Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
> 
> > Add "kdump" to the list. It will also be broken if we decide to let one
> > driver hijack the NMI handler.
> 
> kdump is a special case, similar to the NMI button panic mode. It should
> be always only active when the user configured it. When the user configured
> it should be always the fallback and override any other drivers.
> 
> But watchdog is a special case. I assume the watchdog will just log
> (and do the work that a SMI should be doing) but then continue
> the chain so that kdump can dump on a watchdog timeout.
> 
> -Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:57                     ` Vivek Goyal
@ 2008-09-04 21:05                       ` Mingarelli, Thomas
  2008-09-04 21:21                         ` Vivek Goyal
  0 siblings, 1 reply; 29+ messages in thread
From: Mingarelli, Thomas @ 2008-09-04 21:05 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Andi Kleen, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

Ok regarding question #1. The die_notifier works as you mentioned; however, the fact that the watchdog timer ticks also come through as NMIs is a hinderance. Now, when the watchdog timer is configured through the LOCAL_APIC the issue isn't so bad. I think the hpwdt driver handles the NMI coming in because there isn't a flood of timer ticks coming through as in the IOAPIC case.

As for the KDUMP perhaps I am missing something. If I handle the NMI coming in and source it via our BIOS, I then stop the watchdog timer and the kdump will take place.

Tom

-----Original Message-----
From: Vivek Goyal [mailto:vgoyal@redhat.com]
Sent: Thursday, September 04, 2008 3:57 PM
To: Mingarelli, Thomas
Cc: Andi Kleen; Don Zickus; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback

On Thu, Sep 04, 2008 at 08:01:31PM +0000, Mingarelli, Thomas wrote:
> Exactly.
>
> The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.
>
> Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.
>
> Yes, kdump will work under this scenario because we stop the watchdog timer. This is a user configurable setting.
>
>

Sorry I did not get it. Few questions.

- So you want to capture every NMI and then do something. So what's the
  harm in registering on die chain and look for both DIE_NMI_IPI and
  DIE_NMI events and take appropriate action? Depending on reason code,
  one or other will be called. If I read the code correctly, you will get
  to see every NMI on that cpu irrespective of the reason and then you can
  take the action accordingly.

- How would kdump continue to work above driver hijacks the nmi callback.
  You will disable watchdog, log message and call panic(). panic() will
  lead to kdump and kdump will send NMI IPI to reset of the cpus in the
  system to save their state and halt these. The moment other cpus get
  NMI IPI, above driver will hijack that NMI also and nobody gets a chance
  to run? So kdump will not work?

  Am I missing something?

Thanks
Vivek



> Tom
>
> -----Original Message-----
> From: Andi Kleen [mailto:andi@firstfloor.org]
> Sent: Thursday, September 04, 2008 3:01 PM
> To: Vivek Goyal
> Cc: Don Zickus; Andi Kleen; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; Mingarelli, Thomas; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
> Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
>
> > Add "kdump" to the list. It will also be broken if we decide to let one
> > driver hijack the NMI handler.
>
> kdump is a special case, similar to the NMI button panic mode. It should
> be always only active when the user configured it. When the user configured
> it should be always the fallback and override any other drivers.
>
> But watchdog is a special case. I assume the watchdog will just log
> (and do the work that a SMI should be doing) but then continue
> the chain so that kdump can dump on a watchdog timeout.
>
> -Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 21:05                       ` Mingarelli, Thomas
@ 2008-09-04 21:21                         ` Vivek Goyal
  2008-09-04 21:24                           ` Don Zickus
  0 siblings, 1 reply; 29+ messages in thread
From: Vivek Goyal @ 2008-09-04 21:21 UTC (permalink / raw)
  To: Mingarelli, Thomas
  Cc: Andi Kleen, Don Zickus, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 09:05:37PM +0000, Mingarelli, Thomas wrote:
> Ok regarding question #1. The die_notifier works as you mentioned; however, the fact that the watchdog timer ticks also come through as NMIs is a hinderance. Now, when the watchdog timer is configured through the LOCAL_APIC the issue isn't so bad. I think the hpwdt driver handles the NMI coming in because there isn't a flood of timer ticks coming through as in the IOAPIC case.

Ok, so how does replacing the nmi callback help here? You driver handler
be still called upon timer ticks. So you will be called on watchdog tick
whether you are on die chain or you replace nmi handler with nmi callback.
So watchdog ticks can't be a reason for not being on die chain.

Thanks
Vivek

> 
> As for the KDUMP perhaps I am missing something. If I handle the NMI coming in and source it via our BIOS, I then stop the watchdog timer and the kdump will take place.
> Tom

> 
> -----Original Message-----
> From: Vivek Goyal [mailto:vgoyal@redhat.com]
> Sent: Thursday, September 04, 2008 3:57 PM
> To: Mingarelli, Thomas
> Cc: Andi Kleen; Don Zickus; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
> Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
> 
> On Thu, Sep 04, 2008 at 08:01:31PM +0000, Mingarelli, Thomas wrote:
> > Exactly.
> >
> > The hpwdt driver is meant to be a catch-all for any NMI coming through on ProLiant HW only. Moreover, for newer ProLiant HW at that.
> >
> > Once the NMI comes in, we call into our BIOS for the true reason of the NMI. That message gets logged to the IML in NVRAM for the user to view. We then panic the system.
> >
> > Yes, kdump will work under this scenario because we stop the watchdog timer. This is a user configurable setting.
> >
> >
> 
> Sorry I did not get it. Few questions.
> 
> - So you want to capture every NMI and then do something. So what's the
>   harm in registering on die chain and look for both DIE_NMI_IPI and
>   DIE_NMI events and take appropriate action? Depending on reason code,
>   one or other will be called. If I read the code correctly, you will get
>   to see every NMI on that cpu irrespective of the reason and then you can
>   take the action accordingly.
> 
> - How would kdump continue to work above driver hijacks the nmi callback.
>   You will disable watchdog, log message and call panic(). panic() will
>   lead to kdump and kdump will send NMI IPI to reset of the cpus in the
>   system to save their state and halt these. The moment other cpus get
>   NMI IPI, above driver will hijack that NMI also and nobody gets a chance
>   to run? So kdump will not work?
> 
>   Am I missing something?
> 
> Thanks
> Vivek
> 
> 
> 
> > Tom
> >
> > -----Original Message-----
> > From: Andi Kleen [mailto:andi@firstfloor.org]
> > Sent: Thursday, September 04, 2008 3:01 PM
> > To: Vivek Goyal
> > Cc: Don Zickus; Andi Kleen; Ingo Molnar; Prarit Bhargava; Peter Zijlstra; linux-kernel@vger.kernel.org; arozansk@redhat.com; Mingarelli, Thomas; ak@linux.intel.com; Alan Cox; H. Peter Anvin; Thomas Gleixner; Maciej W. Rozycki
> > Subject: Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
> >
> > > Add "kdump" to the list. It will also be broken if we decide to let one
> > > driver hijack the NMI handler.
> >
> > kdump is a special case, similar to the NMI button panic mode. It should
> > be always only active when the user configured it. When the user configured
> > it should be always the fallback and override any other drivers.
> >
> > But watchdog is a special case. I assume the watchdog will just log
> > (and do the work that a SMI should be doing) but then continue
> > the chain so that kdump can dump on a watchdog timeout.
> >
> > -Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:53                         ` Andi Kleen
@ 2008-09-04 21:22                           ` Don Zickus
  0 siblings, 0 replies; 29+ messages in thread
From: Don Zickus @ 2008-09-04 21:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Mingarelli, Thomas, Vivek Goyal, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 10:53:45PM +0200, Andi Kleen wrote:
> So if your handler can decide "This is an NMI that came from
> a source i know about" it can be a proper[1] NMI cititzen.

The way it was explained to me months ago, is that it can't.  Currently it
works by making sure the nmi watchdog is disabled on the box (either
compiled off, or nmi_watchdog=0).  Then it registers itself on the DIE
chain.  Because it is on the top of the chain, it just so happens to
be the first handler to process the NMIs, so HP is happy.  Kdump works
because it registers later on top of the HP handler thus it is called
before HP can panic the box.  However, I suspect if one were to run
oprofile in nmi mode, HP would panic the box on the first profile point.

> 
> Otherwise it will be hard to make it coexist nicely.

Hence our attempt at a solution by creating a registration for a new
nmi_handler, which in hindsight seemed to miss a couple of cases.

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 21:21                         ` Vivek Goyal
@ 2008-09-04 21:24                           ` Don Zickus
  2008-09-04 21:46                             ` Vivek Goyal
  2008-09-05 10:24                             ` Ingo Molnar
  0 siblings, 2 replies; 29+ messages in thread
From: Don Zickus @ 2008-09-04 21:24 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Mingarelli, Thomas, Andi Kleen, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 05:21:34PM -0400, Vivek Goyal wrote:
> On Thu, Sep 04, 2008 at 09:05:37PM +0000, Mingarelli, Thomas wrote:
> > Ok regarding question #1. The die_notifier works as you mentioned; however, the fact that the watchdog timer ticks also come through as NMIs is a hinderance. Now, when the watchdog timer is configured through the LOCAL_APIC the issue isn't so bad. I think the hpwdt driver handles the NMI coming in because there isn't a flood of timer ticks coming through as in the IOAPIC case.
> 
> Ok, so how does replacing the nmi callback help here? You driver handler
> be still called upon timer ticks. So you will be called on watchdog tick
> whether you are on die chain or you replace nmi handler with nmi callback.
> So watchdog ticks can't be a reason for not being on die chain.

Prarit's patch disabled the timer upon registering a callback to prevent
this case.  The thought was if you have your own handler you could provide
your own watchdog.

Cheers,
Don

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 21:24                           ` Don Zickus
@ 2008-09-04 21:46                             ` Vivek Goyal
  2008-09-05  8:57                               ` Ingo Molnar
  2008-09-05 10:24                             ` Ingo Molnar
  1 sibling, 1 reply; 29+ messages in thread
From: Vivek Goyal @ 2008-09-04 21:46 UTC (permalink / raw)
  To: Don Zickus
  Cc: Mingarelli, Thomas, Andi Kleen, Ingo Molnar, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

On Thu, Sep 04, 2008 at 05:24:53PM -0400, Don Zickus wrote:
> On Thu, Sep 04, 2008 at 05:21:34PM -0400, Vivek Goyal wrote:
> > On Thu, Sep 04, 2008 at 09:05:37PM +0000, Mingarelli, Thomas wrote:
> > > Ok regarding question #1. The die_notifier works as you mentioned; however, the fact that the watchdog timer ticks also come through as NMIs is a hinderance. Now, when the watchdog timer is configured through the LOCAL_APIC the issue isn't so bad. I think the hpwdt driver handles the NMI coming in because there isn't a flood of timer ticks coming through as in the IOAPIC case.
> > 
> > Ok, so how does replacing the nmi callback help here? You driver handler
> > be still called upon timer ticks. So you will be called on watchdog tick
> > whether you are on die chain or you replace nmi handler with nmi callback.
> > So watchdog ticks can't be a reason for not being on die chain.
> 
> Prarit's patch disabled the timer upon registering a callback to prevent
> this case.  The thought was if you have your own handler you could provide
> your own watchdog.
> 

In theory, you could do the same while registering the handler on die
chain?

I don't get the whole point. So we are looking for a system where no body
else uses an NMI for any purpose and the moment NMI happens, this driver
will go and panic() the system. I don't get, what do we achive by that?

Looks like you got some device in platform which raises an NMI and which
indicates that something is wrong and log the message and do a panic(). 
But you don't have a way to find out if that device has raised the
interrupt or something else has raised the NMI, hence you want to stop the
watchdog timer and assume any NMI henceforth is from device?

So any functionality which is dependent on NMI, will not work as long as
this driver is loaded. 

Thanks
Vivek

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 21:46                             ` Vivek Goyal
@ 2008-09-05  8:57                               ` Ingo Molnar
  0 siblings, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2008-09-05  8:57 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Don Zickus, Mingarelli, Thomas, Andi Kleen, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki, Eric W. Biederman

* Vivek Goyal <vgoyal@redhat.com> wrote:

> > Prarit's patch disabled the timer upon registering a callback to 
> > prevent this case.  The thought was if you have your own handler you 
> > could provide your own watchdog.
> 
> In theory, you could do the same while registering the handler on die 
> chain?
> 
> I don't get the whole point. So we are looking for a system where no 
> body else uses an NMI for any purpose and the moment NMI happens, this 
> driver will go and panic() the system. I don't get, what do we achive 
> by that?
> 
> Looks like you got some device in platform which raises an NMI and 
> which indicates that something is wrong and log the message and do a 
> panic(). But you don't have a way to find out if that device has 
> raised the interrupt or something else has raised the NMI, hence you 
> want to stop the watchdog timer and assume any NMI henceforth is from 
> device?
> 
> So any functionality which is dependent on NMI, will not work as long 
> as this driver is loaded.

exactly. The proposed patch brings the NMI notifier arms race to its 
next level: it is in essence a "super high priority" notifier that wants 
exclusivity. That is unnecessary: we already have sufficient 
infrastructure in place to recognize the priority of notifiers, and die 
notifiers make active use of it. (see the list below)

If the goal is to log relevant NMI events to NVRAM for later inspection 
then the solution is to register a low-priority notifier which will 
execute if no other notifier shows interest in an NMI event.

If the goal is to reboot the system if there's a "bad" NMI, then the 
solution is to register a low-priority notifier which will execute and 
panic the system if no other notifier shows interest in that NMI event.

This means all the 'good' NMI sources need to register at higher 
priority and all the standard platform fallback notifiers need to 
register at lowest priority. Debuggers go last. If any of the existing 
notifiers in the kernel dont follow the rules and cause problems then we 
can fix up those.

Here's a list of all current die notifiers we use in the kernel:

                                                                  prio
                                                                  ----
  # core kernel:

  kernel/kprobes.c                   # instruction probing        +INT_MAX
  kernel/trace/trace.c               # dump trace                 150

  # arch/x86:

  arch/x86/mm/kmmio.c                # mmiotrace, debug feature   0 
  arch/x86/kernel/kgdb.c             # kgdb, kernel debugger      -INT_MAX
  arch/x86/kernel/crash.c            # kdump                      0
  arch/x86/oprofile/nmi_int.c        # oprofile                   0
  arch/x86/oprofile/nmi_timer_int.c  # oprofile timer fallback    0

  # drivers:

  drivers/watchdog/hpwdt.c           # HP watchdog driver         +INT_MAX
  drivers/char/ipmi/ipmi_watchdog.c  # IPMI watchdog              150
  drivers/misc/sgi-xp/xpc_main.c     # SGI SN2                    0

  # upcoming:
  arch/x86/mm/kmemcheck/smp.c        # in tip/master              0
                                     [ this should change to +INT_MAX ]

One change we could do is to move arch/x86/kernel/crash.c's priority to 
-100. That would achieve the following effect: it would execute after 
all known sources of NMIs, but before the lowest prio interactive 
notifiers such as kgdb.

That way a platform driver could insert at priority -50 and execute 
after the known self-generated NMI sources, but before any of the debug 
and crash-tool notifiers. And the constants should be made symbolic as 
well, so that we encode the purpose of a notifier, not the 
implementation - and can map the purpose to priority and shape the 
ordering flexibly.

furthermore, as i indicated it in my first reply, if the port 61H read 
is causing problems on this platform (and i'd not be surprised if it did 
so) we can turn it into a lowest-prio fallback notifier which only 
executes if no known sources of NMIs show interest in an event.

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 20:00                 ` Andi Kleen
  2008-09-04 20:01                   ` Mingarelli, Thomas
@ 2008-09-05  9:33                   ` Ingo Molnar
  2008-09-05 14:16                     ` Vivek Goyal
  2008-09-05 14:18                     ` Andi Kleen
  1 sibling, 2 replies; 29+ messages in thread
From: Ingo Molnar @ 2008-09-05  9:33 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Vivek Goyal, Don Zickus, Prarit Bhargava, Peter Zijlstra,
	linux-kernel, arozansk, Thomas.Mingarelli, ak, Alan Cox,
	H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

* Andi Kleen <andi@firstfloor.org> wrote:

> > Add "kdump" to the list. It will also be broken if we decide to let 
> > one driver hijack the NMI handler.
> 
> kdump is a special case, similar to the NMI button panic mode. It 
> should be always only active when the user configured it. When the 
> user configured it should be always the fallback and override any 
> other drivers.

if by 'any other drivers' you mean all other notifiers then that's wrong 
- kdump must still come after many other NMI sources.

Basically, the sane order is this:

highest priority:

  instruction patching callbacks. (kprobes, mmiotrace, kmemcheck) These 
  are abstractions that are essential for the kernel to properly 
  function/execute. We dont ever want them to be overriden.

high priority:

  CPU-generated profiling callbacks. (nmi-lapic watchdog, performance 
  counter generated NMIs) These are 'good' NMI sources that can (well, 
  'could') identify themselves, and they can also come in very 
  frequently so we want to execute them early.

mid priority:

  optional/interactive debug facilities. (kdump, KGDB, trace dump, NMI 
  button).
  The user enables them optionally and wants them catch all non-expected 
  or interactive NMI events.

normal priority:

  various platform drivers. Infrequent NMI sources. It's what we use to 
  make unexpected events slightly more informative when the user does 
  not configure any explicit debugging helper.

lowest priority:

  fallback legacy platform handlers - 61H reads, etc.

All in one, the patch submitted here is wrong as a generic facility. One 
valid aspect of the patch is that the port 61H reads (and other 
architecture code chipset ops) should execute as a regular notifier and 
with low priority.

as it does not really solve anything in a structured way, it allows 
platform drivers to install a super-high priority notifier, creating 
needless duplication and confusion. The exact reasons for the changes 
should be listed instead and proper (and separate) patches done for each 
reason, along the suggestions in this thread - i believe all issues were 
covered in one way or another.

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-04 21:24                           ` Don Zickus
  2008-09-04 21:46                             ` Vivek Goyal
@ 2008-09-05 10:24                             ` Ingo Molnar
  1 sibling, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2008-09-05 10:24 UTC (permalink / raw)
  To: Don Zickus
  Cc: Vivek Goyal, Mingarelli, Thomas, Andi Kleen, Prarit Bhargava,
	Peter Zijlstra, linux-kernel@vger.kernel.org, arozansk@redhat.com,
	ak@linux.intel.com, Alan Cox, H. Peter Anvin, Thomas Gleixner,
	Maciej W. Rozycki

* Don Zickus <dzickus@redhat.com> wrote:

> Prarit's patch disabled the timer upon registering a callback to 
> prevent this case.  The thought was if you have your own handler you 
> could provide your own watchdog.

btw., while we are talking about watchdog design problems, the current 
way the NMI timer watchdog oprofile fallback uses the die handler and 
how it all interacts with the ioapic/lapic-timer NMI watchdog is 
misdesigned as well. (this is used in the relatively rare but still 
possible case where no primary oprofile handler is found and installed)

The i386 and x86_64 architecture code started using die notifiers for 
the NMI timer watchdog oprofile code two years ago, via commit 2fbe7b25 
"i386/x86-64: Remove un/set_nmi_callback and reserve/release_lapic_nmi 
functions". [ Hey - what a coincidence - that was done by you! ;-) ]

It was a nice cleanup but not complete: using a die notifier for the NMI 
watchdog does not work properly as the NMI watchdog driver has no 
knowledge currently about whether it got activated (often it's not even 
possible to tell it):

 +       case DIE_NMI:
 +               oprofile_add_sample(args->regs, 0);
 +               ret = NOTIFY_STOP;
 +               break;

it always returns NOTIFY_STOP. That breaks all other lower prio 
notifiers down the chain which might have proper knowledge about the 
source of the NMI. Such kind of cross-notifier breakage is one of the 
reasons why driver notifiers try to become exclusive and try to play 
tricks with the nmi watchdog.

Instead the full solution would be for it to return NOTIFY_OK when it is 
not the source of the NMI (and is able to tell it).

[
  The reason why i qualified my statement with "when it is able to tell 
  it" is that while it is possible to disambiguate the NMI source when 
  the source of the NMI was the local apic timer (we already do that in 
  lapic_wd_event() - we return 1 if we rearmed the counter in the CPU), 
  it is not possible to tell it reliably that the NMI source was the
  IO-APIC.

  The reason for that is that the IO-APIC generated NMIs are 
  fundamentally 'anonymous' in that case: we put the legacy PIC into 
  auto-EOI mode and the IO-APIC broadcasts the NMIs and they are edge 
  triggered (on all but some really ancient systems) . So there's no 
  permanent and reliable information left about why the NMI was raised 
  in the CPU. This is a property of the hardware, we cannot solve it on 
  the kernel side. (we tried explicit ACKs before, and those are even 
  worse.)
]

So the right long-term solution is to move all NMI watchdog code 
(including the lockup detection bits - nmi_watchdog_tick(), etc.) into 
the notifier, then to propagate the re-arm information from 
lapic_wd_event() back into the notifier return code, and perhaps to 
print a (one-time) warning into the kernel log about the (in essence) 
deactivated lower-prio notifiers when the IO-APIC watchdog is activated.

This is all non-default debug or instrumentation functionality so if 
they cancel out each other in certain rare cases that's not a big issue 
in practice, as long as the user is informed about the constraints.

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-05  9:33                   ` Ingo Molnar
@ 2008-09-05 14:16                     ` Vivek Goyal
  2008-09-05 14:18                     ` Andi Kleen
  1 sibling, 0 replies; 29+ messages in thread
From: Vivek Goyal @ 2008-09-05 14:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Don Zickus, Prarit Bhargava, Peter Zijlstra,
	linux-kernel, arozansk, Thomas.Mingarelli, ak, Alan Cox,
	H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

On Fri, Sep 05, 2008 at 11:33:03AM +0200, Ingo Molnar wrote:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > > Add "kdump" to the list. It will also be broken if we decide to let 
> > > one driver hijack the NMI handler.
> > 
> > kdump is a special case, similar to the NMI button panic mode. It 
> > should be always only active when the user configured it. When the 
> > user configured it should be always the fallback and override any 
> > other drivers.
> 
> if by 'any other drivers' you mean all other notifiers then that's wrong 
> - kdump must still come after many other NMI sources.
> 

Hi Ingo,

Kdump notifier is registered on the die list, only after when we have 
decided that system is no more runnable and it is time to die. So as long
as system is running, this notifier is not even present on the list and
does not compete for any of the NMI events.

But once the panic() has occurred, the only notifiers which probably
are interested in NMi events are kgdb and kdump? We probably don't want to
run any "instruction patching" callback or any profiling related callbacks
because system is already dead and we are just doing some last minute
information gathering/debugging exercise.

Keeping that in mind, I would think that it would make sense to register
kdump notifier at even higher priority (INT_MAX) and not lower the
priority. Because after panic() probably there is no body who is
interested in NMI event. Any contention for getting hold of panic() event
should be resolved at panic_notifier_list (kdump and kgdb contention comes
to mind). Possibly there are other users who want to get notified of
panic() event and do something. But I think platform drivers will not be
interested in NMI event after panic(). 

Thanks
Vivek
 
> Basically, the sane order is this:
> 
> highest priority:
>  
>   instruction patching callbacks. (kprobes, mmiotrace, kmemcheck) These 
>   are abstractions that are essential for the kernel to properly 
>   function/execute. We dont ever want them to be overriden.
> 
> high priority:
> 
>   CPU-generated profiling callbacks. (nmi-lapic watchdog, performance 
>   counter generated NMIs) These are 'good' NMI sources that can (well, 
>   'could') identify themselves, and they can also come in very 
>   frequently so we want to execute them early.
> 
> mid priority:
> 
>   optional/interactive debug facilities. (kdump, KGDB, trace dump, NMI 
>   button).
>   The user enables them optionally and wants them catch all non-expected 
>   or interactive NMI events.
> 
> normal priority:
> 
>   various platform drivers. Infrequent NMI sources. It's what we use to 
>   make unexpected events slightly more informative when the user does 
>   not configure any explicit debugging helper.
> 
> lowest priority:
> 
>   fallback legacy platform handlers - 61H reads, etc.
> 


> All in one, the patch submitted here is wrong as a generic facility. One 
> valid aspect of the patch is that the port 61H reads (and other 
> architecture code chipset ops) should execute as a regular notifier and 
> with low priority.
> 
> as it does not really solve anything in a structured way, it allows 
> platform drivers to install a super-high priority notifier, creating 
> needless duplication and confusion. The exact reasons for the changes 
> should be listed instead and proper (and separate) patches done for each 
> reason, along the suggestions in this thread - i believe all issues were 
> covered in one way or another.
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC] NMI Re-introduce un[set]_nmi_callback
  2008-09-05  9:33                   ` Ingo Molnar
  2008-09-05 14:16                     ` Vivek Goyal
@ 2008-09-05 14:18                     ` Andi Kleen
  1 sibling, 0 replies; 29+ messages in thread
From: Andi Kleen @ 2008-09-05 14:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Vivek Goyal, Don Zickus, Prarit Bhargava,
	Peter Zijlstra, linux-kernel, arozansk, Thomas.Mingarelli,
	Alan Cox, H. Peter Anvin, Thomas Gleixner, Maciej W. Rozycki

Ingo Molnar wrote:
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
>>> Add "kdump" to the list. It will also be broken if we decide to let 
>>> one driver hijack the NMI handler.
>> kdump is a special case, similar to the NMI button panic mode. It 
>> should be always only active when the user configured it. When the 
>> user configured it should be always the fallback and override any 
>> other drivers.
> 
> if by 'any other drivers' you mean all other notifiers then that's wrong 
> - kdump must still come after many other NMI sources.

Your ordering makes sense. Someone just has to go through all
the users and fix them up I guess and also document it properly.

One thing to consider though: if there are more and more NMI drivers
it would make sense to have a new notifier chain just for this
(and also finally convert oprofile to use it too).

The problem with adding more and more into the die chain is that
die is executed on every exception, including quite performance
critical ones like page fault or int 3 (performance critical for
dprobes)

-Andi


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2008-09-05 14:18 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-04 13:07 [PATCH RFC] NMI Re-introduce un[set]_nmi_callback Prarit Bhargava
2008-09-04 13:37 ` Peter Zijlstra
2008-09-04 14:29   ` Prarit Bhargava
2008-09-04 14:49     ` aris
2008-09-04 14:56     ` Ingo Molnar
2008-09-04 15:12       ` H. Peter Anvin
2008-09-04 15:18         ` Ingo Molnar
2008-09-04 15:52       ` Andi Kleen
2008-09-04 17:20         ` Don Zickus
2008-09-04 17:52           ` Andi Kleen
2008-09-04 18:26             ` Don Zickus
2008-09-04 18:47               ` Andi Kleen
2008-09-04 19:08               ` Vivek Goyal
2008-09-04 20:00                 ` Andi Kleen
2008-09-04 20:01                   ` Mingarelli, Thomas
2008-09-04 20:19                     ` Andi Kleen
2008-09-04 20:21                       ` Mingarelli, Thomas
2008-09-04 20:53                         ` Andi Kleen
2008-09-04 21:22                           ` Don Zickus
2008-09-04 20:57                     ` Vivek Goyal
2008-09-04 21:05                       ` Mingarelli, Thomas
2008-09-04 21:21                         ` Vivek Goyal
2008-09-04 21:24                           ` Don Zickus
2008-09-04 21:46                             ` Vivek Goyal
2008-09-05  8:57                               ` Ingo Molnar
2008-09-05 10:24                             ` Ingo Molnar
2008-09-05  9:33                   ` Ingo Molnar
2008-09-05 14:16                     ` Vivek Goyal
2008-09-05 14:18                     ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox