[RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-10  2:51 [RFC 1/6] x86, NMI, Add symbol definition for NMI magic constants Huang Ying
@ 2010-09-10  2:51 ` Huang Ying
  2010-09-10 16:02   ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Huang Ying @ 2010-09-10  2:51 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, Andi Kleen, Huang Ying

On some platforms, fatal hardware error may be notified via unknown
NMI.

For example, on some platforms with APEI firmware first mode support,
firmware generates NMI for fatal error but without error record. The
unknown NMI should be treated as notification of fatal hardware
error. The unknown_nmi_for_hwerr is added for these platforms; if it is
not zero, the system will treat unknown NMI as notification of fatal
hardware error.

These platforms are identified via the presence of APEI HEST or
some PCI ID of the host bridge. The PCI ID of the host bridge instead of
DMI ID is used, so that the checking can be done based on the platform
type instead of motherboard. This should be simpler and sufficient.

The method to identify the platforms is designed by Andi Kleen.

# TODO

- Report unknown NMI as fatal hardware error with new hardware error
  reporting interface, so that the error record can be gotten after
  system reboot.

- Because printk is not safe in NMI handler, all printk except that
  called after bust_spinlocks will be called in a delayed IRQ context
  to prevent system deadlock.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 arch/x86/include/asm/nmi.h |    1 
 arch/x86/kernel/Makefile   |    2 +
 arch/x86/kernel/hwerr.c    |   55 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c    |   10 ++++++++
 drivers/acpi/apei/hest.c   |    8 ++++++
 5 files changed, 76 insertions(+)
 create mode 100644 arch/x86/kernel/hwerr.c

--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,6 +44,7 @@ struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
+extern int unknown_nmi_for_hwerr;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -116,6 +116,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION)
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 
+obj-y					+= hwerr.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
--- /dev/null
+++ b/arch/x86/kernel/hwerr.c
@@ -0,0 +1,55 @@
+/*
+ * Hardware error architecture dependent processing
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/nmi.h>
+
+/*
+ * On some platforms, hardware errors may be notified via unknown
+ * NMI. These platforms are identified via the PCI ID of host bridge.
+ *
+ * The PCI ID of host bridge instead of DMI ID is used, so that the
+ * checking can be done based on the platform instead of motherboard.
+ * This should be simpler and sufficient.
+ */
+static const
+struct pci_device_id unknown_nmi_for_hwerr_platform[] __initdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3406) },
+	{ 0, }
+};
+
+int __init check_unknown_nmi_for_hwerr(void)
+{
+	struct pci_dev *dev = NULL;
+
+	for_each_pci_dev(dev) {
+		if (pci_match_id(unknown_nmi_for_hwerr_platform, dev)) {
+			pr_info(
+"Host bridge is identified, will treat unknown NMI as hardware error!\n");
+			unknown_nmi_for_hwerr = 1;
+			break;
+		}
+	}
+
+	return 0;
+}
+late_initcall(check_unknown_nmi_for_hwerr);
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 static int ignore_nmis;
 
+int unknown_nmi_for_hwerr;
+
 /*
  * Prevent NMI reason port (0x61) being accessed simultaneously, can
  * only be used in NMI handler.
@@ -349,6 +351,14 @@ io_check_error(unsigned char reason, str
 static notrace __kprobes void
 unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 {
+	/*
+	 * On some platforms, hardware errors may be notified via
+	 * unknown NMI
+	 */
+	if (unknown_nmi_for_hwerr)
+		panic("NMI for hardware error without error record: "
+		      "Not continuing");
+
 #ifdef CONFIG_MCA
 	/*
 	 * Might actually be able to figure out what the guilty party
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -35,6 +35,7 @@
 #include <linux/highmem.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/nmi.h>
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
@@ -222,6 +223,13 @@ static int __init hest_init(void)
 	if (rc)
 		goto err;
 
+	/*
+	 * A system with a proper HEST should treat unknown NMI as fatal
+	 * hardware error notification
+	 */
+	pr_info("HEST is valid, will treat unknown NMI as hardware error!\n");
+	unknown_nmi_for_hwerr = 1;
+
 	rc = hest_ghes_dev_register(ghes_count);
 	if (rc)
 		goto err;

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-10  2:51 ` [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI Huang Ying
@ 2010-09-10 16:02   ` Don Zickus
  2010-09-10 16:19     ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-10 16:02 UTC (permalink / raw)
  To: Huang Ying; +Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, Andi Kleen

> @@ -349,6 +351,14 @@ io_check_error(unsigned char reason, str
>  static notrace __kprobes void
>  unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
>  {
> +	/*
> +	 * On some platforms, hardware errors may be notified via
> +	 * unknown NMI
> +	 */
> +	if (unknown_nmi_for_hwerr)
> +		panic("NMI for hardware error without error record: "
> +		      "Not continuing");
> +
>  #ifdef CONFIG_MCA

I'm not sure I agree with this.  I still see PCI SERR's not coming in
through port 0x61 and get routed to unknown_nmi_error.  Not sure we should
just assume that it is an APEI/HEST error and panic the box.

Also all the perf problems we have seen recently have been going through
that path as we slowly try to figure out why we are not catching those
unknown nmis.

I am grasping for straws here, but is there a register that APEI/HEST can
poke to see if it generated the NMI?

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-10 16:02   ` Don Zickus
@ 2010-09-10 16:19     ` Andi Kleen
  2010-09-10 18:40       ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-10 16:19 UTC (permalink / raw)
  To: Don Zickus; +Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, linux-kernel


> I am grasping for straws here, but is there a register that APEI/HEST
> can poke to see if it generated the NMI?

HEST knows this yes.

But this is not about HEST errors, but about those without HEST
handling.

-Andi


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-10 16:19     ` Andi Kleen
@ 2010-09-10 18:40       ` Don Zickus
  2010-09-13  2:19         ` Huang Ying
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-10 18:40 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Ingo Molnar, H. Peter Anvin, linux-kernel

On Fri, Sep 10, 2010 at 06:19:29PM +0200, Andi Kleen wrote:
> 
> > I am grasping for straws here, but is there a register that APEI/HEST
> > can poke to see if it generated the NMI?
> 
> HEST knows this yes.
> 
> But this is not about HEST errors, but about those without HEST
> handling.

Don't most unknown NMIs fall into the same boat, that they were not being
handled properly?

On the other hand could you use the die_notifier_chain(DIE_UNKNOWNNMI) for
the same purpose and keep the unknown_nmi_error() handler a little
cleaner?

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-10 18:40       ` Don Zickus
@ 2010-09-13  2:19         ` Huang Ying
  2010-09-13 14:11           ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Huang Ying @ 2010-09-13  2:19 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Sat, 2010-09-11 at 02:40 +0800, Don Zickus wrote:
> On Fri, Sep 10, 2010 at 06:19:29PM +0200, Andi Kleen wrote:
> > 
> > > I am grasping for straws here, but is there a register that APEI/HEST
> > > can poke to see if it generated the NMI?
> > 
> > HEST knows this yes.
> > 
> > But this is not about HEST errors, but about those without HEST
> > handling.
> 
> Don't most unknown NMIs fall into the same boat, that they were not being
> handled properly?

As far as I know, at least on some platforms, unknown NMIs are used for
hardware error reporting. They will cause "Blue Screen" in Windows.

> On the other hand could you use the die_notifier_chain(DIE_UNKNOWNNMI) for
> the same purpose and keep the unknown_nmi_error() handler a little
> cleaner?

I think explicit function call has better readability than notifier
chain.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13  2:19         ` Huang Ying
@ 2010-09-13 14:11           ` Don Zickus
  2010-09-13 15:24             ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-13 14:11 UTC (permalink / raw)
  To: Huang Ying
  Cc: Andi Kleen, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Sep 13, 2010 at 10:19:49AM +0800, Huang Ying wrote:
> On Sat, 2010-09-11 at 02:40 +0800, Don Zickus wrote:
> > On Fri, Sep 10, 2010 at 06:19:29PM +0200, Andi Kleen wrote:
> > > 
> > > > I am grasping for straws here, but is there a register that APEI/HEST
> > > > can poke to see if it generated the NMI?
> > > 
> > > HEST knows this yes.
> > > 
> > > But this is not about HEST errors, but about those without HEST
> > > handling.
> > 
> > Don't most unknown NMIs fall into the same boat, that they were not being
> > handled properly?
> 
> As far as I know, at least on some platforms, unknown NMIs are used for
> hardware error reporting. They will cause "Blue Screen" in Windows.

Unfortunately, in most of the bugzillas I deal with, unknown NMIs are the
result of SERRs.  While you can consider that hardware error reporting,
the easiest way for me to debug those problems currently is to have
reporters run 'lspci -vvv' after the NMI is displayed to figure out who
caused the NMI.

My fear is that panic'ing the box on unknown NMIs on those platforms will
hinder my ability to easily debug those NMIs.

> 
> > On the other hand could you use the die_notifier_chain(DIE_UNKNOWNNMI) for
> > the same purpose and keep the unknown_nmi_error() handler a little
> > cleaner?
> 
> I think explicit function call has better readability than notifier
> chain.

Ok.  What criteria should we establish to determine which functions go on
the notifier chain and which ones can explicitly called?

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 14:11           ` Don Zickus
@ 2010-09-13 15:24             ` Andi Kleen
  2010-09-13 15:47               ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-13 15:24 UTC (permalink / raw)
  To: Don Zickus
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

Don,

> Unfortunately, most of the bugzillas I deal with, unkown NMIs are the
> result of SERRs.  While you can consider that hardware error
> reporting, the easiest way for me to debug those problems currently
> is to have reporters run 'lspci -vvv' after the NMI is displayed to
> figure out who caused the NMI.
> 
> My fear is that panic'ing the box on unknown NMIs on those platforms
> will hinder my ability to easily debug those NMIs.

The reason the NMI is sent is that there is a "lost"
data corruption somewhere in the system and if you don't
stop the system the corruption may make it to disk,
become permanent etc. The hardware was designed
under the assumption that the system is stopped by software
when this happens (same reason as why many machine
checks cause panics)

Then there isn't necessarily something to "debug": data corruption
can happen without any bugs being around (and in fact
that's the common case, assuming production systems)

So I'm not sure what you're debugging here. Are you being the support
technician for the system through bugzilla? That sounds
inefficient.

Anyways for hardware support we could probably dump some
more information at panic or better through error
serialization, but the word "debug" is IMHO a very wrong
way to think about that.

If this is about driver debugging it's entirely reasonable
to have a special setting (e.g. disable the panic), 
but the defaults should be set in a way to avoid
spreading data corruption.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 15:24             ` Andi Kleen
@ 2010-09-13 15:47               ` Don Zickus
  2010-09-13 16:57                 ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-13 15:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Sep 13, 2010 at 05:24:38PM +0200, Andi Kleen wrote:
> 
> Don,
> 
> > Unfortunately, most of the bugzillas I deal with, unkown NMIs are the
> > result of SERRs.  While you can consider that hardware error
> > reporting, the easiest way for me to debug those problems currently
> > is to have reporters run 'lspci -vvv' after the NMI is displayed to
> > figure out who caused the NMI.
> > 
> > My fear is that panic'ing the box on unknown NMIs on those platforms
> > will hinder my ability to easily debug those NMIs.
> 
> 
> The reason the NMI is sent is that there is a "lost" 
> data corruption somewhere in the system and if you don't 
> stop it the system the corruption may make it to disk,
> become permanent etc. The hardware was designed
> under the assumption that  the system is stopped by software
> when this happens (same reason as why many machine
> checks cause panics)

Yeah, I know. I was being too ignorant perhaps.

> 
> Then there isn't necessarily something to "debug": data corruption
> can happen without any bugs being around (and in fact
> that's the common case, assuming production systems)
> 
> So I'm not sure what you're debugging here. Are you being the support
> technician for the system through bugzilla? That sounds
> inefficient.

The problem I repeatedly deal with for RHEL systems is a customer sees an
unknown NMI printed on their screen and sometimes the machine falls apart
shortly after, sometimes it doesn't.  Obviously they are going to file a
bug asking why.  A chunk of the problems are bad hardware/firmware.  But
the problem is which one.

Replacing a slot card is easy, replacing a motherboard is not.  So I
usually try to determine which device is failing by walking the pci bus
and looking for the serr bits or some of the pci-e status bits.

It is inefficient, but I haven't had time to figure out a way to clean it
up.  And just for the record, I usually see an unknown NMI report every
other week.

> 
> Anyways for hardware support we could probably dump some
> more information at panic or better through error
> serialization, but the word "debug" is IMHO an very wrong
> way to think about that.

Well, I can use 'diagnose' or 'determine' if you want.  But at the end of
the day, we have customers that see scary software messages and expect us
to give them reasonable direction to fix their problems.

> 
> If this is about driver debugging it's entirely reasonable
> to have a special setting (e.g. disable the panic), 
> but the defaults should be set in a way to avoid
> spreading data corruption,.

Ok.  I can accept that.

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 15:47               ` Don Zickus
@ 2010-09-13 16:57                 ` Andi Kleen
  2010-09-13 17:53                   ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-13 16:57 UTC (permalink / raw)
  To: Don Zickus
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, 13 Sep 2010 11:47:50 -0400
Don Zickus <dzickus@redhat.com> wrote:

> 
> > 
> > Then there isn't necessarily something to "debug": data corruption
> > can happen without any bugs being around (and in fact
> > that's the common case, assuming production systems)
> > 
> > So I'm not sure what you're debugging here. Are you being the
> > support technician for the system through bugzilla? That sounds
> > inefficient.
> 
> The problem I repeatedly deal with for RHEL systems is a customer
> sees an unknown NMI printed on their screen and sometimes the machine
> falls apart shortly after, sometimes it doesn't.  Obviously they are
> going to file a bug asking why.  A chunk of the problems are bad
> hardware/firmware.  But the problem is which one.

NMIs are usually hardware.

BTW one big issue here is that we don't display anything
on the screen so the system seems hung. KMS solves this,
but unfortunately not for the video chipsets 
often used in servers.

Part of it is solved by serializing the error
and defaulting to reboot after panic (currently NMI doesn't do that,
MCE already does, NMI should too imho) 

> 
> Replacing a slot card is easy, replacing a motherboard is not.  So I
> usually try to determine which device is failing by walking the pci
> bus and looking for the serr bits or some of the pci-e status bits.

You don't necessarily need to replace anything, it could
be just unlucky data corruption (e.g. a big enough cosmic ray
that flipped enough bits that the normal error correction
couldn't fix it anymore)

> 
> It is inefficient, but I haven't had time to figure out a way to
> clean it up.  And just for the record, I usually see an unknown NMI
> report every other week.

At least ignoring the data corruption is not the way to handle
it. I don't think you'll do your customers a favor this way.

> > Anyways for hardware support we could probably dump some
> > more information at panic or better through error
> > serialization, but the word "debug" is IMHO an very wrong
> > way to think about that.
> 
> Well, I can use 'diagnos' or 'determine' if you want.  But at the end
> of the day, we have customers that see scary software messages and
> expect us to give them reasonable direction to fix their problems.

Usually these problems shouldn't be handled by kernel hackers,
it's something for a hardware technician. If kernel
hackers have to handle it something is very wrong.

IMHO the software should give the customer enough information
to fix (or rather let their hardware technician) work it out.

If it's not good enough for this we have to improve it. But
ignoring the errors is not the way to do that.

BTW one issue is that the screen is not big enough for all
the information that is really useful for this. So I suspect
to have it really useful you need to accept that some information
will only be available through serialization (e.g. if you 
wanted to dump parts of the PCI config space)

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 16:57                 ` Andi Kleen
@ 2010-09-13 17:53                   ` Don Zickus
  2010-09-13 18:07                     ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-13 17:53 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Sep 13, 2010 at 06:57:21PM +0200, Andi Kleen wrote:
> On Mon, 13 Sep 2010 11:47:50 -0400
> Don Zickus <dzickus@redhat.com> wrote:
> 
> > 
> > > 
> > > Then there isn't necessarily something to "debug": data corruption
> > > can happen without any bugs being around (and in fact
> > > that's the common case, assuming production systems)
> > > 
> > > So I'm not sure what you're debugging here. Are you being the
> > > support technician for the system through bugzilla? That sounds
> > > inefficient.
> > 
> > The problem I repeatedly deal with for RHEL systems is a customer
> > sees an unknown NMI printed on their screen and sometimes the machine
> > falls apart shortly after, sometimes it doesn't.  Obviously they are
> > going to file a bug asking why.  A chunk of the problems are bad
> > hardware/firmware.  But the problem is which one.
> 
> NMIs are usually hardware.
> 
> BTW one big issue here is that we don't display anything
> on the screen so the system seems hung. KMS solves this,
> but unfortunately not for the video chipsets 
> often used in servers.

No, most of our customers see messages being sent to the console or serial
port.  I haven't seen KMS hiding the info yet.

> 
> Part of it is solved by serializing the error
> and defaulting to reboot after panic (currently NMI doesn't do that,
> MCE already does, NMI should too imho) 
> 
> > 
> > Replacing a slot card is easy, replacing a motherboard is not.  So I
> > usually try to determine which device is failing by walking the pci
> > bus and looking for the serr bits or some of the pci-e status bits.
> 
> You don't necessarily need to replace anything, it could
> be just unlucky data corruption (e.g. a big enough cosmic ray
> that flipped enough bits that the normal error correction
> couldn't fix it anymore)

No, these are easily reproducible NMIs.  So far they have been the
result of bad firmware (either features that are marked supported but not,
or register settings that changed between updates), nic cards that had
issues, or bad motherboards.

None of these issues went away because of a reboot.

> 
> > 
> > It is inefficient, but I haven't had time to figure out a way to
> > clean it up.  And just for the record, I usually see an unknown NMI
> > report every other week.
> 
> At least ignoring the data corruption is not the way to handle
> it. I don't think you'll do your customers a favor this way.

I never said I ignore them.  We try to resolve them quickly.

>  
> > > Anyways for hardware support we could probably dump some
> > > more information at panic or better through error
> > > serialization, but the word "debug" is IMHO an very wrong
> > > way to think about that.
> > 
> > Well, I can use 'diagnos' or 'determine' if you want.  But at the end
> > of the day, we have customers that see scary software messages and
> > expect us to give them reasonable direction to fix their problems.
> 
> Usually these problems shouldn't be handled by kernel hackers,
> it's something for a hardware technician. If kernel
> hackers have to handle it something is very wrong.
> 
> IMHO the software should give the customer enough information
> to fix (or rather let their hardware technician) work it out.

Yes, I agree, but the hardware folks usually like it when we give them a
better clue than 'hardware is broken'.  Something like the network stopped
working or your storage controller's firmware went bad, is usually more
helpful.

And the thing is, the hardware usually leaves a bread crumb trail of where
things went wrong.  It is just a matter of poking different chips to find
out who generated the error and report that.

> 
> BTW one issue is that the screen is not big enough for all
> the information that is really useful for this. So I suspect
> to have it really useful you need to accept that some information
> will only be available through serialization (e.g. if you 
> wanted to dump parts of the PCI config space)

Honestly, I don't think you need much screen real estate.  It would be
nice when an unknown NMI comes in, if the kernel just pokes around the hardware
registers and display a summary of what it found.  For example,

The following devices had error bits set in the status registers:
PCI device x:y.z - STATUS_BIT1 | STATUS_BIT2
HW device xyz - STATUS_BIT3
...

This at least gives the users some hardware they can remove/replace to see
if the problem goes away.

Right now I feel like it is one giant guessing game.

But I guess if we accept the fact that an unknown NMI will panic the box,
then we can probably be a little more liberal in breaking spinlocks and
poking around the hardware to display some useful info.

Just some thoughts.

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 17:53                   ` Don Zickus
@ 2010-09-13 18:07                     ` Andi Kleen
  2010-09-13 18:23                       ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-13 18:07 UTC (permalink / raw)
  To: Don Zickus
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org


> 
> Honestly, I don't think you need much screen real estate.  It would be
> nice when an unknown NMI comes in, if the kernel just pokes around
> the hardware registers and display a summary of what it found.  For
> example,
> 
> The following devices had error bits set in the status registers:
> PCI device x:y.z - STATUS_BIT1 | STATUS_BIT2
> HW device xyz - STATUS_BIT3
> ...

You mean data from the generic PCI config space?

I don't think i would feel comfortable with arbitrary driver callbacks
(the risk of the driver breaking the panic would be high)

But if it's generic if not on the screen it should
be at least in the error serialization data and logged after boot.

At least on PCI-E it may be enough to simply dump all recent AER
data.

> 
> But I guess if we accept the fact that an unknown NMI will panic the
> box, then we can probably be a little more liberal in breaking
> spinlocks and poking around the hardware to display some userful info.

You have to be a bit careful with that, you may cause nested errors
(e.g. machine checks or more NMIs). I suppose this could be checked for
though.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 18:07                     ` Andi Kleen
@ 2010-09-13 18:23                       ` Don Zickus
  2010-09-13 18:36                         ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-13 18:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Sep 13, 2010 at 08:07:07PM +0200, Andi Kleen wrote:
> 
> > 
> > Honestly, I don't think you need much screen real estate.  It would be
> > nice when an unknown NMI comes in, if the kernel just pokes around
> > the hardware registers and display a summary of what it found.  For
> > example,
> > 
> > The following devices had error bits set in the status registers:
> > PCI device x:y.z - STATUS_BIT1 | STATUS_BIT2
> > HW device xyz - STATUS_BIT3
> > ...
> 
> You mean data from the generic PCI config space?

Yes. I normally just look at the Status register.  With PCI-e I'll look at
the other status registers in the capabilities field too.

> 
> I don't think i would feel comfortable with arbitrary driver callbacks
> (the risk of the driver breaking the panic would be high)

Neither would I.

> 
> But if it's generic if not on the screen it should
> be at least in the error serialization data and logged after boot.

I guess I don't know what that is, 'error serialization data'.  Is there
somewhere I can read more about it?

> 
> At least on PCI-E it may be enough to simply dump all recent AER
> data.

This assumes AER is supported on the bridge?  Which for newer chips is
probably true, but I wasn't sure about older ones.

How would I dump AER data from within the kernel?

> 
> > 
> > But I guess if we accept the fact that an unknown NMI will panic the
> > box, then we can probably be a little more liberal in breaking
> > spinlocks and poking around the hardware to display some userful info.
> 
> You have to be a bit careful with that, you may caused nested errors
> (e.g. machine checks or more NMIs). I suppose this could be checked for
> though.

Of course.

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 18:23                       ` Don Zickus
@ 2010-09-13 18:36                         ` Andi Kleen
  2010-09-13 19:36                           ` Don Zickus
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-13 18:36 UTC (permalink / raw)
  To: Don Zickus
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org


> > 
> > But if it's generic if not on the screen it should
> > be at least in the error serialization data and logged after boot.
> 
> I guess I don't know what that is, 'error serialization data'.  Is
> there somewhere I can read more about it?

That's already supported in MCE -- saving the error record to NVRAM
and logging it after reboot. NMI should probably do the same.
It's much nicer than getting it from a console.

> > 
> > At least on PCI-E it may be enough to simply dump all recent AER
> > data.
> 
> This assumes AER is supported on the bridge?  Which for newer chips is
> probably true, but I wasn't sure about older ones.

Today's servers should usually have AER at least.

For old systems you only can get the few bits in PCI space.

> How would I dump AER data from within the kernel?

Would need a buffer that is dumped for past events and 
reading the registers for not yet reported. Right now some
infrastructure is needed.


-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 18:36                         ` Andi Kleen
@ 2010-09-13 19:36                           ` Don Zickus
  2010-09-13 20:49                             ` Andi Kleen
  2010-09-14 12:21                             ` Ingo Molnar
  0 siblings, 2 replies; 27+ messages in thread
From: Don Zickus @ 2010-09-13 19:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, Sep 13, 2010 at 08:36:54PM +0200, Andi Kleen wrote:
> 
> > > 
> > > But if it's generic if not on the screen it should
> > > be at least in the error serialization data and logged after boot.
> > 
> > I guess I don't know what that is, 'error serialization data'.  Is
> > there somewhere I can read more about it?
> 
> That's already supported in MCE -- saving the error record to NVRAM
> and logging it after reboot. NMI should probably do the same.
> It's much nicer than getting it from a console.

Hmm, that assumes these boxes have NVRAM.  I am not sure if many of the
boxes I see with problems have NVRAM on them.

> 
> > > 
> > > At least on PCI-E it may be enough to simply dump all recent AER
> > > data.
> > 
> > This assumes AER is supported on the bridge?  Which for newer chips is
> > probably true, but I wasn't sure about older ones.
> 
> Today's servers should usually have AER at least.
> 
> For old systems you only can get the few bits in PCI space.
> 
> > How would I dump AER data from within the kernel?
> 
> Would need a buffer that is dumped for past events and 
> reading the registers for not yet reported. Right now some
> infrastructure is needed.

Oh ok.

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 19:36                           ` Don Zickus
@ 2010-09-13 20:49                             ` Andi Kleen
  2010-09-13 21:25                               ` Valdis.Kletnieks
  2010-09-14 12:21                             ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-13 20:49 UTC (permalink / raw)
  To: Don Zickus
  Cc: Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, 13 Sep 2010 15:36:55 -0400
Don Zickus <dzickus@redhat.com> wrote:

> On Mon, Sep 13, 2010 at 08:36:54PM +0200, Andi Kleen wrote:
> > 
> > > > 
> > > > But if it's generic if not on the screen it should
> > > > be at least in the error serialization data and logged after
> > > > boot.
> > > 
> > > I guess I don't know what that is, 'error serialization data'.  Is
> > > there somewhere I can read more about it?
> > 
> > That's already supported in MCE -- saving the error record to NVRAM
> > and logging it after reboot. NMI should probably do the same.
> > It's much nicer than getting it from a console.
> 
> Hmm, that assumes these boxes have NVRAM.  I am not sure if many of
> the boxes I see with problems have NVRAM on them.

Practically every PC has a small amount of NVRAM.

-Andi


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 20:49                             ` Andi Kleen
@ 2010-09-13 21:25                               ` Valdis.Kletnieks
  2010-09-14  7:48                                 ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Valdis.Kletnieks @ 2010-09-13 21:25 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 885 bytes --]

On Mon, 13 Sep 2010 22:49:52 +0200, Andi Kleen said:

> > > That's already supported in MCE -- saving the error record to NVRAM
> > > and logging it after reboot. NMI should probably do the same.
> > > It's much nicer than getting it from a console.
> > 
> > Hmm, that assumes these boxes have NVRAM.  I am not sure if many of
> > the boxes I see with problems have NVRAM on them.
> 
> Practically every PC has a small amount of NVRAM.

The big question is how much NVRAM the PC has that is safe for our NMI code to
hijack/borrow across the reboot without scrogging something that the BIOS has
squirreled away in there. I recall one patch that saved progress indicators during
early boot or something - but at the expense of stomping on the saved clock
settings or whatever so you rebooted and then you knew where your previous boot
wedged, but your system thought it was 1970 again. 


[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 21:25                               ` Valdis.Kletnieks
@ 2010-09-14  7:48                                 ` Andi Kleen
  2010-09-14 17:54                                   ` Valdis.Kletnieks
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-09-14  7:48 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Don Zickus, Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Mon, 13 Sep 2010 17:25:21 -0400
Valdis.Kletnieks@vt.edu wrote:

> > Practically every PC has a small amount of NVRAM.
> 
> The big question is how much NVRAM the PC has that is safe for our
> NMI code to hijack/borrow across the reboot without scrogging
> something that the BIOS has squirreled away in there. I recall one
> patch that saved progress indicators during early boot or something -
> but at the expense of stomping on the saved clock settings or
> whatever so you rebooted and then you knew where your previous boot
> wedged, but your system thought it was 1970 again. 

It's already implemented for MCE and it works on servers.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-13 19:36                           ` Don Zickus
  2010-09-13 20:49                             ` Andi Kleen
@ 2010-09-14 12:21                             ` Ingo Molnar
  2010-09-14 13:45                               ` Don Zickus
  2010-09-14 19:34                               ` Cyrill Gorcunov
  1 sibling, 2 replies; 27+ messages in thread
From: Ingo Molnar @ 2010-09-14 12:21 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Don Zickus <dzickus@redhat.com> wrote:

> > > > At least on PCI-E it may be enough to simply dump all recent AER 
> > > > data.
> > > 
> > > This assumes AER is supported on the bridge?  Which for newer 
> > > chips is probably true, but I wasn't sure about older ones.
> > 
> > Today's servers should usually have AER at least.
> > 
> > For old systems you only can get the few bits in PCI space.
> > 
> > > How would I dump AER data from within the kernel?
> > 
> > Would need a buffer that is dumped for past events and reading the 
> > registers for not yet reported. Right now some infrastructure is 
> > needed.
> 
> Oh ok.

The proper approach would be not to add hacks to the NMI code but to 
implement southbridge drivers - which would also have NMI callbacks. 
These are uncharted waters, but variance in that space is reducing 
systematically so it would be worth a shot.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 12:21                             ` Ingo Molnar
@ 2010-09-14 13:45                               ` Don Zickus
  2010-09-14 19:34                               ` Cyrill Gorcunov
  1 sibling, 0 replies; 27+ messages in thread
From: Don Zickus @ 2010-09-14 13:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Tue, Sep 14, 2010 at 02:21:31PM +0200, Ingo Molnar wrote:
> 
> * Don Zickus <dzickus@redhat.com> wrote:
> 
> > > > > At least on PCI-E it may be enough to simply dump all recent AER 
> > > > > data.
> > > > 
> > > > This assumes AER is supported on the bridge?  Which for newer 
> > > > chips is probably true, but I wasn't sure about older ones.
> > > 
> > > Today's servers should usually have AER at least.
> > > 
> > > For old systems you only can get the few bits in PCI space.
> > > 
> > > > How would I dump AER data from within the kernel?
> > > 
> > > Would need a buffer that is dumped for past events and reading the 
> > > registers for not yet reported. Right now some infrastructure is 
> > > needed.
> > 
> > Oh ok.
> 
> The proper approach would be not to add hacks to the NMI code but to 
> implement southbridge drivers - which would also have NMI callbacks. 
> These are uncharted waters, but variance in that space is reducing 
> systematically so it would be worth a shot.

Interesting.  I think the only southbridge I see regularly are Intel, AMD
and Nvidia (with Nvidia being more problematic than others).
Unfortunately, getting specs for Nvidia is very difficult.

But that might help narrow down where the NMI problem is.

Cheers,
Don

> 
> Thanks,
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
@ 2010-09-14 14:31 Andi Kleen
  2010-09-14 15:17 ` Don Zickus
  2010-09-14 17:48 ` Ingo Molnar
  0 siblings, 2 replies; 27+ messages in thread
From: Andi Kleen @ 2010-09-14 14:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Andi Kleen, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org

>
> The proper approach would be not to add hacks to the NMI code but to
> implement southbridge drivers - which would also have NMI callbacks.

BTW southbridges do less and less regarding PCI.

> These are uncharted waters, but variance in that space is reducing
> systematically so it would be worth a shot.

You don't really need special drivers for AER -- it's fully standardized
and works generically. I think the old PCI-X error bits Don was interested
in were also all architectural.

The driver for the first is already there, just right now the
information is not dumped in the right places.

There are a few platform specific error signals, but they tend
to be rather obscure stuff. The "meat" is all in the standard.

-Andi

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 14:31 [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI Andi Kleen
@ 2010-09-14 15:17 ` Don Zickus
  2010-09-14 17:40   ` Andi Kleen
  2010-09-14 17:48 ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Don Zickus @ 2010-09-14 15:17 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ingo Molnar, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Tue, Sep 14, 2010 at 04:31:20PM +0200, Andi Kleen wrote:
> 
> >
> > The proper approach would be not to add hacks to the NMI code but to
> > implement southbridge drivers - which would also have NMI callbacks.
> 
> BTW southbridges do less and less regarding PCI.
> 
> > These are uncharted waters, but variance in that space is reducing
> > systematically so it would be worth a shot.
> 
> You don't really need special drivers for AER -- it's fully standardized
> and works generically. I think the old PCI-X error bits Don was interested
> in were also all architectural.
> 
> The driver for the first is already there, just right now the
> information is not dumped in the right places.
> 
> There are a few platform specific error signals, but they tend
> to be rather obscure stuff. The "meat" is all in the standard.

From what someone explained to me, the only platform that implements AER
correctly is Nehalem based ones.  So for AMD and Intel Core arches, AER is
not expected to work.  Is that incorrect?

Cheers,
Don

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 15:17 ` Don Zickus
@ 2010-09-14 17:40   ` Andi Kleen
  0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2010-09-14 17:40 UTC (permalink / raw)
  To: Don Zickus
  Cc: Ingo Molnar, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org

> 
> From what someone explained to me, the only platform that implements
> AER correctly is Nehalem based ones.  So for AMD and Intel Core
> arches, AER is not expected to work.  Is that incorrect?

At least for Intel it's definitely not true. AER goes back a
lot of generations, pretty much all that support PCI Express. 

There were some bugs in it of course and some workarounds needed, but
no show stoppers in any particular part to my knowledge.

Haven't kept fully track of AMD systems recently, but would surprise
me if it was the case on AMD either.

This applies to servers. On clients AER is more spotty
and various ones simply don't have it.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 14:31 [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI Andi Kleen
  2010-09-14 15:17 ` Don Zickus
@ 2010-09-14 17:48 ` Ingo Molnar
  2010-09-15  5:06   ` Huang Ying
  1 sibling, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2010-09-14 17:48 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Some One <some@where> stripped out all quoted identities and wrote:

> >
> > The proper approach would be not to add hacks to the NMI code but to 
> > implement southbridge drivers - which would also have NMI callbacks.
> 
> BTW southbridges do less and less regarding PCI.

Except WICTCR.

> > These are uncharted waters, but variance in that space is reducing 
> > systematically so it would be worth a shot.
> 
> You don't really need special drivers for AER [...]

On the contrary, we need proper driverization for _everything_ new. 
Embedded x86 is here to stay, so we are abstracting away each and every 
bit of the platform. See struct x86_ops for a highlevel platform driver 
- but a more specific, southbridge-encompassing driver framework can be 
created too.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14  7:48                                 ` Andi Kleen
@ 2010-09-14 17:54                                   ` Valdis.Kletnieks
  0 siblings, 0 replies; 27+ messages in thread
From: Valdis.Kletnieks @ 2010-09-14 17:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, Huang Ying, Ingo Molnar, H. Peter Anvin,
	linux-kernel@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 455 bytes --]

On Tue, 14 Sep 2010 09:48:13 +0200, Andi Kleen said:

> It's already implemented for MCE and it works on servers.

I'm OK with snarfing up some NVRAM, as long as we have some way to check "yes,
there are NN bytes of NVRAM architected for our use".

Did I miss the memo where it says "This code will only run on chipsets that
have architected nvram space for this sort of use"?  Wouldn't be the first time
I've misunderstood what code is really doing. ;)

[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 12:21                             ` Ingo Molnar
  2010-09-14 13:45                               ` Don Zickus
@ 2010-09-14 19:34                               ` Cyrill Gorcunov
  2010-09-15  9:29                                 ` Ingo Molnar
  1 sibling, 1 reply; 27+ messages in thread
From: Cyrill Gorcunov @ 2010-09-14 19:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Don Zickus, Andi Kleen, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Tue, Sep 14, 2010 at 02:21:31PM +0200, Ingo Molnar wrote:
> 
... 
> The proper approach would be not to add hacks to the NMI code but to 
> implement southbridge drivers - which would also have NMI callbacks. 
> These are uncharted waters, but variance in that space is reducing 
> systematically so it would be worth a shot.
> 
> Thanks,
> 
> 	Ingo

Hi Ingo,

while there is a conversation about making the NMI handler robust/modern or
whatever, I think the naming Huang has implemented for NMI Stat&Ctrl
registers/ports looks quite good and convenient (I thought about this some time
ago when merging the nmi-32/64 code but didn't implement it properly).
So I presume perhaps we could merge these snippets first? Or am I missing
something in the discussion in general?

	-- Cyrill

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 17:48 ` Ingo Molnar
@ 2010-09-15  5:06   ` Huang Ying
  0 siblings, 0 replies; 27+ messages in thread
From: Huang Ying @ 2010-09-15  5:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andi Kleen, Don Zickus, H. Peter Anvin,
	linux-kernel@vger.kernel.org

On Wed, 2010-09-15 at 01:48 +0800, Ingo Molnar wrote:
> * Some One <some@where> stripped out all quoted identities and wrote:
> > >
> > > The proper approach would be not to add hacks to the NMI code but to 
> > > implement southbridge drivers - which would also have NMI callbacks.
> > 
> > BTW southbridges do less and less regarding PCI.
> 
> Except WICTCR.
> 
> > > These are uncharted waters, but variance in that space is reducing 
> > > systematically so it would be worth a shot.
> > 
> > You don't really need special drivers for AER [...]
> 
> On the contrary, we need proper driverization for _everything_ new. 
> Embedded x86 is here to stay, so we are abstracting away each and every 
> bit of the platform. See struct x86_ops for a highlevel platform driver 
> - but a more specific, southbridge-encompassing driver framework can be 
> created too.

All in all, we can have proper drivers to printk/write to NVRAM the
error information collected for unknown NMI, regardless of whether they
are AER or southbridge drivers or both. And the drivers will register
handlers for unknown NMI. Do you agree?

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI
  2010-09-14 19:34                               ` Cyrill Gorcunov
@ 2010-09-15  9:29                                 ` Ingo Molnar
  0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2010-09-15  9:29 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Don Zickus, Andi Kleen, Huang Ying, H. Peter Anvin,
	linux-kernel@vger.kernel.org


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Tue, Sep 14, 2010 at 02:21:31PM +0200, Ingo Molnar wrote:
> > 
> ... 
> > The proper approach would be not to add hacks to the NMI code but to 
> > implement southbridge drivers - which would also have NMI callbacks. 
> > These are uncharted waters, but variance in that space is reducing 
> > systematically so it would be worth a shot.
> > 
> > Thanks,
> > 
> > 	Ingo
> 
> Hi Ingo,
> 
> while there is a conversation about making the NMI handler robust/modern 
> or whatever, I think the naming Huang has implemented for NMI 
> Stat&Ctrl registers/ports looks quite good and convenient (I thought 
> about this some time ago when merging the nmi-32/64 code but didn't 
> implement it properly). So I presume perhaps we could merge these 
> snippets first? Or am I missing something in the discussion in general?

Yeah, i'm waiting for Don to pick out the good ones and send them to me 
after a bit of testing (he's been driving this topic lately). We can 
obviously make some progress.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2010-09-15  9:29 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-14 14:31 [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI Andi Kleen
2010-09-14 15:17 ` Don Zickus
2010-09-14 17:40   ` Andi Kleen
2010-09-14 17:48 ` Ingo Molnar
2010-09-15  5:06   ` Huang Ying
  -- strict thread matches above, loose matches on Subject: below --
2010-09-10  2:51 [RFC 1/6] x86, NMI, Add symbol definition for NMI magic constants Huang Ying
2010-09-10  2:51 ` [RFC 5/6] x86, NMI, Add support to notify hardware error with unknown NMI Huang Ying
2010-09-10 16:02   ` Don Zickus
2010-09-10 16:19     ` Andi Kleen
2010-09-10 18:40       ` Don Zickus
2010-09-13  2:19         ` Huang Ying
2010-09-13 14:11           ` Don Zickus
2010-09-13 15:24             ` Andi Kleen
2010-09-13 15:47               ` Don Zickus
2010-09-13 16:57                 ` Andi Kleen
2010-09-13 17:53                   ` Don Zickus
2010-09-13 18:07                     ` Andi Kleen
2010-09-13 18:23                       ` Don Zickus
2010-09-13 18:36                         ` Andi Kleen
2010-09-13 19:36                           ` Don Zickus
2010-09-13 20:49                             ` Andi Kleen
2010-09-13 21:25                               ` Valdis.Kletnieks
2010-09-14  7:48                                 ` Andi Kleen
2010-09-14 17:54                                   ` Valdis.Kletnieks
2010-09-14 12:21                             ` Ingo Molnar
2010-09-14 13:45                               ` Don Zickus
2010-09-14 19:34                               ` Cyrill Gorcunov
2010-09-15  9:29                                 ` Ingo Molnar

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.