[PATCH] 3/3: MCA/MCE correctable error handling

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] 3/3: MCA/MCE correctable error handling
@ 2007-08-21 13:31 Christoph Egger
  2007-08-21 16:02 ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Egger @ 2007-08-21 13:31 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 511 bytes --]


This is patch 3/3.

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>



-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: mca_diff3_event.diff --]
[-- Type: text/x-diff, Size: 16287 bytes --]

diff -r a5209d79d241 -r e18773b9584c xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Tue Aug 21 14:13:19 2007 +0200
@@ -13,6 +13,7 @@
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
@@ -89,3 +90,94 @@ static int __init mcheck_enable(char *st
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+/* Reset the machine check info area of the shared info page:
+ * zero si->arch.mc_info and set its entry count back to 0. */
+void x86_mcinfo_clear(struct shared_info *si)
+{
+	memset(&si->arch.mc_info, 0, sizeof(struct arch_mc_info));
+	x86_mcinfo_nentries(si) = 0;
+}
+
+/* Copy *mcinfo behind the entries already stored in si; 0 or -1 (no room) */
+int x86_mcinfo_add(struct shared_info *si, void *mcinfo)
+{
+	struct mcinfo_common *entry, *first, *slot;
+	unsigned long area_end, entry_end;
+	int skipped;
+
+	entry = (struct mcinfo_common *)mcinfo;
+	first = x86_mcinfo_first(si);
+
+	/* step over every stored entry to reach the first free slot */
+	slot = first;
+	skipped = 0;
+	while (skipped < x86_mcinfo_nentries(si)) {
+		slot = x86_mcinfo_next(slot);
+		skipped++;
+	}
+
+	/* entry may not end past first entry + sizeof(struct arch_mc_info) */
+	area_end = (unsigned long)((uint8_t *)first + sizeof(struct arch_mc_info));
+	entry_end = (unsigned long)((uint8_t *)slot + entry->size);
+	if (entry_end > area_end)
+		return -1; /* No space. Can't add entry. */
+	/* it fits: store it and account for it */
+	memcpy(slot, entry, entry->size);
+	x86_mcinfo_nentries(si)++;
+	return 0;
+}
+
+/* Dump machine check information in a format mcelog can parse.
+ * This is used only when Dom0 does not take the notification.
+ * Prints nothing if si holds no MC_TYPE_GLOBAL entry. */
+void x86_mcinfo_dump(struct shared_info *si)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+
+	/* first print the global info */
+	x86_mcinfo_lookup(mic, si, MC_TYPE_GLOBAL);
+	if (mic == NULL)
+		return;
+	mc_global = (struct mcinfo_global *)mic;
+	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+		printk(KERN_EMERG
+		       "CPU%d: Machine Check Exception: %16"PRIx64"\n",
+		       mc_global->mc_coreid, mc_global->mc_gstatus);
+	} else {
+		printk(KERN_INFO "MCE: The hardware reports a non "
+		       "fatal, correctable incident occured on "
+		       "CPU %d.\n",
+		       mc_global->mc_coreid);
+	}
+
+	/* then the bank information */
+	x86_mcinfo_lookup(mic, si, MC_TYPE_BANK); /* finds the first entry */
+	/* The for-header advances mic on every pass, so skipping a non-bank
+	 * entry with 'continue' can no longer spin on the same entry. */
+	for (; mic != NULL; mic = x86_mcinfo_next(mic)) {
+		if (mic->size == 0) /* presumably an unused slot - TODO confirm */
+			break;
+		if (mic->type != MC_TYPE_BANK)
+			continue;
+		mc_bank = (struct mcinfo_bank *)mic;
+
+		if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE)
+			printk(KERN_EMERG);
+		else
+			printk(KERN_INFO);
+
+		printk("Bank %d: %16"PRIx64,
+		       mc_bank->mc_bank,
+		       mc_bank->mc_status);
+		if (mc_bank->mc_status & MCi_STATUS_MISCV)
+			printk("[%16"PRIx64"]", mc_bank->mc_misc);
+		if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+			printk(" at %16"PRIx64, mc_bank->mc_addr);
+
+		printk("\n");
+	}
+}
diff -r a5209d79d241 -r e18773b9584c xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Tue Aug 21 14:13:19 2007 +0200
@@ -6,6 +6,10 @@ void intel_p6_mcheck_init(struct cpuinfo
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
+void x86_mcinfo_clear(struct shared_info *si);
+int x86_mcinfo_add(struct shared_info *si, void *mcinfo);
+void x86_mcinfo_dump(struct shared_info *si);
+
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
 
diff -r a5209d79d241 -r e18773b9584c xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Tue Aug 21 14:13:19 2007 +0200
@@ -13,19 +13,22 @@
 #include <xen/kernel.h>
 #include <xen/smp.h>
 #include <xen/timer.h>
-#include <xen/errno.h>
+#include <xen/event.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
 
-static int firstbank;
+static int firstbank = 0;
 static struct timer mce_timer;
 
 #define MCE_PERIOD MILLISECS(15000)
-
-static void mce_checkregs (void *info)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+
+static void mce_intel_checkregs (void *info)
 {
 	u32 low, high;
 	int i;
@@ -50,10 +53,170 @@ static void mce_checkregs (void *info)
 	}
 }
 
-static void mce_work_fn(void *data)
+static void mce_intel_work_fn(void *data)
 { 
-	on_each_cpu(mce_checkregs, NULL, 1, 1);
+	on_each_cpu(mce_intel_checkregs, NULL, 1, 1);
 	set_timer(&mce_timer, NOW() + MCE_PERIOD);
+}
+
+
+
+/* The MCi_STATUS_* #defines are needed here */
+#include "x86_mca.h"
+
+static s_time_t period = MCE_PERIOD; /* current polling interval */
+static int hw_threshold = 0; /* nonzero: MSR_K8_MC4_MISC counter is used */
+/* >0 once a CPU found an error in this poll; reset by mce_amd_work_fn() */
+static int adjust = 0;
+
+/* The polling service routine (run on each CPU by mce_amd_work_fn()):
+ * Collects information of correctable errors and notifies
+ * Dom0 via an event, or dumps it when Dom0 has not enabled VIRQ_MCA.
+ */
+void mce_amd_checkregs(void *info)
+{
+	struct shared_info *si;
+	struct vcpu  *vcpu = current;
+	struct mcinfo_global mc_global;
+	struct mcinfo_bank mc_info;
+	uint64_t status, addrv, miscv;
+	unsigned int i, event_enabled;
+	int found = 0; /* this CPU read at least one valid bank */
+
+	event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+	/* NOTE(review): each CPU clears and refills dom0's mc_info - confirm */
+	si = (struct shared_info *)dom0->shared_info; /* cast silences gcc4 */
+	memset(&mc_global, 0, sizeof(mc_global));
+	mc_global.common.type = MC_TYPE_GLOBAL;
+	mc_global.common.size = sizeof(mc_global);
+
+	mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+	mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+	mc_global.mc_vcpu_id = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* todo: on which socket is this physical core? */
+	mc_global.mc_socketid = ???;
+#endif
+	mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+	x86_mcinfo_clear(si);
+	x86_mcinfo_add(si, &mc_global);
+
+	for (i = 0; i < nr_mce_banks; i++) {
+		rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+		if (!(status & MCi_STATUS_VAL))
+			continue;
+
+		memset(&mc_info, 0, sizeof(mc_info));
+		mc_info.common.type = MC_TYPE_BANK;
+		mc_info.common.size = sizeof(mc_info);
+		mc_info.mc_bank = i;
+		mc_info.mc_status = status;
+
+		/* Increase polling frequency; never cleared by a CPU here */
+		found = adjust = 1;
+
+		addrv = 0;
+		if (status & MCi_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+		miscv = 0;
+		if (status & MCi_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+		mc_info.mc_addr = addrv;
+		mc_info.mc_misc = miscv;
+		x86_mcinfo_add(si, &mc_info);
+
+		/* clear status */
+		wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+		wmb();
+	}
+
+	if (found) {
+		/* If Dom0 enabled the VIRQ_MCA event, then ... */
+		if (event_enabled)
+			/* ... notify it. */
+			send_guest_global_virq(dom0, VIRQ_MCA);
+		else
+			/* ... or dump it */
+			x86_mcinfo_dump(si);
+	}
+}
+
+/* polling service routine invoker:
+ * Adjust poll frequency at runtime. No error means slow polling frequency,
+ * an error means higher polling frequency.
+ * It uses hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * increase polling frequency higher than normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+	adjust = 0; /* reset once per poll, before any CPU runs the handler */
+	on_each_cpu(mce_amd_checkregs, data, 1, 1);
+	if (adjust > 0) {
+		if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+			/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+			printk("MCE: polling routine found correctable error\n");
+		}
+	}
+
+	if (hw_threshold) {
+		uint64_t value;
+		uint32_t counter;
+
+		rdmsrl(MSR_K8_MC4_MISC, value);
+		/* Only the error counter field is of interest
+		 * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
+		 */
+		counter = (value & 0xFFF00000000ULL) >> 32U;
+
+		/* HW does not count *all* kinds of correctable errors.
+		 * Thus it is possible, that the polling routine finds a
+		 * correctable error even if the HW reports nothing.
+		 * However, the other way around is not possible (= BUG).
+		 */
+		if (counter > 0) {
+			/* HW reported correctable errors,
+			 * the polling routine did not find...
+			 */
+			BUG_ON(adjust == 0);
+			/* subtract 1 to not double count the error
+			 * from the polling service routine */
+			adjust += (counter - 1);
+
+			/* Restart counter */
+			/* No interrupt, reset counter value */
+			value &= ~(0x60FFF00000000ULL);
+			/* Counter enable */
+			value |= (1ULL << 51);
+			wrmsrl(MSR_K8_MC4_MISC, value);
+			wmb();
+		}
+	}
+
+	if (adjust > 0) {
+		/* Increase polling frequency */
+		adjust++; /* adjust == 1 must have an effect */
+		period /= adjust;
+	} else {
+		/* Decrease polling frequency */
+		period *= 2;
+	}
+	if (period > MCE_MAX) {
+		/* limit: Poll at least every 30s */
+		period = MCE_MAX;
+	}
+	if (period < MCE_MIN) {
+		/* limit: Poll every 2s.
+		 * When this is reached an uncorrectable error
+		 * is expected to happen, if Dom0 does nothing.
+		 */
+		period = MCE_MIN;
+	}
+
+	set_timer(&mce_timer, NOW() + period);
 }
 
 static int __init init_nonfatal_mce_checker(void)
@@ -68,19 +231,63 @@ static int __init init_nonfatal_mce_chec
 	if (!cpu_has(c, X86_FEATURE_MCA))
 		return -ENODEV;
 
-	/* Some Athlons misbehave when we frob bank 0 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 == 6)
-			firstbank = 1;
-	else
-			firstbank = 0;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
-	init_timer(&mce_timer, mce_work_fn, NULL, 0);
-	set_timer(&mce_timer, NOW() + MCE_PERIOD);
-	printk(KERN_INFO "Machine check exception polling timer started.\n");
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (c->x86 == 6) { /* K7 */
+			firstbank = 1;
+			init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+			set_timer(&mce_timer, NOW() + MCE_PERIOD);
+			break;
+		}
+
+		/* Assume we are on K8 or newer AMD CPU here */
+		if (cpu_has(c, X86_FEATURE_SVME)) {
+			uint64_t value;
+
+			/* hw threshold registers present */
+			hw_threshold = 1;
+			rdmsrl(MSR_K8_MC4_MISC, value);
+
+			if (value & (1ULL << 61)) { /* Locked bit */
+				/* Locked by BIOS. Not available for use */
+				hw_threshold = 0;
+			}
+			if (!(value & (1ULL << 63))) { /* Valid bit */
+				/* No CtrP present */
+				hw_threshold = 0;
+			} else {
+			       if (!(value & (1ULL << 62))) { /* Counter Bit */
+					/* No counter field present */
+					hw_threshold = 0;
+				}
+			}
+
+			if (hw_threshold) {
+				/* No interrupt, reset counter value */
+				value &= ~(0x60FFF00000000ULL);
+				/* Counter enable */
+				value |= (1ULL << 51);
+				wrmsrl(MSR_K8_MC4_MISC, value);
+				/* serialize */
+				wmb();
+				printk(KERN_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
+			}
+		}
+
+		init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + period);
+		break;
+
+	case X86_VENDOR_INTEL:
+		init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		break;
+	}
+
+	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r a5209d79d241 -r e18773b9584c xen/common/event_channel.c
--- a/xen/common/event_channel.c	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/common/event_channel.c	Tue Aug 21 14:13:19 2007 +0200
@@ -539,6 +539,21 @@ void evtchn_set_pending(struct vcpu *v, 
 }
 
 
+/*
+ * Report whether vcpu v has an event channel port recorded for virq.
+ * Returns 1 when v->virq_to_evtchn[virq] is non-zero, and 0 when
+ * it is zero or when v is NULL.
+ */
+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    /* no vcpu, no event channel */
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    /* a non-zero port means the virq is in use */
+    return ( v->virq_to_evtchn[virq] != 0 );
+}
+
 void send_guest_vcpu_virq(struct vcpu *v, int virq)
 {
     int port;
diff -r a5209d79d241 -r e18773b9584c xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/include/asm-x86/event.h	Tue Aug 21 14:13:19 2007 +0200
@@ -61,7 +61,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA: /* machine check virq: global */
+        return 1;
+    default: /* every other arch virq: global as well */
+        return 1;
+    }
 }
 
 #endif
diff -r a5209d79d241 -r e18773b9584c xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/include/public/arch-x86/xen.h	Tue Aug 21 14:13:19 2007 +0200
@@ -82,6 +82,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 
 #ifndef __ASSEMBLY__
 
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
 /*
  * Machine Check Architecure:
  * structs are read-only and used to report all kinds of
diff -r a5209d79d241 -r e18773b9584c xen/include/xen/event.h
--- a/xen/include/xen/event.h	Fri Aug 17 13:21:40 2007 +0200
+++ b/xen/include/xen/event.h	Tue Aug 21 14:13:19 2007 +0200
@@ -51,6 +51,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);
 
+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
 
diff -r a5209d79d241 -r e18773b9584c xen/arch/x86/cpu/mcheck/x86_mca.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Tue Aug 21 14:13:19 2007 +0200
@@ -0,0 +1,72 @@
+/*
+ * MCA implementation for AMD K7/K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/* The MCA/MCE MSRs should not be used anywhere else.
+ * They are cpu family/model specific and are only for use
+ * in terms of machine check handling.
+ * So we define them here rather in <asm/msr.h>.
+ */
+
+
+/* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_CAP_COUNT           0x00000000000000ffULL
+#define MCG_CTL_P               0x0000000000000100ULL
+/* Bits 9-63 are reserved */
+
+/* Bitfield of the MSR_IA32_MCG_STATUS register */
+#define MCG_STATUS_RIPV         0x0000000000000001ULL
+#define MCG_STATUS_EIPV         0x0000000000000002ULL
+#define MCG_STATUS_MCIP         0x0000000000000004ULL
+/* Bits 3-63 are reserved */
+
+/* Bitfield of MSR_K8_MCi_STATUS registers */
+/* MCA error code */
+#define MCi_STATUS_MCA          0x000000000000ffffULL
+/* model-specific error code */
+#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
+/* Other information */
+#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* processor context corrupt */
+#define MCi_STATUS_PCC          0x0200000000000000ULL
+/* MSR_K8_MCi_ADDR register valid */
+#define MCi_STATUS_ADDRV        0x0400000000000000ULL
+/* MSR_K8_MCi_MISC register valid */
+#define MCi_STATUS_MISCV        0x0800000000000000ULL
+/* error condition enabled */
+#define MCi_STATUS_EN           0x1000000000000000ULL
+/* uncorrected error */
+#define MCi_STATUS_UC           0x2000000000000000ULL
+/* status register overflow */
+#define MCi_STATUS_OVER         0x4000000000000000ULL
+/* valid */
+#define MCi_STATUS_VAL          0x8000000000000000ULL
+
+/* Bitfield of MCi_STATUS_OTHER field */
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
+/* uncorrectable ECC error (misspelled OTEHR name kept as an alias) */
+#define MCi_STATUS_OTHER_UC_ECC         0x0000200000000000ULL
+#define MCi_STATUS_OTEHR_UC_ECC         MCi_STATUS_OTHER_UC_ECC
+/* correctable ECC error */
+#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
+/* ECC syndrome of an ECC error */
+#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-21 13:31 [PATCH] 3/3: MCA/MCE correctable error handling Christoph Egger
@ 2007-08-21 16:02 ` Jan Beulich
  2007-08-22  9:00   ` Christoph Egger
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2007-08-21 16:02 UTC (permalink / raw)
  To: Christoph Egger; +Cc: Gavin.Maltby, xen-devel, Keir Fraser

>+		if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE)
>+			printk(KERN_EMERG);
>+		else
>+			printk(KERN_INFO);

KERN_INFO seems a gross understatement here - generally, correctable MCs are
considered indicators that within not too distant future uncorrectable MCs might
result, so this generally is a call for action (and hence shouldn't be hidden with
default log level settings).

Also, I'm not sure adjusting the polling frequency makes much sense - 30s seems
an awful lot of time to me.

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-21 16:02 ` Jan Beulich
@ 2007-08-22  9:00   ` Christoph Egger
  2007-08-22 10:09     ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Egger @ 2007-08-22  9:00 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

On Tuesday 21 August 2007 18:02:54 Jan Beulich wrote:
> >+		if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE)
> >+			printk(KERN_EMERG);
> >+		else
> >+			printk(KERN_INFO);
>
> KERN_INFO seems gross understatement here - generally, correctable MCs are
> considered indicators that within not too distant future uncorrectable MCs
> might result, so this generally is a call for action (and hence shouldn't
> be hidden with default log level settings).

Well, that is what the "old" code did. It used KERN_EMERG for fatal errors
and KERN_INFO in the polling service routine. What do you want me to suggest?

> Also, I'm not sure adjusting the polling frequency makes much sense - 30s
> seems an awful lot of time to me.

It's not clear to me what you are trying to tell me. Please explain/elaborate.

Christoph

-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-22  9:00   ` Christoph Egger
@ 2007-08-22 10:09     ` Jan Beulich
  2007-08-22 15:56       ` Christoph Egger
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2007-08-22 10:09 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Keir Fraser

>>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 11:00 >>>
>On Tuesday 21 August 2007 18:02:54 Jan Beulich wrote:
>> >+		if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE)
>> >+			printk(KERN_EMERG);
>> >+		else
>> >+			printk(KERN_INFO);
>>
>> KERN_INFO seems gross understatement here - generally, correctable MCs are
>> considered indicators that within not too distant future uncorrectable MCs
>> might result, so this generally is a call for action (and hence shouldn't
>> be hidden with default log level settings).
>
>Well, that is what the "old" code did. It used KERN_EMERG for fatal errors
>and KERN_INFO in the polling service routine. What do you want me to suggest?

This should be at least KERN_WARNING, probably even KERN_ERR (note
though that KERN_ERR and KERN_EMERG both resolve to XENLOG_ERR).

>> Also, I'm not sure adjusting the polling frequency makes much sense - 30s
>> seems an awful lot of time to me.
>
>It's not clear to me what you are trying to tell me. Please explain/elaborate.

What I'm trying to say is that I'd think this should be polled at a much higher
frequency (I'd suggest 1Hz), without adjustments. Typically, a healthy system
will not encounter problems soon after boot, but after running for perhaps a
very long time (and a system in bad condition is likely to encounter problems
right away, so wouldn't be affected by changing the polling rate). Thus, in the
general case, you'd have a comparably long latency, during which some kind
of (automated) action could already be taken to preserve data consistency.

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-22 10:09     ` Jan Beulich
@ 2007-08-22 15:56       ` Christoph Egger
  2007-08-22 16:05         ` Keir Fraser
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Egger @ 2007-08-22 15:56 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

On Wednesday 22 August 2007 12:09:41 Jan Beulich wrote:
> >>> "Christoph Egger" <Christoph.Egger@amd.com> 22.08.07 11:00 >>>
> >
> >On Tuesday 21 August 2007 18:02:54 Jan Beulich wrote:
> >> >+		if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE)
> >> >+			printk(KERN_EMERG);
> >> >+		else
> >> >+			printk(KERN_INFO);
> >>
> >> KERN_INFO seems gross understatement here - generally, correctable MCs
> >> are considered indicators that within not too distant future
> >> uncorrectable MCs might result, so this generally is a call for action
> >> (and hence shouldn't be hidden with default log level settings).
> >
> >Well, that is what the "old" code did. It used KERN_EMERG for fatal errors
> >and KERN_INFO in the polling service routine. What do you want me to
> > suggest?
>
> This should be at least KERN_WARNING, probably even KERN_ERR (note
> though that KERN_ERR and KERN_EMERG both resolve to XENLOG_ERR).

I changed it to KERN_WARNING. This made the above if block
superfluous. Thanks.
I will re-submit this patch as well.

> >> Also, I'm not sure adjusting the polling frequency makes much sense -
> >> 30s seems an awful lot of time to me.
> >
> >It's not clear to me what you are trying to tell me. Please
> > explain/elaborate.
>
> What I'm trying to say is that I'd think this should be polled at a much
> higher frequency (I'd suggest 1Hz), without adjustments. Typically, a
> healthy system will not encounter problems soon after boot, but after
> running for perhaps a very long time (and a system in bad condition is
> likely to encounter problems right away, so wouldn't be affected by
> changing the polling rate). Thus, in the general case, you'd have a
> comparably long latency, during which some kind of (automated) action could
> already be taken to preserve data consistency.

The polling routine that is in the -unstable tree (the version taken from 
Linux) runs every 15 seconds without adjustments.
1Hz causes too much system load for a healthy system IMO.
That's why I introduced the adjustments with use of hw threshold registers
to come to a compromise solution.


-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-22 15:56       ` Christoph Egger
@ 2007-08-22 16:05         ` Keir Fraser
  2007-08-22 16:10           ` Keir Fraser
  0 siblings, 1 reply; 10+ messages in thread
From: Keir Fraser @ 2007-08-22 16:05 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Jan Beulich

On 22/8/07 16:56, "Christoph Egger" <Christoph.Egger@amd.com> wrote:

>> What I'm trying to say is that I'd think this should be polled at a much
>> higher frequency (I'd suggest 1Hz), without adjustments. Typically, a
>> healthy system will not encounter problems soon after boot, but after
>> running for perhaps a very long time (and a system in bad condition is
>> likely to encounter problems right away, so wouldn't be affected by
>> changing the polling rate). Thus, in the general case, you'd have a
>> comparably long latency, during which some kind of (automated) action could
>> already be taken to preserve data consistency.
> 
> The polling routine that is in the -unstable tree (the version taken from
> Linux) runs every 15 seconds without adjustments.
> 1Hz causes too much system load for a healthy system IMO.
> That's why I introduced the adjustments with use of hw threshold registers
> to come to a compromise solution.

What's the deal here? Do correctable errors not cause an MCE, yet are still
detected via the machine-check architecture (albeit by a polling method)?

Are there going to be patches on the Linux side to pick up this MCA info?
What is Linux going to do with it, apart from log it (which Xen can already
do itself)? Or is this all Solaris-specific?

 -- Keir

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-22 16:05         ` Keir Fraser
@ 2007-08-22 16:10           ` Keir Fraser
  2007-08-23  6:57             ` Christoph Egger
  0 siblings, 1 reply; 10+ messages in thread
From: Keir Fraser @ 2007-08-22 16:10 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Jan Beulich

On 22/8/07 17:05, "Keir Fraser" <keir@xensource.com> wrote:

>> The polling routine that is in the -unstable tree (the version taken from
>> Linux) runs every 15 seconds without adjustments.
>> 1Hz causes too much system load for a healthy system IMO.
>> That's why I introduced the adjustments with use of hw threshold registers
>> to come to a compromise solution.
> 
> What's the deal here? Do correctable errors not cause an MCE, yet are still
> detected via the machine-check architecture (albeit by a polling method)?
> 
> Are there going to be patches on the Linux side to pick up this MCA info?
> What is Linux going to do with it, apart from log it (which Xen can already
> do itself)? Or is this all Solaris-specific?

Oh, and is AMD-specific code really needed in non-fatal.c? I thought the MCA
stuff was architectural now rather than vendor specific? If there are
vendor-specific extensions then they belong in the vendor's .c file.

 -- Keir

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-22 16:10           ` Keir Fraser
@ 2007-08-23  6:57             ` Christoph Egger
  2007-08-23  9:27               ` [PATCH] resend " Christoph Egger
  2007-08-23 14:07               ` [PATCH] " Keir Fraser
  0 siblings, 2 replies; 10+ messages in thread
From: Christoph Egger @ 2007-08-23  6:57 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

On Wednesday 22 August 2007 18:10:24 Keir Fraser wrote:
> On 22/8/07 17:05, "Keir Fraser" <keir@xensource.com> wrote:
> >> The polling routine that is in the -unstable tree (the version taken
> >> from Linux) runs every 15 seconds without adjustments.
> >> 1Hz causes too much system load for a healthy system IMO.
> >> That's why I introduced the adjustments with use of hw threshold
> >> registers to come to a compromise solution.
> >
> > What's the deal here? Do correctable errors not cause an MCE, yet are
> > still detected via the machine-check architecture (albeit by a polling
> > method)?

The deal here is: detect correctable errors via polling and uncorrectable
errors via MCE.
This patchset is about correctable errors.

> > Are there going to be patches on the Linux side to pick up this MCA info?
> > What is Linux going to do with it, apart from log it (which Xen can
> > already do itself)? Or is this all Solaris-specific?

The general idea is the Dom0 picks up this MCA info and a) uses
the error-handling infrastructure provided for the non-virtualized form
and b) will use hypercalls to tell xen to also report MCA to a DomU and/or
kill a DomU.
Some hw features for self-healing can only be used by Dom0 (because their
registers sit in the PCI extended config space, which Xen doesn't have access
to) and some can be used by Xen itself.

I wrote a demo driver that mainly tests that the Dom0 actually receives the
MCA info for NetBSD/Xen (Sun prefers to look into BSD licensed code).
It should be easy to port it to Linux.

> Oh, and is AMD-specific code really needed in non-fatal.c? I though the MCA
> stuff was architectural now rather than vendor specific? If there are
> vendor-specific extensions then they belong in the vendor's .c file.

AMD-specific is the use of the hw register code. Intel has some additional 
machine check MSR's containing the register set. Intel may add a structure
to patch 2/3 that make use of them. Should I move the amd polling handler to
amd.c ?

-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] resend 3/3: MCA/MCE correctable error handling
  2007-08-23  6:57             ` Christoph Egger
@ 2007-08-23  9:27               ` Christoph Egger
  2007-08-23 14:07               ` [PATCH] " Keir Fraser
  1 sibling, 0 replies; 10+ messages in thread
From: Christoph Egger @ 2007-08-23  9:27 UTC (permalink / raw)
  To: xen-devel; +Cc: Gavin.Maltby, Keir Fraser, Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 595 bytes --]


Yesterday I said I would re-send this patch. Here it is.
It incorporates feedback from Jan Beulich.

Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>




-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

[-- Attachment #2: mca_diff3_event.diff --]
[-- Type: text/x-diff, Size: 13289 bytes --]

diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Thu Aug 23 10:52:15 2007 +0200
@@ -8,11 +8,13 @@
 #include <xen/kernel.h>
 #include <xen/config.h>
 #include <xen/smp.h>
+#include <xen/errno.h>
 
 #include <asm/processor.h> 
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
@@ -89,3 +91,89 @@ static int __init mcheck_enable(char *st
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+
+
+void x86_mcinfo_clear(struct shared_info *si)
+{
+	memset(&si->arch.mc_info, 0, sizeof(struct arch_mc_info));
+	x86_mcinfo_nentries(si) = 0;
+}
+
+
+int x86_mcinfo_add(struct shared_info *si, void *mcinfo)
+{
+	int i;
+	unsigned long end1, end2;
+	struct mcinfo_common *mic, *mic_base, *mic_index;
+
+	mic = (struct mcinfo_common *)mcinfo;
+	mic_index = mic_base = x86_mcinfo_first(si);
+
+	/* go to first free entry */
+	for (i = 0; i < x86_mcinfo_nentries(si); i++) {
+		mic_index = x86_mcinfo_next(mic_index);
+	}
+
+	/* check if there is enough size */
+	end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct arch_mc_info));
+	end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
+
+	if (end1 < end2)
+		return -ENOSPC; /* No space. Can't add entry. */
+
+	/* there's enough space. add entry. */
+	memcpy(mic_index, mic, mic->size);
+	x86_mcinfo_nentries(si)++;
+
+	return 0;
+}
+
+/* Dump machine check information in a format,
+ * mcelog can parse. This is used only when
+ * Dom0 does not take the notification. */
+void x86_mcinfo_dump(struct shared_info *si)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+
+	/* first print the global info */
+	x86_mcinfo_lookup(mic, si, MC_TYPE_GLOBAL);
+	if (mic == NULL)
+		return;
+	mc_global = (struct mcinfo_global *)mic;
+	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+		printk(KERN_WARNING
+		       "CPU%d: Machine Check Exception: %16"PRIx64"\n",
+		       mc_global->mc_coreid, mc_global->mc_gstatus);
+	} else {
+		printk(KERN_WARNING "Machine Check: The hardware reports a non "
+		       "fatal, correctable incident occured on "
+		       "CPU %d.\n",
+		       mc_global->mc_coreid);
+	}
+
+	/* then the bank information */
+	x86_mcinfo_lookup(mic, si, MC_TYPE_BANK); /* finds the first entry */
+	do {
+		if (mic == NULL)
+			return;
+		if (mic->type != MC_TYPE_BANK)
+			continue;
+
+		mc_bank = (struct mcinfo_bank *)mic;
+       
+		printk(KERN_WARNING "Bank %d: %16"PRIx64,
+		       mc_bank->mc_bank,
+		       mc_bank->mc_status);
+		if (mc_bank->mc_status & MCi_STATUS_MISCV)
+			printk("[%16"PRIx64"]", mc_bank->mc_misc);
+		if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+			printk(" at %16"PRIx64, mc_bank->mc_addr);
+
+		printk("\n");
+		mic = x86_mcinfo_next(mic); /* next entry */
+	} while (mic != NULL);
+}
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Thu Aug 23 10:52:15 2007 +0200
@@ -6,6 +6,10 @@ void intel_p6_mcheck_init(struct cpuinfo
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
+void x86_mcinfo_clear(struct shared_info *si);
+int x86_mcinfo_add(struct shared_info *si, void *mcinfo);
+void x86_mcinfo_dump(struct shared_info *si);
+
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
 
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Thu Aug 23 10:52:15 2007 +0200
@@ -13,19 +13,22 @@
 #include <xen/kernel.h>
 #include <xen/smp.h>
 #include <xen/timer.h>
-#include <xen/errno.h>
+#include <xen/event.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
 
-static int firstbank;
+static int firstbank = 0;
 static struct timer mce_timer;
 
 #define MCE_PERIOD MILLISECS(15000)
-
-static void mce_checkregs (void *info)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+
+static void mce_intel_checkregs (void *info)
 {
 	u32 low, high;
 	int i;
@@ -50,10 +53,171 @@ static void mce_checkregs (void *info)
 	}
 }
 
-static void mce_work_fn(void *data)
+static void mce_intel_work_fn(void *data)
 { 
-	on_each_cpu(mce_checkregs, NULL, 1, 1);
+	on_each_cpu(mce_intel_checkregs, NULL, 1, 1);
 	set_timer(&mce_timer, NOW() + MCE_PERIOD);
+}
+
+
+
+/* The MCi_STATUS_* #defines are needed here */
+#include "x86_mca.h"
+
+static s_time_t period = MCE_PERIOD;
+static int hw_threshold = 0;
+static int adjust = 0;
+
+
+/* The polling service routine:
+ * Collects information of correctable errors and notifies
+ * Dom0 via an event.
+ */
+void mce_amd_checkregs(void *info)
+{
+	struct shared_info *si;
+	struct vcpu  *vcpu = current;
+	struct mcinfo_global mc_global;
+	struct mcinfo_bank mc_info;
+	uint64_t status, addrv, miscv;
+	unsigned int i;
+	unsigned int event_enabled;
+
+	event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+	adjust = 0;
+	si = (struct shared_info *)dom0->shared_info; /* cast silences gcc4 */
+	memset(&mc_global, 0, sizeof(mc_global));
+	mc_global.common.type = MC_TYPE_GLOBAL;
+	mc_global.common.size = sizeof(mc_global);
+
+	mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+	mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+	mc_global.mc_core_threadid = 0;
+	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* todo: on which socket is this physical core? */
+	mc_global.mc_socketid = ???;
+#endif
+	mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+	x86_mcinfo_clear(si);
+	x86_mcinfo_add(si, &mc_global);
+
+	for (i = 0; i < nr_mce_banks; i++) {
+		rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+	       
+		if (!(status & MCi_STATUS_VAL))
+			continue;
+
+		memset(&mc_info, 0, sizeof(mc_info));
+		mc_info.common.type = MC_TYPE_BANK;
+		mc_info.common.size = sizeof(mc_info);
+		mc_info.mc_bank = i;
+		mc_info.mc_status = status;
+
+		/* Increase polling frequency */
+		adjust = 1;
+
+		addrv = 0;
+		if (status & MCi_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+		miscv = 0;
+		if (status & MCi_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+		mc_info.mc_addr = addrv;
+		mc_info.mc_misc = miscv;
+		x86_mcinfo_add(si, &mc_info);
+
+		/* clear status */
+		wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+		wmb();
+	}
+
+	if (adjust > 0) {
+		/* If Dom0 enabled the VIRQ_MCA event, then ... */
+		if (event_enabled)
+			/* ... notify it. */
+			send_guest_global_virq(dom0, VIRQ_MCA);
+		else
+			/* ... or dump it */
+			x86_mcinfo_dump(si);
+	}
+}
+
+/* polling service routine invoker:
+ * Adjust poll frequency at runtime. No error means slow polling frequency,
+ * an error means higher polling frequency.
+ * It uses hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * increase polling frequency higher than normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+	on_each_cpu(mce_amd_checkregs, data, 1, 1);
+
+	if (adjust > 0) {
+		if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+			/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+			printk("MCE: polling routine found correctable error\n");
+		}
+	}
+
+	if (hw_threshold) {
+		uint64_t value;
+		uint32_t counter;
+
+		rdmsrl(MSR_K8_MC4_MISC, value);
+		/* Only the error counter field is of interest
+		 * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
+		 */
+		counter = (value & 0xFFF00000000ULL) >> 32U;
+
+		/* HW does not count *all* kinds of correctable errors.
+		 * Thus it is possible, that the polling routine finds an
+		 * correctable error even if the HW reports nothing.
+		 * However, the other way around is not possible (= BUG).
+		 */ 
+		if (counter > 0) {
+			/* HW reported correctable errors,
+			 * the polling routine did not find...
+			 */
+			BUG_ON(adjust == 0);
+			/* subtract 1 to not double count the error 
+			 * from the polling service routine */ 
+			adjust += (counter - 1);
+
+			/* Restart counter */
+			/* No interrupt, reset counter value */
+			value &= ~(0x60FFF00000000ULL);
+			/* Counter enable */
+			value |= (1ULL << 51);
+			wrmsrl(MSR_K8_MC4_MISC, value);
+			wmb();
+		}
+	}
+
+	if (adjust > 0) {
+		/* Increase polling frequency */
+		adjust++; /* adjust == 1 must have an effect */
+		period /= adjust;
+	} else {
+		/* Decrease polling frequency */
+		period *= 2;
+	}
+	if (period > MCE_MAX) {
+		/* limit: Poll at least every 30s */
+		period = MCE_MAX;
+	}
+	if (period < MCE_MIN) {
+		/* limit: Poll every 2s.
+		 * When this is reached an uncorrectable error
+		 * is expected to happen, if Dom0 does nothing.
+		 */
+		period = MCE_MIN;
+	}
+
+	set_timer(&mce_timer, NOW() + period);
 }
 
 static int __init init_nonfatal_mce_checker(void)
@@ -68,19 +232,63 @@ static int __init init_nonfatal_mce_chec
 	if (!cpu_has(c, X86_FEATURE_MCA))
 		return -ENODEV;
 
-	/* Some Athlons misbehave when we frob bank 0 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 == 6)
-			firstbank = 1;
-	else
-			firstbank = 0;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
-	init_timer(&mce_timer, mce_work_fn, NULL, 0);
-	set_timer(&mce_timer, NOW() + MCE_PERIOD);
-	printk(KERN_INFO "Machine check exception polling timer started.\n");
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (c->x86 == 6) { /* K7 */
+			firstbank = 1;
+			init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+			set_timer(&mce_timer, NOW() + MCE_PERIOD);
+			break;
+		}
+
+		/* Assume we are on K8 or newer AMD CPU here */
+		if (cpu_has(c, X86_FEATURE_SVME)) {
+			uint64_t value;
+
+			/* hw threshold registers present */
+			hw_threshold = 1;
+			rdmsrl(MSR_K8_MC4_MISC, value);
+
+			if (value & (1ULL << 61)) { /* Locked bit */
+				/* Locked by BIOS. Not available for use */
+				hw_threshold = 0;
+			}
+			if (!(value & (1ULL << 63))) { /* Valid bit */
+				/* No CtrP present */
+				hw_threshold = 0;
+			} else {
+			       if (!(value & (1ULL << 62))) { /* Counter Bit */
+					/* No counter field present */
+					hw_threshold = 0;
+				}
+			}
+
+			if (hw_threshold) {
+				/* No interrupt, reset counter value */
+				value &= ~(0x60FFF00000000ULL);
+				/* Counter enable */
+				value |= (1ULL << 51);
+				wrmsrl(MSR_K8_MC4_MISC, value);
+				/* serialize */
+				wmb();
+				printk(KERN_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
+			}
+		}
+
+		init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + period);
+		break;
+
+	case X86_VENDOR_INTEL:
+		init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		break;
+	}
+
+	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/common/event_channel.c
--- a/xen/common/event_channel.c	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/common/event_channel.c	Thu Aug 23 10:52:15 2007 +0200
@@ -539,6 +539,21 @@ void evtchn_set_pending(struct vcpu *v, 
 }
 
 
+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    int port;
+
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    port = v->virq_to_evtchn[virq];
+    if ( port == 0 )
+        return 0;
+
+    /* virq is in use */
+    return 1;
+}
+
 void send_guest_vcpu_virq(struct vcpu *v, int virq)
 {
     int port;
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/include/asm-x86/event.h	Thu Aug 23 10:52:15 2007 +0200
@@ -61,7 +61,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA:
+        return 1;
+    default:
+        return 1;
+    }
 }
 
 #endif
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/include/public/arch-x86/xen.h	Thu Aug 23 10:52:15 2007 +0200
@@ -82,6 +82,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 
 #ifndef __ASSEMBLY__
 
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
 /*
  * Machine Check Architecure:
  * structs are read-only and used to report all kinds of
diff -r 0fd5402a3730 -r e59c5e3b3d41 xen/include/xen/event.h
--- a/xen/include/xen/event.h	Thu Aug 23 10:20:43 2007 +0200
+++ b/xen/include/xen/event.h	Thu Aug 23 10:52:15 2007 +0200
@@ -51,6 +51,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);
 
+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] 3/3: MCA/MCE correctable error handling
  2007-08-23  6:57             ` Christoph Egger
  2007-08-23  9:27               ` [PATCH] resend " Christoph Egger
@ 2007-08-23 14:07               ` Keir Fraser
  1 sibling, 0 replies; 10+ messages in thread
From: Keir Fraser @ 2007-08-23 14:07 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Gavin.Maltby, Jan Beulich

On 23/8/07 07:57, "Christoph Egger" <Christoph.Egger@amd.com> wrote:

>> Oh, and is AMD-specific code really needed in non-fatal.c? I though the MCA
>> stuff was architectural now rather than vendor specific? If there are
>> vendor-specific extensions then they belong in the vendor's .c file.
> 
> AMD-specific is the use of the hw register code. Intel has some additional
> machine check MSR's containing the register set. Intel may add a structure
> to patch 2/3 that make use of them. Should I move the amd polling handler to
> amd.c ?

I think so.

 -- Keir

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2007-08-23 14:07 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-21 13:31 [PATCH] 3/3: MCA/MCE correctable error handling Christoph Egger
2007-08-21 16:02 ` Jan Beulich
2007-08-22  9:00   ` Christoph Egger
2007-08-22 10:09     ` Jan Beulich
2007-08-22 15:56       ` Christoph Egger
2007-08-22 16:05         ` Keir Fraser
2007-08-22 16:10           ` Keir Fraser
2007-08-23  6:57             ` Christoph Egger
2007-08-23  9:27               ` [PATCH] resend " Christoph Egger
2007-08-23 14:07               ` [PATCH] " Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.