All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
@ 2008-12-19  6:19 Ke, Liping
  2009-01-12 14:03 ` Christoph Egger
  0 siblings, 1 reply; 11+ messages in thread
From: Ke, Liping @ 2008-12-19  6:19 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 255 bytes --]

Hi, all

Patch 4 is the main patch for CMCI enabling in XEN. It adds the CMCI interrupt handler, new common CMCI/MCA init process,CMCI owner judge algorithm when bring_up CPUs, CPU on/offline  and  polling mechanisms, etc

Thanks & Regards,
Criping

[-- Attachment #2: clean_and_cmci.patch --]
[-- Type: application/octet-stream, Size: 33272 bytes --]

[patch 4/4]Enable CMCI for Intel CPUs -- main patch for CMCI support

This patch is the main patch for CMCI enabling in XEN.
It adds the CMCI interrupt handler, new common CMCI/MCA init process,
CMCI owner judge algorithm when bring_up CPUs, CPU on/offlines, polling
mechanisms, etc

Signed-off-by Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by Liping Ke <liping.ke@intel.com>

diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/Makefile	Sun Jan 02 00:25:07 2005 +0800
@@ -3,8 +3,7 @@
 obj-y += amd_k8.o
 obj-y += amd_f10.o
 obj-y += mce.o
+obj-y += mce_intel.o
 obj-y += non-fatal.o
-obj-y += p4.o
 obj-$(x86_32) += p5.o
-obj-$(x86_32) += p6.o
 obj-$(x86_32) += winchip.o
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/k7.c	Sun Jan 02 00:25:07 2005 +0800
@@ -14,6 +14,7 @@
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
 static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code)
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Sun Jan 02 00:25:07 2005 +0800
@@ -27,7 +27,7 @@
  * to physical cpus present in the machine.
  * The more physical cpus are available, the more entries you need.
  */
-#define MAX_MCINFO	10
+#define MAX_MCINFO	20
 
 struct mc_machine_notify {
 	struct mc_info mc;
@@ -110,6 +110,22 @@
 	}
 }
 
+/*check the existence of Machine Check*/
+int mce_available(struct cpuinfo_x86 *c)
+{
+	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+/*Make sure there are no machine check on offlined or suspended CPUs*/
+void mce_disable_cpu(void)
+{
+    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
+         return;
+    printk(KERN_DEBUG "MCE: disable mce on CPU%d\n", smp_processor_id());
+    clear_in_cr4(X86_CR4_MCE);
+}
+
+
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
@@ -135,11 +151,13 @@
 #ifndef CONFIG_X86_64
 		if (c->x86==5)
 			intel_p5_mcheck_init(c);
-		if (c->x86==6)
-			intel_p6_mcheck_init(c);
 #endif
-		if (c->x86==15)
-			intel_p4_mcheck_init(c);
+		/*If it is P6 or P4 family, including CORE 2 DUO series*/
+		if (c->x86 == 6 || c->x86==15)
+		{
+			printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
+			intel_mcheck_init(c);
+		}
 		break;
 
 #ifndef CONFIG_X86_64
@@ -181,7 +199,7 @@
 		entry = mc_data.error_idx;
 		smp_rmb();
 		next = entry + 1;
-		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+		if (cmpxchg(&mc_data.error_idx, entry, next) == entry) 
 			break;
 	}
 
@@ -231,8 +249,7 @@
 
 	*fetch_idx = mc_data.fetch_idx;
 	mc_data.fetch_idx++;
-	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
+	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);	
 	return mi;
 }
 
@@ -392,18 +409,24 @@
 	struct mcinfo_bank *mc_bank;
 
 	/* first print the global info */
+	printk(KERN_DEBUG "MCE: Dump machine check data\n");
 	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
-	if (mic == NULL)
+	if (mic == NULL) {
+		printk(XENLOG_WARNING "MCE: global info NULL on CPU%d\n", 
+				smp_processor_id());
 		return;
+	}
 	mc_global = (struct mcinfo_global *)mic;
-	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+	if (mc_global->mc_flags & 
+			(MC_FLAG_UNCORRECTABLE | MC_FLAG_RECOVERABLE)) {
 		printk(XENLOG_WARNING
-			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			"CORE%d CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			mc_global->mc_socketid, 
 			mc_global->mc_coreid, mc_global->mc_gstatus);
 	} else {
 		printk(XENLOG_WARNING "MCE: The hardware reports a non "
 			"fatal, correctable incident occured on "
-			"CPU %d.\n",
+			"CORE%d CPU %d.\n", mc_global->mc_socketid,
 			mc_global->mc_coreid);
 	}
 
@@ -413,7 +436,7 @@
 		if (mic == NULL)
 			return;
 		if (mic->type != MC_TYPE_BANK)
-			continue;
+			goto next;
 
 		mc_bank = (struct mcinfo_bank *)mic;
 	
@@ -426,6 +449,7 @@
 			printk(" at %16"PRIx64, mc_bank->mc_addr);
 
 		printk("\n");
+next:
 		mic = x86_mcinfo_next(mic); /* next entry */
 		if ((mic == NULL) || (mic->size == 0))
 			break;
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Sun Jan 02 00:25:07 2005 +0800
@@ -1,14 +1,22 @@
 #include <xen/init.h>
+#include <asm/types.h>
 #include <asm/traps.h>
+#include <asm/atomic.h>
+#include <asm/percpu.h>
+
 
 /* Init functions */
 void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
 void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
 void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
 void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
+
+
+void intel_mcheck_timer(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+void intel_mcheck_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
 /* Function pointer used in the handlers to collect additional information
@@ -19,6 +27,7 @@
 		uint16_t bank, uint64_t status);
 
 
+int mce_available(struct cpuinfo_x86 *c);
 /* Helper functions used for collecting error telemetry */
 struct mc_info *x86_mcinfo_getptr(void);
 void x86_mcinfo_clear(struct mc_info *mi);
@@ -26,6 +35,3 @@
 void x86_mcinfo_dump(struct mc_info *mi);
 void mc_panic(char *s);
 
-/* Global variables */
-extern int mce_disabled;
-extern unsigned int nr_mce_banks;
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce_intel.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun Jan 02 00:25:07 2005 +0800
@@ -0,0 +1,678 @@
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/irq.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <asm/processor.h> 
+#include <asm/system.h>
+#include <asm/msr.h>
+#include "mce.h"
+#include "x86_mca.h"
+
+DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
+
+static int nr_intel_ext_msrs = 0;
+static int cmci_support = 0;
+extern int firstbank;
+
+static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
+{
+    if (nr_intel_ext_msrs == 0)
+        return;
+
+	/*this function will called when CAP(9).MCG_EXT_P = 1*/
+    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
+    mc_ext->common.type = MC_TYPE_EXTENDED;
+    mc_ext->common.size = sizeof(mc_ext);
+    mc_ext->mc_msrs = 10;
+
+    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
+    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
+    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
+    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
+    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
+    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
+
+    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
+    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
+    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
+    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
+    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
+    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
+
+    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
+    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
+    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
+    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
+    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
+    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
+    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
+}
+
+#ifdef CONFIG_X86_MCE_THERMAL
+static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
+{	
+    printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
+                smp_processor_id());
+    add_taint(TAINT_MACHINE_CHECK);
+}
+
+/* P4/Xeon Thermal transition interrupt handler */
+static void intel_thermal_interrupt(struct cpu_user_regs *regs)
+{
+    u32 l, h;
+    unsigned int cpu = smp_processor_id();
+    static s_time_t next[NR_CPUS];
+
+    ack_APIC_irq();
+    if (NOW() < next[cpu])
+        return;
+
+    next[cpu] = NOW() + MILLISECS(5000);
+    rdmsr(MSR_IA32_THERM_STATUS, l, h);
+    if (l & 0x1) {
+        printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
+        printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
+                cpu);
+        add_taint(TAINT_MACHINE_CHECK);
+    } else {
+        printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+    }
+}
+
+/* Thermal interrupt handler for this CPU setup */
+static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) 
+        = unexpected_thermal_interrupt;
+
+fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
+{
+    irq_enter();
+    vendor_thermal_interrupt(regs);
+    irq_exit();
+}
+
+/* P4/Xeon Thermal regulation detect and init */
+static void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+    u32 l, h;
+    int tm2 = 0;
+    unsigned int cpu = smp_processor_id();
+
+    /* Thermal monitoring */
+    if (!cpu_has(c, X86_FEATURE_ACPI))
+        return;	/* -ENODEV */
+
+    /* Clock modulation */
+    if (!cpu_has(c, X86_FEATURE_ACC))
+        return;	/* -ENODEV */
+
+    /* first check if its enabled already, in which case there might
+     * be some SMM goo which handles it, so we can't even put a handler
+     * since it might be delivered via SMI already -zwanem.
+     */
+    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+    h = apic_read(APIC_LVTTHMR);
+    if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
+        printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
+        return; /* -EBUSY */
+    }
+
+    if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+        tm2 = 1;
+
+	/* check whether a vector already exists, temporarily masked? */
+    if (h & APIC_VECTOR_MASK) {
+        printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
+                 cpu, (h & APIC_VECTOR_MASK));
+        return; /* -EBUSY */
+    }
+
+    /* The temperature transition interrupt handler setup */
+    h = THERMAL_APIC_VECTOR;		/* our delivery vector */
+    h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
+    apic_write_around(APIC_LVTTHMR, h);
+
+    rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
+    wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+
+    /* ok we're good to go... */
+    vendor_thermal_interrupt = intel_thermal_interrupt;
+
+    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+    wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+    l = apic_read (APIC_LVTTHMR);
+    apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+    printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 
+            cpu, tm2 ? "TM2" : "TM1");
+    return;
+}
+#endif /* CONFIG_X86_MCE_THERMAL */
+
+/* machine_check_poll might be called by following types:
+ * 1. called when do mcheck_init.
+ * 2. called in cmci interrupt handler
+ * 3. called in polling handler
+ * It will generate a new mc_info item if found CE/UC errors. DOM0 is the 
+ * consumer.
+*/
+static int machine_check_poll(struct mc_info *mi, int calltype)
+{
+    int exceptions = (read_cr4() & X86_CR4_MCE);
+    int i, nr_unit = 0, uc = 0, pcc = 0;
+    uint64_t status, addr;
+    struct mcinfo_global mcg;
+    struct mcinfo_extended mce;
+    unsigned int cpu;
+    struct domain *d;
+
+    cpu = smp_processor_id();
+
+    if (!mi) {
+        printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
+        return 0;
+    }
+    x86_mcinfo_clear(mi);
+
+    memset(&mcg, 0, sizeof(mcg));
+    mcg.common.type = MC_TYPE_GLOBAL;
+    mcg.common.size = sizeof(mcg);
+    /*If called from cpu-reset check, don't need to fill them.
+     *If called from cmci context, we'll try to fill domid by memory addr
+    */
+    mcg.mc_domid = -1;
+    mcg.mc_vcpuid = -1;
+    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
+        mcg.mc_flags = MC_FLAG_POLLED;
+    else if (calltype == MC_FLAG_CMCI)
+        mcg.mc_flags = MC_FLAG_CMCI;
+    mcg.mc_socketid = phys_proc_id[cpu];
+    mcg.mc_coreid = cpu_core_id[cpu];
+    mcg.mc_apicid = cpu_physical_id(cpu);
+    mcg.mc_core_threadid = mcg.mc_apicid & ( 1 << (smp_num_siblings - 1)); 
+    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
+
+    for ( i = 0; i < nr_mce_banks; i++ ) {
+        struct mcinfo_bank mcb;
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+        if (! (status & MCi_STATUS_VAL) )
+            continue;
+        /*
+         * Uncorrected events are handled by the exception
+         * handler when it is enabled. But when the exception
+         * is disabled such as when mcheck_init, log everything.
+         */
+        if ((status & MCi_STATUS_UC) && exceptions)
+            continue;
+
+        if (status & MCi_STATUS_UC)
+            uc = 1;
+        if (status & MCi_STATUS_PCC)
+            pcc = 1;
+
+        memset(&mcb, 0, sizeof(mcb));
+        mcb.common.type = MC_TYPE_BANK;
+        mcb.common.size = sizeof(mcb);
+        mcb.mc_bank = i;
+        mcb.mc_status = status;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+            d = maddr_get_owner(addr);
+            if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
+                mcb.mc_domid = d->domain_id;
+        }
+        if (cmci_support)
+            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+        if (calltype == MC_FLAG_CMCI)
+            rdtscll(mcb.mc_tsc);
+        x86_mcinfo_add(mi, &mcb);
+        nr_unit++;
+        add_taint(TAINT_MACHINE_CHECK);
+        /*Clear state for this bank */
+        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
+        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%lx]\n", 
+                i, cpu, status);
+        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
+                "thread[%d]\n", cpu, mcg.mc_socketid, 
+                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
+ 
+    }
+    /*if pcc = 1, uc must be 1*/
+    if (pcc)
+        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+    else if (uc)
+        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+    else /*correctable*/
+        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+
+    if (nr_unit && nr_intel_ext_msrs && 
+                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
+        intel_get_extended_msrs(&mce);
+        x86_mcinfo_add(mi, &mce);
+    }
+    if (nr_unit) 
+        x86_mcinfo_add(mi, &mcg);
+    /*Clear global state*/
+    return nr_unit;
+}
+
+static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+{
+    /* MACHINE CHECK Error handler will be sent in another patch,
+     * simply copy old solutions here. This code will be replaced
+     * by upcoming machine check patches
+     */
+
+    int recover=1;
+    u32 alow, ahigh, high, low;
+    u32 mcgstl, mcgsth;
+    int i;
+   
+    rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+    if (mcgstl & (1<<0))	/* Recoverable ? */
+    	recover=0;
+    
+    printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+    	smp_processor_id(), mcgsth, mcgstl);
+    
+    for (i=0; i<nr_mce_banks; i++) {
+    	rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
+    	if (high & (1<<31)) {
+    		if (high & (1<<29))
+    			recover |= 1;
+    		if (high & (1<<25))
+    			recover |= 2;
+    		printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
+    		high &= ~(1<<31);
+    		if (high & (1<<27)) {
+    			rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+    			printk ("[%08x%08x]", ahigh, alow);
+    		}
+    		if (high & (1<<26)) {
+    			rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
+    			printk (" at %08x%08x", ahigh, alow);
+    		}
+    		printk ("\n");
+    	}
+    }
+    
+    if (recover & 2)
+    	mc_panic ("CPU context corrupt");
+    if (recover & 1)
+    	mc_panic ("Unable to continue");
+    
+    printk(KERN_EMERG "Attempting to continue.\n");
+    /* 
+     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+     * recoverable/continuable.This will allow BIOS to look at the MSRs
+     * for errors if the OS could not log the error.
+     */
+    for (i=0; i<nr_mce_banks; i++) {
+    	u32 msr;
+    	msr = MSR_IA32_MC0_STATUS+i*4;
+    	rdmsr (msr, low, high);
+    	if (high&(1<<31)) {
+    		/* Clear it */
+    		wrmsr(msr, 0UL, 0UL);
+    		/* Serialize */
+    		wmb();
+    		add_taint(TAINT_MACHINE_CHECK);
+    	}
+    }
+    mcgstl &= ~(1<<2);
+    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+}
+
+extern void (*cpu_down_handler)(int down_cpu);
+extern void (*cpu_down_rollback_handler)(int down_cpu);
+extern void mce_disable_cpu(void);
+static bool_t cmci_clear_lock = 0;
+static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
+
+/*
+ * Discover bank sharing using the algorithm recommended in the SDM.
+ */
+static int do_cmci_discover(int i)
+{
+    unsigned msr = MSR_IA32_MC0_CTL2 + i;
+    u64 val;
+
+    rdmsrl(msr, val);
+    /* Some other CPU already owns this bank. */
+    if (val & CMCI_EN) {
+    	clear_bit(i, __get_cpu_var(mce_banks_owned));
+    	goto out;
+    }
+    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
+    rdmsrl(msr, val);
+
+    if (!(val & CMCI_EN)) {
+     /*
+      * This bank does not support CMCI. The polling
+      * timer has to handle it. 
+      */
+    	set_bit(i, __get_cpu_var(no_cmci_banks));
+    	return 0;
+    }
+    set_bit(i, __get_cpu_var(mce_banks_owned));
+out:
+    clear_bit(i, __get_cpu_var(no_cmci_banks));
+    return 1;
+}
+
+void cmci_discover(void)
+{
+    int i;
+
+    printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
+    spin_lock(&cmci_discover_lock);
+    for (i = 0; i < nr_mce_banks; i++) {
+        /*If the cpu is the bank owner, need not re-discover*/
+        if (test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        do_cmci_discover(i);
+    }
+    spin_unlock(&cmci_discover_lock);
+    printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
+            smp_processor_id(), 
+            *((unsigned long *)__get_cpu_var(mce_banks_owned)), 
+            *((unsigned long *)__get_cpu_var(no_cmci_banks)));
+}
+
+/*
+ * Define an owner for each bank. Banks can be shared between CPUs
+ * and to avoid reporting events multiple times always set up one
+ * CPU as owner. 
+ *
+ * The assignment has to be redone when CPUs go offline and
+ * any of the owners goes away. Also pollers run in parallel so we
+ * have to be careful to update the banks in a way that doesn't
+ * lose or duplicate events.
+ */
+
+static void mce_set_owner(void)
+{
+
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    cmci_discover();
+}
+
+static void clear_cmci(void)
+{
+    int i;
+
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    printk(KERN_DEBUG "CMCI: clear_cmci support on CPU%d\n", 
+            smp_processor_id());
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        unsigned msr = MSR_IA32_MC0_CTL2 + i;
+        u64 val;
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        rdmsrl(msr, val);
+        if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
+            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
+        clear_bit(i, __get_cpu_var(mce_banks_owned));
+    }
+}
+
+/*we need to re-set cmci owners when cpu_down fail or cpu_up*/
+static void cmci_reenable_cpu(void *h)
+{
+    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
+         return;
+    printk(KERN_DEBUG "CMCI: reenable mce on CPU%d\n", smp_processor_id());
+    mce_set_owner();
+    set_in_cr4(X86_CR4_MCE);
+}
+
+/* When take cpu_down, we need to execute the impacted cmci_owner judge algorithm 
+ * First, we need to clear the ownership on the dead CPU
+ * Then,  other CPUs will check whether to take the bank's ownership from down_cpu
+ * CPU0 need not and "never" execute this path
+*/
+void  __cpu_clear_cmci( int down_cpu)
+{
+    int cpu = smp_processor_id();
+
+    if (!cmci_support && mce_disabled == 1)
+        return;
+
+    if (cpu == 0) {
+        printk(KERN_DEBUG "CMCI: CPU0 need not be cleared\n");
+        return;
+    }
+
+    local_irq_disable();
+    if (cpu == down_cpu){
+        mce_disable_cpu();
+        clear_cmci();
+        wmb();
+        test_and_set_bool(cmci_clear_lock);
+        return;
+    }
+    while (!cmci_clear_lock)
+        cpu_relax();
+    if (cpu != down_cpu)
+        mce_set_owner();
+
+    test_and_clear_bool(cmci_clear_lock);
+    local_irq_enable();
+
+}
+
+void  __cpu_clear_cmci_rollback( int down_cpu)
+{
+    cpumask_t down_map;
+    if (!cmci_support || mce_disabled == 1) 
+        return;
+
+    cpus_clear(down_map);
+    cpu_set(down_cpu, down_map);
+    printk(KERN_ERR "CMCI: cpu_down fail. "
+        "Reenable cmci on CPU%d\n", down_cpu);
+    on_selected_cpus(down_map, cmci_reenable_cpu, NULL, 1, 1);
+}
+
+static void intel_init_cmci(struct cpuinfo_x86 *c)
+{
+    u32 l, apic;
+    int cpu = smp_processor_id();
+
+    if (!mce_available(c) || !cmci_support) {
+        printk(KERN_DEBUG "CMCI: CPU%d has no CMCI support\n", cpu);
+        return;
+    }
+
+    apic = apic_read(APIC_CMCI);
+    if ( apic & APIC_VECTOR_MASK )
+    {
+        printk(KERN_WARNING "CPU%d CMCI LVT vector (%#x) already installed\n",
+            cpu, ( apic & APIC_VECTOR_MASK ));
+        return;
+    }
+
+    apic = CMCI_APIC_VECTOR;
+    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+    apic_write_around(APIC_CMCI, apic);
+
+	/*now clear mask flag*/
+    l = apic_read(APIC_CMCI);
+    apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
+    cpu_down_handler =  __cpu_clear_cmci;
+    cpu_down_rollback_handler = __cpu_clear_cmci_rollback; 
+}
+
+fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
+{
+    int nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+    int cpu = smp_processor_id();
+
+    ack_APIC_irq();
+    irq_enter();
+    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
+    nr_unit = machine_check_poll(mi, MC_FLAG_CMCI);
+    if (nr_unit) {
+        x86_mcinfo_dump(mi);
+        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+            send_guest_global_virq(dom0, VIRQ_MCA);
+    }
+    irq_exit();
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+
+#ifdef CONFIG_X86_MCE_THERMAL
+    intel_init_thermal(c);
+#endif
+    intel_init_cmci(c);
+}
+
+static void mce_cap_init(struct cpuinfo_x86 *c)
+{
+    u32 l, h;
+
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    if ((l & MCG_CMCI_P) && cpu_has_apic)
+        cmci_support = 1;
+
+    nr_mce_banks = l & 0xff;
+    if (nr_mce_banks > MAX_NR_BANKS)
+        printk(KERN_WARNING "MCE: exceed max mce banks\n");
+    if (l & MCG_EXT_P)
+    {
+        nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
+        printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
+            smp_processor_id(), nr_intel_ext_msrs);
+    }
+    /* for most of p6 family, bank 0 is an alias bios MSR.
+     * But after model>1a, bank 0 is available*/
+    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
+            && c->x86_model < 0x1A)
+        firstbank = 1;
+    else
+        firstbank = 0;
+}
+
+static void mce_init(void)
+{
+    u32 l, h;
+    int i, nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+    clear_in_cr4(X86_CR4_MCE);
+    /* log the machine checks left over from the previous reset.
+     * This also clears all registers*/
+
+    nr_unit = machine_check_poll(mi, MC_FLAG_RESET);
+    /*in the boot up stage, not expect inject to DOM0, but go print out
+    */
+    if (nr_unit > 0)
+        x86_mcinfo_dump(mi);
+
+    set_in_cr4(X86_CR4_MCE);
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    if (l & MCG_CTL_P)	/* Control register present ? */
+        wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+    for (i = firstbank; i < nr_mce_banks; i++)
+    {
+        /*Some banks are shared across cores, use MCi_CTRL to judge whether
+         * this bank has been initialized by other cores already.*/
+        rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
+        if (!l & !h)
+        {
+            /*if ctl is 0, this bank is never initialized*/
+            printk(KERN_DEBUG "mce_init: init bank%d\n", i);
+            wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
+            wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
+       }
+    }
+    if (firstbank) /*if cmci enabled, firstbank = 0*/
+        wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+}
+
+/*p4/p6 faimily has similar MCA initialization process*/
+void intel_mcheck_init(struct cpuinfo_x86 *c)
+{
+	
+	mce_cap_init(c);
+	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+		smp_processor_id());
+	/* machine check is available */
+	machine_check_vector = intel_machine_check;
+	mce_init();
+	mce_intel_feature_init(c);
+	mce_set_owner();
+}
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll faster. When the poller finds no more 
+ * errors, poll slower
+*/
+static struct timer mce_timer;
+
+#define MCE_PERIOD 4000
+#define MCE_MIN    2000
+#define MCE_MAX    32000
+
+static u64 period = MCE_PERIOD;
+static int adjust = 0;
+
+static void mce_intel_checkregs(void *info)
+{
+    int nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+
+    if( !mce_available(&current_cpu_data))
+        return;
+    nr_unit = machine_check_poll(mi, MC_FLAG_POLLED);
+    if (nr_unit)
+    {
+        x86_mcinfo_dump(mi);
+        adjust++;
+        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+            send_guest_global_virq(dom0, VIRQ_MCA);
+    }
+}
+
+static void mce_intel_work_fn(void *data)
+{
+    on_each_cpu(mce_intel_checkregs, data, 1, 1);
+    if (adjust) {
+        period = period / (adjust + 1);
+        printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval to %ld",
+            period);
+    }
+    else {
+        period *= 2;
+    }
+    if (period > MCE_MAX) 
+        period = MCE_MAX;
+    if (period < MCE_MIN)
+        period = MCE_MIN;
+    set_timer(&mce_timer, NOW() + MILLISECS(period));
+    adjust = 0;
+}
+
+void intel_mcheck_timer(struct cpuinfo_x86 *c)
+{
+    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
+    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
+}
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Sun Jan 02 00:25:07 2005 +0800
@@ -19,8 +19,8 @@
 #include <asm/msr.h>
 
 #include "mce.h"
-
-static int firstbank;
+#include "x86_mca.h"
+int firstbank = 0;
 static struct timer mce_timer;
 
 #define MCE_PERIOD MILLISECS(15000)
@@ -61,13 +61,8 @@
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
 	/* Check for MCE support */
-	if (!cpu_has(c, X86_FEATURE_MCE))
+	if (!mce_available(c))
 		return -ENODEV;
-
-	/* Check for PPro style MCA */
-	if (!cpu_has(c, X86_FEATURE_MCA))
-		return -ENODEV;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
@@ -85,12 +80,20 @@
 		break;
 
 	case X86_VENDOR_INTEL:
-		init_timer(&mce_timer, mce_work_fn, NULL, 0);
-		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		/* p5 family is different. P4/P6 and latest CPUs shares the
+		 * same polling methods
+		*/
+		if ( c->x86 != 5 )
+		{
+			/* some CPUs or banks don't support cmci, we need to 
+			 * enable this feature anyway
+			 */
+			intel_mcheck_timer(c);
+		}
 		break;
 	}
 
-	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
+	printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun Jan 02 00:25:07 2005 +0800
@@ -28,7 +28,10 @@
 /* Bitfield of the MSR_IA32_MCG_CAP register */
 #define MCG_CAP_COUNT           0x00000000000000ffULL
 #define MCG_CTL_P               0x0000000000000100ULL
-/* Bits 9-63 are reserved */
+#define MCG_EXT_P		(1UL<<9)
+#define MCG_EXT_CNT		(16)
+#define MCG_CMCI_P		(1UL<<10)
+/* Other bits are reserved */
 
 /* Bitfield of the MSR_IA32_MCG_STATUS register */
 #define MCG_STATUS_RIPV         0x0000000000000001ULL
@@ -70,3 +73,17 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/*Intel Specific bitfield*/
+#define CMCI_THRESHOLD			0x2
+
+
+#define MAX_NR_BANKS 128
+
+typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
+
+/* Global variables */
+extern int mce_disabled;
+extern unsigned int nr_mce_banks;
+extern int firstbank;
+
diff -r a2069e8a3055 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/smpboot.c	Sun Jan 02 00:25:07 2005 +0800
@@ -1237,11 +1237,24 @@
 }
 
 extern void fixup_irqs(cpumask_t map);
-int __cpu_disable(void)
+
+/*
+ * Called when offline cpu. We need to process some new
+ * feature such as CMCI owner change in latest Intel
+ * CPU families
+*/
+void (*cpu_down_handler)(int down_cpu) = NULL;
+void (*cpu_down_rollback_handler)(int down_cpu) = NULL;
+
+
+int __cpu_disable(int down_cpu)
 {
 	cpumask_t map = cpu_online_map;
 	int cpu = smp_processor_id();
 
+	/*Only down_cpu need to execute this function*/
+	if (cpu != down_cpu)
+		return 0;
 	/*
 	 * Perhaps use cpufreq to drop frequency, but that could go
 	 * into generic code.
@@ -1293,10 +1306,14 @@
 	}
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
+static int take_cpu_down(void *down_cpu)
+{
 
-static int take_cpu_down(void *unused)
-{
-    return __cpu_disable();
+    if (cpu_down_handler)
+        cpu_down_handler(*(int *)down_cpu);
+    wmb();
+
+    return __cpu_disable(*(int *)down_cpu);
 }
 
 int cpu_down(unsigned int cpu)
@@ -1322,7 +1339,7 @@
 
 	printk("Prepare to bring CPU%d down...\n", cpu);
 
-	err = stop_machine_run(take_cpu_down, NULL, cpu);
+	err = stop_machine_run(take_cpu_down, &cpu, cpu_online_map);
 	if ( err < 0 )
 		goto out;
 
@@ -1333,6 +1350,10 @@
 		err = -EBUSY;
 	}
 out:
+	/*if cpu_offline failed, re-check cmci_owner*/
+
+	if ( err < 0 && cpu_down_rollback_handler) 
+		cpu_down_rollback_handler(cpu); 
 	spin_unlock(&cpu_add_remove_lock);
 	return err;
 }
diff -r a2069e8a3055 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/asm-x86/msr-index.h	Sun Jan 02 00:25:07 2005 +0800
@@ -92,8 +92,10 @@
 #define MSR_IA32_MC0_STATUS		0x00000401
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
+#define MSR_IA32_MC0_CTL2		0x00000280
+#define CMCI_EN 			(1UL<<30)
+#define CMCI_THRESHOLD_MASK		0x7FFF
 
-#define MSR_IA32_MC1_CTL		0x00000404
 #define MSR_IA32_MC1_STATUS		0x00000405
 #define MSR_IA32_MC1_ADDR		0x00000406
 #define MSR_IA32_MC1_MISC		0x00000407
diff -r a2069e8a3055 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/asm-x86/smp.h	Sun Jan 02 00:25:07 2005 +0800
@@ -101,7 +101,7 @@
 
 #endif
 
-extern int __cpu_disable(void);
+extern int __cpu_disable(int down_cpu);
 extern void __cpu_die(unsigned int cpu);
 #endif /* !__ASSEMBLY__ */
 
diff -r a2069e8a3055 xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/public/arch-x86/xen-mca.h	Sun Jan 02 00:25:07 2005 +0800
@@ -106,7 +106,10 @@
 
 #define MC_FLAG_CORRECTABLE     (1 << 0)
 #define MC_FLAG_UNCORRECTABLE   (1 << 1)
-
+#define MC_FLAG_RECOVERABLE	(1 << 2)
+#define MC_FLAG_POLLED		(1 << 3)
+#define MC_FLAG_RESET		(1 << 4)
+#define MC_FLAG_CMCI		(1 << 5)
 /* contains global x86 mc information */
 struct mcinfo_global {
     struct mcinfo_common common;
@@ -115,6 +118,7 @@
     uint16_t mc_domid;
     uint32_t mc_socketid; /* physical socket of the physical core */
     uint16_t mc_coreid; /* physical impacted core */
+    uint8_t  mc_apicid;
     uint16_t mc_core_threadid; /* core thread of physical core */
     uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
     uint64_t mc_gstatus; /* global status */
@@ -132,6 +136,8 @@
     uint64_t mc_addr;   /* bank address, only valid
                          * if addr bit is set in mc_status */
     uint64_t mc_misc;
+    uint64_t mc_ctrl2;
+    uint64_t mc_tsc;
 };
 
 
@@ -150,7 +156,12 @@
      * multiple times. */
 
     uint32_t mc_msrs; /* Number of msr with valid values. */
-    struct mcinfo_msr mc_msr[5];
+    /*
+     * Currently Intel extended MSR (32/64) including all gp registers
+     * and E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP, E(R)MISC, only 10
+     * of them might be useful. So expend this array to 10.
+    */
+    struct mcinfo_msr mc_msr[10];
 };
 
 #define MCINFO_HYPERCALLSIZE	1024

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-01-14  4:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-19  6:19 [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs Ke, Liping
2009-01-12 14:03 ` Christoph Egger
2009-01-13  2:12   ` Ke, Liping
2009-01-13 11:21     ` Christoph Egger
2009-01-13 16:48       ` Frank Van Der Linden
2009-01-13 17:11         ` Christoph Egger
2009-01-13 17:29           ` Frank van der Linden
2009-01-14  2:11             ` Jiang, Yunhong
2009-01-14  1:35       ` Ke, Liping
2009-01-14  2:58       ` Jiang, Yunhong
2009-01-14  4:13         ` Frank van der Linden

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.