[patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs

All of lore.kernel.org
 help / color / mirror / Atom feed

* [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
@ 2008-12-19  6:19 Ke, Liping
  2009-01-12 14:03 ` Christoph Egger
  0 siblings, 1 reply; 11+ messages in thread
From: Ke, Liping @ 2008-12-19  6:19 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 255 bytes --]

Hi, all

Patch 4 is the main patch for CMCI enabling in XEN. It adds the CMCI interrupt handler, new common CMCI/MCA init process,CMCI owner judge algorithm when bring_up CPUs, CPU on/offline  and  polling mechanisms, etc

Thanks & Regards,
Criping

[-- Attachment #2: clean_and_cmci.patch --]
[-- Type: application/octet-stream, Size: 33272 bytes --]

[patch 4/4]Enable CMCI for Intel CPUs -- main patch for CMCI support

This patch is the main patch for CMCI enabling in XEN.
It adds the CMCI interrupt handler, new common CMCI/MCA init process,
CMCI owner judge algorithm when bring_up CPUs, CPU on/offlines, polling
mechanisms, etc

Signed-off-by Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by Liping Ke <liping.ke@intel.com>

diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/Makefile	Sun Jan 02 00:25:07 2005 +0800
@@ -3,8 +3,7 @@
 obj-y += amd_k8.o
 obj-y += amd_f10.o
 obj-y += mce.o
+obj-y += mce_intel.o
 obj-y += non-fatal.o
-obj-y += p4.o
 obj-$(x86_32) += p5.o
-obj-$(x86_32) += p6.o
 obj-$(x86_32) += winchip.o
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/k7.c	Sun Jan 02 00:25:07 2005 +0800
@@ -14,6 +14,7 @@
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
 static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code)
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Sun Jan 02 00:25:07 2005 +0800
@@ -27,7 +27,7 @@
  * to physical cpus present in the machine.
  * The more physical cpus are available, the more entries you need.
  */
-#define MAX_MCINFO	10
+#define MAX_MCINFO	20
 
 struct mc_machine_notify {
 	struct mc_info mc;
@@ -110,6 +110,22 @@
 	}
 }
 
+/*check the existence of Machine Check*/
+int mce_available(struct cpuinfo_x86 *c)
+{
+	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+/*Make sure there are no machine check on offlined or suspended CPUs*/
+void mce_disable_cpu(void)
+{
+    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
+         return;
+    printk(KERN_DEBUG "MCE: disable mce on CPU%d\n", smp_processor_id());
+    clear_in_cr4(X86_CR4_MCE);
+}
+
+
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
@@ -135,11 +151,13 @@
 #ifndef CONFIG_X86_64
 		if (c->x86==5)
 			intel_p5_mcheck_init(c);
-		if (c->x86==6)
-			intel_p6_mcheck_init(c);
 #endif
-		if (c->x86==15)
-			intel_p4_mcheck_init(c);
+		/*If it is P6 or P4 family, including CORE 2 DUO series*/
+		if (c->x86 == 6 || c->x86==15)
+		{
+			printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
+			intel_mcheck_init(c);
+		}
 		break;
 
 #ifndef CONFIG_X86_64
@@ -181,7 +199,7 @@
 		entry = mc_data.error_idx;
 		smp_rmb();
 		next = entry + 1;
-		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+		if (cmpxchg(&mc_data.error_idx, entry, next) == entry) 
 			break;
 	}
 
@@ -231,8 +249,7 @@
 
 	*fetch_idx = mc_data.fetch_idx;
 	mc_data.fetch_idx++;
-	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
+	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);	
 	return mi;
 }
 
@@ -392,18 +409,24 @@
 	struct mcinfo_bank *mc_bank;
 
 	/* first print the global info */
+	printk(KERN_DEBUG "MCE: Dump machine check data\n");
 	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
-	if (mic == NULL)
+	if (mic == NULL) {
+		printk(XENLOG_WARNING "MCE: global info NULL on CPU%d\n", 
+				smp_processor_id());
 		return;
+	}
 	mc_global = (struct mcinfo_global *)mic;
-	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+	if (mc_global->mc_flags & 
+			(MC_FLAG_UNCORRECTABLE | MC_FLAG_RECOVERABLE)) {
 		printk(XENLOG_WARNING
-			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			"CORE%d CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			mc_global->mc_socketid, 
 			mc_global->mc_coreid, mc_global->mc_gstatus);
 	} else {
 		printk(XENLOG_WARNING "MCE: The hardware reports a non "
 			"fatal, correctable incident occured on "
-			"CPU %d.\n",
+			"CORE%d CPU %d.\n", mc_global->mc_socketid,
 			mc_global->mc_coreid);
 	}
 
@@ -413,7 +436,7 @@
 		if (mic == NULL)
 			return;
 		if (mic->type != MC_TYPE_BANK)
-			continue;
+			goto next;
 
 		mc_bank = (struct mcinfo_bank *)mic;
 	
@@ -426,6 +449,7 @@
 			printk(" at %16"PRIx64, mc_bank->mc_addr);
 
 		printk("\n");
+next:
 		mic = x86_mcinfo_next(mic); /* next entry */
 		if ((mic == NULL) || (mic->size == 0))
 			break;
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Sun Jan 02 00:25:07 2005 +0800
@@ -1,14 +1,22 @@
 #include <xen/init.h>
+#include <asm/types.h>
 #include <asm/traps.h>
+#include <asm/atomic.h>
+#include <asm/percpu.h>
+
 
 /* Init functions */
 void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
 void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
 void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
 void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
+
+
+void intel_mcheck_timer(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+void intel_mcheck_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
 /* Function pointer used in the handlers to collect additional information
@@ -19,6 +27,7 @@
 		uint16_t bank, uint64_t status);
 
 
+int mce_available(struct cpuinfo_x86 *c);
 /* Helper functions used for collecting error telemetry */
 struct mc_info *x86_mcinfo_getptr(void);
 void x86_mcinfo_clear(struct mc_info *mi);
@@ -26,6 +35,3 @@
 void x86_mcinfo_dump(struct mc_info *mi);
 void mc_panic(char *s);
 
-/* Global variables */
-extern int mce_disabled;
-extern unsigned int nr_mce_banks;
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce_intel.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun Jan 02 00:25:07 2005 +0800
@@ -0,0 +1,678 @@
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/irq.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <asm/processor.h> 
+#include <asm/system.h>
+#include <asm/msr.h>
+#include "mce.h"
+#include "x86_mca.h"
+
+DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
+
+static int nr_intel_ext_msrs = 0;
+static int cmci_support = 0;
+extern int firstbank;
+
+static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
+{
+    if (nr_intel_ext_msrs == 0)
+        return;
+
+	/*this function will called when CAP(9).MCG_EXT_P = 1*/
+    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
+    mc_ext->common.type = MC_TYPE_EXTENDED;
+    mc_ext->common.size = sizeof(mc_ext);
+    mc_ext->mc_msrs = 10;
+
+    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
+    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
+    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
+    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
+    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
+    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
+
+    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
+    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
+    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
+    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
+    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
+    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
+
+    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
+    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
+    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
+    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
+    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
+    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
+    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
+}
+
+#ifdef CONFIG_X86_MCE_THERMAL
+static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
+{	
+    printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
+                smp_processor_id());
+    add_taint(TAINT_MACHINE_CHECK);
+}
+
+/* P4/Xeon Thermal transition interrupt handler */
+static void intel_thermal_interrupt(struct cpu_user_regs *regs)
+{
+    u32 l, h;
+    unsigned int cpu = smp_processor_id();
+    static s_time_t next[NR_CPUS];
+
+    ack_APIC_irq();
+    if (NOW() < next[cpu])
+        return;
+
+    next[cpu] = NOW() + MILLISECS(5000);
+    rdmsr(MSR_IA32_THERM_STATUS, l, h);
+    if (l & 0x1) {
+        printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
+        printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
+                cpu);
+        add_taint(TAINT_MACHINE_CHECK);
+    } else {
+        printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+    }
+}
+
+/* Thermal interrupt handler for this CPU setup */
+static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) 
+        = unexpected_thermal_interrupt;
+
+fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
+{
+    irq_enter();
+    vendor_thermal_interrupt(regs);
+    irq_exit();
+}
+
+/* P4/Xeon Thermal regulation detect and init */
+static void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+    u32 l, h;
+    int tm2 = 0;
+    unsigned int cpu = smp_processor_id();
+
+    /* Thermal monitoring */
+    if (!cpu_has(c, X86_FEATURE_ACPI))
+        return;	/* -ENODEV */
+
+    /* Clock modulation */
+    if (!cpu_has(c, X86_FEATURE_ACC))
+        return;	/* -ENODEV */
+
+    /* first check if its enabled already, in which case there might
+     * be some SMM goo which handles it, so we can't even put a handler
+     * since it might be delivered via SMI already -zwanem.
+     */
+    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+    h = apic_read(APIC_LVTTHMR);
+    if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
+        printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
+        return; /* -EBUSY */
+    }
+
+    if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+        tm2 = 1;
+
+	/* check whether a vector already exists, temporarily masked? */
+    if (h & APIC_VECTOR_MASK) {
+        printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
+                 cpu, (h & APIC_VECTOR_MASK));
+        return; /* -EBUSY */
+    }
+
+    /* The temperature transition interrupt handler setup */
+    h = THERMAL_APIC_VECTOR;		/* our delivery vector */
+    h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
+    apic_write_around(APIC_LVTTHMR, h);
+
+    rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
+    wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+
+    /* ok we're good to go... */
+    vendor_thermal_interrupt = intel_thermal_interrupt;
+
+    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+    wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+    l = apic_read (APIC_LVTTHMR);
+    apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+    printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 
+            cpu, tm2 ? "TM2" : "TM1");
+    return;
+}
+#endif /* CONFIG_X86_MCE_THERMAL */
+
+/* machine_check_poll might be called by following types:
+ * 1. called when do mcheck_init.
+ * 2. called in cmci interrupt handler
+ * 3. called in polling handler
+ * It will generate a new mc_info item if found CE/UC errors. DOM0 is the 
+ * consumer.
+*/
+static int machine_check_poll(struct mc_info *mi, int calltype)
+{
+    int exceptions = (read_cr4() & X86_CR4_MCE);
+    int i, nr_unit = 0, uc = 0, pcc = 0;
+    uint64_t status, addr;
+    struct mcinfo_global mcg;
+    struct mcinfo_extended mce;
+    unsigned int cpu;
+    struct domain *d;
+
+    cpu = smp_processor_id();
+
+    if (!mi) {
+        printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
+        return 0;
+    }
+    x86_mcinfo_clear(mi);
+
+    memset(&mcg, 0, sizeof(mcg));
+    mcg.common.type = MC_TYPE_GLOBAL;
+    mcg.common.size = sizeof(mcg);
+    /*If called from cpu-reset check, don't need to fill them.
+     *If called from cmci context, we'll try to fill domid by memory addr
+    */
+    mcg.mc_domid = -1;
+    mcg.mc_vcpuid = -1;
+    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
+        mcg.mc_flags = MC_FLAG_POLLED;
+    else if (calltype == MC_FLAG_CMCI)
+        mcg.mc_flags = MC_FLAG_CMCI;
+    mcg.mc_socketid = phys_proc_id[cpu];
+    mcg.mc_coreid = cpu_core_id[cpu];
+    mcg.mc_apicid = cpu_physical_id(cpu);
+    mcg.mc_core_threadid = mcg.mc_apicid & ( 1 << (smp_num_siblings - 1)); 
+    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
+
+    for ( i = 0; i < nr_mce_banks; i++ ) {
+        struct mcinfo_bank mcb;
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+        if (! (status & MCi_STATUS_VAL) )
+            continue;
+        /*
+         * Uncorrected events are handled by the exception
+         * handler when it is enabled. But when the exception
+         * is disabled such as when mcheck_init, log everything.
+         */
+        if ((status & MCi_STATUS_UC) && exceptions)
+            continue;
+
+        if (status & MCi_STATUS_UC)
+            uc = 1;
+        if (status & MCi_STATUS_PCC)
+            pcc = 1;
+
+        memset(&mcb, 0, sizeof(mcb));
+        mcb.common.type = MC_TYPE_BANK;
+        mcb.common.size = sizeof(mcb);
+        mcb.mc_bank = i;
+        mcb.mc_status = status;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+            d = maddr_get_owner(addr);
+            if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
+                mcb.mc_domid = d->domain_id;
+        }
+        if (cmci_support)
+            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+        if (calltype == MC_FLAG_CMCI)
+            rdtscll(mcb.mc_tsc);
+        x86_mcinfo_add(mi, &mcb);
+        nr_unit++;
+        add_taint(TAINT_MACHINE_CHECK);
+        /*Clear state for this bank */
+        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
+        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%lx]\n", 
+                i, cpu, status);
+        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
+                "thread[%d]\n", cpu, mcg.mc_socketid, 
+                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
+ 
+    }
+    /*if pcc = 1, uc must be 1*/
+    if (pcc)
+        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+    else if (uc)
+        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+    else /*correctable*/
+        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+
+    if (nr_unit && nr_intel_ext_msrs && 
+                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
+        intel_get_extended_msrs(&mce);
+        x86_mcinfo_add(mi, &mce);
+    }
+    if (nr_unit) 
+        x86_mcinfo_add(mi, &mcg);
+    /*Clear global state*/
+    return nr_unit;
+}
+
+static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+{
+    /* MACHINE CHECK Error handler will be sent in another patch,
+     * simply copy old solutions here. This code will be replaced
+     * by upcoming machine check patches
+     */
+
+    int recover=1;
+    u32 alow, ahigh, high, low;
+    u32 mcgstl, mcgsth;
+    int i;
+   
+    rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+    if (mcgstl & (1<<0))	/* Recoverable ? */
+    	recover=0;
+    
+    printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+    	smp_processor_id(), mcgsth, mcgstl);
+    
+    for (i=0; i<nr_mce_banks; i++) {
+    	rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
+    	if (high & (1<<31)) {
+    		if (high & (1<<29))
+    			recover |= 1;
+    		if (high & (1<<25))
+    			recover |= 2;
+    		printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
+    		high &= ~(1<<31);
+    		if (high & (1<<27)) {
+    			rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+    			printk ("[%08x%08x]", ahigh, alow);
+    		}
+    		if (high & (1<<26)) {
+    			rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
+    			printk (" at %08x%08x", ahigh, alow);
+    		}
+    		printk ("\n");
+    	}
+    }
+    
+    if (recover & 2)
+    	mc_panic ("CPU context corrupt");
+    if (recover & 1)
+    	mc_panic ("Unable to continue");
+    
+    printk(KERN_EMERG "Attempting to continue.\n");
+    /* 
+     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+     * recoverable/continuable.This will allow BIOS to look at the MSRs
+     * for errors if the OS could not log the error.
+     */
+    for (i=0; i<nr_mce_banks; i++) {
+    	u32 msr;
+    	msr = MSR_IA32_MC0_STATUS+i*4;
+    	rdmsr (msr, low, high);
+    	if (high&(1<<31)) {
+    		/* Clear it */
+    		wrmsr(msr, 0UL, 0UL);
+    		/* Serialize */
+    		wmb();
+    		add_taint(TAINT_MACHINE_CHECK);
+    	}
+    }
+    mcgstl &= ~(1<<2);
+    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+}
+
+extern void (*cpu_down_handler)(int down_cpu);
+extern void (*cpu_down_rollback_handler)(int down_cpu);
+extern void mce_disable_cpu(void);
+static bool_t cmci_clear_lock = 0;
+static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
+
+/*
+ * Discover bank sharing using the algorithm recommended in the SDM.
+ */
+static int do_cmci_discover(int i)
+{
+    unsigned msr = MSR_IA32_MC0_CTL2 + i;
+    u64 val;
+
+    rdmsrl(msr, val);
+    /* Some other CPU already owns this bank. */
+    if (val & CMCI_EN) {
+    	clear_bit(i, __get_cpu_var(mce_banks_owned));
+    	goto out;
+    }
+    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
+    rdmsrl(msr, val);
+
+    if (!(val & CMCI_EN)) {
+     /*
+      * This bank does not support CMCI. The polling
+      * timer has to handle it. 
+      */
+    	set_bit(i, __get_cpu_var(no_cmci_banks));
+    	return 0;
+    }
+    set_bit(i, __get_cpu_var(mce_banks_owned));
+out:
+    clear_bit(i, __get_cpu_var(no_cmci_banks));
+    return 1;
+}
+
+void cmci_discover(void)
+{
+    int i;
+
+    printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
+    spin_lock(&cmci_discover_lock);
+    for (i = 0; i < nr_mce_banks; i++) {
+        /*If the cpu is the bank owner, need not re-discover*/
+        if (test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        do_cmci_discover(i);
+    }
+    spin_unlock(&cmci_discover_lock);
+    printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
+            smp_processor_id(), 
+            *((unsigned long *)__get_cpu_var(mce_banks_owned)), 
+            *((unsigned long *)__get_cpu_var(no_cmci_banks)));
+}
+
+/*
+ * Define an owner for each bank. Banks can be shared between CPUs
+ * and to avoid reporting events multiple times always set up one
+ * CPU as owner. 
+ *
+ * The assignment has to be redone when CPUs go offline and
+ * any of the owners goes away. Also pollers run in parallel so we
+ * have to be careful to update the banks in a way that doesn't
+ * lose or duplicate events.
+ */
+
+static void mce_set_owner(void)
+{
+
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    cmci_discover();
+}
+
+static void clear_cmci(void)
+{
+    int i;
+
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    printk(KERN_DEBUG "CMCI: clear_cmci support on CPU%d\n", 
+            smp_processor_id());
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        unsigned msr = MSR_IA32_MC0_CTL2 + i;
+        u64 val;
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        rdmsrl(msr, val);
+        if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
+            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
+        clear_bit(i, __get_cpu_var(mce_banks_owned));
+    }
+}
+
+/*we need to re-set cmci owners when cpu_down fail or cpu_up*/
+static void cmci_reenable_cpu(void *h)
+{
+    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
+         return;
+    printk(KERN_DEBUG "CMCI: reenable mce on CPU%d\n", smp_processor_id());
+    mce_set_owner();
+    set_in_cr4(X86_CR4_MCE);
+}
+
+/* When take cpu_down, we need to execute the impacted cmci_owner judge algorithm 
+ * First, we need to clear the ownership on the dead CPU
+ * Then,  other CPUs will check whether to take the bank's ownership from down_cpu
+ * CPU0 need not and "never" execute this path
+*/
+void  __cpu_clear_cmci( int down_cpu)
+{
+    int cpu = smp_processor_id();
+
+    if (!cmci_support && mce_disabled == 1)
+        return;
+
+    if (cpu == 0) {
+        printk(KERN_DEBUG "CMCI: CPU0 need not be cleared\n");
+        return;
+    }
+
+    local_irq_disable();
+    if (cpu == down_cpu){
+        mce_disable_cpu();
+        clear_cmci();
+        wmb();
+        test_and_set_bool(cmci_clear_lock);
+        return;
+    }
+    while (!cmci_clear_lock)
+        cpu_relax();
+    if (cpu != down_cpu)
+        mce_set_owner();
+
+    test_and_clear_bool(cmci_clear_lock);
+    local_irq_enable();
+
+}
+
+void  __cpu_clear_cmci_rollback( int down_cpu)
+{
+    cpumask_t down_map;
+    if (!cmci_support || mce_disabled == 1) 
+        return;
+
+    cpus_clear(down_map);
+    cpu_set(down_cpu, down_map);
+    printk(KERN_ERR "CMCI: cpu_down fail. "
+        "Reenable cmci on CPU%d\n", down_cpu);
+    on_selected_cpus(down_map, cmci_reenable_cpu, NULL, 1, 1);
+}
+
+static void intel_init_cmci(struct cpuinfo_x86 *c)
+{
+    u32 l, apic;
+    int cpu = smp_processor_id();
+
+    if (!mce_available(c) || !cmci_support) {
+        printk(KERN_DEBUG "CMCI: CPU%d has no CMCI support\n", cpu);
+        return;
+    }
+
+    apic = apic_read(APIC_CMCI);
+    if ( apic & APIC_VECTOR_MASK )
+    {
+        printk(KERN_WARNING "CPU%d CMCI LVT vector (%#x) already installed\n",
+            cpu, ( apic & APIC_VECTOR_MASK ));
+        return;
+    }
+
+    apic = CMCI_APIC_VECTOR;
+    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+    apic_write_around(APIC_CMCI, apic);
+
+	/*now clear mask flag*/
+    l = apic_read(APIC_CMCI);
+    apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
+    cpu_down_handler =  __cpu_clear_cmci;
+    cpu_down_rollback_handler = __cpu_clear_cmci_rollback; 
+}
+
+fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
+{
+    int nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+    int cpu = smp_processor_id();
+
+    ack_APIC_irq();
+    irq_enter();
+    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
+    nr_unit = machine_check_poll(mi, MC_FLAG_CMCI);
+    if (nr_unit) {
+        x86_mcinfo_dump(mi);
+        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+            send_guest_global_virq(dom0, VIRQ_MCA);
+    }
+    irq_exit();
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+
+#ifdef CONFIG_X86_MCE_THERMAL
+    intel_init_thermal(c);
+#endif
+    intel_init_cmci(c);
+}
+
+static void mce_cap_init(struct cpuinfo_x86 *c)
+{
+    u32 l, h;
+
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    if ((l & MCG_CMCI_P) && cpu_has_apic)
+        cmci_support = 1;
+
+    nr_mce_banks = l & 0xff;
+    if (nr_mce_banks > MAX_NR_BANKS)
+        printk(KERN_WARNING "MCE: exceed max mce banks\n");
+    if (l & MCG_EXT_P)
+    {
+        nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
+        printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
+            smp_processor_id(), nr_intel_ext_msrs);
+    }
+    /* for most of p6 family, bank 0 is an alias bios MSR.
+     * But after model>1a, bank 0 is available*/
+    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
+            && c->x86_model < 0x1A)
+        firstbank = 1;
+    else
+        firstbank = 0;
+}
+
+static void mce_init(void)
+{
+    u32 l, h;
+    int i, nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+    clear_in_cr4(X86_CR4_MCE);
+    /* log the machine checks left over from the previous reset.
+     * This also clears all registers*/
+
+    nr_unit = machine_check_poll(mi, MC_FLAG_RESET);
+    /*in the boot up stage, not expect inject to DOM0, but go print out
+    */
+    if (nr_unit > 0)
+        x86_mcinfo_dump(mi);
+
+    set_in_cr4(X86_CR4_MCE);
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    if (l & MCG_CTL_P)	/* Control register present ? */
+        wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+    for (i = firstbank; i < nr_mce_banks; i++)
+    {
+        /*Some banks are shared across cores, use MCi_CTRL to judge whether
+         * this bank has been initialized by other cores already.*/
+        rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
+        if (!l & !h)
+        {
+            /*if ctl is 0, this bank is never initialized*/
+            printk(KERN_DEBUG "mce_init: init bank%d\n", i);
+            wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
+            wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
+       }
+    }
+    if (firstbank) /*if cmci enabled, firstbank = 0*/
+        wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+}
+
+/*p4/p6 faimily has similar MCA initialization process*/
+void intel_mcheck_init(struct cpuinfo_x86 *c)
+{
+	
+	mce_cap_init(c);
+	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+		smp_processor_id());
+	/* machine check is available */
+	machine_check_vector = intel_machine_check;
+	mce_init();
+	mce_intel_feature_init(c);
+	mce_set_owner();
+}
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll faster. When the poller finds no more 
+ * errors, poll slower
+*/
+static struct timer mce_timer;
+
+#define MCE_PERIOD 4000
+#define MCE_MIN    2000
+#define MCE_MAX    32000
+
+static u64 period = MCE_PERIOD;
+static int adjust = 0;
+
+static void mce_intel_checkregs(void *info)
+{
+    int nr_unit;
+    struct mc_info *mi =  x86_mcinfo_getptr();
+
+    if( !mce_available(&current_cpu_data))
+        return;
+    nr_unit = machine_check_poll(mi, MC_FLAG_POLLED);
+    if (nr_unit)
+    {
+        x86_mcinfo_dump(mi);
+        adjust++;
+        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+            send_guest_global_virq(dom0, VIRQ_MCA);
+    }
+}
+
+static void mce_intel_work_fn(void *data)
+{
+    on_each_cpu(mce_intel_checkregs, data, 1, 1);
+    if (adjust) {
+        period = period / (adjust + 1);
+        printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval to %ld",
+            period);
+    }
+    else {
+        period *= 2;
+    }
+    if (period > MCE_MAX) 
+        period = MCE_MAX;
+    if (period < MCE_MIN)
+        period = MCE_MIN;
+    set_timer(&mce_timer, NOW() + MILLISECS(period));
+    adjust = 0;
+}
+
+void intel_mcheck_timer(struct cpuinfo_x86 *c)
+{
+    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
+    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
+    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
+}
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Sun Jan 02 00:25:07 2005 +0800
@@ -19,8 +19,8 @@
 #include <asm/msr.h>
 
 #include "mce.h"
-
-static int firstbank;
+#include "x86_mca.h"
+int firstbank = 0;
 static struct timer mce_timer;
 
 #define MCE_PERIOD MILLISECS(15000)
@@ -61,13 +61,8 @@
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
 	/* Check for MCE support */
-	if (!cpu_has(c, X86_FEATURE_MCE))
+	if (!mce_available(c))
 		return -ENODEV;
-
-	/* Check for PPro style MCA */
-	if (!cpu_has(c, X86_FEATURE_MCA))
-		return -ENODEV;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
@@ -85,12 +80,20 @@
 		break;
 
 	case X86_VENDOR_INTEL:
-		init_timer(&mce_timer, mce_work_fn, NULL, 0);
-		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		/* p5 family is different. P4/P6 and latest CPUs shares the
+		 * same polling methods
+		*/
+		if ( c->x86 != 5 )
+		{
+			/* some CPUs or banks don't support cmci, we need to 
+			 * enable this feature anyway
+			 */
+			intel_mcheck_timer(c);
+		}
 		break;
 	}
 
-	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
+	printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun Jan 02 00:25:07 2005 +0800
@@ -28,7 +28,10 @@
 /* Bitfield of the MSR_IA32_MCG_CAP register */
 #define MCG_CAP_COUNT           0x00000000000000ffULL
 #define MCG_CTL_P               0x0000000000000100ULL
-/* Bits 9-63 are reserved */
+#define MCG_EXT_P		(1UL<<9)
+#define MCG_EXT_CNT		(16)
+#define MCG_CMCI_P		(1UL<<10)
+/* Other bits are reserved */
 
 /* Bitfield of the MSR_IA32_MCG_STATUS register */
 #define MCG_STATUS_RIPV         0x0000000000000001ULL
@@ -70,3 +73,17 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/*Intel Specific bitfield*/
+#define CMCI_THRESHOLD			0x2
+
+
+#define MAX_NR_BANKS 128
+
+typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
+
+/* Global variables */
+extern int mce_disabled;
+extern unsigned int nr_mce_banks;
+extern int firstbank;
+
diff -r a2069e8a3055 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/arch/x86/smpboot.c	Sun Jan 02 00:25:07 2005 +0800
@@ -1237,11 +1237,24 @@
 }
 
 extern void fixup_irqs(cpumask_t map);
-int __cpu_disable(void)
+
+/*
+ * Called when offline cpu. We need to process some new
+ * feature such as CMCI owner change in latest Intel
+ * CPU families
+*/
+void (*cpu_down_handler)(int down_cpu) = NULL;
+void (*cpu_down_rollback_handler)(int down_cpu) = NULL;
+
+
+int __cpu_disable(int down_cpu)
 {
 	cpumask_t map = cpu_online_map;
 	int cpu = smp_processor_id();
 
+	/*Only down_cpu need to execute this function*/
+	if (cpu != down_cpu)
+		return 0;
 	/*
 	 * Perhaps use cpufreq to drop frequency, but that could go
 	 * into generic code.
@@ -1293,10 +1306,14 @@
 	}
  	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
+static int take_cpu_down(void *down_cpu)
+{
 
-static int take_cpu_down(void *unused)
-{
-    return __cpu_disable();
+    if (cpu_down_handler)
+        cpu_down_handler(*(int *)down_cpu);
+    wmb();
+
+    return __cpu_disable(*(int *)down_cpu);
 }
 
 int cpu_down(unsigned int cpu)
@@ -1322,7 +1339,7 @@
 
 	printk("Prepare to bring CPU%d down...\n", cpu);
 
-	err = stop_machine_run(take_cpu_down, NULL, cpu);
+	err = stop_machine_run(take_cpu_down, &cpu, cpu_online_map);
 	if ( err < 0 )
 		goto out;
 
@@ -1333,6 +1350,10 @@
 		err = -EBUSY;
 	}
 out:
+	/*if cpu_offline failed, re-check cmci_owner*/
+
+	if ( err < 0 && cpu_down_rollback_handler) 
+		cpu_down_rollback_handler(cpu); 
 	spin_unlock(&cpu_add_remove_lock);
 	return err;
 }
diff -r a2069e8a3055 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/asm-x86/msr-index.h	Sun Jan 02 00:25:07 2005 +0800
@@ -92,8 +92,10 @@
 #define MSR_IA32_MC0_STATUS		0x00000401
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
+#define MSR_IA32_MC0_CTL2		0x00000280
+#define CMCI_EN 			(1UL<<30)
+#define CMCI_THRESHOLD_MASK		0x7FFF
 
-#define MSR_IA32_MC1_CTL		0x00000404
 #define MSR_IA32_MC1_STATUS		0x00000405
 #define MSR_IA32_MC1_ADDR		0x00000406
 #define MSR_IA32_MC1_MISC		0x00000407
diff -r a2069e8a3055 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/asm-x86/smp.h	Sun Jan 02 00:25:07 2005 +0800
@@ -101,7 +101,7 @@
 
 #endif
 
-extern int __cpu_disable(void);
+extern int __cpu_disable(int down_cpu);
 extern void __cpu_die(unsigned int cpu);
 #endif /* !__ASSEMBLY__ */
 
diff -r a2069e8a3055 xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h	Thu Dec 18 14:38:19 2008 +0800
+++ b/xen/include/public/arch-x86/xen-mca.h	Sun Jan 02 00:25:07 2005 +0800
@@ -106,7 +106,10 @@
 
 #define MC_FLAG_CORRECTABLE     (1 << 0)
 #define MC_FLAG_UNCORRECTABLE   (1 << 1)
-
+#define MC_FLAG_RECOVERABLE	(1 << 2)
+#define MC_FLAG_POLLED		(1 << 3)
+#define MC_FLAG_RESET		(1 << 4)
+#define MC_FLAG_CMCI		(1 << 5)
 /* contains global x86 mc information */
 struct mcinfo_global {
     struct mcinfo_common common;
@@ -115,6 +118,7 @@
     uint16_t mc_domid;
     uint32_t mc_socketid; /* physical socket of the physical core */
     uint16_t mc_coreid; /* physical impacted core */
+    uint8_t  mc_apicid;
     uint16_t mc_core_threadid; /* core thread of physical core */
     uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
     uint64_t mc_gstatus; /* global status */
@@ -132,6 +136,8 @@
     uint64_t mc_addr;   /* bank address, only valid
                          * if addr bit is set in mc_status */
     uint64_t mc_misc;
+    uint64_t mc_ctrl2;
+    uint64_t mc_tsc;
 };
 
 
@@ -150,7 +156,12 @@
      * multiple times. */
 
     uint32_t mc_msrs; /* Number of msr with valid values. */
-    struct mcinfo_msr mc_msr[5];
+    /*
+     * Currently Intel extended MSR (32/64) including all gp registers
+     * and E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP, E(R)MISC, only 10
+     * of them might be useful. So expend this array to 10.
+    */
+    struct mcinfo_msr mc_msr[10];
 };
 
 #define MCINFO_HYPERCALLSIZE	1024

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2008-12-19  6:19 [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs Ke, Liping
@ 2009-01-12 14:03 ` Christoph Egger
  2009-01-13  2:12   ` Ke, Liping
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Egger @ 2009-01-12 14:03 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser, Ke, Liping

On Friday 19 December 2008 07:19:16 Ke, Liping wrote:
> Hi, all
>
> Patch 4 is the main patch for CMCI enabling in XEN. It adds the CMCI
> interrupt handler, new common CMCI/MCA init process,CMCI owner judge
> algorithm when bring_up CPUs, CPU on/offline  and  polling mechanisms, etc
>
> Thanks & Regards,
> Criping

This patch changes the public API. There's no need to change
struct mcinfo_extended. The marshalling technique allows to use
this structure as often as needed. Please undo this change.

struct mcinfo_global is also changed. Why can't mc_coreid not be
filled with the apicid ? Adding the apicid field breaks the alignment
causing troubles on 32bit-guest-on-64bit-hypervisor.

Christoph

-- 
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34
85609 Dornach b. München

Geschäftsführer: Jochen Polster, Thomas M. McCoy, Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis München
Registergericht München, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-12 14:03 ` Christoph Egger
@ 2009-01-13  2:12   ` Ke, Liping
  2009-01-13 11:21     ` Christoph Egger
  0 siblings, 1 reply; 11+ messages in thread
From: Ke, Liping @ 2009-01-13  2:12 UTC (permalink / raw)
  To: Christoph Egger, xen-devel@lists.xensource.com
  Cc: Jiang, Yunhong, Keir Fraser

Hi, Christoph
Please see our comments below

Christoph Egger wrote:
> On Friday 19 December 2008 07:19:16 Ke, Liping wrote:
>> Hi, all
>> 
>> Patch 4 is the main patch for CMCI enabling in XEN. It adds the CMCI
>> interrupt handler, new common CMCI/MCA init process,CMCI owner judge
>> algorithm when bring_up CPUs, CPU on/offline  and  polling
>> mechanisms, etc 
>> 
>> Thanks & Regards,
>> Criping
> 
> This patch changes the public API. There's no need to change
> struct mcinfo_extended. The marshalling technique allows to use
> this structure as often as needed. Please undo this change.

Since Intel extended msrs' number is bigger than AMD, and we can't use pointer and allocate memory in mca handler,
so we extended it from 5 -> 10. We think it should not impact any of your usage.

Else, we can change boundary 5->0, use a globally allocated pointer instead. But it seems not that necessary.
How do you think about?

> 
> struct mcinfo_global is also changed. Why can't mc_coreid not be
> filled with the apicid ? Adding the apicid field breaks the alignment
> causing troubles on 32bit-guest-on-64bit-hypervisor.
We plan to extend the apic_id to 32 bit since 8 bit is not enough for new apic_id.
Besides, for this problem, before sending the patch, we actually talked about it. Seems original structure is not 32/64 alligned.
How about below changes?

struct mcinfo_global {
    struct mcinfo_common common; 4 byte

    /* running domain at the time in error (most likely the impacted one) */
    uint16_t mc_domid; 2 byte
    uint32_t mc_socketid; /* physical socket of the physical core */ 4 byte
    uint16_t mc_coreid; /* physical impacted core */ 2 byte
    uint8_t  mc_apicid; ---change it to 4 byte
    uint16_t mc_core_threadid; /* core thread of physical core */ 2 byte
    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ 2 byte
    uint64_t mc_gstatus; /* global status */ 8 byte
    uint32_t mc_flags; 4 byte
};


Change to 
------------------------>>>>>

struct mcinfo_global {
    struct mcinfo_common common; 4 byte

    /* running domain at the time in error (most likely the impacted one) */
    uint32_t mc_socketid; /* physical socket of the physical core */ 4 byte
-----------------------------------------------------------------
    uint16_t mc_domid; 2 byte
    uint16_t mc_coreid; /* physical impacted core */ 2 byte
    uint32_t  mc_apicid; ---change it to 4 byte
-------------------------------------------------------------------
    uint16_t mc_core_threadid; /* core thread of physical core */ 2 byte
    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ 2 byte
    uint32_t mc_flags; 4 byte
--------------------------------------------------------------------------
    uint64_t mc_gstatus; /* global status */ 8 byte
};

Thanks & Regards,
Criping




> 
> Christoph

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13  2:12   ` Ke, Liping
@ 2009-01-13 11:21     ` Christoph Egger
  2009-01-13 16:48       ` Frank Van Der Linden
                         ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Christoph Egger @ 2009-01-13 11:21 UTC (permalink / raw)
  To: Ke, Liping; +Cc: xen-devel@lists.xensource.com, Keir Fraser, Jiang, Yunhong

On Tuesday 13 January 2009 03:12:51 Ke, Liping wrote:
> Hi, Christoph
> Please see our comments below
>
> Christoph Egger wrote:
> > On Friday 19 December 2008 07:19:16 Ke, Liping wrote:
> >> Hi, all
> >>
> >> Patch 4 is the main patch for CMCI enabling in XEN. It adds the CMCI
> >> interrupt handler, new common CMCI/MCA init process,CMCI owner judge
> >> algorithm when bring_up CPUs, CPU on/offline  and  polling
> >> mechanisms, etc
> >>
> >> Thanks & Regards,
> >> Criping
> >
> > This patch changes the public API. There's no need to change
> > struct mcinfo_extended. The marshalling technique allows to use
> > this structure as often as needed. Please undo this change.
>
> Since Intel extended msrs' number is bigger than AMD, and we can't use
> pointer and allocate memory in mca handler, so we extended it from 5 -> 10.
> We think it should not impact any of your usage.
>
> Else, we can change boundary 5->0, use a globally allocated pointer
> instead. But it seems not that necessary. How do you think about?

When I defined the API, I already knew that 5 is not enough for Intel.
So I made struct mc_info large enough to keep multiple entities in any
combination. I expected from you to fill struct mcinfo_extended two or
three times into struct mc_info  the same way as you do with
struct mcinfo_bank.   There's no (additional) allocation needed.

From your description I just read, that you don't know this marshalling 
technique.


> > struct mcinfo_global is also changed. Why can't mc_coreid not be
> > filled with the apicid ? Adding the apicid field breaks the alignment
> > causing troubles on 32bit-guest-on-64bit-hypervisor.
>
> We plan to extend the apic_id to 32 bit since 8 bit is not enough for new
> apic_id. Besides, for this problem, before sending the patch, we actually
> talked about it. Seems original structure is not 32/64 alligned. How about
> below changes?
>
> struct mcinfo_global {
>     struct mcinfo_common common; 4 byte
>
>     /* running domain at the time in error (most likely the impacted one)
> */ uint16_t mc_domid; 2 byte
>     uint32_t mc_socketid; /* physical socket of the physical core */ 4 byte
>     uint16_t mc_coreid; /* physical impacted core */ 2 byte
>     uint8_t  mc_apicid; ---change it to 4 byte
>     uint16_t mc_core_threadid; /* core thread of physical core */ 2 byte
>     uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ 2 byte
>     uint64_t mc_gstatus; /* global status */ 8 byte
>     uint32_t mc_flags; 4 byte
> };
>
>
> Change to
> ------------------------>>>>>
>
> struct mcinfo_global {
>     struct mcinfo_common common; 4 byte
>
>     /* running domain at the time in error (most likely the impacted one)
> */ uint32_t mc_socketid; /* physical socket of the physical core */ 4 byte
> -----------------------------------------------------------------
>     uint16_t mc_domid; 2 byte
>     uint16_t mc_coreid; /* physical impacted core */ 2 byte
>     uint32_t  mc_apicid; ---change it to 4 byte
> -------------------------------------------------------------------
>     uint16_t mc_core_threadid; /* core thread of physical core */ 2 byte
>     uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ 2 byte
>     uint32_t mc_flags; 4 byte
> --------------------------------------------------------------------------
>     uint64_t mc_gstatus; /* global status */ 8 byte
> };

Your proposed change fixes the alignment problem, but it doesn't answer my
question: Why is mc_apicid needed at all since the CPU core is already
identified by mc_coreid ?

Christoph



-- 
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34
85609 Dornach b. München
 
Geschäftsführer: Jochen Polster, Thomas M. McCoy, Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis München
Registergericht München, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 11:21     ` Christoph Egger
@ 2009-01-13 16:48       ` Frank Van Der Linden
  2009-01-13 17:11         ` Christoph Egger
  2009-01-14  1:35       ` Ke, Liping
  2009-01-14  2:58       ` Jiang, Yunhong
  2 siblings, 1 reply; 11+ messages in thread
From: Frank Van Der Linden @ 2009-01-13 16:48 UTC (permalink / raw)
  To: Christoph Egger
  Cc: Jiang, Yunhong, xen-devel@lists.xensource.com, Keir Fraser,
	Ke, Liping

Hi Christoph,

This patch contains part of our work to extend the MCE interface, as 
part of our FMA efforts. I know we changed some structures, and I also 
remember some alignment issues that we had to fix.

I think these patches are partly based on a slightly older version of 
what we made available, not our final version.

I'm hoping to bring all our changes up to xen-unstable level soon (they 
were originally for 3.1.4), and post them. That should refresh my mind 
about our changes, and hopefully we can discuss things then (with Gavin 
too, who made a lot of these changes).

We (Sun) should probably also re-synch with some of the Intel efforts.

Again, hopefully we can do this soon,

- Frank

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 16:48       ` Frank Van Der Linden
@ 2009-01-13 17:11         ` Christoph Egger
  2009-01-13 17:29           ` Frank van der Linden
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Egger @ 2009-01-13 17:11 UTC (permalink / raw)
  To: Frank Van Der Linden
  Cc: Jiang, Yunhong, xen-devel@lists.xensource.com, Keir Fraser,
	Ke, Liping

On Tuesday 13 January 2009 17:48:48 Frank Van Der Linden wrote:
> Hi Christoph,
>
> This patch contains part of our work to extend the MCE interface, as
> part of our FMA efforts. I know we changed some structures, and I also
> remember some alignment issues that we had to fix.
>
> I think these patches are partly based on a slightly older version of
> what we made available, not our final version.
>
> I'm hoping to bring all our changes up to xen-unstable level soon (they
> were originally for 3.1.4), and post them. That should refresh my mind
> about our changes, and hopefully we can discuss things then (with Gavin
> too, who made a lot of these changes).

The patches I have seen so far, don't touch the public API at all.
But Intel did. That's the point in this discussion.

> We (Sun) should probably also re-synch with some of the Intel efforts.
>
> Again, hopefully we can do this soon,
>
> - Frank



-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Jochen Polster, Thomas M. McCoy, Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 17:11         ` Christoph Egger
@ 2009-01-13 17:29           ` Frank van der Linden
  2009-01-14  2:11             ` Jiang, Yunhong
  0 siblings, 1 reply; 11+ messages in thread
From: Frank van der Linden @ 2009-01-13 17:29 UTC (permalink / raw)
  To: Christoph Egger
  Cc: Jiang, Yunhong, xen-devel@lists.xensource.com, Keir Fraser,
	Ke, Liping

Christoph Egger wrote:

> The patches I have seen so far, don't touch the public API at all.
> But Intel did. That's the point in this discussion.

Actually, it seems we did add a few things that changed it. But looking 
at our changes now, we might not need them (like mc_apicid; it appears 
we don't actually use it anywhere).

But, I'll have to get back to you on that after I've brought the changes 
up to xen-unstable level.

- Frank

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 17:29           ` Frank van der Linden
@ 2009-01-14  2:11             ` Jiang, Yunhong
  0 siblings, 0 replies; 11+ messages in thread
From: Jiang, Yunhong @ 2009-01-14  2:11 UTC (permalink / raw)
  To: Frank.Vanderlinden@Sun.COM, Christoph Egger
  Cc: xen-devel@lists.xensource.com, Keir Fraser, Ke, Liping

Frank.Vanderlinden@Sun.COM <mailto:Frank.Vanderlinden@Sun.COM> wrote:
> Christoph Egger wrote:
> 
>> The patches I have seen so far, don't touch the public API at all.
>> But Intel did. That's the point in this discussion.
> 
> Actually, it seems we did add a few things that changed it.
> But looking
> at our changes now, we might not need them (like mc_apicid; it appears
> we don't actually use it anywhere).

For the mc_apicid, I think it gives informatoin for the whole topology information of the system, and will be helpful if needed in future. That's the reason we adopt the changes in our patch. Of course, we can get them through translate the mc_coreid to the apic_id, requiring dom0 cache the translation or have a hypercall for it. 

> 
> But, I'll have to get back to you on that after I've brought
> the changes
> up to xen-unstable level.

Yes, I suggest we hold the discussion till your changes are submited to upstream. 

> 
> - Frank

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 11:21     ` Christoph Egger
  2009-01-13 16:48       ` Frank Van Der Linden
@ 2009-01-14  1:35       ` Ke, Liping
  2009-01-14  2:58       ` Jiang, Yunhong
  2 siblings, 0 replies; 11+ messages in thread
From: Ke, Liping @ 2009-01-14  1:35 UTC (permalink / raw)
  To: Christoph Egger
  Cc: Keir, xen-devel@lists.xensource.com, Fraser, Jiang, Yunhong

Christoph Egger wrote:
> On Tuesday 13 January 2009 03:12:51 Ke, Liping wrote:
>> Hi, Christoph
>> Please see our comments below
>> 
>> Christoph Egger wrote:
>>> On Friday 19 December 2008 07:19:16 Ke, Liping wrote:
>>>> Hi, all
>>>> 
>>>> Patch 4 is the main patch for CMCI enabling in XEN. It adds the
>>>> CMCI interrupt handler, new common CMCI/MCA init process,CMCI
>>>> owner judge algorithm when bring_up CPUs, CPU on/offline  and 
>>>> polling mechanisms, etc 
>>>> 
>>>> Thanks & Regards,
>>>> Criping
>>> 
>>> This patch changes the public API. There's no need to change
>>> struct mcinfo_extended. The marshalling technique allows to use
>>> this structure as often as needed. Please undo this change.
>> 
>> Since Intel extended msrs' number is bigger than AMD, and we can't
>> use pointer and allocate memory in mca handler, so we extended it
>> from 5 -> 10. We think it should not impact any of your usage.
>> 
>> Else, we can change boundary 5->0, use a globally allocated pointer
>> instead. But it seems not that necessary. How do you think about?
> 
> When I defined the API, I already knew that 5 is not enough for Intel.
> So I made struct mc_info large enough to keep multiple entities in any
> combination. I expected from you to fill struct mcinfo_extended two or
> three times into struct mc_info  the same way as you do with
> struct mcinfo_bank.   There's no (additional) allocation needed.
> 
> From your description I just read, that you don't know this
> marshalling technique.

Hi, Christoph

We do understand and actually are using your *marshalling* techniques,
you see we push each bank infor to one entry of your *mcinfo_bank*. 
It's flexible and just good. 
But split one integral information into several entries seems not necessary.
For me it is not good programming style for me. If we have several kinds 
of extended Infor, surely I will use your marshalling techniques. 
It should not *impact* public interface(We noticed you have calculate 
data structure size dynamically we think). How do you think? 

For the reason of keeping apic_id, partly because we're not sure whether 
the information would be useful to recovery action(It brings more
Information than core_id), so we just want to *keep* it. 
If you don't use it, just let it be blank. It should not impact your interface. 
Your opinion? 

Thanks a lot for your help!
Regards,
Criping

> 
> 
>>> struct mcinfo_global is also changed. Why can't mc_coreid not be
>>> filled with the apicid ? Adding the apicid field breaks the
>>> alignment causing troubles on 32bit-guest-on-64bit-hypervisor.
>> 
>> We plan to extend the apic_id to 32 bit since 8 bit is not enough
>> for new apic_id. Besides, for this problem, before sending the
>> patch, we actually talked about it. Seems original structure is not
>> 32/64 alligned. How about below changes? 
>> 
>> struct mcinfo_global {
>>     struct mcinfo_common common; 4 byte
>> 
>>     /* running domain at the time in error (most likely the impacted
>>     one) */ uint16_t mc_domid; 2 byte uint32_t mc_socketid; /*
>>     physical socket of the physical core */ 4 byte uint16_t
>>     mc_coreid; /* physical impacted core */ 2 byte uint8_t 
>>     mc_apicid; ---change it to 4 byte uint16_t mc_core_threadid; /*
>>     core thread of physical core */ 2 byte uint16_t mc_vcpuid; /*
>>     virtual cpu scheduled for mc_domid */ 2 byte uint64_t
>> mc_gstatus; /* global status */ 8 byte     uint32_t mc_flags; 4 byte
>> }; 
>> 
>> 
>> Change to
>> ------------------------>>>>>
>> 
>> struct mcinfo_global {
>>     struct mcinfo_common common; 4 byte
>> 
>>     /* running domain at the time in error (most likely the impacted
>> one) */ uint32_t mc_socketid; /* physical socket of the physical
>> core */ 4 byte
>>    
>>    
>> -----------------------------------------------------------------   
>>     uint16_t mc_domid; 2 byte uint16_t mc_coreid; /* physical
>>     impacted core */ 2 byte uint32_t  mc_apicid; ---change it to 4
>> byte
>>    
>> -------------------------------------------------------------------
>> uint16_t mc_core_threadid; /* core thread of physical core */ 2 byte
>> uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ 2 byte 
>> uint32_t mc_flags; 4 byte
>> --------------------------------------------------------------------------
>> uint64_t mc_gstatus; /* global status */ 8 byte };     
> 
> Your proposed change fixes the alignment problem, but it doesn't
> answer my question: Why is mc_apicid needed at all since the CPU core
> is already identified by mc_coreid ?
> 
> Christoph

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-13 11:21     ` Christoph Egger
  2009-01-13 16:48       ` Frank Van Der Linden
  2009-01-14  1:35       ` Ke, Liping
@ 2009-01-14  2:58       ` Jiang, Yunhong
  2009-01-14  4:13         ` Frank van der Linden
  2 siblings, 1 reply; 11+ messages in thread
From: Jiang, Yunhong @ 2009-01-14  2:58 UTC (permalink / raw)
  To: Christoph Egger, Ke, Liping; +Cc: Keir, xen-devel@lists.xensource.com, Fraser

xen-devel-bounces@lists.xensource.com <> wrote:
> On Tuesday 13 January 2009 03:12:51 Ke, Liping wrote:
> 
> When I defined the API, I already knew that 5 is not enough for Intel.
> So I made struct mc_info large enough to keep multiple entities in any
> combination. I expected from you to fill struct mcinfo_extended two or
> three times into struct mc_info  the same way as you do with
> struct mcinfo_bank.   There's no (additional) allocation needed.

Just as Liping has pointed out, since there is size caculated in mcinfo_common, it should be ok for this changes, especially considering just increase the size, not decrease. 
Anyway, let's wait for  Frank and Gavin's changes. 

BTW, do you know any usage for this MCA mechanism except Sun? We are considering more changes to Xen's MCA and some suggestion is send to mailing list (http://lists.xensource.com/archives/html/xen-devel/2008-12/msg00643.html gives three options  ), we want get feedback from them and you, to avoid suprise in last minutes again.

Thanks
Yunhong Jiang

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs
  2009-01-14  2:58       ` Jiang, Yunhong
@ 2009-01-14  4:13         ` Frank van der Linden
  0 siblings, 0 replies; 11+ messages in thread
From: Frank van der Linden @ 2009-01-14  4:13 UTC (permalink / raw)
  To: Jiang, Yunhong
  Cc: Keir, Christoph Egger, xen-devel@lists.xensource.com, Fraser,
	Ke, Liping

Jiang, Yunhong wrote:

> Just as Liping has pointed out, since there is size caculated in mcinfo_common, it should be ok for this changes, especially considering just increase the size, not decrease. 
> Anyway, let's wait for  Frank and Gavin's changes. 

Thanks! I'll start re-synching our changes with xen-unstable soon (later 
this week). Hopefully I'll be done with that somewhere next week. Then 
I'll send our changes to both you and Christoph for review. Then we can 
discuss them, and present them for integration, so that there's a common 
basis for future work from all sides.

- Frank

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-01-14  4:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-19  6:19 [patch 4/4]Enable CMCI (Corrected Machine Check Error Interrupt) for Intel CPUs Ke, Liping
2009-01-12 14:03 ` Christoph Egger
2009-01-13  2:12   ` Ke, Liping
2009-01-13 11:21     ` Christoph Egger
2009-01-13 16:48       ` Frank Van Der Linden
2009-01-13 17:11         ` Christoph Egger
2009-01-13 17:29           ` Frank van der Linden
2009-01-14  2:11             ` Jiang, Yunhong
2009-01-14  1:35       ` Ke, Liping
2009-01-14  2:58       ` Jiang, Yunhong
2009-01-14  4:13         ` Frank van der Linden

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.