* [patch] 2.6.0 MCA TLB error recovery
@ 2003-12-18 23:37 Luck, Tony
0 siblings, 0 replies; 9+ messages in thread
From: Luck, Tony @ 2003-12-18 23:37 UTC (permalink / raw)
To: linux-ia64
David,
Here's the updated version of the MCA TLB recovery patch
(and I've made the assumption that you'll take Keith's
salinfo patch from 11/25 and the deadlock fix that he posted
on 12/7 first ... so this patch is against base 2.6.0 with
Keith's patches applied).
One slight glitch that I don't understand. When I injected a
TLB error I saw this printk on the console:
+CPU 3: SAL log contains MCA error record
+Err Record ID: 545586543104884737 SAL Rev: 0.03
+Time: 12/18/2003 10:28:32 Severity 0
But the salinfo_decode daemon didn't wake up to pluck this
from the kernel and deposit it in /var/log/salinfo/{raw,decoded}/*
After I rebooted the daemon picked up the log and decoded it and
reported 4 copies of the same stuff, one from each cpu:
BEGIN HARDWARE ERROR STATE from mca on cpu 3
Err Record ID: 545586543104884737 SAL Rev: 0.03
Time: 2003-12-18 10:28:32 Severity 0
Processor Device Error Info Section
UNCORRECTED PROCESSOR ERROR: TLB Check
processor lid : 0x00000000c6180000
cpu: M nasid: 0x618
processor state parameter: 0x10000000ff7211a0
blah, blah, blah
It looks like salinfo_log_wakeup() is called right before
ia64_log_print() ... so I'm not sure why the salinfo_decode
daemon kept on snoozing. Keith: am I missing something obvious?
Here's the patch (substantial portions of this code written by Fenghua Yu):
diff -ru linux-2.6.0/arch/ia64/kernel/asm-offsets.c tlbfix/arch/ia64/kernel/asm-offsets.c
--- linux-2.6.0/arch/ia64/kernel/asm-offsets.c 2003-12-17 18:59:39.000000000 -0800
+++ tlbfix/arch/ia64/kernel/asm-offsets.c 2003-12-18 09:47:18.000000000 -0800
@@ -12,6 +12,7 @@
#include <asm-ia64/ptrace.h>
#include <asm-ia64/siginfo.h>
#include <asm-ia64/sigcontext.h>
+#include <asm-ia64/mca.h>
#include "../kernel/sigframe.h"
@@ -204,4 +205,7 @@
# error "CLONE_SETTLS_BIT incorrect, please fix"
#endif
+ BLANK();
+ DEFINE(IA64_MCA_TLB_INFO_SIZE, sizeof (struct ia64_mca_tlb_info));
+
}
diff -ru linux-2.6.0/arch/ia64/kernel/efi.c tlbfix/arch/ia64/kernel/efi.c
--- linux-2.6.0/arch/ia64/kernel/efi.c 2003-12-17 18:58:05.000000000 -0800
+++ tlbfix/arch/ia64/kernel/efi.c 2003-12-18 09:47:18.000000000 -0800
@@ -30,6 +30,7 @@
#include <asm/kregs.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
+#include <asm/mca.h>
#define EFI_DEBUG 0
@@ -395,6 +396,9 @@
int pal_code_count = 0;
u64 mask, psr;
u64 vaddr;
+#ifdef CONFIG_IA64_MCA
+ int cpu;
+#endif
efi_map_start = __va(ia64_boot_param->efi_memmap);
efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
@@ -455,6 +459,14 @@
IA64_GRANULE_SHIFT);
ia64_set_psr(psr); /* restore psr */
ia64_srlz_i();
+
+#ifdef CONFIG_IA64_MCA
+ cpu = smp_processor_id();
+
+ /* insert this TR into our list for MCA recovery purposes */
+ ia64_mca_tlb_list[cpu].pal_base=vaddr & mask;
+ ia64_mca_tlb_list[cpu].pal_paddr= pte_val(mk_pte_phys(md->phys_addr, PAGE_KERNEL));
+#endif
}
}
diff -ru linux-2.6.0/arch/ia64/kernel/mca_asm.S tlbfix/arch/ia64/kernel/mca_asm.S
--- linux-2.6.0/arch/ia64/kernel/mca_asm.S 2003-12-17 18:59:29.000000000 -0800
+++ tlbfix/arch/ia64/kernel/mca_asm.S 2003-12-18 09:47:18.000000000 -0800
@@ -13,7 +13,9 @@
// 2. Restore current thread pointer to kr6
// 3. Move stack ptr 16 bytes to conform to C calling convention
//
+//
#include <linux/config.h>
+#include <linux/threads.h>
#include <asm/asmmacro.h>
#include <asm/pgtable.h>
@@ -22,20 +24,15 @@
#include <asm/mca.h>
/*
- * When we get an machine check, the kernel stack pointer is no longer
+ * When we get a machine check, the kernel stack pointer is no longer
* valid, so we need to set a new stack pointer.
*/
#define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */
/*
- * Needed for ia64_sal call
- */
-#define SAL_GET_STATE_INFO 0x01000001
-
-/*
* Needed for return context to SAL
*/
-#define IA64_MCA_SAME_CONTEXT 0x0
+#define IA64_MCA_SAME_CONTEXT 0
#define IA64_MCA_COLD_BOOT -2
#include "minstate.h"
@@ -71,19 +68,36 @@
* returns ptr to SAL rtn save loc in _tmp
*/
#define OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(_tmp) \
- LOAD_PHYSICAL(p6, _tmp, ia64_sal_to_os_handoff_state);; \
- LOAD_PHYSICAL(p7, _tmp, ia64_os_to_sal_handoff_state);; \
-(p6) movl r8=IA64_MCA_COLD_BOOT; \
-(p6) movl r10=IA64_MCA_SAME_CONTEXT; \
-(p6) add _tmp=0x18,_tmp;; \
-(p6) ld8 r9=[_tmp],0x10; \
-(p6) mov r22=r0;; \
-(p7) ld8 r8=[_tmp],0x08;; \
-(p7) ld8 r9=[_tmp],0x08;; \
-(p7) ld8 r10=[_tmp],0x08;; \
-(p7) ld8 r22=[_tmp],0x08;;
+ movl _tmp=ia64_os_to_sal_handoff_state;; \
+ DATA_VA_TO_PA(_tmp);; \
+ ld8 r8=[_tmp],0x08;; \
+ ld8 r9=[_tmp],0x08;; \
+ ld8 r10=[_tmp],0x08;; \
+ ld8 r22=[_tmp],0x08;;
// now _tmp is pointing to SAL rtn save location
+/*
+ * COLD_BOOT_HANDOFF_STATE() sets ia64_mca_os_to_sal_state
+ * imots_os_status=IA64_MCA_COLD_BOOT
+ * imots_sal_gp=SAL GP
+ * imots_context=IA64_MCA_SAME_CONTEXT
+ * imots_new_min_state=Min state save area pointer
+ * imots_sal_check_ra=Return address to location within SAL_CHECK
+ *
+ */
+#define COLD_BOOT_HANDOFF_STATE(sal_to_os_handoff,os_to_sal_handoff,tmp)\
+ movl tmp=IA64_MCA_COLD_BOOT; \
+ movl sal_to_os_handoff=__pa(ia64_sal_to_os_handoff_state); \
+ movl os_to_sal_handoff=__pa(ia64_os_to_sal_handoff_state);; \
+ st8 [os_to_sal_handoff]=tmp,8;; \
+ ld8 tmp=[sal_to_os_handoff],48;; \
+ st8 [os_to_sal_handoff]=tmp,8;; \
+ movl tmp=IA64_MCA_SAME_CONTEXT;; \
+ st8 [os_to_sal_handoff]=tmp,8;; \
+ ld8 tmp=[sal_to_os_handoff],-8;; \
+ st8 [os_to_sal_handoff]=tmp,8;; \
+ ld8 tmp=[sal_to_os_handoff];; \
+ st8 [os_to_sal_handoff]=tmp;;
.global ia64_os_mca_dispatch
.global ia64_os_mca_dispatch_end
@@ -94,20 +108,21 @@
.global ia64_mca_stackframe
.global ia64_mca_bspstore
.global ia64_init_stack
- .global ia64_mca_sal_data_area
- .global ia64_tlb_functional
.text
.align 16
ia64_os_mca_dispatch:
-#if defined(MCA_TEST)
- // Pretend that we are in interrupt context
- mov r2=psr
- dep r2=0, r2, PSR_IC, 2;
- mov psr.l = r2
-#endif /* #if defined(MCA_TEST) */
+ // Serialize all MCA processing
+// movl r2=ia64_mca_serialize
+ mov r3=1;;
+// DATA_VA_TO_PA(r2);;
+ LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);;
+ia64_os_mca_spin:
+ xchg8 r4=[r2],r3;;
+ cmp.ne p6,p0=r4,r0
+(p6) br ia64_os_mca_spin
// Save the SAL to OS MCA handoff state as defined
// by SAL SPEC 3.0
@@ -124,6 +139,191 @@
ia64_os_mca_done_dump:
+// movl r16=__pa(ia64_sal_to_os_handoff_state)+56
+ LOAD_PHYSICAL(p0,r16,ia64_sal_to_os_handoff_state+56)
+ ;;
+ ld8 r18=[r16] // Get processor state parameter on existing PALE_CHECK.
+ ;;
+ tbit.nz p6,p7=r18,60
+(p7) br.spnt done_tlb_purge_and_reload
+
+ // The following code purges TC and TR entries. Then reload all TC entries.
+ // Purge percpu data TC entries.
+begin_tlb_purge_and_reload:
+ mov r16=cr.lid
+// movl r17=__pa(ia64_mca_tlb_list) // Physical address of ia64_mca_tlb_list
+ LOAD_PHYSICAL(p0,r17,ia64_mca_tlb_list) // Physical address of ia64_mca_tlb_list
+ mov r19=0
+ mov r20=NR_CPUS
+ ;;
+1: cmp.eq p6,p7=r19,r20
+(p6) br.spnt.few err
+ ld8 r18=[r17],IA64_MCA_TLB_INFO_SIZE
+ ;;
+ add r19=1,r19
+ cmp.eq p6,p7=r18,r16
+(p7) br.sptk.few 1b
+ ;;
+ adds r17=-IA64_MCA_TLB_INFO_SIZE,r17
+ ;;
+ mov r23=r17 // save current ia64_mca_percpu_info addr pointer.
+ adds r17=16,r17
+ ;;
+ .global aegl
+aegl:
+ ld8 r18=[r17],8 // r18=ptce_base
+ ;;
+ ld4 r19=[r17],4 // r19=ptce_count[0]
+ ;;
+ ld4 r20=[r17],4 // r20=ptce_count[1]
+ ;;
+ ld4 r21=[r17],4 // r21=ptce_stride[0]
+ mov r24=0
+ ;;
+ ld4 r22=[r17],4 // r22=ptce_stride[1]
+ adds r20=-1,r20
+ ;;
+2:
+ cmp.ltu p6,p7=r24,r19
+(p7) br.cond.dpnt.few 4f
+ mov ar.lc=r20
+3:
+ ptc.e r18
+ ;;
+ add r18=r22,r18
+ br.cloop.sptk.few 3b
+ ;;
+ add r18=r21,r18
+ add r24=1,r24
+ ;;
+ br.sptk.few 2b
+4:
+ srlz.i // srlz.i implies srlz.d
+ ;;
+
+ // Now purge addresses formerly mapped by TR registers
+ // 1. Purge ITR&DTR for kernel.
+ movl r16=KERNEL_START
+ mov r18=KERNEL_TR_PAGE_SHIFT<<2
+ ;;
+ ptr.i r16, r18
+ ptr.d r16, r18
+ ;;
+ srlz.i
+ ;;
+ srlz.d
+ ;;
+ // 2. Purge DTR for PERCPU data.
+ movl r16=PERCPU_ADDR
+ mov r18=PERCPU_PAGE_SHIFT<<2
+ ;;
+ ptr.d r16,r18
+ ;;
+ srlz.d
+ ;;
+ // 3. Purge ITR for PAL code.
+ adds r17=48,r23
+ ;;
+ ld8 r16=[r17]
+ mov r18=IA64_GRANULE_SHIFT<<2
+ ;;
+ ptr.i r16,r18
+ ;;
+ srlz.i
+ ;;
+ // 4. Purge DTR for stack.
+ mov r16=IA64_KR(CURRENT_STACK)
+ ;;
+ shl r16=r16,IA64_GRANULE_SHIFT
+ movl r19=PAGE_OFFSET
+ ;;
+ add r16=r19,r16
+ mov r18=IA64_GRANULE_SHIFT<<2
+ ;;
+ ptr.d r16,r18
+ ;;
+ srlz.i
+ ;;
+ // Finally reload the TR registers.
+ // 1. Reload DTR/ITR registers for kernel.
+ mov r18=KERNEL_TR_PAGE_SHIFT<<2
+ movl r17=KERNEL_START
+ ;;
+ mov cr.itir=r18
+ mov cr.ifa=r17
+ mov r16=IA64_TR_KERNEL
+ mov r19=ip
+ movl r18=PAGE_KERNEL
+ ;;
+ dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT
+ ;;
+ or r18=r17,r18
+ ;;
+ itr.i itr[r16]=r18
+ ;;
+ itr.d dtr[r16]=r18
+ ;;
+ srlz.i
+ srlz.d
+ ;;
+ // 2. Reload DTR register for PERCPU data.
+ adds r17=8,r23
+ movl r16=PERCPU_ADDR // vaddr
+ movl r18=PERCPU_PAGE_SHIFT<<2
+ ;;
+ mov cr.itir=r18
+ mov cr.ifa=r16
+ ;;
+ ld8 r18=[r17] // pte
+ mov r16=IA64_TR_PERCPU_DATA;
+ ;;
+ itr.d dtr[r16]=r18
+ ;;
+ srlz.d
+ ;;
+ // 3. Reload ITR for PAL code.
+ adds r17=40,r23
+ ;;
+ ld8 r18=[r17],8 // pte
+ ;;
+ ld8 r16=[r17] // vaddr
+ mov r19=IA64_GRANULE_SHIFT<<2
+ ;;
+ mov cr.itir=r19
+ mov cr.ifa=r16
+ mov r20=IA64_TR_PALCODE
+ ;;
+ itr.i itr[r20]=r18
+ ;;
+ srlz.i
+ ;;
+ // 4. Reload DTR for stack.
+ mov r16=IA64_KR(CURRENT_STACK)
+ ;;
+ shl r16=r16,IA64_GRANULE_SHIFT
+ movl r19=PAGE_OFFSET
+ ;;
+ add r18=r19,r16
+ movl r20=PAGE_KERNEL
+ ;;
+ add r16=r20,r16
+ mov r19=IA64_GRANULE_SHIFT<<2
+ ;;
+ mov cr.itir=r19
+ mov cr.ifa=r18
+ mov r20=IA64_TR_CURRENT_STACK
+ ;;
+ itr.d dtr[r20]=r16
+ ;;
+ srlz.d
+ ;;
+ br.sptk.many done_tlb_purge_and_reload
+err:
+ COLD_BOOT_HANDOFF_STATE(r20,r21,r22)
+ br.sptk.many ia64_os_mca_done_restore
+
+done_tlb_purge_and_reload:
+
// Setup new stack frame for OS_MCA handling
movl r2=ia64_mca_bspstore;; // local bspstore area location in r2
DATA_VA_TO_PA(r2);;
@@ -137,17 +337,11 @@
// (C calling convention)
DATA_VA_TO_PA(r12);;
- // Check to see if the MCA resulted from a TLB error
-begin_tlb_error_check:
- br ia64_os_mca_tlb_error_check;;
-
-done_tlb_error_check:
-
- // If TLB is functional, enter virtual mode from physical mode
+ // Enter virtual mode from physical mode
VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4)
ia64_os_mca_virtual_begin:
- // call our handler
+ // Call virtual mode handler
movl r2=ia64_mca_ucmc_handler;;
mov b6=r2;;
br.call.sptk.many b0=b6;;
@@ -156,13 +350,6 @@
PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4)
ia64_os_mca_virtual_end:
-#if defined(MCA_TEST)
- // Pretend that we are in interrupt context
- mov r2=psr;;
- dep r2=0, r2, PSR_IC, 2;;
- mov psr.l = r2;;
-#endif /* #if defined(MCA_TEST) */
-
// restore the original stack frame here
movl r2=ia64_mca_stackframe // restore stack frame from memory at r2
;;
@@ -178,14 +365,16 @@
br ia64_os_mca_proc_state_restore;;
ia64_os_mca_done_restore:
- movl r3=ia64_tlb_functional;;
- DATA_VA_TO_PA(r3);;
- ld8 r3=[r3];;
- cmp.eq p6,p7=r0,r3;;
OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(r2);;
// branch back to SALE_CHECK
ld8 r3=[r2];;
mov b0=r3;; // SAL_CHECK return address
+
+ // release lock
+ movl r3=ia64_mca_serialize;;
+ DATA_VA_TO_PA(r3);;
+ st8.rel [r3]=r0
+
br b0
;;
ia64_os_mca_dispatch_end:
@@ -205,8 +394,9 @@
ia64_os_mca_proc_state_dump:
// Save bank 1 GRs 16-31 which will be used by c-language code when we switch
// to virtual addressing mode.
- movl r2=ia64_mca_proc_state_dump;; // Os state dump area
- DATA_VA_TO_PA(r2) // convert to to physical address
+// movl r2=ia64_mca_proc_state_dump;; // Os state dump area
+// DATA_VA_TO_PA(r2) // convert to to physical address
+ LOAD_PHYSICAL(p0,r2,ia64_mca_proc_state_dump)// convert OS state dump area to physical address
// save ar.NaT
mov r5=ar.unat // ar.unat
@@ -658,79 +848,6 @@
//EndStub//////////////////////////////////////////////////////////////////////
-//++
-// Name:
-// ia64_os_mca_tlb_error_check()
-//
-// Stub Description:
-//
-// This stub checks to see if the MCA resulted from a TLB error
-//
-//--
-
-ia64_os_mca_tlb_error_check:
-
- // Retrieve sal data structure for uncorrected MCA
-
- // Make the ia64_sal_get_state_info() call
- movl r4=ia64_mca_sal_data_area;;
- movl r7=ia64_sal;;
- mov r6=r1 // save gp
- DATA_VA_TO_PA(r4) // convert to physical address
- DATA_VA_TO_PA(r7);; // convert to physical address
- ld8 r7=[r7] // get addr of pdesc from ia64_sal
- movl r3=SAL_GET_STATE_INFO;;
- DATA_VA_TO_PA(r7);; // convert to physical address
- ld8 r8=[r7],8;; // get pdesc function pointer
- dep r8=0,r8,61,3;; // convert SAL VA to PA
- ld8 r1=[r7];; // set new (ia64_sal) gp
- dep r1=0,r1,61,3;; // convert SAL VA to PA
- mov b6=r8
-
- alloc r5=ar.pfs,8,0,8,0;; // allocate stack frame for SAL call
- mov out0=r3 // which SAL proc to call
- mov out1=r0 // error type = MCA
- mov out2=r0 // null arg
- mov out3=r4 // data copy area
- mov out4=r0 // null arg
- mov out5=r0 // null arg
- mov out6=r0 // null arg
- mov out7=r0;; // null arg
-
- br.call.sptk.few b0=b6;;
-
- mov r1=r6 // restore gp
- mov ar.pfs=r5;; // restore ar.pfs
-
- movl r6=ia64_tlb_functional;;
- DATA_VA_TO_PA(r6) // needed later
-
- cmp.eq p6,p7=r0,r8;; // check SAL call return address
-(p7) st8 [r6]=r0 // clear tlb_functional flag
-(p7) br tlb_failure // error; return to SAL
-
- // examine processor error log for type of error
- add r4=40+24,r4;; // parse past record header (length=40)
- // and section header (length=24)
- ld4 r4=[r4] // get valid field of processor log
- mov r5=0xf00;;
- and r5=r4,r5;; // read bits 8-11 of valid field
- // to determine if we have a TLB error
- movl r3=0x1
- cmp.eq p6,p7=r0,r5;;
- // if no TLB failure, set tlb_functional flag
-(p6) st8 [r6]=r3
- // else clear flag
-(p7) st8 [r6]=r0
-
- // if no TLB failure, continue with normal virtual mode logging
-(p6) br done_tlb_error_check
- // else no point in entering virtual mode for logging
-tlb_failure:
- br ia64_os_mca_virtual_end
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
// ok, the issue here is that we need to save state information so
// it can be useable by the kernel debugger and show regs routines.
diff -ru linux-2.6.0/arch/ia64/kernel/mca.c tlbfix/arch/ia64/kernel/mca.c
--- linux-2.6.0/arch/ia64/kernel/mca.c 2003-12-18 09:18:53.000000000 -0800
+++ tlbfix/arch/ia64/kernel/mca.c 2003-12-18 09:47:18.000000000 -0800
@@ -78,9 +78,8 @@
u64 ia64_mca_stackframe[32];
u64 ia64_mca_bspstore[1024];
u64 ia64_init_stack[KERNEL_STACK_SIZE/8] __attribute__((aligned(16)));
-u64 ia64_mca_sal_data_area[1356];
-u64 ia64_tlb_functional;
u64 ia64_os_mca_recovery_successful;
+u64 ia64_mca_serialize;
static void ia64_mca_wakeup_ipi_wait(void);
static void ia64_mca_wakeup(int cpu);
static void ia64_mca_wakeup_all(void);
@@ -90,6 +89,8 @@
static u64 ia64_log_get(int sal_info_type, u8 **buffer);
extern struct hw_interrupt_type irq_type_iosapic_level;
+struct ia64_mca_tlb_info ia64_mca_tlb_list[NR_CPUS];
+
static struct irqaction cmci_irqaction = {
.handler = ia64_mca_cmc_int_handler,
.flags = SA_INTERRUPT,
@@ -944,6 +945,9 @@
void
ia64_return_to_sal_check(void)
{
+ pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
+ &ia64_sal_to_os_handoff_state.proc_state_param;
+
/* Copy over some relevant stuff from the sal_to_os_mca_handoff
* so that it can be used at the time of os_mca_to_sal_handoff
*/
@@ -953,14 +957,22 @@
ia64_os_to_sal_handoff_state.imots_sal_check_ra =
ia64_sal_to_os_handoff_state.imsto_sal_check_ra;
- /* Cold Boot for uncorrectable MCA */
- ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT;
+ /*
+ * Did we correct the error? At the moment the only error that
+ * we fix is a TLB error, if any other kind of error occurred
+ * we must reboot.
+ */
+ if (psp->cc == 1 && psp->bc == 1 && psp->rc == 1 && psp->uc == 1)
+ ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT;
+ else
+ ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED;
/* Default = tell SAL to return to same context */
ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT;
ia64_os_to_sal_handoff_state.imots_new_min_state =
(u64 *)ia64_sal_to_os_handoff_state.pal_min_state;
+
}
/*
@@ -1338,8 +1350,8 @@
void
ia64_log_prt_guid (efi_guid_t *p_guid, prfunc_t prfunc)
{
- char out[40];
- printk(KERN_DEBUG "GUID = %s\n", efi_guid_unparse(p_guid, out));
+ //char out[40];
+ //printk(KERN_DEBUG "GUID = %s\n", efi_guid_unparse(p_guid, out));
}
static void
diff -ru linux-2.6.0/arch/ia64/mm/init.c tlbfix/arch/ia64/mm/init.c
--- linux-2.6.0/arch/ia64/mm/init.c 2003-12-17 18:58:48.000000000 -0800
+++ tlbfix/arch/ia64/mm/init.c 2003-12-18 09:47:18.000000000 -0800
@@ -34,6 +34,7 @@
#include <asm/tlb.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <asm/mca.h>
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -277,6 +278,10 @@
{
unsigned long psr, pta, impl_va_bits;
extern void __init tlb_init (void);
+#ifdef CONFIG_IA64_MCA
+ int cpu;
+#endif
+
#ifdef CONFIG_DISABLE_VHPT
# define VHPT_ENABLE_BIT 0
#else
@@ -335,6 +340,23 @@
ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
ia64_tlb_init();
+
+#ifdef CONFIG_IA64_MCA
+ cpu = smp_processor_id();
+
+ /* mca handler uses cr.lid as key to pick the right entry */
+ ia64_mca_tlb_list[cpu].cr_lid = ia64_getreg(_IA64_REG_CR_LID);
+
+ /* insert this percpu data information into our list for MCA recovery purposes */
+ ia64_mca_tlb_list[cpu].percpu_paddr=pte_val(mk_pte_phys(__pa(my_cpu_data), PAGE_KERNEL));
+ /* Also save per-cpu tlb flush recipe for use in physical mode mca handler */
+ ia64_mca_tlb_list[cpu].ptce_base=local_cpu_data->ptce_base;
+ ia64_mca_tlb_list[cpu].ptce_count[0]=local_cpu_data->ptce_count[0];
+ ia64_mca_tlb_list[cpu].ptce_count[1]=local_cpu_data->ptce_count[1];
+ ia64_mca_tlb_list[cpu].ptce_stride[0]=local_cpu_data->ptce_stride[0];
+ ia64_mca_tlb_list[cpu].ptce_stride[1]=local_cpu_data->ptce_stride[1];
+#endif
+
}
#ifdef CONFIG_VIRTUAL_MEM_MAP
diff -ru linux-2.6.0/include/asm-ia64/mca.h tlbfix/include/asm-ia64/mca.h
--- linux-2.6.0/include/asm-ia64/mca.h 2003-12-18 09:18:53.000000000 -0800
+++ tlbfix/include/asm-ia64/mca.h 2003-12-18 09:47:18.000000000 -0800
@@ -18,6 +18,7 @@
#include <asm/param.h>
#include <asm/sal.h>
#include <asm/processor.h>
+#include <asm/mca_asm.h>
/* These are the return codes from all the IA64_MCA specific interfaces */
typedef int ia64_mca_return_code_t;
@@ -61,6 +62,17 @@
IA64_MCA_RENDEZ_CHECKIN_DONE = 0x1
};
+/* the following data structure is used for TLB error recovery purposes */
+extern struct ia64_mca_tlb_info {
+ u64 cr_lid;
+ u64 percpu_paddr;
+ u64 ptce_base;
+ u32 ptce_count[2];
+ u32 ptce_stride[2];
+ u64 pal_paddr;
+ u64 pal_base;
+} ia64_mca_tlb_list[NR_CPUS];
+
/* Information maintained by the MC infrastructure */
typedef struct ia64_mc_info_s {
u64 imi_mca_handler;
diff -ru linux-2.6.0/include/asm-ia64/pgtable.h tlbfix/include/asm-ia64/pgtable.h
--- linux-2.6.0/include/asm-ia64/pgtable.h 2003-12-17 18:58:39.000000000 -0800
+++ tlbfix/include/asm-ia64/pgtable.h 2003-12-18 09:47:18.000000000 -0800
@@ -230,6 +230,10 @@
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+/* This takes a physical page address that is used by the remapping functions */
+#define mk_pte_phys(physpage, pgprot) \
+({ pte_t __pte; pte_val(__pte) = physpage + pgprot_val(pgprot); __pte; })
+
#define pte_modify(_pte, newprot) \
(__pte((pte_val(_pte) & ~_PAGE_CHG_MASK) | (pgprot_val(newprot) & _PAGE_CHG_MASK)))
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] 2.6.0 MCA TLB error recovery
@ 2003-12-19 5:06 Keith Owens
2003-12-19 18:07 ` Luck, Tony
` (6 more replies)
0 siblings, 7 replies; 9+ messages in thread
From: Keith Owens @ 2003-12-19 5:06 UTC (permalink / raw)
To: linux-ia64
On Thu, 18 Dec 2003 15:37:09 -0800,
"Luck, Tony" <tony.luck@intel.com> wrote:
>It looks like salinfo_log_wakeup() is called right before
>ia64_log_print() ... so I'm not sure why the salinfo_decode
>daemon kept on snoozing. Keith: am I missing something obvious?
From the top of salinfo_log_wakeup()
* ... MCA and INIT events are
* not irq safe, do not call any routines that use spinlocks, they may deadlock.
MCA and INIT records are noted but it is not safe to call up() from
those interrupts, so the daemon cannot be woken. This has not been a
problem in the past because MCA and INIT were not recoverable, the
records are picked up on the next boot. Once my patches are in David's
tree, I will update salinfo to periodically check for any MCA or INIT
records and kick the daemon. There was no point before, I had no way
of testing this case.
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
@ 2003-12-19 18:07 ` Luck, Tony
2003-12-19 23:22 ` David Mosberger
` (5 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Luck, Tony @ 2003-12-19 18:07 UTC (permalink / raw)
To: linux-ia64
> >It looks like salinfo_log_wakeup() is called right before
> >ia64_log_print() ... so I'm not sure why the salinfo_decode
> >daemon kept on snoozing. Keith: am I missing something obvious?
>
> From the top of salinfo_log_wakeup()
>
> * ... MCA and INIT events are
> * not irq safe, do not call any routines that use spinlocks,
> they may deadlock.
Okay ... that was pretty danged obvious! Thanks for pointing
it out so gently :-)
> MCA and INIT records are noted but it is not safe to call up() from
> those interrupts, so the daemon cannot be woken. This has not been a
> problem in the past because MCA and INIT were not recoverable, the
> records are picked up on the next boot. Once my patches are
> in David's
> tree, I will update salinfo to periodically check for any MCA or INIT
> records and kick the daemon. There was no point before, I had no way
> of testing this case.
Sounds good. Salinfo-0.4 is beautiful by the way. Getting all the
bits decoded from processor state parameter and ipsr/xpsr etc. is great.
-Tony
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
2003-12-19 18:07 ` Luck, Tony
@ 2003-12-19 23:22 ` David Mosberger
2003-12-20 0:03 ` Luck, Tony
` (4 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: David Mosberger @ 2003-12-19 23:22 UTC (permalink / raw)
To: linux-ia64
I didn't keep the old patches around. Could you and Keith send me a
consistent set of patches? That would help me.
Thanks,
--david
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
2003-12-19 18:07 ` Luck, Tony
2003-12-19 23:22 ` David Mosberger
@ 2003-12-20 0:03 ` Luck, Tony
2003-12-20 0:04 ` Luck, Tony
` (3 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Luck, Tony @ 2003-12-20 0:03 UTC (permalink / raw)
To: linux-ia64
> I didn't keep the old patches around. Could you and Keith send me a
> consistent set of patches? That would help me.
Here's Keith's first patch, 2nd (much smaller) is in the next e-mail:
From: linux-ia64-owner@vger.kernel.org on behalf of Keith Owens
[kaos@sgi.com]
Sent: Tuesday, November 25, 2003 12:37 AM
To: linux-ia64@vger.kernel.org
Subject: [patch] 2.6.0-test9 pal/sal/salinfo/mca
Forward port the recent changes to pal.h, sal.h, mca.h, salinfo.c and
mca.c from 2.4.23-rc2 to 2.6.0-test9.
This converts 2.6 to use salinfo instead of printing CMC/CPE/MCA/INIT
records in the kernel. It makes the two kernel versions as close
together as possible.
Index: 9.2/include/asm-ia64/sal.h
--- 9.2/include/asm-ia64/sal.h Sun, 24 Aug 2003 10:59:40 +1000 kaos (linux-2.6-test/u/c/36_sal.h 1.3 644)
+++ 9.4/include/asm-ia64/sal.h Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/u/c/36_sal.h 1.4 644)
@@ -725,14 +725,16 @@ ia64_sal_mc_rendez (void)
* Allow the OS to specify the interrupt number to be used by SAL to interrupt OS during
* the machine check rendezvous sequence as well as the mechanism to wake up the
* non-monarch processor at the end of machine check processing.
+ * Returns the complete ia64_sal_retval because some calls return more than just a status
+ * value.
*/
-static inline s64
+static inline struct ia64_sal_retval
ia64_sal_mc_set_params (u64 param_type, u64 i_or_m, u64 i_or_m_val, u64 timeout, u64 rz_always)
{
struct ia64_sal_retval isrv;
SAL_CALL(isrv, SAL_MC_SET_PARAMS, param_type, i_or_m, i_or_m_val,
timeout, rz_always, 0, 0);
- return isrv.status;
+ return isrv;
}
/* Read from PCI configuration space */
@@ -804,10 +806,12 @@ ia64_sal_update_pal (u64 param_buf, u64
extern unsigned long sal_platform_features;
+extern int (*salinfo_platform_oemdata)(const u8 *, u8 **, u64 *);
+
struct sal_ret_values {
long r8; long r9; long r10; long r11;
};
#endif /* __ASSEMBLY__ */
-#endif /* _ASM_IA64_PAL_H */
+#endif /* _ASM_IA64_SAL_H */
Index: 9.2/include/asm-ia64/pal.h
--- 9.2/include/asm-ia64/pal.h Mon, 20 Oct 2003 11:16:44 +1000 kaos (linux-2.6-test/v/c/12_pal.h 1.3 644)
+++ 9.4/include/asm-ia64/pal.h Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/v/c/12_pal.h 1.4 644)
@@ -461,23 +461,13 @@ typedef struct pal_process_state_info_s
} pal_processor_state_info_t;
typedef struct pal_cache_check_info_s {
- u64 reserved1 : 16,
- way : 5, /* Way in which the
- * error occurred
- */
- reserved2 : 1,
- mc : 1, /* Machine check corrected */
- tv : 1, /* Target address
- * structure is valid
- */
-
- wv : 1, /* Way field valid */
- op : 3, /* Type of cache
+ u64 op : 4, /* Type of cache
* operation that
* caused the machine
* check.
*/
-
+ level : 2, /* Cache level */
+ reserved1 : 2,
dl : 1, /* Failure in data part
* of cache line
*/
@@ -486,11 +476,34 @@ typedef struct pal_cache_check_info_s {
*/
dc : 1, /* Failure in dcache */
ic : 1, /* Failure in icache */
- index : 24, /* Cache line index */
- mv : 1, /* mesi valid */
mesi : 3, /* Cache line state */
- level : 4; /* Cache level */
+ mv : 1, /* mesi valid */
+ way : 5, /* Way in which the
+ * error occurred
+ */
+ wiv : 1, /* Way field valid */
+ reserved2 : 10,
+
+ index : 20, /* Cache line index */
+ reserved3 : 2,
+ is : 1, /* instruction set (1 = ia32) */
+ iv : 1, /* instruction set field valid */
+ pl : 2, /* privilege level */
+ pv : 1, /* privilege level field valid */
+ mcc : 1, /* Machine check corrected */
+ tv : 1, /* Target address
+ * structure is valid
+ */
+ rq : 1, /* Requester identifier
+ * structure is valid
+ */
+ rp : 1, /* Responder identifier
+ * structure is valid
+ */
+ pi : 1; /* Precise instruction pointer
+ * structure is valid
+ */
} pal_cache_check_info_t;
typedef struct pal_tlb_check_info_s {
@@ -498,18 +511,38 @@ typedef struct pal_tlb_check_info_s {
u64 tr_slot : 8, /* Slot# of TR where
* error occurred
*/
- reserved2 : 8,
+ trv : 1, /* tr_slot field is valid */
+ reserved1 : 1,
+ level : 2, /* TLB level where failure occurred */
+ reserved2 : 4,
dtr : 1, /* Fail in data TR */
itr : 1, /* Fail in inst TR */
dtc : 1, /* Fail in data TC */
itc : 1, /* Fail in inst. TC */
- mc : 1, /* Machine check corrected */
- reserved1 : 43;
+ op : 4, /* Cache operation */
+ reserved3 : 30,
+ is : 1, /* instruction set (1 = ia32) */
+ iv : 1, /* instruction set field valid */
+ pl : 2, /* privilege level */
+ pv : 1, /* privilege level field valid */
+ mcc : 1, /* Machine check corrected */
+ tv : 1, /* Target address
+ * structure is valid
+ */
+ rq : 1, /* Requester identifier
+ * structure is valid
+ */
+ rp : 1, /* Responder identifier
+ * structure is valid
+ */
+ pi : 1; /* Precise instruction pointer
+ * structure is valid
+ */
} pal_tlb_check_info_t;
typedef struct pal_bus_check_info_s {
- u64 size : 5, /* Xaction size*/
+ u64 size : 5, /* Xaction size */
ib : 1, /* Internal bus error */
eb : 1, /* External bus error */
cc : 1, /* Error occurred
@@ -518,22 +551,99 @@ typedef struct pal_bus_check_info_s {
*/
type : 8, /* Bus xaction type*/
sev : 5, /* Bus error severity*/
- tv : 1, /* Targ addr valid */
- rp : 1, /* Resp addr valid */
- rq : 1, /* Req addr valid */
+ hier : 2, /* Bus hierarchy level */
+ reserved1 : 1,
bsi : 8, /* Bus error status
* info
*/
- mc : 1, /* Machine check corrected */
- reserved1 : 31;
+ reserved2 : 22,
+
+ is : 1, /* instruction set (1 = ia32) */
+ iv : 1, /* instruction set field valid */
+ pl : 2, /* privilege level */
+ pv : 1, /* privilege level field valid */
+ mcc : 1, /* Machine check corrected */
+ tv : 1, /* Target address
+ * structure is valid
+ */
+ rq : 1, /* Requester identifier
+ * structure is valid
+ */
+ rp : 1, /* Responder identifier
+ * structure is valid
+ */
+ pi : 1; /* Precise instruction pointer
+ * structure is valid
+ */
} pal_bus_check_info_t;
+typedef struct pal_reg_file_check_info_s {
+ u64 id : 4, /* Register file identifier */
+ op : 4, /* Type of register
+ * operation that
+ * caused the machine
+ * check.
+ */
+ reg_num : 7, /* Register number */
+ rnv : 1, /* reg_num valid */
+ reserved2 : 38,
+
+ is : 1, /* instruction set (1 = ia32) */
+ iv : 1, /* instruction set field valid */
+ pl : 2, /* privilege level */
+ pv : 1, /* privilege level field valid */
+ mcc : 1, /* Machine check corrected */
+ reserved3 : 3,
+ pi : 1; /* Precise instruction pointer
+ * structure is valid
+ */
+} pal_reg_file_check_info_t;
+
+typedef struct pal_uarch_check_info_s {
+ u64 sid : 5, /* Structure identification */
+ level : 3, /* Level of failure */
+ array_id : 4, /* Array identification */
+ op : 4, /* Type of
+ * operation that
+ * caused the machine
+ * check.
+ */
+ way : 6, /* Way of structure */
+ wv : 1, /* way valid */
+ xv : 1, /* index valid */
+ reserved1 : 8,
+ index : 8, /* Index or set of the uarch
+ * structure that failed.
+ */
+ reserved2 : 24,
+
+ is : 1, /* instruction set (1 = ia32) */
+ iv : 1, /* instruction set field valid */
+ pl : 2, /* privilege level */
+ pv : 1, /* privilege level field valid */
+ mcc : 1, /* Machine check corrected */
+ tv : 1, /* Target address
+ * structure is valid
+ */
+ rq : 1, /* Requester identifier
+ * structure is valid
+ */
+ rp : 1, /* Responder identifier
+ * structure is valid
+ */
+ pi : 1; /* Precise instruction pointer
+ * structure is valid
+ */
+} pal_uarch_check_info_t;
+
typedef union pal_mc_error_info_u {
u64 pmei_data;
pal_processor_state_info_t pme_processor;
pal_cache_check_info_t pme_cache;
pal_tlb_check_info_t pme_tlb;
pal_bus_check_info_t pme_bus;
+ pal_reg_file_check_info_t pme_reg_file;
+ pal_uarch_check_info_t pme_uarch;
} pal_mc_error_info_t;
#define pmci_proc_unknown_check pme_processor.uc
Index: 9.2/include/asm-ia64/mca.h
--- 9.2/include/asm-ia64/mca.h Mon, 20 Oct 2003 11:16:44 +1000 kaos (linux-2.6-test/w/c/12_mca.h 1.3 644)
+++ 9.4/include/asm-ia64/mca.h Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/w/c/12_mca.h 1.4 644)
@@ -141,7 +141,6 @@ extern irqreturn_t ia64_mca_cpe_int_call
extern int ia64_log_print(int,prfunc_t);
extern void ia64_mca_cmc_vector_setup(void);
extern int ia64_mca_check_errors(void);
-extern u64 ia64_log_get(int, prfunc_t);
#define PLATFORM_CALL(fn, args) printk("Platform call TBD\n")
Index: 9.2/arch/ia64/Kconfig
--- 9.2/arch/ia64/Kconfig Tue, 21 Oct 2003 18:45:55 +1000 kaos (linux-2.6-test/F/e/0_Kconfig 1.6 644)
+++ 9.4/arch/ia64/Kconfig Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/F/e/0_Kconfig 1.7 644)
@@ -394,16 +394,6 @@ config IA64_PALINFO
To use this option, you have to ensure that the "/proc file system
support" (CONFIG_PROC_FS) is enabled, too.
-config IA64_SALINFO
- tristate "/proc/sal support"
- help
- The /proc/sal directory exports the SAL (system abstraction layer)
- feature bits, like whether the platform is subject to ITC drift. It
- is intended to be used by user programs that care about such things.
-
- To use this option, you have to ensure that the "/proc file system
- support" (CONFIG_PROC_FS) is enabled, too.
-
config EFI_VARS
tristate "/proc/efi/vars support"
help
Index: 9.2/arch/ia64/kernel/Makefile
--- 9.2/arch/ia64/kernel/Makefile Mon, 29 Sep 2003 13:13:55 +1000 kaos (linux-2.6-test/I/e/3_Makefile 1.3 644)
+++ 9.4/arch/ia64/kernel/Makefile Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/I/e/3_Makefile 1.4 644)
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.ld
obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
- semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o unwind.o
+ salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o unwind.o
obj-$(CONFIG_EFI_VARS) += efivars.o
obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o
@@ -14,7 +14,6 @@ obj-$(CONFIG_IA64_GENERIC) += acpi-ext.o
obj-$(CONFIG_IA64_HP_ZX1) += acpi-ext.o
obj-$(CONFIG_IA64_MCA) += mca.o mca_asm.o
obj-$(CONFIG_IA64_PALINFO) += palinfo.o
-obj-$(CONFIG_IA64_SALINFO) += salinfo.o
obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SMP) += smp.o smpboot.o
Index: 9.2/arch/ia64/kernel/salinfo.c
--- 9.2/arch/ia64/kernel/salinfo.c Mon, 29 Sep 2003 13:13:55 +1000 kaos (linux-2.6-test/I/e/22_salinfo.c 1.2 644)
+++ 9.4/arch/ia64/kernel/salinfo.c Tue, 25 Nov 2003 19:23:21 +1100 kaos (linux-2.6-test/I/e/22_salinfo.c 1.3 644)
@@ -3,18 +3,31 @@
*
* Creates entries in /proc/sal for various system features.
*
- * Copyright (c) 2001 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2003 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2003 Hewlett-Packard Co
+ * Bjorn Helgaas <bjorn.helgaas@hp.com>
*
- * 09/11/2003 jbarnes@sgi.com updated for 2.6
* 10/30/2001 jbarnes@sgi.com copied much of Stephane's palinfo
* code to create this file
+ * Oct 23 2003 kaos@sgi.com
+ * Replace IPI with set_cpus_allowed() to read a record from the required cpu.
+ * Redesign salinfo log processing to separate interrupt and user space
+ * contexts.
+ * Cache the record across multi-block reads from user space.
+ * Support > 64 cpus.
+ * Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module.
*/
#include <linux/types.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <asm/semaphore.h>
#include <asm/sal.h>
+#include <asm/uaccess.h>
MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
@@ -41,42 +54,511 @@ static salinfo_entry_t salinfo_entries[]
#define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries)
-/*
- * One for each feature and one more for the directory entry...
+static char *salinfo_log_name[] = {
+ "mca",
+ "init",
+ "cmc",
+ "cpe",
+};
+
+static struct proc_dir_entry *salinfo_proc_entries[
+ ARRAY_SIZE(salinfo_entries) + /* /proc/sal/bus_lock */
+ ARRAY_SIZE(salinfo_log_name) + /* /proc/sal/{mca,...} */
+ (2 * ARRAY_SIZE(salinfo_log_name)) + /* /proc/sal/mca/{event,data} */
+ 1]; /* /proc/sal */
+
+/* Some records we get ourselves, some are accessed as saved data in buffers
+ * that are owned by mca.c.
+ */
+struct salinfo_data_saved {
+ u8* buffer;
+ u64 size;
+ u64 id;
+ int cpu;
+};
+
+/* State transitions. Actions are :-
+ * Write "read <cpunum>" to the data file.
+ * Write "clear <cpunum>" to the data file.
+ * Write "oemdata <cpunum> <offset> to the data file.
+ * Read from the data file.
+ * Close the data file.
+ *
+ * Start state is NO_DATA.
+ *
+ * NO_DATA
+ * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "oemdata <cpunum> <offset> -> return -EINVAL.
+ * read data -> return EOF.
+ * close -> unchanged. Free record areas.
+ *
+ * LOG_RECORD
+ * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
+ * read data -> return the INIT/MCA/CMC/CPE record.
+ * close -> unchanged. Keep record areas.
+ *
+ * OEMDATA
+ * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ * write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
+ * read data -> return the formatted oemdata.
+ * close -> unchanged. Keep record areas.
+ *
+ * Closing the data file does not change the state. This allows shell scripts
+ * to manipulate salinfo data, each shell redirection opens the file, does one
+ * action then closes it again. The record areas are only freed at close when
+ * the state is NO_DATA.
+ */
+enum salinfo_state {
+ STATE_NO_DATA,
+ STATE_LOG_RECORD,
+ STATE_OEMDATA,
+};
+
+struct salinfo_data {
+ volatile cpumask_t cpu_event; /* which cpus have outstanding events */
+ struct semaphore sem; /* count of cpus with outstanding events (bits set in cpu_event) */
+ u8 *log_buffer;
+ u64 log_size;
+ u8 *oemdata; /* decoded oem data */
+ u64 oemdata_size;
+ int open; /* single-open to prevent races */
+ u8 type;
+ u8 saved_num; /* using a saved record? */
+ enum salinfo_state state :8; /* processing state */
+ u8 padding;
+ int cpu_check; /* next CPU to check */
+ struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */
+};
+
+static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
+
+static spinlock_t data_lock, data_saved_lock;
+
+/** salinfo_platform_oemdata - optional callback to decode oemdata from an error
+ * record.
+ * @sect_header: pointer to the start of the section to decode.
+ * @oemdata: returns vmalloc area containing the decoded output.
+ * @oemdata_size: returns length of decoded output (strlen).
+ *
+ * Description: If user space asks for oem data to be decoded by the kernel
+ * and/or prom and the platform has set salinfo_platform_oemdata to the address
+ * of a platform specific routine then call that routine. salinfo_platform_oemdata
+ * vmalloc's and formats its output area, returning the address of the text
+ * and its strlen. Returns 0 for success, -ve for error. The callback is
+ * invoked on the cpu that generated the error record.
*/
-static struct proc_dir_entry *salinfo_proc_entries[NR_SALINFO_ENTRIES + 1];
+int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size);
+
+struct salinfo_platform_oemdata_parms {
+ const u8 *efi_guid;
+ u8 **oemdata;
+ u64 *oemdata_size;
+ int ret;
+};
+
+static void
+salinfo_platform_oemdata_cpu(void *context)
+{
+ struct salinfo_platform_oemdata_parms *parms = context;
+ parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
+}
+
+static void
+shift1_data_saved (struct salinfo_data *data, int shift)
+{
+ memcpy(data->data_saved+shift, data->data_saved+shift+1,
+ (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0]));
+ memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0,
+ sizeof(data->data_saved[0]));
+}
+
+/* This routine is invoked in interrupt context. Note: mca.c enables
+ * interrupts before calling this code for CMC/CPE. MCA and INIT events are
+ * not irq safe, do not call any routines that use spinlocks, they may deadlock.
+ *
+ * The buffer passed from mca.c points to the output from ia64_log_get. This is
+ * a persistent buffer but its contents can change between the interrupt and
+ * when user space processes the record. Save the record id to identify
+ * changes.
+ */
+void
+salinfo_log_wakeup(int type, u8 *buffer, u64 size)
+{
+ struct salinfo_data *data = salinfo_data + type;
+ struct salinfo_data_saved *data_saved;
+ unsigned long flags = 0;
+ int i, irqsafe = type != SAL_INFO_TYPE_MCA && type != SAL_INFO_TYPE_INIT;
+ int saved_size = ARRAY_SIZE(data->data_saved);
+
+ BUG_ON(type >= ARRAY_SIZE(salinfo_log_name));
+
+ if (irqsafe)
+ spin_lock_irqsave(&data_saved_lock, flags);
+ for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
+ if (!data_saved->buffer)
+ break;
+ }
+ if (i == saved_size) {
+ if (!data->saved_num) {
+ shift1_data_saved(data, 0);
+ data_saved = data->data_saved + saved_size - 1;
+ } else
+ data_saved = NULL;
+ }
+ if (data_saved) {
+ data_saved->cpu = smp_processor_id();
+ data_saved->id = ((sal_log_record_header_t *)buffer)->id;
+ data_saved->size = size;
+ data_saved->buffer = buffer;
+ }
+ if (irqsafe)
+ spin_unlock_irqrestore(&data_saved_lock, flags);
+
+ if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) {
+ if (irqsafe)
+ up(&data->sem);
+ }
+}
+
+static int
+salinfo_event_open(struct inode *inode, struct file *file)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return 0;
+}
+
+static ssize_t
+salinfo_event_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct proc_dir_entry *entry = PDE(inode);
+ struct salinfo_data *data = entry->data;
+ char cmd[32];
+ size_t size;
+ int i, n, cpu = -1;
+
+retry:
+ if (down_trylock(&data->sem)) {
+ if (file->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+ if (down_interruptible(&data->sem))
+ return -ERESTARTSYS;
+ }
+
+ n = data->cpu_check;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (test_bit(n, &data->cpu_event)) {
+ cpu = n;
+ break;
+ }
+ if (++n == NR_CPUS)
+ n = 0;
+ }
+
+ if (cpu == -1)
+ goto retry;
+
+ /* events are sticky until the user says "clear" */
+ up(&data->sem);
+
+ /* for next read, start checking at next CPU */
+ data->cpu_check = cpu;
+ if (++data->cpu_check == NR_CPUS)
+ data->cpu_check = 0;
+
+ snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
+
+ size = strlen(cmd);
+ if (size > count)
+ size = count;
+ if (copy_to_user(buffer, cmd, size))
+ return -EFAULT;
+
+ return size;
+}
+
+static struct file_operations salinfo_event_fops = {
+ .open = salinfo_event_open,
+ .read = salinfo_event_read,
+};
+
+static int
+salinfo_log_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *entry = PDE(inode);
+ struct salinfo_data *data = entry->data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock(&data_lock);
+ if (data->open) {
+ spin_unlock(&data_lock);
+ return -EBUSY;
+ }
+ data->open = 1;
+ spin_unlock(&data_lock);
+
+ if (data->state == STATE_NO_DATA &&
+ !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) {
+ data->open = 0;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int
+salinfo_log_release(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *entry = PDE(inode);
+ struct salinfo_data *data = entry->data;
+
+ if (data->state == STATE_NO_DATA) {
+ vfree(data->log_buffer);
+ vfree(data->oemdata);
+ data->log_buffer = NULL;
+ data->oemdata = NULL;
+ }
+ spin_lock(&data_lock);
+ data->open = 0;
+ spin_unlock(&data_lock);
+ return 0;
+}
+
+static void
+call_on_cpu(int cpu, void (*fn)(void *), void *arg)
+{
+ cpumask_t save_cpus_allowed, new_cpus_allowed;
+ memcpy(&save_cpus_allowed, &current->cpus_allowed, sizeof(save_cpus_allowed));
+ memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed));
+ set_bit(cpu, &new_cpus_allowed);
+ set_cpus_allowed(current, new_cpus_allowed);
+ (*fn)(arg);
+ set_cpus_allowed(current, save_cpus_allowed);
+}
+
+static void
+salinfo_log_read_cpu(void *context)
+{
+ struct salinfo_data *data = context;
+ data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
+}
+
+static void
+salinfo_log_new_read(int cpu, struct salinfo_data *data)
+{
+ struct salinfo_data_saved *data_saved;
+ unsigned long flags;
+ int i;
+ int saved_size = ARRAY_SIZE(data->data_saved);
+
+ data->saved_num = 0;
+ spin_lock_irqsave(&data_saved_lock, flags);
+retry:
+ for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
+ if (data_saved->buffer && data_saved->cpu == cpu) {
+ sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer);
+ data->log_size = data_saved->size;
+ memcpy(data->log_buffer, rh, data->log_size);
+ barrier(); /* id check must not be moved */
+ if (rh->id == data_saved->id) {
+ data->saved_num = i+1;
+ break;
+ }
+ /* saved record changed by mca.c since interrupt, discard it */
+ shift1_data_saved(data, i);
+ goto retry;
+ }
+ }
+ spin_unlock_irqrestore(&data_saved_lock, flags);
+
+ if (!data->saved_num)
+ call_on_cpu(cpu, salinfo_log_read_cpu, data);
+ data->state = data->log_size ? STATE_LOG_RECORD : STATE_NO_DATA;
+}
+
+static ssize_t
+salinfo_log_read(struct file *file, char *buffer, size_t count, loff_t *ppos)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct proc_dir_entry *entry = PDE(inode);
+ struct salinfo_data *data = entry->data;
+ void *saldata;
+ size_t size;
+ u8 *buf;
+ u64 bufsize;
+
+ if (data->state == STATE_LOG_RECORD) {
+ buf = data->log_buffer;
+ bufsize = data->log_size;
+ } else if (data->state == STATE_OEMDATA) {
+ buf = data->oemdata;
+ bufsize = data->oemdata_size;
+ } else {
+ buf = NULL;
+ bufsize = 0;
+ }
+ if (*ppos >= bufsize)
+ return 0;
+
+ saldata = buf + file->f_pos;
+ size = bufsize - file->f_pos;
+ if (size > count)
+ size = count;
+ if (copy_to_user(buffer, saldata, size))
+ return -EFAULT;
+
+ *ppos += size;
+ return size;
+}
+
+static void
+salinfo_log_clear_cpu(void *context)
+{
+ struct salinfo_data *data = context;
+ ia64_sal_clear_state_info(data->type);
+}
+
+static int
+salinfo_log_clear(struct salinfo_data *data, int cpu)
+{
+ data->state = STATE_NO_DATA;
+ if (!test_bit(cpu, &data->cpu_event))
+ return 0;
+ down(&data->sem);
+ clear_bit(cpu, &data->cpu_event);
+ if (data->saved_num) {
+ unsigned long flags;
+ spin_lock_irqsave(&data_saved_lock, flags);
+ shift1_data_saved(data, data->saved_num - 1 );
+ data->saved_num = 0;
+ spin_unlock_irqrestore(&data_saved_lock, flags);
+ }
+ call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+
+ /* clearing a record may make a new record visible */
+ salinfo_log_new_read(cpu, data);
+ if (data->state == STATE_LOG_RECORD &&
+ !test_and_set_bit(cpu, &data->cpu_event))
+ up(&data->sem);
+ return 0;
+}
+
+static ssize_t
+salinfo_log_write(struct file *file, const char *buffer, size_t count, loff_t *ppos)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct proc_dir_entry *entry = PDE(inode);
+ struct salinfo_data *data = entry->data;
+ char cmd[32];
+ size_t size;
+ u32 offset;
+ int cpu;
+
+ size = sizeof(cmd);
+ if (count < size)
+ size = count;
+ if (copy_from_user(cmd, buffer, size))
+ return -EFAULT;
+
+ if (sscanf(cmd, "read %d", &cpu) == 1) {
+ salinfo_log_new_read(cpu, data);
+ } else if (sscanf(cmd, "clear %d", &cpu) == 1) {
+ int ret;
+ if ((ret = salinfo_log_clear(data, cpu)))
+ count = ret;
+ } else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) {
+ if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA)
+ return -EINVAL;
+ if (offset > data->log_size - sizeof(efi_guid_t))
+ return -EINVAL;
+ data->state = STATE_OEMDATA;
+ if (salinfo_platform_oemdata) {
+ struct salinfo_platform_oemdata_parms parms = {
+ .efi_guid = data->log_buffer + offset,
+ .oemdata = &data->oemdata,
+ .oemdata_size = &data->oemdata_size
+ };
+ call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
+ if (parms.ret)
+ count = parms.ret;
+ } else
+ data->oemdata_size = 0;
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct file_operations salinfo_data_fops = {
+ .open = salinfo_log_open,
+ .release = salinfo_log_release,
+ .read = salinfo_log_read,
+ .write = salinfo_log_write,
+};
static int __init
salinfo_init(void)
{
struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
- int i;
+ struct proc_dir_entry *dir, *entry;
+ struct salinfo_data *data;
+ int i, j, online;
salinfo_dir = proc_mkdir("sal", NULL);
+ if (!salinfo_dir)
+ return 0;
for (i=0; i < NR_SALINFO_ENTRIES; i++) {
/* pass the feature bit in question as misc data */
- *sdir = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
+ *sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
salinfo_read, (void *)salinfo_entries[i].feature);
- if (*sdir)
- (*sdir)->owner = THIS_MODULE;
- sdir++;
}
- *sdir++ = salinfo_dir;
-
- return 0;
-}
-static void __exit
-salinfo_exit(void)
-{
- int i = 0;
+ for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
+ data = salinfo_data + i;
+ data->type = i;
+ sema_init(&data->sem, 0);
+ dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
+ if (!dir)
+ continue;
+
+ entry = create_proc_entry("event", S_IRUSR, dir);
+ if (!entry)
+ continue;
+ entry->data = data;
+ entry->proc_fops = &salinfo_event_fops;
+ *sdir++ = entry;
+
+ entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir);
+ if (!entry)
+ continue;
+ entry->data = data;
+ entry->proc_fops = &salinfo_data_fops;
+ *sdir++ = entry;
+
+ /* we missed any events before now */
+ online = 0;
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j)) {
+ set_bit(j, &data->cpu_event);
+ ++online;
+ }
+ sema_init(&data->sem, online);
- for (i = 0; i < NR_SALINFO_ENTRIES ; i++) {
- if (salinfo_proc_entries[i])
- remove_proc_entry (salinfo_proc_entries[i]->name, NULL);
+ *sdir++ = dir;
}
+
+ *sdir++ = salinfo_dir;
+
+ return 0;
}
/*
@@ -102,4 +584,3 @@ salinfo_read(char *page, char **start, o
}
module_init(salinfo_init);
-module_exit(salinfo_exit);
Index: 9.2/arch/ia64/kernel/mca.c
--- 9.2/arch/ia64/kernel/mca.c Mon, 20 Oct 2003 11:16:44 +1000 kaos (linux-2.6-test/I/e/30_mca.c 1.2.1.4 644)
+++ 9.4/arch/ia64/kernel/mca.c Tue, 25 Nov 2003 19:30:04 +1100 kaos (linux-2.6-test/I/e/30_mca.c 1.2.1.6 644)
@@ -87,6 +87,7 @@ static void ia64_mca_wakeup_all(void);
static void ia64_log_init(int);
extern void ia64_monarch_init_handler (void);
extern void ia64_slave_init_handler (void);
+static u64 ia64_log_get(int sal_info_type, u8 **buffer);
extern struct hw_interrupt_type irq_type_iosapic_level;
static struct irqaction cmci_irqaction = {
@@ -149,12 +150,14 @@ static int cmc_polling_enabled = 1;
*/
static int cpe_poll_enabled = 1;
+extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size);
+
/*
* ia64_mca_log_sal_error_record
*
- * This function retrieves a specified error record type from SAL, sends it to
- * the system log, and notifies SALs to clear the record from its non-volatile
- * memory.
+ * This function retrieves a specified error record type from SAL,
+ * wakes up any processes waiting for error records, and sends it to
+ * the system log.
*
* Inputs : sal_info_type (Type of error record MCA/CMC/CPE/INIT)
* Outputs : platform error status
@@ -162,11 +165,13 @@ static int cpe_poll_enabled = 1;
int
ia64_mca_log_sal_error_record(int sal_info_type, int called_from_init)
{
- int platform_err = 0;
+ u8 *buffer;
+ u64 size;
+ int platform_err;
- /* Get the MCA error record */
- if (!ia64_log_get(sal_info_type, (prfunc_t)printk))
- return platform_err; /* no record retrieved */
+ size = ia64_log_get(sal_info_type, &buffer);
+ if (!size)
+ return 0;
/* TODO:
* 1. analyze error logs to determine recoverability
@@ -174,10 +179,10 @@ ia64_mca_log_sal_error_record(int sal_in
* 3. set ia64_os_mca_recovery_successful flag, if applicable
*/
+ salinfo_log_wakeup(sal_info_type, buffer, size);
platform_err = ia64_log_print(sal_info_type, (prfunc_t)printk);
- /* temporary: only clear SAL logs on hardware-corrected errors
- or if we're logging an error after an MCA-initiated reboot */
- if ((sal_info_type > 1) || (called_from_init))
+ /* Clear logs from corrected errors in case there's no user-level logger */
+ if (sal_info_type == SAL_INFO_TYPE_CPE || sal_info_type == SAL_INFO_TYPE_CMC)
ia64_sal_clear_state_info(sal_info_type);
return platform_err;
@@ -450,7 +455,10 @@ static void
ia64_mca_register_cpev (int cpev)
{
/* Register the CPE interrupt vector with SAL */
- if (ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0)) {
+ struct ia64_sal_retval isrv;
+
+ isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0);
+ if (isrv.status) {
printk(KERN_ERR "ia64_mca_platform_init: failed to register Corrected "
"Platform Error interrupt vector with SAL.\n");
return;
@@ -629,6 +637,8 @@ ia64_mca_init(void)
ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch;
int i;
s64 rc;
+ struct ia64_sal_retval isrv;
+ u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */
IA64_MCA_DEBUG("ia64_mca_init: begin\n");
@@ -644,23 +654,33 @@ ia64_mca_init(void)
*/
/* Register the rendezvous interrupt vector with SAL */
- if ((rc = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT,
- SAL_MC_PARAM_MECHANISM_INT,
- IA64_MCA_RENDEZ_VECTOR,
- IA64_MCA_RENDEZ_TIMEOUT,
- SAL_MC_PARAM_RZ_ALWAYS)))
- {
+ while (1) {
+ isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT,
+ SAL_MC_PARAM_MECHANISM_INT,
+ IA64_MCA_RENDEZ_VECTOR,
+ timeout,
+ SAL_MC_PARAM_RZ_ALWAYS);
+ rc = isrv.status;
+ if (rc == 0)
+ break;
+ if (rc == -2) {
+ printk(KERN_INFO "ia64_mca_init: increasing MCA rendezvous timeout from "
+ "%ld to %ld\n", timeout, isrv.v0);
+ timeout = isrv.v0;
+ continue;
+ }
printk(KERN_ERR "ia64_mca_init: Failed to register rendezvous interrupt "
"with SAL. rc = %ld\n", rc);
return;
}
/* Register the wakeup interrupt vector with SAL */
- if ((rc = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP,
- SAL_MC_PARAM_MECHANISM_INT,
- IA64_MCA_WAKEUP_VECTOR,
- 0, 0)))
- {
+ isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP,
+ SAL_MC_PARAM_MECHANISM_INT,
+ IA64_MCA_WAKEUP_VECTOR,
+ 0, 0);
+ rc = isrv.status;
+ if (rc) {
printk(KERN_ERR "ia64_mca_init: Failed to register wakeup interrupt with SAL. "
"rc = %ld\n", rc);
return;
@@ -1399,12 +1419,12 @@ ia64_log_init(int sal_info_type)
* Get the current MCA log from SAL and copy it into the OS log buffer.
*
* Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
- * prfunc (fn ptr of log output function)
* Outputs : size (total record length)
+ * *buffer (ptr to error record)
*
*/
-u64
-ia64_log_get(int sal_info_type, prfunc_t prfunc)
+static u64
+ia64_log_get(int sal_info_type, u8 **buffer)
{
sal_log_record_header_t *log_buffer;
u64 total_len = 0;
@@ -1422,6 +1442,7 @@ ia64_log_get(int sal_info_type, prfunc_t
IA64_LOG_UNLOCK(sal_info_type);
IA64_MCA_DEBUG("ia64_log_get: SAL error record type %d retrieved. "
"Record length = %ld\n", sal_info_type, total_len);
+ *buffer = (u8 *) log_buffer;
return total_len;
} else {
IA64_LOG_UNLOCK(sal_info_type);
@@ -1466,7 +1487,7 @@ ia64_log_prt_oem_data (int header_len, i
void
ia64_log_rec_header_print (sal_log_record_header_t *lh, prfunc_t prfunc)
{
- prfunc("+Err Record ID: %d SAL Rev: %2x.%02x\n", lh->id,
+ prfunc("+Err Record ID: %ld SAL Rev: %2x.%02x\n", lh->id,
lh->revision.major, lh->revision.minor);
prfunc("+Time: %02x/%02x/%02x%02x %02x:%02x:%02x Severity %d\n",
lh->timestamp.slh_month, lh->timestamp.slh_day,
@@ -1589,13 +1610,13 @@ ia64_log_cache_check_info_print (int
if (info->dl)
prfunc(" Line: Data,");
prfunc(" Operation: %s,", pal_cache_op[info->op]);
- if (info->wv)
+ if (info->wiv)
prfunc(" Way: %d,", info->way);
if (cache_check_info->valid.target_identifier)
/* Hope target address is saved in target_identifier */
if (info->tv)
prfunc(" Target Addr: 0x%lx,", target_addr);
- if (info->mc)
+ if (info->mcc)
prfunc(" MC: Corrected");
prfunc("\n");
}
@@ -1631,13 +1652,13 @@ ia64_log_tlb_check_info_print (int
prfunc(" Failure: Data Translation Cache");
if (info->itr) {
prfunc(" Failure: Instruction Translation Register");
- prfunc(" ,Slot: %d", info->tr_slot);
+ prfunc(" ,Slot: %ld", info->tr_slot);
}
if (info->dtr) {
prfunc(" Failure: Data Translation Register");
- prfunc(" ,Slot: %d", info->tr_slot);
+ prfunc(" ,Slot: %ld", info->tr_slot);
}
- if (info->mc)
+ if (info->mcc)
prfunc(" ,MC: Corrected");
prfunc("\n");
}
@@ -1683,7 +1704,7 @@ ia64_log_bus_check_info_print (int
prfunc(" ,Error: Internal");
if (info->eb)
prfunc(" ,Error: External");
- if (info->mc)
+ if (info->mcc)
prfunc(" ,MC: Corrected");
if (info->tv)
prfunc(" ,Target Address: 0x%lx", targ_addr);
@@ -1970,9 +1991,9 @@ ia64_log_plat_specific_err_info_print (s
ia64_log_prt_guid(&psei->guid, prfunc);
}
if (psei->valid.oem_data) {
- platform_plat_specific_err_print((int)psei->header.len,
- (int)sizeof(sal_log_plat_specific_err_info_t) - 1,
- &(psei->oem_data[0]), prfunc);
+ platform_plat_specific_err_print((int) psei->header.len,
+ (char *) psei->oem_data - (char *) psei,
+ &psei->oem_data[0], prfunc);
}
prfunc("\n");
}
@@ -2352,13 +2373,12 @@ ia64_log_print(int sal_info_type, prfunc
switch(sal_info_type) {
case SAL_INFO_TYPE_MCA:
- prfunc("+BEGIN HARDWARE ERROR STATE AT MCA\n");
- platform_err = ia64_log_platform_info_print(IA64_LOG_CURR_BUFFER(sal_info_type),
- prfunc);
- prfunc("+END HARDWARE ERROR STATE AT MCA\n");
+ prfunc("+CPU %d: SAL log contains MCA error record\n", smp_processor_id());
+ ia64_log_rec_header_print(IA64_LOG_CURR_BUFFER(sal_info_type), prfunc);
break;
case SAL_INFO_TYPE_INIT:
- prfunc("+MCA INIT ERROR LOG (UNIMPLEMENTED)\n");
+ prfunc("+CPU %d: SAL log contains INIT error record\n", smp_processor_id());
+ ia64_log_rec_header_print(IA64_LOG_CURR_BUFFER(sal_info_type), prfunc);
break;
case SAL_INFO_TYPE_CMC:
prfunc("+BEGIN HARDWARE ERROR STATE AT CMC\n");
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
` (2 preceding siblings ...)
2003-12-20 0:03 ` Luck, Tony
@ 2003-12-20 0:04 ` Luck, Tony
2003-12-20 0:26 ` Keith Owens
` (2 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Luck, Tony @ 2003-12-20 0:04 UTC (permalink / raw)
To: linux-ia64
> I didn't keep the old patches around. Could you and Keith send me a
> consistent set of patches? That would help me.
>
Keith's second fix:
From: linux-ia64-owner@vger.kernel.org on behalf of Keith Owens
[kaos@sgi.com]
Sent: Sunday, December 07, 2003 10:25 PM
To: Alex Williamson
Cc: linux-ia64@vger.kernel.org
Subject: Re: [patch] 2.4.23 fix deadlock in ia64_mca_cmc_int_caller
On Sun, 07 Dec 2003 22:30:00 -0700,
Alex Williamson <alex.williamson@hp.com> wrote:
> Looks good to me, only comment I have would be to tack the below
>chunk into ia64_mca_late_init().
Duh, how did I miss ia64_mca_init? Take 2.
smp_call_function() must not be called from interrupt context (can
deadlock on tasklist_lock). Use keventd to call smp_call_function().
Index: 23.5/arch/ia64/kernel/mca.c
--- 23.5/arch/ia64/kernel/mca.c Tue, 18 Nov 2003 16:26:06 +1100 kaos (linux-2.4/s/c/5_mca.c 1.1.3.2.3.1.1.1.1.2.1.1.1.1.1.5 644)
+++ 23.5(w)/arch/ia64/kernel/mca.c Mon, 08 Dec 2003 17:23:55 +1100 kaos (linux-2.4/s/c/5_mca.c 1.1.3.2.3.1.1.1.1.2.1.1.1.1.1.5 644)
@@ -36,6 +36,10 @@
* SAL 3.0 spec.
* 00/03/29 C. Fleckenstein Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
* added min save state dump, added INIT handler.
+ *
+ * 2003-12-08 Keith Owens <kaos@sgi.com>
+ * smp_call_function() must not be called from interrupt context (can
+ * deadlock on tasklist_lock). Use keventd to call smp_call_function().
*/
#include <linux/config.h>
#include <linux/types.h>
@@ -50,6 +54,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>
+#include <linux/tqueue.h>
#include <asm/delay.h>
#include <asm/machvec.h>
@@ -154,6 +159,8 @@ static int cpe_poll_enabled = 1;
extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size);
+static struct tq_struct cmc_disable_tq, cmc_enable_tq;
+
/*
* ia64_mca_log_sal_error_record
*
@@ -626,6 +633,36 @@ verify_guid (efi_guid_t *test, efi_guid_
}
/*
+ * ia64_mca_cmc_vector_disable_keventd
+ *
+ * Called via keventd (smp_call_function() is not safe in interrupt context) to
+ * disable the cmc interrupt vector.
+ *
+ * Note: needs preempt_disable() if you apply the preempt patch to 2.4.
+ */
+static void
+ia64_mca_cmc_vector_disable_keventd(void *unused)
+{
+ ia64_mca_cmc_vector_disable(NULL);
+ smp_call_function(ia64_mca_cmc_vector_disable, NULL, 1, 0);
+}
+
+/*
+ * ia64_mca_cmc_vector_enable_keventd
+ *
+ * Called via keventd (smp_call_function() is not safe in interrupt context) to
+ * enable the cmc interrupt vector.
+ *
+ * Note: needs preempt_disable() if you apply the preempt patch to 2.4.
+ */
+static void
+ia64_mca_cmc_vector_enable_keventd(void *unused)
+{
+ smp_call_function(ia64_mca_cmc_vector_enable, NULL, 1, 0);
+ ia64_mca_cmc_vector_enable(NULL);
+}
+
+/*
* ia64_mca_init
*
* Do all the system level mca specific initialization.
@@ -658,6 +695,9 @@ ia64_mca_init(void)
IA64_MCA_DEBUG("ia64_mca_init: begin\n");
+ INIT_TQUEUE(&cmc_disable_tq, ia64_mca_cmc_vector_disable_keventd, NULL);
+ INIT_TQUEUE(&cmc_enable_tq, ia64_mca_cmc_vector_enable_keventd, NULL);
+
/* initialize recovery success indicator */
ia64_os_mca_recovery_successful = 0;
@@ -1062,14 +1102,7 @@ ia64_mca_cmc_int_handler(int cmc_irq, vo
cmc_polling_enabled = 1;
spin_unlock(&cmc_history_lock);
-
- /*
- * We rely on the local_irq_enable() above so
- * that this can't deadlock.
- */
- ia64_mca_cmc_vector_disable(NULL);
-
- smp_call_function(ia64_mca_cmc_vector_disable, NULL, 1, 0);
+ schedule_task(&cmc_disable_tq);
/*
* Corrected errors will still be corrected, but
@@ -1163,19 +1196,7 @@ ia64_mca_cmc_int_caller(int cpe_irq, voi
if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
printk(KERN_WARNING "%s: Returning to interrupt driven CMC handler\n", __FUNCTION__);
-
- /*
- * The cmc interrupt handler enabled irqs, so
- * this can't deadlock.
- */
- smp_call_function(ia64_mca_cmc_vector_enable, NULL, 1, 0);
-
- /*
- * Turn off interrupts before re-enabling the
- * cmc vector locally. Make sure we get out.
- */
- local_irq_disable();
- ia64_mca_cmc_vector_enable(NULL);
+ schedule_task(&cmc_enable_tq);
cmc_polling_enabled = 0;
} else {
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
` (3 preceding siblings ...)
2003-12-20 0:04 ` Luck, Tony
@ 2003-12-20 0:26 ` Keith Owens
2003-12-20 0:32 ` David Mosberger
2003-12-20 2:56 ` Keith Owens
6 siblings, 0 replies; 9+ messages in thread
From: Keith Owens @ 2003-12-20 0:26 UTC (permalink / raw)
To: linux-ia64
On Fri, 19 Dec 2003 16:04:14 -0800,
"Luck, Tony" <tony.luck@intel.com> wrote:
>DavidM wrote
>> I didn't keep the old patches around. Could you and Keith send me a
>> consistent set of patches? That would help me.
>>
>
>Keith's second fix:
>smp_call_function() must not be called from interrupt context (can
>deadlock on tasklist_lock). Use keventd to call smp_call_function().
>Index: 23.5/arch/ia64/kernel/mca.c
Do not apply this patch to 2.6, it is for 2.4 only. 2.6 uses different
structure and function names, plus the patch needs tweaking for
preempt.
David, please apply the first mca/salinfo patch to your tree (Forward
port the recent changes to pal.h, sal.h, mca.h, salinfo.c and mca.c
from 2.4.23-rc2 to 2.6.0-test9), then I will do a 2.6 version of the
second patch over the top.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
` (4 preceding siblings ...)
2003-12-20 0:26 ` Keith Owens
@ 2003-12-20 0:32 ` David Mosberger
2003-12-20 2:56 ` Keith Owens
6 siblings, 0 replies; 9+ messages in thread
From: David Mosberger @ 2003-12-20 0:32 UTC (permalink / raw)
To: linux-ia64
>>>>> On Sat, 20 Dec 2003 11:26:12 +1100, Keith Owens <kaos@sgi.com> said:
Keith> David, please apply the first mca/salinfo patch to your tree
Keith> (Forward port the recent changes to pal.h, sal.h, mca.h,
Keith> salinfo.c and mca.c from 2.4.23-rc2 to 2.6.0-test9), then I
Keith> will do a 2.6 version of the second patch over the top.
Whoops, too late. I did see the preempt comments and figured you'll
fix that once it's in the tree. Can you go from what's in the bk
trees now (give me a minute or two, I'm just pushing things to
to-linus-2.5 and linux-ia64-2.5 will be next).
--david
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] 2.6.0 MCA TLB error recovery
2003-12-19 5:06 Keith Owens
` (5 preceding siblings ...)
2003-12-20 0:32 ` David Mosberger
@ 2003-12-20 2:56 ` Keith Owens
6 siblings, 0 replies; 9+ messages in thread
From: Keith Owens @ 2003-12-20 2:56 UTC (permalink / raw)
To: linux-ia64
On Fri, 19 Dec 2003 16:32:14 -0800,
David Mosberger <davidm@napali.hpl.hp.com> wrote:
>>>>>> On Sat, 20 Dec 2003 11:26:12 +1100, Keith Owens <kaos@sgi.com> said:
>
> Keith> David, please apply the first mca/salinfo patch to your tree
> Keith> (Forward port the recent changes to pal.h, sal.h, mca.h,
> Keith> salinfo.c and mca.c from 2.4.23-rc2 to 2.6.0-test9), then I
> Keith> will do a 2.6 version of the second patch over the top.
>
>Whoops, too late. I did see the preempt comments and figured you'll
>fix that once it's in the tree. Can you go from what's in the bk
>trees now (give me a minute or two, I'm just pushing things to
>to-linus-2.5 and linux-ia64-2.5 will be next).
Convert cmc deadlock avoidance patch from 2.4 to 2.6.
--- 2.6.0/arch/ia64/kernel/mca.c.orig Sat Dec 20 13:33:30 2003
+++ 2.6.0/arch/ia64/kernel/mca.c Sat Dec 20 13:52:10 2003
@@ -55,7 +55,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>
-#include <linux/tqueue.h>
+#include <linux/workqueue.h>
#include <asm/delay.h>
#include <asm/machvec.h>
@@ -158,8 +158,6 @@
extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size);
-static struct tq_struct cmc_disable_tq, cmc_enable_tq;
-
/*
* ia64_mca_log_sal_error_record
*
@@ -622,14 +620,11 @@
*
* Called via keventd (smp_call_function() is not safe in interrupt context) to
* disable the cmc interrupt vector.
- *
- * Note: needs preempt_disable() if you apply the preempt patch to 2.4.
*/
static void
ia64_mca_cmc_vector_disable_keventd(void *unused)
{
- ia64_mca_cmc_vector_disable(NULL);
- smp_call_function(ia64_mca_cmc_vector_disable, NULL, 1, 0);
+ on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0);
}
/*
@@ -637,14 +632,11 @@
*
* Called via keventd (smp_call_function() is not safe in interrupt context) to
* enable the cmc interrupt vector.
- *
- * Note: needs preempt_disable() if you apply the preempt patch to 2.4.
*/
static void
ia64_mca_cmc_vector_enable_keventd(void *unused)
{
- smp_call_function(ia64_mca_cmc_vector_enable, NULL, 1, 0);
- ia64_mca_cmc_vector_enable(NULL);
+ on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0);
}
/*
@@ -680,9 +672,6 @@
IA64_MCA_DEBUG("ia64_mca_init: begin\n");
- INIT_TQUEUE(&cmc_disable_tq, ia64_mca_cmc_vector_disable_keventd, NULL);
- INIT_TQUEUE(&cmc_enable_tq, ia64_mca_cmc_vector_enable_keventd, NULL);
-
/* initialize recovery success indicator */
ia64_os_mca_recovery_successful = 0;
@@ -1055,6 +1044,9 @@
ia64_return_to_sal_check();
}
+static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL);
+static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);
+
/*
* ia64_mca_cmc_int_handler
*
@@ -1101,7 +1093,7 @@
cmc_polling_enabled = 1;
spin_unlock(&cmc_history_lock);
- schedule_task(&cmc_disable_tq);
+ schedule_work(&cmc_disable_work);
/*
* Corrected errors will still be corrected, but
@@ -1196,7 +1188,7 @@
if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
printk(KERN_WARNING "%s: Returning to interrupt driven CMC handler\n", __FUNCTION__);
- schedule_task(&cmc_enable_tq);
+ schedule_work(&cmc_enable_work);
cmc_polling_enabled = 0;
} else {
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2003-12-20 2:56 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2003-12-18 23:37 [patch] 2.6.0 MCA TLB error recovery Luck, Tony
-- strict thread matches above, loose matches on Subject: below --
2003-12-19 5:06 Keith Owens
2003-12-19 18:07 ` Luck, Tony
2003-12-19 23:22 ` David Mosberger
2003-12-20 0:03 ` Luck, Tony
2003-12-20 0:04 ` Luck, Tony
2003-12-20 0:26 ` Keith Owens
2003-12-20 0:32 ` David Mosberger
2003-12-20 2:56 ` Keith Owens
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox