public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* [patch] Memory Error Handling Improvement
@ 2005-06-23 17:30 Russ Anderson
  2005-06-23 17:52 ` David Mosberger
                   ` (17 more replies)
  0 siblings, 18 replies; 19+ messages in thread
From: Russ Anderson @ 2005-06-23 17:30 UTC (permalink / raw)
  To: linux-ia64

	[patch] Memory Error Handling Improvement

With the current memory recovery code, if the MCA surfaces while the CPU is
in privileged mode, the code does not try to recover.  There are cases where
a read is launched in user context, but an interrupt starts a context
switch causing the MCA to surface early in the context switch code.  This
patch adds a check to see if the MCA surfaced in the interrupt code while
saving user state, and if so, the MCA was due to a userland read and can
be recovered.

Jack Steiner wrote the significant parts of this code.

Testing: The test program allocates memory, does a SAL call to change the 
	ECC on that memory to create a hardware uncorrectable error, then 
	consumes the data, causing the MCA.  With other interrupt traffic 
	(to force context switches) the 2.6.12 code will not recover after
	8-100 passes.  With this patch, the test consistently reaches the
	1024 recovery limit.  Analysis of the recovered MCA records shows 
	7-10% are in the interrupt code saving user state.  Those MCAs
	become recovered with this patch.

Signed-off-by: Russ Anderson (rja@sgi.com)

----------------------------------------------------------------------
Index: linux-2.6/arch/ia64/kernel/mca_drv.c
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/mca_drv.c	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv.c	2005-06-23 10:58:30.145638400 -0500
@@ -118,10 +118,11 @@
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, iip: %p, psr: 0x%lx, paddr: 0x%lx](%s) encounters MCA.\n",
+		smp_processor_id(), current->pid, current->uid, iip, ipsr, paddr, current->comm);
 
 	spin_lock(&mca_bh_lock);
 	if (mca_page_isolate(paddr) == ISOLATE_OK) {
@@ -394,6 +395,7 @@
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
+	extern void *interrupt, *interrupt_pnr;
 
 	/* Is target address valid? */
 	if (!pbci->tv)
@@ -419,16 +421,19 @@
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+	pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+	if (psr1->cpl != 0 || (pmsa->pmsa_iip >= (unsigned long)&interrupt &&
+			       pmsa->pmsa_iip <  (unsigned long)&interrupt_pnr)) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -438,6 +443,7 @@
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: linux-2.6/arch/ia64/kernel/mca_drv_asm.S
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/mca_drv_asm.S	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/mca_drv_asm.S	2005-06-22 15:30:35.506339996 -0500
@@ -19,7 +19,7 @@
 	;;						//
 	clrrrb						//
 	;;						
-	alloc		r16=ar.pfs,0,2,1,0		// make a new frame
+	alloc		r16=ar.pfs,0,2,3,0		// make a new frame
 	;;
 	mov		ar.rsc=0
 	;;
@@ -40,11 +40,13 @@
 	movl		loc1=mca_handler_bh		// recovery C function
 	;;
 	mov		out0=r8				// poisoned address
+	mov		out1=r9				// iip
+	mov		out2=r10			// psr
 	mov		b6=loc1
 	;;
 	mov		loc1=rp
 	;;
-	ssm		psr.i
+	ssm		psr.i | psr.ic
 	;;
 	br.call.sptk.many    rp=b6			// does not return ...
 	;;
Index: linux-2.6/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/ia64_ksyms.c	2005-06-22 10:27:30.006898406 -0500
+++ linux-2.6/arch/ia64/kernel/ia64_ksyms.c	2005-06-22 15:30:35.507316546 -0500
@@ -123,5 +123,9 @@
 # endif
 #endif
 
+extern char interrupt, interrupt_pnr;
+EXPORT_SYMBOL(interrupt);
+EXPORT_SYMBOL(interrupt_pnr);
+
 extern char ia64_ivt[];
 EXPORT_SYMBOL(ia64_ivt);
Index: linux-2.6/arch/ia64/kernel/ivt.S
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-06-22 10:27:30.007874957 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-06-22 15:30:35.508293097 -0500
@@ -768,7 +768,7 @@
 	.org ia64_ivt+0x3000
 /////////////////////////////////////////////////////////////////////////////////////////
 // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
-ENTRY(interrupt)
+GLOBAL_ENTRY(interrupt)
 	DBG_FAULT(12)
 	mov r31=pr		// prepare to save predicates
 	;;
@@ -780,6 +780,8 @@
 	;;
 	SAVE_REST
 	;;
+	.global	interrupt_pnr
+interrupt_pnr:
 	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
 	mov out0=cr.ivr		// pass cr.ivr as first arg
 	add out1=16,sp		// pass pointer to pt_regs as second arg

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-06-24 21:53 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2005-06-23 17:30 [patch] Memory Error Handling Improvement Russ Anderson
2005-06-23 17:52 ` David Mosberger
2005-06-23 19:30 ` Russ Anderson
2005-06-23 22:11 ` Andreas Schwab
2005-06-23 22:18 ` David Mosberger
2005-06-23 22:22 ` Russ Anderson
2005-06-23 22:54 ` Andreas Schwab
2005-06-24  1:12 ` Hidetoshi Seto
2005-06-24 20:11 ` Russ Anderson
2005-06-24 20:18 ` Russ Anderson
2005-06-24 20:36 ` David Mosberger
2005-06-24 21:05 ` Luck, Tony
2005-06-24 21:11 ` David Mosberger
2005-06-24 21:20 ` Luck, Tony
2005-06-24 21:25 ` David Mosberger
2005-06-24 21:31 ` Luck, Tony
2005-06-24 21:36 ` Russ Anderson
2005-06-24 21:36 ` David Mosberger
2005-06-24 21:53 ` David Mosberger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox