[patch] MCA recovery: Montecito support

public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed

* [patch] MCA recovery: Montecito support
@ 2006-10-20 15:55 Russ Anderson
  2006-10-21  5:22 ` Keith Owens
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Russ Anderson @ 2006-10-20 15:55 UTC (permalink / raw)
  To: linux-ia64

[patch] MCA recovery: Montecito support

The information in MCA records is filled in slightly differently on
Montecito than on Madison/McKinley.  Usually, the cache check and bus
check target identifiers have the same address.   On Montecito the
cache check and bus check target identifiers can be different if
a corrected error (i.e., SBE or unconsumed poison data) was encountered and
then an uncorrected error (i.e., DBE) was consumed.  In that case, the
cache check target identifier is the physical address of the DBE (that
caused the MCA to surface) while the bus check target identifier is the 
physical address of the SBE.  This patch correctly finds the target
identifier that triggered the MCA.

This change works with both Montecito and Madison/McKinley and was
tested on a mixed Montecito and Madison system.

Signed-off-by: Russ Anderson (rja@sgi.com)

---
 arch/ia64/kernel/mca.c     |   52 +++++++++++++---------------
 arch/ia64/kernel/mca_drv.c |   81 ++++++++++++++++++++++++++++++---------------
 2 files changed, 80 insertions(+), 53 deletions(-)

Index: test/arch/ia64/kernel/mca_drv.c
===================================================================
--- test.orig/arch/ia64/kernel/mca_drv.c	2006-10-19 16:23:24.543535104 -0500
+++ test/arch/ia64/kernel/mca_drv.c	2006-10-20 10:31:20.553249675 -0500
@@ -435,6 +435,38 @@ is_mca_global(peidx_table_t *peidx, pal_
 }
 
 /**
+ * get_target_identifier - Get the valid Cache or Bus check target identifier.
+ * @peidx:	pointer of index of processor error section
+ *
+ * Return value:
+ *	target address on Success / 0 on Failue
+ */
+static u64
+get_target_identifier(peidx_table_t *peidx)
+{
+	sal_log_mod_error_info_t *smei;
+	int i;
+
+	/*
+	 * Look through the cache checks for a valid target identifier
+	 */
+	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
+		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
+		if (smei->valid.target_identifier && smei->target_identifier)
+			return smei->target_identifier;
+	}
+
+	/*
+	 * Look at the bus check for a valid target identifier
+	 */
+	smei = peidx_bus_check(peidx, 0);
+	if (smei && smei->valid.target_identifier)
+		return smei->target_identifier;
+
+	return 0;
+}
+
+/**
  * recover_from_read_error - Try to recover the errors which type are "read"s.
  * @slidx:	pointer of index of SAL error record
  * @peidx:	pointer of index of processor error section
@@ -450,13 +482,14 @@ recover_from_read_error(slidx_table_t *s
 			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
 			struct ia64_sal_os_state *sos)
 {
-	sal_log_mod_error_info_t *smei;
+	u64 target_identifier;
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
 
 	/* Is target address valid? */
-	if (!pbci->tv)
+	target_identifier = get_target_identifier(peidx);
+	if (!target_identifier)
 		return fatal_mca("target address not valid");
 
 	/*
@@ -487,32 +520,28 @@ recover_from_read_error(slidx_table_t *s
 	pmsa = sos->pal_min_state;
 	if (psr1->cpl != 0 ||
 	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
-		smei = peidx_bus_check(peidx, 0);
-		if (smei->valid.target_identifier) {
-			/*
-			 *  setup for resume to bottom half of MCA,
-			 * "mca_handler_bhhook"
-			 */
-			/* pass to bhhook as argument (gr8, ...) */
-			pmsa->pmsa_gr[8-1] = smei->target_identifier;
-			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
-			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
-			/* set interrupted return address (but no use) */
-			pmsa->pmsa_br0 = pmsa->pmsa_iip;
-			/* change resume address to bottom half */
-			pmsa->pmsa_iip = mca_hdlr_bh->fp;
-			pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
-			/* set cpl with kernel mode */
-			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
-			psr2->cpl = 0;
-			psr2->ri  = 0;
-			psr2->bn  = 1;
-			psr2->i  = 0;
+		/*
+		 *  setup for resume to bottom half of MCA,
+		 * "mca_handler_bhhook"
+		 */
+		/* pass to bhhook as argument (gr8, ...) */
+		pmsa->pmsa_gr[8-1] = target_identifier;
+		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
+		/* set interrupted return address (but no use) */
+		pmsa->pmsa_br0 = pmsa->pmsa_iip;
+		/* change resume address to bottom half */
+		pmsa->pmsa_iip = mca_hdlr_bh->fp;
+		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
+		/* set cpl with kernel mode */
+		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
+		psr2->cpl = 0;
+		psr2->ri  = 0;
+		psr2->bn  = 1;
+		psr2->i  = 0;
 
-			return mca_recovered("user memory corruption. "
+		return mca_recovered("user memory corruption. "
 				"kill affected process - recovered.");
-		}
-
 	}
 
 	return fatal_mca("kernel context not recovered, iip 0x%lx\n",
Index: test/arch/ia64/kernel/mca.c
===================================================================
--- test.orig/arch/ia64/kernel/mca.c	2006-10-19 16:23:24.543535104 -0500
+++ test/arch/ia64/kernel/mca.c	2006-10-19 17:06:36.447259750 -0500
@@ -962,33 +962,31 @@ ia64_mca_modify_original_stack(struct pt
 		goto no_mod;
 	}
 
-	if (!mca_recover_range(ms->pmsa_iip)) {
-		if (r13 != sos->prev_IA64_KR_CURRENT) {
-			msg = "inconsistent previous current and r13";
-			goto no_mod;
-		}
-		if ((r12 - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent r12 and r13";
-			goto no_mod;
-		}
-		if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bspstore and r13";
-			goto no_mod;
-		}
-		va.p = old_bspstore;
-		if (va.f.reg < 5) {
-			msg = "old_bspstore is in the wrong region";
-			goto no_mod;
-		}
-		if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bsp and r13";
-			goto no_mod;
-		}
-		size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
-		if (ar_bspstore + size > r12) {
-			msg = "no room for blocked state";
-			goto no_mod;
-		}
+	if (r13 != sos->prev_IA64_KR_CURRENT) {
+		msg = "inconsistent previous current and r13";
+		goto no_mod;
+	}
+	if ((r12 - r13) >= KERNEL_STACK_SIZE) {
+		msg = "inconsistent r12 and r13";
+		goto no_mod;
+	}
+	if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
+		msg = "inconsistent ar.bspstore and r13";
+		goto no_mod;
+	}
+	va.p = old_bspstore;
+	if (va.f.reg < 5) {
+		msg = "old_bspstore is in the wrong region";
+		goto no_mod;
+	}
+	if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
+		msg = "inconsistent ar.bsp and r13";
+		goto no_mod;
+	}
+	size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
+	if (ar_bspstore + size > r12) {
+		msg = "no room for blocked state";
+		goto no_mod;
 	}
 
 	ia64_mca_modify_comm(previous_current);
-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
@ 2006-10-21  5:22 ` Keith Owens
  2006-10-23  0:53 ` Hidetoshi Seto
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Keith Owens @ 2006-10-21  5:22 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson (on Fri, 20 Oct 2006 10:55:49 -0500 (CDT)) wrote:
>[patch] MCA recovery: Montecito support
>
>The information in MCA records is filled in slightly differently on
>Montecito than on Madison/McKinley.  Usually, the cache check and bus
>check target identifiers have the same address.   On Montecito the
>cache check and bus check target identifiers can be different if 
>a corrected error (ie SBE or unconsumed poison data) was encountered and
>then an uncorrected error (ie DBE) was consumed.  In that case, the 
>cache check target identifier is the physical address of the DBE (that
>caused the MCA to surface) while the bus check target identifier is the 
>physical address of the SBE.  This patch correctly finds the target
>identifier that triggered the MCA.
>
>This change works with both Montecito and Madison/McKinley and was
>tested on a mixed Montecito and Madison system.
>
>Signed-off-by: Russ Anderson (rja@sgi.com)
>
>---
> arch/ia64/kernel/mca.c     |   52 +++++++++++++---------------
> arch/ia64/kernel/mca_drv.c |   81 ++++++++++++++++++++++++++++++---------------
> 2 files changed, 80 insertions(+), 53 deletions(-)
>
>Index: test/arch/ia64/kernel/mca_drv.c
>=================================>--- test.orig/arch/ia64/kernel/mca_drv.c	2006-10-19 16:23:24.543535104 -0500
>+++ test/arch/ia64/kernel/mca_drv.c	2006-10-20 10:31:20.553249675 -0500
>@@ -435,6 +435,38 @@ is_mca_global(peidx_table_t *peidx, pal_
> }
> 
> /**
>+ * get_target_identifier - Get the valid Cache or Bus check target identifier.
>+ * @peidx:	pointer of index of processor error section
>+ *
>+ * Return value:
>+ *	target address on Success / 0 on Failue
>+ */
>+static u64
>+get_target_identifier(peidx_table_t *peidx)
>+{
>+	sal_log_mod_error_info_t *smei;
>+	int i;
>+
>+	/*
>+	 * Look through the cache checks for a valid target identifier
>+	 */
>+	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
>+		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
>+		if (smei->valid.target_identifier && smei->target_identifier)
>+			return smei->target_identifier;
>+	}
>+
>+	/*
>+	 * Look at the bus check for a valid target identifier
>+	 */
>+	smei = peidx_bus_check(peidx, 0);
>+	if (smei && smei->valid.target_identifier)
>+		return smei->target_identifier;
>+
>+	return 0;
>+}
>+
>+/**
>  * recover_from_read_error - Try to recover the errors which type are "read"s.
>  * @slidx:	pointer of index of SAL error record
>  * @peidx:	pointer of index of processor error section
>@@ -450,13 +482,14 @@ recover_from_read_error(slidx_table_t *s
> 			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
> 			struct ia64_sal_os_state *sos)
> {
>-	sal_log_mod_error_info_t *smei;
>+	u64 target_identifier;
> 	pal_min_state_area_t *pmsa;
> 	struct ia64_psr *psr1, *psr2;
> 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
> 
> 	/* Is target address valid? */
>-	if (!pbci->tv)
>+	target_identifier = get_target_identifier(peidx);
>+	if (!target_identifier)
> 		return fatal_mca("target address not valid");
> 
> 	/*
>@@ -487,32 +520,28 @@ recover_from_read_error(slidx_table_t *s
> 	pmsa = sos->pal_min_state;
> 	if (psr1->cpl != 0 ||
> 	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
>-		smei = peidx_bus_check(peidx, 0);
>-		if (smei->valid.target_identifier) {
>-			/*
>-			 *  setup for resume to bottom half of MCA,
>-			 * "mca_handler_bhhook"
>-			 */
>-			/* pass to bhhook as argument (gr8, ...) */
>-			pmsa->pmsa_gr[8-1] = smei->target_identifier;
>-			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
>-			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
>-			/* set interrupted return address (but no use) */
>-			pmsa->pmsa_br0 = pmsa->pmsa_iip;
>-			/* change resume address to bottom half */
>-			pmsa->pmsa_iip = mca_hdlr_bh->fp;
>-			pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
>-			/* set cpl with kernel mode */
>-			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
>-			psr2->cpl = 0;
>-			psr2->ri  = 0;
>-			psr2->bn  = 1;
>-			psr2->i  = 0;
>+		/*
>+		 *  setup for resume to bottom half of MCA,
>+		 * "mca_handler_bhhook"
>+		 */
>+		/* pass to bhhook as argument (gr8, ...) */
>+		pmsa->pmsa_gr[8-1] = target_identifier;
>+		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
>+		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
>+		/* set interrupted return address (but no use) */
>+		pmsa->pmsa_br0 = pmsa->pmsa_iip;
>+		/* change resume address to bottom half */
>+		pmsa->pmsa_iip = mca_hdlr_bh->fp;
>+		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
>+		/* set cpl with kernel mode */
>+		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
>+		psr2->cpl = 0;
>+		psr2->ri  = 0;
>+		psr2->bn  = 1;
>+		psr2->i  = 0;
> 
>-			return mca_recovered("user memory corruption. "
>+		return mca_recovered("user memory corruption. "
> 				"kill affected process - recovered.");
>-		}
>-
> 	}
> 
> 	return fatal_mca("kernel context not recovered, iip 0x%lx\n",
>Index: test/arch/ia64/kernel/mca.c
>=================================>--- test.orig/arch/ia64/kernel/mca.c	2006-10-19 16:23:24.543535104 -0500
>+++ test/arch/ia64/kernel/mca.c	2006-10-19 17:06:36.447259750 -0500
>@@ -962,33 +962,31 @@ ia64_mca_modify_original_stack(struct pt
> 		goto no_mod;
> 	}
> 
>-	if (!mca_recover_range(ms->pmsa_iip)) {
>-		if (r13 != sos->prev_IA64_KR_CURRENT) {
>-			msg = "inconsistent previous current and r13";
>-			goto no_mod;
>-		}
>-		if ((r12 - r13) >= KERNEL_STACK_SIZE) {
>-			msg = "inconsistent r12 and r13";
>-			goto no_mod;
>-		}
>-		if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
>-			msg = "inconsistent ar.bspstore and r13";
>-			goto no_mod;
>-		}
>-		va.p = old_bspstore;
>-		if (va.f.reg < 5) {
>-			msg = "old_bspstore is in the wrong region";
>-			goto no_mod;
>-		}
>-		if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
>-			msg = "inconsistent ar.bsp and r13";
>-			goto no_mod;
>-		}
>-		size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
>-		if (ar_bspstore + size > r12) {
>-			msg = "no room for blocked state";
>-			goto no_mod;
>-		}
>+	if (r13 != sos->prev_IA64_KR_CURRENT) {
>+		msg = "inconsistent previous current and r13";
>+		goto no_mod;
>+	}
>+	if ((r12 - r13) >= KERNEL_STACK_SIZE) {
>+		msg = "inconsistent r12 and r13";
>+		goto no_mod;
>+	}
>+	if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
>+		msg = "inconsistent ar.bspstore and r13";
>+		goto no_mod;
>+	}
>+	va.p = old_bspstore;
>+	if (va.f.reg < 5) {
>+		msg = "old_bspstore is in the wrong region";
>+		goto no_mod;
>+	}
>+	if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
>+		msg = "inconsistent ar.bsp and r13";
>+		goto no_mod;
>+	}
>+	size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
>+	if (ar_bspstore + size > r12) {
>+		msg = "no room for blocked state";
>+		goto no_mod;
> 	}
> 
> 	ia64_mca_modify_comm(previous_current);

Why remove the mca_recover_range() check from
ia64_mca_modify_original_stack()?


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
  2006-10-21  5:22 ` Keith Owens
@ 2006-10-23  0:53 ` Hidetoshi Seto
  2006-10-23  4:09 ` Russ Anderson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Hidetoshi Seto @ 2006-10-23  0:53 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:
> +	/*
> +	 * Look through the cache checks for a valid target identifier
> +	 */
> +	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
> +		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
> +		if (smei->valid.target_identifier && smei->target_identifier)
> +			return smei->target_identifier;
> +	}

This says there would be more than 1 cache_check in the sal log.
I suppose it also means there would be more than 1 identifier.
Is there any reason why you don't handle identifiers other than
the one listed earliest?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
  2006-10-21  5:22 ` Keith Owens
  2006-10-23  0:53 ` Hidetoshi Seto
@ 2006-10-23  4:09 ` Russ Anderson
  2006-10-25 22:59 ` Russ Anderson
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Russ Anderson @ 2006-10-23  4:09 UTC (permalink / raw)
  To: linux-ia64

Hidetoshi Seto wrote:
> Russ Anderson wrote:
> > +	/*
> > +	 * Look through the cache checks for a valid target identifier
> > +	 */
> > +	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
> > +		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
> > +		if (smei->valid.target_identifier && smei->target_identifier)
> > +			return smei->target_identifier;
> > +	}
> 
> This says there would be more than 1 cache_check in the sal log.
> I suppose it also means there would be more than 1 identifier.
> Are there any reason why you don't handle identifiers other than
> listed earliest?

My testing has encountered more than one cache check, but not
multiple cache checks with valid target identifiers with different
addresses.  

I guess if there were two different addresses, the level L1 would
be more likely to be the one that triggered the MCA (assuming 
the other was L2).  I'll ask Intel for clarification.

Here are some examples of multiple cache checks.
---------------------------------------------------
  Cache check info[0]
    Operation: 7 (Move in), Level: L1, Line: Data, Cache: Data, Way: 5, Index: 128
    machine check corrected
  Cache check info[1]
    Operation: 1 (Load), Level: L2, Line: Data, Cache: Data
    machine check corrected
    target identifier        : 0x000000600c594000
---------------------------------------------------

  Cache check info[0]
    Operation: 2 (Store), Level: L1, Line: Data, Cache: Data, Way: 7, Index: 0
    target identifier        : 0x0000006046418000
  Cache check info[1]
    Operation: 7 (Move in), Level: L1, Line: Data, Cache: Data, Way: 7, Index: 0

---------------------------------------------------
  Cache check info[0]
    Operation: 7 (Move in), Level: L1, Line: Data, Cache: Data, Way: 0, Index: 128
    machine check corrected
    target identifier        : 0x000000607a524000
  Cache check info[1]
    Operation: 1 (Load), Level: L2, Line: Data, Cache: Data
    machine check corrected
    target identifier        : 0x000000607a524000
---------------------------------------------------

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
                   ` (2 preceding siblings ...)
  2006-10-23  4:09 ` Russ Anderson
@ 2006-10-25 22:59 ` Russ Anderson
  2006-10-26  0:21 ` Hidetoshi Seto
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Russ Anderson @ 2006-10-25 22:59 UTC (permalink / raw)
  To: linux-ia64

Hidetoshi Seto wrote:
> Russ Anderson wrote:
> > +	/*
> > +	 * Look through the cache checks for a valid target identifier
> > +	 */
> > +	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
> > +		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
> > +		if (smei->valid.target_identifier && smei->target_identifier)
> > +			return smei->target_identifier;
> > +	}
> 
> This says there would be more than 1 cache_check in the sal log.
> I suppose it also means there would be more than 1 identifier.
> Are there any reason why you don't handle identifiers other than
> listed earliest?

I reworked that routine to look at all the valid cache target identifiers
and use the one with the lowest cache level.  

I've opened a Quad issue to get clarification from Intel as to 
which target identifier triggered the MCA if there are multiple
cache checks with valid target identifiers.  

This patch also leaves mca.c unchanged.  I'll treat that as a separate
patch if needed.

Signed-off-by: Russ Anderson (rja@sgi.com)

---
 arch/ia64/kernel/mca_drv.c |   93 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 26 deletions(-)

Index: test/arch/ia64/kernel/mca_drv.c
===================================================================
--- test.orig/arch/ia64/kernel/mca_drv.c	2006-10-25 14:01:12.245158144 -0500
+++ test/arch/ia64/kernel/mca_drv.c	2006-10-25 17:09:30.720299537 -0500
@@ -435,6 +435,50 @@ is_mca_global(peidx_table_t *peidx, pal_
 }
 
 /**
+ * get_target_identifier - Get the valid Cache or Bus check target identifier.
+ * @peidx:	pointer of index of processor error section
+ *
+ * Return value:
+ *	target address on Success / 0 on Failue
+ */
+static u64
+get_target_identifier(peidx_table_t *peidx)
+{
+	u64 target_address = 0;
+	sal_log_mod_error_info_t *smei;
+	pal_cache_check_info_t *pcci;
+	int i, level = 9;
+
+	/*
+	 * Look through the cache checks for a valid target identifier
+	 * If more than one valid target identifier, return the one
+	 * with the lowest cache level.
+	 */
+	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
+		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
+		if (smei->valid.target_identifier && smei->target_identifier) {
+			pcci = (pal_cache_check_info_t *)&(smei->check_info);
+			if (!target_address || (pcci->level < level)) {
+				target_address = smei->target_identifier;
+				level = pcci->level;
+				continue;
+			}
+		}
+	}
+	if (target_address)
+		return target_address;
+
+	/*
+	 * Look at the bus check for a valid target identifier
+	 */
+	smei = peidx_bus_check(peidx, 0);
+	if (smei && smei->valid.target_identifier)
+		return smei->target_identifier;
+
+	return 0;
+}
+
+/**
  * recover_from_read_error - Try to recover the errors which type are "read"s.
  * @slidx:	pointer of index of SAL error record
  * @peidx:	pointer of index of processor error section
@@ -450,13 +494,14 @@ recover_from_read_error(slidx_table_t *s
 			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
 			struct ia64_sal_os_state *sos)
 {
-	sal_log_mod_error_info_t *smei;
+	u64 target_identifier;
 	pal_min_state_area_t *pmsa;
 	struct ia64_psr *psr1, *psr2;
 	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
 
 	/* Is target address valid? */
-	if (!pbci->tv)
+	target_identifier = get_target_identifier(peidx);
+	if (!target_identifier)
 		return fatal_mca("target address not valid");
 
 	/*
@@ -487,32 +532,28 @@ recover_from_read_error(slidx_table_t *s
 	pmsa = sos->pal_min_state;
 	if (psr1->cpl != 0 ||
 	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
-		smei = peidx_bus_check(peidx, 0);
-		if (smei->valid.target_identifier) {
-			/*
-			 *  setup for resume to bottom half of MCA,
-			 * "mca_handler_bhhook"
-			 */
-			/* pass to bhhook as argument (gr8, ...) */
-			pmsa->pmsa_gr[8-1] = smei->target_identifier;
-			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
-			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
-			/* set interrupted return address (but no use) */
-			pmsa->pmsa_br0 = pmsa->pmsa_iip;
-			/* change resume address to bottom half */
-			pmsa->pmsa_iip = mca_hdlr_bh->fp;
-			pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
-			/* set cpl with kernel mode */
-			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
-			psr2->cpl = 0;
-			psr2->ri  = 0;
-			psr2->bn  = 1;
-			psr2->i  = 0;
+		/*
+		 *  setup for resume to bottom half of MCA,
+		 * "mca_handler_bhhook"
+		 */
+		/* pass to bhhook as argument (gr8, ...) */
+		pmsa->pmsa_gr[8-1] = target_identifier;
+		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
+		/* set interrupted return address (but no use) */
+		pmsa->pmsa_br0 = pmsa->pmsa_iip;
+		/* change resume address to bottom half */
+		pmsa->pmsa_iip = mca_hdlr_bh->fp;
+		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
+		/* set cpl with kernel mode */
+		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
+		psr2->cpl = 0;
+		psr2->ri  = 0;
+		psr2->bn  = 1;
+		psr2->i  = 0;
 
-			return mca_recovered("user memory corruption. "
+		return mca_recovered("user memory corruption. "
 				"kill affected process - recovered.");
-		}
-
 	}
 
 	return fatal_mca("kernel context not recovered, iip 0x%lx\n",

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
                   ` (3 preceding siblings ...)
  2006-10-25 22:59 ` Russ Anderson
@ 2006-10-26  0:21 ` Hidetoshi Seto
  2006-10-26 23:20 ` Russ Anderson
  2006-10-27  0:25 ` Hidetoshi Seto
  6 siblings, 0 replies; 8+ messages in thread
From: Hidetoshi Seto @ 2006-10-26  0:21 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:
> I reworked that routine to look at all the valid cache target identifiers
> and use the one with the lowest cache level.  
> 
> I've opened a Quad issue to get clarification from Intel as to 
> which target identifier triggered the MCA if there are multiple
> cache checks with valid target identifiers.  
> 
> This patch also leaves mca.c unchanged.  I'll treat that as a seperate
> patch if needed.

Looks good.

But I have one more question (for intel possibly):
- If identifiers in cache_check and bus_check are different,
   the cache's always takes priority and the bus's will be ignored.
   Is there any opposite case, such as an error log that has
   corrected cache_checks with ignorable identifiers and an uncorrected
   bus_check with a significant identifier?

I guess if both are significant it would be a separate double MCA,
or should be reset by SAL/platform.


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
                   ` (4 preceding siblings ...)
  2006-10-26  0:21 ` Hidetoshi Seto
@ 2006-10-26 23:20 ` Russ Anderson
  2006-10-27  0:25 ` Hidetoshi Seto
  6 siblings, 0 replies; 8+ messages in thread
From: Russ Anderson @ 2006-10-26 23:20 UTC (permalink / raw)
  To: linux-ia64

Hidetoshi Seto wrote:
> Russ Anderson wrote:
> > I reworked that routine to look at all the valid cache target identifiers
> > and use the one with the lowest cache level.  
> > 
> > I've opened a Quad issue to get clarification from Intel as to 
> > which target identifier triggered the MCA if there are multiple
> > cache checks with valid target identifiers.  
> > 
> > This patch also leaves mca.c unchanged.  I'll treat that as a seperate
> > patch if needed.
> 
> Looks good.
> 
> But I have one more question (for intel possibly):
> - If identifiers in cache_check and bus_check are different,
>    the cache's always takes priority and the bus's will be ignored.
>    Are there any opposite case, such as a case of error log that have
>    corrected cache_checks with ignorable identifiers and an uncorrected
>    bus_check with significant identifier?

Bad data moving across the FSB does not cause an MCA (at least not
the way the hardware is configured on SGI Altix).

Usually the MCA is triggered by consuming the bad data.
"consumption" is :
  1) Loading bad data into L1 cache
  2) Loading bad data into a register file
  3) st1 or st2 to bad data
So the cache check information would be more accurate.

It's worth noting that this change does not affect the selection of
which process to kill.  It only affects which physical memory
address gets marked as bad.

In my test case, a correctable error is injected (and ends up in the
bus check target identifier) then a memory uncorrectable is injected
and consumed, triggering the MCA.  The test program is correctly 
terminated, but the current code uses the bus check target identifier
and marks the address of the correctable error as bad.  The real bad
memory goes back on the free list, and promptly gets reused, triggering
another MCA.  The cycle repeats until the kernel happens to get
the memory.  Kernel memory error, end of ballgame.

> I guess if both are significant it would be separated double MCA,
> or should be reset by SAL/platform.

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch] MCA recovery: Montecito support
  2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
                   ` (5 preceding siblings ...)
  2006-10-26 23:20 ` Russ Anderson
@ 2006-10-27  0:25 ` Hidetoshi Seto
  6 siblings, 0 replies; 8+ messages in thread
From: Hidetoshi Seto @ 2006-10-27  0:25 UTC (permalink / raw)
  To: linux-ia64

All right. Thank you, Russ.

- H.Seto

Acked-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>

Russ Anderson wrote:
> It's worth noting that this change does not effect the selection of
> which process to kill.  It only effects which physical memory
> address gets marked as bad.  
> 
> In my test case, a correctable error is injected (an ends up in the
> bus check target identifier) then a memory uncorrectable is injected
> and consumed, triggering the MCA.  The test program is correctly 
> terminated, but the current code uses the bus check target identifier
> and marks the address of the correctable error as bad.  The real bad
> memory goes back on the free list, and promptly gets reused, triggering
> another MCA.  The cycle repeats until the kernel happens to get
> the memory.  Kernel memory error, end of ballgame.


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2006-10-27  0:25 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-10-20 15:55 [patch] MCA recovery: Montecito support Russ Anderson
2006-10-21  5:22 ` Keith Owens
2006-10-23  0:53 ` Hidetoshi Seto
2006-10-23  4:09 ` Russ Anderson
2006-10-25 22:59 ` Russ Anderson
2006-10-26  0:21 ` Hidetoshi Seto
2006-10-26 23:20 ` Russ Anderson
2006-10-27  0:25 ` Hidetoshi Seto

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox