* [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well @ 2012-12-17 19:39 Jacob Shin 2012-12-17 19:39 ` [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin 2012-12-17 19:57 ` [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Joe Perches 0 siblings, 2 replies; 25+ messages in thread From: Jacob Shin @ 2012-12-17 19:39 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson; +Cc: linux-edac, linux-kernel, Jacob Shin Currently only AMD Family 15h processors have special handling for MC2 errors, since upcoming Family 16h will also need unique handling, let's make MC2 handling part of amd_decoder_ops. Signed-off-by: Jacob Shin <jacob.shin@amd.com> --- drivers/edac/mce_amd.c | 58 ++++++++++++++++++++++++++---------------------- drivers/edac/mce_amd.h | 1 + 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index ad63757..84320f9 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -399,12 +399,9 @@ static void decode_mc1_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); } -static void decode_mc2_mce(struct mce *m) +static bool k8_mc2_mce(u16 ec, u8 xec) { - u16 ec = EC(m->status); - u8 xec = XEC(m->status, xec_mask); - - pr_emerg(HW_ERR "MC2 Error"); + bool ret = true; if (xec == 0x1) pr_cont(" in the write data buffers.\n"); @@ -429,24 +426,18 @@ static void decode_mc2_mce(struct mce *m) pr_cont(": %s parity/ECC error during data " "access from L2.\n", R4_MSG(ec)); else - goto wrong_mc2_mce; + ret = false; } else - goto wrong_mc2_mce; + ret = false; } else - goto wrong_mc2_mce; - - return; + ret = false; - wrong_mc2_mce: - pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); + return ret; } -static void decode_f15_mc2_mce(struct mce *m) +static bool f15h_mc2_mce(u16 ec, u8 xec) { - u16 ec = EC(m->status); - u8 xec = XEC(m->status, xec_mask); - - pr_emerg(HW_ERR "MC2 Error: "); + bool ret = true; if (TLB_ERROR(ec)) { if (xec == 0x0) @@ -454,10 +445,10 @@ static void decode_f15_mc2_mce(struct mce *m) else if (xec == 0x1) pr_cont("Poison data provided for TLB fill.\n"); else - goto wrong_f15_mc2_mce; + ret = false; } else if (BUS_ERROR(ec)) { if (xec > 2) - goto wrong_f15_mc2_mce; + ret = false; pr_cont("Error during attempted NB data read.\n"); } else if (MEM_ERROR(ec)) { @@ -471,14 +462,24 @@ static void decode_f15_mc2_mce(struct mce *m) break; default: - goto wrong_f15_mc2_mce; + ret = false; } } - return; + return ret; +} - wrong_f15_mc2_mce: - pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); +static void decode_mc2_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC2 Error: "); + + if (fam_ops->mc2_mce(ec, xec)) + ; + else + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); } static void decode_mc3_mce(struct mce *m) @@ -702,10 +703,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) break; case 2: - if (c->x86 == 0x15) - decode_f15_mc2_mce(m); - else - decode_mc2_mce(m); + decode_mc2_mce(m); break; case 3: @@ -783,33 +781,39 @@ static int __init mce_amd_init(void) case 0xf: fam_ops->mc0_mce = k8_mc0_mce; fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x10: fam_ops->mc0_mce = f10h_mc0_mce; fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x11: fam_ops->mc0_mce = k8_mc0_mce; fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x12: fam_ops->mc0_mce = f12h_mc0_mce; fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x14: nb_err_cpumask = 0x3; fam_ops->mc0_mce = f14h_mc0_mce; fam_ops->mc1_mce = f14h_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x15: xec_mask = 0x1f; fam_ops->mc0_mce = f15h_mc0_mce; fam_ops->mc1_mce = f15h_mc1_mce; + fam_ops->mc2_mce = f15h_mc2_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 6796799..000f6e2 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -78,6 +78,7 @@ extern const char * const ii_msgs[]; struct amd_decoder_ops { bool (*mc0_mce)(u16, u8); bool (*mc1_mce)(u16, u8); + bool (*mc2_mce)(u16, u8); }; void amd_report_gart_errors(bool); -- 1.7.9.5 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-17 19:39 [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Jacob Shin @ 2012-12-17 19:39 ` Jacob Shin 2012-12-18 17:19 ` Borislav Petkov 2012-12-17 19:57 ` [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Joe Perches 1 sibling, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-17 19:39 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson; +Cc: linux-edac, linux-kernel, Jacob Shin Add MCE decoding logic for AMD Family 16h processors. Signed-off-by: Jacob Shin <jacob.shin@amd.com> --- drivers/edac/mce_amd.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++-- drivers/edac/mce_amd.h | 6 +++ 2 files changed, 122 insertions(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 84320f9..7d2d037 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); +/* internal error type */ +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; +EXPORT_SYMBOL_GPL(uu_msgs); + static const char * const f15h_mc1_mce_desc[] = { "UC during a demand linefill from L2", "Parity error during data load from IC", @@ -275,6 +279,23 @@ static bool f15h_mc0_mce(u16 ec, u8 xec) return ret; } +static bool f16h_mc0_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + + if (MEM_ERROR(ec) && TT(ec) == TT_DATA && LL(ec) == LL_L1 && + (r4 == R4_DRD || r4 == R4_DWR)) { + + pr_cont("%s parity error due to %s.\n", + (xec == 0x0 ? "Data" : "Tag"), + (r4 == R4_DRD ? "load" : "store")); + + return true; + } + + return f14h_mc0_mce(ec, xec); +} + static void decode_mc0_mce(struct mce *m) { u16 ec = EC(m->status); @@ -379,6 +400,36 @@ static bool f15h_mc1_mce(u16 ec, u8 xec) return ret; } +static bool f16h_mc1_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + bool ret = true; + + if (MEM_ERROR(ec)) { + if (TT(ec) != TT_INSTR) + ret = false; + + else if (r4 == R4_IRD) + pr_cont("%s array parity error for a tag hit.\n", + (xec == 0x0 ? "Data" : "Tag")); + + else if (r4 == R4_SNOOP) + pr_cont("Tag error during snoop/victimization.\n"); + + else if (xec == 0x0) + pr_cont("Tag parity error from victim castout.\n"); + + else if (xec == 0x2) + pr_cont("Microcode patch RAM parity error.\n"); + + else + ret = false; + } else + ret = false; + + return ret; +} + static void decode_mc1_mce(struct mce *m) { u16 ec = EC(m->status); @@ -469,6 +520,48 @@ static bool f15h_mc2_mce(u16 ec, u8 xec) return ret; } +static bool f16h_mc2_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + bool ret = true; + + if (MEM_ERROR(ec) && TT(ec) == TT_GEN && LL(ec) == LL_L2) { + switch (xec) { + case 0x04 ... 0x05: + pr_cont("Parity error in %s.\n", + (r4 == R4_RD ? "IBUFF" : "OBUFF")); + break; + + case 0x09 ... 0x0b: + case 0x0d ... 0x0f: + pr_cont("ECC error in L2 tag (%s).\n", + (r4 == R4_GEN ? "BankReq" : + (r4 == R4_SNOOP ? "Probe" : "Fill"))); + break; + + case 0x10 ... 0x1b: + pr_cont("ECC error in L2 data array (%s).\n", + (r4 == R4_RD ? "Hit" : + (r4 == R4_GEN ? "Attr" : + (r4 == R4_EVICT ? "Vict" : "Fill")))); + break; + + case 0x1c ... 0x1f: + pr_cont("Parity error in L2 attribute bits (%s).\n", + (r4 == R4_RD ? "Hit" : + (r4 == R4_GEN ? "Attr" : "Fill"))); + break; + + default: + ret = false; + break; + } + } else + ret = false; + + return ret; +} + static void decode_mc2_mce(struct mce *m) { u16 ec = EC(m->status); @@ -548,7 +641,7 @@ static void decode_mc4_mce(struct mce *m) return; case 0x19: - if (boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16) pr_cont("Compute Unit Data Error.\n"); else goto wrong_mc4_mce; @@ -634,6 +727,10 @@ static void decode_mc6_mce(struct mce *m) static inline void amd_decode_err_code(u16 ec) { + if (INT_ERROR(ec)) { + pr_emerg(HW_ERR "internal: %s\n", LL_MSG(ec)); + return; + } pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); @@ -738,10 +835,18 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15) - pr_cont("|%s|%s", + if (c->x86 == 0x15 || c->x86 == 0x16) { + char coreid[16]; + + if (m->status & MCI_STATUS_COREIDV) + sprintf(coreid, "CoreIdV(Core%d)", + (int)ERR_CORE_ID(m->status)); + + pr_cont("|%s|%s|%s", + ((m->status & MCI_STATUS_COREIDV) ? coreid : "-"), ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + } /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; @@ -770,7 +875,7 @@ static int __init mce_amd_init(void) if (c->x86_vendor != X86_VENDOR_AMD) return 0; - if (c->x86 < 0xf || c->x86 > 0x15) + if (c->x86 < 0xf || c->x86 > 0x16) return 0; fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); @@ -816,6 +921,13 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f15h_mc2_mce; break; + case 0x16: + xec_mask = 0x1f; + fam_ops->mc0_mce = f16h_mc0_mce; + fam_ops->mc1_mce = f16h_mc1_mce; + fam_ops->mc2_mce = f16h_mc2_mce; + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); kfree(fam_ops); diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 000f6e2..af2d20d 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -11,9 +11,12 @@ #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) +#define ERR_CORE_ID(x) (((x) >> 32) & 0xf) + #define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) #define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) #define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) +#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) #define TT(x) (((x) >> 2) & 0x3) #define TT_MSG(x) tt_msgs[TT(x)] @@ -25,10 +28,13 @@ #define TO_MSG(x) to_msgs[TO(x)] #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define UU(x) (((x) >> 8) & 0x3) +#define UU_MSG(x) uu_msgs[UU(x)] #define R4(x) (((x) >> 4) & 0xf) #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") +#define MCI_STATUS_COREIDV BIT_64(56) #define MCI_STATUS_DEFERRED BIT_64(44) #define MCI_STATUS_POISON BIT_64(43) -- 1.7.9.5 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-17 19:39 ` [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin @ 2012-12-18 17:19 ` Borislav Petkov 2012-12-18 17:30 ` Jacob Shin 2012-12-18 18:24 ` Joe Perches 0 siblings, 2 replies; 25+ messages in thread From: Borislav Petkov @ 2012-12-18 17:19 UTC (permalink / raw) To: Jacob Shin; +Cc: Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote: > Add MCE decoding logic for AMD Family 16h processors. > > Signed-off-by: Jacob Shin <jacob.shin@amd.com> > --- > drivers/edac/mce_amd.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++-- > drivers/edac/mce_amd.h | 6 +++ > 2 files changed, 122 insertions(+), 4 deletions(-) > > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c > index 84320f9..7d2d037 100644 > --- a/drivers/edac/mce_amd.c > +++ b/drivers/edac/mce_amd.c > @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); > const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; > EXPORT_SYMBOL_GPL(ii_msgs); > > +/* internal error type */ > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; > +EXPORT_SYMBOL_GPL(uu_msgs); Seems like those aren't used anywhere? > static const char * const f15h_mc1_mce_desc[] = { > "UC during a demand linefill from L2", > "Parity error during data load from IC", > @@ -275,6 +279,23 @@ static bool f15h_mc0_mce(u16 ec, u8 xec) > return ret; > } > > +static bool f16h_mc0_mce(u16 ec, u8 xec) > +{ > + u8 r4 = R4(ec); > + > + if (MEM_ERROR(ec) && TT(ec) == TT_DATA && LL(ec) == LL_L1 && > + (r4 == R4_DRD || r4 == R4_DWR)) { > + > + pr_cont("%s parity error due to %s.\n", > + (xec == 0x0 ? "Data" : "Tag"), > + (r4 == R4_DRD ? "load" : "store")); > + > + return true; > + } > + > + return f14h_mc0_mce(ec, xec); Looks like this could be merged with f14h_mc0_mce no? You can call the function then cat_mc0_mce (for all the *cat cores) and assign it to fam_ops->mc0_mce in the f14h and f16h case. > +} > + > static void decode_mc0_mce(struct mce *m) > { > u16 ec = EC(m->status); > @@ -379,6 +400,36 @@ static bool f15h_mc1_mce(u16 ec, u8 xec) > return ret; > } > > +static bool f16h_mc1_mce(u16 ec, u8 xec) > +{ > + u8 r4 = R4(ec); > + bool ret = true; > + > + if (MEM_ERROR(ec)) { > + if (TT(ec) != TT_INSTR) > + ret = false; > + > + else if (r4 == R4_IRD) > + pr_cont("%s array parity error for a tag hit.\n", > + (xec == 0x0 ? "Data" : "Tag")); > + > + else if (r4 == R4_SNOOP) > + pr_cont("Tag error during snoop/victimization.\n"); > + > + else if (xec == 0x0) > + pr_cont("Tag parity error from victim castout.\n"); > + > + else if (xec == 0x2) > + pr_cont("Microcode patch RAM parity error.\n"); Also no need for a family-special function - just rename f14h_mc1_mce to cat_mc1_mce() as above and add a special case like this as the last else-branch of the if conditional there: + if (boot_cpu_data.x86 == 0x16) { + if (LL(ec) == LL_LG && xec == 2) + pr_cont("Microcode patch RAM parity error.\n"); + else + pr_cont("IC Tag parity error from victim castout.\n"); + return true; + } > + > + else > + ret = false; > + } else > + ret = false; > + > + return ret; > +} > + > static void decode_mc1_mce(struct mce *m) > { > u16 ec = EC(m->status); > @@ -469,6 +520,48 @@ static bool f15h_mc2_mce(u16 ec, u8 xec) > return ret; > } > > +static bool f16h_mc2_mce(u16 ec, u8 xec) > +{ > + u8 r4 = R4(ec); > + bool ret = true; > + > + if (MEM_ERROR(ec) && TT(ec) == TT_GEN && LL(ec) == LL_L2) { You can exit early here: if (!MEM_ERROR(ec)) return false; Also, no need to test for TT and LL - we're relying on the hardware here and if those values are b0rked then we have a more serious problem. > + switch (xec) { > + case 0x04 ... 0x05: > + pr_cont("Parity error in %s.\n", > + (r4 == R4_RD ? "IBUFF" : "OBUFF")); or pr_cont("%sBUFF parity error.\n", ((xec == 4) ? "I" : "O")); > + break; > + > + case 0x09 ... 0x0b: > + case 0x0d ... 0x0f: > + pr_cont("ECC error in L2 tag (%s).\n", > + (r4 == R4_GEN ? "BankReq" : > + (r4 == R4_SNOOP ? "Probe" : "Fill"))); > + break; > + > + case 0x10 ... 0x1b: > + pr_cont("ECC error in L2 data array (%s).\n", > + (r4 == R4_RD ? "Hit" : > + (r4 == R4_GEN ? "Attr" : > + (r4 == R4_EVICT ? "Vict" : "Fill")))); > + break; > + > + case 0x1c ... 0x1f: > + pr_cont("Parity error in L2 attribute bits (%s).\n", > + (r4 == R4_RD ? "Hit" : > + (r4 == R4_GEN ? "Attr" : "Fill"))); > + break; > + > + default: > + ret = false; > + break; > + } > + } else > + ret = false; > + > + return ret; > +} > + > static void decode_mc2_mce(struct mce *m) > { > u16 ec = EC(m->status); > @@ -548,7 +641,7 @@ static void decode_mc4_mce(struct mce *m) > return; > > case 0x19: > - if (boot_cpu_data.x86 == 0x15) > + if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16) > pr_cont("Compute Unit Data Error.\n"); > else > goto wrong_mc4_mce; > @@ -634,6 +727,10 @@ static void decode_mc6_mce(struct mce *m) > > static inline void amd_decode_err_code(u16 ec) > { > + if (INT_ERROR(ec)) { > + pr_emerg(HW_ERR "internal: %s\n", LL_MSG(ec)); > + return; > + } Is this correct? I'm just confirming because I don't have the internal info anymore. Uuh, hold on, maybe those otherwise unused uu_msgs above were meant to be used here instead of the LL_MSG? IOW, pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); Right? > > pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); > > @@ -738,10 +835,18 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) > ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), > ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); > > - if (c->x86 == 0x15) > - pr_cont("|%s|%s", > + if (c->x86 == 0x15 || c->x86 == 0x16) { > + char coreid[16]; > + > + if (m->status & MCI_STATUS_COREIDV) > + sprintf(coreid, "CoreIdV(Core%d)", > + (int)ERR_CORE_ID(m->status)); Uuh, no, this is probably dumping the core which detected the error.. No need since we're dumping the core reporting the error anyway above. And if that's mismatched for some reason, we're also dumping full MCi_STATUS contents so you can decypher CoreId from there if needed. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 17:19 ` Borislav Petkov @ 2012-12-18 17:30 ` Jacob Shin 2012-12-18 18:09 ` Jacob Shin 2012-12-18 18:24 ` Joe Perches 1 sibling, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-18 17:30 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 06:19:15PM +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote: > > Add MCE decoding logic for AMD Family 16h processors. > > > > Signed-off-by: Jacob Shin <jacob.shin@amd.com> > > --- > > drivers/edac/mce_amd.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++-- > > drivers/edac/mce_amd.h | 6 +++ > > 2 files changed, 122 insertions(+), 4 deletions(-) > > > > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c > > index 84320f9..7d2d037 100644 > > --- a/drivers/edac/mce_amd.c > > +++ b/drivers/edac/mce_amd.c > > @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); > > const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; > > EXPORT_SYMBOL_GPL(ii_msgs); > > > > +/* internal error type */ > > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; > > +EXPORT_SYMBOL_GPL(uu_msgs); > > Seems like those aren't used anywhere? > > > static const char * const f15h_mc1_mce_desc[] = { > > "UC during a demand linefill from L2", > > "Parity error during data load from IC", > > @@ -275,6 +279,23 @@ static bool f15h_mc0_mce(u16 ec, u8 xec) > > return ret; > > } > > > > +static bool f16h_mc0_mce(u16 ec, u8 xec) > > +{ > > + u8 r4 = R4(ec); > > + > > + if (MEM_ERROR(ec) && TT(ec) == TT_DATA && LL(ec) == LL_L1 && > > + (r4 == R4_DRD || r4 == R4_DWR)) { > > + > > + pr_cont("%s parity error due to %s.\n", > > + (xec == 0x0 ? "Data" : "Tag"), > > + (r4 == R4_DRD ? "load" : "store")); > > + > > + return true; > > + } > > + > > + return f14h_mc0_mce(ec, xec); > > Looks like this could be merged with f14h_mc0_mce no? You can call the > function then cat_mc0_mce (for all the *cat cores) and assign it to > fam_ops->mc0_mce in the f14h and f16h case. Okay > > > +} > > + > > static void decode_mc0_mce(struct mce *m) > > { > > u16 ec = EC(m->status); > > @@ -379,6 +400,36 @@ static bool f15h_mc1_mce(u16 ec, u8 xec) > > return ret; > > } > > > > +static bool f16h_mc1_mce(u16 ec, u8 xec) > > +{ > > + u8 r4 = R4(ec); > > + bool ret = true; > > + > > + if (MEM_ERROR(ec)) { > > + if (TT(ec) != TT_INSTR) > > + ret = false; > > + > > + else if (r4 == R4_IRD) > > + pr_cont("%s array parity error for a tag hit.\n", > > + (xec == 0x0 ? "Data" : "Tag")); > > + > > + else if (r4 == R4_SNOOP) > > + pr_cont("Tag error during snoop/victimization.\n"); > > + > > + else if (xec == 0x0) > > + pr_cont("Tag parity error from victim castout.\n"); > > + > > + else if (xec == 0x2) > > + pr_cont("Microcode patch RAM parity error.\n"); > > Also no need for a family-special function - just rename f14h_mc1_mce > to cat_mc1_mce() as above and add a special case like this as the last > else-branch of the if conditional there: Okay > > + if (boot_cpu_data.x86 == 0x16) { > + if (LL(ec) == LL_LG && xec == 2) > + pr_cont("Microcode patch RAM parity error.\n"); > + else > + pr_cont("IC Tag parity error from victim castout.\n"); > + return true; > + } > > > + > > + else > > + ret = false; > > + } else > > + ret = false; > > + > > + return ret; > > +} > > + > > static void decode_mc1_mce(struct mce *m) > > { > > u16 ec = EC(m->status); > > @@ -469,6 +520,48 @@ static bool f15h_mc2_mce(u16 ec, u8 xec) > > return ret; > > } > > > > +static bool f16h_mc2_mce(u16 ec, u8 xec) > > +{ > > + u8 r4 = R4(ec); > > + bool ret = true; > > + > > + if (MEM_ERROR(ec) && TT(ec) == TT_GEN && LL(ec) == LL_L2) { > > You can exit early here: > > if (!MEM_ERROR(ec)) > return false; > > Also, no need to test for TT and LL - we're relying on the hardware here > and if those values are b0rked then we have a more serious problem. Okay > > > + switch (xec) { > > + case 0x04 ... 0x05: > > + pr_cont("Parity error in %s.\n", > > + (r4 == R4_RD ? "IBUFF" : "OBUFF")); > > or > pr_cont("%sBUFF parity error.\n", ((xec == 4) ? "I" : "O")); > > > + break; > > + > > + case 0x09 ... 0x0b: > > + case 0x0d ... 0x0f: > > + pr_cont("ECC error in L2 tag (%s).\n", > > + (r4 == R4_GEN ? "BankReq" : > > + (r4 == R4_SNOOP ? "Probe" : "Fill"))); > > + break; > > + > > + case 0x10 ... 0x1b: > > + pr_cont("ECC error in L2 data array (%s).\n", > > + (r4 == R4_RD ? "Hit" : > > + (r4 == R4_GEN ? "Attr" : > > + (r4 == R4_EVICT ? "Vict" : "Fill")))); > > + break; > > + > > + case 0x1c ... 0x1f: > > + pr_cont("Parity error in L2 attribute bits (%s).\n", > > + (r4 == R4_RD ? "Hit" : > > + (r4 == R4_GEN ? "Attr" : "Fill"))); > > + break; > > + > > + default: > > + ret = false; > > + break; > > + } > > + } else > > + ret = false; > > + > > + return ret; > > +} > > + > > static void decode_mc2_mce(struct mce *m) > > { > > u16 ec = EC(m->status); > > @@ -548,7 +641,7 @@ static void decode_mc4_mce(struct mce *m) > > return; > > > > case 0x19: > > - if (boot_cpu_data.x86 == 0x15) > > + if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16) > > pr_cont("Compute Unit Data Error.\n"); > > else > > goto wrong_mc4_mce; > > @@ -634,6 +727,10 @@ static void decode_mc6_mce(struct mce *m) > > > > static inline void amd_decode_err_code(u16 ec) > > { > > + if (INT_ERROR(ec)) { > > + pr_emerg(HW_ERR "internal: %s\n", LL_MSG(ec)); > > + return; > > + } > > Is this correct? I'm just confirming because I don't have the internal > info anymore. > > Uuh, hold on, maybe those otherwise unused uu_msgs above were meant to > be used here instead of the LL_MSG? IOW, > > pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); > > Right? Ah, yes thats right, sorry about the typo. It looks like: Error Code Error Code Type Description 0000 01UU 0000 0000 Internal Unclassified UU = Internal Error Type And the UU encoding is as is in the mce_amd.h file > > > > > pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); > > > > @@ -738,10 +835,18 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) > > ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), > > ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); > > > > - if (c->x86 == 0x15) > > - pr_cont("|%s|%s", > > + if (c->x86 == 0x15 || c->x86 == 0x16) { > > + char coreid[16]; > > + > > + if (m->status & MCI_STATUS_COREIDV) > > + sprintf(coreid, "CoreIdV(Core%d)", > > + (int)ERR_CORE_ID(m->status)); > > Uuh, no, this is probably dumping the core which detected the error.. No > need since we're dumping the core reporting the error anyway above. And > if that's mismatched for some reason, we're also dumping full MCi_STATUS > contents so you can decypher CoreId from there if needed. Okay, will take this out. Thanks for the feedback! I'll spin a V2 and send it out later today. -Jacob > > Thanks. > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- > ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 17:30 ` Jacob Shin @ 2012-12-18 18:09 ` Jacob Shin 2012-12-18 18:32 ` Borislav Petkov 0 siblings, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-18 18:09 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 11:30:24AM -0600, Jacob Shin wrote: > On Tue, Dec 18, 2012 at 06:19:15PM +0100, Borislav Petkov wrote: > > On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote: > > > +/* internal error type */ > > > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; > > > +EXPORT_SYMBOL_GPL(uu_msgs); > > > > Seems like those aren't used anywhere? > > > @@ -634,6 +727,10 @@ static void decode_mc6_mce(struct mce *m) > > > > > > static inline void amd_decode_err_code(u16 ec) > > > { > > > + if (INT_ERROR(ec)) { > > > + pr_emerg(HW_ERR "internal: %s\n", LL_MSG(ec)); > > > + return; > > > + } > > > > Is this correct? I'm just confirming because I don't have the internal > > info anymore. > > > > Uuh, hold on, maybe those otherwise unused uu_msgs above were meant to > > be used here instead of the LL_MSG? IOW, > > > > > pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); > > > > Right? > > Ah, yes thats right, sorry about the typo. It looks like: > > Error Code Error Code Type Description > 0000 01UU 0000 0000 Internal Unclassified UU = Internal Error Type > > And the UU encoding is as is in the mce_amd.h file I think I meant to say as in uu_msgs[] above. HWA stands for "Hardware Assertion" .. ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 18:09 ` Jacob Shin @ 2012-12-18 18:32 ` Borislav Petkov 0 siblings, 0 replies; 25+ messages in thread From: Borislav Petkov @ 2012-12-18 18:32 UTC (permalink / raw) To: Jacob Shin; +Cc: Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 12:09:45PM -0600, Jacob Shin wrote: > I think I meant to say as in uu_msgs[] above. HWA stands for "Hardware > Assertion" .. Yep, I got it. You mean this: const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; and HWA is bit setting 0x2. You don't need to add a comment to say what HWA means because I'm assuming the BKDG will have it when it comes out. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 17:19 ` Borislav Petkov 2012-12-18 17:30 ` Jacob Shin @ 2012-12-18 18:24 ` Joe Perches 2012-12-18 18:33 ` Borislav Petkov 1 sibling, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-18 18:24 UTC (permalink / raw) To: Borislav Petkov; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Tue, 2012-12-18 at 18:19 +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 01:39:48PM -0600, Jacob Shin wrote: > > Add MCE decoding logic for AMD Family 16h processors. [] > > + switch (xec) { > > + case 0x04 ... 0x05: > > + pr_cont("Parity error in %s.\n", > > + (r4 == R4_RD ? "IBUFF" : "OBUFF")); > > or > pr_cont("%sBUFF parity error.\n", ((xec == 4) ? "I" : "O")); or without all the unnecessary parens and using char: pr_cont("%cBUFF parity error\n", xec == 4 ? 'I' : 'O'); ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 18:24 ` Joe Perches @ 2012-12-18 18:33 ` Borislav Petkov 2012-12-18 18:37 ` Joe Perches 0 siblings, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-18 18:33 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 10:24:29AM -0800, Joe Perches wrote: > or without all the unnecessary parens and using char: > > pr_cont("%cBUFF parity error\n", xec == 4 ? 'I' : 'O'); Char is fine, the parens make this more readable when you look at it again after a couple of months :-). Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 18:33 ` Borislav Petkov @ 2012-12-18 18:37 ` Joe Perches 2012-12-18 19:00 ` Mauro Carvalho Chehab 0 siblings, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-18 18:37 UTC (permalink / raw) To: Borislav Petkov; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Tue, 2012-12-18 at 19:33 +0100, Borislav Petkov wrote: > On Tue, Dec 18, 2012 at 10:24:29AM -0800, Joe Perches wrote: > > or without all the unnecessary parens and using char: > > > > pr_cont("%cBUFF parity error\n", xec == 4 ? 'I' : 'O'); > > Char is fine, the parens make this more readable when you look at it > again after a couple of months :-). <shrug> I think the unnecessary parens make it _less_ readable because you need to parse more stuff and it's easy to misplace those parens. cheers, Joe ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 18:37 ` Joe Perches @ 2012-12-18 19:00 ` Mauro Carvalho Chehab 0 siblings, 0 replies; 25+ messages in thread From: Mauro Carvalho Chehab @ 2012-12-18 19:00 UTC (permalink / raw) To: Joe Perches Cc: Borislav Petkov, Jacob Shin, Doug Thompson, linux-edac, linux-kernel Em 18-12-2012 16:37, Joe Perches escreveu: > On Tue, 2012-12-18 at 19:33 +0100, Borislav Petkov wrote: >> On Tue, Dec 18, 2012 at 10:24:29AM -0800, Joe Perches wrote: >>> or without all the unnecessary parens and using char: >>> >>> pr_cont("%cBUFF parity error\n", xec == 4 ? 'I' : 'O'); >> >> Char is fine, the parens make this more readable when you look at it >> again after a couple of months :-). > > <shrug> > > I think the unnecessary parens make it _less_ readable > because you need to parse more stuff and it's easy to > misplace those parens. I agree with Boris: the parenthesis at the test (xec == 4) makes it easier to read, IMO. Regards, Mauro ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 19:39 [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Jacob Shin 2012-12-17 19:39 ` [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin @ 2012-12-17 19:57 ` Joe Perches 2012-12-17 20:05 ` Borislav Petkov 1 sibling, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-17 19:57 UTC (permalink / raw) To: Jacob Shin; +Cc: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Mon, 2012-12-17 at 13:39 -0600, Jacob Shin wrote: > Currently only AMD Family 15h processors have special handling for MC2 > errors, since upcoming Family 16h will also need unique handling, > let's make MC2 handling part of amd_decoder_ops. [] > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c [] > @@ -399,12 +399,9 @@ static void decode_mc1_mce(struct mce *m) > pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); > } > > -static void decode_mc2_mce(struct mce *m) > +static bool k8_mc2_mce(u16 ec, u8 xec) > { > - u16 ec = EC(m->status); > - u8 xec = XEC(m->status, xec_mask); > - > - pr_emerg(HW_ERR "MC2 Error"); > + bool ret = true; > > if (xec == 0x1) > pr_cont(" in the write data buffers.\n"); It'd be better to change the pr_cont uses to pr_emerg [] > +static void decode_mc2_mce(struct mce *m) > +{ > + u16 ec = EC(m->status); > + u8 xec = XEC(m->status, xec_mask); > + > + pr_emerg(HW_ERR "MC2 Error: "); Remove this and > + if (fam_ops->mc2_mce(ec, xec)) > + ; > + else > + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); > } And make this if (!fam_ops->mc2_mce(ec, xec)) pr_emerg(etc...); ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 19:57 ` [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Joe Perches @ 2012-12-17 20:05 ` Borislav Petkov 2012-12-17 20:16 ` Joe Perches 0 siblings, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-17 20:05 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 11:57:01AM -0800, Joe Perches wrote: > It'd be better to change the pr_cont uses to pr_emerg > > [] > > > +static void decode_mc2_mce(struct mce *m) > > +{ > > + u16 ec = EC(m->status); > > + u8 xec = XEC(m->status, xec_mask); > > + > > + pr_emerg(HW_ERR "MC2 Error: "); > > Remove this and > > > + if (fam_ops->mc2_mce(ec, xec)) > > + ; > > + else > > + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); > > } > > And make this > > if (!fam_ops->mc2_mce(ec, xec)) > pr_emerg(etc...); No, this is not how we do this here. We do pr_emerg in the main per-bank function, i.e. mc0, mc1, mc2... and we finish the line in the respective function with pr_cont. If your fear is line interleaving, then this shouldn't happen in most cases because we're in atomic #MC context and nothing else is executing in that case. (And I haven't seen it interleave in all my testing so far). If it does interleave when we have a non-critical error detected and reported in process context, then this whole decoding code needs a lot more work than this. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:05 ` Borislav Petkov @ 2012-12-17 20:16 ` Joe Perches 2012-12-17 20:22 ` Borislav Petkov 0 siblings, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-17 20:16 UTC (permalink / raw) To: Borislav Petkov; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Mon, 2012-12-17 at 21:05 +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 11:57:01AM -0800, Joe Perches wrote: > > It'd be better to change the pr_cont uses to pr_emerg > > > > [] > > > > > +static void decode_mc2_mce(struct mce *m) > > > +{ > > > + u16 ec = EC(m->status); > > > + u8 xec = XEC(m->status, xec_mask); > > > + > > > + pr_emerg(HW_ERR "MC2 Error: "); > > > > Remove this and > > > > > + if (fam_ops->mc2_mce(ec, xec)) > > > + ; > > > + else > > > + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); > > > } > > > > And make this > > > > if (!fam_ops->mc2_mce(ec, xec)) > > pr_emerg(etc...); > > No, this is not how we do this here. We do pr_emerg in the main per-bank > function, i.e. mc0, mc1, mc2... and we finish the line in the respective > function with pr_cont. > > If your fear is line interleaving, then this shouldn't happen in most > cases because we're in atomic #MC context and nothing else is executing > in that case. (And I haven't seen it interleave in all my testing so > far). > > If it does interleave when we have a non-critical error detected and > reported in process context, then this whole decoding code needs a lot > more work than this. > You also have paths where you start a pr_emerg and do not terminate it when there is no apparent error at all. ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:16 ` Joe Perches @ 2012-12-17 20:22 ` Borislav Petkov 2012-12-17 20:34 ` Joe Perches 0 siblings, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-17 20:22 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 12:16:16PM -0800, Joe Perches wrote: > You also have paths where you start a pr_emerg and do not terminate it > when there is no apparent error at all. For example? -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:22 ` Borislav Petkov @ 2012-12-17 20:34 ` Joe Perches 2012-12-17 20:40 ` Borislav Petkov 0 siblings, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-17 20:34 UTC (permalink / raw) To: Borislav Petkov; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Mon, 2012-12-17 at 21:22 +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 12:16:16PM -0800, Joe Perches wrote: > > You also have paths where you start a pr_emerg and do not terminate it > > when there is no apparent error at all. > > For example? > decode_mc2_mce does pr_emerg(HW_ERR "MC2 Error: "); calls fam_ops->mc2_mce() which may not emit anything and if it doesn't, does pr_emerg(HW_ERR "Corrupted ..."); on another line leaving a trailing colon without newline from the pr_emerg above. I believe the second pr_emerg should be pr_cont cheers, Joe ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:34 ` Joe Perches @ 2012-12-17 20:40 ` Borislav Petkov 2012-12-17 20:43 ` Jacob Shin 0 siblings, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-17 20:40 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 12:34:35PM -0800, Joe Perches wrote: > decode_mc2_mce does > > pr_emerg(HW_ERR "MC2 Error: "); > calls > fam_ops->mc2_mce() > which may not emit anything > and if it doesn't, does > pr_emerg(HW_ERR "Corrupted ..."); > on another line leaving a trailing colon without > newline from the pr_emerg above. Ok, this is in Jacob's patch AFAICT, I'll review those tomorrow but thanks for the heads-up. > I believe the second pr_emerg should be pr_cont Actually, it should be pr_emerg(HW_ERR "\nCorrupted...\n"); to finish the initial pr_emerg. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:40 ` Borislav Petkov @ 2012-12-17 20:43 ` Jacob Shin 2012-12-17 20:59 ` Joe Perches 0 siblings, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-17 20:43 UTC (permalink / raw) To: Borislav Petkov, Joe Perches, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 09:40:11PM +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 12:34:35PM -0800, Joe Perches wrote: > > decode_mc2_mce does > > > > pr_emerg(HW_ERR "MC2 Error: "); > > calls > > fam_ops->mc2_mce() > > which may not emit anything > > and if it doesn't, does > > pr_emerg(HW_ERR "Corrupted ..."); > > on another line leaving a trailing colon without > > newline from the pr_emerg above. > > Ok, this is in Jacob's patch AFAICT, I'll review those tomorrow but > thanks for the heads-up. Okay, I'll wait until your feedback before sending V2 > > > I believe the second pr_emerg should be pr_cont > > Actually, it should be > > pr_emerg(HW_ERR "\nCorrupted...\n"); > > to finish the initial pr_emerg. Okay, all of the other decode_mc*_mce functions needs to change then. Thanks! > > Thanks. > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- > ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:43 ` Jacob Shin @ 2012-12-17 20:59 ` Joe Perches 2012-12-17 21:08 ` Borislav Petkov 2012-12-17 21:09 ` Jacob Shin 0 siblings, 2 replies; 25+ messages in thread From: Joe Perches @ 2012-12-17 20:59 UTC (permalink / raw) To: Jacob Shin; +Cc: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Mon, 2012-12-17 at 14:43 -0600, Jacob Shin wrote: > On Mon, Dec 17, 2012 at 09:40:11PM +0100, Borislav Petkov wrote: > > On Mon, Dec 17, 2012 at 12:34:35PM -0800, Joe Perches wrote: > > > decode_mc2_mce does > > > > > > pr_emerg(HW_ERR "MC2 Error: "); > > > calls > > > fam_ops->mc2_mce() > > > which may not emit anything > > > and if it doesn't, does > > > pr_emerg(HW_ERR "Corrupted ..."); > > > on another line leaving a trailing colon without > > > newline from the pr_emerg above. > > > > Ok, this is in Jacob's patch AFAICT, I'll review those tomorrow but > > thanks for the heads-up. > > Okay, I'll wait until your feedback before sending V2 > > > > > > I believe the second pr_emerg should be pr_cont > > > > Actually, it should be > > > > pr_emerg(HW_ERR "\nCorrupted...\n"); > > > > to finish the initial pr_emerg. > > Okay, all of the other decode_mc*_mce functions needs to change then. Hi Jacob. I don't think that's necessary. Using any pr_<level>("\n...") is just not good form. I think the pr_cont would be fine, it's the same form as all the other decoded continuations. The new pr_emerg is effectively a termination of the previous one. There's no need for 2 consecutive uses of pr_emerg(HW_ERR ...) here. cheers, Joe ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:59 ` Joe Perches @ 2012-12-17 21:08 ` Borislav Petkov 2012-12-17 21:11 ` Jacob Shin 2012-12-17 21:09 ` Jacob Shin 1 sibling, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-17 21:08 UTC (permalink / raw) To: Jacob Shin; +Cc: Joe Perches, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 12:59:00PM -0800, Joe Perches wrote: > The new pr_emerg is effectively a termination of the > previous one. There's no need for 2 consecutive uses > of pr_emerg(HW_ERR ...) here. Actually, in decode_mc5_mce(), the second pr_emerg needs to stay like what it is now. So Jacob, please leave those changes out, I'll sort them out later. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 21:08 ` Borislav Petkov @ 2012-12-17 21:11 ` Jacob Shin 0 siblings, 0 replies; 25+ messages in thread From: Jacob Shin @ 2012-12-17 21:11 UTC (permalink / raw) To: Borislav Petkov, Joe Perches, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 10:08:02PM +0100, Borislav Petkov wrote: > On Mon, Dec 17, 2012 at 12:59:00PM -0800, Joe Perches wrote: > > The new pr_emerg is effectively a termination of the > > previous one. There's no need for 2 consecutive uses > > of pr_emerg(HW_ERR ...) here. > > Actually, in decode_mc5_mce(), the second pr_emerg needs to stay like > what it is now. > > So Jacob, please leave those changes out, I'll sort them out later. Okay, will do, I'll change fix it in my changes then. But I'll wait for your feedback tomorrow, before sending out V2. Thanks! -Jacob > > Thanks. > > -- > Regards/Gruss, > Boris. > > Sent from a fat crate under my desk. Formatting is fine. > -- > ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well 2012-12-17 20:59 ` Joe Perches 2012-12-17 21:08 ` Borislav Petkov @ 2012-12-17 21:09 ` Jacob Shin 1 sibling, 0 replies; 25+ messages in thread From: Jacob Shin @ 2012-12-17 21:09 UTC (permalink / raw) To: Joe Perches; +Cc: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Mon, Dec 17, 2012 at 12:59:00PM -0800, Joe Perches wrote: > On Mon, 2012-12-17 at 14:43 -0600, Jacob Shin wrote: > > On Mon, Dec 17, 2012 at 09:40:11PM +0100, Borislav Petkov wrote: > > > On Mon, Dec 17, 2012 at 12:34:35PM -0800, Joe Perches wrote: > > > > decode_mc2_mce does > > > > > > > > pr_emerg(HW_ERR "MC2 Error: "); > > > > calls > > > > fam_ops->mc2_mce() > > > > which may not emit anything > > > > and if it doesn't, does > > > > pr_emerg(HW_ERR "Corrupted ..."); > > > > on another line leaving a trailing colon without > > > > newline from the pr_emerg above. > > > > > > Ok, this is in Jacob's patch AFAICT, I'll review those tomorrow but > > > thanks for the heads-up. > > > > Okay, I'll wait until your feedback before sending V2 > > > > > > > > > I believe the second pr_emerg should be pr_cont > > > > > > Actually, it should be > > > > > > pr_emerg(HW_ERR "\nCorrupted...\n"); > > > > > > to finish the initial pr_emerg. > > > > Okay, all of the other decode_mc*_mce functions needs to change then. > > Hi Jacob. > > I don't think that's necessary. > > Using any pr_<level>("\n...") is just not good form. > I think the pr_cont would be fine, it's the same > form as all the other decoded continuations. > > The new pr_emerg is effectively a termination of the > previous one. There's no need for 2 consecutive uses > of pr_emerg(HW_ERR ...) here. Ah, okay, I'll use pr_cont. Thanks! -Jacob > > cheers, Joe > > ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH V2 0/2] MCE, AMD: MCE decoding support for AMD Family 16h @ 2012-12-18 21:06 Jacob Shin 2012-12-18 21:06 ` [PATCH 2/2] " Jacob Shin 0 siblings, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-18 21:06 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson; +Cc: linux-edac, linux-kernel, Jacob Shin The following patchset enables MCE decoding support for AMD Family 16h processors. Changes in V2: * Changed if/else style and pr_cont usage per feedback from: https://lkml.org/lkml/2012/12/17/365 * Merged f14h and f16h mc0 and mc2 decoding into common function per feedback from: https://lkml.org/lkml/2012/12/18/269 * Fixed typo in INT_ERROR decoding. Jacob Shin (2): MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well MCE, AMD: MCE decoding support for AMD Family 16h drivers/edac/mce_amd.c | 139 +++++++++++++++++++++++++++++++++++------------- drivers/edac/mce_amd.h | 4 ++ 2 files changed, 105 insertions(+), 38 deletions(-) -- 1.7.9.5 ^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 21:06 [PATCH V2 0/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin @ 2012-12-18 21:06 ` Jacob Shin 2012-12-18 22:10 ` Joe Perches 0 siblings, 1 reply; 25+ messages in thread From: Jacob Shin @ 2012-12-18 21:06 UTC (permalink / raw) To: Borislav Petkov, Doug Thompson; +Cc: linux-edac, linux-kernel, Jacob Shin Add MCE decoding logic for AMD Family 16h processors. Signed-off-by: Jacob Shin <jacob.shin@amd.com> --- drivers/edac/mce_amd.c | 83 +++++++++++++++++++++++++++++++++++++++++------- drivers/edac/mce_amd.h | 3 ++ 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e4752be..ce4c8b7 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); +/* internal error type */ +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; +EXPORT_SYMBOL_GPL(uu_msgs); + static const char * const f15h_mc1_mce_desc[] = { "UC during a demand linefill from L2", "Parity error during data load from IC", @@ -176,7 +180,7 @@ static bool k8_mc0_mce(u16 ec, u8 xec) return f10h_mc0_mce(ec, xec); } -static bool f14h_mc0_mce(u16 ec, u8 xec) +static bool cat_mc0_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; @@ -330,22 +334,27 @@ static bool k8_mc1_mce(u16 ec, u8 xec) return ret; } -static bool f14h_mc1_mce(u16 ec, u8 xec) +static bool cat_mc1_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; if (MEM_ERROR(ec)) { - if (TT(ec) != 0 || LL(ec) != 1) + if (TT(ec) != TT_INSTR) ret = false; - - if (r4 == R4_IRD) + else if (r4 == R4_IRD) pr_cont("Data/tag array parity error for a tag hit.\n"); else if (r4 == R4_SNOOP) pr_cont("Tag error during snoop/victimization.\n"); + else if (xec == 0x0) + pr_cont("Tag parity error from victim castout.\n"); + else if (xec == 0x2) + pr_cont("Microcode patch RAM parity error.\n"); else ret = false; - } + } else + ret = false; + return ret; } @@ -469,6 +478,47 @@ static bool f15h_mc2_mce(u16 ec, u8 xec) return ret; } +static bool f16h_mc2_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + + if (!MEM_ERROR(ec)) + return false; + + switch (xec) { + case 0x04 ... 0x05: + pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); + break; + + case 0x09 ... 0x0b: + case 0x0d ... 0x0f: + pr_cont("ECC error in L2 tag (%s).\n", + ((r4 == R4_GEN) ? "BankReq" : + ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); + break; + + case 0x10 ... 0x19: + case 0x1b: + pr_cont("ECC error in L2 data array (%s).\n", + (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : + ((r4 == R4_EVICT) ? "Vict" : "Fill")))); + break; + + case 0x1c ... 0x1d: + case 0x1f: + pr_cont("Parity error in L2 attribute bits (%s).\n", + ((r4 == R4_RD) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : "Fill"))); + break; + + default: + return false; + } + + return true; +} + static void decode_mc2_mce(struct mce *m) { u16 ec = EC(m->status); @@ -546,7 +596,7 @@ static void decode_mc4_mce(struct mce *m) return; case 0x19: - if (boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16) pr_cont("Compute Unit Data Error.\n"); else goto wrong_mc4_mce; @@ -632,6 +682,10 @@ static void decode_mc6_mce(struct mce *m) static inline void amd_decode_err_code(u16 ec) { + if (INT_ERROR(ec)) { + pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); + return; + } pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); @@ -736,7 +790,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15) + if (c->x86 == 0x15 || c->x86 == 0x16) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); @@ -768,7 +822,7 @@ static int __init mce_amd_init(void) if (c->x86_vendor != X86_VENDOR_AMD) return 0; - if (c->x86 < 0xf || c->x86 > 0x15) + if (c->x86 < 0xf || c->x86 > 0x16) return 0; fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); @@ -802,8 +856,8 @@ static int __init mce_amd_init(void) case 0x14: nb_err_cpumask = 0x3; - fam_ops->mc0_mce = f14h_mc0_mce; - fam_ops->mc1_mce = f14h_mc1_mce; + fam_ops->mc0_mce = cat_mc0_mce; + fam_ops->mc1_mce = cat_mc1_mce; fam_ops->mc2_mce = k8_mc2_mce; break; @@ -814,6 +868,13 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f15h_mc2_mce; break; + case 0x16: + xec_mask = 0x1f; + fam_ops->mc0_mce = cat_mc0_mce; + fam_ops->mc1_mce = cat_mc1_mce; + fam_ops->mc2_mce = f16h_mc2_mce; + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); kfree(fam_ops); diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 000f6e2..2828908 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -14,6 +14,7 @@ #define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) #define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) #define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) +#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) #define TT(x) (((x) >> 2) & 0x3) #define TT_MSG(x) tt_msgs[TT(x)] @@ -25,6 +26,8 @@ #define TO_MSG(x) to_msgs[TO(x)] #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define UU(x) (((x) >> 8) & 0x3) +#define UU_MSG(x) uu_msgs[UU(x)] #define R4(x) (((x) >> 4) & 0xf) #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") -- 1.7.9.5 ^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 21:06 ` [PATCH 2/2] " Jacob Shin @ 2012-12-18 22:10 ` Joe Perches 2012-12-18 22:56 ` Borislav Petkov 0 siblings, 1 reply; 25+ messages in thread From: Joe Perches @ 2012-12-18 22:10 UTC (permalink / raw) To: Jacob Shin; +Cc: Borislav Petkov, Doug Thompson, linux-edac, linux-kernel On Tue, 2012-12-18 at 15:06 -0600, Jacob Shin wrote: > Add MCE decoding logic for AMD Family 16h processors. More trivia: > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c [] > @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); > const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; > EXPORT_SYMBOL_GPL(ii_msgs); > > +/* internal error type */ > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; > +EXPORT_SYMBOL_GPL(uu_msgs); Do these really need to be exported? These don't seem to be used outside of mce_amd. If these really need export, are these very good names? Perhaps these should be prefixed with something? ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 22:10 ` Joe Perches @ 2012-12-18 22:56 ` Borislav Petkov 2012-12-20 19:20 ` Borislav Petkov 0 siblings, 1 reply; 25+ messages in thread From: Borislav Petkov @ 2012-12-18 22:56 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 02:10:59PM -0800, Joe Perches wrote: > On Tue, 2012-12-18 at 15:06 -0600, Jacob Shin wrote: > > Add MCE decoding logic for AMD Family 16h processors. > > More trivia: > > > diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c > [] > > @@ -64,6 +64,10 @@ EXPORT_SYMBOL_GPL(to_msgs); > > const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; > > EXPORT_SYMBOL_GPL(ii_msgs); > > > > +/* internal error type */ > > +const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; > > +EXPORT_SYMBOL_GPL(uu_msgs); > > Do these really need to be exported? > These don't seem to be used outside of mce_amd. > > If these really need export, are these very good names? > Perhaps these should be prefixed with something? Hmm, some of those were used in amd64_edac at some point but then the whole decoding code got concentrated into mce_amd.c but the exports remained. PP() and thus pp_msgs is still used there, for example. So, if you'd like, you could wait until I apply Jacob's patches and push that git branch to kernel.org and then write a patch ontop removing all the exports except pp_msgs (you could add them to the "amd_" namespace by calling it amd_pp_msgs or something) test that patch and send it to me. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h 2012-12-18 22:56 ` Borislav Petkov @ 2012-12-20 19:20 ` Borislav Petkov 0 siblings, 0 replies; 25+ messages in thread From: Borislav Petkov @ 2012-12-20 19:20 UTC (permalink / raw) To: Joe Perches; +Cc: Jacob Shin, Doug Thompson, linux-edac, linux-kernel On Tue, Dec 18, 2012 at 11:56:10PM +0100, Borislav Petkov wrote: > So, if you'd like, you could wait until I apply Jacob's patches and > push that git branch to kernel.org and then write a patch ontop > removing all the exports except pp_msgs (you could add them to the > "amd_" namespace by calling it amd_pp_msgs or something) test that > patch and send it to me. Ok, I've applied Jacob's stuff. You can do your patch ontop of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git edac-for-3.9 and send it to me. Thanks. -- Regards/Gruss, Boris. Sent from a fat crate under my desk. Formatting is fine. -- ^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2012-12-20 19:20 UTC | newest] Thread overview: 25+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2012-12-17 19:39 [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Jacob Shin 2012-12-17 19:39 ` [PATCH 2/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin 2012-12-18 17:19 ` Borislav Petkov 2012-12-18 17:30 ` Jacob Shin 2012-12-18 18:09 ` Jacob Shin 2012-12-18 18:32 ` Borislav Petkov 2012-12-18 18:24 ` Joe Perches 2012-12-18 18:33 ` Borislav Petkov 2012-12-18 18:37 ` Joe Perches 2012-12-18 19:00 ` Mauro Carvalho Chehab 2012-12-17 19:57 ` [PATCH 1/2] MCE, AMD: Make MC2 decoding part of amd_decoder_ops as well Joe Perches 2012-12-17 20:05 ` Borislav Petkov 2012-12-17 20:16 ` Joe Perches 2012-12-17 20:22 ` Borislav Petkov 2012-12-17 20:34 ` Joe Perches 2012-12-17 20:40 ` Borislav Petkov 2012-12-17 20:43 ` Jacob Shin 2012-12-17 20:59 ` Joe Perches 2012-12-17 21:08 ` Borislav Petkov 2012-12-17 21:11 ` Jacob Shin 2012-12-17 21:09 ` Jacob Shin -- strict thread matches above, loose matches on Subject: below -- 2012-12-18 21:06 [PATCH V2 0/2] MCE, AMD: MCE decoding support for AMD Family 16h Jacob Shin 2012-12-18 21:06 ` [PATCH 2/2] " Jacob Shin 2012-12-18 22:10 ` Joe Perches 2012-12-18 22:56 ` Borislav Petkov 2012-12-20 19:20 ` Borislav Petkov
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox