* [RFC PATCH 2/4] powerpc/pseries: Define MCE error event section.
2018-06-06 4:36 [RFC PATCH 0/4] powerpc/pseries: Machien check handler improvements Mahesh J Salgaonkar
2018-06-06 4:36 ` [RFC PATCH 1/4] powerpc/pseries: convert rtas_log_buf to linear allocation Mahesh J Salgaonkar
@ 2018-06-06 4:37 ` Mahesh J Salgaonkar
2018-06-06 4:37 ` [RFC PATCH 3/4] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors Mahesh J Salgaonkar
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Mahesh J Salgaonkar @ 2018-06-06 4:37 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Aneesh Kumar K.V, Michael Ellerman, Laurent Dufour
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
On pseries, the machine check error details are part of the RTAS extended
event log, passed in the machine check exception section. This patch adds
the definition of the RTAS MCE event section and the related helper
functions.
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/rtas.h | 104 +++++++++++++++++++++++++++++++++++++++
1 file changed, 104 insertions(+)
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index ec9dd79398ee..3f2fba7ef23b 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -275,6 +275,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log)
#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H')
#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D')
#define PSERIES_ELOG_SECT_ID_HOTPLUG (('H' << 8) | 'P')
+#define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C')
/* Vendor specific Platform Event Log Format, Version 6, section header */
struct pseries_errorlog {
@@ -326,6 +327,109 @@ struct pseries_hp_errorlog {
#define PSERIES_HP_ELOG_ID_DRC_COUNT 3
#define PSERIES_HP_ELOG_ID_DRC_IC 4
+/*
+ * RTAS pseries MCE errorlog section.
+ *
+ * Layout of the machine check section of the RTAS extended event log
+ * (looked up with section ID PSERIES_ELOG_SECT_ID_MCE). The struct is
+ * declared between #pragma pack(push, 1) and #pragma pack(pop), so its
+ * members are laid out back to back without compiler-inserted padding.
+ *
+ * error_type selects the valid union member: the helpers below read
+ * u.ue_error for PSERIES_MC_ERROR_TYPE_UE and u.soft_error for the SLB,
+ * ERAT and TLB types. The __be32/__be64 members are big-endian.
+ */
+#pragma pack(push, 1)
+struct pseries_mc_errorlog {
+ __be32 fru_id;
+ __be32 proc_id;
+ uint8_t error_type; /* one of PSERIES_MC_ERROR_TYPE_* */
+ union {
+ struct {
+ uint8_t ue_err_type;
+ /* Bit layout of ue_err_type, most significant bit first:
+ * XXXXXXXX
+ * X        1: Permanent or Transient UE.
+ *  X       1: Effective address provided (mask 0x40).
+ *   X      1: Logical address provided.
+ *    XX    2: Reserved.
+ *      XXX 3: Type of UE error (mask 0x07).
+ */
+ uint8_t reserved_1[6];
+ __be64 effective_address; /* only meaningful when bit 0x40 is set */
+ __be64 logical_address;
+ } ue_error;
+ struct {
+ uint8_t soft_err_type;
+ /* Bit layout of soft_err_type, most significant bit first:
+ * XXXXXXXX
+ * X        1: Effective address provided (mask 0x80).
+ *  XXXXX   5: Reserved.
+ *       XX 2: Type of SLB/ERAT/TLB error (mask 0x03).
+ */
+ uint8_t reserved_1[6];
+ __be64 effective_address; /* only meaningful when bit 0x80 is set */
+ uint8_t reserved_2[8];
+ } soft_error;
+ } u;
+};
+#pragma pack(pop)
+
+/*
+ * RTAS pseries MCE error types: the values of
+ * pseries_mc_errorlog.error_type, as returned by rtas_mc_error_type().
+ * The values 0x03 and 0x06 have no name defined here.
+ */
+#define PSERIES_MC_ERROR_TYPE_UE 0x00
+#define PSERIES_MC_ERROR_TYPE_SLB 0x01
+#define PSERIES_MC_ERROR_TYPE_ERAT 0x02
+#define PSERIES_MC_ERROR_TYPE_TLB 0x04
+#define PSERIES_MC_ERROR_TYPE_D_CACHE 0x05
+#define PSERIES_MC_ERROR_TYPE_I_CACHE 0x07
+
+/*
+ * RTAS pseries MCE error sub types, as returned by
+ * rtas_mc_error_sub_type(): the low 3 bits of ue_err_type for UE errors
+ * and the low 2 bits of soft_err_type for SLB, ERAT and TLB errors.
+ * Note that the SLB values start at 0 while the ERAT and TLB values
+ * start at 1.
+ */
+#define PSERIES_MC_ERROR_UE_INDETERMINATE 0
+#define PSERIES_MC_ERROR_UE_IFETCH 1
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2
+#define PSERIES_MC_ERROR_UE_LOAD_STORE 3
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4
+
+#define PSERIES_MC_ERROR_SLB_PARITY 0
+#define PSERIES_MC_ERROR_SLB_MULTIHIT 1
+#define PSERIES_MC_ERROR_SLB_INDETERMINATE 2
+
+#define PSERIES_MC_ERROR_ERAT_PARITY 1
+#define PSERIES_MC_ERROR_ERAT_MULTIHIT 2
+#define PSERIES_MC_ERROR_ERAT_INDETERMINATE 3
+
+#define PSERIES_MC_ERROR_TLB_PARITY 1
+#define PSERIES_MC_ERROR_TLB_MULTIHIT 2
+#define PSERIES_MC_ERROR_TLB_INDETERMINATE 3
+
+/*
+ * rtas_mc_error_type - read the error type of an MCE errorlog section.
+ * @mlog: machine check section of the RTAS extended event log.
+ *
+ * Returns the raw error_type byte, to be compared against the
+ * PSERIES_MC_ERROR_TYPE_* values.
+ */
+static inline uint8_t rtas_mc_error_type(const struct pseries_mc_errorlog *mlog)
+{
+	const uint8_t type = mlog->error_type;
+
+	return type;
+}
+
+/*
+ * rtas_mc_error_sub_type - read the error sub type of an MCE errorlog section.
+ * @mlog: machine check section of the RTAS extended event log.
+ *
+ * For UE errors the sub type is the low 3 bits of ue_err_type; for SLB,
+ * ERAT and TLB errors it is the low 2 bits of soft_err_type. Every other
+ * error type yields 0.
+ */
+static inline uint8_t rtas_mc_error_sub_type(
+					const struct pseries_mc_errorlog *mlog)
+{
+	const uint8_t type = mlog->error_type;
+
+	if (type == PSERIES_MC_ERROR_TYPE_UE)
+		return mlog->u.ue_error.ue_err_type & 0x07;
+
+	if (type == PSERIES_MC_ERROR_TYPE_SLB ||
+	    type == PSERIES_MC_ERROR_TYPE_ERAT ||
+	    type == PSERIES_MC_ERROR_TYPE_TLB)
+		return mlog->u.soft_error.soft_err_type & 0x03;
+
+	return 0;
+}
+
+/*
+ * rtas_mc_get_effective_addr - read the effective address of an MCE, if any.
+ * @mlog: machine check section of the RTAS extended event log.
+ *
+ * The address is only taken from the log when the "effective address
+ * provided" bit is set: 0x40 in ue_err_type for UE errors, 0x80 in
+ * soft_err_type for SLB, ERAT and TLB errors.
+ *
+ * Returns the effective address in CPU byte order, or 0 when the log does
+ * not provide one or the error type carries no address.
+ */
+static inline uint64_t rtas_mc_get_effective_addr(
+					const struct pseries_mc_errorlog *mlog)
+{
+	/* Kept as __be64 until the single conversion at the end. */
+	__be64 addr = 0;
+
+	switch (mlog->error_type) {
+	case PSERIES_MC_ERROR_TYPE_UE:
+		if (mlog->u.ue_error.ue_err_type & 0x40)
+			addr = mlog->u.ue_error.effective_address;
+		break;
+	case PSERIES_MC_ERROR_TYPE_SLB:
+	case PSERIES_MC_ERROR_TYPE_ERAT:
+	case PSERIES_MC_ERROR_TYPE_TLB:
+		if (mlog->u.soft_error.soft_err_type & 0x80)
+			addr = mlog->u.soft_error.effective_address;
+		break;
+	default:
+		break;
+	}
+	return be64_to_cpu(addr);
+}
+
struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
uint16_t section_id);
^ permalink raw reply related [flat|nested] 6+ messages in thread* [RFC PATCH 3/4] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors.
2018-06-06 4:36 [RFC PATCH 0/4] powerpc/pseries: Machien check handler improvements Mahesh J Salgaonkar
2018-06-06 4:36 ` [RFC PATCH 1/4] powerpc/pseries: convert rtas_log_buf to linear allocation Mahesh J Salgaonkar
2018-06-06 4:37 ` [RFC PATCH 2/4] powerpc/pseries: Define MCE error event section Mahesh J Salgaonkar
@ 2018-06-06 4:37 ` Mahesh J Salgaonkar
2018-06-06 4:37 ` [RFC PATCH 4/4] powerpc/pseries: Display machine check error details Mahesh J Salgaonkar
2018-06-07 19:43 ` [RFC PATCH 0/4] powerpc/pseries: Machien check handler improvements Michal Suchánek
4 siblings, 0 replies; 6+ messages in thread
From: Mahesh J Salgaonkar @ 2018-06-06 4:37 UTC (permalink / raw)
To: linuxppc-dev
Cc: Aneesh Kumar K.V, Michael Ellerman, Aneesh Kumar K.V,
Michael Ellerman, Laurent Dufour
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
If we get a machine check exception due to SLB errors, dump the
current SLB contents, which is very helpful in debugging the
root cause of SLB errors. On pseries, as of today, the system crashes on SLB
errors. These are soft errors and can be fixed by flushing the SLBs, so
the kernel can continue to function instead of crashing. This patch
fixes that as well.
With this patch the console will log SLB contents like below on SLB MCE
errors:
[ 822.711728] slb contents:
[ 822.711730] 00 c000000008000000 400ea1b217000500
[ 822.711731] 1T ESID= c00000 VSID= ea1b217 LLP:100
[ 822.711732] 01 d000000008000000 400d43642f000510
[ 822.711733] 1T ESID= d00000 VSID= d43642f LLP:110
[ 822.711734] 09 f000000008000000 400a86c85f000500
[ 822.711736] 1T ESID= f00000 VSID= a86c85f LLP:100
[ 822.711737] 10 00007f0008000000 400d1f26e3000d90
[ 822.711738] 1T ESID= 7f VSID= d1f26e3 LLP:110
[ 822.711739] 11 0000000018000000 000e3615f520fd90
[ 822.711740] 256M ESID= 1 VSID= e3615f520f LLP:110
[ 822.711740] 12 d000000008000000 400d43642f000510
[ 822.711741] 1T ESID= d00000 VSID= d43642f LLP:110
[ 822.711742] 13 d000000008000000 400d43642f000510
[ 822.711743] 1T ESID= d00000 VSID= d43642f LLP:110
Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 +
arch/powerpc/mm/slb.c | 35 +++++++++++++++++++++++++
arch/powerpc/platforms/pseries/ras.c | 29 ++++++++++++++++++++-
3 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 50ed64fba4ae..c0da68927235 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -487,6 +487,7 @@ extern void hpte_init_native(void);
extern void slb_initialize(void);
extern void slb_flush_and_rebolt(void);
+extern void slb_dump_contents(void);
extern void slb_vmalloc_update(void);
extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 66577cc66dc9..799aa117cec3 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -145,6 +145,41 @@ void slb_flush_and_rebolt(void)
get_paca()->slb_cache_ptr = 0;
}
+/*
+ * Dump the current SLB contents to the console at KERN_ERR level.
+ *
+ * Entries 0 .. mmu_slb_size - 1 are read with slbmfee/slbmfev. Entries
+ * whose ESID and VSID words are both zero are skipped. Every other entry
+ * gets one line with its index and the two raw words; entries that also
+ * have SLB_ESID_V set get a second line with the decoded segment size,
+ * ESID, VSID and LLP bits.
+ */
+void slb_dump_contents(void)
+{
+	int i;
+	unsigned long e, v;
+	unsigned long llp;
+
+	pr_err("slb contents:\n");
+	for (i = 0; i < mmu_slb_size; i++) {
+		asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+		asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+
+		if (!e && !v)
+			continue;
+
+		/*
+		 * Terminate the record here: each pr_err() starts a new
+		 * log record, so the raw words and the decoded fields are
+		 * separate lines anyway.
+		 */
+		pr_err("%02d %016lx %016lx\n", i, e, v);
+
+		/* Nothing to decode for an entry without the valid bit. */
+		if (!(e & SLB_ESID_V))
+			continue;
+
+		llp = v & SLB_VSID_LLP;
+		if (v & SLB_VSID_B_1T) {
+			pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+			       GET_ESID_1T(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+			       llp);
+		} else {
+			pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+			       GET_ESID(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+			       llp);
+		}
+	}
+}
+
void slb_vmalloc_update(void)
{
unsigned long vflags;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 5e1ef9150182..7470a216cd6b 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -422,6 +422,31 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
}
+/*
+ * Try to handle the machine check described by @errp.
+ *
+ * Starts from the disposition reported in the RTAS error log. If the log
+ * has an MCE section whose error type is SLB and the disposition is
+ * RTAS_DISP_NOT_RECOVERED, the SLB contents are dumped, the SLB is
+ * flushed and rebolted, and the disposition becomes
+ * RTAS_DISP_FULLY_RECOVERED.
+ *
+ * Returns the resulting disposition.
+ */
+static int mce_handle_error(struct rtas_error_log *errp)
+{
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
+
+	/* Without an MCE section there is nothing more to look at. */
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (!pseries_log)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+	if (disposition == RTAS_DISP_NOT_RECOVERED &&
+	    rtas_mc_error_type(mce_log) == PSERIES_MC_ERROR_TYPE_SLB) {
+		slb_dump_contents();
+		slb_flush_and_rebolt();
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+	}
+
+out:
+	return disposition;
+}
+
/*
* See if we can recover from a machine check exception.
* This is only called on power4 (or above) and only via
@@ -434,7 +459,9 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
{
int recovered = 0;
- int disposition = rtas_error_disposition(err);
+ int disposition;
+
+ disposition = mce_handle_error(err);
if (!(regs->msr & MSR_RI)) {
/* If MSR_RI isn't set, we cannot recover */
^ permalink raw reply related [flat|nested] 6+ messages in thread* [RFC PATCH 4/4] powerpc/pseries: Display machine check error details.
2018-06-06 4:36 [RFC PATCH 0/4] powerpc/pseries: Machien check handler improvements Mahesh J Salgaonkar
` (2 preceding siblings ...)
2018-06-06 4:37 ` [RFC PATCH 3/4] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors Mahesh J Salgaonkar
@ 2018-06-06 4:37 ` Mahesh J Salgaonkar
2018-06-07 19:43 ` [RFC PATCH 0/4] powerpc/pseries: Machien check handler improvements Michal Suchánek
4 siblings, 0 replies; 6+ messages in thread
From: Mahesh J Salgaonkar @ 2018-06-06 4:37 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Aneesh Kumar K.V, Michael Ellerman, Laurent Dufour
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Extract the MCE error details from the RTAS extended log and display them on
the console.
With this patch you should now see mce logs like below:
[ 822.711745] Severe Machine check interrupt [Recovered]
[ 822.711746] Initiator: CPU
[ 822.711747] Error type: SLB [Multihit]
[ 822.711747] Effective address: d00000000c660000
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/rtas.h | 5 +
arch/powerpc/platforms/pseries/ras.c | 116 ++++++++++++++++++++++++++++++++++
2 files changed, 121 insertions(+)
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 3f2fba7ef23b..8100a95c133a 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -190,6 +190,11 @@ static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog)
return (elog->byte1 & 0x04) >> 2;
}
+/*
+ * rtas_error_initiator - read the initiator field of an RTAS error log.
+ * @elog: RTAS error log header.
+ *
+ * Returns the upper four bits of byte2 as a value in the range 0..15.
+ */
+static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog)
+{
+	return (elog->byte2 >> 4) & 0x0f;
+}
+
#define rtas_error_type(x) ((x)->byte3)
static inline
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 7470a216cd6b..09a172bb6fdb 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -422,6 +422,121 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
}
+/*
+ * Look up @val in the string table @ar; values past the end of the table
+ * yield "Unknown". @ar must be a real array, since ARRAY_SIZE() is applied
+ * to it. Arguments are parenthesized so that expressions can be passed
+ * safely; note that @val is evaluated twice.
+ */
+#define VAL_TO_STRING(ar, val)	\
+	(((val) < ARRAY_SIZE(ar)) ? (ar)[(val)] : "Unknown")
+
+/*
+ * Print the details of a machine check to the console.
+ * @errp: RTAS error log of the machine check.
+ * @disposition: final disposition; RTAS_DISP_FULLY_RECOVERED is reported
+ *               as "Recovered", anything else as "Not recovered".
+ *
+ * Prints nothing when the log has no MCE section. Otherwise prints the
+ * severity and disposition, the initiator, the error type (with its sub
+ * type for UE, SLB, ERAT and TLB errors) and, when it is non-zero, the
+ * effective address. The printk level is derived from the RTAS severity.
+ */
+static void pseries_print_mce_info(struct rtas_error_log *errp, int disposition)
+{
+	const char *level, *sevstr;
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	uint8_t error_type, err_sub_type;
+	uint8_t initiator = rtas_error_initiator(errp);
+	uint64_t addr;
+
+	static const char * const initiators[] = {
+		"Unknown",
+		"CPU",
+		"PCI",
+		"ISA",
+		"Memory",
+		"Power Mgmt",
+	};
+
+	/*
+	 * Indexed by the PSERIES_MC_ERROR_TYPE_* value itself. The values
+	 * are not contiguous (0x03 and 0x06 have no name), so the gaps get
+	 * explicit "Unknown" entries to keep each name at its own index.
+	 */
+	static const char * const mc_err_types[] = {
+		[PSERIES_MC_ERROR_TYPE_UE]	= "UE",
+		[PSERIES_MC_ERROR_TYPE_SLB]	= "SLB",
+		[PSERIES_MC_ERROR_TYPE_ERAT]	= "ERAT",
+		[0x03]				= "Unknown",
+		[PSERIES_MC_ERROR_TYPE_TLB]	= "TLB",
+		[PSERIES_MC_ERROR_TYPE_D_CACHE]	= "D-Cache",
+		[0x06]				= "Unknown",
+		[PSERIES_MC_ERROR_TYPE_I_CACHE]	= "I-Cache",
+	};
+	static const char * const mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+
+	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
+	static const char * const mc_slb_types[] = {
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
+	static const char * const mc_soft_types[] = {
+		"Unknown",
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		return;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+	error_type = rtas_mc_error_type(mce_log);
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	/* Pick the printk level and the severity word from the RTAS severity. */
+	switch (rtas_error_severity(errp)) {
+	case RTAS_SEVERITY_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case RTAS_SEVERITY_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case RTAS_SEVERITY_ERROR:
+	case RTAS_SEVERITY_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case RTAS_SEVERITY_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       disposition == RTAS_DISP_FULLY_RECOVERED ?
+	       "Recovered" : "Not recovered");
+	printk("%s Initiator: %s\n", level,
+	       VAL_TO_STRING(initiators, initiator));
+
+	/* Each error family has its own sub type table. */
+	switch (error_type) {
+	case PSERIES_MC_ERROR_TYPE_UE:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_ue_types, err_sub_type));
+		break;
+	case PSERIES_MC_ERROR_TYPE_SLB:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_slb_types, err_sub_type));
+		break;
+	case PSERIES_MC_ERROR_TYPE_ERAT:
+	case PSERIES_MC_ERROR_TYPE_TLB:
+		printk("%s Error type: %s [%s]\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type),
+		       VAL_TO_STRING(mc_soft_types, err_sub_type));
+		break;
+	default:
+		printk("%s Error type: %s\n", level,
+		       VAL_TO_STRING(mc_err_types, error_type));
+		break;
+	}
+
+	/* A zero address means the log did not provide one. */
+	addr = rtas_mc_get_effective_addr(mce_log);
+	if (addr)
+		printk("%s Effective address: %016llx\n", level, addr);
+}
+
static int mce_handle_error(struct rtas_error_log *errp)
{
struct pseries_errorlog *pseries_log;
@@ -442,6 +557,7 @@ static int mce_handle_error(struct rtas_error_log *errp)
slb_flush_and_rebolt();
disposition = RTAS_DISP_FULLY_RECOVERED;
}
+ pseries_print_mce_info(errp, disposition);
out:
return disposition;
^ permalink raw reply related [flat|nested] 6+ messages in thread