From: Borislav Petkov <bp@alien8.de>
To: "Chen, Gong" <gong.chen@linux.intel.com>
Cc: tony.luck@intel.com, m.chehab@samsung.com, arozansk@redhat.com,
linux-acpi@vger.kernel.org
Subject: Re: [PATCH 2/2] trace, RAS: Add eMCA trace event interface
Date: Fri, 7 Mar 2014 12:44:16 +0100 [thread overview]
Message-ID: <20140307114416.GA5255@pd.tnic> (raw)
In-Reply-To: <1393924997-8992-3-git-send-email-gong.chen@linux.intel.com>
On Tue, Mar 04, 2014 at 04:23:17AM -0500, Chen, Gong wrote:
> Add trace interface to elaborate all H/W error related information.
>
> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
> ---
> drivers/acpi/Kconfig | 3 +-
> drivers/acpi/Makefile | 1 +
> drivers/acpi/acpi_extlog.c | 131 +++++++++++++++++++++++++++++++++++++++++++-
> drivers/firmware/efi/cper.c | 13 ++++-
> include/linux/cper.h | 2 +
> include/ras/ras_event.h | 62 +++++++++++++++++++++
> kernel/trace/ras-traces.c | 1 +
> 7 files changed, 208 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
> index 4770de5..3e569d4 100644
> --- a/drivers/acpi/Kconfig
> +++ b/drivers/acpi/Kconfig
> @@ -363,6 +363,7 @@ config ACPI_EXTLOG
>
> Enhanced MCA Logging allows firmware to provide additional error
> information to system software, synchronous with MCE or CMCI. This
> - driver adds support for that functionality.
> + driver adds support for that functionality with corresponding
> + tracepoint which carries that information to userspace.
>
> endif # ACPI
> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index 0331f91..f6abc4a 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -82,4 +82,5 @@ obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
>
> obj-$(CONFIG_ACPI_APEI) += apei/
>
> +CFLAGS_acpi_extlog.o := -I$(src)
> obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o
> diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
> index c4a5d87..fbdebad 100644
> --- a/drivers/acpi/acpi_extlog.c
> +++ b/drivers/acpi/acpi_extlog.c
> @@ -14,8 +14,10 @@
> #include <linux/edac.h>
> #include <asm/cpu.h>
> #include <asm/mce.h>
> +#include <linux/dmi.h>
>
> #include "apei/apei-internal.h"
> +#include <ras/ras_event.h>
>
> #define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */
>
> @@ -44,6 +46,11 @@ struct extlog_l1_head {
> static int old_edac_report_status;
>
> static u8 extlog_dsm_uuid[] __initdata = "663E35AF-CC10-41A4-88EA-5470AF055295";
> +static const uuid_le invalid_uuid = NULL_UUID_LE;
> +
> +static DEFINE_RAW_SPINLOCK(trace_lock);
> +static char mem_location[LOC_LEN];
> +static char dimm_location[LOC_LEN];
>
> /* L1 table related physical address */
> static u64 elog_base;
> @@ -69,6 +76,106 @@ static u32 l1_percpu_entry;
> #define ELOG_ENTRY_ADDR(phyaddr) \
> (phyaddr - elog_base + (u8 *)elog_addr)
>
> +static void mem_err_location(struct cper_sec_mem_err *mem)
> +{
> + char *p;
> + u32 n = 0;
> +
> + memset(mem_location, 0, LOC_LEN);
> + p = mem_location;
> + if (mem->validation_bits & CPER_MEM_VALID_NODE)
> + n += sprintf(p + n, " node: %d", mem->node);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_CARD)
> + n += sprintf(p + n, " card: %d", mem->card);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_MODULE)
> + n += sprintf(p + n, " module: %d", mem->module);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
> + n += sprintf(p + n, " rank: %d", mem->rank);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_BANK)
> + n += sprintf(p + n, " bank: %d", mem->bank);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
> + n += sprintf(p + n, " device: %d", mem->device);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_ROW)
> + n += sprintf(p + n, " row: %d", mem->row);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
> + n += sprintf(p + n, " column: %d", mem->column);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> + n += sprintf(p + n, " bit_position: %d", mem->bit_pos);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
> + n += sprintf(p + n, " requestor_id: 0x%016llx",
> + mem->requestor_id);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
> + n += sprintf(p + n, " responder_id: 0x%016llx",
> + mem->responder_id);
> + if (n >= LOC_LEN)
> + goto end;
> + if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
> + n += sprintf(p + n, " target_id: 0x%016llx", mem->target_id);
> +end:
> + return;
> +}
Looks like this wants to share code with cper_print_mem() - definitely a
lot of duplication there.
> +
> +static void dimm_err_location(struct cper_sec_mem_err *mem)
> +{
> + const char *bank = NULL, *device = NULL;
> +
> + memset(dimm_location, 0, LOC_LEN);
> + if (!(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE))
> + return;
> +
> + dmi_memdev_name(mem->mem_dev_handle, &bank, &device);
> + if (bank != NULL && device != NULL)
> + snprintf(dimm_location, LOC_LEN - 1, "%s %s", bank, device);
> + else
> + snprintf(dimm_location, LOC_LEN - 1, "DMI handle: 0x%.4x",
> + mem->mem_dev_handle);
> +}
This one too.
> +
> +static void trace_mem_error(const uuid_le *fru_id, char *fru_text,
> + u64 err_count, u32 severity,
> + struct cper_sec_mem_err *mem)
> +{
> + u32 etype = ~0U;
> + u64 phy_addr = ~0ull;
I'm assuming userspace knows that all 1s means field value is invalid?
> + unsigned long flags;
> +
> + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
> + etype = mem->error_type;
newline.
> + if (mem->validation_bits & CPER_MEM_VALID_PA) {
> + phy_addr = mem->physical_addr;
> + if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
> + phy_addr &= mem->physical_addr_mask;
> + }
> +
> + raw_spin_lock_irqsave(&trace_lock, flags);
> + mem_err_location(mem);
> + dimm_err_location(mem);
> +
> + trace_extlog_mem_event(etype, dimm_location, fru_id, fru_text,
> + err_count, severity, phy_addr, mem_location);
> + raw_spin_unlock_irqrestore(&trace_lock, flags);
> +}
> +
> static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank)
> {
> int idx;
> @@ -137,7 +244,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
> struct mce *mce = (struct mce *)data;
> int bank = mce->bank;
> int cpu = mce->extcpu;
> - struct acpi_generic_status *estatus;
> + struct acpi_generic_status *estatus, *tmp;
> + struct acpi_generic_data *gdata;
> + const uuid_le *fru_id = &invalid_uuid;
> + char *fru_text = "";
> + uuid_le *sec_type;
> + static u64 err_count;
> int rc;
>
> estatus = extlog_elog_entry_check(cpu, bank);
> @@ -149,6 +261,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
> estatus->block_status = 0;
>
> rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
> + tmp = (struct acpi_generic_status *)elog_buf;
> + gdata = (struct acpi_generic_data *)(tmp + 1);
> + rc = print_extlog_rcd(NULL, tmp, cpu);
We probably need a mechanism to disable printk'ing to dmesg once
userspace has opened the tracepoint.
> + /* trace extended error log */
> + err_count++;
> + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
> + fru_id = (uuid_le *)gdata->fru_id;
> + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
> + fru_text = gdata->fru_text;
> + sec_type = (uuid_le *)gdata->section_type;
> + if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
> + struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
> + if (gdata->error_data_length >= sizeof(*mem_err))
> + trace_mem_error(fru_id, fru_text, err_count,
> + gdata->error_severity, mem_err);
> + }
>
> return NOTIFY_STOP;
> }
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index 1491dd4..9d3e2c4 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -57,11 +57,12 @@ static const char *cper_severity_strs[] = {
> "info",
> };
>
> -static const char *cper_severity_str(unsigned int severity)
> +const char *cper_severity_str(unsigned int severity)
> {
> return severity < ARRAY_SIZE(cper_severity_strs) ?
> cper_severity_strs[severity] : "unknown";
> }
> +EXPORT_SYMBOL_GPL(cper_severity_str);
Yes, this calls for a common file sharing cper and extlog functionality.
> /*
> * cper_print_bits - print strings for set bits
> @@ -196,6 +197,13 @@ static const char *cper_mem_err_type_strs[] = {
> "physical memory map-out event",
> };
>
> +const char *cper_mem_err_type_str(unsigned int etype)
> +{
> + return etype < ARRAY_SIZE(cper_mem_err_type_strs) ?
> + cper_mem_err_type_strs[etype] : "unknown";
> +}
> +EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
> +
> static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
> {
> if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
> @@ -233,8 +241,7 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
> if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
> u8 etype = mem->error_type;
> printk("%s""error_type: %d, %s\n", pfx, etype,
> - etype < ARRAY_SIZE(cper_mem_err_type_strs) ?
> - cper_mem_err_type_strs[etype] : "unknown");
> + cper_mem_err_type_str(etype));
> }
> if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
> const char *bank = NULL, *device = NULL;
Ditto.
> diff --git a/include/linux/cper.h b/include/linux/cper.h
> index 2fc0ec3..c6d87fc 100644
> --- a/include/linux/cper.h
> +++ b/include/linux/cper.h
> @@ -395,6 +395,8 @@ struct cper_sec_pcie {
> #pragma pack()
>
> u64 cper_next_record_id(void);
> +const char *cper_severity_str(unsigned int);
> +const char *cper_mem_err_type_str(unsigned int);
> void cper_print_bits(const char *prefix, unsigned int bits,
> const char * const strs[], unsigned int strs_size);
>
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index 21cdb0b..97f2192 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -8,6 +8,68 @@
> #include <linux/tracepoint.h>
> #include <linux/edac.h>
> #include <linux/ktime.h>
> +#include <linux/cper.h>
> +
> +/*
> + * MCE Extended Error Log trace event
> + *
> + * These events are generated when hardware detects a corrected or
> + * uncorrected event.
> + *
> + */
> +
> +/* memory trace event */
> +
> +#define LOC_LEN 512
> +
> +TRACE_EVENT(extlog_mem_event,
So this is a mem thing, so we're defining a tracepoint for memory events,
specifically.
However, if extlog carries all kinds of errors outside, not only DRAM
errors, we should do a TRACE_EVENT_CLASS which contains the args shared
by every error type and then make a mem event on top of it.
> + TP_PROTO(u32 etype,
> + char *dimm_info,
> + const uuid_le *fru_id,
> + char *fru_text,
> + u64 error_count,
> + u32 severity,
> + u64 phy_addr,
> + char *mem_loc),
> +
> + TP_ARGS(etype, dimm_info, fru_id, fru_text, error_count, severity,
> + phy_addr, mem_loc),
> +
> + TP_STRUCT__entry(
> + __field(u32, etype)
> + __dynamic_array(char, dimm_info, LOC_LEN)
> + __field(u64, error_count)
> + __field(u32, severity)
> + __field(u64, paddr)
> + __string(mem_loc, mem_loc)
> + __dynamic_array(char, fru, LOC_LEN)
> + ),
> +
> + TP_fast_assign(
> + __entry->error_count = error_count;
> + __entry->severity = severity;
> + __entry->etype = etype;
> + if (dimm_info[0] != '\0')
> + snprintf(__get_dynamic_array(dimm_info), LOC_LEN - 1,
> + "%s", dimm_info);
> + else
> + __assign_str(dimm_info, "");
> + __entry->paddr = phy_addr;
> + __assign_str(mem_loc, mem_loc);
> + snprintf(__get_dynamic_array(fru), LOC_LEN - 1,
> + "FRU: %pUl %.20s", fru_id, fru_text);
> + ),
> +
> + TP_printk("%llu %s error%s: %s %s physical addr: 0x%016llx%s %s",
> + __entry->error_count,
> + cper_severity_str(__entry->severity),
> + __entry->error_count > 1 ? "s" : "",
> + cper_mem_err_type_str(__entry->etype),
> + __get_str(dimm_info),
> + __entry->paddr,
> + __get_str(mem_loc),
> + __get_str(fru))
> +);
>
> /*
> * Hardware Events Report
> diff --git a/kernel/trace/ras-traces.c b/kernel/trace/ras-traces.c
> index b0c6ed1..197b1ea 100644
> --- a/kernel/trace/ras-traces.c
> +++ b/kernel/trace/ras-traces.c
> @@ -9,4 +9,5 @@
> #define TRACE_INCLUDE_PATH ../../include/ras
> #include <ras/ras_event.h>
>
> +EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
> EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
> --
> 1.8.4.3
>
>
--
Regards/Gruss,
Boris.
Sent from a fat crate under my desk. Formatting is fine.
--
next prev parent reply other threads:[~2014-03-07 11:44 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-04 9:23 trace, RAS: New eMCA trace event interface Chen, Gong
2014-03-04 9:23 ` [PATCH 1/2] trace, RAS: Add basic RAS trace event Chen, Gong
2014-03-06 11:18 ` Borislav Petkov
2014-03-06 11:43 ` Mauro Carvalho Chehab
2014-03-06 12:17 ` Borislav Petkov
2014-03-06 13:06 ` Mauro Carvalho Chehab
2014-03-06 15:26 ` Borislav Petkov
2014-03-06 15:39 ` Mauro Carvalho Chehab
2014-03-07 6:21 ` Chen, Gong
2014-03-07 9:08 ` Mauro Carvalho Chehab
2014-03-04 9:23 ` [PATCH 2/2] trace, RAS: Add eMCA trace event interface Chen, Gong
2014-03-07 11:44 ` Borislav Petkov [this message]
2014-03-10 8:22 ` Chen, Gong
2014-03-10 10:04 ` Mauro Carvalho Chehab
2014-03-10 10:31 ` Borislav Petkov
2014-03-10 11:41 ` Mauro Carvalho Chehab
2014-03-10 13:29 ` Borislav Petkov
2014-03-10 17:37 ` Luck, Tony
2014-03-11 14:27 ` Borislav Petkov
2014-03-10 10:33 ` Borislav Petkov
2014-03-10 17:42 ` Luck, Tony
2014-03-11 7:03 ` Chen, Gong
2014-03-04 17:54 ` trace, RAS: New " Luck, Tony
2014-03-07 9:10 ` Mauro Carvalho Chehab
2014-03-10 18:55 ` Tony Luck
2014-03-10 19:41 ` Mauro Carvalho Chehab
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140307114416.GA5255@pd.tnic \
--to=bp@alien8.de \
--cc=arozansk@redhat.com \
--cc=gong.chen@linux.intel.com \
--cc=linux-acpi@vger.kernel.org \
--cc=m.chehab@samsung.com \
--cc=tony.luck@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.