linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mauro Carvalho Chehab <mchehab@redhat.com>
To: unlisted-recipients:; (no To-header on input)
Cc: Mauro Carvalho Chehab <mchehab@redhat.com>,
	Linux Edac Mailing List <linux-edac@vger.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH 3/3] hw_event: Consolidate uncorrected/corrected error msgs into one
Date: Thu, 26 Jan 2012 21:05:08 -0200	[thread overview]
Message-ID: <1327619108-23316-3-git-send-email-mchehab@redhat.com> (raw)
In-Reply-To: <1327619108-23316-1-git-send-email-mchehab@redhat.com>

This is an RFC patch that consolidates the corrected and uncorrected
error trace calls into one. I'm not sure this is the best thing to do,
but it simplifies the error tracepoint while still keeping the technical
details that may be needed by someone debugging the driver, or by
vendors who want to double-check what's happening inside the system.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/edac_mc.c          |   51 +++++++--
 include/linux/edac.h            |    6 +
 include/trace/events/hw_event.h |  231 ++++-----------------------------------
 3 files changed, 68 insertions(+), 220 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 2b8382e..5038239 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -685,6 +685,7 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci,
 		int row, int channel, const char *msg)
 {
 	unsigned long remapped_page;
+	char detail[80];
 
 	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
@@ -711,8 +712,15 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci,
 		return;
 	}
 
-	trace_mc_corrected_error(mci, page_frame_number, offset_in_page,
-				syndrome, row, channel, msg);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 " (page 0x%lx, offset 0x%lx, grain %d, "
+		 "syndrome 0x%lx, row %d, channel %d)\n",
+		 page_frame_number, offset_in_page,
+		 mci->csrows[row].grain, syndrome, row, channel);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       mci->csrows[row].channels[channel].label,
+		       msg, detail);
 
 	if (edac_mc_get_log_ce())
 		/* FIXME - put in DIMM location */
@@ -749,7 +757,8 @@ EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
 
 void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
 {
-	trace_mc_corrected_error_no_info(mci, msg);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       "unknown", msg, "");
 	if (edac_mc_get_log_ce())
 		edac_mc_printk(mci, KERN_WARNING,
 			"CE - no information available: %s\n", msg);
@@ -768,6 +777,7 @@ void edac_mc_handle_ue(struct mem_ctl_info *mci,
 	char *pos = labels;
 	int chan;
 	int chars;
+	char detail[80];
 
 	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
@@ -796,8 +806,15 @@ void edac_mc_handle_ue(struct mem_ctl_info *mci,
 		pos += chars;
 	}
 
-	trace_mc_uncorrected_error(mci, page_frame_number, offset_in_page,
-				row, msg, labels);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "page 0x%lx, offset 0x%lx, grain %d, row %d ",
+		 page_frame_number, offset_in_page,
+	         mci->csrows[row].grain, row);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       labels,
+		       msg, detail);
+
 	if (edac_mc_get_log_ue())
 		edac_mc_printk(mci, KERN_EMERG,
 			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
@@ -818,7 +835,8 @@ EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
 
 void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
 {
-	trace_mc_uncorrected_error_no_info(mci, msg);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       "unknown", msg, "");
 	if (edac_mc_get_panic_on_ue())
 		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
 
@@ -843,6 +861,7 @@ void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
 	char labels[len + 1];
 	char *pos = labels;
 	int chars;
+	char detail[80];
 
 	if (csrow >= mci->nr_csrows) {
 		/* something is wrong */
@@ -891,8 +910,13 @@ void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
 	chars = snprintf(pos, len + 1, "-%s",
 			 mci->csrows[csrow].channels[channelb].label);
 
-	trace_mc_uncorrected_error_fbd(mci, csrow, channela, channelb,
-				       msg, labels);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "row %d, channel-a= %d channel-b= %d ",
+		 csrow, channela, channelb);
+	trace_mc_error(HW_EVENT_ERR_UNCORRECTED, mci->mc_idx,
+		       labels,
+		       msg, detail);
 	if (edac_mc_get_log_ue())
 		edac_mc_printk(mci, KERN_EMERG,
 			"UE row %d, channel-a= %d channel-b= %d "
@@ -913,7 +937,7 @@ EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
 void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
 			unsigned int csrow, unsigned int channel, char *msg)
 {
-
+	char detail[80];
 	/* Ensure boundary values */
 	if (csrow >= mci->nr_csrows) {
 		/* something is wrong */
@@ -936,7 +960,14 @@ void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
 		return;
 	}
 
-	trace_mc_corrected_error_fbd(mci, csrow, channel, msg);
+	/* Memory type dependent details about the error */
+	snprintf(detail, sizeof(detail),
+		 "(row %d, channel %d)\n",
+		 csrow, channel);
+	trace_mc_error(HW_EVENT_ERR_CORRECTED, mci->mc_idx,
+		       mci->csrows[csrow].channels[channel].label,
+		       msg, detail);
+
 	if (edac_mc_get_log_ce())
 		/* FIXME - put in DIMM location */
 		edac_mc_printk(mci, KERN_WARNING,
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 055b248..3ba99d7 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -66,6 +66,12 @@ enum dev_type {
 #define DEV_FLAG_X32		BIT(DEV_X32)
 #define DEV_FLAG_X64		BIT(DEV_X64)
 
+enum hw_event_mc_err_type {
+	HW_EVENT_ERR_CORRECTED,
+	HW_EVENT_ERR_UNCORRECTED,
+	HW_EVENT_ERR_FATAL,
+};
+
 /* memory types */
 enum mem_type {
 	MEM_EMPTY = 0,		/* Empty csrow */
diff --git a/include/trace/events/hw_event.h b/include/trace/events/hw_event.h
index 85fca0d..fee7ed2 100644
--- a/include/trace/events/hw_event.h
+++ b/include/trace/events/hw_event.h
@@ -52,183 +52,42 @@ DEFINE_EVENT(hw_event_class, hw_event_init,
 /*
  * Default error mechanisms for Memory Controller errors (CE and UE)
  */
-TRACE_EVENT(mc_corrected_error,
+TRACE_EVENT(mc_error,
 
-	TP_PROTO(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, unsigned long syndrome,
-		int row, int channel, const char *msg),
+	TP_PROTO(unsigned int err_type,
+		 unsigned int mc_index,
+		 const char *label,
+		 const char *msg,
+		 const char *detail),
 
-	TP_ARGS(mci, page_frame_number, offset_in_page, syndrome, row,
-		channel, msg),
+	TP_ARGS(err_type, mc_index, label, msg, detail),
 
 	TP_STRUCT__entry(
+		__field(	unsigned int,	err_type		)
 		__field(	unsigned int,	mc_index		)
-		__field(	unsigned long,	page_frame_number	)
-		__field(	unsigned long,	offset_in_page		)
-		__field(	u32,		grain			)
-		__field(	unsigned long,	syndrome		)
-		__field(	int,		row			)
-		__field(	int,		channel			)
-		__string(	label,		mci->csrows[row].channels[channel].label)
-		__string(	msg,		msg			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->page_frame_number	= page_frame_number;
-		__entry->offset_in_page		= offset_in_page;
-		__entry->grain			= mci->csrows[row].grain;
-		__entry->syndrome		= syndrome;
-		__entry->row			= row;
-		__entry->channel		= channel;
-		__assign_str(label, mci->csrows[row].channels[channel].label);
-		__assign_str(msg, msg);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected error %s on label \"%s\" "
-			 "(page 0x%lux, offset 0x%lux, grain %ud, "
-			 "syndrome 0x%lux, row %d, channel %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->page_frame_number,
-		__entry->offset_in_page,
-		__entry->grain,
-		__entry->syndrome,
-		__entry->row,
-		__entry->channel)
-);
-
-TRACE_EVENT(mc_uncorrected_error,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page,
-		int row, const char *msg, const char *label),
-
-	TP_ARGS(mci, page_frame_number, offset_in_page,
-		row, msg, label),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	unsigned long,	page_frame_number	)
-		__field(	unsigned long,	offset_in_page		)
-		__field(	u32,		grain			)
-		__field(	int,		row			)
-		__string(	msg,		msg			)
 		__string(	label,		label			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->page_frame_number	= page_frame_number;
-		__entry->offset_in_page		= offset_in_page;
-		__entry->grain			= mci->csrows[row].grain;
-		__entry->row			= row;
-		__assign_str(msg, msg);
-		__assign_str(label, label);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Uncorrected error %s on label \"%s\""
-			 "(page 0x%lux, offset 0x%lux, grain %ud, row %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->page_frame_number,
-		__entry->offset_in_page,
-		__entry->grain,
-		__entry->row)
-);
-
-
-/*
- * Fully-Buffered memory hardware in general don't provide syndrome/grain/row
- * information for all types of errors. So, we need to either have another
- * trace event or add a bitmapped field to indicate that some info are not
- * provided and use the previously-declared event. It seemed easier and less
- * confusing to create a different event for such cases
- */
-TRACE_EVENT(mc_corrected_error_fbd,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		int row, int channel, const char *msg),
-
-	TP_ARGS(mci, row, channel, msg),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	int,		row			)
-		__field(	int,		channel	        	)
-		__string(	label,		mci->csrows[row].channels[channel].label)
 		__string(	msg,		msg			)
+		__string(	detail,		detail			)
 	),
 
 	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->row			= row;
-		__entry->channel		= channel;
-		__assign_str(label, mci->csrows[row].channels[channel].label);
-		__assign_str(msg, msg);
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected Error %s on label \"%s\" "
-			 "(row %d, channel %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->row,
-		__entry->channel)
-);
-
-TRACE_EVENT(mc_uncorrected_error_fbd,
-
-	TP_PROTO(struct mem_ctl_info *mci,
-		int row, int channela, int channelb,
-		const char *msg, const char *label),
-
-	TP_ARGS(mci, row, channela, channelb, msg, label),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	mc_index		)
-		__field(	int,		row			)
-		__field(	int,		channela		)
-		__field(	int,		channelb		)
-		__string(	msg,		msg			)
-		__string(	label,		label			)
-	),
-
-	TP_fast_assign(
-		__entry->mc_index		= mci->mc_idx;
-		__entry->row			= row;
-		__entry->channela		= channela;
-		__entry->channelb		= channelb;
-		__assign_str(msg, msg);
+		__entry->err_type		= err_type;
+		__entry->mc_index		= mc_index;
 		__assign_str(label, label);
+		__assign_str(msg, msg);
+		__assign_str(detail, detail);
 	),
 
-	TP_printk(HW_ERR "mce#%d: Uncorrected Error %s on label \"%s\" "
-			 "(row %d, channels: %d, %d)\n",
-		__entry->mc_index,
-		__get_str(msg),
-		__get_str(label),
-		__entry->row,
-		__entry->channela,
-		__entry->channelb)
+	TP_printk(HW_ERR "mce#%d: %s error %s on label \"%s\" %s\n",
+		  __entry->mc_index,
+		  (__entry->err_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
+			((__entry->err_type == HW_EVENT_ERR_FATAL) ?
+			"Fatal" : "Uncorrected"),
+		  __get_str(msg),
+		  __get_str(label),
+		  __get_str(detail))
 );
 
-/*
- * The Memory controller driver needs to discover the memory topology, in
- * order to associate a hardware error with the memory label. If, for any
- * reason, it receives an error for a channel or row that are not supposed
- * to be there, an error event needs to be generated to indicate:
- *	- that a Corrected or Uncorrected error was received;
- *	- that the driver has a bug and, for that particular hardware, was
- *	  not capable of detecting the hardware architecture
- * If one of such errors is ever received, a bug to the kernel driver must
- * be filled.
- */
-
 TRACE_EVENT(mc_out_of_range,
 	TP_PROTO(struct mem_ctl_info *mci, const char *type, const char *field,
 		int invalid_val, int min, int max),
@@ -263,54 +122,6 @@ TRACE_EVENT(mc_out_of_range,
 );
 
 /*
- * On some cases, a corrected or uncorrected error was detected, but it
- * couldn't be properly handled, or because another error overrided the
- * error registers that details the error or because of some internal problem
- * on the driver. Those events bellow are meant for those error types.
- */
-TRACE_EVENT(mc_corrected_error_no_info,
-	TP_PROTO(struct mem_ctl_info *mci, const char *msg),
-
-	TP_ARGS(mci, msg),
-
-	TP_STRUCT__entry(
-	__string(	msg,			msg			)
-		__field(	unsigned int,	mc_index		)
-	),
-
-	TP_fast_assign(
-		__assign_str(msg, msg);
-		__entry->mc_index		= mci->mc_idx;
-	),
-
-	TP_printk(HW_ERR "mce#%d: Corrected Error: %s\n",
-		__entry->mc_index,
-		__get_str(msg))
-);
-
-TRACE_EVENT(mc_uncorrected_error_no_info,
-	TP_PROTO(struct mem_ctl_info *mci, const char *msg),
-
-	TP_ARGS(mci, msg),
-
-	TP_STRUCT__entry(
-		__string(	msg,		msg			)
-		__field(	unsigned int,	mc_index		)
-	),
-
-	TP_fast_assign(
-		__assign_str(msg, msg);
-		__entry->mc_index		= mci->mc_idx;
-	),
-
-	TP_printk(HW_ERR "mce#%d: Uncorrected Error: %s\n",
-		__entry->mc_index,
-		__get_str(msg))
-);
-
-
-
-/*
  * MCE Events placeholder. Please add non-memory events that come from the
  * MCE driver here
  */
-- 
1.7.8


  parent reply	other threads:[~2012-01-26 23:05 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <cover.1300996141.git.mchehab@redhat.com>
2011-03-24 20:32 ` [PATCH RFC 2/2] events/hw_event: Create a Hardware Anomaly Report Mechanism (HARM) Mauro Carvalho Chehab
2011-03-24 22:39   ` Borislav Petkov
2011-03-25 10:20     ` Mauro Carvalho Chehab
2011-03-25 14:13       ` Borislav Petkov
2011-03-25 21:22         ` Mauro Carvalho Chehab
2011-03-25 22:37           ` Tony Luck
2011-03-26 11:56             ` Mauro Carvalho Chehab
2011-03-28 17:03           ` Borislav Petkov
2011-03-28 19:44             ` Mauro Carvalho Chehab
2011-03-30 17:27               ` Luck, Tony
2011-03-30 17:51                 ` Borislav Petkov
2011-03-30 18:30                   ` Francis St. Amant
2011-03-30 19:50                     ` Borislav Petkov
2011-03-30 20:00                       ` Francis St. Amant
2011-03-31  7:43                         ` Borislav Petkov
2012-01-26 23:05     ` [PATCH 1/3] events/hw_event: Create a Hardware Events Report Mecanism (HERM) Mauro Carvalho Chehab
2012-01-26 23:05       ` [PATCH 2/3] events/hw_event: use __string() trace macros for events Mauro Carvalho Chehab
2012-01-26 23:05       ` Mauro Carvalho Chehab [this message]
2011-03-24 20:32 ` [PATCH RFC 1/2] edac: Move edac main structs to include/linux/edac.h Mauro Carvalho Chehab
2011-03-24 20:54 ` Mauro Carvalho Chehab

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1327619108-23316-3-git-send-email-mchehab@redhat.com \
    --to=mchehab@redhat.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).