[PATCH] New way of storing MCA/INIT logs

public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] New way of storing MCA/INIT logs
@ 2008-03-04 17:05 Zoltan Menyhart
  2008-03-05  0:23 ` Russ Anderson
                   ` (22 more replies)
  0 siblings, 23 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-04 17:05 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 2775 bytes --]

This patch adds a lock free, yet safe way of storing MCA/INIT logs.
You will not end up with logs mixed up from different MCAs.

The MCAs/INITs are rare.
There is no use wasting much permanent resources.
Should you see a burst of events, they are not uncorrelated.
Only the recovered events are treated. Should you miss one,
you'll see it later :-)
With the others, you will not be able to store them on disk anyway.

There are IA64_MAX_MCA_INIT_BUFS log buffers for the MCA, and
another IA64_MAX_MCA_INIT_BUFS log buffers for the INIT handler.

IA64_MAX_MCA_INIT_BUFS >= 2.

There is no per CPU log buffer.

The first (IA64_MAX_MCA_INIT_BUFS - 1) logs and the very last one are
stored only.
The last one gets overwritten if there are too many logs there.

The admin. info. is in a structure ia64_mca_init_buf_t, see in mca.h.

Handling the first (IA64_MAX_MCA_INIT_BUFS - 1) log buffers is
straight forward: you increment an atomic variable (_b_cnt) and
you use as index to _buf[IA64_MAX_MCA_INIT_BUFS - 1].
Having completed the log, you set the corresponding validity bit (_valid).

Otherwise you race (incl. with the nested handlers) for the last buffer:
- Increment the atomic generation counter (_gen_cnt).
- You own the last log buffer while no one else has got a higher
  generation count.
- The log data is broken up into 4-byte chunks and they are stamped with
  the generation count. They are written together as an atomic64_t into
  the last buffer (*_last_buf)[] by use of a compare-and-swap primitive
  to make sure that no one with higher generation count has passed by in
  the mean time.
- (*_last_buf)[0] is a marker:
  * Before writing the log data into the rest of (*->_last_buf)[], you
    set the marker to say "not done" (LAST_LOG_DONE bit off).
  * Having finished, you set the marker to say "done" (LAST_LOG_DONE bit on).

This is how the code backs down if someone writes the same buffer with
a higher generation count:

       do {
               tmp = atomic64_read(p);		// p => las log buffer
               /*
                * If you can see a higher generation count than yours,
                * then you are not the last - bail out.
                */
               if (GET_GEN_CNT(tmp) > gen_cnt)
                       return -1;
       } while (cmpxchg_rel(p, tmp, COMPOSE_AT_VAL(gen_cnt, value)) != tmp);

The code does not assume that the rendezvous always works.

The salinfo side verifies that every element of the last log buffer is
of the same generation.
If there is no log left, it clears _b_cnt.
There is no "shift" of the logs in the buffers at the salinfo side.

Well, the the old code is not cleaned up and the integration into the
salinfo side in not yet quit smooth, but you can judge the idea...

Thanks,

Zoltan Menyhart


[-- Attachment #2: tmp2 --]
[-- Type: text/plain, Size: 24101 bytes --]

--- linux-2.6.24/arch/ia64/kernel/mca.c	2008-03-04 15:47:35.000000000 +0100
+++ linux-2.6.24-new/arch/ia64/kernel/mca.c	2008-03-04 15:54:35.000000000 +0100
@@ -183,6 +183,92 @@
 #define	MCA_IRQ_SAFE	1	/* NOT called from the MCA/INIT handlers */
 
 
+ia64_mca_init_buf_t ia64_MCA_logs;		/* Log buffers for the MCA handler */
+ia64_mca_init_buf_t ia64_INIT_logs;		/* Log buffers for the INIT handler */
+unsigned int max_SAL_log_size;			/* From SAL_GET_STATE_INFO_SIZE() */
+
+EXPORT_SYMBOL(ia64_MCA_logs);			// For testing purposes
+
+/*
+ * Store the "last log".
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns non zero on failure.
+ */
+static int
+ia64_last_log_write(
+	ia64_mca_init_buf_t	* const bp,	/* Where to save the log */
+	const void		* const log,	/* The SAL log to save */
+	unsigned int		size)		/* Its actual size in u32 units */
+{
+	const u32		*src = (u32 *) log;
+	atomic64_t		*p = &(*bp->_last_buf)[0];
+	unsigned int		const gen_cnt = ia64_fetchadd4_acq(&bp->_gen_cnt, 1) + 1;
+
+	/* Set the marker saying "not done" */
+	if (set_last_buf_item(p++, gen_cnt, smp_processor_id()) != 0)
+		return -1;			/* You are NOT the last one */
+	/* Sore the actual log size in u32 units */
+	if (set_last_buf_item(p++, gen_cnt, size) != 0)
+		return -1;			/* You are NOT the last one */
+	/*
+	 * The log data is broken up into 4-byte chunks and they are stamped with
+	 * the generation count. They are written together as an atomic64_t.
+	 */
+	while (size-- > 0)
+		if (set_last_buf_item(p++, gen_cnt, *src++) != 0)
+			return -1;		/* You are NOT the last one */
+	/* Set the marker saying "done" */
+	return set_last_buf_item(&(*bp->_last_buf)[0], gen_cnt,
+						smp_processor_id() | LAST_LOG_DONE);
+}
+
+/*
+ * Try to pick up a buffer for MCA/INIT log coming from SAL_GET_STATE_INFO().
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns the buffer index, or -1 on failure.
+ */
+static int
+ia64_get_mca_init_log_buf(
+	ia64_mca_init_buf_t	* const bp)	/* Log buffer admin. info. */
+{
+	unsigned int		idx;		/* Index to ->_buf[] */
+
+	idx = ia64_fetchadd4_acq(&bp->_b_cnt, 1);	/* Returns the old value */
+	if (idx < IA64_MAX_MCA_INIT_BUFS - 1)
+		return idx;
+	else
+		return -1;
+}
+
+/*
+ * Set up the log buffers for the MCA/INIT handlers.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns non zero on failure.
+ */
+static inline int
+ia64_mca_init_bufs_set_up(
+	ia64_mca_init_buf_t	* const bp)	/* Log buffer for the MCA/INIT handler */
+
+{
+	unsigned int		i;
+
+	/* The regular log buffers.
+	 * Add 4 bytes for the CPU number and 4 more for the actual log size.
+	 */
+	for (i = 0; i < IA64_MAX_MCA_INIT_BUFS - 1; i++)
+		if ((bp->_buf[i] = alloc_bootmem(max_SAL_log_size + 8)) == NULL)
+			return 1;
+	i = (max_SAL_log_size + sizeof(u32) - 1) & ~(sizeof(u32) - 1);
+	/*
+	 * The "last log buffer": 4 data bytes are stored in each atomic64_t.
+	 * Add 4 bytes for the marker item and 4 more for the actual log size.
+	 */
+	return (bp->_last_buf = alloc_bootmem(2 * (i + 8))) != NULL;
+}
+
 /*
  * Push messages into buffer, print them later if not urgent.
  */
@@ -323,19 +409,6 @@
 	while (1)
 		cpu_relax();
 }
-/*
- * IA64_MCA log support
- */
-#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
-#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
-
-typedef struct ia64_state_log_s
-{
-	spinlock_t	isl_lock;
-	int		isl_index;
-	unsigned long	isl_count;
-	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
-} ia64_state_log_t;
 
 static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
 
@@ -367,21 +440,22 @@
 static void __init
 ia64_log_init(int sal_info_type)
 {
-	u64	max_size = 0;
-
 	IA64_LOG_NEXT_INDEX(sal_info_type) = 0;
 	IA64_LOG_LOCK_INIT(sal_info_type);
 
 	// SAL will tell us the maximum size of any error record of this type
-	max_size = ia64_sal_get_state_info_size(sal_info_type);
-	if (!max_size)
+	max_SAL_log_size = ia64_sal_get_state_info_size(sal_info_type);
+	if (!max_SAL_log_size)
 		/* alloc_bootmem() doesn't like zero-sized allocations! */
 		return;
 
 	// set up OS data structures to hold error info
-	IA64_LOG_ALLOCATE(sal_info_type, max_size);
-	memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size);
-	memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size);
+	IA64_LOG_ALLOCATE(sal_info_type, max_SAL_log_size);
+	memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_SAL_log_size);
+	memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_SAL_log_size);
+	if (ia64_mca_init_bufs_set_up(&ia64_MCA_logs) != 0 ||
+				ia64_mca_init_bufs_set_up(&ia64_INIT_logs) != 0)
+		printk(KERN_WARNING "WARNING: MCA/INIT log buffer set up failed\n");
 }
 
 /*
@@ -517,7 +591,8 @@
 int cpe_vector = -1;
 int ia64_cpe_irq = -1;
 
-static irqreturn_t
+// static					// For testing purposes
+irqreturn_t
 ia64_mca_cpe_int_handler (int cpe_irq, void *arg)
 {
 	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
@@ -570,6 +645,9 @@
 	return IRQ_HANDLED;
 }
 
+EXPORT_SYMBOL(ia64_mca_cpe_int_handler);	// For testing purposes
+
+
 #endif /* CONFIG_ACPI */
 
 #ifdef CONFIG_ACPI
@@ -1190,6 +1268,45 @@
 }
 
 /*
+ * Helper for ia64_mca_handler().
+ */
+int
+ia64_mca_handler_helper(
+	unsigned int			* const size_p,	/* -> actual size of the log */
+	void				* const log,	/* SAL log buffer */
+	struct ia64_sal_os_state	* const sos)
+{
+	unsigned int			size;		/* Actual size of the log */
+	int				recover;
+
+	/* Get the MCA error record */
+	size = ia64_sal_get_state_info(SAL_INFO_TYPE_MCA, (u64 *) log);
+	if (size_p != NULL)
+		*size_p = size;
+
+	/* MCA error recovery */
+	recover = ia64_mca_ucmc_extension != NULL &&
+					ia64_mca_ucmc_extension(log, sos);
+	if (recover) {
+		sal_log_record_header_t *rh = log;
+		rh->severity = sal_log_severity_corrected;
+		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
+		sos->os_status = IA64_MCA_CORRECTED;
+	} else {
+		/* Dump buffered message to console */
+		ia64_mlogbuf_finish(1);
+#ifdef CONFIG_KEXEC
+		atomic_set(&kdump_in_progress, 1);
+#endif
+	}
+	return recover;
+}
+
+/* Placed after ia64_mca_handler(). Hopefully, it will not be inlined. */
+static int
+ia64_mca_handler_last_log(struct ia64_sal_os_state * const sos);
+
+/*
  * ia64_mca_handler
  *
  *	This is uncorrectable machine check handler called from OS_MCA
@@ -1214,6 +1331,7 @@
 		 struct ia64_sal_os_state *sos)
 {
 	int recover, cpu = smp_processor_id();
+	int log_buf_idx;
 	struct task_struct *previous_current;
 	struct ia64_mca_notify_die nd =
 		{ .sos = sos, .monarch_cpu = &monarch_cpu };
@@ -1255,34 +1373,29 @@
 		while (cpu_isset(cpu, mca_cpu))
 			cpu_relax();	/* spin until monarch wakes us */
         }
+	/*
+	 * Try to pick up a buffer for the log coming from SAL_GET_STATE_INFO().
+	 */
+	if ((log_buf_idx = ia64_get_mca_init_log_buf(&ia64_MCA_logs)) >= 0){
+		l_buf_t	* const p = ia64_MCA_logs._buf[log_buf_idx];
+		
+		recover = ia64_mca_handler_helper(&p->_log_size, p->_data, sos);
+		p->_cpu = smp_processor_id();
+		/*
+		 * Tell salinfo that this log is valid.
+		 * Don't use set_bit(), ".rel" semantics is required.
+		 */
+		set_bit_rel(log_buf_idx, ia64_MCA_logs._valid);
+	} else
+		recover = ia64_mca_handler_last_log(sos);
 
-	/* Get the MCA error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, MCA_IRQ_NOTSAFE);
-
-	/* MCA error recovery */
-	recover = (ia64_mca_ucmc_extension
-		&& ia64_mca_ucmc_extension(
-			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
-			sos));
+	if (!recover)
+		monarch_cpu = -1;	/* Do we really care??? */
 
-	if (recover) {
-		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
-		rh->severity = sal_log_severity_corrected;
-		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
-		sos->os_status = IA64_MCA_CORRECTED;
-	} else {
-		/* Dump buffered message to console */
-		ia64_mlogbuf_finish(1);
-#ifdef CONFIG_KEXEC
-		atomic_set(&kdump_in_progress, 1);
-		monarch_cpu = -1;
-#endif
-	}
 	if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
 			== NOTIFY_STOP)
 		ia64_mca_spin(__FUNCTION__);
 
-
 	if (atomic_dec_return(&mca_count) > 0) {
 		int i;
 
@@ -1307,6 +1420,26 @@
 	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
 }
 
+/*
+ * Helper routine for ia64_mca_handler() when only the last log buffer is available.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ * It is placed after ia64_mca_handler(). Hopefully, it will not be inlined.
+ * Don't want buff[max_SAL_log_size] always be on the stack...
+ */
+static int
+ia64_mca_handler_last_log(
+	struct ia64_sal_os_state	* const sos)
+{
+	unsigned char	buff[max_SAL_log_size];
+	int		recover;
+	unsigned int	size;
+
+	if ((recover = ia64_mca_handler_helper(&size, buff, sos)))
+		(void) ia64_last_log_write(&ia64_MCA_logs, buff,
+						(size + sizeof(u32) - 1) / sizeof(u32));
+	return recover;
+}
+
 static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);
 static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd);
 
--- linux-2.6.24/include/asm-ia64/mca.h	2008-01-24 23:58:37.000000000 +0100
+++ linux-2.6.24-new/include/asm-ia64/mca.h	2008-02-29 16:38:13.000000000 +0100
@@ -161,6 +161,129 @@
 
 DECLARE_PER_CPU(u64, ia64_mca_pal_base);
 
+/*
+ * IA64_MCA log support
+ */
+#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
+#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
+
+/*
+ * IA64_MCA log support:
+ * used for SAL_GET_STATE_INFO() data by the MCA/INIT handlers.
+ */
+#define	IA64_MAX_MCA_INIT_BUFS	3
+#if	IA64_MAX_MCA_INIT_BUFS < 2
+#error	Min. 2 buffers required
+#endif
+
+typedef struct ia64_state_log_s
+{
+	spinlock_t	isl_lock;
+	int		isl_index;
+	unsigned long	isl_count;
+	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
+} ia64_state_log_t;
+
+/*
+ * These structures below describe the global buffers available for an MCA or an
+ * INIT handler to store SAL_GET_STATE_INFO() data.
+ *
+ * Note: there is no use saving non-recovered MCAs: there will be no chance for
+ * such a log to hit the permanent storage device.
+ *
+ * The rules are:
+ * - The first (IA64_MAX_MCA_INIT_BUFS - 1) logs and the very last one are
+ *   stored only.
+ * - The last one gets overwritten if there are too many logs there.
+ * - if (->_b_cnt <= IA64_MAX_MCA_INIT_BUFS), then ->_b_cnt counts the in-use
+ *   buffers. There is no lost log if (->_b_cnt < IA64_MAX_MCA_INIT_BUFS).
+ * - if (->_b_cnt => IA64_MAX_MCA_INIT_BUFS), then ->_gen_cnt is incremented
+ *   each time the last buffer gets over-written.
+ *
+ * The MCA/INIT handler plays as follows:
+ * - It fetches and increments ->_b_cnt in an atomic way.
+ * - If (previous value < IA64_MAX_MCA_INIT_BUFS - 1), then it can simply store
+ *   its log into ->_buf[ previous value ]. Having done that, it sets the
+ *   corresponding ->_valid bit.
+ * - Otherwise it races (incl. with the nesting handlers) for the last buffer:
+ *   + It increments ->_gen_cnt in an atomic way to obtain its generation count.
+ *   + It owns the last log buffer while no one else has got a higher generation
+ *     count.
+ *   + The log data is broken up into 4-byte chunks and they are stamped with
+ *     the generation count. They are written together as an atomic64_t into
+ *     (*->_last_buf)[] by use of a compare-and-swap primitive to make sure
+ *     that no one with higher generation count has passed by in the mean time.
+ *   + (*->_last_buf)[0] is a marker:
+ *     * Before writing the log data into the rest of (*->_last_buf)[], the
+ *       MCA/INIT handler sets the marker to say "not done"
+ *       (LAST_LOG_DONE bit off).
+ *     * Having finished, it sets the marker to say "done"
+ *       (LAST_LOG_DONE bit on).
+ *
+ * The salinfo side polls ->_b_cnt:
+ * - Once their corresponding ->_valid bit is set, it is safe to read, at any
+ *   time, without any further precaution, the first
+ *   MIN(IA64_MAX_MCA_INIT_BUFS - 1, ->_b_cnt) buffer entries.
+ * - The salinfo side can clear the ->_valid bits at any time with atomic bit
+ *   operations. While ->_b_cnt is not reset to 0, the log buffers are not reused.
+ * - If (->_b_cnt > IA64_MAX_MCA_INIT_BUFS - 1), then the last buffer is read as
+ *   follows:
+ *   + Pick up ->_gen_cnt.
+ *   + Verify the marker (*->_last_buf)[0], it should have the bit LAST_LOG_DONE
+ *     on. (Otherwise come back later...)
+ *   + While reading (*->_last_buf)[], verify if the generation count in each
+ *     item is the same. (Otherwise restart...)
+ * - The salinfo side can reset ->_b_cnt to 0 with an atomic operation, provided
+ *   it has not changed. (Otherwise restart...)
+ */
+
+typedef struct l_buf_s {
+	u32		_cpu;
+	u32		_log_size;
+	u8		_data[];
+} l_buf_t;
+
+typedef struct ia64_mca_init_buf_s {
+	l_buf_t		*_buf[IA64_MAX_MCA_INIT_BUFS - 1];
+	atomic_t	_b_cnt;		/* Counts the in-use _buf[]'s */
+	u32		_valid[DIV_ROUND_UP(IA64_MAX_MCA_INIT_BUFS - 1, 32)];
+	atomic64_t	(*_last_buf)[0];
+	atomic_t	_gen_cnt;	/* Generation counter for _last_buf[] */
+	u32		_gen_seen;	/* Generation seen by salinfo */
+//	u32		_buf_seen;	/* ->_buf[i] seen by salinfo */
+} ia64_mca_init_buf_t;
+
+/* For the marker item of (*->_last_buf)[0]: */
+#define	LAST_LOG_DONE		(1 << 31)
+
+/* Macros for (*->_last_buf)[]: */
+#define	GET_GEN_CNT(x)		((u32) x)		/* Generation counter */
+#define	GET_LOG_DATA(x)		((u32) (x >> 32))	/* Log data */
+#define	COMPOSE_AT_VAL(gc, dt)	((u32) gc | ((u64) dt << 32))
+
+/*
+ * Store a 4-byte value into (*->_last_buf)[i].
+ */
+static inline int
+set_last_buf_item(
+	atomic64_t	* const p,		/* == &(*->_last_buf)[i] */
+	unsigned int	const gen_cnt,		/* Generation count */
+	u32		const value)
+{
+	u64		tmp;
+
+	do {
+		tmp = atomic64_read(p);
+		/*
+		 * If you can see a higher generation count than yours,
+		 * then you are not the last - bail out.
+		 */
+		if (GET_GEN_CNT(tmp) > gen_cnt)
+			return -1;
+	} while (cmpxchg_rel(p, tmp, COMPOSE_AT_VAL(gen_cnt, value)) != tmp);
+	return 0;
+}
+
 #else	/* __ASSEMBLY__ */
 
 #define IA64_MCA_CORRECTED	0x0	/* Error has been corrected by OS_MCA */
--- linux-2.6.24/arch/ia64/kernel/salinfo.c	2008-01-24 23:58:37.000000000 +0100
+++ linux-2.6.24-new/arch/ia64/kernel/salinfo.c	2008-03-04 17:06:48.000000000 +0100
@@ -49,10 +49,15 @@
 #include <asm/sal.h>
 #include <asm/uaccess.h>
 
+#include <asm/mca.h>
+
 MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
 MODULE_LICENSE("GPL");
 
+extern ia64_mca_init_buf_t ia64_MCA_logs;	/* Log buffers for the MCA handler */
+extern ia64_mca_init_buf_t ia64_INIT_logs;	/* Log buffers for the INIT handler */
+
 static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data);
 
 typedef struct {
@@ -283,8 +288,20 @@
 static void
 salinfo_timeout (unsigned long arg)
 {
+	unsigned long flags;
+
 	ia64_mlogbuf_dump();
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
+	if (atomic_read(&ia64_MCA_logs._b_cnt) > 0 ){
+		spin_lock_irqsave(&data_saved_lock, flags);
+		salinfo_work_to_do(salinfo_data + SAL_INFO_TYPE_MCA);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+	}
+	if (atomic_read(&ia64_INIT_logs._b_cnt) > 0 ){
+		spin_lock_irqsave(&data_saved_lock, flags);
+		salinfo_work_to_do(salinfo_data + SAL_INFO_TYPE_INIT);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+	}
+//	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
 	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
 	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
 	add_timer(&salinfo_timer);
@@ -298,6 +315,136 @@
 	return 0;
 }
 
+
+#define	MIN(a, b)	(a < b ? a : b)
+
+
+/*
+ * Copy the "last log" into some regular buffer.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns 1 if the last log has successfully been fetched.
+ */
+static inline int
+copy_last_log(
+	const atomic64_t	*p,		/* On entry: p == &(*->_last_buf)[2] */
+	u32			*dest,
+	unsigned int		const gen,
+	unsigned int		size)		/* SAL log size in u32 units */
+{
+	u64			tmp;
+
+	while (size-- > 0){
+		tmp = atomic64_read(p++);
+		if (GET_GEN_CNT(tmp) != gen)
+			return 0;
+		*dest++ = GET_LOG_DATA(tmp);
+	}
+	return 1;
+}
+
+/*
+ * Fetch the "last log" created by ia64_last_log_write() in mca.c.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns 0 if the last log has successfully been fetched.
+ */
+static inline int
+fetch_last_log(
+	ia64_mca_init_buf_t	* const bp,	/* Where to look for the logs */
+	struct salinfo_data	* const data)
+{
+	unsigned int		gen;
+	const atomic64_t	*p;
+	u64			tmp;
+
+//	printk("%s(%p,...): type: %d, CPU: %d\n", __FUNCTION__, bp, data->type, smp_processor_id());
+	for (;; schedule()) {
+		gen = atomic_read(&bp->_gen_cnt);	/* Generation counter for _last_buf[] */
+		p = &(*bp->_last_buf)[0];
+//		printk("gen: 0x%x, _last_buf: %p\n", gen, p);
+		tmp = atomic64_read(p++);		/* The marker */
+		if (GET_GEN_CNT(tmp) != gen)
+			continue;
+		tmp = GET_LOG_DATA(tmp);
+		if (!(tmp & LAST_LOG_DONE))
+			continue;
+		tmp = atomic64_read(p++);		/* SAL log size in u32 units */
+		if (GET_GEN_CNT(tmp) != gen)
+			continue;
+		if (copy_last_log(p, (void *) data->log_buffer, gen, GET_LOG_DATA(tmp)))
+			break;
+	}
+	data->log_size = GET_LOG_DATA(tmp) * sizeof(u32);
+	bp->_gen_seen = gen;
+	return 0;
+}
+
+#define	JUST_TEST_LOGS		0
+#define	DO_FETCH_LOG		1
+
+/*
+ * Check to see if we have already seen all the logs in *bp.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns 1 if some logs are available.
+ */
+static int
+any_log_available(
+	ia64_mca_init_buf_t	* const bp,	/* Where to look for the logs */
+	struct salinfo_data	* const data,
+	unsigned int		const mode)	/* JUST_TEST_LOGS, DO_FETCH_LOG */
+{
+	l_buf_t			*p;
+	unsigned int		const b_cnt = atomic_read(&bp->_b_cnt);
+	unsigned int		const idx_limit = MIN(IA64_MAX_MCA_INIT_BUFS - 1, b_cnt);
+	unsigned int		i;
+
+//	printk("%s(0x%p,... %d): mode: %d\n", __FUNCTION__, bp, data->type, mode);
+	for (i = 0; i < idx_limit; i++)
+		if (test_bit(i, bp->_valid)){
+			p = bp->_buf[i];
+//			printk("valid bit #%d, buf; %p\n", i, p);
+			if (mode == JUST_TEST_LOGS)
+				return 1;
+			data->log_size = p->_log_size;
+			memcpy(data->log_buffer, p->_data, p->_log_size);
+			clear_bit(i, bp->_valid);
+			/*
+			 * Check to see if all the buffers have been consumed.
+			 */
+			for (i = 0; i < idx_limit; i++)
+				if (test_bit(i, bp->_valid))
+					return 1;
+			if (b_cnt < IA64_MAX_MCA_INIT_BUFS ||
+					bp->_gen_seen == atomic_read(&bp->_gen_cnt)){
+				/*
+				 * Clear ->_b_cnt. It can fail.
+				 * ... will be seen next time...
+				 */
+				(void) cmpxchg(&bp->_b_cnt, b_cnt, 0);
+			}
+			return 1;
+		}
+	if (atomic_read(&bp->_gen_cnt) == bp->_gen_seen)
+		return 0;
+	if (mode == JUST_TEST_LOGS)
+		return 1;
+	if (fetch_last_log(bp, data))
+		return 1;
+	/*
+	 * Check to see if all the buffers have been consumed.
+	 */
+	for (i = 0; i < IA64_MAX_MCA_INIT_BUFS - 1; i++)
+		if (test_bit(i, bp->_valid))
+			return 1;
+	/*
+	 * Clear ->_b_cnt. It can fail. ... will be seen next time...
+	 */
+	(void) cmpxchg(&bp->_b_cnt, b_cnt, 0);
+	return 1;
+}
+
 static ssize_t
 salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
 {
@@ -317,29 +464,35 @@
 	}
 
 	n = data->cpu_check;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(n, data->cpu_event)) {
-			if (!cpu_online(n)) {
-				cpu_clear(n, data->cpu_event);
-				continue;
+//	printk("CPU %d: %s(): data->cpu_check: %d, data->cpu_event: %016lx\n", smp_processor_id(),
+//					__FUNCTION__, n, data->cpu_event.bits[0]);	// :-)
+	if (atomic_read(&ia64_MCA_logs._b_cnt) > 0 || atomic_read(&ia64_INIT_logs._b_cnt) > 0){
+//		printk("%d %d\n", atomic_read(&ia64_MCA_logs._b_cnt), atomic_read(&ia64_INIT_logs._b_cnt));
+		cpu = any_online_cpu(cpu_online_map);
+	} else {
+		for (i = 0; i < NR_CPUS; i++) {
+			if (cpu_isset(n, data->cpu_event)) {
+				if (!cpu_online(n)) {
+					cpu_clear(n, data->cpu_event);
+					continue;
+				}
+				cpu = n;
+				break;
 			}
-			cpu = n;
-			break;
+			if (++n == NR_CPUS)
+				n = 0;
 		}
-		if (++n == NR_CPUS)
-			n = 0;
-	}
-
-	if (cpu == -1)
-		goto retry;
 
-	ia64_mlogbuf_dump();
+		if (cpu == -1)
+			goto retry;
 
-	/* for next read, start checking at next CPU */
-	data->cpu_check = cpu;
-	if (++data->cpu_check == NR_CPUS)
-		data->cpu_check = 0;
+		ia64_mlogbuf_dump();
 
+		/* for next read, start checking at next CPU */
+		data->cpu_check = cpu;
+		if (++data->cpu_check == NR_CPUS)
+			data->cpu_check = 0;
+	}
 	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
 
 	size = strlen(cmd);
@@ -415,6 +568,7 @@
 {
 	struct salinfo_data *data = context;
 	sal_log_record_header_t *rh;
+
 	data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
 	rh = (sal_log_record_header_t *)(data->log_buffer);
 	/* Clear corrected errors as they are read from SAL */
@@ -431,6 +585,18 @@
 	int saved_size = ARRAY_SIZE(data->data_saved);
 
 	data->saved_num = 0;
+	switch (data->type){
+	case SAL_INFO_TYPE_MCA:
+//		printk("%s(): data->state: %d\n", __FUNCTION__, data->state);
+		if (any_log_available(&ia64_MCA_logs, data, JUST_TEST_LOGS))
+			data->state = STATE_LOG_RECORD;
+		return;
+	case SAL_INFO_TYPE_INIT:
+//		printk("%s(): data->state: %d\n", __FUNCTION__, data->state);
+		if (any_log_available(&ia64_INIT_logs, data, JUST_TEST_LOGS))
+			data->state = STATE_LOG_RECORD;
+		return;
+	}
 	spin_lock_irqsave(&data_saved_lock, flags);
 retry:
 	for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
@@ -469,7 +635,20 @@
 	u8 *buf;
 	u64 bufsize;
 
+//	printk("%s(): data->state: %d\n", __FUNCTION__, data->state);
 	if (data->state == STATE_LOG_RECORD) {
+		switch (data->type){
+		case SAL_INFO_TYPE_MCA:
+			data->log_size = 0;
+			(void) any_log_available(&ia64_MCA_logs, data, DO_FETCH_LOG);
+			data->state = STATE_NO_DATA;
+			break;
+		case SAL_INFO_TYPE_INIT:
+			data->log_size = 0;
+			(void) any_log_available(&ia64_INIT_logs, data, DO_FETCH_LOG);
+			data->state = STATE_NO_DATA;
+			break;
+		}
 		buf = data->log_buffer;
 		bufsize = data->log_size;
 	} else if (data->state == STATE_OEMDATA) {
@@ -479,6 +658,8 @@
 		buf = NULL;
 		bufsize = 0;
 	}
+//	printk("%s(): buf: %p, count: %ld, pos: %lld, bufsize: %ld\n",
+//				__FUNCTION__, buf, count, *ppos, bufsize);
 	return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
 }
 
diff -Nru linux-2.6.24-tmp/include/asm-ia64/bitops.h linux-2.6.24-new-tmp/include/asm-ia64/bitops.h
--- linux-2.6.24-tmp/include/asm-ia64/bitops.h	2008-03-04 15:58:19.000000000 +0100
+++ linux-2.6.24-new-tmp/include/asm-ia64/bitops.h	2008-03-04 15:59:27.000000000 +0100
@@ -51,6 +51,39 @@
 }
 
 /**
+ * set_bit_rel - Atomically set a bit in memory with ".rel" semantics
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ *
+ * The address must be (at least) "long" aligned.
+ * Note that there are driver (e.g., eepro100) which use these operations to
+ * operate on hw-defined data-structures, so we can't easily change these
+ * operations to force a bigger alignment.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+static __inline__ void
+set_bit_rel (int nr, volatile void *addr)
+{
+	__u32 bit, old, new;
+	volatile __u32 *m;
+	CMPXCHG_BUGCHECK_DECL
+
+	m = (volatile __u32 *) addr + (nr >> 5);
+	bit = 1 << (nr & 31);
+	do {
+		CMPXCHG_BUGCHECK(m);
+		old = *m;
+		new = old | bit;
+	} while (cmpxchg_rel(m, old, new) != old);
+}
+
+/**
  * __set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
@ 2008-03-05  0:23 ` Russ Anderson
  2008-03-05 13:14 ` Zoltan Menyhart
                   ` (21 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-05  0:23 UTC (permalink / raw)
  To: linux-ia64

On Tue, Mar 04, 2008 at 06:05:06PM +0100, Zoltan Menyhart wrote:
> This patch adds a lock free, yet safe way of storing MCA/INIT logs.
> You will not end up with logs mixed up from different MCAs.

A good goal.

> The MCAs/INITs are rare.

One hopes.  :-)

> There is no use wasting much permanent resources.

Sometimes a necessary evil.  Normal memory allocation routines 
cannot be called from MCA/INIT context.

> Should you see a burst of events, they are not uncorrelated.
> Only the recovered events are treated. Should you miss one,
> you'll see it later :-)
> With the others, you will not be able to store them on disk anyway.

Even if the system is going down it is still nice to try to 
go down gracefully.  Taking a system dump and logging as 
much as possible is usefull, too.

> There are IA64_MAX_MCA_INIT_BUFS log buffers for the MCA, and
> another IA64_MAX_MCA_INIT_BUFS log buffers for the INIT handler.
> 
> IA64_MAX_MCA_INIT_BUFS >= 2.
> 
> There is no per CPU log buffer.

In the case where all the CPUs are INITed, what happens?
Does this assume only one CPU at a time processes/logs records?

> The code does not assume that the rendezvous always works.

Could you explain.  Do you mean MCA/INIT rendezvous?
 

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
  2008-03-05  0:23 ` Russ Anderson
@ 2008-03-05 13:14 ` Zoltan Menyhart
  2008-03-05 16:59 ` Luck, Tony
                   ` (20 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-05 13:14 UTC (permalink / raw)
  To: linux-ia64

Thank you for your remarks.

>>The MCAs/INITs are rare.
> 
> One hopes.  :-)

Should you have a single unrecoverable MCA, the game is over.
Neither the original code, nor mine can log it before the machine
is re-booted / halted.

Only the recovered ones play.
It is safe to continue after the recovered ones.
You need these logs to be alerted and to program the maintenance.

Both the original code and mine can "swallow" about 1 recovered
event / minute, and tolerate a "burst" of 2 or IA64_MAX_MCA_INIT_BUFS
events.

The probability to have more than that _independent_ events
in a small time frame is very very low. Therefore you can
afford losing events of the same "burst".

>>There is no use wasting much permanent resources.
> 
> Sometimes a necessary evil.  Normal memory allocation routines 
> cannot be called from MCA/INIT context.

This is why I pre-allocate IA64_MAX_MCA_INIT_BUFS buffers.

> Even if the system is going down it is still nice to try to 
> go down gracefully.  Taking a system dump and logging as 
> much as possible is usefull, too.

You (may want to) take a dump if the event is not recovered.
In such e case, neither the original code, nor mine does any useful
thing :-)

> In the case where all the CPUs are INITed, what happens?
> Does this assume only one CPU at a time processes/logs records?

I have not added my code to the INIT handler yet.

From the SAL spec.: INIT reason code:

0 = Received INIT signal on this processor for reasons other than machine
     check rendezvous and CrashDump switch assertion.
1 = Received INIT signal on this processor during machine check rendezvous.
2 = Received INIT signal on this processor due to CrashDump switch assertion.

I think there is no use to log anything in the cases of MCA rendezvous
and CrashDump (that can actually dump, call the KDB).
I intend to log the "other reasons" only, by the monarch only.

>>The code does not assume that the rendezvous always works.
> 
> Could you explain.  Do you mean MCA/INIT rendezvous?

Yes.
If everything goes fine, only one CPU, the monarch logs.
(See also the comment in the INIT handler saying:
 FIXME: Workaround for broken proms that drive all INIT events as monarchs.)

However, the SAL spec. allows in "OS_MCA Hand-off State" that
"Rendezvous of other processors was required but was unsuccessful
on one or more processors."

E.g. two non-global MCAs can happen on two CPUs, both of them can start
to execute the MCA handler, thinking that each of them is monarch.
My code should survive...

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
  2008-03-05  0:23 ` Russ Anderson
  2008-03-05 13:14 ` Zoltan Menyhart
@ 2008-03-05 16:59 ` Luck, Tony
  2008-03-05 18:56 ` Russ Anderson
                   ` (19 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Luck, Tony @ 2008-03-05 16:59 UTC (permalink / raw)
  To: linux-ia64

> The probability to have more than that _independent_ events
> in a small time frame is very very low. Therefore you can
> afford losing events of the same "burst".

Both the CMC and CPE interrupt paths have code to switch to
polling mode in the presence of a burst of correctable errors.
Can we tune this threshold w.r.t. the number of buffers we
pre-allocate to save error records so that we (the OS) won't
be responsible for losing errors?  Obviously entering polling
mode puts the responsibility onto SAL to keep track of all
the error reports - but nobody ever complained that this
might result in the loss of error information if the SAL runs
out of space to keep the error records before the next poll
from the OS. ["solving" problems by shifting the blame point?]

-Tony

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (2 preceding siblings ...)
  2008-03-05 16:59 ` Luck, Tony
@ 2008-03-05 18:56 ` Russ Anderson
  2008-03-05 23:38 ` Keith Owens
                   ` (18 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-05 18:56 UTC (permalink / raw)
  To: linux-ia64

On Wed, Mar 05, 2008 at 02:14:52PM +0100, Zoltan Menyhart wrote:
> Thank you for your remarks.
> 
> >>The MCAs/INITs are rare.
> >
> >One hopes.  :-)
> 
> Should you have a single unrecoverable MCA, the game is over.

Depends on the definiton of "over". :-)

> Neither the original code, nor mine can log it before the machine
> is re-booted / halted.
> Only the recovered ones play.
> It is safe to continue after the recovered ones.
> You need these logs to be alerted and to program the maintenance.
> 
> Both the original code and mine can "swallow" about 1 recovered
> event / minute, and tolerate a "burst" of 2 or IA64_MAX_MCA_INIT_BUFS
> events.

That is not nearly enough.  On a large shared memory system multiple
CPUs can hit the same memory error at the same time (for example).
There are several test cases in my test environment that cause
multiple CPUs to go into MCA at the same time.  The value needs 
to scale with system size.

What happens on boot up, when salinfo reads all the old records?
Does that "burst" of records all get logged.

> The probability to have more than that _independent_ events
> in a small time frame is very very low. Therefore you can
> afford losing events of the same "burst".

Large systems turn unlikely probabilities into likely.

> >>There is no use wasting much permanent resources.
> >
> >Sometimes a necessary evil.  Normal memory allocation routines 
> >cannot be called from MCA/INIT context.
> 
> This is why I pre-allocate IA64_MAX_MCA_INIT_BUFS buffers.
> 
> >Even if the system is going down it is still nice to try to 
> >go down gracefully.  Taking a system dump and logging as 
> >much as possible is usefull, too.
> 
> You (may want to) take a dump if the event is not recovered.
> In such e case, neither the original code, nor mine does any useful
> thing :-)

My intent was not to turn this into a discussion of KDB/system
dump, but those are necessary features.  :-)
 
> >In the case where all the CPUs are INITed, what happens?
> >Does this assume only one CPU at a time processes/logs records?
> 
> I have not added my code to the INIT handler yet.
> 
> >From the SAL spec.: INIT reason code:
> 
> 0 = Received INIT signal on this processor for reasons other than machine
>     check rendezvous and CrashDump switch assertion.
> 1 = Received INIT signal on this processor during machine check rendezvous.
> 2 = Received INIT signal on this processor due to CrashDump switch 
> assertion.
> 
> I think there is no use to log anything in the cases of MCA rendezvous
> and CrashDump (that can actually dump, call the KDB).
> I intend to log the "other reasons" only, by the monarch only.

When the system is NMI'ed, all the CPUs receive an INIT.
I'll check what category that falls under.

> >>The code does not assume that the rendezvous always works.
> >
> >Could you explain.  Do you mean MCA/INIT rendezvous?
> 
> Yes.
> If everything goes fine, only one CPU, the monarch logs.
> (See also the comment in the INIT handler saying:
> FIXME: Workaround for broken proms that drive all INIT events as monarchs.)

That FIXME was to work around a case where all the CPUs rendezvoued but SAL
did not identify any of the CPUs as monarch.
 
> However, the SAL spec. allows in "OS_MCA Hand-off State" that
> "Rendezvous of other processors was required but was unsuccessful
> on one or more processors."
> 
> E.g. two non-global MCAs can happen on two CPUs, both of them can start
> to execute the MCA handler, thinking that each of them is monarch.
> My code should survive...

I have a test case that creates that scenario.  With your patch and only 
one of the MCAs (at most) end up getting logged in /var/log/salinfo/decoded .

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (3 preceding siblings ...)
  2008-03-05 18:56 ` Russ Anderson
@ 2008-03-05 23:38 ` Keith Owens
  2008-03-06 10:24 ` Zoltan Menyhart
                   ` (17 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Keith Owens @ 2008-03-05 23:38 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson (on Wed, 5 Mar 2008 12:56:29 -0600) wrote:
>On Wed, Mar 05, 2008 at 02:14:52PM +0100, Zoltan Menyhart wrote:
>> Neither the original code, nor mine can log it before the machine
>> is re-booted / halted.
>> Only the recovered ones play.
>> It is safe to continue after the recovered ones.
>> You need these logs to be alerted and to program the maintenance.
>> 
>> Both the original code and mine can "swallow" about 1 recovered
>> event / minute, and tolerate a "burst" of 2 or IA64_MAX_MCA_INIT_BUFS
>> events.
>
>What happens on boot up, when salinfo reads all the old records?
>Does that "burst" of records all get logged.

salinfo on boot up is a serial process, driven from user space.  One
cpu at a time, one record of each type at a time.  Records of different
types are processed in parallel.

   Keith (I wish I had never heard of salinfo) Owens


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (4 preceding siblings ...)
  2008-03-05 23:38 ` Keith Owens
@ 2008-03-06 10:24 ` Zoltan Menyhart
  2008-03-06 13:14 ` Zoltan Menyhart
                   ` (16 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-06 10:24 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:

> That is not nearly enough.  On a large shared memory system multiple
> CPUs can hit the same memory error at the same time (for example).
> There are several test cases in my test environment that cause
> multiple CPUs to go into MCA at the same time.  The value needs 
> to scale with system size.

These are the consequences of the same bad memory block.
There is no more information about the health of the machine in
N log instances of the same memory error, than in the first one.
Anyway, the HW guys or the maintenance guys will count the events
as a single occurrence of memory failure.

> What happens on boot up, when salinfo reads all the old records?
> Does that "burst" of records all get logged.

The errors coming from the events before the reboot do not go
through the MCA handler. The salinfo side reads them directly by
calling ia64_sal_get_state_info().

>>The probability to have more than that _independent_ events
>>in a small time frame is very very low. Therefore you can
>>afford losing events of the same "burst".
> 
> Large systems turn unlikely probabilities into likely.

A rough estimation can be done as follows:

Assume you have an MTBF of 30,000 hours.
The probability of having an MCA in a one minute time frame is less
than 1 / (60 * 30,000) < 10^(-6).
The probability of having two independent errors causing MCAs in
the same one minute time frame is less than 10^(-12).

> That FIXME was to work around a case where all the CPUs rendezvoued but SAL
> did not identify any of the CPUs as monarch.

I agree, I just wanted to mention that it is not sure that the SALs
fully respect the specification. In addition, it is allowed that a
a rendezvous be unsuccessful.

I designed my code not to reckon on successful rendezvous.

> I have a test case that creates that scenario.  With your patch and only 
> one of the MCAs (at most) end up getting logged in /var/log/salinfo/decoded .

Can you describe, please, what your test does and what is the
expected behavior of the MCA layer?

Another idea: the integration into the salinfo side in not yet quit smooth, :-)
it is the polling that fetches the logs one by one. Please leave 3 periods
for the polling to see all the logs.

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (5 preceding siblings ...)
  2008-03-06 10:24 ` Zoltan Menyhart
@ 2008-03-06 13:14 ` Zoltan Menyhart
  2008-03-06 17:09 ` Luck, Tony
                   ` (15 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-06 13:14 UTC (permalink / raw)
  To: linux-ia64

Luck, Tony wrote:

Let's see this first:

> Obviously entering polling
> mode puts the responsibility onto SAL to keep track of all
> the error reports

Please have a look at the
Figure 2-1. Itanium® Processor Family Firmware Machine Check Handling Model
in the Error Handling Guide.

This figure shows that the SAL (or the PAL) cannot see the platform
originated CPEIs, nor the CPU HW originated CMCIs.

When you call SAL_GET_STATE_INFO(), the SAL (and the PAL) will read out
the error status from some HW registers.

Therefore the SAL / PAL cannot store error reports.

Can the HW (platform or CPU) help to save error reports?

A typical "error register set" - whatever it is - saves the first
error and maintains a "cumulative error" status (usually reset
by SAL_CLEAR_STATE_INFO()).

CPEs / CMCs will be lost unless you (want to) "swallow" them
quickly enough.

The SAL / PAL can be the origin of CPEIs / CMCIs if they succeed
in correcting MCAs. They stock the related information until the
OS calls SAL_GET_STATE_INFO().
How many such outstanding CPEIs / CMCIs there can be is an
implementation issue.
Surely there are a limited number of bufferers there.
I do not think they date to implement a complicated buffer
handling mechanism in an MCA context.

- but nobody ever complained that this
> might result in the loss of error information if the SAL runs
> out of space to keep the error records before the next poll
> from the OS. ["solving" problems by shifting the blame point?]

I've got a Tiger box like machine installed with some known
to be bad memory. I scan the known bad addresses via /dev/mem:

	volatile unsigned char *p = bad ph. addr.
        for (;;){
                tmp += *p;
                ia64_fc((void *) p);
        }

It is a deterministically bad memory location.
You can guess how many errors / sec there are.
Obviously, we switch into polling mode.
(And we lose most of the events.)

Less than half of the cases I get logs like this:

  Platform Memory Device Error Info Section
  Mem Error Detail
    Physical Address: 0x280059b81 Address Mask: 0xfffffffff80 Node: 0
	Card: 0 Module: 3 Bank: 3 Device: 1 Row: 2050 Column: 1356
  Platform Memory Device Error Info Section
  Mem Error Detail
    Node: 0

But in more than half of the cases, salinfo_decode gets lost:

  Platform Memory Device Error Info Section
  Mem Error Detail
    Node: 0  

Again we lose events.

The SAL spec. does not say a word about how many errors have to be
kept by the SAL. Therefore we cannot reckon on the SAL keeping them.

> Both the CMC and CPE interrupt paths have code to switch to
> polling mode in the presence of a burst of correctable errors.
> Can we tune this threshold w.r.t. the number of buffers we
> pre-allocate to save error records so that we (the OS) won't
> be responsible for losing errors?

We are condemned to lose error logs due to the limited number
of the error buffers in the SAL / PAL / OS, due to the limited
services provided by the HW.

I hope we can agree that the probability of a coincidence of more
that one independent errors is very very low
(otherwise change the machine :-)).

Keeping the first error log that contains pertinent, new
information, is very important.
Keeping the last one is important, because not treating rapidly
enough an error can worsen the situation.
The others are just for the statistics...

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (6 preceding siblings ...)
  2008-03-06 13:14 ` Zoltan Menyhart
@ 2008-03-06 17:09 ` Luck, Tony
  2008-03-06 17:29 ` Zoltan Menyhart
                   ` (14 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Luck, Tony @ 2008-03-06 17:09 UTC (permalink / raw)
  To: linux-ia64

> A rough estimation can be done as follows:
>
> Assume you have an MTBF of 30,000 hours.
> The probability of having an MCA in a one minute time frame is less
> than 1 / (60 * 30,000) < 10^(-6).
> The probability of having two independent errors causing MCAs in
> the same one minute time frame is less than 10^(-12).

Russ's large systems change these.  Is 30,000 hours a plausible
MTBF for a DIMM.  What if the system contains 8TB memory in 2GB
DIMMs.  Now you have 4096 DIMM sticks in the system.  Redo your
calculations for this large system.

-Tony

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (7 preceding siblings ...)
  2008-03-06 17:09 ` Luck, Tony
@ 2008-03-06 17:29 ` Zoltan Menyhart
  2008-03-06 17:52 ` Russ Anderson
                   ` (13 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-06 17:29 UTC (permalink / raw)
  To: linux-ia64

Luck, Tony wrote:
>>A rough estimation can be done as follows:
>>
>>Assume you have an MTBF of 30,000 hours.
>>The probability of having an MCA in a one minute time frame is less
>>than 1 / (60 * 30,000) < 10^(-6).
>>The probability of having two independent errors causing MCAs in
>>the same one minute time frame is less than 10^(-12).
> 
> 
> Russ's large systems change these.  Is 30,000 hours a plausible
> MTBF for a DIMM.  What if the system contains 8TB memory in 2GB
> DIMMs.  Now you have 4096 DIMM sticks in the system.  Redo your
> calculations for this large system.
> 
> -Tony

Can you please provide some real MTBF data?

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (8 preceding siblings ...)
  2008-03-06 17:29 ` Zoltan Menyhart
@ 2008-03-06 17:52 ` Russ Anderson
  2008-03-06 21:56 ` Luck, Tony
                   ` (12 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-06 17:52 UTC (permalink / raw)
  To: linux-ia64

On Thu, Mar 06, 2008 at 02:14:48PM +0100, Zoltan Menyhart wrote:
> Luck, Tony wrote:
> 
> Let's see this first:
> 
> >Obviously entering polling
> >mode puts the responsibility onto SAL to keep track of all
> >the error reports
> 
> Please have a look at the
> Figure 2-1. Itanium® Processor Family Firmware Machine Check Handling Model
> in the Error Handling Guide.
> 
> This figure shows that the SAL (or the PAL) cannot see the platform
> originated CPEIs, nor the CPU HW originated CMCIs.

Figure 2-1 does show SAL passing up CPEI records to OS, too.

> When you call SAL_GET_STATE_INFO(), the SAL (and the PAL) will read out
> the error status from some HW registers.
> 
> Therefore the SAL / PAL cannot store error reports.

See section 5.3.2 CMC and CPE Records

  Each processor or physical platform could have multiple valid corrected
  machine check or corrected platform error records. The maximum number of
  these records present in a system depends on the SAL implementation and
  the storage space available on the system. There is no requirement for
  these records to be logged into NVM. The SAL may use an implementation
  specific error record replacement algorithm for overflow situations. The
  OS needs to make an explicit call to the SAL procedure SAL_CLEAR_STATE_INFO
  to clear the CMC and CPE records in order to free up the memory resources
  that may be used for future records.

5.4.1 Corrected Error Event Record

  In response to a CMC/CPE condition, SAL builds and maintains the error
  record for OS retrieval.

> Can the HW (platform or CPU) help to save error reports?
> 
> A typical "error register set" - whatever it is - saves the first
> error and maintains a "cumulative error" status (usually reset
> by SAL_CLEAR_STATE_INFO()).
> 
> CPEs / CMCs will be lost unless you (want to) "swallow" them
> quickly enough.

Yes, we want to handle the records as quickly as possible.

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (9 preceding siblings ...)
  2008-03-06 17:52 ` Russ Anderson
@ 2008-03-06 21:56 ` Luck, Tony
  2008-03-06 22:13 ` Russ Anderson
                   ` (11 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Luck, Tony @ 2008-03-06 21:56 UTC (permalink / raw)
  To: linux-ia64

> Can you please provide some real MTBF data?

I don't have any data on MTBF rates in DIMMs.  There was a
study on single-bit ECC error rates in memory that IIRC
concluded that the rate was about 1 error per gigabyte per
two months.  This study was looking at errors caused by
high energy neutrons which came from the collision of
cosmic rays with the upper atmosphere.  Actual rates vary
by location because of the shape of the earths magnetic field,
and with altitude.

But that was a very old study ... newer DIMMs made on denser
silicon processes will most likely be more vulnerable to
neutron strikes.

-Tony

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (10 preceding siblings ...)
  2008-03-06 21:56 ` Luck, Tony
@ 2008-03-06 22:13 ` Russ Anderson
  2008-03-07 12:02 ` Zoltan Menyhart
                   ` (10 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-06 22:13 UTC (permalink / raw)
  To: linux-ia64

On Thu, Mar 06, 2008 at 06:29:21PM +0100, Zoltan Menyhart wrote:
> Luck, Tony wrote:
> >
> >Russ's large systems change these.  Is 30,000 hours a plausible
> >MTBF for a DIMM.  What if the system contains 8TB memory in 2GB
> >DIMMs.  Now you have 4096 DIMM sticks in the system.  Redo your
> >calculations for this large system.
> >
> >-Tony
> 
> Can you please provide some real MTBF data?

Here is a manufacturer advertising "over 7 years".
7 years is 61,320 hrs, 8 year is 70,080.

http://ramfinder.com/items/ex2gb0132f.html

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (11 preceding siblings ...)
  2008-03-06 22:13 ` Russ Anderson
@ 2008-03-07 12:02 ` Zoltan Menyhart
  2008-03-07 16:55 ` Russ Anderson
                   ` (9 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-07 12:02 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:

> Figure 2-1 does show SAL passing up CPEI records to OS, too.

Yes, as I also said:
"The SAL / PAL can be the origin of CPEIs / CMCIs if they succeed
in correcting MCAs. They stock the related information until the
OS calls SAL_GET_STATE_INFO()."

I Just want to emphasize that in case of the platform / CPU HW originated
CPEIs / CMCIs, the SAL does not know of them before we call
SAL_GET_STATE_INFO(), therefore it cannot store any information about
them.

> See section 5.3.2 CMC and CPE Records
> 
>   Each processor or physical platform could have multiple valid corrected
>   machine check or corrected platform error records. The maximum number of
>   these records present in a system depends on the SAL implementation and
>   the storage space available on the system. There is no requirement for
>   these records to be logged into NVM. The SAL may use an implementation
>   specific error record replacement algorithm for overflow situations. The
>   OS needs to make an explicit call to the SAL procedure SAL_CLEAR_STATE_INFO
>   to clear the CMC and CPE records in order to free up the memory resources
>   that may be used for future records.

As far as I can understand, it is about the events not signaled by
interrupts, but MCAs, and either the PAL or the SAL manages to correct
them (=> CMCI, CPEI).

You have got N >= 1 buffers for this kind of errors.

> 5.4.1 Corrected Error Event Record
> 
>   In response to a CMC/CPE condition, SAL builds and maintains the error
>   record for OS retrieval.

It does not say that the SAL knows about CMCI / CPEI signaled errors
before we call SAL_GET_STATE_INFO().

Example: the Tiger box with i82870:

There is a register pair of FERRST / SERRST for each component, e.g.
the memory controller.

FERRST: first error status register
SERRST: second / subsequent error status register

Note that the FERRST captures correctly the errors, the SERRST
is mixture (OR logic) of all the other errors.

In case of a corrected memory error, the OS receives a CPEI.
When the OS calls SAL_GET_STATE_INFO(), the SAL reads out the
FERRST / SERRST for each component.
If there are multiple errors, the SAL selects which one is to be
reported.
When the OS calls SAL_CLEAR_STATE_INFO(), the SAL resets the
register pairs whose content were reported by SAL_GET_STATE_INFO().
If there are multiple errors, then you can SAL_GET_STATE_INFO()
repeatedly.

> Here is a manufacturer advertising "over 7 years".
> 7 years is 61,320 hrs, 8 year is 70,080.

It seems to be way too low.
Would not it mean:
"99.999% probability that the product will operate for over 7 years without a failure"
instead of being an MTBF value?

Please have a look at e.g.: http://ramfinder.com/items/ex2gb0132f.html

They mean "without a failure": uncorrectable errors.

Luck, Tony wrote:

> Russ's large systems change these.  Is 30,000 hours a plausible
> MTBF for a DIMM.  What if the system contains 8TB memory in 2GB
> DIMMs.  Now you have 4096 DIMM sticks in the system.  Redo your
> calculations for this large system.

Using the memory seen at http://ramfinder.com/items/ex2gb0132f.html

7 years * 100% / (100% - 99.999%) / 4096 = 170 years

i.e. the MTBF: > 1,000,000 hours with 4096 DIMMs.

> ... about 1 error per gigabyte per two months.

It can be an estimation for the single bit error rate (CPEI).

> But that was a very old study ... newer DIMMs made on denser
> silicon processes will most likely be more vulnerable to
> neutron strikes.

Let's assume the flux of cosmic ray generated particles will hit
the same number of memory cells, unless a particle comes // to the
silicon die, then it can hit more cells until its energy is eaten up.

This is why I think it is the "surface" of the memory exposed
to the flux of cosmic ray generated particles, that is important
and not the number of the gigabytes.

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (12 preceding siblings ...)
  2008-03-07 12:02 ` Zoltan Menyhart
@ 2008-03-07 16:55 ` Russ Anderson
  2008-03-10  9:36 ` Zoltan Menyhart
                   ` (8 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-07 16:55 UTC (permalink / raw)
  To: linux-ia64

On Fri, Mar 07, 2008 at 01:02:47PM +0100, Zoltan Menyhart wrote:
> Russ Anderson wrote:
> 
> >Figure 2-1 does show SAL passing up CPEI records to OS, too.
> 
> Yes, as I also said:
> "The SAL / PAL can be the origin of CPEIs / CMCIs if they succeed
> in correcting MCAs. They stock the related information until the
> OS calls SAL_GET_STATE_INFO()."
> 
> I Just want to emphasize that in case of the platform / CPU HW originated
> CPEIs / CMCIs, the SAL does not know of them before we call
> SAL_GET_STATE_INFO(), therefore it cannot store any information about
> them.

In some implementations SAL builds the records in response to 
SAL_GET_STATE_INFO(), in other implementations SAL knows of 
the CPEI/CMCI and builds/buffers the records before the
SAL_GET_STATE_INFO() call.  The SAL spec does not prohibit SAL 
building/buffering the records before SAL_GET_STATE_INFO().

From a practical perspective, I don't think the difference significantly
changes how linux should handle CPEIs/CMCIs.  Linux should try to read/log
the CPEI/CMCI as quick as possible.  The lack of SAL buffering increases
the chance of a record getting lost (overwritten) while SAL buffering
reduces the chance that a CPEI/CMCI record gets lost (overwritten).
If anything, the lack of SAL buffering would be a reason for more
linux buffers, to reduce the chance of losing records.

> >See section 5.3.2 CMC and CPE Records
> >
> >  Each processor or physical platform could have multiple valid corrected
> >  machine check or corrected platform error records. The maximum number of
> >  these records present in a system depends on the SAL implementation and
> >  the storage space available on the system. There is no requirement for
> >  these records to be logged into NVM. The SAL may use an implementation
> >  specific error record replacement algorithm for overflow situations. The
> >  OS needs to make an explicit call to the SAL procedure 
> >  SAL_CLEAR_STATE_INFO
> >  to clear the CMC and CPE records in order to free up the memory resources
> >  that may be used for future records.
> 
> As far as I can understand, it is about the events not signaled by
> interrupts, but MCAs, and either the PAL or the SAL manages to correct
> them (=> CMCI, CPEI).

Agreed that SAL corrected errors can get passed up as CMCI/CPEI.
I do not believe it prohibits other CMCI/CPEI records from being
built/buffered before the SAL_CLEAR_STATE_INFO() call.  

As stated above, from a practical perspective, I don't believe the
difference significanlty changes how linux should behave other than
possibly being a reason for more linux buffers.

> You have got N >= 1 buffers for this kind of errors.

My preference is for a larger N.  Scaling N with system size
may be the best solution for small & large systems.

> >5.4.1 Corrected Error Event Record
> >
> >  In response to a CMC/CPE condition, SAL builds and maintains the error
> >  record for OS retrieval.
> 
> It does not say that the SAL knows about CMCI / CPEI signaled errors
> before we call SAL_GET_STATE_INFO().

It does not say that SAL cannot know before the SAL_GET_STATE_INFO() call.

> Example: the Tiger box with i82870:

I take your word as how Tiger SAL behaves.
Please take my word that other SAL implementations behave differently.

Thanks,
-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (13 preceding siblings ...)
  2008-03-07 16:55 ` Russ Anderson
@ 2008-03-10  9:36 ` Zoltan Menyhart
  2008-03-10 20:36 ` Russ Anderson
                   ` (7 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-10  9:36 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:

> ... in other implementations SAL knows of
> the CPEI/CMCI and builds/buffers the records before the
> SAL_GET_STATE_INFO() call.

If you mean MCAs corrected and transformed into CPEIs/CMCIs, I agree.
If you mean the platform / CPU HW originated CPEIs/CMCIs, please
explain how the SAL catches these interrupts.

I guess the differences among implementations call for a dynamically
configurable solution.

First I would have liked to discuss about the MCAs, which - in may
approach - are completely separated from the CPEIs/CMCIs.
As MCA log buffering cannot be synchronized as it is done
for the CPEIs/CMCIs, it requires different code, can we discuss
separately the mechanisms for MCAs and CPEIs/CMCIs?
As far as the my MCA stuff is concerned, can you agree that it is
safer than the original code?

>>From a practical perspective, I don't think the difference significantly
> changes how linux should handle CPEIs/CMCIs.  Linux should try to read/log
> the CPEI/CMCI as quick as possible. The lack of SAL buffering increases
> the chance of a record getting lost (overwritten) while SAL buffering
> reduces the chance that a CPEI/CMCI record gets lost (overwritten).
> If anything, the lack of SAL buffering would be a reason for more
> linux buffers, to reduce the chance of losing records.

I agree.

> Agreed that SAL corrected errors can get passed up as CMCI/CPEI.
> I do not believe it prohibits other CMCI/CPEI records from being
> built/buffered before the SAL_CLEAR_STATE_INFO() call.  

How can it build / buffer records belonging to the platform / CPU HW
originated CPEIs/CMCIs?
Some of the CPEI/CMCI arrows on the Figure 2-1.... go directly from the
platform / CPU HW to the OS.
And as far as I can see, the SAL do not handle CPE/CMC interrupts.

> As stated above, from a practical perspective, I don't believe the
> difference significanlty changes how linux should behave other than
> possibly being a reason for more linux buffers.

> My preference is for a larger N.  Scaling N with system size
> may be the best solution for small & large systems.

Can we think of some dynamic / platfom dependent way of configuring
the number of the buffers?

E.g. my MCA stuff can start up with, say, 3 buffers by default,
and you will be able to override it by a boot command line option.

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (14 preceding siblings ...)
  2008-03-10  9:36 ` Zoltan Menyhart
@ 2008-03-10 20:36 ` Russ Anderson
  2008-03-10 21:10 ` Russ Anderson
                   ` (6 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-10 20:36 UTC (permalink / raw)
  To: linux-ia64

On Mon, Mar 10, 2008 at 10:36:11AM +0100, Zoltan Menyhart wrote:
> Russ Anderson wrote:
> 
> >... in other implementations SAL knows of
> >the CPEI/CMCI and builds/buffers the records before the
> >SAL_GET_STATE_INFO() call.
> 
> If you mean MCAs corrected and transformed into CPEIs/CMCIs, I agree.
> If you mean the platform / CPU HW originated CPEIs/CMCIs, please
> explain how the SAL catches these interrupts.

The difference does not change how linux should handle CPEIs/CMCIs.

> I guess the differences among implementations call for a dynamically
> configurable solution.

I don't think so.  They should appear to linux the same.

> First I would have liked to discuss about the MCAs, which - in may
> approach - are completely separated from the CPEIs/CMCIs.
> As MCA log buffering cannot be synchronized as it is done
> for the CPEIs/CMCIs, it requires different code, can we discuss
> separately the mechanisms for MCAs and CPEIs/CMCIs?

Yes, I agree that different code for MCA/INIT and CPEI/CMCI is
the right approach.

> As far as the my MCA stuff is concerned, can you agree that it is
> safer than the original code?

Yes.  I like your approach.  I want to make sure it works
on larger systems.

> >>From a practical perspective, I don't think the difference significantly
> >changes how linux should handle CPEIs/CMCIs.  Linux should try to read/log
> >the CPEI/CMCI as quick as possible. The lack of SAL buffering increases
> >the chance of a record getting lost (overwritten) while SAL buffering
> >reduces the chance that a CPEI/CMCI record gets lost (overwritten).
> >If anything, the lack of SAL buffering would be a reason for more
> >linux buffers, to reduce the chance of losing records.
> 
> I agree.
> 
> >As stated above, from a practical perspective, I don't believe the
> >difference significanlty changes how linux should behave other than
> >possibly being a reason for more linux buffers.
> 
> >My preference is for a larger N.  Scaling N with system size
> >may be the best solution for small & large systems.
> 
> Can we think of some dynamic / platfom dependent way of configuring
> the number of the buffers?
> 
> E.g. my MCA stuff can start up with, say, 3 buffers by default,
> and you will be able to override it by a boot command line option.

How about having N be the number of actual cpus?  

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (15 preceding siblings ...)
  2008-03-10 20:36 ` Russ Anderson
@ 2008-03-10 21:10 ` Russ Anderson
  2008-03-11 14:07 ` Zoltan Menyhart
                   ` (5 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-10 21:10 UTC (permalink / raw)
  To: linux-ia64

On Thu, Mar 06, 2008 at 11:24:06AM +0100, Zoltan Menyhart wrote:
> Russ Anderson wrote:
> 
> >I have a test case that creates that scenario.  With your patch and only 
> >one of the MCAs (at most) end up getting logged in 
> >/var/log/salinfo/decoded .
> 
> Can you describe, please, what your test does and what is the
> expected behavior of the MCA layer?

The test process allocates memory, injects an uncorrectable error, 
forks a child, then both processes consume the bad data, with
the effect of two processes going into OS_MCA at the same time.

With the old code a total of four MCA records get logged.  
(Overkill, an opportunity for improvement.)  Each cpu that went 
through MCA logs the error twice, with one of the records being
marked recovered (each pair of records are otherwise identical). 

With the new code the first MCA is reported as occuring on cpu 0
when it occured on cpu 1.  I think it is due to this code in
arch/ia64/kernel/salinfo.c:

-------------------------------------------------------------
        n = data->cpu_check;
//      printk("CPU %d: %s(): data->cpu_check: %d, data->cpu_event: %016lx\n", smp_processor_id(),
//                                      __func__, n, data->cpu_event.bits[0]);  // :-)
        if (atomic_read(&ia64_MCA_logs._b_cnt) > 0 || atomic_read(&ia64_INIT_logs._b_cnt) >
 0){
//              printk("cpu %d %d %d\n", cpu, atomic_read(&ia64_MCA_logs._b_cnt), atomic_read(&ia64_INIT_logs._b_cnt));
                cpu = any_online_cpu(cpu_online_map);
        } else {
                for (i = 0; i < NR_CPUS; i++) {
                        if (cpu_isset(n, data->cpu_event)) {
                                if (!cpu_online(n)) {
                                        cpu_clear(n, data->cpu_event);
                                        continue;
                                }
                                cpu = n;
                                break;
                        }
                        if (++n = NR_CPUS)
                                n = 0;
                }

                if (cpu = -1)
                        goto retry;

                ia64_mlogbuf_dump();

                /* for next read, start checking at next CPU */
                data->cpu_check = cpu;
                if (++data->cpu_check = NR_CPUS)
                        data->cpu_check = 0;
        }
        snprintf(cmd, sizeof(cmd), "read %d\n", cpu);  
-------------------------------------------------------------
This line
                cpu = any_online_cpu(cpu_online_map);

returns 0, so the MCA gets marked as being on cpu 0 instead
of the actual cpu (cpu 1).

 
> Another idea: the integration into the salinfo side in not yet quit smooth, 
> :-)

Understood.

> it is the polling that fetches the logs one by one. Please leave 3 periods
> for the polling to see all the logs.

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (16 preceding siblings ...)
  2008-03-10 21:10 ` Russ Anderson
@ 2008-03-11 14:07 ` Zoltan Menyhart
  2008-03-11 14:32 ` Robin Holt
                   ` (4 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-11 14:07 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:
> ...
>>As far as the my MCA stuff is concerned, can you agree that it is
>>safer than the original code?
> 
> Yes.  I like your approach.  I want to make sure it works
> on larger systems.

If it comes from a boot command line option...

>>E.g. my MCA stuff can start up with, say, 3 buffers by default,
>>and you will be able to override it by a boot command line option.
> 
> How about having N be the number of actual cpus?  

Let me ask again: do you expect _independent_ MCAs to happen?
If you have got a estimation of the probability of independent
MCAs happening at a same time, different from what I calculated,
then please share it with us.

If the MCAs are the consequences of the same error event, then
you can find out what they are, where they are from 2 or 3 logs.

The code actual tries to recover local MCAs only. They are:
- TLB errors: per CPU local. As the CPUs are much more reliable
  then the other components, e.g. the memory, having two or
  more CPUs with corrupted TLBs at the same time is really unlikely.
- I/O or memory read errors:
  + One error has affected N CPUs: the first log is enough.
  + More than one independent error at the same time: assuming
    my estimations are more or less correct...

I still don't see any need for many buffers.

Thanks,

Zoltan

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (17 preceding siblings ...)
  2008-03-11 14:07 ` Zoltan Menyhart
@ 2008-03-11 14:32 ` Robin Holt
  2008-03-11 21:22 ` Russ Anderson
                   ` (3 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Robin Holt @ 2008-03-11 14:32 UTC (permalink / raw)
  To: linux-ia64

On Tue, Mar 11, 2008 at 03:07:20PM +0100, Zoltan Menyhart wrote:
> Let me ask again: do you expect _independent_ MCAs to happen?
> If you have got a estimation of the probability of independent
> MCAs happening at a same time, different from what I calculated,
> then please share it with us.
>
> If the MCAs are the consequences of the same error event, then
> you can find out what they are, where they are from 2 or 3 logs.
>
> The code actual tries to recover local MCAs only. They are:
> - TLB errors: per CPU local. As the CPUs are much more reliable
>  then the other components, e.g. the memory, having two or
>  more CPUs with corrupted TLBs at the same time is really unlikely.
> - I/O or memory read errors:
>  + One error has affected N CPUs: the first log is enough.
>  + More than one independent error at the same time: assuming
>    my estimations are more or less correct...

I don't know enough in this area to be of much use, but I do recall
times where a customer machine has run into an error and the neither the
first nor last record was of any use, but one of the intermediate
records.  I recall taking nearly a day to find the critical difference
and I vaguely recall it was on the order of 120 records and the useful
record was in the early 80s.  Russ certainly has more experience in this
area.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (18 preceding siblings ...)
  2008-03-11 14:32 ` Robin Holt
@ 2008-03-11 21:22 ` Russ Anderson
  2008-03-12  1:08 ` Keith Owens
                   ` (2 subsequent siblings)
  22 siblings, 0 replies; 24+ messages in thread
From: Russ Anderson @ 2008-03-11 21:22 UTC (permalink / raw)
  To: linux-ia64

I'd much rather focus on the actual code.  
See debug information at the end.

On Tue, Mar 11, 2008 at 03:07:20PM +0100, Zoltan Menyhart wrote:
> Russ Anderson wrote:
> >...
> >>As far as the my MCA stuff is concerned, can you agree that it is
> >>safer than the original code?
> >
> >Yes.  I like your approach.  I want to make sure it works
> >on larger systems.
> 
> If it comes from a boot command line option...
> 
> >>E.g. my MCA stuff can start up with, say, 3 buffers by default,
> >>and you will be able to override it by a boot command line option.
> >
> >How about having N be the number of actual cpus?  
> 
> Let me ask again: do you expect _independent_ MCAs to happen?

Depends on what you mean by _independent_.  I have a lot of experience
with _cascading_ MCAs, where there is a root cause failure quickly
followed by other MCAs as a side effect of the initial failure all
occuring as one MCA event.  In those cases capturing all the MCA
information and sorting through to reconstruct the events is vital
to find the root cause.  Whether the MCAs are due to one root cause
or multiple causes is not clear until after the analysis.

Multiple CPUs going through MCA at the same time is not an abstract
scenario.  It is not uncomon to have many processes accessing
the same shared memory and hitting the same bad memory.  That is
why I have test cases for those scenarios.

> If the MCAs are the consequences of the same error event, then
> you can find out what they are, where they are from 2 or 3 logs.

Easier said than done in real life.

> The code actual tries to recover local MCAs only. They are:
> - TLB errors: per CPU local. As the CPUs are much more reliable
>  then the other components, e.g. the memory, having two or
>  more CPUs with corrupted TLBs at the same time is really unlikely.
> - I/O or memory read errors:
>  + One error has affected N CPUs: the first log is enough.

In the case of two processes consuming the same bad data, it
is often the second processes that calls up to OS_MCA first.
The reason is in SAL, the first CPU into MCA tries to rendezvou
the others.  The second one in (beating the rendezvou) sees
the first is doing the rendezvou so he immediately call into
linux OS_MCA.  So the second CPU shows up in OS_MCA before
the first.  There is no guarantee that the first error
in hardware wins the race to be the first in linux OS_MCA.

>  + More than one independent error at the same time: assuming
>    my estimations are more or less correct...

Another recent example of multiple CPUs going into MCA at
the same time was a hot lock on a large system with enough
contention to cause memory timeouts.  It was by looking at
the MCA records that we were able to identify the hot lock
and fix the code.

> I still don't see any need for many buffers.

In testing, I found one of the records getting dropped in salinfo.c
at the comment "saved record changed by mca.c since interrupt, discard it".
That code was not added by your patch, but is something that
impacts logging.

Thanks,
-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (19 preceding siblings ...)
  2008-03-11 21:22 ` Russ Anderson
@ 2008-03-12  1:08 ` Keith Owens
  2008-03-12  7:42 ` Zoltan Menyhart
  2008-04-01 15:18 ` [PATCH] New way of storing MCA/INIT logs - take 2 Zoltan Menyhart
  22 siblings, 0 replies; 24+ messages in thread
From: Keith Owens @ 2008-03-12  1:08 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson (on Tue, 11 Mar 2008 16:22:21 -0500) wrote:
>On Tue, Mar 11, 2008 at 03:07:20PM +0100, Zoltan Menyhart wrote:
>> I still don't see any need for many buffers.
>
>In testing, I found one of the records getting dropped in salinfo.c
>at the comment "saved record changed by mca.c since interrupt, discard it".
>That code was not added by your patch, but is something that
>impacts logging.

A record getting dropped at that point indicates a race between
salinfo.c and mca.c.  salinfo.c is running under spin_lock_irqsave
which is normally safe, but mca.c can be driven at any time and it
completely ignores spin_lock_irqsave.  mca.c grabs the next free buffer
in the circular list and overwrites that buffer.  The record id check
detects that mca.c has overwritten this buffer while salinfo.c was
processing it and retries the extraction of the record to user space.

By definition whatever record was originally in the buffer has now been
lost.  Was the lost record of any use?  No way of telling.  The only
way to avoid that loss is to increase the number of buffers.

Any repeated sequence of recoverable MCA events will result in some
loss of data, no matter how many buffers you allocate, simply because
MCA processing has a higher priority than user space processing.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] New way of storing MCA/INIT logs
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (20 preceding siblings ...)
  2008-03-12  1:08 ` Keith Owens
@ 2008-03-12  7:42 ` Zoltan Menyhart
  2008-04-01 15:18 ` [PATCH] New way of storing MCA/INIT logs - take 2 Zoltan Menyhart
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-03-12  7:42 UTC (permalink / raw)
  To: linux-ia64

Russ Anderson wrote:

> Depends on what you mean by _independent_.  I have a lot of experience
> with _cascading_ MCAs, where there is a root cause failure quickly
> followed by other MCAs as a side effect of the initial failure all
> occuring as one MCA event.  In those cases capturing all the MCA
> information and sorting through to reconstruct the events is vital
> to find the root cause.  Whether the MCAs are due to one root cause
> or multiple causes is not clear until after the analysis.

Independent: there is no single root cause.

Let's say: the number of the buffers has to be adapted (e.g. at
the boot time) to the particularity of the platform, to the
probability of multiple events, to the mean length of cascading
MCAs.

I prefer to have a default number of buffers that allows:
- to run small / moderate sized boxes
- to "survive" the install process on large systems. You
   calculate the number of buffers during the install process.

... even if you stay with the actual code.

> Multiple CPUs going through MCA at the same time is not an abstract
> scenario.  It is not uncomon to have many processes accessing
> the same shared memory and hitting the same bad memory.  That is
> why I have test cases for those scenarios.

This is definitely not a case of independent events.
How much more information are there in the additional logs?

>>If the MCAs are the consequences of the same error event, then
>>you can find out what they are, where they are from 2 or 3 logs.
> 
> Easier said than done in real life.

You may be right => platform dependent number of buffers.

> In the case of two processes consuming the same bad data, it
> is often the second processes that calls up to OS_MCA first.
> The reason is in SAL, the first CPU into MCA tries to rendezvou
> the others.  The second one in (beating the rendezvou) sees
> the first is doing the rendezvou so he immediately call into
> linux OS_MCA.  So the second CPU shows up in OS_MCA before
> the first.  There is no guarantee that the first error
> in hardware wins the race to be the first in linux OS_MCA.

I can agree with your explanation.
Yet you said: the same bad data.
All of the logs will indicate the same bad memory.

> Another recent example of multiple CPUs going into MCA at
> the same time was a hot lock on a large system with enough
> contention to cause memory timeouts.  It was by looking at
> the MCA records that we were able to identify the hot lock
> and fix the code.

... platform dependent number of buffers.

Thanks,

Zoltan


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH] New way of storing MCA/INIT logs - take 2
  2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
                   ` (21 preceding siblings ...)
  2008-03-12  7:42 ` Zoltan Menyhart
@ 2008-04-01 15:18 ` Zoltan Menyhart
  22 siblings, 0 replies; 24+ messages in thread
From: Zoltan Menyhart @ 2008-04-01 15:18 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 3171 bytes --]

This patch adds a lock free, yet safe way of storing MCA/INIT logs.
You will not end up with logs mixed up from different MCAs.

By default, there are N_MCA_INIT_LOGS log buffers for the MCA, and
another N_MCA_INIT_LOGS log buffers for the INIT handler.
Boot command-line options of "nMCAlogs=<NUM>" and "nINITlogs=<NUM>,
where <NUM> is an integer greater than N_MCA_INIT_LOGS, override
the default values.

The first ("N" - 1) logs and the very last one are stored only.
The last one gets overwritten if there are too many logs there.

The admin. info. is in a structure ia64_mca_init_buf_t, see in mca.h.

Handling the first ("N" - 1) log buffers is straight forward:
You increment an atomic variable (->_b_cnt) and you use it as
index to ->_buf[].
Having completed the log, you set the corresponding validity bit.

Otherwise you race (incl. with the nested handlers) for the last buffer:
- Increment the atomic generation counter (->_gen_cnt).
- You own the last log buffer while no one else has got a higher
  generation count.
- The log data is broken up into 4-byte chunks and they are stamped with
  the generation count. They are written together as an atomic64_t into
  the last buffer (*->_last_buf)[] by use of a compare-and-swap primitive
  to make sure that no one with higher generation count has passed by in
  the mean time.
- (*->_last_buf)[0] is a marker:
  * Before writing the log data into the rest of (*->_last_buf)[], you
    set the marker to say "not done" (MCA_INIT_LOG_VALID bit off).
  * Having finished, you set the marker to say "done"
    (MCA_INIT_LOG_VALID bit on).

This is how the code backs off if someone writes the same buffer with
a higher generation count:

       do {
               tmp = atomic64_read(p);		// p => las log buffer
               /*
                * If you can see a higher generation count than yours,
                * then you are not the last - bail out.
                */
               if (GET_GEN_CNT(tmp) > gen_cnt)
                       return -1;
       } while (cmpxchg_rel(p, tmp, COMPOSE_AT_VAL(gen_cnt, value)) != tmp);

The code does not assume that the rendezvous always works.

The salinfo side verifies that every element of the last log buffer is
of the same generation.
If there is no log left to save, it clears ->_b_cnt.
There is no "shift" of the logs in the buffers at the salinfo side.

Well, the the old code is not cleaned up...

Changes since the previous patch:
- Boot command-line options of "nMCAlogs=<NUM>" and "nINITlogs=<NUM>
- Reusing the "struct salinfo_data" infrastructure (not the data buffers)

Notes:
- Writing "clear <cpunum>" does not actually clear the SAL's log record.
  The MCA handler clears the recovered events.
- When checking to see if there is an MCA log coming before the reboot,
  the CPU number should have been picked up from the Processor Device
  Error Info. Yet a CPU causing fatal errors can be excluded after the
  reboot, the CPUs can be renumbered, etc. This implementation lets
  any CPU pick up logs coming before the reboot.
- Apply the patch http://marc.info/?l=linux-ia64&m=120418991227044&w=3
  before this one.

Thanks,

Zoltan Menyhart



[-- Attachment #2: new-mca-log-patch --]
[-- Type: text/plain, Size: 31808 bytes --]

diff -Nru linux-2.6.24-tmp/arch/ia64/kernel/mca.c linux-2.6.24-new-tmp/arch/ia64/kernel/mca.c
--- linux-2.6.24-tmp/arch/ia64/kernel/mca.c	2008-04-01 13:07:33.000000000 +0200
+++ linux-2.6.24-new-tmp/arch/ia64/kernel/mca.c	2008-03-31 11:15:08.000000000 +0200
@@ -183,6 +183,131 @@
 #define	MCA_IRQ_SAFE	1	/* NOT called from the MCA/INIT handlers */
 
 
+#define	N_MCA_INIT_LOGS	3
+ia64_mca_init_buf_t ia64_MCA_logs;	/* Log buffers for the MCA handler */
+ia64_mca_init_buf_t ia64_INIT_logs;	/* Log buffers for the INIT handler */
+static unsigned int ia64_n_MCA_logs __initdata =	/* Incl. the "last log" */
+					N_MCA_INIT_LOGS;
+static unsigned int ia64_n_INIT_logs __initdata =	/* Incl. the "last log" */
+					N_MCA_INIT_LOGS;
+EXPORT_SYMBOL(ia64_MCA_logs);
+EXPORT_SYMBOL(ia64_INIT_logs);
+
+/*
+ * Command-line options of "nMCAlogs=<NUM>" and "nINITlogs=<NUM>,
+ * where <NUM> is an integer greater than N_MCA_INIT_LOGS, set the
+ * maximum number of the MCA / INIT log buffers, incl. the "last one".
+ */
+
+static int __init
+set_ia64_n_MCA_logs(char *str)
+{
+	unsigned int arg;
+
+	if (get_option(&str, &arg) != 1 /* int found, no subsequent comma */)
+		return 1;
+	if (ia64_n_MCA_logs >= arg)
+		return 1;
+	ia64_n_MCA_logs = arg;
+	return 0;
+}
+
+static int __init
+set_ia64_n_INIT_logs(char *str)
+{
+	unsigned int arg;
+
+	if (get_option(&str, &arg) != 1 /* int found, no subsequent comma */)
+		return 1;
+	if (ia64_n_INIT_logs >= arg)
+		return 1;
+	ia64_n_INIT_logs = arg;
+	return 0;
+}
+
+early_param("nMCAlogs", set_ia64_n_MCA_logs);
+early_param("nINITlogs", set_ia64_n_INIT_logs);
+
+/*
+ * Store the "last log".
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns non zero on failure.
+ */
+static int
+ia64_last_log_write(
+	ia64_mca_init_buf_t	* const bp,	/* Where to save the log */
+	const void		* const log,	/* The SAL log to save */
+	unsigned int		size)		/* Its actual size in u32 units */
+{
+	const u32		*src = (u32 *) log;
+	atomic64_t		*p = &(*bp->_last_buf)[0];
+	unsigned int		const gen_cnt = ia64_fetchadd4_acq(&bp->_gen_cnt, 1) + 1;
+
+	/* Set the marker saying "not done" */
+	if (set_last_buf_item(p++, gen_cnt, smp_processor_id()) != 0)
+		return -1;			/* You are NOT the last one */
+	/* Sore the actual log size in u32 units */
+	if (set_last_buf_item(p++, gen_cnt, size) != 0)
+		return -1;			/* You are NOT the last one */
+	/*
+	 * The log data is broken up into 4-byte chunks and they are stamped with
+	 * the generation count. They are written together as an atomic64_t.
+	 */
+	while (size-- > 0)
+		if (set_last_buf_item(p++, gen_cnt, *src++) != 0)
+			return -1;		/* You are NOT the last one */
+	/* Set the marker saying "done" */
+	return set_last_buf_item(&(*bp->_last_buf)[0], gen_cnt,
+					smp_processor_id() | MCA_INIT_LOG_VALID);
+}
+
+/*
+ * Try to pick up a buffer for MCA/INIT log coming from SAL_GET_STATE_INFO().
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns a pointer to the buffer, or NULL on failure.
+ */
+static log_buf_t *
+ia64_get_mca_init_log_buf(
+	ia64_mca_init_buf_t	* const bp)	/* Log buffer admin. info. */
+{
+	unsigned int		idx;		/* Index to ->_buf[] */
+
+	idx = ia64_fetchadd4_acq(&bp->_b_cnt, 1);	/* Returns the old value */
+	return idx < bp->_n_bufs ? MCA_BUFFER(bp, idx) : NULL;
+}
+
+/*
+ * Set up the log buffers for an MCA/INIT handler.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ */
+static inline int
+ia64_mca_init_bufs_set_up(
+	ia64_mca_init_buf_t	* const bp,	/* Log buffers for an MCA/INIT handler */
+	unsigned int		const n_buffs,	/* Incl. the "last log" */
+	unsigned int		sal_info_type)
+{ 
+	/* SAL will tell us the maximum size of any error record of this type. */
+	if ((bp->_b_size = ia64_sal_get_state_info_size(sal_info_type)) == 0)
+		return -1;
+	/* Add 4 bytes for the CPU number and 4 more for the actual log size. */
+	bp->_b_size = (bp->_b_size + 8 + sizeof(u64) - 1) & ~(sizeof(u64) - 1);
+	/*
+	 * Allocate the (n_buffs - 1) conventional buffers. The "last one" stores 4
+	 * bytes on each atomic64_t, therefore allocate twice of ->_b_size for it.
+	 */
+	bp->_buf = (log_buf_t (*)[]) alloc_bootmem((n_buffs - 1 + 2) * bp->_b_size);
+	if (bp->_buf == NULL)
+		return -1;
+	memset(bp->_buf, 0, (n_buffs - 1 + 2) * bp->_b_size);
+	/* The conventional log buffers w/o the "last log". */
+	bp->_n_bufs = n_buffs - 1;
+	/* The "last log buffer": */
+	bp->_last_buf = (atomic64_t (*)[]) ((u8 *) bp->_buf + bp->_b_size * (n_buffs - 1));
+	return 0;
+}
+
 /*
  * Push messages into buffer, print them later if not urgent.
  */
@@ -323,19 +448,6 @@
 	while (1)
 		cpu_relax();
 }
-/*
- * IA64_MCA log support
- */
-#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
-#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
-
-typedef struct ia64_state_log_s
-{
-	spinlock_t	isl_lock;
-	int		isl_index;
-	unsigned long	isl_count;
-	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
-} ia64_state_log_t;
 
 static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
 
@@ -1194,6 +1306,41 @@
 }
 
 /*
+ * Helper for ia64_mca_handler().
+ */
+int
+ia64_mca_handler_helper(
+	unsigned int			* const size_p,	/* -> actual size of the log */
+	void				* const log,	/* SAL log buffer */
+	struct ia64_sal_os_state	* const sos)
+{
+	int				recover;
+
+	/* Get the MCA error record */
+	*size_p = ia64_sal_get_state_info(SAL_INFO_TYPE_MCA, (u64 *) log);
+
+	/* MCA error recovery */
+	recover = ia64_mca_ucmc_extension != NULL &&
+					ia64_mca_ucmc_extension(log, sos);
+	if (recover) {
+		sal_log_record_header_t *rh = log;
+		rh->severity = sal_log_severity_corrected;
+		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
+		sos->os_status = IA64_MCA_CORRECTED;
+	} else {
+		/* Dump buffered message to console */
+		ia64_mlogbuf_finish(1);
+#ifdef CONFIG_KEXEC
+		atomic_set(&kdump_in_progress, 1);
+#endif
+	}
+	return recover;
+}
+
+static int
+ia64_mca_handler_last_log(struct ia64_sal_os_state * const sos);
+
+/*
  * ia64_mca_handler
  *
  *	This is uncorrectable machine check handler called from OS_MCA
@@ -1218,6 +1365,7 @@
 		 struct ia64_sal_os_state *sos)
 {
 	int recover, cpu = smp_processor_id();
+	log_buf_t *log_buf;
 	struct task_struct *previous_current;
 	struct ia64_mca_notify_die nd =
 		{ .sos = sos, .monarch_cpu = &monarch_cpu };
@@ -1259,34 +1407,29 @@
 		while (cpu_isset(cpu, mca_cpu))
 			cpu_relax();	/* spin until monarch wakes us */
         }
+	/*
+	 * Try to pick up a buffer for the log coming from SAL_GET_STATE_INFO().
+	 */
+	if ((log_buf = ia64_get_mca_init_log_buf(&ia64_MCA_logs)) != NULL){
+		/* (->_cpu & MCA_INIT_LOG_VALID) was off, and remains off. */
+		log_buf->_cpu = smp_processor_id();
+		recover = ia64_mca_handler_helper(&log_buf->_log_size, log_buf->_data, sos);
+		/*
+		 * Tell salinfo that this log is valid.
+		 * Don't use set_bit(), ".rel" semantics is required.
+		 * Note that if !recover'ed => nobody will read this log.
+		 */
+		set_bit_rel(MCA_INIT_LOG_VALID_N, &log_buf->_cpu);
+	} else
+		recover = ia64_mca_handler_last_log(sos);
 
-	/* Get the MCA error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, MCA_IRQ_NOTSAFE);
+	if (!recover)
+		monarch_cpu = -1;	/* Do we really care??? */
 
-	/* MCA error recovery */
-	recover = (ia64_mca_ucmc_extension
-		&& ia64_mca_ucmc_extension(
-			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
-			sos));
-
-	if (recover) {
-		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
-		rh->severity = sal_log_severity_corrected;
-		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
-		sos->os_status = IA64_MCA_CORRECTED;
-	} else {
-		/* Dump buffered message to console */
-		ia64_mlogbuf_finish(1);
-#ifdef CONFIG_KEXEC
-		atomic_set(&kdump_in_progress, 1);
-		monarch_cpu = -1;
-#endif
-	}
 	if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
 			== NOTIFY_STOP)
 		ia64_mca_spin(__FUNCTION__);
 
-
 	if (atomic_dec_return(&mca_count) > 0) {
 		int i;
 
@@ -1311,6 +1454,27 @@
 	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
 }
 
+/*
+ * Helper routine for ia64_mca_handler() when only the last log buffer is
+ * available.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ * Should not be inlined.
+ * Don't want buff[max_SAL_log_size] always be on the stack...
+ */
+static noinline int
+ia64_mca_handler_last_log(
+	struct ia64_sal_os_state	* const sos)
+{
+	unsigned char	buff[ia64_MCA_logs._b_size];
+	int		recover;
+	unsigned int	size;
+
+	if ((recover = ia64_mca_handler_helper(&size, buff, sos)))
+		(void) ia64_last_log_write(&ia64_MCA_logs, buff,
+					(size + sizeof(u32) - 1) / sizeof(u32));
+	return recover;
+}
+
 static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);
 static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd);
 
@@ -2015,10 +2179,15 @@
 	 * platform/processor error states for MCA/INIT/CMC
 	 * handling.
 	 */
-	ia64_log_init(SAL_INFO_TYPE_MCA);
+//	ia64_log_init(SAL_INFO_TYPE_MCA);
 	ia64_log_init(SAL_INFO_TYPE_INIT);
 	ia64_log_init(SAL_INFO_TYPE_CMC);
 	ia64_log_init(SAL_INFO_TYPE_CPE);
+	if (ia64_mca_init_bufs_set_up(&ia64_MCA_logs, ia64_n_MCA_logs,
+							SAL_INFO_TYPE_MCA) != 0 ||
+			ia64_mca_init_bufs_set_up(&ia64_INIT_logs, ia64_n_INIT_logs,
+							SAL_INFO_TYPE_INIT) != 0)
+		printk(KERN_WARNING "WARNING: MCA/INIT log buffer set up failed\n");
 
 	mca_init = 1;
 	printk(KERN_INFO "MCA related initialization done\n");
diff -Nru linux-2.6.24-tmp/arch/ia64/kernel/salinfo.c linux-2.6.24-new-tmp/arch/ia64/kernel/salinfo.c
--- linux-2.6.24-tmp/arch/ia64/kernel/salinfo.c	2008-04-01 13:07:33.000000000 +0200
+++ linux-2.6.24-new-tmp/arch/ia64/kernel/salinfo.c	2008-04-01 13:05:24.000000000 +0200
@@ -36,6 +36,12 @@
  *   Modify the locking to make the test for "work to do" an atomic operation.
  */
 
+#if 0
+#define	D_printk(...)            printk(__VA_ARGS__)
+#else
+#define	D_printk(...)            do { } while (0)
+#endif
+
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/types.h>
@@ -49,10 +55,16 @@
 #include <asm/sal.h>
 #include <asm/uaccess.h>
 
+#include <asm/mca.h>
+
 MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
 MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
 MODULE_LICENSE("GPL");
 
+extern ia64_mca_init_buf_t ia64_MCA_logs;	/* Log buffers for the MCA handler */
+extern ia64_mca_init_buf_t ia64_INIT_logs;	/* Log buffers for the INIT handler */
+static int ia64_old_mca_log_checked;		/* Coming before the reboot */
+
 static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data);
 
 typedef struct {
@@ -262,6 +274,58 @@
 	}
 }
 
+#define	MIN(a, b)	(a < b ? a : b)
+
+/*
+ * Check to see if we have got any not yet seen log in *bp (incl. the "last one").
+ * Set the bits in event mask indicating which CPUs are involved.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns:	FALSE if there is no fresh log.
+ *
+ * Note: If "cpu_event_p" points at "cpu_event" of "salinfo_data" than apply the
+ * necessary locking.
+ */
+static int
+new_log_available_set_events(
+	cpumask_t		* const cpu_event_p,
+	ia64_mca_init_buf_t	* const bp)	/* Where to look for the logs */
+{
+	int			found = 0;
+	unsigned int		const limit = MIN(atomic_read(&bp->_b_cnt),
+						bp->_n_bufs /* Excl. the "last log" */);
+	unsigned int		i;
+	unsigned long		tmp;
+
+	for (i = 0; i < limit; i++){
+		tmp = MCA_BUFFER(bp, i)->_cpu;
+		if (MCA_INIT_LOG_VALID & tmp){
+			D_printk("%s(): cpu %ld got event\n", __FUNCTION__,
+								tmp & MCA_INIT_CPU_MASK);
+			cpu_set(tmp & MCA_INIT_CPU_MASK, *cpu_event_p);
+			found = 1;
+		}
+	}
+	if (bp->_last_buf == NULL)
+		return found;
+	for (;; cpu_relax()){
+		i = atomic_read(&bp->_gen_cnt);		/* Gen. counter for _last_buf[] */
+		tmp = atomic64_read(&(*bp->_last_buf)[0]);	/* The marker */
+		if (GET_GEN_CNT(tmp) != i)
+			continue;
+		i = GET_LOG_DATA(tmp);
+		if (i & MCA_INIT_LOG_VALID){
+			D_printk("%s(): cpu %d got \"last\" event\n", __FUNCTION__,
+								i & MCA_INIT_CPU_MASK);
+			cpu_set(i & MCA_INIT_CPU_MASK, *cpu_event_p);
+			return 1;
+		} else
+			return found;
+	}
+	/*NOTREACHED*/
+}
+
+
 /* Check for outstanding MCA/INIT records every minute (arbitrary) */
 #define SALINFO_TIMER_DELAY (60*HZ)
 static struct timer_list salinfo_timer;
@@ -273,6 +337,9 @@
 	unsigned long flags;
 	if (!data->open)
 		return;
+	if (data->type == SAL_INFO_TYPE_MCA)
+		D_printk("%s(): events: 0x%016lx\n", __FUNCTION__,
+			* (u64 *) & (salinfo_data + SAL_INFO_TYPE_MCA)->cpu_event);
 	if (!cpus_empty(data->cpu_event)) {
 		spin_lock_irqsave(&data_saved_lock, flags);
 		salinfo_work_to_do(data);
@@ -280,11 +347,59 @@
 	}
 }
 
+/*
+ * Check to see if there is an MCA log coming before the reboot.
+ * (Should not be inlined: don't want this buffer on the stack all the time.)
+ *
+ * Returns:	The number of the CPU involved, or -1.
+ */
+static noinline int
+check_old_mca(unsigned int b_size)
+{
+	u8 l_buff[b_size];
+
+	if (ia64_sal_get_state_info(SAL_INFO_TYPE_MCA, (u64 *) l_buff) == 0)
+		return -1;
+	/*
+	 * Well, the CPU number should have been picked up from the
+	 * Processor Device Error Info.
+	 */
+	return smp_processor_id();
+}
+
 static void
 salinfo_timeout (unsigned long arg)
 {
+	struct salinfo_data *data;
+	cpumask_t cpu_event;
+	unsigned int cpu;
+	unsigned long flags;
+
 	ia64_mlogbuf_dump();
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
+	data = salinfo_data + SAL_INFO_TYPE_MCA;
+	if (!ia64_old_mca_log_checked){			/* Coming before the reboot? */
+		if (ia64_MCA_logs._b_size == 0)
+			ia64_old_mca_log_checked = 1;	/* MCA init. went wrong */
+		else
+			if ((cpu = check_old_mca(ia64_MCA_logs._b_size)) == -1)
+				ia64_old_mca_log_checked = 1;
+			else {
+				spin_lock_irqsave(&data_saved_lock, flags);
+				cpu_set(cpu, data->cpu_event);
+				salinfo_work_to_do(data);
+				spin_unlock_irqrestore(&data_saved_lock, flags);
+				D_printk("events: 0x%016lx\n", * (u64 *) & data->cpu_event);
+			}
+	}
+	cpus_clear(cpu_event);
+	if (new_log_available_set_events(&cpu_event, &ia64_MCA_logs)){
+		D_printk("new events: 0x%016lx\n", * (u64 *) & cpu_event);
+		spin_lock_irqsave(&data_saved_lock, flags);
+		cpus_or(data->cpu_event, data->cpu_event, cpu_event);
+		salinfo_work_to_do(data);
+		spin_unlock_irqrestore(&data_saved_lock, flags);
+	}
+// The new logging mechanism has not been integrated into the INIT handler yet
 	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
 	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
 	add_timer(&salinfo_timer);
@@ -298,6 +413,136 @@
 	return 0;
 }
 
+
+/*
+ * Copy the "last log" into some regular buffer.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns 1 if the last log has successfully been fetched.
+ */
+static inline int
+copy_last_log(
+	const atomic64_t	*p,		/* On entry: p == &(*->_last_buf)[2] */
+	u32			*dest,
+	unsigned int		const gen,
+	unsigned int		size)		/* SAL log size in u32 units */
+{
+	u64			tmp;
+
+	while (size-- > 0){
+		tmp = atomic64_read(p++);
+		if (GET_GEN_CNT(tmp) != gen)
+			return 0;
+		*dest++ = GET_LOG_DATA(tmp);
+	}
+	return 1;
+}
+
+/*
+ * Fetch the "last log" created by ia64_last_log_write() in mca.c.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ */
+static inline void
+fetch_last_log(
+	ia64_mca_init_buf_t	* const bp,	/* Where to look for the logs */
+	struct salinfo_data	* const data)
+{
+	unsigned int		gen;
+	const atomic64_t	*p;
+	u64			tmp;
+
+//	D_printk("%s(%p,...): type: %d, CPU: %d\n", __FUNCTION__, bp, data->type,
+//								smp_processor_id());
+	for (;; cpu_relax()) {
+		gen = atomic_read(&bp->_gen_cnt);	/* Gen. counter for _last_buf[] */
+		p = &(*bp->_last_buf)[0];
+		tmp = atomic64_read(p++);		/* The marker */
+		if (GET_GEN_CNT(tmp) != gen)
+			continue;
+		tmp = GET_LOG_DATA(tmp);
+		/*
+		 * Before we got here, we have already seen MCA_INIT_LOG_VALID on.
+		 * If it is off, then the log is being updated in this very moment.
+		 */
+		if (!(tmp & MCA_INIT_LOG_VALID))
+			continue;
+		tmp = atomic64_read(p++);		/* SAL log size in u32 units */
+		if (GET_GEN_CNT(tmp) != gen)
+			continue;
+		if (copy_last_log(p, (void *) data->log_buffer, gen, GET_LOG_DATA(tmp)))
+			break;
+	}
+	data->log_size = GET_LOG_DATA(tmp) * sizeof(u32);
+	bp->_gen_seen = gen;
+}
+
+#define	JUST_TEST_LOGS		0
+#define	DO_FETCH_LOG		1
+
+/*
+ * Check to see if we have already seen all the logs for a CPU in *bp.
+ * See the comment above the definition of "ia64_mca_init_buf_t" in mca.h.
+ *
+ * Returns TRUE if some logs are available.
+ */
+static int
+is_log_available(
+	ia64_mca_init_buf_t	* const bp,	/* Where to look for the logs */
+	unsigned int		const cpu,
+	struct salinfo_data	* const data,
+	unsigned int		const mode)	/* JUST_TEST_LOGS, DO_FETCH_LOG */
+{
+	log_buf_t		*p;
+	unsigned int		const b_cnt = atomic_read(&bp->_b_cnt);
+	unsigned int		const limit = MIN( atomic_read(&bp->_b_cnt),
+						bp->_n_bufs /* Excl. the "last log" */);
+	unsigned int		i;
+
+	D_printk("%s(0x%p,... %d): mode: %d\n", __FUNCTION__, bp, data->type, mode);
+	for (i = 0; i < limit; i++){
+		p =  MCA_BUFFER(bp, i);
+		if (MCA_INIT_LOG_VALID & p->_cpu){
+			D_printk("buffer #%d @ %p valid\n", i, p);
+			if (mode == JUST_TEST_LOGS)
+				return 1;
+			data->log_size = p->_log_size;
+			memcpy(data->log_buffer, p->_data, p->_log_size);
+			p->_cpu &= ~MCA_INIT_LOG_VALID;
+			/*
+			 * Check to see if all the buffers have been consumed.
+			 */
+			for (i = 0; i < limit; i++)
+				if (MCA_INIT_LOG_VALID & MCA_BUFFER(bp, i)->_cpu)
+					return 1;
+			if (b_cnt <= bp->_n_bufs ||
+					bp->_gen_seen == atomic_read(&bp->_gen_cnt)){
+				/*
+				 * Clear ->_b_cnt. It can fail.
+				 * ... will be done next time...
+				 */
+				(void) cmpxchg(&bp->_b_cnt, b_cnt, 0);
+			}
+			return 1;
+		}
+	}
+	if (atomic_read(&bp->_gen_cnt) == bp->_gen_seen)
+		return 0;
+	if (mode == JUST_TEST_LOGS)
+		return 1;
+	fetch_last_log(bp, data);
+	/*
+	 * Check to see if all the buffers have been consumed.
+	 */
+	for (i = 0; i < limit; i++)
+		if (MCA_INIT_LOG_VALID & MCA_BUFFER(bp, i)->_cpu)
+			return 1;
+	/*
+	 * Clear ->_b_cnt. It can fail. ... will be done next time...
+	 */
+	(void) cmpxchg(&bp->_b_cnt, b_cnt, 0);
+	return 1;
+}
+
 static ssize_t
 salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
 {
@@ -308,6 +553,8 @@
 	size_t size;
 	int i, n, cpu = -1;
 
+	D_printk("%s(): type: %d events: 0x%016lx\n", __FUNCTION__,
+					data->type, * (u64 *) & data->cpu_event);
 retry:
 	if (cpus_empty(data->cpu_event) && down_trylock(&data->mutex)) {
 		if (file->f_flags & O_NONBLOCK)
@@ -415,9 +662,13 @@
 {
 	struct salinfo_data *data = context;
 	sal_log_record_header_t *rh;
+
 	data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
 	rh = (sal_log_record_header_t *)(data->log_buffer);
-	/* Clear corrected errors as they are read from SAL */
+	/*
+	 * Clear corrected errors as they are read from SAL.
+	 * The MCA handler has already cleared the recovered events.
+	 */
 	if (rh->severity == sal_log_severity_corrected)
 		ia64_sal_clear_state_info(data->type);
 }
@@ -425,15 +676,52 @@
 static void
 salinfo_log_new_read(int cpu, struct salinfo_data *data)
 {
-	struct salinfo_data_saved *data_saved;
+	struct salinfo_data_saved *data_saved = data->data_saved;
 	unsigned long flags;
 	int i;
 	int saved_size = ARRAY_SIZE(data->data_saved);
 
 	data->saved_num = 0;
+	switch (data->type){
+	case SAL_INFO_TYPE_MCA:
+		D_printk("%s(): data->state: %d cpu: %d\n", __FUNCTION__,
+							data->state, cpu);
+		if (is_log_available(&ia64_MCA_logs, cpu, data, JUST_TEST_LOGS)){
+			data->state = STATE_LOG_RECORD;
+			/* Have to save CPU somewhere... */
+			data_saved->cpu = cpu;
+			D_printk("data_saved->cpu: %d\n", data_saved->cpu);
+		} else if (!ia64_old_mca_log_checked){	/* Coming before the reboot */
+			if ((i /* cpu */ = check_old_mca(ia64_MCA_logs._b_size)) == -1)
+				ia64_old_mca_log_checked = 1;
+			else {
+			// Should check if the old MCA is for this CPU
+//			else if (i == cpu){
+				data->state = STATE_LOG_RECORD;
+				/* Have to save CPU somewhere... */
+				data_saved->cpu = cpu;
+				D_printk("data_saved->cpu: %d\n", data_saved->cpu);
+			}
+		}
+		return;
+	case SAL_INFO_TYPE_INIT:
+		D_printk("%s(): data->state: %d cpu: %d\n", __FUNCTION__,
+							data->state, cpu);
+		data_saved->cpu = cpu;
+		if (is_log_available(&ia64_INIT_logs, cpu, data, JUST_TEST_LOGS)){
+			data->state = STATE_LOG_RECORD;
+			/* Have to save CPU somewhere... */
+			data_saved->cpu = cpu;
+		}
+		return;
+	}
 	spin_lock_irqsave(&data_saved_lock, flags);
 retry:
-	for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
+	for (i = 0; i < saved_size; ++i, ++data_saved) {
+		/*
+		 * "salinfo_log_wakeup()" never called for the new buffering
+		 * mechanism used for MCSa / INITs therefore "->buffer" remains NULL.
+		 */
 		if (data_saved->buffer && data_saved->cpu == cpu) {
 			sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer);
 			data->log_size = data_saved->size;
@@ -469,7 +757,45 @@
 	u8 *buf;
 	u64 bufsize;
 
+	D_printk("%s(): data->state: %d\n", __FUNCTION__, data->state);
 	if (data->state == STATE_LOG_RECORD) {
+		switch (data->type){
+		case SAL_INFO_TYPE_MCA:
+			D_printk("data->data_saved->cpu: %d\n", data->data_saved->cpu);
+			/*
+			 * Should find the same log that has been found in
+			 * salinfo_log_new_read(), unless it is the "last log",
+			 * and it has been overwritten by another CPU in the
+			 * mean time. - Will be seen later.
+			 */
+			data->log_size = 0;
+			if (!is_log_available(&ia64_MCA_logs,
+					data->data_saved->cpu, data, DO_FETCH_LOG) &&
+							!ia64_old_mca_log_checked){
+				/* Coming before the reboot */
+				data->log_size = ia64_sal_get_state_info(
+							SAL_INFO_TYPE_MCA,
+							(u64 *) data->log_buffer);
+				if (data->log_size > 0){
+					/*
+					 * Well, the CPU number should be checked
+					 * against Processor Device Error Info.
+					 */
+//					if (data->data_saved->cpu ! = ...CPU...)
+//						data->log_size = 0;
+//					else
+						ia64_old_mca_log_checked = 1;
+				}
+			}
+			data->state = STATE_NO_DATA;
+			break;
+		case SAL_INFO_TYPE_INIT:
+			data->log_size = 0;
+			(void) is_log_available(&ia64_INIT_logs,
+					data->data_saved->cpu, data, DO_FETCH_LOG);
+			data->state = STATE_NO_DATA;
+			break;
+		}
 		buf = data->log_buffer;
 		bufsize = data->log_size;
 	} else if (data->state == STATE_OEMDATA) {
@@ -479,6 +805,8 @@
 		buf = NULL;
 		bufsize = 0;
 	}
+	D_printk("buf: %p, count: %ld, pos: %lld, bufsize: %ld\n",
+					buf, count, *ppos, bufsize);
 	return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
 }
 
@@ -486,6 +814,9 @@
 salinfo_log_clear_cpu(void *context)
 {
 	struct salinfo_data *data = context;
+	/*
+	 * The MCA handler has already cleared the recovered events.
+	 */
 	ia64_sal_clear_state_info(data->type);
 }
 
@@ -538,10 +869,16 @@
 	if (copy_from_user(cmd, buffer, size))
 		return -EFAULT;
 
+	D_printk("%s(): \"%s\" type: %d state: %d\n", __FUNCTION__, cmd,
+						data->type, data->state);
 	if (sscanf(cmd, "read %d", &cpu) == 1) {
 		salinfo_log_new_read(cpu, data);
 	} else if (sscanf(cmd, "clear %d", &cpu) == 1) {
 		int ret;
+
+		/*
+		 * The MCA handler has already cleared the recovered events.
+		 */
 		if ((ret = salinfo_log_clear(data, cpu)))
 			count = ret;
 	} else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) {
@@ -662,10 +999,20 @@
 		entry->proc_fops = &salinfo_data_fops;
 		*sdir++ = entry;
 
-		/* we missed any events before now */
-		for_each_online_cpu(j)
-			cpu_set(j, data->cpu_event);
-
+		switch (data->type){
+		case SAL_INFO_TYPE_MCA:
+//		case SAL_INFO_TYPE_INIT:	// ...coming soon...
+			/*
+			 * There is no way to miss en event if there is a log
+			 * buffer available. An MCA coming before reboot is
+			 * treated separately.
+			 */
+			break;
+		default:
+			/* we missed any events before now */
+			for_each_online_cpu(j)
+				cpu_set(j, data->cpu_event);
+		}
 		*sdir++ = dir;
 	}
 
diff -Nru linux-2.6.24-tmp/include/asm-ia64/bitops.h linux-2.6.24-new-tmp/include/asm-ia64/bitops.h
--- linux-2.6.24-tmp/include/asm-ia64/bitops.h	2008-04-01 13:07:46.000000000 +0200
+++ linux-2.6.24-new-tmp/include/asm-ia64/bitops.h	2008-03-31 09:33:07.000000000 +0200
@@ -51,6 +51,39 @@
 }
 
 /**
+ * set_bit_rel - Atomically set a bit in memory with ".rel" semantics
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ *
+ * The address must be (at least) "long" aligned.
+ * Note that there are driver (e.g., eepro100) which use these operations to
+ * operate on hw-defined data-structures, so we can't easily change these
+ * operations to force a bigger alignment.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+static __inline__ void
+set_bit_rel (int nr, volatile void *addr)
+{
+	__u32 bit, old, new;
+	volatile __u32 *m;
+	CMPXCHG_BUGCHECK_DECL
+
+	m = (volatile __u32 *) addr + (nr >> 5);
+	bit = 1 << (nr & 31);
+	do {
+		CMPXCHG_BUGCHECK(m);
+		old = *m;
+		new = old | bit;
+	} while (cmpxchg_rel(m, old, new) != old);
+}
+
+/**
  * __set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
diff -Nru linux-2.6.24-tmp/include/asm-ia64/mca.h linux-2.6.24-new-tmp/include/asm-ia64/mca.h
--- linux-2.6.24-tmp/include/asm-ia64/mca.h	2008-04-01 13:07:46.000000000 +0200
+++ linux-2.6.24-new-tmp/include/asm-ia64/mca.h	2008-03-31 09:33:07.000000000 +0200
@@ -161,6 +161,143 @@
 
 DECLARE_PER_CPU(u64, ia64_mca_pal_base);
 
+/*
+ * IA64_MCA log support
+ */
+#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
+#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
+
+/*
+ * IA64_MCA log support:
+ * used for SAL_GET_STATE_INFO() data by the MCA/INIT handlers.
+ */
+
+#define	IA64_MAX_MCA_BUFS	2	/* excl. the "last" buffer */
+#if	IA64_MAX_MCA_BUFS < 1
+#error	Min. 1 buffers required
+#endif
+
+#define	IA64_MAX_INIT_BUFS	2	/* excl. the "last" buffer */
+#if	IA64_MAX_INIT_BUFS < 1
+#error	Min. 1 buffers required
+#endif
+
+typedef struct ia64_state_log_s
+{
+	spinlock_t	isl_lock;
+	int		isl_index;
+	unsigned long	isl_count;
+	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
+} ia64_state_log_t;
+
+/*
+ * These structures below describe the global buffers available for an MCA or an
+ * INIT handler to store SAL_GET_STATE_INFO() data.
+ *
+ * Note: there is no use saving non-recovered MCAs: there will be no chance for
+ * such a log to hit the permanent storage device.
+ *
+ * The rules are:
+ * - The first ->_n_bufs logs (called as conventional ones) and the very last
+ *   one are stored only.
+ * - The last log gets overwritten if there are too many logs there.
+ * - if (->_b_cnt <= ->_n_bufs + 1), then ->_b_cnt counts the in-use buffers,
+ *   incl. the last one. There is no lost log if (->_b_cnt <= ->_n_bufs + 1).
+ * - if (->_b_cnt >= ->_n_bufs), then ->_gen_cnt is incremented.
+ * - if (->_b_cnt > ->_n_bufs), then the last buffer gets over-written by the
+ *   additional logs.
+ *
+ * The MCA/INIT handler plays as follows:
+ * - It fetches and increments ->_b_cnt in an atomic way (acquisition semantics).
+ * - If (previous value < ->_n_bufs), then it can simply store its log into
+ *   ->_buf[ previous value ]. Having done that, it sets the MCA_INIT_LOG_VALID
+ *   bit in ->_buf[ previous value ]._cpu (release semantics).
+ * - Otherwise it races (incl. with the nesting handlers) for the last buffer:
+ *   + It increments ->_gen_cnt in an atomic way to obtain its generation count
+ *     (acquisition semantics).
+ *   + It owns the last log buffer while no one else has got a higher generation
+ *     count.
+ *   + The log data is broken up into 4-byte chunks and they are stamped with
+ *     the generation count. They are written together as an atomic64_t into
+ *     (*->_last_buf)[] by use of a compare-and-swap primitive to make sure
+ *     that no one with higher generation count has passed by in the mean time.
+ *   + Similarly to the conventional buffers, (*->_last_buf)[0] is a marker:
+ *     it includes the CPU number and the MCA_INIT_LOG_VALID bit:
+ *     * Before writing the log data into the rest of (*->_last_buf)[], the
+ *       MCA/INIT handler sets the marker to say "not done"
+ *       (MCA_INIT_LOG_VALID bit off) + write-memory-barrier.
+ *     * Having finished, it sets the MCA_INIT_LOG_VALID bit.
+ *       (MCA_INIT_LOG_VALID bit on) using release semantics.
+ *
+ * The salinfo side polls ->_b_cnt:
+ * - Once a MCA_INIT_LOG_VALID bit is set in ->_buf[]._cpu, it is safe to read,
+ *   at any time, without any further precaution, the first
+ *   MIN(->_n_bufs, ->_b_cnt) buffer entries.
+ *   While ->_b_cnt is not reset to 0, the log buffers are not reused.
+ * - The salinfo side can clear the MCA_INIT_LOG_VALID bit in ->_buf[]._cpu at
+ *   any time (no need for an atomic operation because the MCA/INIT handler does
+ *   not even consider re-using this item before ->_b_cnt drops back to 0).
+ * - If (->_b_cnt > ->_n_buf), then the last buffer is read as follows:
+ *   + Pick up ->_gen_cnt.
+ *   + Verify the marker (*->_last_buf)[0], it should have the bit
+ *     MCA_INIT_LOG_VALID on. (Otherwise come back later...)
+ *   + While reading (*->_last_buf)[], verify if the generation count in each
+ *     item is the same. (Otherwise restart...)
+ * - The salinfo side can reset ->_b_cnt to 0 with an atomic operation, provided
+ *   it has not changed. (Otherwise restart...)
+ */
+
+#define	MCA_INIT_LOG_VALID_N	31
+#define	MCA_INIT_LOG_VALID	(1U << MCA_INIT_LOG_VALID_N)
+#define	MCA_INIT_CPU_MASK	(MCA_INIT_LOG_VALID - 1)
+
+typedef struct log_buf_s {		/* Conventional log buffer */
+	u32		_cpu;		/* Incl. MCA_INIT_LOG_VALID bit */
+	u32		_log_size;
+	u8		_data[];
+} log_buf_t;
+
+typedef struct ia64_mca_init_buf_s {
+	log_buf_t	(*_buf)[0];	/* Conventional buffers */
+	u32		_b_size;	/* Actual sizeof(log_buf_t) */
+	atomic_t	_b_cnt;		/* Counts the in-use _buf[]'s */
+	u32		_n_bufs;	/* Excl. the "last log" */
+	atomic64_t	(*_last_buf)[0];
+	atomic_t	_gen_cnt;	/* Generation counter for _last_buf[] */
+	u32		_gen_seen;	/* Generation seen by salinfo */
+} ia64_mca_init_buf_t;
+
+/* i-th conventional buffer: */
+#define	MCA_BUFFER(bp, i)		((log_buf_t *) ((u8 *) (bp)->_buf + (bp)->_b_size * (i)))
+
+/* Macros for (*->_last_buf)[]: */
+#define	GET_GEN_CNT(x)		((u32) x)		/* Generation counter */
+#define	GET_LOG_DATA(x)		((u32) (x >> 32))	/* Log data */
+#define	COMPOSE_AT_VAL(gc, dt)	((u32) (gc) | ((u64) (dt) << 32))
+
+/*
+ * Store a 4-byte value into (*->_last_buf)[i].
+ */
+static inline int
+set_last_buf_item(
+	atomic64_t	* const p,		/* == &(*->_last_buf)[i] */
+	unsigned int	const gen_cnt,		/* Generation count */
+	u32		const value)
+{
+	u64		tmp;
+
+	do {
+		tmp = atomic64_read(p);
+		/*
+		 * If you can see a higher generation count than yours,
+		 * then you are not the last - bail out.
+		 */
+		if (GET_GEN_CNT(tmp) > gen_cnt)
+			return -1;
+	} while (cmpxchg_rel(p, tmp, COMPOSE_AT_VAL(gen_cnt, value)) != tmp);
+	return 0;
+}
+
 #else	/* __ASSEMBLY__ */
 
 #define IA64_MCA_CORRECTED	0x0	/* Error has been corrected by OS_MCA */

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2008-04-01 15:18 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-04 17:05 [PATCH] New way of storing MCA/INIT logs Zoltan Menyhart
2008-03-05  0:23 ` Russ Anderson
2008-03-05 13:14 ` Zoltan Menyhart
2008-03-05 16:59 ` Luck, Tony
2008-03-05 18:56 ` Russ Anderson
2008-03-05 23:38 ` Keith Owens
2008-03-06 10:24 ` Zoltan Menyhart
2008-03-06 13:14 ` Zoltan Menyhart
2008-03-06 17:09 ` Luck, Tony
2008-03-06 17:29 ` Zoltan Menyhart
2008-03-06 17:52 ` Russ Anderson
2008-03-06 21:56 ` Luck, Tony
2008-03-06 22:13 ` Russ Anderson
2008-03-07 12:02 ` Zoltan Menyhart
2008-03-07 16:55 ` Russ Anderson
2008-03-10  9:36 ` Zoltan Menyhart
2008-03-10 20:36 ` Russ Anderson
2008-03-10 21:10 ` Russ Anderson
2008-03-11 14:07 ` Zoltan Menyhart
2008-03-11 14:32 ` Robin Holt
2008-03-11 21:22 ` Russ Anderson
2008-03-12  1:08 ` Keith Owens
2008-03-12  7:42 ` Zoltan Menyhart
2008-04-01 15:18 ` [PATCH] New way of storing MCA/INIT logs - take 2 Zoltan Menyhart

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox