* [PATCH] printing support for MCA/INIT
@ 2006-06-07 11:53 Hidetoshi Seto
2006-06-07 13:28 ` Keith Owens
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: Hidetoshi Seto @ 2006-06-07 11:53 UTC (permalink / raw)
To: linux-ia64
Printing messages to the console from the MCA/INIT handlers is useful,
however setting oops_in_progress = 1 in them inevitably breaks
something in the kernel.
This is especially ugly if the system misbehaves after returning
from a recoverable MCA.
This patch adds an ia64_mca_printk() function that collects messages
into a temporary, not-so-large message buffer while in the MCA/INIT
environment and prints them out later, after returning to normal
context or when the handlers decide to bring the system down.
This print function is also exported for use in extension MCA
handlers. It would be useful for describing the details of a recovery.
H.Seto
p.s.
I don't think it is sane to enlarge the temporary message buffer
enough to hold the stack dumps of every cpu, so buffering is disabled while
dumping from default_monarch_init_process. Please fix this in the future.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
arch/ia64/kernel/mca.c | 143 +++++++++++++++++++++++++++++++++++++++------
arch/ia64/kernel/mca_drv.c | 52 +++++++++++-----
arch/ia64/kernel/mca_drv.h | 4 +
arch/ia64/kernel/salinfo.c | 4 +
4 files changed, 168 insertions(+), 35 deletions(-)
Index: linux-2.6.17-rc6/arch/ia64/kernel/mca.c
===================================================================
--- linux-2.6.17-rc6.orig/arch/ia64/kernel/mca.c
+++ linux-2.6.17-rc6/arch/ia64/kernel/mca.c
@@ -54,6 +54,9 @@
*
* 2005-10-07 Keith Owens <kaos@sgi.com>
* Add notify_die() hooks.
+ *
+ * 2006-06-07 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+ * Add printing support for MCA/INIT.
*/
#include <linux/config.h>
#include <linux/types.h>
@@ -137,11 +140,113 @@ extern void salinfo_log_wakeup(int type,
static int mca_init __initdata;
+/*
+ * limited & delayed printing support for MCA/INIT handler
+ *
+ * Messages are queued in mlogbuf, a ring buffer of characters indexed by
+ * mlogbuf_start (next character to print) and mlogbuf_end (next free slot).
+ * The ring is empty when the two indices are equal; the writer stops one
+ * slot short of mlogbuf_start so that "full" and "empty" stay distinct.
+ */
+
+#define mprintk(fmt...) ia64_mca_printk(fmt)
+
+#define MLOGBUF_SIZE (256*NR_CPUS)
+static char mlogbuf[MLOGBUF_SIZE];
+static DEFINE_SPINLOCK(mlogbuf_wlock); /* mca context only */
+static DEFINE_SPINLOCK(mlogbuf_rlock); /* normal context only */
+static unsigned long mlogbuf_start;
+static unsigned long mlogbuf_end;
+
+/*
+ * ia64_mca_printk - push a message into mlogbuf, to be printed later.
+ * @fmt: printk-style format string, followed by its arguments
+ *
+ * The message is formatted into a 256-byte stack buffer (vscnprintf
+ * truncates anything longer) and appended to the mlogbuf ring under
+ * mlogbuf_wlock.  When the ring is full the rest of the message is dropped.
+ * Once oops_in_progress is set the ring is considered abandoned and the
+ * message is handed straight to printk instead.
+ */
+void ia64_mca_printk(const char *fmt, ...)
+{
+	va_list args;
+	char temp_buf[256];
+	char *p;
+
+	va_start(args, fmt);
+	vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
+	va_end(args);
+
+	if (oops_in_progress) {
+		/*
+		 * mlogbuf was abandoned, use printk directly instead.
+		 * temp_buf is already formatted, so it must be passed as
+		 * data: used as the format string, any '%' left in the
+		 * message would be interpreted a second time.
+		 */
+		printk("%s", temp_buf);
+		return;
+	}
+
+	/* Copy the output into mlogbuf */
+	spin_lock(&mlogbuf_wlock);
+	for (p = temp_buf; *p; p++) {
+		unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
+
+		if (next == mlogbuf_start) {
+			/* buffer full */
+			mlogbuf[mlogbuf_end] = '\0';
+			break;
+		}
+		mlogbuf[mlogbuf_end] = *p;
+		mlogbuf_end = next;
+	}
+	spin_unlock(&mlogbuf_wlock);
+}
+EXPORT_SYMBOL(ia64_mca_printk);
+
+/*
+ * ia64_mlogbuf_dump - print the messages buffered in mlogbuf.
+ *
+ * Drains the ring in chunks: each pass copies characters from mlogbuf_start
+ * into a local buffer until a NUL is found, the ring end is reached, or the
+ * local buffer is full, prints the chunk and advances mlogbuf_start.
+ * NOTE: call this after returning normal context. (ex. from salinfod)
+ */
+void ia64_mlogbuf_dump(void)
+{
+	char temp_buf[256];
+	char *p;
+	unsigned long index;
+	unsigned long flags;
+
+	/* Get output from mlogbuf */
+	while (mlogbuf_start != mlogbuf_end) {
+		p = temp_buf;
+
+		spin_lock_irqsave(&mlogbuf_rlock, flags);
+
+		index = mlogbuf_start;
+		/*
+		 * Buffered messages are not NUL-separated, so the pending
+		 * data can be longer than temp_buf: stop one byte early to
+		 * keep room for the terminator and let the outer loop pick
+		 * up the remainder.
+		 */
+		while (index != mlogbuf_end &&
+		       p < temp_buf + sizeof(temp_buf) - 1) {
+			*p = mlogbuf[index];
+			index = (index + 1) % MLOGBUF_SIZE;
+			if (!*p)
+				break;
+			p++;
+		}
+		*p = '\0';
+		/* already formatted text: print as data, not as a format */
+		if (temp_buf[0])
+			printk("%s", temp_buf);
+		mlogbuf_start = index;
+
+		spin_unlock_irqrestore(&mlogbuf_rlock, flags);
+	}
+}
+EXPORT_SYMBOL(ia64_mlogbuf_dump);
+
+/*
+ * The system is going down: switch to urgent printing and flush the
+ * buffered messages to the console immediately.
+ * After this call oops_in_progress is set, so ia64_mca_printk() no longer
+ * buffers and prints directly.
+ * NOTE: this should be called from monarch.
+ */
+static void ia64_mlogbuf_break(void)
+{
+ oops_in_progress = 1; /* zap printk locks. */
+ console_loglevel = 15; /* make sure printks make it to console */
+
+ /*
+  * Reset the reader lock before dumping -- presumably so that a holder
+  * interrupted by the MCA/INIT cannot block the dump (TODO confirm).
+  */
+ spin_lock_init(&mlogbuf_rlock);
+ ia64_mlogbuf_dump();
+
+ /* wait for console */
+ printk("Delaying for 5 seconds...\n");
+ udelay(5*1000000);
+}
static void inline
ia64_mca_spin(const char *func)
{
- printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
+ if (monarch_cpu == smp_processor_id())
+ ia64_mlogbuf_break();
+ mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
while (1)
cpu_relax();
}
@@ -989,18 +1094,18 @@ ia64_wait_for_slaves(int monarch, const
}
if (!missing)
goto all_in;
- printk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
+ mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
for_each_online_cpu(c) {
if (c == monarch)
continue;
if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
- printk(" %d", c);
+ mprintk(" %d", c);
}
- printk("\n");
+ mprintk("\n");
return;
all_in:
- printk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
+ mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
return;
}
@@ -1028,10 +1133,8 @@ ia64_mca_handler(struct pt_regs *regs, s
struct ia64_mca_notify_die nd =
{ .sos = sos, .monarch_cpu = &monarch_cpu };
- oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
- console_loglevel = 15; /* make sure printks make it to console */
- printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n",
- sos->proc_state_param, cpu, sos->monarch);
+ mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
+ "monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
monarch_cpu = cpu;
@@ -1067,6 +1170,9 @@ ia64_mca_handler(struct pt_regs *regs, s
rh->severity = sal_log_severity_corrected;
ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
sos->os_status = IA64_MCA_CORRECTED;
+ } else {
+ /* Dump buffered message to console */
+ ia64_mlogbuf_break();
}
if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
== NOTIFY_STOP)
@@ -1306,6 +1412,10 @@ default_monarch_init_process(struct noti
struct task_struct *g, *t;
if (val != DIE_INIT_MONARCH_PROCESS)
return NOTIFY_DONE;
+
+ /* FIXME: mlogbuf will brim over with stack dumps... */
+ ia64_mlogbuf_break();
+
printk(KERN_ERR "Processes interrupted by INIT -");
for_each_online_cpu(c) {
struct ia64_sal_os_state *s;
@@ -1358,12 +1468,9 @@ ia64_init_handler(struct pt_regs *regs,
struct ia64_mca_notify_die nd =
{ .sos = sos, .monarch_cpu = &monarch_cpu };
- oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
- console_loglevel = 15; /* make sure printks make it to console */
-
(void) notify_die(DIE_INIT_ENTER, "INIT", regs, (long)&nd, 0, 0);
- printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
+ mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
sos->proc_state_param, cpu, sos->monarch);
salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
@@ -1376,7 +1483,7 @@ ia64_init_handler(struct pt_regs *regs,
* fix their proms and get their customers updated.
*/
if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
- printk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
+ mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
__FUNCTION__, cpu);
atomic_dec(&slaves);
sos->monarch = 1;
@@ -1388,7 +1495,7 @@ ia64_init_handler(struct pt_regs *regs,
* fix their proms and get their customers updated.
*/
if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
- printk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
+ mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
__FUNCTION__, cpu);
atomic_dec(&monarchs);
sos->monarch = 0;
@@ -1409,7 +1516,7 @@ ia64_init_handler(struct pt_regs *regs,
if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, (long)&nd, 0, 0)
== NOTIFY_STOP)
ia64_mca_spin(__FUNCTION__);
- printk("Slave on cpu %d returning to normal service.\n", cpu);
+ mprintk("Slave on cpu %d returning to normal service.\n", cpu);
set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
atomic_dec(&slaves);
@@ -1427,7 +1534,7 @@ ia64_init_handler(struct pt_regs *regs,
* same serial line, the user will need some time to switch out of the BMC before
* the dump begins.
*/
- printk("Delaying for 5 seconds...\n");
+ mprintk("Delaying for 5 seconds...\n");
udelay(5*1000000);
ia64_wait_for_slaves(cpu, "INIT");
/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
@@ -1440,7 +1547,7 @@ ia64_init_handler(struct pt_regs *regs,
if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, (long)&nd, 0, 0)
== NOTIFY_STOP)
ia64_mca_spin(__FUNCTION__);
- printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
+ mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
atomic_dec(&monarchs);
set_curr_task(cpu, previous_current);
monarch_cpu = -1;
Index: linux-2.6.17-rc6/arch/ia64/kernel/mca_drv.c
===================================================================
--- linux-2.6.17-rc6.orig/arch/ia64/kernel/mca_drv.c
+++ linux-2.6.17-rc6/arch/ia64/kernel/mca_drv.c
@@ -80,14 +80,30 @@ static int
fatal_mca(const char *fmt, ...)
{
va_list args;
+ char buf[256];
va_start(args, fmt);
- vprintk(fmt, args);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
return MCA_NOT_RECOVERED;
}
+/*
+ * mca_recovered - log an informational recovery message via
+ * ia64_mca_printk() and report success.
+ * @fmt: printf-style description of the recovery, followed by its arguments
+ *
+ * The text is formatted into a 256-byte local buffer and emitted as
+ * "MCA: <text>" at KERN_INFO level.  Always returns MCA_RECOVERED.
+ */
+static int
+mca_recovered(const char *fmt, ...)
+{
+	char msg[256];
+	va_list ap;
+
+	va_start(ap, fmt);
+	vsnprintf(msg, sizeof msg, fmt, ap);
+	va_end(ap);
+
+	ia64_mca_printk(KERN_INFO "MCA: %s\n", msg);
+	return MCA_RECOVERED;
+}
+
/**
* mca_page_isolate - isolate a poisoned page in order not to use it later
* @paddr: poisoned memory location
@@ -141,6 +157,7 @@ mca_page_isolate(unsigned long paddr)
void
mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
{
+ ia64_mlogbuf_dump();
printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
raw_smp_processor_id(), current->pid, current->uid,
@@ -441,7 +458,7 @@ recover_from_read_error(slidx_table_t *s
/* Is target address valid? */
if (!pbci->tv)
- return fatal_mca(KERN_ALERT "MCA: target address not valid\n");
+ return fatal_mca("target address not valid");
/*
* cpu read or memory-mapped io read
@@ -459,7 +476,7 @@ recover_from_read_error(slidx_table_t *s
/* Is minstate valid? */
if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
- return fatal_mca(KERN_ALERT "MCA: minstate not valid\n");
+ return fatal_mca("minstate not valid");
psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
@@ -493,13 +510,14 @@ recover_from_read_error(slidx_table_t *s
psr2->bn = 1;
psr2->i = 0;
- return MCA_RECOVERED;
+ return mca_recovered("user memory corruption. "
+ "kill affected process - recovered.");
}
}
- return fatal_mca(KERN_ALERT "MCA: kernel context not recovered,"
- " iip 0x%lx\n", pmsa->pmsa_iip);
+ return fatal_mca("kernel context not recovered, iip 0x%lx\n",
+ pmsa->pmsa_iip);
}
/**
@@ -585,13 +603,13 @@ recover_from_processor_error(int platfor
* The machine check is corrected.
*/
if (psp->cm == 1)
- return MCA_RECOVERED;
+ return mca_recovered("machine check is already corrected.");
/*
* The error was not contained. Software must be reset.
*/
if (psp->us || psp->ci == 0)
- return fatal_mca(KERN_ALERT "MCA: error not contained\n");
+ return fatal_mca("error not contained");
/*
* The cache check and bus check bits have four possible states
@@ -602,22 +620,22 @@ recover_from_processor_error(int platfor
* 1 1 Memory error, attempt recovery
*/
if (psp->bc == 0 || pbci == NULL)
- return fatal_mca(KERN_ALERT "MCA: No bus check\n");
+ return fatal_mca("No bus check");
/*
* Sorry, we cannot handle so many.
*/
if (peidx_bus_check_num(peidx) > 1)
- return fatal_mca(KERN_ALERT "MCA: Too many bus checks\n");
+ return fatal_mca("Too many bus checks");
/*
* Well, here is only one bus error.
*/
if (pbci->ib)
- return fatal_mca(KERN_ALERT "MCA: Internal Bus error\n");
+ return fatal_mca("Internal Bus error");
if (pbci->cc)
- return fatal_mca(KERN_ALERT "MCA: Cache-cache error\n");
+ return fatal_mca("Cache-cache error");
if (pbci->eb && pbci->bsi > 0)
- return fatal_mca(KERN_ALERT "MCA: External bus check fatal status\n");
+ return fatal_mca("External bus check fatal status");
/*
* This is a local MCA and estimated as recoverble external bus error.
@@ -629,7 +647,7 @@ recover_from_processor_error(int platfor
/*
* On account of strange SAL error record, we cannot recover.
*/
- return fatal_mca(KERN_ALERT "MCA: Strange SAL record\n");
+ return fatal_mca("Strange SAL record");
}
/**
@@ -658,10 +676,10 @@ mca_try_to_recover(void *rec, struct ia6
/* Now, OS can recover when there is one processor error section */
if (n_proc_err > 1)
- return fatal_mca(KERN_ALERT "MCA: Too Many Errors\n");
+ return fatal_mca("Too Many Errors");
else if (n_proc_err == 0)
/* Weird SAL record ... We need not to recover */
- return fatal_mca(KERN_ALERT "MCA: Weird SAL record\n");
+ return fatal_mca("Weird SAL record");
/* Make index of processor error section */
mca_make_peidx((sal_log_processor_info_t*)
@@ -672,7 +690,7 @@ mca_try_to_recover(void *rec, struct ia6
/* Check whether MCA is global or not */
if (is_mca_global(&peidx, &pbci, sos))
- return fatal_mca(KERN_ALERT "MCA: global MCA\n");
+ return fatal_mca("global MCA");
/* Try to recover a processor error */
return recover_from_processor_error(platform_err, &slidx, &peidx,
Index: linux-2.6.17-rc6/arch/ia64/kernel/mca_drv.h
===================================================================
--- linux-2.6.17-rc6.orig/arch/ia64/kernel/mca_drv.h
+++ linux-2.6.17-rc6/arch/ia64/kernel/mca_drv.h
@@ -118,3 +118,7 @@ struct mca_table_entry {
extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
extern int mca_recover_range(unsigned long);
+extern void ia64_mca_printk(const char * fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
+extern void ia64_mlogbuf_dump(void);
+
Index: linux-2.6.17-rc6/arch/ia64/kernel/salinfo.c
===================================================================
--- linux-2.6.17-rc6.orig/arch/ia64/kernel/salinfo.c
+++ linux-2.6.17-rc6/arch/ia64/kernel/salinfo.c
@@ -266,6 +266,7 @@ salinfo_log_wakeup(int type, u8 *buffer,
/* Check for outstanding MCA/INIT records every minute (arbitrary) */
#define SALINFO_TIMER_DELAY (60*HZ)
static struct timer_list salinfo_timer;
+extern void ia64_mlogbuf_dump(void);
static void
salinfo_timeout_check(struct salinfo_data *data)
@@ -283,6 +284,7 @@ salinfo_timeout_check(struct salinfo_dat
static void
salinfo_timeout (unsigned long arg)
{
+ ia64_mlogbuf_dump();
salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
@@ -332,6 +334,8 @@ retry:
if (cpu == -1)
goto retry;
+ ia64_mlogbuf_dump();
+
/* for next read, start checking at next CPU */
data->cpu_check = cpu;
if (++data->cpu_check == NR_CPUS)
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
@ 2006-06-07 13:28 ` Keith Owens
2006-06-08 3:39 ` Hidetoshi Seto
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Keith Owens @ 2006-06-07 13:28 UTC (permalink / raw)
To: linux-ia64
Hidetoshi Seto (on Wed, 07 Jun 2006 20:53:22 +0900) wrote:
>Printing message to console from MCA/INIT handler is useful,
>however doing oops_in_progress = 1 in them exactly makes
>something in kernel wrong.
>Especially it sounds ugly if system goes wrong after returning
>from recoverable MCA.
>
>This patch adds ia64_mca_printk() function that collects messages
>into temporary-not-so-large message buffer during in MCA/INIT
>environment and print them out later, after returning to normal
>context or when handlers determine to down the system.
NAK this patch. The handlers print their progress as they perform the
MCA/INIT rendezvous. That information is critical for diagnosing any
problems with entering the handlers, it must not be delayed until the
handlers have exited.
The correct fix is to make printk safe against non maskable interrupts
and to allow polled console I/O when interrupts are disabled. That
fixes printk from NMI error handlers in all architectures, not just
ia64.
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
2006-06-07 13:28 ` Keith Owens
@ 2006-06-08 3:39 ` Hidetoshi Seto
2006-06-08 6:01 ` Luck, Tony
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Hidetoshi Seto @ 2006-06-08 3:39 UTC (permalink / raw)
To: linux-ia64
Keith Owens wrote:
> Hidetoshi Seto (on Wed, 07 Jun 2006 20:53:22 +0900) wrote:
>> This patch adds ia64_mca_printk() function that collects messages
>> into temporary-not-so-large message buffer during in MCA/INIT
>> environment and print them out later, after returning to normal
>> context or when handlers determine to down the system.
>
> NAK this patch. The handlers print their progress as they perform the
> MCA/INIT rendezvous. That information is critical for diagnosing any
> problems with entering the handlers, it must not be delayed until the
> handlers have exited.
Isn't it just convenience of MCA/INIT handler developers like us?
I don't think it is needed much more than providing stable system to
ia64 users. Even recoverable TLB error which handled perfectly in old
days is now possibly break the system and its logs.
Why don't you use notifier all around there for such purpose?
Or is it better to have a switch the visibility of these messages?
IA64_MCA_DEBUG() is already there.
I guess there are only 2 cases actually needs to display its progress,
long time wait on rendezvous and INIT-monarch. Except these cases,
who cares about short delay of the last words from fading system and
difference of delayed messages, "there was a CMC/CPE" and "an MCA was
corrected"?
> The correct fix is to make printk safe against non maskable interrupts
> and to allow polled console I/O when interrupts are disabled. That
> fixes printk from NMI error handlers in all architectures, not just
> ia64.
Too idealistic, at least this time.
There are many delicate thing in doing new printk from NMI context
besides interrupted printk and console driver. Making NMI handlers
(especially MCA handlers) based on assumption that printk still work
correct is technically not safe.
This patch should be replaced if such printk_on_critical is available.
Until then, I believe it is better to put the stability of the system
needed by ia64 users above our convenience.
How about your second thought?
H.Seto
^ permalink raw reply [flat|nested] 9+ messages in thread* RE: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
2006-06-07 13:28 ` Keith Owens
2006-06-08 3:39 ` Hidetoshi Seto
@ 2006-06-08 6:01 ` Luck, Tony
2006-06-08 6:29 ` Hidetoshi Seto
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Luck, Tony @ 2006-06-08 6:01 UTC (permalink / raw)
To: linux-ia64
> I guess there are only 2 cases actually needs to display its progress,
> long time wait on rendezvous and INIT-monarch.
In the MCA case, something bad has already happened to the system,
it is possible that we will not complete printing all of the
messages, but if they are streaming directly to the console, then
at least we will see the first part of the messages. If you buffer
them to be printed later, there may be no "later", and all the
information will be lost.
-Tony
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
` (2 preceding siblings ...)
2006-06-08 6:01 ` Luck, Tony
@ 2006-06-08 6:29 ` Hidetoshi Seto
2006-06-08 6:36 ` Keith Owens
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Hidetoshi Seto @ 2006-06-08 6:29 UTC (permalink / raw)
To: linux-ia64
Luck, Tony wrote:
>> I guess there are only 2 cases actually needs to display its progress,
>> long time wait on rendezvous and INIT-monarch.
>
> In the MCA case, something bad has already happened to the system,
> it is possible that we will not complete printing all of the
> messages, but if they are streaming directly to the console, then
> at least we will see the first part of the messages. If you buffer
> them to be printed later, there may be no "later", and all the
> information will be lost.
>
> -Tony
>
Please look my patch.
@@ -1067,6 +1170,9 @@ ia64_mca_handler(struct pt_regs *regs, s
rh->severity = sal_log_severity_corrected;
ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
sos->os_status = IA64_MCA_CORRECTED;
+ } else {
+ /* Dump buffered message to console */
+ ia64_mlogbuf_break();
}
if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
== NOTIFY_STOP)
If the MCA handler cannot recover from the error, it tries to printk
all buffered messages before returning to SAL.
Isn't that enough?
H.Seto
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
` (3 preceding siblings ...)
2006-06-08 6:29 ` Hidetoshi Seto
@ 2006-06-08 6:36 ` Keith Owens
2006-06-08 10:27 ` Hidetoshi Seto
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Keith Owens @ 2006-06-08 6:36 UTC (permalink / raw)
To: linux-ia64
"Luck, Tony" (on Wed, 7 Jun 2006 23:01:54 -0700) wrote:
>> I guess there are only 2 cases actually needs to display its progress,
>> long time wait on rendezvous and INIT-monarch.
>
>In the MCA case, something bad has already happened to the system,
>it is possible that we will not complete printing all of the
>messages, but if they are streaming directly to the console, then
>at least we will see the first part of the messages. If you buffer
>them to be printed later, there may be no "later", and all the
>information will be lost.
Also consider that crash dump may be invoked from MCA/INIT. The
various crash dump analysis tools all expect to find the messages in
the dmesg buffer in the dump. Adding a special print buffer just for
MCA/INIT means changing all the crash dump tools to look in two places.
The existing 'oops_in_progress' code is working pretty well. It does
leave nasty bits behind if the MCA is recoverable, but that problem is
not bad enough to justify a completely separate print mechanism plus
changes to external programs. Instead we should fix the unwanted side
effects of oops_in_progress.
It is possible to make the core of printk completely NMI safe. We can
make it lockless, or retain the locks but detect that there is no
movement and ignore the lock. The SN2 serial console does the latter,
see drivers/serial/sn_console.c::sn_sal_console_write(). This means
that SN2 machines can safely write to the console even from MCA/INIT.
printk can use the same technique to lock access to its print buffer.
/* somebody really wants this output, might be an
* oops, kdb, panic, etc. make sure they get it. */
if (spin_is_locked(&port->sc_port.lock)) {
int lhead = port->sc_port.info->xmit.head;
int ltail = port->sc_port.info->xmit.tail;
int counter, got_lock = 0;
/*
* We attempt to determine if someone has died with the
* lock. We wait ~20 secs after the head and tail ptrs
* stop moving and assume the lock holder is not functional
* and plow ahead. If the lock is freed within the time out
* period we re-get the lock and go ahead normally. We also
* remember if we have plowed ahead so that we don't have
* to wait out the time out period again - the asumption
* is that we will time out again.
*/
for (counter = 0; counter < 150; mdelay(125), counter++) {
if (!spin_is_locked(&port->sc_port.lock)
|| stole_lock) {
if (!stole_lock) {
spin_lock_irqsave(&port->sc_port.lock,
flags);
got_lock = 1;
}
break;
} else {
/* still locked */
if ((lhead != port->sc_port.info->xmit.head)
|| (ltail != port->sc_port.info->xmit.tail)) {
lhead = port->sc_port.info->xmit.head;
ltail = port->sc_port.info->xmit.tail;
counter = 0;
}
}
}
/* flush anything in the serial core xmit buffer, raw */
sn_transmit_chars(port, 1);
if (got_lock) {
spin_unlock_irqrestore(&port->sc_port.lock, flags);
stole_lock = 0;
} else {
/* fell thru */
stole_lock = 1;
}
puts_raw_fixed(port->sc_ops->sal_puts_raw, s, count);
} else {
stole_lock = 0;
spin_lock_irqsave(&port->sc_port.lock, flags);
sn_transmit_chars(port, 1);
spin_unlock_irqrestore(&port->sc_port.lock, flags);
puts_raw_fixed(port->sc_ops->sal_puts_raw, s, count);
}
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
` (4 preceding siblings ...)
2006-06-08 6:36 ` Keith Owens
@ 2006-06-08 10:27 ` Hidetoshi Seto
2006-09-15 2:31 ` Hidetoshi Seto
2006-09-15 3:12 ` Russ Anderson
7 siblings, 0 replies; 9+ messages in thread
From: Hidetoshi Seto @ 2006-06-08 10:27 UTC (permalink / raw)
To: linux-ia64
Keith Owens wrote:
> Also consider that crash dump may be invoked from MCA/INIT. The
> various crash dump analysis tools all expect to find the messages in
> the dmesg buffer in the dump. Adding a special print buffer just for
> MCA/INIT means changing all the crash dump tools to look in two places.
I doubt it.
If you successfully get the crash dump, you will be able to see where
the dump was invoked, possibly with short string describing why the dump
was invoked. Then these short messages from MCA/INIT handler are not so
important thing even still you can look in special buffer.
Or export ia64_mlogbuf_break and use it to flush special buffer to
dmesg buffer before you invoke crash dump.
> It is possible to make the core of printk completely NMI safe. We can
> make it lockless, or retain the locks but detect that there is no
> movement and ignore the lock. The SN2 serial console does the latter,
> see drivers/serial/sn_console.c::sn_sal_console_write(). This means
> that SN2 machines can safely write to the console even from MCA/INIT.
> printk can use the same technique to lock access to its print buffer.
Latter will not be acceptable. How long we can pause the system to
recover a TLB error which can be swept in a second?
I have no idea to make printk lockless. Really can we?
Still I like patching as a workaround better than waiting incredible
feature in future.
H.Seto
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
` (5 preceding siblings ...)
2006-06-08 10:27 ` Hidetoshi Seto
@ 2006-09-15 2:31 ` Hidetoshi Seto
2006-09-15 3:12 ` Russ Anderson
7 siblings, 0 replies; 9+ messages in thread
From: Hidetoshi Seto @ 2006-09-15 2:31 UTC (permalink / raw)
To: linux-ia64
Here is the latest updated patch.
Let me express my appreciation to Russ for all the advice
and help.
Thanks,
H.Seto
-----
Printing messages to the console from the MCA/INIT handlers is useful,
however setting oops_in_progress = 1 in them inevitably breaks
something in the kernel. This is especially ugly if the
system misbehaves after returning from a recoverable MCA.
This patch adds an ia64_mca_printk() function that collects
messages into a temporary, not-so-large message buffer while
in the MCA/INIT environment and prints them out later, after
returning to normal context or when the handlers decide to
bring the system down.
This print function is also exported for use in extension
MCA handlers. It would be useful for describing the details of
a recovery.
NOTE:
I don't think it is sane to enlarge the temporary message
buffer enough to hold the whole stack dumps from INIT, so
buffering is disabled during the stack dump from the INIT monarch
(= default_monarch_init_process). Please fix this in the future.
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
arch/ia64/kernel/mca.c | 216 +++++++++++++++++++++++++++++++++++++++++----
arch/ia64/kernel/mca_drv.c | 54 +++++++----
arch/ia64/kernel/mca_drv.h | 4
arch/ia64/kernel/salinfo.c | 4
4 files changed, 242 insertions(+), 36 deletions(-)
Index: tony/arch/ia64/kernel/mca.c
===================================================================
--- tony.orig/arch/ia64/kernel/mca.c
+++ tony/arch/ia64/kernel/mca.c
@@ -54,6 +54,9 @@
*
* 2005-10-07 Keith Owens <kaos@sgi.com>
* Add notify_die() hooks.
+ *
+ * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+ * Add printing support for MCA/INIT.
*/
#include <linux/types.h>
#include <linux/init.h>
@@ -136,11 +139,175 @@
static int mca_init __initdata;
+/*
+ * limited & delayed printing support for MCA/INIT handler
+ *
+ * Messages are queued in mlogbuf, a ring buffer of characters indexed by
+ * mlogbuf_start (next character to print) and mlogbuf_end (next free slot).
+ */
+
+#define mprintk(fmt...) ia64_mca_printk(fmt)
+
+#define MLOGBUF_SIZE (512+256*NR_CPUS)
+#define MLOGBUF_MSGMAX 256
+static char mlogbuf[MLOGBUF_SIZE];
+static DEFINE_SPINLOCK(mlogbuf_wlock); /* mca context only */
+static DEFINE_SPINLOCK(mlogbuf_rlock); /* normal context only */
+static unsigned long mlogbuf_start;
+static unsigned long mlogbuf_end;
+/* set once ia64_mlogbuf_finish() has flushed and waited for the console */
+static unsigned int mlogbuf_finished = 0;
+/* jiffies at which an INIT first found mlogbuf_rlock held, 0 if none */
+static unsigned long mlogbuf_timestamp = 0;
+
+static int loglevel_save = -1;
+
+/*
+ * Switch to urgent printing: mark an oops in progress and raise the console
+ * loglevel to 15, remembering the previous level the first time through.
+ * Wrapped in do { } while (0) so the macro expands to a single statement
+ * and is safe under an unbraced if/else; callers supply the semicolon.
+ */
+#define BREAK_LOGLEVEL(__console_loglevel)			\
+	do {							\
+		oops_in_progress = 1;				\
+		if (loglevel_save < 0)				\
+			loglevel_save = __console_loglevel;	\
+		__console_loglevel = 15;			\
+	} while (0)
+
+/*
+ * Undo BREAK_LOGLEVEL: restore the saved console loglevel (if any) and
+ * clear the mlogbuf_finished and oops_in_progress flags.
+ */
+#define RESTORE_LOGLEVEL(__console_loglevel)			\
+	do {							\
+		if (loglevel_save >= 0) {			\
+			__console_loglevel = loglevel_save;	\
+			loglevel_save = -1;			\
+		}						\
+		mlogbuf_finished = 0;				\
+		oops_in_progress = 0;				\
+	} while (0)
+
+/*
+ * ia64_mca_printk - push a message into mlogbuf, print it later if not urgent.
+ * @fmt: printk-style format string, followed by its arguments
+ *
+ * The message is formatted into a MLOGBUF_MSGMAX-byte stack buffer
+ * (vscnprintf truncates anything longer) and appended to the mlogbuf ring
+ * under mlogbuf_wlock; a NUL is stored at the new end of the ring.  When
+ * the ring is full the rest of the message is dropped.  Once
+ * oops_in_progress is set the ring is considered abandoned and the message
+ * is handed straight to printk instead.
+ */
+void ia64_mca_printk(const char *fmt, ...)
+{
+	va_list args;
+	char temp_buf[MLOGBUF_MSGMAX];
+	char *p;
+
+	va_start(args, fmt);
+	vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
+	va_end(args);
+
+	if (oops_in_progress) {
+		/*
+		 * mlogbuf was abandoned, use printk directly instead.
+		 * temp_buf is already formatted, so it must be passed as
+		 * data: used as the format string, any '%' left in the
+		 * message would be interpreted a second time.
+		 */
+		printk("%s", temp_buf);
+		return;
+	}
+
+	/* Copy the output into mlogbuf */
+	spin_lock(&mlogbuf_wlock);
+	for (p = temp_buf; *p; p++) {
+		unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
+
+		if (next == mlogbuf_start)
+			break;	/* buffer full */
+		mlogbuf[mlogbuf_end] = *p;
+		mlogbuf_end = next;
+	}
+	mlogbuf[mlogbuf_end] = '\0';
+	spin_unlock(&mlogbuf_wlock);
+}
+EXPORT_SYMBOL(ia64_mca_printk);
+
+/*
+ * ia64_mlogbuf_dump - print the messages buffered in mlogbuf.
+ *
+ * Drains the ring in chunks of at most MLOGBUF_MSGMAX - 1 characters: each
+ * pass copies from mlogbuf_start until a NUL is found, the ring end is
+ * reached, or the local buffer is full, prints the chunk and advances
+ * mlogbuf_start.  mlogbuf_timestamp is cleared on every pass.
+ * NOTE: call this after returning normal context. (ex. from salinfod)
+ */
+void ia64_mlogbuf_dump(void)
+{
+	char temp_buf[MLOGBUF_MSGMAX];
+	char *p;
+	unsigned long index;
+	unsigned long flags;
+	unsigned int printed_len;
+
+	/* Get output from mlogbuf */
+	while (mlogbuf_start != mlogbuf_end) {
+		temp_buf[0] = '\0';
+		p = temp_buf;
+		printed_len = 0;
+
+		spin_lock_irqsave(&mlogbuf_rlock, flags);
+
+		index = mlogbuf_start;
+		while (index != mlogbuf_end) {
+			*p = mlogbuf[index];
+			index = (index + 1) % MLOGBUF_SIZE;
+			if (!*p)
+				break;
+			p++;
+			/* keep the last byte for the terminator */
+			if (++printed_len >= MLOGBUF_MSGMAX - 1)
+				break;
+		}
+		*p = '\0';
+		/*
+		 * The chunk is already formatted text: print it as data so
+		 * that a '%' in a buffered message is not parsed as a
+		 * conversion.
+		 */
+		if (temp_buf[0])
+			printk("%s", temp_buf);
+		mlogbuf_start = index;
+
+		mlogbuf_timestamp = 0;
+		spin_unlock_irqrestore(&mlogbuf_rlock, flags);
+	}
+}
+EXPORT_SYMBOL(ia64_mlogbuf_dump);
+
+/*
+ * ia64_mlogbuf_finish - switch to urgent printing and flush mlogbuf.
+ * @wait: non-zero to delay 5 seconds for the console after flushing
+ *
+ * Call this if the system is going down or if immediate flushing of
+ * messages to the console is required (ex. recovery failed, crash dump is
+ * going to be invoked, long-wait rendezvous etc.)
+ * mlogbuf_finished is set only on the @wait path.
+ * NOTE: this should be called from monarch.
+ *
+ * The function has external linkage because it is passed to EXPORT_SYMBOL
+ * below; a static function must not be exported.
+ * NOTE(review): no prototype is visible in this patch -- add one to a
+ * shared header if modules are meant to call this.
+ */
+void ia64_mlogbuf_finish(int wait)
+{
+	BREAK_LOGLEVEL(console_loglevel);
+
+	/*
+	 * Reset the reader lock before dumping -- presumably so that a
+	 * holder interrupted by the MCA/INIT cannot block the dump
+	 * (TODO confirm).
+	 */
+	spin_lock_init(&mlogbuf_rlock);
+	ia64_mlogbuf_dump();
+	printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, "
+		"MCA/INIT might be dodgy or fail.\n");
+
+	if (!wait)
+		return;
+
+	/* wait for console */
+	printk("Delaying for 5 seconds...\n");
+	udelay(5*1000000);
+
+	mlogbuf_finished = 1;
+}
+EXPORT_SYMBOL(ia64_mlogbuf_finish);
+
+/*
+ * ia64_mlogbuf_dump_from_init - print buffered messages from INIT context.
+ *
+ * Does nothing if ia64_mlogbuf_finish() has already completed.  If a
+ * previous INIT recorded a timestamp less than 30 seconds ago, the system
+ * is treated as messed up and printing is switched to urgent mode.
+ * Otherwise the buffer is dumped only when mlogbuf_rlock is free; if the
+ * lock is held, the first such time is recorded in mlogbuf_timestamp and
+ * the messages stay buffered.
+ */
+static void ia64_mlogbuf_dump_from_init(void)
+{
+	if (mlogbuf_finished)
+		return;
+
+	/* time_before() keeps the comparison correct across jiffies wrap */
+	if (mlogbuf_timestamp &&
+	    time_before(jiffies, mlogbuf_timestamp + 30*HZ)) {
+		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT "
+			" and the system seems to be messed up.\n");
+		ia64_mlogbuf_finish(0);
+		return;
+	}
+
+	if (!spin_trylock(&mlogbuf_rlock)) {
+		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. "
+			"Generated messages other than stack dump will be "
+			"buffered to mlogbuf and will be printed later.\n");
+		printk(KERN_ERR "INIT: If messages would not printed after "
+			"this INIT, wait 30sec and assert INIT again.\n");
+		if (!mlogbuf_timestamp)
+			mlogbuf_timestamp = jiffies;
+		return;
+	}
+	/* the lock was only probed; the dump takes it again itself */
+	spin_unlock(&mlogbuf_rlock);
+	ia64_mlogbuf_dump();
+}
static void inline
ia64_mca_spin(const char *func)
{
- printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
+ /* Only the monarch flushes the buffered messages before spinning. */
+ if (monarch_cpu == smp_processor_id())
+ ia64_mlogbuf_finish(0);
+ mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
while (1)
cpu_relax();
}
@@ -988,18 +1155,22 @@
}
if (!missing)
goto all_in;
- printk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
+ /*
+ * Maybe slave(s) dead. Print buffered messages immediately.
+ */
+ ia64_mlogbuf_finish(0);
+ mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
for_each_online_cpu(c) {
if (c == monarch)
continue;
if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
- printk(" %d", c);
+ mprintk(" %d", c);
}
- printk("\n");
+ mprintk("\n");
return;
all_in:
- printk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
+ mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
return;
}
@@ -1027,10 +1198,8 @@
struct ia64_mca_notify_die nd =
{ .sos = sos, .monarch_cpu = &monarch_cpu };
- oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
- console_loglevel = 15; /* make sure printks make it to console */
- printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n",
- sos->proc_state_param, cpu, sos->monarch);
+ mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
+ "monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
monarch_cpu = cpu;
@@ -1066,6 +1235,9 @@
rh->severity = sal_log_severity_corrected;
ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
sos->os_status = IA64_MCA_CORRECTED;
+ } else {
+ /* Dump buffered message to console */
+ ia64_mlogbuf_finish(1);
}
if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
== NOTIFY_STOP)
@@ -1305,6 +1477,15 @@
struct task_struct *g, *t;
if (val != DIE_INIT_MONARCH_PROCESS)
return NOTIFY_DONE;
+
+ /*
+ * FIXME: mlogbuf will brim over with INIT stack dumps.
+ * To enable show_stack from INIT, we use oops_in_progress which should
+ * be used in real oops. This would cause something wrong after INIT.
+ */
+ BREAK_LOGLEVEL(console_loglevel);
+ ia64_mlogbuf_dump_from_init();
+
printk(KERN_ERR "Processes interrupted by INIT -");
for_each_online_cpu(c) {
struct ia64_sal_os_state *s;
@@ -1326,6 +1507,8 @@
} while_each_thread (g, t);
read_unlock(&tasklist_lock);
}
+ /* FIXME: This will not restore zapped printk locks. */
+ RESTORE_LOGLEVEL(console_loglevel);
return NOTIFY_DONE;
}
@@ -1357,12 +1540,9 @@
struct ia64_mca_notify_die nd =
{ .sos = sos, .monarch_cpu = &monarch_cpu };
- oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
- console_loglevel = 15; /* make sure printks make it to console */
-
(void) notify_die(DIE_INIT_ENTER, "INIT", regs, (long)&nd, 0, 0);
- printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
+ mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
sos->proc_state_param, cpu, sos->monarch);
salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
@@ -1375,7 +1555,7 @@
* fix their proms and get their customers updated.
*/
if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
- printk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
+ mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
__FUNCTION__, cpu);
atomic_dec(&slaves);
sos->monarch = 1;
@@ -1387,7 +1567,7 @@
* fix their proms and get their customers updated.
*/
if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
- printk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
+ mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
__FUNCTION__, cpu);
atomic_dec(&monarchs);
sos->monarch = 0;
@@ -1408,7 +1588,7 @@
if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, (long)&nd, 0, 0)
== NOTIFY_STOP)
ia64_mca_spin(__FUNCTION__);
- printk("Slave on cpu %d returning to normal service.\n", cpu);
+ mprintk("Slave on cpu %d returning to normal service.\n", cpu);
set_curr_task(cpu, previous_current);
ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
atomic_dec(&slaves);
@@ -1426,7 +1606,7 @@
* same serial line, the user will need some time to switch out of the BMC before
* the dump begins.
*/
- printk("Delaying for 5 seconds...\n");
+ mprintk("Delaying for 5 seconds...\n");
udelay(5*1000000);
ia64_wait_for_slaves(cpu, "INIT");
/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
@@ -1439,7 +1619,7 @@
if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, (long)&nd, 0, 0)
== NOTIFY_STOP)
ia64_mca_spin(__FUNCTION__);
- printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
+ mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
atomic_dec(&monarchs);
set_curr_task(cpu, previous_current);
monarch_cpu = -1;
Index: tony/arch/ia64/kernel/mca_drv.c
===================================================================
--- tony.orig/arch/ia64/kernel/mca_drv.c
+++ tony/arch/ia64/kernel/mca_drv.c
@@ -79,14 +79,30 @@
fatal_mca(const char *fmt, ...)
{
va_list args;
+ char buf[256];
va_start(args, fmt);
- vprintk(fmt, args);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
return MCA_NOT_RECOVERED;
}
+/*
+ * Format a recovery message into a 256-byte local buffer, pass it to
+ * ia64_mca_printk() with an "MCA: " prefix at KERN_INFO level, and
+ * return MCA_RECOVERED to the caller.
+ */
+static int
+mca_recovered(const char *fmt, ...)
+{
+ char msg[256];
+ va_list ap;
+
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
+
+ ia64_mca_printk(KERN_INFO "MCA: %s\n", msg);
+ return MCA_RECOVERED;
+}
+
+
/**
* mca_page_isolate - isolate a poisoned page in order not to use it later
* @paddr: poisoned memory location
@@ -140,6 +156,7 @@
void
mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
{
+ ia64_mlogbuf_dump();
printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
raw_smp_processor_id(), current->pid, current->uid,
@@ -440,7 +457,7 @@
/* Is target address valid? */
if (!pbci->tv)
- return fatal_mca(KERN_ALERT "MCA: target address not valid\n");
+ return fatal_mca("target address not valid");
/*
* cpu read or memory-mapped io read
@@ -458,7 +475,7 @@
/* Is minstate valid? */
if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
- return fatal_mca(KERN_ALERT "MCA: minstate not valid\n");
+ return fatal_mca("minstate not valid");
psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
@@ -492,13 +509,14 @@
psr2->bn = 1;
psr2->i = 0;
- return MCA_RECOVERED;
+ return mca_recovered("user memory corruption. "
+ "kill affected process - recovered.");
}
}
- return fatal_mca(KERN_ALERT "MCA: kernel context not recovered,"
- " iip 0x%lx\n", pmsa->pmsa_iip);
+ return fatal_mca("kernel context not recovered, iip 0x%lx\n",
+ pmsa->pmsa_iip);
}
/**
@@ -584,13 +602,13 @@
* The machine check is corrected.
*/
if (psp->cm == 1)
- return MCA_RECOVERED;
+ return mca_recovered("machine check is already corrected.");
/*
* The error was not contained. Software must be reset.
*/
if (psp->us || psp->ci == 0)
- return fatal_mca(KERN_ALERT "MCA: error not contained\n");
+ return fatal_mca("error not contained");
/*
* The cache check and bus check bits have four possible states
@@ -601,22 +619,22 @@
* 1 1 Memory error, attempt recovery
*/
if (psp->bc == 0 || pbci == NULL)
- return fatal_mca(KERN_ALERT "MCA: No bus check\n");
+ return fatal_mca("No bus check");
/*
* Sorry, we cannot handle so many.
*/
if (peidx_bus_check_num(peidx) > 1)
- return fatal_mca(KERN_ALERT "MCA: Too many bus checks\n");
+ return fatal_mca("Too many bus checks");
/*
* Well, here is only one bus error.
*/
if (pbci->ib)
- return fatal_mca(KERN_ALERT "MCA: Internal Bus error\n");
+ return fatal_mca("Internal Bus error");
if (pbci->cc)
- return fatal_mca(KERN_ALERT "MCA: Cache-cache error\n");
+ return fatal_mca("Cache-cache error");
if (pbci->eb && pbci->bsi > 0)
- return fatal_mca(KERN_ALERT "MCA: External bus check fatal status\n");
+ return fatal_mca("External bus check fatal status");
/*
* This is a local MCA and estimated as recoverble external bus error.
@@ -628,7 +646,7 @@
/*
* On account of strange SAL error record, we cannot recover.
*/
- return fatal_mca(KERN_ALERT "MCA: Strange SAL record\n");
+ return fatal_mca("Strange SAL record");
}
/**
@@ -657,10 +675,10 @@
/* Now, OS can recover when there is one processor error section */
if (n_proc_err > 1)
- return fatal_mca(KERN_ALERT "MCA: Too Many Errors\n");
+ return fatal_mca("Too Many Errors");
else if (n_proc_err == 0)
- /* Weird SAL record ... We need not to recover */
- return fatal_mca(KERN_ALERT "MCA: Weird SAL record\n");
+ /* Weird SAL record ... We can't do anything */
+ return fatal_mca("Weird SAL record");
/* Make index of processor error section */
mca_make_peidx((sal_log_processor_info_t*)
@@ -671,7 +689,7 @@
/* Check whether MCA is global or not */
if (is_mca_global(&peidx, &pbci, sos))
- return fatal_mca(KERN_ALERT "MCA: global MCA\n");
+ return fatal_mca("global MCA");
/* Try to recover a processor error */
return recover_from_processor_error(platform_err, &slidx, &peidx,
Index: tony/arch/ia64/kernel/mca_drv.h
===================================================================
--- tony.orig/arch/ia64/kernel/mca_drv.h
+++ tony/arch/ia64/kernel/mca_drv.h
@@ -118,3 +118,7 @@
extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
extern int mca_recover_range(unsigned long);
+extern void ia64_mca_printk(const char * fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
+extern void ia64_mlogbuf_dump(void);
+
Index: tony/arch/ia64/kernel/salinfo.c
===================================================================
--- tony.orig/arch/ia64/kernel/salinfo.c
+++ tony/arch/ia64/kernel/salinfo.c
@@ -266,6 +266,7 @@
/* Check for outstanding MCA/INIT records every minute (arbitrary) */
#define SALINFO_TIMER_DELAY (60*HZ)
static struct timer_list salinfo_timer;
+extern void ia64_mlogbuf_dump(void);
static void
salinfo_timeout_check(struct salinfo_data *data)
@@ -283,6 +284,7 @@
static void
salinfo_timeout (unsigned long arg)
{
+ ia64_mlogbuf_dump();
salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
@@ -332,6 +334,8 @@
if (cpu == -1)
goto retry;
+ ia64_mlogbuf_dump();
+
/* for next read, start checking at next CPU */
data->cpu_check = cpu;
if (++data->cpu_check == NR_CPUS)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] printing support for MCA/INIT
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
` (6 preceding siblings ...)
2006-09-15 2:31 ` Hidetoshi Seto
@ 2006-09-15 3:12 ` Russ Anderson
7 siblings, 0 replies; 9+ messages in thread
From: Russ Anderson @ 2006-09-15 3:12 UTC (permalink / raw)
To: linux-ia64
Acked-by: Russ Anderson <rja@sgi.com>
Hidetoshi Seto wrote:
>
> Here is the latest updated patch.
> Let me express my appreciation to Russ for all the advice
> and help.
>
> Thanks,
> H.Seto
>
> -----
>
> Printing messages to the console from the MCA/INIT handlers is
> useful, but setting oops_in_progress = 1 in them is certain to
> break something in the kernel. This is especially ugly when the
> system misbehaves after returning from a recoverable MCA.
>
> This patch adds the ia64_mca_printk() function, which collects
> messages into a small temporary message buffer while in the
> MCA/INIT environment and prints them out later, after
> returning to normal context or when the handlers decide to
> bring the system down.
>
> This print function is also exported for use in extension
> MCA handlers, where it is useful for describing the details
> of the recovery.
>
> NOTE:
> I don't think it is sane to enlarge the temporary message
> buffer enough to hold whole stack dumps from INIT, so
> buffering is disabled during the stack dump from the INIT monarch
> (= default_monarch_init_process). Please fix this in the future.
>
> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2006-09-15 3:12 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-07 11:53 [PATCH] printing support for MCA/INIT Hidetoshi Seto
2006-06-07 13:28 ` Keith Owens
2006-06-08 3:39 ` Hidetoshi Seto
2006-06-08 6:01 ` Luck, Tony
2006-06-08 6:29 ` Hidetoshi Seto
2006-06-08 6:36 ` Keith Owens
2006-06-08 10:27 ` Hidetoshi Seto
2006-09-15 2:31 ` Hidetoshi Seto
2006-09-15 3:12 ` Russ Anderson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox