From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
To: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>, "H. Peter Anvin" <hpa@zytor.com>,
Andi Kleen <ak@linux.intel.com>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Subject: [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU
Date: Mon, 05 Oct 2009 15:38:59 +0900 [thread overview]
Message-ID: <4AC99483.7020100@jp.fujitsu.com> (raw)
In-Reply-To: <4AC990E1.7030708@jp.fujitsu.com>
On larger systems the global 32 size buffer for mcelog easily overflow,
lose events. And there's a known livelock, now hit by more people,
under high error rate.
This patch fixes these issues by making MCE log buffer to per-CPU:
+ MCEs are added to corresponding local per-CPU buffer, instead of
one big global buffer. Contention/unfairness between CPUs is
eliminated.
Reader/Writer convention is unchanged (= Lock-less for writer side):
+ MCE log writer may come from NMI, so the writer side must be
lock-less. For per-CPU buffer of one CPU, writers may come from
process, IRQ or NMI context, so cmpxchg_local() is used to allocate
buffer space.
+ MCE records are read out and removed from per-CPU buffers by mutex
protected global reader function. Because there are no many
readers in system to contend in most cases. In other words,
reader side is protected with a mutex to guarantee only one reader
is active in the whole system.
As the result now each CPU has its local 32 size buffer.
HS: Add a member header_len to struct mce_log to help debugger to know
where the array of record is.
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <ying.huang@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
arch/x86/include/asm/mce.h | 37 ++++++----
arch/x86/kernel/cpu/mcheck/mce.c | 139 +++++++++++++++++++++++++++-----------
2 files changed, 120 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2f1c0ef..c5d4144 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -52,7 +52,7 @@
#define MCE_INJ_NMI_BROADCAST (1 << 2) /* do NMI broadcasting */
#define MCE_INJ_EXCEPTION (1 << 3) /* raise as exception */
-/* Fields are zero when not available */
+/* MCE log entry. Fields are zero when not available. */
struct mce {
__u64 status;
__u64 misc;
@@ -63,12 +63,12 @@ struct mce {
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
- __u16 pad;
+ __u16 pad;
__u32 cpuid; /* CPUID 1 EAX */
- __u8 cs; /* code segment */
+ __u8 cs; /* code segment */
__u8 bank; /* machine check bank */
__u8 cpu; /* cpu number; obsolete; use extcpu now */
- __u8 finished; /* entry is valid */
+ __u8 finished; /* 1 if write to entry is finished & entry is valid */
__u32 extcpu; /* linux cpu number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
@@ -76,26 +76,33 @@ struct mce {
};
/*
- * This structure contains all data related to the MCE log. Also
- * carries a signature to make it easier to find from external
- * debugging tools. Each entry is only valid when its finished flag
- * is set.
+ * This structure contains all data related to the MCE log. Also carries
+ * a signature to make it easier to find from external debugging tools.
+ * Each entry is only valid when its finished flag is set.
*/
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+
+struct mce_log_cpu;
struct mce_log {
- char signature[12]; /* "MACHINECHECK" */
- unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
+ char signature[12]; /* "MACHINECHEC2" */
+
+ /* points the table of per-CPU buffers */
+ struct mce_log_cpu **mcelog_cpus;
+ unsigned int nr_mcelog_cpus; /* = num_possible_cpus() */
+
+ /* spec of per-CPU buffer */
+ unsigned int header_len; /* offset of array "entry" */
+ unsigned int nr_record; /* array size (= MCE_LOG_LEN) */
+ unsigned int record_len; /* length of struct mce */
+
unsigned flags;
- unsigned recordlen; /* length of struct mce */
- struct mce entry[MCE_LOG_LEN];
};
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
-#define MCE_LOG_SIGNATURE "MACHINECHECK"
+#define MCE_LOG_SIGNATURE "MACHINECHEC2"
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 684b42e..ad2eb89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -122,21 +122,30 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
+struct mce_log_cpu {
+ unsigned next;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
static struct mce_log mcelog = {
.signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
+ .header_len = offsetof(struct mce_log_cpu, entry),
+ .nr_record = MCE_LOG_LEN,
+ .record_len = sizeof(struct mce),
};
void mce_log(struct mce *mce)
{
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
unsigned next, entry;
mce->finished = 0;
wmb();
do {
- entry = rcu_dereference(mcelog.next);
+ entry = mcelog_cpu->next;
for (;;) {
/*
* When the buffer fills up discard new entries.
@@ -149,7 +158,7 @@ void mce_log(struct mce *mce)
return;
}
/* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
+ if (mcelog_cpu->entry[entry].finished) {
entry++;
continue;
}
@@ -157,12 +166,12 @@ void mce_log(struct mce *mce)
}
smp_rmb();
next = entry + 1;
- } while (cmpxchg(&mcelog.next, entry, next) != entry);
+ } while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
wmb();
- mcelog.entry[entry].finished = 1;
+ mcelog_cpu->entry[entry].finished = 1;
wmb();
mce->finished = 1;
set_bit(0, &mce_need_notify);
@@ -210,6 +219,26 @@ static void print_mce_tail(void)
"Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+ int i;
+ struct mce_log_cpu *mcelog_cpu;
+
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog_cpu->entry[i];
+ if (!m->finished)
+ continue;
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if ((m->status & mask) != res)
+ continue;
+ if (final && !memcmp(m, final, sizeof(struct mce)))
+ continue;
+ print_mce(m);
+ }
+}
+
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -232,7 +261,7 @@ static void wait_for_panic(void)
static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int cpu;
if (!fake_panic) {
/*
@@ -251,23 +280,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
}
print_mce_head();
/* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- print_mce(m);
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
- print_mce(m);
- }
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+ /* Print uncorrected but without the final one */
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+ /* Finally print the final mce */
if (final)
print_mce(final);
if (cpu_missing)
@@ -1234,6 +1252,22 @@ static int __cpuinit mce_cap_init(void)
return 0;
}
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+ int cpu;
+
+ if (mcelog.mcelog_cpus)
+ return;
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
+ mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+ GFP_KERNEL);
+ for_each_possible_cpu(cpu)
+ mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -1404,6 +1438,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
mce_disabled = 1;
return;
}
+ mce_log_init();
machine_check_vector = do_machine_check;
@@ -1452,13 +1487,16 @@ static int mce_release(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
+static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
{
+ struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
char __user *ubuf = inubuf;
unsigned prev, next;
int i, err;
- next = rcu_dereference(mcelog.next);
+ next = mcelog_cpu->next;
+ if (!next)
+ return 0;
err = 0;
prev = 0;
@@ -1466,9 +1504,9 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
for (i = prev; i < next; i++) {
int timeout = WRITER_TIMEOUT_NS;
- while (!mcelog.entry[i].finished) {
+ while (!mcelog_cpu->entry[i].finished) {
if (timeout-- <= 0) {
- memset(mcelog.entry + i, 0,
+ memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
printk(KERN_WARNING "mcelog: timeout "
"waiting for writer to finish!\n");
@@ -1477,27 +1515,33 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
ndelay(1);
}
smp_rmb();
- err |= copy_to_user(ubuf, mcelog.entry + i,
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
sizeof(struct mce));
ubuf += sizeof(struct mce);
timeout:
;
}
- memset(mcelog.entry + prev, 0,
+ memset(mcelog_cpu->entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
+ next = cmpxchg(&mcelog_cpu->next, prev, 0);
} while (next != prev);
- synchronize_sched();
-
return err ? -EFAULT : ubuf - inubuf;
}
static int mce_empty(void)
{
- return !rcu_dereference(mcelog.next);
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ if (mcelog_cpu->next)
+ return 0;
+ }
+ return 1;
}
static DEFINE_MUTEX(mce_read_mutex);
@@ -1506,7 +1550,7 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
loff_t *off)
{
char __user *ubuf = inubuf;
- int err;
+ int cpu, err = 0;
/* Only supports full reads right now */
if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
@@ -1514,12 +1558,25 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
mutex_lock(&mce_read_mutex);
- err = mce_read_buf(ubuf, usize);
- if (err > 0) {
- ubuf += err;
- err = 0;
+ while (!mce_empty()) {
+ for_each_possible_cpu(cpu) {
+ if (usize < MCE_LOG_LEN * sizeof(struct mce))
+ goto out;
+ err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ err = 0;
+ } else if (err < 0)
+ goto out;
+ }
+ if (need_resched()) {
+ mutex_unlock(&mce_read_mutex);
+ cond_resched();
+ mutex_lock(&mce_read_mutex);
+ }
}
-
+out:
mutex_unlock(&mce_read_mutex);
return err ? err : ubuf - inubuf;
--
1.6.4.3
next prev parent reply other threads:[~2009-10-05 6:39 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-18 10:20 [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Huang Ying
2009-09-18 11:09 ` Ingo Molnar
2009-09-21 5:37 ` Huang Ying
2009-09-22 13:39 ` [PATCH] x86: mce: New MCE logging design Ingo Molnar
2009-10-05 6:23 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
2009-10-05 6:33 ` [PATCH 01/10] x86, mce: remove tsc handling from mce_read Hidetoshi Seto
2009-10-05 6:34 ` [PATCH 02/10] x86, mce: mce_read can check args without mutex Hidetoshi Seto
2009-10-05 6:35 ` [PATCH 03/10] x86, mce: change writer timeout in mce_read Hidetoshi Seto
2009-10-05 6:36 ` [PATCH 04/10] x86, mce: use do-while in mce_log Hidetoshi Seto
2009-10-05 6:37 ` [PATCH 05/10] x86, mce: make mce_log buffer to per-CPU, prep Hidetoshi Seto
2009-10-05 6:38 ` Hidetoshi Seto [this message]
2009-10-05 7:06 ` [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU Andi Kleen
2009-10-05 7:50 ` Hidetoshi Seto
2009-10-09 1:45 ` Huang Ying
2009-10-09 5:34 ` Hidetoshi Seto
2009-10-05 6:40 ` [PATCH 07/10] x86, mce: remove for-loop in mce_log Hidetoshi Seto
2009-10-05 6:41 ` [PATCH 08/10] x86, mce: change barriers " Hidetoshi Seto
2009-10-05 6:42 ` [PATCH 09/10] x86, mce: make mce_log buffer to ring buffer Hidetoshi Seto
2009-10-05 6:44 ` [PATCH 10/10] x86, mce: move mce_log_init() into mce_cap_init() Hidetoshi Seto
2009-10-05 7:07 ` [BUGFIX -v7] x86, MCE: Fix bugs and issues of MCE log ring buffer Hidetoshi Seto
2009-10-05 8:51 ` Frédéric Weisbecker
2009-10-05 15:16 ` Andi Kleen
2009-10-06 5:46 ` Hidetoshi Seto
-- strict thread matches above, loose matches on Subject: below --
2009-10-09 1:53 [PATCH 06/10] x86, mce: make mce_log buffer to per-CPU H. Peter Anvin
2009-10-09 1:53 ` H. Peter Anvin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4AC99483.7020100@jp.fujitsu.com \
--to=seto.hidetoshi@jp.fujitsu.com \
--cc=ak@linux.intel.com \
--cc=hpa@zytor.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=ying.huang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.