From: Tejun Heo <tj@kernel.org>
To: Sachin Sant <sachinp@in.ibm.com>
Cc: Linux/PPC Development <linuxppc-dev@ozlabs.org>,
David Miller <davem@davemloft.net>
Subject: Re: 2.6.31-git5 kernel boot hangs on powerpc
Date: Wed, 23 Sep 2009 23:17:06 +0900 [thread overview]
Message-ID: <4ABA2DE2.6000601@kernel.org> (raw)
In-Reply-To: <4AB9DD8F.1040305@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 605 bytes --]
Tejun Heo wrote:
>> One workaround i have found for this problem is to disable IPv6.
>> With IPv6 disabled the machine boots OK. Till a reliable solution
>> is available for this issue, i will keep IPv6 disabled in my configs.
>
> I think it's most likely caused by some code accessing an invalid
> percpu address. I'm currently writing up an access validator. Should be
> done in several hours. So, ipv6 it is. I couldn't reproduce your
> problem here. I'll give ipv6 a shot.
Can you please apply the attached patch and see whether anything
interesting shows up in the kernel log?
Thanks.
--
tejun
[-- Attachment #2: pcpu-verify --]
[-- Type: text/plain, Size: 13367 bytes --]
---
arch/ia64/include/asm/sn/arch.h | 4 -
arch/powerpc/mm/stab.c | 4 -
arch/x86/kernel/cpu/cpu_debug.c | 12 ++--
arch/x86/kernel/cpu/perf_event.c | 4 -
include/asm-generic/percpu.h | 23 +++++---
include/linux/percpu-defs.h | 6 ++
include/linux/percpu.h | 7 ++
kernel/softirq.c | 8 +-
lib/Kconfig.debug | 17 +++++
mm/percpu.c | 112 +++++++++++++++++++++++++++++++++++++++
10 files changed, 173 insertions(+), 24 deletions(-)
Index: work/arch/ia64/include/asm/sn/arch.h
===================================================================
--- work.orig/arch/ia64/include/asm/sn/arch.h
+++ work/arch/ia64/include/asm/sn/arch.h
@@ -71,8 +71,8 @@ DECLARE_PER_CPU(struct sn_hub_info_s, __
* Compact node ID to nasid mappings kept in the per-cpu data areas of each
* cpu.
*/
-DECLARE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]);
-#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid[0]))
+DECLARE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
+#define sn_cnodeid_to_nasid (&__get_cpu_var(__sn_cnodeid_to_nasid)[0])
extern u8 sn_partition_id;
Index: work/arch/powerpc/mm/stab.c
===================================================================
--- work.orig/arch/powerpc/mm/stab.c
+++ work/arch/powerpc/mm/stab.c
@@ -138,7 +138,7 @@ static int __ste_allocate(unsigned long
if (!is_kernel_addr(ea)) {
offset = __get_cpu_var(stab_cache_ptr);
if (offset < NR_STAB_CACHE_ENTRIES)
- __get_cpu_var(stab_cache[offset++]) = stab_entry;
+ __get_cpu_var(stab_cache)[offset++] = stab_entry;
else
offset = NR_STAB_CACHE_ENTRIES+1;
__get_cpu_var(stab_cache_ptr) = offset;
@@ -185,7 +185,7 @@ void switch_stab(struct task_struct *tsk
int i;
for (i = 0; i < offset; i++) {
- ste = stab + __get_cpu_var(stab_cache[i]);
+ ste = stab + __get_cpu_var(stab_cache)[i];
ste->esid_data = 0; /* invalidate entry */
}
} else {
Index: work/arch/x86/kernel/cpu/cpu_debug.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/cpu_debug.c
+++ work/arch/x86/kernel/cpu/cpu_debug.c
@@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu,
/* Already intialized */
if (file == CPU_INDEX_BIT)
- if (per_cpu(cpu_arr[type].init, cpu))
+ if (per_cpu(cpu_arr, cpu)[type].init)
return 0;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -543,7 +543,7 @@ static int cpu_create_file(unsigned cpu,
priv->reg = reg;
priv->file = file;
mutex_lock(&cpu_debug_lock);
- per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(priv_arr, cpu)[type] = priv;
per_cpu(cpu_priv_count, cpu)++;
mutex_unlock(&cpu_debug_lock);
@@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu,
dentry, (void *)priv, &cpu_fops);
else {
debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpu_arr[type].dentry, cpu),
+ per_cpu(cpu_arr, cpu)[type].dentry,
(void *)priv, &cpu_fops);
mutex_lock(&cpu_debug_lock);
- per_cpu(cpu_arr[type].init, cpu) = 1;
+ per_cpu(cpu_arr, cpu)[type].init = 1;
mutex_unlock(&cpu_debug_lock);
}
@@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu,
if (!is_typeflag_valid(cpu, cpu_base[type].flag))
continue;
cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+ per_cpu(cpu_arr, cpu)[type].dentry = cpu_dentry;
if (type < CPU_TSS_BIT)
err = cpu_init_msr(cpu, type, cpu_dentry);
@@ -677,7 +677,7 @@ static void __exit cpu_debug_exit(void)
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
- kfree(per_cpu(priv_arr[i], cpu));
+ kfree(per_cpu(priv_arr, cpu)[i]);
}
module_init(cpu_debug_init);
Index: work/arch/x86/kernel/cpu/perf_event.c
===================================================================
--- work.orig/arch/x86/kernel/cpu/perf_event.c
+++ work/arch/x86/kernel/cpu/perf_event.c
@@ -1253,7 +1253,7 @@ x86_perf_event_set_period(struct perf_ev
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left, smp_processor_id())[idx] = left;
/*
* The hw event starts counting from this event offset,
@@ -1470,7 +1470,7 @@ void perf_event_print_debug(void)
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(pmc_prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left, cpu)[idx];
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
Index: work/include/asm-generic/percpu.h
===================================================================
--- work.orig/include/asm-generic/percpu.h
+++ work/include/asm-generic/percpu.h
@@ -49,13 +49,22 @@ extern unsigned long __per_cpu_offset[NR
* established ways to produce a usable pointer from the percpu variable
* offset.
*/
-#define per_cpu(var, cpu) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
-#define __get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
-#define __raw_get_cpu_var(var) \
- (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
-
+#define per_cpu(var, cpu) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset(__pcpu_cpu__)); \
+}))
+#define __get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, my_cpu_offset); \
+}))
+#define __raw_get_cpu_var(var) (*({ \
+ typeof(&per_cpu_var(var)) __pcpu_ptr__ = &per_cpu_var(var); \
+ pcpu_verify_access(__pcpu_ptr__, NR_CPUS); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, __my_cpu_offset); \
+}))
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void setup_per_cpu_areas(void);
Index: work/include/linux/percpu-defs.h
===================================================================
--- work.orig/include/linux/percpu-defs.h
+++ work/include/linux/percpu-defs.h
@@ -7,6 +7,12 @@
*/
#define per_cpu_var(var) per_cpu__##var
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+extern void pcpu_verify_access(void *ptr, unsigned int cpu);
+#else
+#define pcpu_verify_access(ptr, cpu) do {} while (0)
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
Index: work/include/linux/percpu.h
===================================================================
--- work.orig/include/linux/percpu.h
+++ work/include/linux/percpu.h
@@ -127,7 +127,12 @@ extern int __init pcpu_page_first_chunk(
* dynamically allocated. Non-atomic access to the current CPU's
* version should probably be combined with get_cpu()/put_cpu().
*/
-#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+#define per_cpu_ptr(ptr, cpu) ({ \
+ typeof(ptr) __pcpu_ptr__ = (ptr); \
+ unsigned int __pcpu_cpu__ = (cpu); \
+ pcpu_verify_access(__pcpu_ptr__, __pcpu_cpu__); \
+ SHIFT_PERCPU_PTR(__pcpu_ptr__, per_cpu_offset((__pcpu_cpu__))); \
+})
extern void *__alloc_reserved_percpu(size_t size, size_t align);
Index: work/kernel/softirq.c
===================================================================
--- work.orig/kernel/softirq.c
+++ work/kernel/softirq.c
@@ -560,7 +560,7 @@ EXPORT_PER_CPU_SYMBOL(softirq_work_list)
static void __local_trigger(struct call_single_data *cp, int softirq)
{
- struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+ struct list_head *head = &__get_cpu_var(softirq_work_list)[softirq];
list_add_tail(&cp->list, head);
@@ -656,13 +656,13 @@ static int __cpuinit remote_softirq_cpu_
local_irq_disable();
for (i = 0; i < NR_SOFTIRQS; i++) {
- struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+ struct list_head *head = &per_cpu(softirq_work_list, cpu)[i];
struct list_head *local_head;
if (list_empty(head))
continue;
- local_head = &__get_cpu_var(softirq_work_list[i]);
+ local_head = &__get_cpu_var(softirq_work_list)[i];
list_splice_init(head, local_head);
raise_softirq_irqoff(i);
}
@@ -688,7 +688,7 @@ void __init softirq_init(void)
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
for (i = 0; i < NR_SOFTIRQS; i++)
- INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
+ INIT_LIST_HEAD(&per_cpu(softirq_work_list, cpu)[i]);
}
register_hotcpu_notifier(&remote_softirq_cpu_notifier);
Index: work/lib/Kconfig.debug
===================================================================
--- work.orig/lib/Kconfig.debug
+++ work/lib/Kconfig.debug
@@ -805,6 +805,21 @@ config DEBUG_BLOCK_EXT_DEVT
Say N if you are unsure.
+config DEBUG_VERIFY_PER_CPU
+ bool "Verify per-cpu accesses"
+ depends on DEBUG_KERNEL
+ depends on SMP
+ help
+
+ This option makes percpu access macros to verify the
+ specified processor and percpu variable offset on each
+ access. This helps catching percpu variable access bugs
+ which may cause corruption on unrelated memory region making
+ it very difficult to catch at the cost of making percpu
+ accesses considerably slow.
+
+ Say N if you are unsure.
+
config DEBUG_FORCE_WEAK_PER_CPU
bool "Force weak per-cpu definitions"
depends on DEBUG_KERNEL
@@ -820,6 +835,8 @@ config DEBUG_FORCE_WEAK_PER_CPU
To ensure that generic code follows the above rules, this
option forces all percpu variables to be defined as weak.
+ Say N if you are unsure.
+
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -1241,6 +1241,118 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
+#ifdef CONFIG_DEBUG_VERIFY_PER_CPU
+static struct pcpu_chunk *pcpu_verify_match_chunk(void *addr)
+{
+ void *first_start = pcpu_first_chunk->base_addr;
+ struct pcpu_chunk *chunk;
+ int slot;
+
+ /* is it in the first chunk? */
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
+ /* is it in the reserved area? */
+ if (addr < first_start + pcpu_reserved_chunk_limit)
+ return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
+ }
+
+ /* walk each dynamic chunk */
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_slot[slot], list)
+ if (addr >= chunk->base_addr &&
+ addr < chunk->base_addr + pcpu_unit_size)
+ return chunk;
+ return NULL;
+}
+
+void pcpu_verify_access(void *ptr, unsigned int cpu)
+{
+ static bool verifying[NR_CPUS];
+ static int warn_limit = 10;
+ char cbuf[80], obuf[160];
+ void *addr = __pcpu_ptr_to_addr(ptr);
+ bool is_static = false;
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int i, addr_off, off, len, end;
+
+ /* not been initialized yet or whined enough already */
+ if (unlikely(!pcpu_first_chunk || !warn_limit))
+ return;
+
+ /* don't re-enter */
+ preempt_disable();
+ if (verifying[raw_smp_processor_id()]) {
+ preempt_enable_no_resched();
+ return;
+ }
+ verifying[raw_smp_processor_id()] = true;
+
+ cbuf[0] = '\0';
+ obuf[0] = '\0';
+
+ if (unlikely(cpu < NR_CPUS && !cpu_possible(cpu)) && warn_limit)
+ snprintf(cbuf, sizeof(cbuf), "invalid cpu %u", cpu);
+
+ /*
+ * We can enter this function from weird places and have no
+ * way to reliably avoid deadlock. If lock is available, grab
+ * it and verify. If not, just let it go through.
+ */
+ if (!spin_trylock_irqsave(&pcpu_lock, flags))
+ goto out;
+
+ chunk = pcpu_verify_match_chunk(addr);
+ if (!chunk) {
+ snprintf(obuf, sizeof(obuf),
+ "no matching chunk ptr=%p addr=%p", ptr, addr);
+ goto out_unlock;
+ }
+
+ addr_off = addr - chunk->base_addr;
+ if (chunk->base_addr == pcpu_first_chunk->base_addr)
+ if (chunk == pcpu_reserved_chunk || addr_off < -chunk->map[0])
+ is_static = true;
+
+ for (i = 0, off = 0; i < chunk->map_used; i++, off = end) {
+ len = chunk->map[i];
+ end = off + abs(len);
+
+ if (addr_off == off) {
+ if (unlikely(len > 0))
+ snprintf(obuf, sizeof(obuf),
+ "free area accessed ptr=%p addr=%p "
+ "off=%d len=%d", ptr, addr, off, len);
+ break;
+ }
+ if (!is_static && off < addr_off && addr_off < end) {
+ snprintf(obuf, sizeof(obuf),
+ "%sarea accessed in the middle ptr=%p "
+ "addr=%p:%d off=%d len=%d",
+ len > 0 ? "free " : "",
+ ptr, addr, addr_off, off, abs(len));
+ break;
+ }
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+out:
+ if (unlikely(cbuf[0] || obuf[0])) {
+ printk(KERN_ERR "PERCPU: %s%s%s\n",
+ cbuf, cbuf[0] ? ", " : "", obuf);
+ dump_stack();
+ if (!--warn_limit)
+ printk(KERN_WARNING "PERCPU: access warning limit "
+ "reached, turning off access validation\n");
+ }
+
+ verifying[raw_smp_processor_id()] = false;
+ preempt_enable_no_resched();
+}
+EXPORT_SYMBOL_GPL(pcpu_verify_access);
+#endif /* CONFIG_DEBUG_VERIFY_PER_CPU */
+
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
next prev parent reply other threads:[~2009-09-23 14:17 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-16 12:25 2.6.31-git5 kernel boot hangs on powerpc Sachin Sant
2009-09-16 12:25 ` Sachin Sant
2009-09-17 10:51 ` Sachin Sant
2009-09-17 11:13 ` Benjamin Herrenschmidt
2009-09-17 15:53 ` Tejun Heo
2009-09-17 16:41 ` Sachin Sant
2009-09-19 8:54 ` Sachin Sant
2009-09-23 8:23 ` Sachin Sant
2009-09-23 8:34 ` Tejun Heo
2009-09-23 14:17 ` Tejun Heo [this message]
2009-09-24 7:58 ` Sachin Sant
2009-09-24 12:59 ` Tejun Heo
2009-09-24 13:23 ` Sachin Sant
2009-09-24 21:05 ` Benjamin Herrenschmidt
2009-09-25 3:22 ` Tejun Heo
2009-09-25 3:40 ` Benjamin Herrenschmidt
2009-09-25 7:15 ` Sachin Sant
2009-09-25 7:39 ` Tejun Heo
2009-09-25 7:43 ` Tejun Heo
2009-09-25 8:03 ` Sachin Sant
2009-09-25 9:01 ` Tejun Heo
2009-09-25 9:48 ` Benjamin Herrenschmidt
2009-10-05 6:54 ` Sachin Sant
2009-09-25 8:31 ` Benjamin Herrenschmidt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4ABA2DE2.6000601@kernel.org \
--to=tj@kernel.org \
--cc=davem@davemloft.net \
--cc=linuxppc-dev@ozlabs.org \
--cc=sachinp@in.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.