* [PATCH] powerpc: return the_cpu_spec from identify_cpu
From: Scott Wood @ 2011-07-25 21:04 UTC (permalink / raw)
To: benh; +Cc: linuxppc-dev
Commit af9eef3c7b1ed004c378c89b87642f4937337d50 caused cpu_setup to see
the_cpu_spec, rather than the source struct. However, on 32-bit, the
return value of identify_cpu was being used for feature fixups, and
identify_cpu was returning the source struct. So if cpu_setup patches
the feature bits, the update won't affect the fixups.
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
arch/powerpc/kernel/cputable.c | 11 ++++++-----
1 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 9fb9332..fa44ff5 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -2051,7 +2051,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
static struct cpu_spec the_cpu_spec;
-static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
+static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
+ struct cpu_spec *s)
{
struct cpu_spec *t = &the_cpu_spec;
struct cpu_spec old;
@@ -2114,6 +2115,8 @@ static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
t->cpu_setup(offset, t);
}
#endif /* CONFIG_PPC64 || CONFIG_BOOKE */
+
+ return t;
}
struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
@@ -2124,10 +2127,8 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
s = PTRRELOC(s);
for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) {
- if ((pvr & s->pvr_mask) == s->pvr_value) {
- setup_cpu_spec(offset, s);
- return s;
- }
+ if ((pvr & s->pvr_mask) == s->pvr_value)
+ return setup_cpu_spec(offset, s);
}
BUG();
--
1.7.4.1
^ permalink raw reply related
* [PATCH] powerpc: mtspr/mtmsr should take an unsigned long
From: Scott Wood @ 2011-07-25 21:02 UTC (permalink / raw)
To: benh; +Cc: linuxppc-dev
Add a cast in case the caller passes in a different type, as it would
if mtspr/mtmsr were functions.
Previously, if a 64-bit type was passed in on 32-bit, GCC would bind the
constraint to a pair of registers, and would substitute the first register
in the pair in the asm code. This corresponds to the upper half of the
64-bit register, which is generally not the desired behavior.
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
If you're wondering why you'd pass a 64-bit value to one of these macros
on 32-bit, it came up when trying to load an SPR from kvm_vcpu_arch_shared.
arch/powerpc/include/asm/reg.h | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 213d1d7..1b45133 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1007,13 +1007,16 @@
#define mtmsrd(v) __mtmsrd((v), 0)
#define mtmsr(v) mtmsrd(v)
#else
-#define mtmsr(v) asm volatile("mtmsr %0" : : "r" (v) : "memory")
+#define mtmsr(v) asm volatile("mtmsr %0" : \
+ : "r" ((unsigned long)(v)) \
+ : "memory")
#endif
#define mfspr(rn) ({unsigned long rval; \
asm volatile("mfspr %0," __stringify(rn) \
: "=r" (rval)); rval;})
-#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : : "r" (v)\
+#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : \
+ : "r" ((unsigned long)(v)) \
: "memory")
#ifdef __powerpc64__
--
1.7.4.1
^ permalink raw reply related
* [PATCH] powerpc/nvram: Add compression to fit more oops output into NVRAM
From: Jim Keniston @ 2011-07-25 17:54 UTC (permalink / raw)
To: benh, linuxppc-dev
Capture more than twice as much text from the printk buffer, and
compress it to fit it in the lnx,oops-log NVRAM partition. You
can view the compressed text using the new (as of July 20) --unzip
option of the nvram command in the powerpc-utils package.
Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
---
arch/powerpc/include/asm/rtas.h | 6 +
arch/powerpc/platforms/pseries/nvram.c | 171 +++++++++++++++++++++++++++++++-
2 files changed, 168 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 58625d1..41f69ae 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -249,10 +249,12 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
#define ERR_FLAG_ALREADY_LOGGED 0x0
#define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */
#define ERR_TYPE_RTAS_LOG 0x2 /* from rtas event-scan */
-#define ERR_TYPE_KERNEL_PANIC 0x4 /* from panic() */
+#define ERR_TYPE_KERNEL_PANIC 0x4 /* from die()/panic() */
+#define ERR_TYPE_KERNEL_PANIC_GZ 0x8 /* ditto, compressed */
/* All the types and not flags */
-#define ERR_TYPE_MASK (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC)
+#define ERR_TYPE_MASK \
+ (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC | ERR_TYPE_KERNEL_PANIC_GZ)
#define RTAS_DEBUG KERN_DEBUG "RTAS: "
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 00cc3a0..a76b228 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -18,6 +18,8 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/kmsg_dump.h>
+#include <linux/ctype.h>
+#include <linux/zlib.h>
#include <asm/uaccess.h>
#include <asm/nvram.h>
#include <asm/rtas.h>
@@ -78,8 +80,41 @@ static struct kmsg_dumper nvram_kmsg_dumper = {
#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
static unsigned long last_unread_rtas_event; /* timestamp */
-/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */
-static char *oops_buf;
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a prefix.
+ * The prefix is just a u16 holding the length of the compressed* text.
+ * (*Or uncompressed, if compression fails.) oops_buf[] gets written
+ * to NVRAM.
+ *
+ * oops_len points to the prefix. oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * | +- oops_data
+ * v v
+ * +------------+-----------------------------------------------+
+ * | length | text |
+ * | (2 bytes) | (oops_data_sz bytes) |
+ * +------------+-----------------------------------------------+
+ * ^
+ * +- oops_len
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static u16 *oops_len;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
{
@@ -387,11 +422,44 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
sizeof(rtas_log_partition));
}
oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+ if (!oops_buf) {
+ pr_err("nvram: No memory for %s partition\n",
+ oops_log_partition.name);
+ return;
+ }
+ oops_len = (u16*) oops_buf;
+ oops_data = oops_buf + sizeof(u16);
+ oops_data_sz = oops_log_partition.size - sizeof(u16);
+
+ /*
+ * Figure compression (preceded by elimination of each line's <n>
+ * severity prefix) will reduce the oops/panic report to at most
+ * 45% of its original size.
+ */
+ big_oops_buf_sz = (oops_data_sz * 100) / 45;
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (big_oops_buf) {
+ stream.workspace = kmalloc(zlib_deflate_workspacesize(
+ WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+ if (!stream.workspace) {
+ pr_err("nvram: No memory for compression workspace; "
+ "skipping compression of %s partition data\n",
+ oops_log_partition.name);
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ }
+ } else {
+ pr_err("No memory for uncompressed %s data; "
+ "skipping compression\n", oops_log_partition.name);
+ stream.workspace = NULL;
+ }
+
rc = kmsg_dump_register(&nvram_kmsg_dumper);
if (rc != 0) {
pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
kfree(oops_buf);
- return;
+ kfree(big_oops_buf);
+ kfree(stream.workspace);
}
}
@@ -473,7 +541,83 @@ static int clobbering_unread_rtas_event(void)
NVRAM_RTAS_READ_TIMEOUT);
}
-/* our kmsg_dump callback */
+/* Squeeze out each line's <n> severity prefix. */
+static size_t elide_severities(char *buf, size_t len)
+{
+ char *in, *out, *buf_end = buf + len;
+ /* Assume a <n> at the very beginning marks the start of a line. */
+ int newline = 1;
+
+ in = out = buf;
+ while (in < buf_end) {
+ if (newline && in+3 <= buf_end &&
+ *in == '<' && isdigit(in[1]) && in[2] == '>') {
+ in += 3;
+ newline = 0;
+ } else {
+ newline = (*in == '\n');
+ *out++ = *in++;
+ }
+ }
+ return out - buf;
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+ size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+ MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_deflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ if (stream.total_out >= stream.total_in)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+ int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+ oops_data_sz);
+ if (zipped_len < 0) {
+ pr_err("nvram: compression failed; returned %d\n", zipped_len);
+ pr_err("nvram: logging uncompressed oops/panic report\n");
+ return -1;
+ }
+ *oops_len = (u16) zipped_len;
+ return 0;
+}
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer. We want to capture as much
+ * of the printk buffer as possible. First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition. If that's too much, go back and capture uncompressed text.
+ */
static void oops_to_nvram(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason,
const char *old_msgs, unsigned long old_len,
@@ -482,6 +626,8 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
static unsigned int oops_count = 0;
static bool panicking = false;
size_t text_len;
+ unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+ int rc = -1;
switch (reason) {
case KMSG_DUMP_RESTART:
@@ -509,8 +655,19 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
if (clobbering_unread_rtas_event())
return;
- text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len,
- oops_buf, oops_log_partition.size);
+ if (big_oops_buf) {
+ text_len = capture_last_msgs(old_msgs, old_len,
+ new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
+ text_len = elide_severities(big_oops_buf, text_len);
+ rc = zip_oops(text_len);
+ }
+ if (rc != 0) {
+ text_len = capture_last_msgs(old_msgs, old_len,
+ new_msgs, new_len, oops_data, oops_data_sz);
+ err_type = ERR_TYPE_KERNEL_PANIC;
+ *oops_len = (u16) text_len;
+ }
+
(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
- (int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count);
+ (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
}
^ permalink raw reply related
* Re: [PATCH 13/14] 85xx: consolidate of_platform_bus_probe calls
From: Scott Wood @ 2011-07-25 15:40 UTC (permalink / raw)
To: Dmitry Eremin-Solenikov; +Cc: Paul Mackerras, Linux PPC Development
In-Reply-To: <CALT56yOZkLW=AmkyX77g5agxfBb0h7Fz8Q_enth-CJycJ3SkFA@mail.gmail.com>
On Sat, 23 Jul 2011 01:45:53 +0400
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> wrote:
> I see your point. I just wasn't thinking too much about out-of-tree trees.
> My thought was that if someone updates the kernel, he can also update the dtb.
Sometimes there are firmware dependencies that make that difficult. And
even if it's just user laziness/forgetfulness, that still translates to
extra support requests.
> Could you please update the lbc.txt suggesting the compatibility
> with simple-bus for lbc? Or do you think that it would be wrong?
>
> I think we should define compatibility list as "fsl,mpcXXXX-localbus",
> "fsl,pqXXXXX-localbus", "simple-bus", noting that by default new
> platforms/boards should only use "simple-bus" internally. Does this
> look reasonable for you? I can then try to provide a patch.
I'm OK with saying that localbus nodes should have simple-bus in new trees,
and defining canonical compatible values (chips with eLBC should be
"fsl,XXXX-elbc", "fsl,elbc", "simple-bus"). I'm not sure what you mean by
"should only use simple-bus internally", especially in the context of the
binding.
> What do you suggest/prefer? To add .name="localbus" to generic code
> or to have board-specific hooks (like one for mpc834xemitx)?
Just add localbus to the generic table.
-Scott
^ permalink raw reply
* Re: perf PPC: kernel panic with callchains and context switch events
From: David Ahern @ 2011-07-25 15:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Anton Blanchard
Cc: LKML, linux-perf-users, Paul Mackerras, linuxppc-dev
In-Reply-To: <1311558949.25044.614.camel@pasglop>
Hi Ben:
On 07/24/2011 07:55 PM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-07-24 at 11:18 -0600, David Ahern wrote:
>> On 07/20/2011 03:57 PM, David Ahern wrote:
>>> I am hoping someone familiar with PPC can help understand a panic that
>>> is generated when capturing callchains with context switch events.
>>>
>>> Call trace is below. The short of it is that walking the callchain
>>> generates a page fault. To handle the page fault the mmap_sem is needed,
>>> but it is currently held by setup_arg_pages. setup_arg_pages calls
>>> shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
>>> move_page_tables which has a cond_resched at the top of its for loop. If
>>> the cond_resched() is removed from move_page_tables everything works
>>> beautifully - no panics.
>>>
>>> So, the question: is it normal for walking the stack to trigger a page
>>> fault on PPC? The panic is not seen on x86 based systems.
>>
>> Can anyone confirm whether page faults while walking the stack are
>> normal for PPC? We really want to use the context switch event with
>> callchains and need to understand whether this behavior is normal. Of
>> course if it is normal, a way to address the problem without a panic
>> will be needed.
>
> Now that leads to interesting discoveries :-) Becky, can you read all
> the way and let me know what you think ?
>
> So, trying to walk the user stack directly will potentially cause page
> faults if it's done by direct access. So if you're going to do it in a
> spot where you can't afford it, you need to pagefault_disable() I
> suppose. I think the problem with our existing code is that it's missing
> those around __get_user_inatomic().
>
> In fact, arguably, we don't want the hash code modifying the hash
> either (or even hashing things in). Our 64-bit code handles it today in
> perf_callchain.c in a way that involves pretty much duplicating the
> functionality of __get_user_pages_fast() as used by x86 (see below), but
> as a fallback from a direct access which misses the pagefault_disable()
> as well.
>
> I think it comes from an old assumption that this would always be called
> from an nmi, and the explicit tracepoints broke that assumption.
>
> In fact we probably want to bump the NMI count, not just the IRQ count
> as pagefault_disable() does, to make sure we prevent hashing.
>
> x86 does things differently, using __get_user_pages_fast() (a variant of
> get_user_pages_fast() that doesn't fall back to normal get_user_pages()).
>
> Now, we could do the same (use __gup_fast too), but I can see a
> potential issue with ppc 32-bit platforms that have 64-bit PTEs, since
> we could end up GUP'ing in the middle of the two accesses.
>
> Becky: I think gup_fast is generally broken on 32-bit with 64-bit PTE
> because of that, the problem isn't specific to perf backtraces, I'll
> propose a solution further down.
>
> Now, on x86, there is a similar problem with PAE, which is handled by
>
> - having gup disable IRQs
> - rely on the fact that to change from a valid value to another valid
> value, the PTE will first get invalidated, which requires an IPI
> and thus will be blocked by our interrupts being off
>
> We do the first part, but the second part will break if we use HW TLB
> invalidation broadcast (yet another reason why those are bad, I think I
> will write a blog entry about it one of these days).
>
> I think we can work around this while keeping our broadcast TLB
> invalidations by having the invalidation code also increment a global
> generation count (using the existing lock used by the invalidation code,
> all 32-bit platforms have such a lock).
>
> From there, gup_fast can be changed to, with proper ordering, check the
> generation count around the loading of the PTE and loop if it has
> changed, kind-of a seqlock.
>
> We also need the NMI count bump if we are going to try to keep the
> attempt at doing a direct access first for perfs.
>
> Becky, do you feel like giving that a shot or should I find another
> victim ? (Or even do it myself ... ) :-)
Did you have something in mind besides the patch Anton sent? We'll give
that one a try and see how it works. (Thanks, Anton!)
David
>
> Cheers,
> Ben.
>
>> Thanks,
>> David
>>
>>>
>>> [<b0180e00>]rb_erase+0x1b4/0x3e8
>>> [<b00430f4>]__dequeue_entity+0x50/0xe8
>>> [<b0043304>]set_next_entity+0x178/0x1bc
>>> [<b0043440>]pick_next_task_fair+0xb0/0x118
>>> [<b02ada80>]schedule+0x500/0x614
>>> [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
>>> [<b02afca0>]rwsem_down_read_failed+0x34/0x54
>>> [<b02aed4c>]down_read+0x3c/0x54
>>> [<b0023b58>]do_page_fault+0x114/0x5e8
>>> [<b001e350>]handle_page_fault+0xc/0x80
>>> [<b0022dec>]perf_callchain+0x224/0x31c
>>> [<b009ba70>]perf_prepare_sample+0x240/0x2fc
>>> [<b009d760>]__perf_event_overflow+0x280/0x398
>>> [<b009d914>]perf_swevent_overflow+0x9c/0x10c
>>> [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
>>> [<b009dc38>]do_perf_sw_event+0x84/0xe4
>>> [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
>>> [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
>>> [<b02ad840>]schedule+0x2c0/0x614
>>> [<b0047dc0>]__cond_resched+0x34/0x90
>>> [<b02adcc8>]_cond_resched+0x4c/0x68
>>> [<b00bccf8>]move_page_tables+0xb0/0x418
>>> [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
>>> [<b0110914>]load_elf_binary+0x394/0x1208
>>> [<b00d6e28>]search_binary_handler+0xe0/0x2c4
>>> [<b00d834c>]do_execve+0x1bc/0x268
>>> [<b0015394>]sys_execve+0x84/0xc8
>>> [<b001df10>]ret_from_syscall+0x0/0x3c
>>>
>>> Thanks,
>>> David
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
>
^ permalink raw reply
* [GIT PULL] Please pull powerpc.git next branch
From: Kumar Gala @ 2011-07-25 14:08 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
[ a few minor fixes ]
The following changes since commit 50d2a4223bb875d1e3a7ee97d40dd03bf31ce1b7:
powerpc: Copy back TIF flags on return from softirq stack (2011-07-22 13:38:58 +1000)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/galak/powerpc.git next
Fabio Baltieri (1):
powerpc/85xx: fix mpic configuration in CAMP mode
Timur Tabi (1):
drivers/virt: add missing linux/interrupt.h to fsl_hypervisor.c
arch/powerpc/platforms/85xx/mpc85xx_ds.c | 3 ++-
arch/powerpc/platforms/85xx/mpc85xx_rdb.c | 5 +++--
drivers/virt/fsl_hypervisor.c | 1 +
3 files changed, 6 insertions(+), 3 deletions(-)
^ permalink raw reply
* Re: Linux 3.0 boot failure on the Powerbook G4
From: Michael Büsch @ 2011-07-25 13:03 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1311549818.25044.587.camel@pasglop>
On Mon, 25 Jul 2011 09:23:38 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> Hrm.. the faulting address is outside of the zImage. Odd.
>
> Can you try loading a plain vmlinux instead ? (feel free to strip it).
The plain unstripped vmlinux boots fine:
mb@maggie:~$ uname -a
Linux maggie 3.0.0 #3 PREEMPT Sun Jul 24 11:51:30 CEST 2011 ppc GNU/Linux
Is there something going wrong in the uncompress trampoline?
--
Greetings, Michael.
^ permalink raw reply
* Re: [PATCH 0/5] ppc64 scheduler fixes
From: Peter Zijlstra @ 2011-07-25 12:41 UTC (permalink / raw)
To: Anton Blanchard; +Cc: mingo, linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>
On Mon, 2011-07-25 at 12:33 +1000, Anton Blanchard wrote:
> Here are a set of ppc64 scheduler fixes that help with some
> multi node performance issues.
They look fine to me. I'll probably ping you when I'll rip out all that
SD_NODES_PER_DOMAIN crap for good, but until then I'm fine with you
fiddling it for ppc64.
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
^ permalink raw reply
* mtu issue with gianfar driver
From: Kumar Reddy Suresh-B22303 @ 2011-07-25 11:47 UTC (permalink / raw)
To: linuxppc-dev@lists.ozlabs.org
[-- Attachment #1: Type: text/plain, Size: 1478 bytes --]
Hi All,
A problem was observed in gianfar driver when the interface MTU was modified to a small value.
FYI Kernel Version : 2.6.32 on PPC.
For example, if we change the interface MTU to, say, 100, ping traffic with a size greater than 450 fails.
It was observed that packets ( ping requests) going out of that interface are getting properly fragmented, but the return packets ( ping replies ) are getting dropped by the interface.
To fix this issue the function gfar_change_mtu() in gianfar.c was modified as below:
rx_buffer_size is restored to DEFAULT_RX_BUFFER_SIZE as indicated in RED in the code snippet below
------------------------- CODE SNIPPET BEGIN ----------------------------------
tempsize =
(frame_size & ~(INCREMENTAL_BUFFER_SIZE - 1)) +
INCREMENTAL_BUFFER_SIZE;
if (tempsize < DEFAULT_RX_BUFFER_SIZE )
tempsize = DEFAULT_RX_BUFFER_SIZE;
/* Only stop and start the controller if it isn't already
* stopped, and we changed something */
if ((oldsize != tempsize) && (dev->flags & IFF_UP))
stop_gfar(dev);
priv->rx_buffer_size = tempsize;
dev->mtu = new_mtu;
------------------------- CODE SNIPPET END----------------------------------
Is this fix OK? What is the impact of this change on overall behavior?
Best Regards,
- Suresh
[-- Attachment #2: Type: text/html, Size: 8406 bytes --]
^ permalink raw reply
* [PATCH 3/3] powerpc/pseries: Simplify vpa deregistration functions
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20110725114631.778346293@samba.org>
The VPA, SLB shadow and DTL deregistration functions do not need an
address, so simplify things and remove it.
Also clean up pseries_kexec_cpu_down a bit by storing the cpu IDs
in local variables.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-powerpc/arch/powerpc/platforms/pseries/hotplug-cpu.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/hotplug-cpu.c 2011-07-25 21:06:49.390411273 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/hotplug-cpu.c 2011-07-25 21:06:57.380555950 +1000
@@ -135,7 +135,7 @@ static void pseries_mach_cpu_die(void)
get_lppaca()->idle = 0;
if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
- unregister_slb_shadow(hwcpu, __pa(get_slb_shadow()));
+ unregister_slb_shadow(hwcpu);
/*
* Call to start_secondary_resume() will not return.
@@ -150,7 +150,7 @@ static void pseries_mach_cpu_die(void)
WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
- unregister_slb_shadow(hwcpu, __pa(get_slb_shadow()));
+ unregister_slb_shadow(hwcpu);
rtas_stop_self();
/* Should never get here... */
Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:06:56.260535670 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:09:20.033141478 +1000
@@ -25,34 +25,30 @@ static void pseries_kexec_cpu_down(int c
{
/* Don't risk a hypervisor call if we're crashing */
if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
- unsigned long addr;
int ret;
+ int cpu = smp_processor_id();
+ int hwcpu = hard_smp_processor_id();
if (get_lppaca()->dtl_enable_mask) {
- ret = unregister_dtl(hard_smp_processor_id());
+ ret = unregister_dtl(hwcpu);
if (ret) {
pr_err("WARNING: DTL deregistration for cpu "
"%d (hw %d) failed with %d\n",
- smp_processor_id(),
- hard_smp_processor_id(), ret);
+ cpu, hwcpu, ret);
}
}
- addr = __pa(get_slb_shadow());
- ret = unregister_slb_shadow(hard_smp_processor_id(), addr);
+ ret = unregister_slb_shadow(hwcpu);
if (ret) {
pr_err("WARNING: SLB shadow buffer deregistration "
"for cpu %d (hw %d) failed with %d\n",
- smp_processor_id(),
- hard_smp_processor_id(), ret);
+ cpu, hwcpu, ret);
}
- addr = __pa(get_lppaca());
- ret = unregister_vpa(hard_smp_processor_id(), addr);
+ ret = unregister_vpa(hwcpu);
if (ret) {
pr_err("WARNING: VPA deregistration for cpu %d "
- "(hw %d) failed with %d\n", smp_processor_id(),
- hard_smp_processor_id(), ret);
+ "(hw %d) failed with %d\n", cpu, hwcpu, ret);
}
}
}
Index: linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/plpar_wrappers.h 2011-07-25 21:06:52.340464687 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h 2011-07-25 21:06:57.380555950 +1000
@@ -53,9 +53,9 @@ static inline long vpa_call(unsigned lon
return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
}
-static inline long unregister_vpa(unsigned long cpu, unsigned long vpa)
+static inline long unregister_vpa(unsigned long cpu)
{
- return vpa_call(0x5, cpu, vpa);
+ return vpa_call(0x5, cpu, 0);
}
static inline long register_vpa(unsigned long cpu, unsigned long vpa)
@@ -63,9 +63,9 @@ static inline long register_vpa(unsigned
return vpa_call(0x1, cpu, vpa);
}
-static inline long unregister_slb_shadow(unsigned long cpu, unsigned long vpa)
+static inline long unregister_slb_shadow(unsigned long cpu)
{
- return vpa_call(0x7, cpu, vpa);
+ return vpa_call(0x7, cpu, 0);
}
static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
^ permalink raw reply
* [PATCH 2/3] powerpc/pseries: Cleanup VPA registration and deregistration errors
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20110725114631.778346293@samba.org>
Make the VPA, SLB shadow and DTL registration and deregistration
functions print consistent messages on error. I needed the firmware
error code while chasing a kexec bug but we weren't printing it.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:06:52.340464687 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:06:56.260535670 +1000
@@ -39,17 +39,20 @@ static void pseries_kexec_cpu_down(int c
}
addr = __pa(get_slb_shadow());
- if (unregister_slb_shadow(hard_smp_processor_id(), addr))
- printk("SLB shadow buffer deregistration of "
- "cpu %u (hw_cpu_id %d) failed\n",
+ ret = unregister_slb_shadow(hard_smp_processor_id(), addr);
+ if (ret) {
+ pr_err("WARNING: SLB shadow buffer deregistration "
+ "for cpu %d (hw %d) failed with %d\n",
smp_processor_id(),
- hard_smp_processor_id());
+ hard_smp_processor_id(), ret);
+ }
addr = __pa(get_lppaca());
- if (unregister_vpa(hard_smp_processor_id(), addr)) {
- printk("VPA deregistration of cpu %u (hw_cpu_id %d) "
- "failed\n", smp_processor_id(),
- hard_smp_processor_id());
+ ret = unregister_vpa(hard_smp_processor_id(), addr);
+ if (ret) {
+ pr_err("WARNING: VPA deregistration for cpu %d "
+ "(hw %d) failed with %d\n", smp_processor_id(),
+ hard_smp_processor_id(), ret);
}
}
}
Index: linux-powerpc/arch/powerpc/platforms/pseries/lpar.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/lpar.c 2011-07-25 21:06:49.440412178 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/lpar.c 2011-07-25 21:06:56.260535670 +1000
@@ -67,9 +67,8 @@ void vpa_init(int cpu)
ret = register_vpa(hwcpu, addr);
if (ret) {
- printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
- "cpu %d (hw %d) of area %lx returns %ld\n",
- cpu, hwcpu, addr, ret);
+ pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+ "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
return;
}
/*
@@ -80,10 +79,9 @@ void vpa_init(int cpu)
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
ret = register_slb_shadow(hwcpu, addr);
if (ret)
- printk(KERN_ERR
- "WARNING: vpa_init: SLB shadow buffer "
- "registration for cpu %d (hw %d) of area %lx "
- "returns %ld\n", cpu, hwcpu, addr, ret);
+ pr_err("WARNING: SLB shadow buffer registration for "
+ "cpu %d (hw %d) of area %lx failed with %ld\n",
+ cpu, hwcpu, addr, ret);
}
/*
@@ -100,8 +98,9 @@ void vpa_init(int cpu)
dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
ret = register_dtl(hwcpu, __pa(dtl));
if (ret)
- pr_warn("DTL registration failed for cpu %d (%ld)\n",
- cpu, ret);
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+ "failed with %ld\n", smp_processor_id(),
+ hwcpu, ret);
lppaca_of(cpu).dtl_enable_mask = 2;
}
}
Index: linux-powerpc/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/setup.c 2011-07-25 21:06:49.450412359 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/setup.c 2011-07-25 21:06:56.260535670 +1000
@@ -324,8 +324,9 @@ static int alloc_dispatch_logs(void)
dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
if (ret)
- pr_warn("DTL registration failed for boot cpu %d (%d)\n",
- smp_processor_id(), ret);
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+ "with %d\n", smp_processor_id(),
+ hard_smp_processor_id(), ret);
get_paca()->lppaca_ptr->dtl_enable_mask = 2;
return 0;
^ permalink raw reply
* [PATCH 1/3] powerpc/pseries: Fix kexec on recent firmware versions
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, stable
In-Reply-To: <20110725114631.778346293@samba.org>
Recent versions of firmware will fail to unmap the virtual processor
area if we have a dispatch trace log registered. This causes kexec
to fail.
If a trace log is registered this patch unregisters it before the
SLB shadow and virtual processor areas, fixing the problem.
The address argument is ignored by firmware on unregister so we
may as well remove it.
Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: <stable@kernel.org>
---
Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:06:49.510413446 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c 2011-07-25 21:06:52.340464687 +1000
@@ -26,6 +26,17 @@ static void pseries_kexec_cpu_down(int c
/* Don't risk a hypervisor call if we're crashing */
if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
unsigned long addr;
+ int ret;
+
+ if (get_lppaca()->dtl_enable_mask) {
+ ret = unregister_dtl(hard_smp_processor_id());
+ if (ret) {
+ pr_err("WARNING: DTL deregistration for cpu "
+ "%d (hw %d) failed with %d\n",
+ smp_processor_id(),
+ hard_smp_processor_id(), ret);
+ }
+ }
addr = __pa(get_slb_shadow());
if (unregister_slb_shadow(hard_smp_processor_id(), addr))
Index: linux-powerpc/arch/powerpc/platforms/pseries/dtl.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/dtl.c 2011-07-25 21:06:49.520413628 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/dtl.c 2011-07-25 21:06:52.340464687 +1000
@@ -181,7 +181,7 @@ static void dtl_stop(struct dtl *dtl)
lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
- unregister_dtl(hwcpu, __pa(dtl->buf));
+ unregister_dtl(hwcpu);
}
static u64 dtl_current_index(struct dtl *dtl)
Index: linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/plpar_wrappers.h 2011-07-25 21:06:49.500413264 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h 2011-07-25 21:06:52.340464687 +1000
@@ -73,9 +73,9 @@ static inline long register_slb_shadow(u
return vpa_call(0x3, cpu, vpa);
}
-static inline long unregister_dtl(unsigned long cpu, unsigned long vpa)
+static inline long unregister_dtl(unsigned long cpu)
{
- return vpa_call(0x6, cpu, vpa);
+ return vpa_call(0x6, cpu, 0);
}
static inline long register_dtl(unsigned long cpu, unsigned long vpa)
^ permalink raw reply
* [PATCH 0/3] pseries kexec fixes
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev
Here are a few pseries kexec fixes after testing on a recent firmware
version.
Anton
^ permalink raw reply
* [PATCH 5/5] powerpc/numa: Remove duplicate RECLAIM_DISTANCE definition
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>
We have two identical definitions of RECLAIM_DISTANCE, looks like
the patch got applied twice. Remove one.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h 2011-07-25 12:15:33.059921510 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h 2011-07-25 12:15:46.750174446 +1000
@@ -19,16 +19,6 @@ struct device_node;
#define RECLAIM_DISTANCE 10
/*
- * Before going off node we want the VM to try and reclaim from the local
- * node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
- * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
- * 20, we never reclaim and go off node straight away.
- *
- * To fix this we choose a smaller value of RECLAIM_DISTANCE.
- */
-#define RECLAIM_DISTANCE 10
-
-/*
* Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
* POWER7 boxes which have a maximum of 32 nodes.
*/
^ permalink raw reply
* [PATCH 4/5] powerpc/numa: Disable NEWIDLE balancing at node level
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>
On big POWER7 boxes we see large amounts of CPU time in system
processes like workqueue and watchdog kernel threads.
We currently rebalance the entire machine each time a task goes
idle and this is very expensive on large machines. Disable newidle
balancing at the node level and rely on the scheduler tick to
rebalance across nodes.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h 2011-07-25 12:14:25.448671947 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h 2011-07-25 12:14:26.568692651 +1000
@@ -75,7 +75,7 @@ static inline int pcibus_to_node(struct
.forkexec_idx = 0, \
\
.flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
+ | 0*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
^ permalink raw reply
* [PATCH 3/5] powerpc/numa: Increase SD_NODES_PER_DOMAIN to 32.
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>
The largest POWER7 boxes have 32 nodes. SD_NODES_PER_DOMAIN groups
nodes into chunks of 16 and adds a global balancing domain
(SD_ALLNODES) above it.
If we bump SD_NODES_PER_DOMAIN to 32, then we avoid this extra
level of balancing on our largest boxes.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h 2011-07-25 11:43:24.954093179 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h 2011-07-25 11:43:31.274205122 +1000
@@ -28,6 +28,12 @@ struct device_node;
*/
#define RECLAIM_DISTANCE 10
+/*
+ * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
+ * POWER7 boxes which have a maximum of 32 nodes.
+ */
+#define SD_NODES_PER_DOMAIN 32
+
#include <asm/mmzone.h>
static inline int cpu_to_node(int cpu)
^ permalink raw reply
* [PATCH 2/5] sched: Allow SD_NODES_PER_DOMAIN to be overridden
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>
We want to override the default value of SD_NODES_PER_DOMAIN on ppc64,
so move it into linux/topology.h.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Index: linux-2.6-work/include/linux/topology.h
===================================================================
--- linux-2.6-work.orig/include/linux/topology.h 2011-07-25 11:20:02.588717796 +1000
+++ linux-2.6-work/include/linux/topology.h 2011-07-25 11:26:50.616468376 +1000
@@ -201,6 +201,10 @@ int arch_update_cpu_topology(void);
.balance_interval = 64, \
}
+#ifndef SD_NODES_PER_DOMAIN
+#define SD_NODES_PER_DOMAIN 16
+#endif
+
#ifdef CONFIG_SCHED_BOOK
#ifndef SD_BOOK_INIT
#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
Index: linux-2.6-work/kernel/sched.c
===================================================================
--- linux-2.6-work.orig/kernel/sched.c 2011-07-25 11:20:09.538850173 +1000
+++ linux-2.6-work/kernel/sched.c 2011-07-25 11:26:50.626468565 +1000
@@ -6938,8 +6938,6 @@ static int __init isolated_cpu_setup(cha
__setup("isolcpus=", isolated_cpu_setup);
-#define SD_NODES_PER_DOMAIN 16
-
#ifdef CONFIG_NUMA
/**
^ permalink raw reply
* [PATCH 1/5] powerpc/numa: Enable SD_WAKE_AFFINE in node definition
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh
Cc: fenghua.yu, tony.luck, linux-kernel, ralf, lethal, cmetcalf,
linuxppc-dev, davem
In-Reply-To: <20110725023311.175792493@samba.org>
When chasing a performance issue on ppc64, I noticed tasks
communicating via a pipe would often end up on different nodes.
It turns out SD_WAKE_AFFINE is not set in our node definition. Commit
9fcd18c9e63e (sched: re-tune balancing) enabled SD_WAKE_AFFINE
in the node definition for x86 and we need a similar change for
ppc64.
I used lmbench lat_ctx and perf bench pipe to verify this fix. Each
benchmark was run 10 times and the average taken.
lmbench lat_ctx:
before: 66565 ops/sec
after: 204700 ops/sec
3.1x faster
perf bench pipe:
before: 5.6570 usecs
after: 1.3470 usecs
4.2x faster
Signed-off-by: Anton Blanchard <anton@samba.org>
---
Cc-ing arch maintainers who might need to look at their SD_NODE_INIT
definitions
Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h 2011-07-18 16:24:55.639949552 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h 2011-07-18 16:25:02.630074557 +1000
@@ -73,7 +73,7 @@ static inline int pcibus_to_node(struct
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
- | 0*SD_WAKE_AFFINE \
+ | 1*SD_WAKE_AFFINE \
| 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
^ permalink raw reply
* [PATCH 0/5] ppc64 scheduler fixes
From: Anton Blanchard @ 2011-07-25 2:33 UTC (permalink / raw)
To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
Here are a set of ppc64 scheduler fixes that help with some
multi node performance issues.
^ permalink raw reply
* Re: perf PPC: kernel panic with callchains and context switch events
From: Benjamin Herrenschmidt @ 2011-07-25 1:55 UTC (permalink / raw)
To: David Ahern, Kumar Gala, Becky Bruce
Cc: linux-perf-users, linuxppc-dev, Paul Mackerras, Anton Blanchard,
LKML
In-Reply-To: <4E2C53E0.3020400@gmail.com>
On Sun, 2011-07-24 at 11:18 -0600, David Ahern wrote:
> On 07/20/2011 03:57 PM, David Ahern wrote:
> > I am hoping someone familiar with PPC can help understand a panic that
> > is generated when capturing callchains with context switch events.
> >
> > Call trace is below. The short of it is that walking the callchain
> > generates a page fault. To handle the page fault the mmap_sem is needed,
> > but it is currently held by setup_arg_pages. setup_arg_pages calls
> > shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
> > move_page_tables which has a cond_resched at the top of its for loop. If
> > the cond_resched() is removed from move_page_tables everything works
> > beautifully - no panics.
> >
> > So, the question: is it normal for walking the stack to trigger a page
> > fault on PPC? The panic is not seen on x86 based systems.
>
> Can anyone confirm whether page faults while walking the stack are
> normal for PPC? We really want to use the context switch event with
> callchains and need to understand whether this behavior is normal. Of
> course if it is normal, a way to address the problem without a panic
> will be needed.
Now that leads to interesting discoveries :-) Becky, can you read all
the way and let me know what you think ?
So, trying to walk the user stack directly will potentially cause page
faults if it's done by direct access. So if you're going to do it in a
spot where you can't afford it, you need to pagefault_disable() I
suppose. I think the problem with our existing code is that it's missing
those around __get_user_inatomic().
In fact, arguably, we don't want the hash code modifying the hash
either (or even hashing things in). Our 64-bit code handles it today in
perf_callchain.c in a way that involves pretty much duplicating the
functionality of __get_user_pages_fast() as used by x86 (see below), but
as a fallback from a direct access which misses the pagefault_disable()
as well.
I think it comes from an old assumption that this would always be called
from an nmi, and the explicit tracepoints broke that assumption.
In fact we probably want to bump the NMI count, not just the IRQ count
as pagefault_disable() does, to make sure we prevent hashing.
x86 does things differently, using __get_user_pages_fast() (a variant of
get_user_pages_fast() that doesn't fall back to normal get_user_pages()).
Now, we could do the same (use __gup_fast too), but I can see a
potential issue with ppc 32-bit platforms that have 64-bit PTEs, since
we could end up GUP'ing in the middle of the two accesses.
Becky: I think gup_fast is generally broken on 32-bit with 64-bit PTE
because of that, the problem isn't specific to perf backtraces, I'll
propose a solution further down.
Now, on x86, there is a similar problem with PAE, which is handled by
- having gup disable IRQs
- rely on the fact that to change from a valid value to another valid
value, the PTE will first get invalidated, which requires an IPI
and thus will be blocked by our interrupts being off
We do the first part, but the second part will break if we use HW TLB
invalidation broadcast (yet another reason why those are bad, I think I
will write a blog entry about it one of these days).
I think we can work around this while keeping our broadcast TLB
invalidations by having the invalidation code also increment a global
generation count (using the existing lock used by the invalidation code,
all 32-bit platforms have such a lock).
From there, gup_fast can be changed to, with proper ordering, check the
generation count around the loading of the PTE and loop if it has
changed, kind-of a seqlock.
We also need the NMI count bump if we are going to try to keep the
attempt at doing a direct access first for perfs.
Becky, do you feel like giving that a shot or should I find another
victim ? (Or even do it myself ... ) :-)
Cheers,
Ben.
> Thanks,
> David
>
> >
> > [<b0180e00>]rb_erase+0x1b4/0x3e8
> > [<b00430f4>]__dequeue_entity+0x50/0xe8
> > [<b0043304>]set_next_entity+0x178/0x1bc
> > [<b0043440>]pick_next_task_fair+0xb0/0x118
> > [<b02ada80>]schedule+0x500/0x614
> > [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
> > [<b02afca0>]rwsem_down_read_failed+0x34/0x54
> > [<b02aed4c>]down_read+0x3c/0x54
> > [<b0023b58>]do_page_fault+0x114/0x5e8
> > [<b001e350>]handle_page_fault+0xc/0x80
> > [<b0022dec>]perf_callchain+0x224/0x31c
> > [<b009ba70>]perf_prepare_sample+0x240/0x2fc
> > [<b009d760>]__perf_event_overflow+0x280/0x398
> > [<b009d914>]perf_swevent_overflow+0x9c/0x10c
> > [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
> > [<b009dc38>]do_perf_sw_event+0x84/0xe4
> > [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
> > [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
> > [<b02ad840>]schedule+0x2c0/0x614
> > [<b0047dc0>]__cond_resched+0x34/0x90
> > [<b02adcc8>]_cond_resched+0x4c/0x68
> > [<b00bccf8>]move_page_tables+0xb0/0x418
> > [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
> > [<b0110914>]load_elf_binary+0x394/0x1208
> > [<b00d6e28>]search_binary_handler+0xe0/0x2c4
> > [<b00d834c>]do_execve+0x1bc/0x268
> > [<b0015394>]sys_execve+0x84/0xc8
> > [<b001df10>]ret_from_syscall+0x0/0x3c
> >
> > Thanks,
> > David
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
^ permalink raw reply
* [PATCH] perf: powerpc: Disable pagefaults during callchain stack read
From: Anton Blanchard @ 2011-07-25 0:05 UTC (permalink / raw)
To: David Ahern; +Cc: linux-perf-users, linuxppc-dev, Paul Mackerras, LKML
In-Reply-To: <4E2C53E0.3020400@gmail.com>
Hi David,
> > I am hoping someone familiar with PPC can help understand a panic
> > that is generated when capturing callchains with context switch
> > events.
> >
> > Call trace is below. The short of it is that walking the callchain
> > generates a page fault. To handle the page fault the mmap_sem is
> > needed, but it is currently held by setup_arg_pages.
> > setup_arg_pages calls shift_arg_pages with the mmap_sem held.
> > shift_arg_pages then calls move_page_tables which has a
> > cond_resched at the top of its for loop. If the cond_resched() is
> > removed from move_page_tables everything works beautifully - no
> > panics.
> >
> > So, the question: is it normal for walking the stack to trigger a
> > page fault on PPC? The panic is not seen on x86 based systems.
>
> Can anyone confirm whether page faults while walking the stack are
> normal for PPC? We really want to use the context switch event with
> callchains and need to understand whether this behavior is normal. Of
> course if it is normal, a way to address the problem without a panic
> will be needed.
I talked to Ben about this last week and he pointed me at
pagefault_disable/enable. Untested patch below.
Anton
--
We need to disable pagefaults when reading the stack otherwise
we can lock up trying to take the mmap_sem when the code we are
profiling already has a write lock taken.
This will not happen for hardware events, but could for software
events.
Reported-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: <stable@kernel.org>
---
Index: linux-powerpc/arch/powerpc/kernel/perf_callchain.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/kernel/perf_callchain.c 2011-07-25 09:54:27.296757427 +1000
+++ linux-powerpc/arch/powerpc/kernel/perf_callchain.c 2011-07-25 09:56:08.828367882 +1000
@@ -154,8 +154,12 @@ static int read_user_stack_64(unsigned l
((unsigned long)ptr & 7))
return -EFAULT;
- if (!__get_user_inatomic(*ret, ptr))
+ pagefault_disable();
+ if (!__get_user_inatomic(*ret, ptr)) {
+ pagefault_enable();
return 0;
+ }
+ pagefault_enable();
return read_user_stack_slow(ptr, ret, 8);
}
@@ -166,8 +170,12 @@ static int read_user_stack_32(unsigned i
((unsigned long)ptr & 3))
return -EFAULT;
- if (!__get_user_inatomic(*ret, ptr))
+ pagefault_disable();
+ if (!__get_user_inatomic(*ret, ptr)) {
+ pagefault_enable();
return 0;
+ }
+ pagefault_enable();
return read_user_stack_slow(ptr, ret, 4);
}
^ permalink raw reply
* Re: Linux 3.0 boot failure on the Powerbook G4
From: Benjamin Herrenschmidt @ 2011-07-24 23:23 UTC (permalink / raw)
To: Michael Büsch; +Cc: linuxppc-dev
In-Reply-To: <20110724143729.49c69ce8@maggie>
On Sun, 2011-07-24 at 14:37 +0200, Michael Büsch wrote:
> On Sun, 24 Jul 2011 22:13:34 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > > I'm booting zImage.pmac.
> >
> > Ah that might make it easier... I don't remember where it links, can you
> > show me the program headers out of readelf -a of the zImage ?
>
> As I recompiled stuff, here's the current failure log:
> http://bues.ch/misc/linux-3.0-pbook-2.jpg
>
> And this is the corresponding readelf output:
Hrm.. the faulting address is outside of the zImage. Odd.
Can you try loading a plain vmlinux instead ? (feel free to strip it).
yaboot 1.3.13 might not be the best one to load a real ELF ...
On my side I'll dig one of my old powerbooks and see if I can reproduce
(I generally tend to netboot the zImage directly, but it needs to be <
4M for that to work due to Apple OF limitations, or use yaboot with plain
vmlinux which exercises a different code path within yaboot).
Cheers,
Ben.
> mb@maggie:~$ readelf -a /boot/linux.a
> ELF Header:
> Magic: 7f 45 4c 46 01 02 01 00 00 00 00 00 00 00 00 00
> Class: ELF32
> Data: 2's complement, big endian
> Version: 1 (current)
> OS/ABI: UNIX - System V
> ABI Version: 0
> Type: EXEC (Executable file)
> Machine: PowerPC
> Version: 0x1
> Entry point address: 0x400230
> Start of program headers: 52 (bytes into file)
> Start of section headers: 5769716 (bytes into file)
> Flags: 0x8000, relocatable-lib
> Size of this header: 52 (bytes)
> Size of program headers: 32 (bytes)
> Number of program headers: 2
> Size of section headers: 40 (bytes)
> Number of section headers: 12
> Section header string table index: 9
>
> Section Headers:
> [Nr] Name Type Addr Off Size ES Flg Lk Inf Al
> [ 0] NULL 00000000 000000 000000 00 0 0 0
> [ 1] .text PROGBITS 00400000 010000 0048b0 00 AX 0 0 4
> [ 2] .data PROGBITS 00405000 015000 0012f8 00 WA 0 0 4
> [ 3] .got PROGBITS 004062f8 0162f8 00000c 04 WA 0 0 4
> [ 4] __builtin_cmdline PROGBITS 00406304 016304 000200 00 WA 0 0 4
> [ 5] .kernel:vmlinux.s PROGBITS 00407000 017000 569952 00 A 0 0 1
> [ 6] .bss NOBITS 00971000 580952 00bc70 00 WA 0 0 4
> [ 7] .comment PROGBITS 00000000 580952 00001c 01 MS 0 0 1
> [ 8] .gnu.attributes LOOS+ffffff5 00000000 58096e 000014 00 0 0 1
> [ 9] .shstrtab STRTAB 00000000 580982 000072 00 0 0 1
> [10] .symtab SYMTAB 00000000 580bd4 000780 10 11 55 4
> [11] .strtab STRTAB 00000000 581354 0004f3 00 0 0 1
> Key to Flags:
> W (write), A (alloc), X (execute), M (merge), S (strings)
> I (info), L (link order), G (group), x (unknown)
> O (extra OS processing required) o (OS specific), p (processor specific)
>
> There are no section groups in this file.
>
> Program Headers:
> Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
> LOAD 0x010000 0x00400000 0x00400000 0x570952 0x57cc70 RWE 0x10000
> GNU_STACK 0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
>
> Section to Segment mapping:
> Segment Sections...
> 00 .text .data .got __builtin_cmdline .kernel:vmlinux.strip .bss
> 01
>
> There is no dynamic section in this file.
>
> There are no relocations in this file.
>
> There are no unwind sections in this file.
>
> Symbol table '.symtab' contains 120 entries:
> Num: Value Size Type Bind Vis Ndx Name
> 0: 00000000 0 NOTYPE LOCAL DEFAULT UND
> 1: 00400000 0 SECTION LOCAL DEFAULT 1
> 2: 00405000 0 SECTION LOCAL DEFAULT 2
> 3: 004062f8 0 SECTION LOCAL DEFAULT 3
> 4: 00406304 0 SECTION LOCAL DEFAULT 4
> 5: 00407000 0 SECTION LOCAL DEFAULT 5
> 6: 00971000 0 SECTION LOCAL DEFAULT 6
> 7: 00000000 0 SECTION LOCAL DEFAULT 7
> 8: 00000000 0 SECTION LOCAL DEFAULT 8
> 9: 00000000 0 FILE LOCAL DEFAULT ABS of.c
> 10: 00400000 96 FUNC LOCAL DEFAULT 1 of_image_hdr
> 11: 00400130 220 FUNC LOCAL DEFAULT 1 of_try_claim
> 12: 00971000 4 OBJECT LOCAL DEFAULT 6 claim_base
> 13: 00000000 0 FILE LOCAL DEFAULT ABS empty.c
> 14: 0040021c 0 NOTYPE LOCAL DEFAULT 1 p_start
> 15: 00400220 0 NOTYPE LOCAL DEFAULT 1 p_etext
> 16: 00400224 0 NOTYPE LOCAL DEFAULT 1 p_bss_start
> 17: 00400228 0 NOTYPE LOCAL DEFAULT 1 p_end
> 18: 0040022c 0 NOTYPE LOCAL DEFAULT 1 p_pstack
> 19: 00400234 0 NOTYPE LOCAL DEFAULT 1 p_base
> 20: 00000007 0 NOTYPE LOCAL DEFAULT ABS RELA
> 21: 6ffffff9 0 NOTYPE LOCAL DEFAULT ABS RELACOUNT
> 22: 00000000 0 FILE LOCAL DEFAULT ABS main.c
> 23: 0040032c 536 FUNC LOCAL DEFAULT 1 prep_kernel
> 24: 00971004 46960 OBJECT LOCAL DEFAULT 6 gzstate
> 25: 00406304 512 OBJECT LOCAL DEFAULT 4 cmdline
> 26: 00000000 0 FILE LOCAL DEFAULT ABS gunzip_util.c
> 27: 0097c774 128 OBJECT LOCAL DEFAULT 6 discard_buf.1439
> 28: 00000000 0 FILE LOCAL DEFAULT ABS elf_util.c
> 29: 00000000 0 FILE LOCAL DEFAULT ABS inflate.c
> 30: 00400ed4 424 FUNC LOCAL DEFAULT 1 zlib_adler32
> 31: 004011c4 292 FUNC LOCAL DEFAULT 1 zlib_updatewindow
> 32: 00405484 2048 OBJECT LOCAL DEFAULT 2 lenfix.1147
> 33: 00405c84 128 OBJECT LOCAL DEFAULT 2 distfix.1148
> 34: 00405d04 38 OBJECT LOCAL DEFAULT 2 order.1216
> 35: 00000000 0 FILE LOCAL DEFAULT ABS inftrees.c
> 36: 00405e8e 62 OBJECT LOCAL DEFAULT 2 lext.1062
> 37: 00405ecc 62 OBJECT LOCAL DEFAULT 2 lbase.1061
> 38: 00405f0a 64 OBJECT LOCAL DEFAULT 2 dext.1064
> 39: 00405f4a 64 OBJECT LOCAL DEFAULT 2 dbase.1063
> 40: 00000000 0 FILE LOCAL DEFAULT ABS oflib.c
> 41: 00402a4c 432 FUNC LOCAL DEFAULT 1 of_call_prom_ret
> 42: 0040611c 4 OBJECT LOCAL DEFAULT 2 need_map
> 43: 0097c7f4 4 OBJECT LOCAL DEFAULT 6 prom
> 44: 0097c7f8 4 OBJECT LOCAL DEFAULT 6 chosen_mmu
> 45: 0097c7fc 4 OBJECT LOCAL DEFAULT 6 memory
> 46: 00000000 0 FILE LOCAL DEFAULT ABS ofconsole.c
> 47: 004032b0 104 FUNC LOCAL DEFAULT 1 of_console_open
> 48: 0040325c 84 FUNC LOCAL DEFAULT 1 of_console_write
> 49: 0097c800 4 OBJECT LOCAL DEFAULT 6 of_stdout_handle
> 50: 00000000 0 FILE LOCAL DEFAULT ABS stdio.c
> 51: 0040369c 848 FUNC LOCAL DEFAULT 1 number
> 52: 0097c804 1024 OBJECT LOCAL DEFAULT 6 sprint_buf
> 53: 00000000 0 FILE LOCAL DEFAULT ABS inffast.c
> 54: 004062f8 0 OBJECT LOCAL HIDDEN 3 _GLOBAL_OFFSET_TABLE_
> 55: 00400060 208 FUNC GLOBAL DEFAULT 1 platform_init
> 56: 00403318 0 NOTYPE GLOBAL DEFAULT 1 strcpy
> 57: 00000000 0 NOTYPE WEAK DEFAULT UND _platform_stack_top
> 58: 00400924 240 FUNC GLOBAL DEFAULT 1 gunzip_partial
> 59: 0040413c 188 FUNC GLOBAL DEFAULT 1 printf
> 60: 004039ec 1872 FUNC GLOBAL DEFAULT 1 vsprintf
> 61: 0040426c 0 NOTYPE GLOBAL DEFAULT 1 __div64_32
> 62: 00403468 0 NOTYPE GLOBAL DEFAULT 1 memmove
> 63: 00402a10 60 FUNC GLOBAL DEFAULT 1 of_init
> 64: 00406508 0 NOTYPE GLOBAL DEFAULT 4 _dtb_start
> 65: 0040020c 0 NOTYPE GLOBAL DEFAULT 1 _zimage_start_opd
> 66: 004048b0 0 NOTYPE GLOBAL DEFAULT 1 _etext
> 67: 00402e04 72 FUNC GLOBAL DEFAULT 1 of_finddevice
> 68: 00401088 132 FUNC GLOBAL DEFAULT 1 zlib_inflateReset
> 69: 00403470 0 NOTYPE GLOBAL DEFAULT 1 memcpy
> 70: 00403624 0 NOTYPE GLOBAL DEFAULT 1 flush_cache
> 71: 0040430c 1444 FUNC GLOBAL DEFAULT 1 inflate_fast
> 72: 00407000 0 NOTYPE GLOBAL DEFAULT 5 _vmlinux_start
> 73: 0040110c 152 FUNC GLOBAL DEFAULT 1 zlib_inflateInit2
> 74: 00402dac 88 FUNC GLOBAL DEFAULT 1 of_getprop
> 75: 00400b80 484 FUNC GLOBAL DEFAULT 1 gunzip_start
> 76: 0097cc04 20 OBJECT GLOBAL DEFAULT 6 loader_info
> 77: 0097cc18 28 OBJECT GLOBAL DEFAULT 6 platform_ops
> 78: 00403140 212 FUNC GLOBAL DEFAULT 1 of_vmlinux_alloc
> 79: 00400a7c 120 FUNC GLOBAL DEFAULT 1 gunzip_exactly
> 80: 004012e8 240 FUNC GLOBAL DEFAULT 1 zlib_inflateIncomp
> 81: 00400d64 200 FUNC GLOBAL DEFAULT 1 parse_elf64
> 82: 0097cc34 20 OBJECT GLOBAL DEFAULT 6 console_ops
> 83: 00403650 76 FUNC GLOBAL DEFAULT 1 strnlen
> 84: 00400a14 104 FUNC GLOBAL DEFAULT 1 gunzip_finish
> 85: 00402e90 688 FUNC GLOBAL DEFAULT 1 of_claim
> 86: 00402480 1424 FUNC GLOBAL DEFAULT 1 zlib_inflate_table
> 87: 00400af4 140 FUNC GLOBAL DEFAULT 1 gunzip_discard
> 88: 004013d8 4264 FUNC GLOBAL DEFAULT 1 zlib_inflate
> 89: 00400e2c 168 FUNC GLOBAL DEFAULT 1 parse_elf32
> 90: 0040335c 0 NOTYPE GLOBAL DEFAULT 1 strcat
> 91: 00402e4c 68 FUNC GLOBAL DEFAULT 1 of_exit
> 92: 004035cc 0 NOTYPE GLOBAL DEFAULT 1 memchr
> 93: 00400000 0 NOTYPE GLOBAL DEFAULT 1 _start
> 94: 004033cc 0 NOTYPE GLOBAL DEFAULT 1 strncmp
> 95: 00403214 72 FUNC GLOBAL DEFAULT 1 of_console_init
> 96: 0040107c 12 FUNC GLOBAL DEFAULT 1 zlib_inflate_workspacesiz
> 97: 00403334 0 NOTYPE GLOBAL DEFAULT 1 strncpy
> 98: 004035f4 0 NOTYPE GLOBAL DEFAULT 1 memcmp
> 99: 00971000 0 NOTYPE GLOBAL DEFAULT 5 _initrd_start
> 100: 00400230 0 NOTYPE WEAK DEFAULT 1 _zimage_start
> 101: 00403528 0 NOTYPE GLOBAL DEFAULT 1 backwards_memcpy
> 102: 00971000 0 NOTYPE GLOBAL DEFAULT 6 __bss_start
> 103: 0040340c 0 NOTYPE GLOBAL DEFAULT 1 memset
> 104: 00406508 0 NOTYPE GLOBAL DEFAULT 4 _dtb_end
> 105: 00971000 0 NOTYPE GLOBAL DEFAULT 5 _initrd_end
> 106: 0097cc48 40 OBJECT GLOBAL DEFAULT 6 dt_ops
> 107: 004033a8 0 NOTYPE GLOBAL DEFAULT 1 strcmp
> 108: 004041f8 116 FUNC GLOBAL DEFAULT 1 sprintf
> 109: 00971000 0 NOTYPE GLOBAL DEFAULT 6 _edata
> 110: 0097cc70 0 NOTYPE GLOBAL DEFAULT 6 _end
> 111: 00400544 992 FUNC GLOBAL DEFAULT 1 start
> 112: 00970952 0 NOTYPE GLOBAL DEFAULT 5 _vmlinux_end
> 113: 004033f4 0 NOTYPE GLOBAL DEFAULT 1 strlen
> 114: 00403388 0 NOTYPE GLOBAL DEFAULT 1 strchr
> 115: 00400230 0 NOTYPE GLOBAL DEFAULT 1 _zimage_start_lib
> 116: 00406504 0 NOTYPE GLOBAL DEFAULT 4 __dynamic_start
> 117: 004011a4 32 FUNC GLOBAL DEFAULT 1 zlib_inflateEnd
> 118: 00402d54 88 FUNC GLOBAL DEFAULT 1 of_setprop
> 119: 00402bfc 344 FUNC GLOBAL DEFAULT 1 of_call_prom
>
> No version information found in this file.
> Attribute Section: gnu
> File Attributes
> Tag_GNU_Power_ABI_FP: Soft float
> Tag_GNU_Power_ABI_Vector: Generic
> Tag_GNU_Power_ABI_Struct_Return: Memory
>
>
^ permalink raw reply
* Re: perf PPC: kernel panic with callchains and context switch events
From: David Ahern @ 2011-07-24 17:18 UTC (permalink / raw)
To: Anton Blanchard, Paul Mackerras, linux-perf-users, LKML,
linuxppc-dev
In-Reply-To: <4E274F5F.7000604@gmail.com>
On 07/20/2011 03:57 PM, David Ahern wrote:
> I am hoping someone familiar with PPC can help understand a panic that
> is generated when capturing callchains with context switch events.
>
> Call trace is below. The short of it is that walking the callchain
> generates a page fault. To handle the page fault the mmap_sem is needed,
> but it is currently held by setup_arg_pages. setup_arg_pages calls
> shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
> move_page_tables which has a cond_resched at the top of its for loop. If
> the cond_resched() is removed from move_page_tables everything works
> beautifully - no panics.
>
> So, the question: is it normal for walking the stack to trigger a page
> fault on PPC? The panic is not seen on x86 based systems.
Can anyone confirm whether page faults while walking the stack are
normal for PPC? We really want to use the context switch event with
callchains and need to understand whether this behavior is normal. Of
course if it is normal, a way to address the problem without a panic
will be needed.
Thanks,
David
>
> [<b0180e00>]rb_erase+0x1b4/0x3e8
> [<b00430f4>]__dequeue_entity+0x50/0xe8
> [<b0043304>]set_next_entity+0x178/0x1bc
> [<b0043440>]pick_next_task_fair+0xb0/0x118
> [<b02ada80>]schedule+0x500/0x614
> [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
> [<b02afca0>]rwsem_down_read_failed+0x34/0x54
> [<b02aed4c>]down_read+0x3c/0x54
> [<b0023b58>]do_page_fault+0x114/0x5e8
> [<b001e350>]handle_page_fault+0xc/0x80
> [<b0022dec>]perf_callchain+0x224/0x31c
> [<b009ba70>]perf_prepare_sample+0x240/0x2fc
> [<b009d760>]__perf_event_overflow+0x280/0x398
> [<b009d914>]perf_swevent_overflow+0x9c/0x10c
> [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
> [<b009dc38>]do_perf_sw_event+0x84/0xe4
> [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
> [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
> [<b02ad840>]schedule+0x2c0/0x614
> [<b0047dc0>]__cond_resched+0x34/0x90
> [<b02adcc8>]_cond_resched+0x4c/0x68
> [<b00bccf8>]move_page_tables+0xb0/0x418
> [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
> [<b0110914>]load_elf_binary+0x394/0x1208
> [<b00d6e28>]search_binary_handler+0xe0/0x2c4
> [<b00d834c>]do_execve+0x1bc/0x268
> [<b0015394>]sys_execve+0x84/0xc8
> [<b001df10>]ret_from_syscall+0x0/0x3c
>
> Thanks,
> David
^ permalink raw reply
* Re: [PATCH 2/5] hugetlb: add phys addr to struct huge_bootmem_page
From: Tabi Timur-B04825 @ 2011-07-24 16:48 UTC (permalink / raw)
To: Becky Bruce
Cc: linuxppc-dev@lists.ozlabs.org,
List linux-kernel@vger.kernel.org Mailing, David Gibson
In-Reply-To: <786B027A-4EC8-4175-A18D-9DA57E9549D6@kernel.crashing.org>
On Thu, Jun 30, 2011 at 1:50 PM, Becky Bruce <beckyb@kernel.crashing.org> wrote:
> Because there was no bootmem allocation in the normal case - the non-highmem
> version stores data structure in the huge page itself.  This is perfectly fine as long
> as you have a mapping.  Since this isn't true for HIGHMEM pages, I allocate
> bootmem to store the early data structure that stores information about the
> hugepage (this happens in arch-specific code in alloc_bootmem_huge_page).
I would put this text in a comment in the code.
-- 
Timur Tabi
Linux kernel developer at Freescale
^ permalink raw reply
* Re: Linux 3.0 boot failure on the Powerbook G4
From: Michael Büsch @ 2011-07-24 12:37 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1311509614.25044.585.camel@pasglop>
On Sun, 24 Jul 2011 22:13:34 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > I'm booting zImage.pmac.
>
> Ah that might make it easier... I don't remember where it links, can you
> show me the program headers out of readelf -a of the zImage ?
As I recompiled stuff, here's the current failure log:
http://bues.ch/misc/linux-3.0-pbook-2.jpg
And this is the corresponding readelf output:
mb@maggie:~$ readelf -a /boot/linux.a
ELF Header:
Magic: 7f 45 4c 46 01 02 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, big endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: PowerPC
Version: 0x1
Entry point address: 0x400230
Start of program headers: 52 (bytes into file)
Start of section headers: 5769716 (bytes into file)
Flags: 0x8000, relocatable-lib
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 12
Section header string table index: 9
Section Headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
[ 1] .text PROGBITS 00400000 010000 0048b0 00 AX 0 0 4
[ 2] .data PROGBITS 00405000 015000 0012f8 00 WA 0 0 4
[ 3] .got PROGBITS 004062f8 0162f8 00000c 04 WA 0 0 4
[ 4] __builtin_cmdline PROGBITS 00406304 016304 000200 00 WA 0 0 4
[ 5] .kernel:vmlinux.s PROGBITS 00407000 017000 569952 00 A 0 0 1
[ 6] .bss NOBITS 00971000 580952 00bc70 00 WA 0 0 4
[ 7] .comment PROGBITS 00000000 580952 00001c 01 MS 0 0 1
[ 8] .gnu.attributes LOOS+ffffff5 00000000 58096e 000014 00 0 0 1
[ 9] .shstrtab STRTAB 00000000 580982 000072 00 0 0 1
[10] .symtab SYMTAB 00000000 580bd4 000780 10 11 55 4
[11] .strtab STRTAB 00000000 581354 0004f3 00 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
There are no section groups in this file.
Program Headers:
Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
LOAD 0x010000 0x00400000 0x00400000 0x570952 0x57cc70 RWE 0x10000
GNU_STACK 0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
Section to Segment mapping:
Segment Sections...
00 .text .data .got __builtin_cmdline .kernel:vmlinux.strip .bss
01
There is no dynamic section in this file.
There are no relocations in this file.
There are no unwind sections in this file.
Symbol table '.symtab' contains 120 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 00000000 0 NOTYPE LOCAL DEFAULT UND
1: 00400000 0 SECTION LOCAL DEFAULT 1
2: 00405000 0 SECTION LOCAL DEFAULT 2
3: 004062f8 0 SECTION LOCAL DEFAULT 3
4: 00406304 0 SECTION LOCAL DEFAULT 4
5: 00407000 0 SECTION LOCAL DEFAULT 5
6: 00971000 0 SECTION LOCAL DEFAULT 6
7: 00000000 0 SECTION LOCAL DEFAULT 7
8: 00000000 0 SECTION LOCAL DEFAULT 8
9: 00000000 0 FILE LOCAL DEFAULT ABS of.c
10: 00400000 96 FUNC LOCAL DEFAULT 1 of_image_hdr
11: 00400130 220 FUNC LOCAL DEFAULT 1 of_try_claim
12: 00971000 4 OBJECT LOCAL DEFAULT 6 claim_base
13: 00000000 0 FILE LOCAL DEFAULT ABS empty.c
14: 0040021c 0 NOTYPE LOCAL DEFAULT 1 p_start
15: 00400220 0 NOTYPE LOCAL DEFAULT 1 p_etext
16: 00400224 0 NOTYPE LOCAL DEFAULT 1 p_bss_start
17: 00400228 0 NOTYPE LOCAL DEFAULT 1 p_end
18: 0040022c 0 NOTYPE LOCAL DEFAULT 1 p_pstack
19: 00400234 0 NOTYPE LOCAL DEFAULT 1 p_base
20: 00000007 0 NOTYPE LOCAL DEFAULT ABS RELA
21: 6ffffff9 0 NOTYPE LOCAL DEFAULT ABS RELACOUNT
22: 00000000 0 FILE LOCAL DEFAULT ABS main.c
23: 0040032c 536 FUNC LOCAL DEFAULT 1 prep_kernel
24: 00971004 46960 OBJECT LOCAL DEFAULT 6 gzstate
25: 00406304 512 OBJECT LOCAL DEFAULT 4 cmdline
26: 00000000 0 FILE LOCAL DEFAULT ABS gunzip_util.c
27: 0097c774 128 OBJECT LOCAL DEFAULT 6 discard_buf.1439
28: 00000000 0 FILE LOCAL DEFAULT ABS elf_util.c
29: 00000000 0 FILE LOCAL DEFAULT ABS inflate.c
30: 00400ed4 424 FUNC LOCAL DEFAULT 1 zlib_adler32
31: 004011c4 292 FUNC LOCAL DEFAULT 1 zlib_updatewindow
32: 00405484 2048 OBJECT LOCAL DEFAULT 2 lenfix.1147
33: 00405c84 128 OBJECT LOCAL DEFAULT 2 distfix.1148
34: 00405d04 38 OBJECT LOCAL DEFAULT 2 order.1216
35: 00000000 0 FILE LOCAL DEFAULT ABS inftrees.c
36: 00405e8e 62 OBJECT LOCAL DEFAULT 2 lext.1062
37: 00405ecc 62 OBJECT LOCAL DEFAULT 2 lbase.1061
38: 00405f0a 64 OBJECT LOCAL DEFAULT 2 dext.1064
39: 00405f4a 64 OBJECT LOCAL DEFAULT 2 dbase.1063
40: 00000000 0 FILE LOCAL DEFAULT ABS oflib.c
41: 00402a4c 432 FUNC LOCAL DEFAULT 1 of_call_prom_ret
42: 0040611c 4 OBJECT LOCAL DEFAULT 2 need_map
43: 0097c7f4 4 OBJECT LOCAL DEFAULT 6 prom
44: 0097c7f8 4 OBJECT LOCAL DEFAULT 6 chosen_mmu
45: 0097c7fc 4 OBJECT LOCAL DEFAULT 6 memory
46: 00000000 0 FILE LOCAL DEFAULT ABS ofconsole.c
47: 004032b0 104 FUNC LOCAL DEFAULT 1 of_console_open
48: 0040325c 84 FUNC LOCAL DEFAULT 1 of_console_write
49: 0097c800 4 OBJECT LOCAL DEFAULT 6 of_stdout_handle
50: 00000000 0 FILE LOCAL DEFAULT ABS stdio.c
51: 0040369c 848 FUNC LOCAL DEFAULT 1 number
52: 0097c804 1024 OBJECT LOCAL DEFAULT 6 sprint_buf
53: 00000000 0 FILE LOCAL DEFAULT ABS inffast.c
54: 004062f8 0 OBJECT LOCAL HIDDEN 3 _GLOBAL_OFFSET_TABLE_
55: 00400060 208 FUNC GLOBAL DEFAULT 1 platform_init
56: 00403318 0 NOTYPE GLOBAL DEFAULT 1 strcpy
57: 00000000 0 NOTYPE WEAK DEFAULT UND _platform_stack_top
58: 00400924 240 FUNC GLOBAL DEFAULT 1 gunzip_partial
59: 0040413c 188 FUNC GLOBAL DEFAULT 1 printf
60: 004039ec 1872 FUNC GLOBAL DEFAULT 1 vsprintf
61: 0040426c 0 NOTYPE GLOBAL DEFAULT 1 __div64_32
62: 00403468 0 NOTYPE GLOBAL DEFAULT 1 memmove
63: 00402a10 60 FUNC GLOBAL DEFAULT 1 of_init
64: 00406508 0 NOTYPE GLOBAL DEFAULT 4 _dtb_start
65: 0040020c 0 NOTYPE GLOBAL DEFAULT 1 _zimage_start_opd
66: 004048b0 0 NOTYPE GLOBAL DEFAULT 1 _etext
67: 00402e04 72 FUNC GLOBAL DEFAULT 1 of_finddevice
68: 00401088 132 FUNC GLOBAL DEFAULT 1 zlib_inflateReset
69: 00403470 0 NOTYPE GLOBAL DEFAULT 1 memcpy
70: 00403624 0 NOTYPE GLOBAL DEFAULT 1 flush_cache
71: 0040430c 1444 FUNC GLOBAL DEFAULT 1 inflate_fast
72: 00407000 0 NOTYPE GLOBAL DEFAULT 5 _vmlinux_start
73: 0040110c 152 FUNC GLOBAL DEFAULT 1 zlib_inflateInit2
74: 00402dac 88 FUNC GLOBAL DEFAULT 1 of_getprop
75: 00400b80 484 FUNC GLOBAL DEFAULT 1 gunzip_start
76: 0097cc04 20 OBJECT GLOBAL DEFAULT 6 loader_info
77: 0097cc18 28 OBJECT GLOBAL DEFAULT 6 platform_ops
78: 00403140 212 FUNC GLOBAL DEFAULT 1 of_vmlinux_alloc
79: 00400a7c 120 FUNC GLOBAL DEFAULT 1 gunzip_exactly
80: 004012e8 240 FUNC GLOBAL DEFAULT 1 zlib_inflateIncomp
81: 00400d64 200 FUNC GLOBAL DEFAULT 1 parse_elf64
82: 0097cc34 20 OBJECT GLOBAL DEFAULT 6 console_ops
83: 00403650 76 FUNC GLOBAL DEFAULT 1 strnlen
84: 00400a14 104 FUNC GLOBAL DEFAULT 1 gunzip_finish
85: 00402e90 688 FUNC GLOBAL DEFAULT 1 of_claim
86: 00402480 1424 FUNC GLOBAL DEFAULT 1 zlib_inflate_table
87: 00400af4 140 FUNC GLOBAL DEFAULT 1 gunzip_discard
88: 004013d8 4264 FUNC GLOBAL DEFAULT 1 zlib_inflate
89: 00400e2c 168 FUNC GLOBAL DEFAULT 1 parse_elf32
90: 0040335c 0 NOTYPE GLOBAL DEFAULT 1 strcat
91: 00402e4c 68 FUNC GLOBAL DEFAULT 1 of_exit
92: 004035cc 0 NOTYPE GLOBAL DEFAULT 1 memchr
93: 00400000 0 NOTYPE GLOBAL DEFAULT 1 _start
94: 004033cc 0 NOTYPE GLOBAL DEFAULT 1 strncmp
95: 00403214 72 FUNC GLOBAL DEFAULT 1 of_console_init
96: 0040107c 12 FUNC GLOBAL DEFAULT 1 zlib_inflate_workspacesiz
97: 00403334 0 NOTYPE GLOBAL DEFAULT 1 strncpy
98: 004035f4 0 NOTYPE GLOBAL DEFAULT 1 memcmp
99: 00971000 0 NOTYPE GLOBAL DEFAULT 5 _initrd_start
100: 00400230 0 NOTYPE WEAK DEFAULT 1 _zimage_start
101: 00403528 0 NOTYPE GLOBAL DEFAULT 1 backwards_memcpy
102: 00971000 0 NOTYPE GLOBAL DEFAULT 6 __bss_start
103: 0040340c 0 NOTYPE GLOBAL DEFAULT 1 memset
104: 00406508 0 NOTYPE GLOBAL DEFAULT 4 _dtb_end
105: 00971000 0 NOTYPE GLOBAL DEFAULT 5 _initrd_end
106: 0097cc48 40 OBJECT GLOBAL DEFAULT 6 dt_ops
107: 004033a8 0 NOTYPE GLOBAL DEFAULT 1 strcmp
108: 004041f8 116 FUNC GLOBAL DEFAULT 1 sprintf
109: 00971000 0 NOTYPE GLOBAL DEFAULT 6 _edata
110: 0097cc70 0 NOTYPE GLOBAL DEFAULT 6 _end
111: 00400544 992 FUNC GLOBAL DEFAULT 1 start
112: 00970952 0 NOTYPE GLOBAL DEFAULT 5 _vmlinux_end
113: 004033f4 0 NOTYPE GLOBAL DEFAULT 1 strlen
114: 00403388 0 NOTYPE GLOBAL DEFAULT 1 strchr
115: 00400230 0 NOTYPE GLOBAL DEFAULT 1 _zimage_start_lib
116: 00406504 0 NOTYPE GLOBAL DEFAULT 4 __dynamic_start
117: 004011a4 32 FUNC GLOBAL DEFAULT 1 zlib_inflateEnd
118: 00402d54 88 FUNC GLOBAL DEFAULT 1 of_setprop
119: 00402bfc 344 FUNC GLOBAL DEFAULT 1 of_call_prom
No version information found in this file.
Attribute Section: gnu
File Attributes
Tag_GNU_Power_ABI_FP: Soft float
Tag_GNU_Power_ABI_Vector: Generic
Tag_GNU_Power_ABI_Struct_Return: Memory
--
Greetings, Michael.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox