LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] powerpc: return the_cpu_spec from identify_cpu
From: Scott Wood @ 2011-07-25 21:04 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

Commit af9eef3c7b1ed004c378c89b87642f4937337d50 caused cpu_setup to see
the_cpu_spec, rather than the source struct.  However, on 32-bit, the
return value of identify_cpu was being used for feature fixups, and
identify_cpu was returning the source struct.  So if cpu_setup patches
the feature bits, the update won't affect the fixups.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/kernel/cputable.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 9fb9332..fa44ff5 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -2051,7 +2051,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 
 static struct cpu_spec the_cpu_spec;
 
-static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
+static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
+					       struct cpu_spec *s)
 {
 	struct cpu_spec *t = &the_cpu_spec;
 	struct cpu_spec old;
@@ -2114,6 +2115,8 @@ static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
 		t->cpu_setup(offset, t);
 	}
 #endif /* CONFIG_PPC64 || CONFIG_BOOKE */
+
+	return t;
 }
 
 struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
@@ -2124,10 +2127,8 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 	s = PTRRELOC(s);
 
 	for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) {
-		if ((pvr & s->pvr_mask) == s->pvr_value) {
-			setup_cpu_spec(offset, s);
-			return s;
-		}
+		if ((pvr & s->pvr_mask) == s->pvr_value)
+			return setup_cpu_spec(offset, s);
 	}
 
 	BUG();
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH] powerpc: mtspr/mtmsr should take an unsigned long
From: Scott Wood @ 2011-07-25 21:02 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

Add a cast in case the caller passes in a different type, as it would
if mtspr/mtmsr were functions.

Previously, if a 64-bit type was passed in on 32-bit, GCC would bind the
constraint to a pair of registers, and would substitute the first register
in the pair in the asm code.  This corresponds to the upper half of the
64-bit register, which is generally not the desired behavior.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
If you're wondering why you'd pass a 64-bit value to one of these macros
on 32-bit, it came up when trying to load an SPR from kvm_vcpu_arch_shared.

 arch/powerpc/include/asm/reg.h |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 213d1d7..1b45133 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1007,13 +1007,16 @@
 #define mtmsrd(v)	__mtmsrd((v), 0)
 #define mtmsr(v)	mtmsrd(v)
 #else
-#define mtmsr(v)	asm volatile("mtmsr %0" : : "r" (v) : "memory")
+#define mtmsr(v)	asm volatile("mtmsr %0" : \
+				     : "r" ((unsigned long)(v)) \
+				     : "memory")
 #endif
 
 #define mfspr(rn)	({unsigned long rval; \
 			asm volatile("mfspr %0," __stringify(rn) \
 				: "=r" (rval)); rval;})
-#define mtspr(rn, v)	asm volatile("mtspr " __stringify(rn) ",%0" : : "r" (v)\
+#define mtspr(rn, v)	asm volatile("mtspr " __stringify(rn) ",%0" : \
+				     : "r" ((unsigned long)(v)) \
 				     : "memory")
 
 #ifdef __powerpc64__
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH] powerpc/nvram: Add compression to fit more oops output into NVRAM
From: Jim Keniston @ 2011-07-25 17:54 UTC (permalink / raw)
  To: benh, linuxppc-dev

Capture more than twice as much text from the printk buffer, and
compress it to fit it in the lnx,oops-log NVRAM partition.  You
can view the compressed text using the new (as of July 20) --unzip
option of the nvram command in the powerpc-utils package.

Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
---

 arch/powerpc/include/asm/rtas.h        |    6 +
 arch/powerpc/platforms/pseries/nvram.c |  171 +++++++++++++++++++++++++++++++-
 2 files changed, 168 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 58625d1..41f69ae 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -249,10 +249,12 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 #define ERR_FLAG_ALREADY_LOGGED	0x0
 #define ERR_FLAG_BOOT		0x1 	/* log was pulled from NVRAM on boot */
 #define ERR_TYPE_RTAS_LOG	0x2	/* from rtas event-scan */
-#define ERR_TYPE_KERNEL_PANIC	0x4	/* from panic() */
+#define ERR_TYPE_KERNEL_PANIC	0x4	/* from die()/panic() */
+#define ERR_TYPE_KERNEL_PANIC_GZ 0x8	/* ditto, compressed */
 
 /* All the types and not flags */
-#define ERR_TYPE_MASK	(ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC)
+#define ERR_TYPE_MASK \
+	(ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC | ERR_TYPE_KERNEL_PANIC_GZ)
 
 #define RTAS_DEBUG KERN_DEBUG "RTAS: "
  
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 00cc3a0..a76b228 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -18,6 +18,8 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/kmsg_dump.h>
+#include <linux/ctype.h>
+#include <linux/zlib.h>
 #include <asm/uaccess.h>
 #include <asm/nvram.h>
 #include <asm/rtas.h>
@@ -78,8 +80,41 @@ static struct kmsg_dumper nvram_kmsg_dumper = {
 #define NVRAM_RTAS_READ_TIMEOUT 5		/* seconds */
 static unsigned long last_unread_rtas_event;	/* timestamp */
 
-/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */
-static char *oops_buf;
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a prefix.
+ * The prefix is just a u16 holding the length of the compressed* text.
+ * (*Or uncompressed, if compression fails.)  oops_buf[] gets written
+ * to NVRAM.
+ *
+ * oops_len points to the prefix.  oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * |		+- oops_data
+ * v		v
+ * +------------+-----------------------------------------------+
+ * | length	| text                                          |
+ * | (2 bytes)	| (oops_data_sz bytes)                          |
+ * +------------+-----------------------------------------------+
+ * ^
+ * +- oops_len
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static u16 *oops_len;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
 
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
 {
@@ -387,11 +422,44 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
 						sizeof(rtas_log_partition));
 	}
 	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+	if (!oops_buf) {
+		pr_err("nvram: No memory for %s partition\n",
+						oops_log_partition.name);
+		return;
+	}
+	oops_len = (u16*) oops_buf;
+	oops_data = oops_buf + sizeof(u16);
+	oops_data_sz = oops_log_partition.size - sizeof(u16);
+
+	/*
+	 * Figure compression (preceded by elimination of each line's <n>
+	 * severity prefix) will reduce the oops/panic report to at most
+	 * 45% of its original size.
+	 */
+	big_oops_buf_sz = (oops_data_sz * 100) / 45;
+	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+	if (big_oops_buf) {
+		stream.workspace = kmalloc(zlib_deflate_workspacesize(
+				WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+		if (!stream.workspace) {
+			pr_err("nvram: No memory for compression workspace; "
+				"skipping compression of %s partition data\n",
+				oops_log_partition.name);
+			kfree(big_oops_buf);
+			big_oops_buf = NULL;
+		}
+	} else {
+		pr_err("No memory for uncompressed %s data; "
+			"skipping compression\n", oops_log_partition.name);
+		stream.workspace = NULL;
+	}
+
 	rc = kmsg_dump_register(&nvram_kmsg_dumper);
 	if (rc != 0) {
 		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
 		kfree(oops_buf);
-		return;
+		kfree(big_oops_buf);
+		kfree(stream.workspace);
 	}
 }
 
@@ -473,7 +541,83 @@ static int clobbering_unread_rtas_event(void)
 						NVRAM_RTAS_READ_TIMEOUT);
 }
 
-/* our kmsg_dump callback */
+/* Squeeze out each line's <n> severity prefix. */
+static size_t elide_severities(char *buf, size_t len)
+{
+	char *in, *out, *buf_end = buf + len;
+	/* Assume a <n> at the very beginning marks the start of a line. */
+	int newline = 1;
+
+	in = out = buf;
+	while (in < buf_end) {
+		if (newline && in+3 <= buf_end &&
+				*in == '<' && isdigit(in[1]) && in[2] == '>') {
+			in += 3;
+			newline = 0;
+		} else {
+			newline = (*in == '\n');
+			*out++ = *in++;
+		}
+	}
+	return out - buf;
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+								oops_data_sz);
+	if (zipped_len < 0) {
+		pr_err("nvram: compression failed; returned %d\n", zipped_len);
+		pr_err("nvram: logging uncompressed oops/panic report\n");
+		return -1;
+	}
+	*oops_len = (u16) zipped_len;
+	return 0;
+}
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer.  We want to capture as much
+ * of the printk buffer as possible.  First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition.  If that's too much, go back and capture uncompressed text.
+ */
 static void oops_to_nvram(struct kmsg_dumper *dumper,
 		enum kmsg_dump_reason reason,
 		const char *old_msgs, unsigned long old_len,
@@ -482,6 +626,8 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 	static unsigned int oops_count = 0;
 	static bool panicking = false;
 	size_t text_len;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+	int rc = -1;
 
 	switch (reason) {
 	case KMSG_DUMP_RESTART:
@@ -509,8 +655,19 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 	if (clobbering_unread_rtas_event())
 		return;
 
-	text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len,
-					oops_buf, oops_log_partition.size);
+	if (big_oops_buf) {
+		text_len = capture_last_msgs(old_msgs, old_len,
+			new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
+		text_len = elide_severities(big_oops_buf, text_len);
+		rc = zip_oops(text_len);
+	}
+	if (rc != 0) {
+		text_len = capture_last_msgs(old_msgs, old_len,
+				new_msgs, new_len, oops_data, oops_data_sz);
+		err_type = ERR_TYPE_KERNEL_PANIC;
+		*oops_len = (u16) text_len;
+	}
+
 	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
-		(int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count);
+		(int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
 }

^ permalink raw reply related

* Re: [PATCH 13/14] 85xx: consolidate of_platform_bus_probe calls
From: Scott Wood @ 2011-07-25 15:40 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov; +Cc: Paul Mackerras, Linux PPC Development
In-Reply-To: <CALT56yOZkLW=AmkyX77g5agxfBb0h7Fz8Q_enth-CJycJ3SkFA@mail.gmail.com>

On Sat, 23 Jul 2011 01:45:53 +0400
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> wrote:

> I see your point. I just wasn't thinking too much about out-of-tree trees.
> My thought was that if someone updates the kernel, he can also update the dtb.

Sometimes there are firmware dependencies that make that difficult.  And
even if it's just user laziness/forgetfulness, that still translates to
extra support requests.

> Could you please update the lbc.txt suggesting the compatibility
> with simple-bus for lbc? Or do you think that it would be wrong?
>
> I think we should define compatibility list as "fsl,mpcXXXX-localbus",
> "fsl,pqXXXXX-localbus", "simple-bus", noting that by default new
> platforms/boards should only use "simple-bus" internally. Does this
> look reasonable for you? I can then try to provide a patch.

I'm OK with saying that localbus nodes should have simple-bus in new trees,
and defining canonical compatible values (chips with eLBC should be
"fsl,XXXX-elbc", "fsl,elbc", "simple-bus").  I'm not sure what you mean by
"should only use simple-bus internally", especially in the context of the
binding.

> What do you suggest/prefer? To add .name="localbus" to generic code
> or to have board-specific hooks (like one for mpc834xemitx)?

Just add localbus to the generic table.

-Scott

^ permalink raw reply

* Re: perf PPC: kernel panic with callchains and context switch events
From: David Ahern @ 2011-07-25 15:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Anton Blanchard
  Cc: LKML, linux-perf-users, Paul Mackerras, linuxppc-dev
In-Reply-To: <1311558949.25044.614.camel@pasglop>

Hi Ben:

On 07/24/2011 07:55 PM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-07-24 at 11:18 -0600, David Ahern wrote:
>> On 07/20/2011 03:57 PM, David Ahern wrote:
>>> I am hoping someone familiar with PPC can help understand a panic that
>>> is generated when capturing callchains with context switch events.
>>>
>>> Call trace is below. The short of it is that walking the callchain
>>> generates a page fault. To handle the page fault the mmap_sem is needed,
>>> but it is currently held by setup_arg_pages. setup_arg_pages calls
>>> shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
>>> move_page_tables which has a cond_resched at the top of its for loop. If
>>> the cond_resched() is removed from move_page_tables everything works
>>> beautifully - no panics.
>>>
>>> So, the question: is it normal for walking the stack to trigger a page
>>> fault on PPC? The panic is not seen on x86 based systems.
>>
>> Can anyone confirm whether page faults while walking the stack are
>> normal for PPC? We really want to use the context switch event with
>> callchains and need to understand whether this behavior is normal. Of
>> course if it is normal, a way to address the problem without a panic
>> will be needed.
> 
> Now that leads to interesting discoveries :-) Becky, can you read all
> the way and let me know what you think ?
> 
> So, trying to walk the user stack directly will potentially cause page
> faults if it's done by direct access. So if you're going to do it in a
> spot where you can't afford it, you need to pagefault_disable() I
> suppose. I think the problem with our existing code is that it's missing
> those around __get_user_inatomic().
> 
> In fact, arguably, we don't want the hash code from modifying the hash
> either (or even hashing things in). Our 64-bit code handles it today in
> perf_callchain.c in a way that involves pretty much duplicating the
> functionality of __get_user_pages_fast() as used by x86 (see below), but
> as a fallback from a direct access which misses the pagefault_disable()
> as well.
> 
> I think it comes from an old assumption that this would always be called
> from an nmi, and the explicit tracepoints broke that assumption.
> 
> In fact we probably want to bump the NMI count, not just the IRQ count
> as pagefault_disable() does, to make sure we prevent hashing. 
> 
> x86 does things differently, using __get_user_pages_fast() (a variant of
> get_user_page_fast() that doesn't fallback to normal get_user_pages()).
> 
> Now, we could do the same (use __gup_fast too), but I can see a
> potential issue with ppc 32-bit platforms that have 64-bit PTEs, since
> we could end up GUP'ing in the middle of the two accesses.
> 
> Becky: I think gup_fast is generally broken on 32-bit with 64-bit PTE
> because of that, the problem isn't specific to perf backtraces, I'll
> propose a solution further down.
> 
> Now, on x86, there is a similar problem with PAE, which is handled by
> 
>  - having gup disable IRQs
>  - rely on the fact that to change from a valid value to another valid
>    value, the PTE will first get invalidated, which requires an IPI
>    and thus will be blocked by our interrupts being off
> 
> We do the first part, but the second part will break if we use HW TLB
> invalidation broadcast (yet another reason why those are bad, I think I
> will write a blog entry about it one of these days).
> 
> I think we can work around this while keeping our broadcast TLB
> invalidations by having the invalidation code also increment a global
> generation count (using the existing lock used by the invalidation code,
> all 32-bit platforms have such a lock).
> 
> From there, gup_fast can be changed to, with proper ordering, check the
> generation count around the loading of the PTE and loop if it has
> changed, kind-of a seqlock.
> 
> We also need the NMI count bump if we are going to try to keep the
> attempt at doing a direct access first for perfs.
> 
> Becky, do you feel like giving that a shot or should I find another
> victim ? (Or even do it myself ... ) :-)

Did you have something in mind besides the patch Anton sent? We'll give
that one a try and see how it works. (Thanks, Anton!)

David

> 
> Cheers,
> Ben.
> 
>> Thanks,
>> David
>>
>>>
>>>  [<b0180e00>]rb_erase+0x1b4/0x3e8
>>>  [<b00430f4>]__dequeue_entity+0x50/0xe8
>>>  [<b0043304>]set_next_entity+0x178/0x1bc
>>>  [<b0043440>]pick_next_task_fair+0xb0/0x118
>>>  [<b02ada80>]schedule+0x500/0x614
>>>  [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
>>>  [<b02afca0>]rwsem_down_read_failed+0x34/0x54
>>>  [<b02aed4c>]down_read+0x3c/0x54
>>>  [<b0023b58>]do_page_fault+0x114/0x5e8
>>>  [<b001e350>]handle_page_fault+0xc/0x80
>>>  [<b0022dec>]perf_callchain+0x224/0x31c
>>>  [<b009ba70>]perf_prepare_sample+0x240/0x2fc
>>>  [<b009d760>]__perf_event_overflow+0x280/0x398
>>>  [<b009d914>]perf_swevent_overflow+0x9c/0x10c
>>>  [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
>>>  [<b009dc38>]do_perf_sw_event+0x84/0xe4
>>>  [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
>>>  [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
>>>  [<b02ad840>]schedule+0x2c0/0x614
>>>  [<b0047dc0>]__cond_resched+0x34/0x90
>>>  [<b02adcc8>]_cond_resched+0x4c/0x68
>>>  [<b00bccf8>]move_page_tables+0xb0/0x418
>>>  [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
>>>  [<b0110914>]load_elf_binary+0x394/0x1208
>>>  [<b00d6e28>]search_binary_handler+0xe0/0x2c4
>>>  [<b00d834c>]do_execve+0x1bc/0x268
>>>  [<b0015394>]sys_execve+0x84/0xc8
>>>  [<b001df10>]ret_from_syscall+0x0/0x3c
>>>
>>> Thanks,
>>> David
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
> 
> 

^ permalink raw reply

* [GIT PULL] Please pull powerpc.git next branch
From: Kumar Gala @ 2011-07-25 14:08 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

[ a few minor fixes ]

The following changes since commit 50d2a4223bb875d1e3a7ee97d40dd03bf31ce1b7:

  powerpc: Copy back TIF flags on return from softirq stack (2011-07-22 13:38:58 +1000)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/galak/powerpc.git next

Fabio Baltieri (1):
      powerpc/85xx: fix mpic configuration in CAMP mode

Timur Tabi (1):
      drivers/virt: add missing linux/interrupt.h to fsl_hypervisor.c

 arch/powerpc/platforms/85xx/mpc85xx_ds.c  |    3 ++-
 arch/powerpc/platforms/85xx/mpc85xx_rdb.c |    5 +++--
 drivers/virt/fsl_hypervisor.c             |    1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

^ permalink raw reply

* Re: Linux 3.0 boot failure on the Powerbook G4
From: Michael Büsch @ 2011-07-25 13:03 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1311549818.25044.587.camel@pasglop>

On Mon, 25 Jul 2011 09:23:38 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> Hrm.. the faulting address is outside of the zImage. Odd.
> 
> Can you try loading a plain vmlinux instead ? (feel free to strip it).

The plain unstripped vmlinux boots fine:

mb@maggie:~$ uname -a
Linux maggie 3.0.0 #3 PREEMPT Sun Jul 24 11:51:30 CEST 2011 ppc GNU/Linux

Is there something going wrong in the uncompress trampoline?

-- 
Greetings, Michael.

^ permalink raw reply

* Re: [PATCH 0/5] ppc64 scheduler fixes
From: Peter Zijlstra @ 2011-07-25 12:41 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: mingo, linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>

On Mon, 2011-07-25 at 12:33 +1000, Anton Blanchard wrote:
> Here are a set of ppc64 scheduler fixes that help with some
> multi node performance issues.

They look fine to me. I'll probably ping you when I'll rip out all that
SD_NODES_PER_DOMAIN crap for good, but until then I'm fine with you
fiddling it for ppc64.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

^ permalink raw reply

* mtu issue with gianfar driver
From: Kumar Reddy Suresh-B22303 @ 2011-07-25 11:47 UTC (permalink / raw)
  To: linuxppc-dev@lists.ozlabs.org

[-- Attachment #1: Type: text/plain, Size: 1478 bytes --]

Hi All,

A problem was observed in gianfar driver when the interface MTU was modified to a small value.

FYI Kernel Version : 2.6.32 on PPC.

Like if we change the interface mtu to say 100, ping traffic with size greater than 450 is failing.
It was observed that packets ( ping requests) going out of that interface are getting properly fragmented, but the return packets ( ping replies ) are getting dropped by the interface.

To fix this issue the function gfar_change_mtu() in gianfar.c was modified as below:

rx_buffer_size is restored to DEFAULT_RX_BUFFER_SIZE as indicated in RED in the code snippet below

------------------------- CODE SNIPPET BEGIN ----------------------------------
                tempsize =
                    (frame_size & ~(INCREMENTAL_BUFFER_SIZE - 1)) +
                    INCREMENTAL_BUFFER_SIZE;

        if (tempsize < DEFAULT_RX_BUFFER_SIZE )
           tempsize = DEFAULT_RX_BUFFER_SIZE;

                /* Only stop and start the controller if it isn't already
                * stopped, and we changed something */
                if ((oldsize != tempsize) && (dev->flags & IFF_UP))
                                stop_gfar(dev);

                priv->rx_buffer_size = tempsize;

                dev->mtu = new_mtu;
------------------------- CODE SNIPPET END----------------------------------

Is this fix OK? What is the impact of this change on overall behavior?

Best Regards,
- Suresh

[-- Attachment #2: Type: text/html, Size: 8406 bytes --]

^ permalink raw reply

* [PATCH 3/3] powerpc/pseries: Simplify vpa deregistration functions
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20110725114631.778346293@samba.org>

The VPA, SLB shadow and DTL deregistration functions do not need an
address, so simplify things and remove it.

Also clean up pseries_kexec_cpu_down a bit by storing the cpu IDs
in local variables.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-powerpc/arch/powerpc/platforms/pseries/hotplug-cpu.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/hotplug-cpu.c	2011-07-25 21:06:49.390411273 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/hotplug-cpu.c	2011-07-25 21:06:57.380555950 +1000
@@ -135,7 +135,7 @@ static void pseries_mach_cpu_die(void)
 		get_lppaca()->idle = 0;
 
 		if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
-			unregister_slb_shadow(hwcpu, __pa(get_slb_shadow()));
+			unregister_slb_shadow(hwcpu);
 
 			/*
 			 * Call to start_secondary_resume() will not return.
@@ -150,7 +150,7 @@ static void pseries_mach_cpu_die(void)
 	WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
 
 	set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
-	unregister_slb_shadow(hwcpu, __pa(get_slb_shadow()));
+	unregister_slb_shadow(hwcpu);
 	rtas_stop_self();
 
 	/* Should never get here... */
Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:06:56.260535670 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:09:20.033141478 +1000
@@ -25,34 +25,30 @@ static void pseries_kexec_cpu_down(int c
 {
 	/* Don't risk a hypervisor call if we're crashing */
 	if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
-		unsigned long addr;
 		int ret;
+		int cpu = smp_processor_id();
+		int hwcpu = hard_smp_processor_id();
 
 		if (get_lppaca()->dtl_enable_mask) {
-			ret = unregister_dtl(hard_smp_processor_id());
+			ret = unregister_dtl(hwcpu);
 			if (ret) {
 				pr_err("WARNING: DTL deregistration for cpu "
 				       "%d (hw %d) failed with %d\n",
-				       smp_processor_id(),
-				       hard_smp_processor_id(), ret);
+				       cpu, hwcpu, ret);
 			}
 		}
 
-		addr = __pa(get_slb_shadow());
-		ret = unregister_slb_shadow(hard_smp_processor_id(), addr);
+		ret = unregister_slb_shadow(hwcpu);
 		if (ret) {
 			pr_err("WARNING: SLB shadow buffer deregistration "
 			       "for cpu %d (hw %d) failed with %d\n",
-			       smp_processor_id(),
-			       hard_smp_processor_id(), ret);
+			       cpu, hwcpu, ret);
 		}
 
-		addr = __pa(get_lppaca());
-		ret = unregister_vpa(hard_smp_processor_id(), addr);
+		ret = unregister_vpa(hwcpu);
 		if (ret) {
 			pr_err("WARNING: VPA deregistration for cpu %d "
-			       "(hw %d) failed with %d\n", smp_processor_id(),
-			       hard_smp_processor_id(), ret);
+			       "(hw %d) failed with %d\n", cpu, hwcpu, ret);
 		}
 	}
 }
Index: linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/plpar_wrappers.h	2011-07-25 21:06:52.340464687 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h	2011-07-25 21:06:57.380555950 +1000
@@ -53,9 +53,9 @@ static inline long vpa_call(unsigned lon
 	return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
 }
 
-static inline long unregister_vpa(unsigned long cpu, unsigned long vpa)
+static inline long unregister_vpa(unsigned long cpu)
 {
-	return vpa_call(0x5, cpu, vpa);
+	return vpa_call(0x5, cpu, 0);
 }
 
 static inline long register_vpa(unsigned long cpu, unsigned long vpa)
@@ -63,9 +63,9 @@ static inline long register_vpa(unsigned
 	return vpa_call(0x1, cpu, vpa);
 }
 
-static inline long unregister_slb_shadow(unsigned long cpu, unsigned long vpa)
+static inline long unregister_slb_shadow(unsigned long cpu)
 {
-	return vpa_call(0x7, cpu, vpa);
+	return vpa_call(0x7, cpu, 0);
 }
 
 static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)

^ permalink raw reply

* [PATCH 2/3] powerpc/pseries: Cleanup VPA registration and deregistration errors
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20110725114631.778346293@samba.org>

Make the VPA, SLB shadow and DTL registration and deregistration
functions print consistent messages on error. I needed the firmware
error code while chasing a kexec bug but we weren't printing it.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:06:52.340464687 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:06:56.260535670 +1000
@@ -39,17 +39,20 @@ static void pseries_kexec_cpu_down(int c
 		}
 
 		addr = __pa(get_slb_shadow());
-		if (unregister_slb_shadow(hard_smp_processor_id(), addr))
-			printk("SLB shadow buffer deregistration of "
-			       "cpu %u (hw_cpu_id %d) failed\n",
+		ret = unregister_slb_shadow(hard_smp_processor_id(), addr);
+		if (ret) {
+			pr_err("WARNING: SLB shadow buffer deregistration "
+			       "for cpu %d (hw %d) failed with %d\n",
 			       smp_processor_id(),
-			       hard_smp_processor_id());
+			       hard_smp_processor_id(), ret);
+		}
 
 		addr = __pa(get_lppaca());
-		if (unregister_vpa(hard_smp_processor_id(), addr)) {
-			printk("VPA deregistration of cpu %u (hw_cpu_id %d) "
-					"failed\n", smp_processor_id(),
-					hard_smp_processor_id());
+		ret = unregister_vpa(hard_smp_processor_id(), addr);
+		if (ret) {
+			pr_err("WARNING: VPA deregistration for cpu %d "
+			       "(hw %d) failed with %d\n", smp_processor_id(),
+			       hard_smp_processor_id(), ret);
 		}
 	}
 }
Index: linux-powerpc/arch/powerpc/platforms/pseries/lpar.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/lpar.c	2011-07-25 21:06:49.440412178 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/lpar.c	2011-07-25 21:06:56.260535670 +1000
@@ -67,9 +67,8 @@ void vpa_init(int cpu)
 	ret = register_vpa(hwcpu, addr);
 
 	if (ret) {
-		printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
-				"cpu %d (hw %d) of area %lx returns %ld\n",
-				cpu, hwcpu, addr, ret);
+		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
 		return;
 	}
 	/*
@@ -80,10 +79,9 @@ void vpa_init(int cpu)
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		ret = register_slb_shadow(hwcpu, addr);
 		if (ret)
-			printk(KERN_ERR
-			       "WARNING: vpa_init: SLB shadow buffer "
-			       "registration for cpu %d (hw %d) of area %lx "
-			       "returns %ld\n", cpu, hwcpu, addr, ret);
+			pr_err("WARNING: SLB shadow buffer registration for "
+			       "cpu %d (hw %d) of area %lx failed with %ld\n",
+			       cpu, hwcpu, addr, ret);
 	}
 
 	/*
@@ -100,8 +98,9 @@ void vpa_init(int cpu)
 		dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
 		ret = register_dtl(hwcpu, __pa(dtl));
 		if (ret)
-			pr_warn("DTL registration failed for cpu %d (%ld)\n",
-				cpu, ret);
+			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+			       "failed with %ld\n", smp_processor_id(),
+			       hwcpu, ret);
 		lppaca_of(cpu).dtl_enable_mask = 2;
 	}
 }
Index: linux-powerpc/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/setup.c	2011-07-25 21:06:49.450412359 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/setup.c	2011-07-25 21:06:56.260535670 +1000
@@ -324,8 +324,9 @@ static int alloc_dispatch_logs(void)
 	dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
 	ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
 	if (ret)
-		pr_warn("DTL registration failed for boot cpu %d (%d)\n",
-			smp_processor_id(), ret);
+		pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+		       "with %d\n", smp_processor_id(),
+		       hard_smp_processor_id(), ret);
 	get_paca()->lppaca_ptr->dtl_enable_mask = 2;
 
 	return 0;

^ permalink raw reply

* [PATCH 1/3] powerpc/pseries: Fix kexec on recent firmware versions
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, stable
In-Reply-To: <20110725114631.778346293@samba.org>

Recent versions of firmware will fail to unmap the virtual processor
area if we have a dispatch trace log registered. This causes kexec
to fail.

If a trace log is registered this patch unregisters it before the
SLB shadow and virtual processor areas, fixing the problem.

The address argument is ignored by firmware on unregister so we
may as well remove it.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: <stable@kernel.org>
---

Index: linux-powerpc/arch/powerpc/platforms/pseries/kexec.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:06:49.510413446 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/kexec.c	2011-07-25 21:06:52.340464687 +1000
@@ -26,6 +26,17 @@ static void pseries_kexec_cpu_down(int c
 	/* Don't risk a hypervisor call if we're crashing */
 	if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
 		unsigned long addr;
+		int ret;
+
+		if (get_lppaca()->dtl_enable_mask) {
+			ret = unregister_dtl(hard_smp_processor_id());
+			if (ret) {
+				pr_err("WARNING: DTL deregistration for cpu "
+				       "%d (hw %d) failed with %d\n",
+				       smp_processor_id(),
+				       hard_smp_processor_id(), ret);
+			}
+		}
 
 		addr = __pa(get_slb_shadow());
 		if (unregister_slb_shadow(hard_smp_processor_id(), addr))
Index: linux-powerpc/arch/powerpc/platforms/pseries/dtl.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/dtl.c	2011-07-25 21:06:49.520413628 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/dtl.c	2011-07-25 21:06:52.340464687 +1000
@@ -181,7 +181,7 @@ static void dtl_stop(struct dtl *dtl)
 
 	lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
 
-	unregister_dtl(hwcpu, __pa(dtl->buf));
+	unregister_dtl(hwcpu);
 }
 
 static u64 dtl_current_index(struct dtl *dtl)
Index: linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h
===================================================================
--- linux-powerpc.orig/arch/powerpc/platforms/pseries/plpar_wrappers.h	2011-07-25 21:06:49.500413264 +1000
+++ linux-powerpc/arch/powerpc/platforms/pseries/plpar_wrappers.h	2011-07-25 21:06:52.340464687 +1000
@@ -73,9 +73,9 @@ static inline long register_slb_shadow(u
 	return vpa_call(0x3, cpu, vpa);
 }
 
-static inline long unregister_dtl(unsigned long cpu, unsigned long vpa)
+static inline long unregister_dtl(unsigned long cpu)
 {
-	return vpa_call(0x6, cpu, vpa);
+	return vpa_call(0x6, cpu, 0);
 }
 
 static inline long register_dtl(unsigned long cpu, unsigned long vpa)

^ permalink raw reply

* [PATCH 0/3] pseries kexec fixes
From: Anton Blanchard @ 2011-07-25 11:46 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev

Here are a few pseries kexec fixes after testing on a recent firmware
version.

Anton

^ permalink raw reply

* [PATCH 5/5] powerpc/numa: Remove duplicate RECLAIM_DISTANCE definition
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>

We have two identical definitions of RECLAIM_DISTANCE, looks like
the patch got applied twice. Remove one.

Signed-off-by: Anton Blanchard <anton@samba.org> 
---

Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h	2011-07-25 12:15:33.059921510 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h	2011-07-25 12:15:46.750174446 +1000
@@ -19,16 +19,6 @@ struct device_node;
 #define RECLAIM_DISTANCE 10
 
 /*
- * Before going off node we want the VM to try and reclaim from the local
- * node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
- * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
- * 20, we never reclaim and go off node straight away.
- *
- * To fix this we choose a smaller value of RECLAIM_DISTANCE.
- */
-#define RECLAIM_DISTANCE 10
-
-/*
  * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
  * POWER7 boxes which have a maximum of 32 nodes.
  */

^ permalink raw reply

* [PATCH 4/5] powerpc/numa: Disable NEWIDLE balancing at node level
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>

On big POWER7 boxes we see large amounts of CPU time in system
processes like workqueue and watchdog kernel threads.

We currently rebalance the entire machine each time a task goes
idle and this is very expensive on large machines. Disable newidle
balancing at the node level and rely on the scheduler tick to
rebalance across nodes.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h	2011-07-25 12:14:25.448671947 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h	2011-07-25 12:14:26.568692651 +1000
@@ -75,7 +75,7 @@ static inline int pcibus_to_node(struct
 	.forkexec_idx		= 0,					\
 									\
 	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
+				| 0*SD_BALANCE_NEWIDLE			\
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\

^ permalink raw reply

* [PATCH 3/5] powerpc/numa: Increase SD_NODES_PER_DOMAIN to 32.
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>

The largest POWER7 boxes have 32 nodes. SD_NODES_PER_DOMAIN groups
nodes into chunks of 16 and adds a global balancing domain
(SD_ALLNODES) above it.

If we bump SD_NODES_PER_DOMAIN to 32, then we avoid this extra
level of balancing on our largest boxes.

Signed-off-by: Anton Blanchard <anton@samba.org> 
---

Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h	2011-07-25 11:43:24.954093179 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h	2011-07-25 11:43:31.274205122 +1000
@@ -28,6 +28,12 @@ struct device_node;
  */
 #define RECLAIM_DISTANCE 10
 
+/*
+ * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
+ * POWER7 boxes which have a maximum of 32 nodes.
+ */
+#define SD_NODES_PER_DOMAIN 32
+
 #include <asm/mmzone.h>
 
 static inline int cpu_to_node(int cpu)

^ permalink raw reply

* [PATCH 2/5] sched: Allow SD_NODES_PER_DOMAIN to be overridden
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20110725023311.175792493@samba.org>

We want to override the default value of SD_NODES_PER_DOMAIN on ppc64,
so move it into linux/topology.h.

Signed-off-by: Anton Blanchard <anton@samba.org> 
---

Index: linux-2.6-work/include/linux/topology.h
===================================================================
--- linux-2.6-work.orig/include/linux/topology.h	2011-07-25 11:20:02.588717796 +1000
+++ linux-2.6-work/include/linux/topology.h	2011-07-25 11:26:50.616468376 +1000
@@ -201,6 +201,10 @@ int arch_update_cpu_topology(void);
 	.balance_interval	= 64,					\
 }
 
+#ifndef SD_NODES_PER_DOMAIN
+#define SD_NODES_PER_DOMAIN 16
+#endif
+
 #ifdef CONFIG_SCHED_BOOK
 #ifndef SD_BOOK_INIT
 #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
Index: linux-2.6-work/kernel/sched.c
===================================================================
--- linux-2.6-work.orig/kernel/sched.c	2011-07-25 11:20:09.538850173 +1000
+++ linux-2.6-work/kernel/sched.c	2011-07-25 11:26:50.626468565 +1000
@@ -6938,8 +6938,6 @@ static int __init isolated_cpu_setup(cha
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#define SD_NODES_PER_DOMAIN 16
-
 #ifdef CONFIG_NUMA
 
 /**

^ permalink raw reply

* [PATCH 1/5] powerpc/numa: Enable SD_WAKE_AFFINE in node definition
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh
  Cc: fenghua.yu, tony.luck, linux-kernel, ralf, lethal, cmetcalf,
	linuxppc-dev, davem
In-Reply-To: <20110725023311.175792493@samba.org>

When chasing a performance issue on ppc64, I noticed tasks
communicating via a pipe would often end up on different nodes.

It turns out SD_WAKE_AFFINE is not set in our node definition. Commit
9fcd18c9e63e (sched: re-tune balancing) enabled SD_WAKE_AFFINE
in the node definition for x86 and we need a similar change for
ppc64.

I used lmbench lat_ctx and perf bench pipe to verify this fix. Each
benchmark was run 10 times and the average taken.


lmbench lat_ctx:

before:  66565 ops/sec
after:  204700 ops/sec

3.1x faster


perf bench pipe:

before: 5.6570 usecs
after:  1.3470 usecs

4.2x faster


Signed-off-by: Anton Blanchard <anton@samba.org>
---

Cc-ing arch maintainers who might need to look at their SD_NODE_INIT
definitions

Index: linux-2.6-work/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-work.orig/arch/powerpc/include/asm/topology.h	2011-07-18 16:24:55.639949552 +1000
+++ linux-2.6-work/arch/powerpc/include/asm/topology.h	2011-07-18 16:25:02.630074557 +1000
@@ -73,7 +73,7 @@ static inline int pcibus_to_node(struct
 				| 1*SD_BALANCE_EXEC			\
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
+				| 1*SD_WAKE_AFFINE			\
 				| 0*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\

^ permalink raw reply

* [PATCH 0/5] ppc64 scheduler fixes
From: Anton Blanchard @ 2011-07-25  2:33 UTC (permalink / raw)
  To: mingo, peterz, benh; +Cc: linuxppc-dev, linux-kernel

Here are a set of ppc64 scheduler fixes that help with some
multi node performance issues.

^ permalink raw reply

* Re: perf PPC: kernel panic with callchains and context switch events
From: Benjamin Herrenschmidt @ 2011-07-25  1:55 UTC (permalink / raw)
  To: David Ahern, Kumar Gala, Becky Bruce
  Cc: linux-perf-users, linuxppc-dev, Paul Mackerras, Anton Blanchard,
	LKML
In-Reply-To: <4E2C53E0.3020400@gmail.com>

On Sun, 2011-07-24 at 11:18 -0600, David Ahern wrote:
> On 07/20/2011 03:57 PM, David Ahern wrote:
> > I am hoping someone familiar with PPC can help understand a panic that
> > is generated when capturing callchains with context switch events.
> > 
> > Call trace is below. The short of it is that walking the callchain
> > generates a page fault. To handle the page fault the mmap_sem is needed,
> > but it is currently held by setup_arg_pages. setup_arg_pages calls
> > shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
> > move_page_tables which has a cond_resched at the top of its for loop. If
> > the cond_resched() is removed from move_page_tables everything works
> > beautifully - no panics.
> > 
> > So, the question: is it normal for walking the stack to trigger a page
> > fault on PPC? The panic is not seen on x86 based systems.
> 
> Can anyone confirm whether page faults while walking the stack are
> normal for PPC? We really want to use the context switch event with
> callchains and need to understand whether this behavior is normal. Of
> course if it is normal, a way to address the problem without a panic
> will be needed.

Now that leads to interesting discoveries :-) Becky, can you read all
the way and let me know what you think ?

So, trying to walk the user stack directly will potentially cause page
faults if it's done by direct access. So if you're going to do it in a
spot where you can't afford it, you need to pagefault_disable() I
suppose. I think the problem with our existing code is that it's missing
those around __get_user_inatomic().

In fact, arguably, we don't want the hash code modifying the hash
either (or even hashing things in). Our 64-bit code handles it today in
perf_callchain.c in a way that involves pretty much duplicating the
functionality of __get_user_pages_fast() as used by x86 (see below), but
as a fallback from a direct access which misses the pagefault_disable()
as well.

I think it comes from an old assumption that this would always be called
from an nmi, and the explicit tracepoints broke that assumption.

In fact we probably want to bump the NMI count, not just the IRQ count
as pagefault_disable() does, to make sure we prevent hashing. 

x86 does things differently, using __get_user_pages_fast() (a variant of
get_user_pages_fast() that doesn't fall back to normal get_user_pages()).

Now, we could do the same (use __gup_fast too), but I can see a
potential issue with ppc 32-bit platforms that have 64-bit PTEs, since
we could end up GUP'ing in the middle of the two accesses.

Becky: I think gup_fast is generally broken on 32-bit with 64-bit PTE
because of that, the problem isn't specific to perf backtraces, I'll
propose a solution further down.

Now, on x86, there is a similar problem with PAE, which is handled by

 - having gup disable IRQs
 - rely on the fact that to change from a valid value to another valid
   value, the PTE will first get invalidated, which requires an IPI
   and thus will be blocked by our interrupts being off

We do the first part, but the second part will break if we use HW TLB
invalidation broadcast (yet another reason why those are bad, I think I
will write a blog entry about it one of these days).

I think we can work around this while keeping our broadcast TLB
invalidations by having the invalidation code also increment a global
generation count (using the existing lock used by the invalidation code,
all 32-bit platforms have such a lock).

From there, gup_fast can be changed to, with proper ordering, check the
generation count around the loading of the PTE and loop if it has
changed, kind-of a seqlock.

We also need the NMI count bump if we are going to try to keep the
attempt at doing a direct access first for perfs.

Becky, do you feel like giving that a shot or should I find another
victim ? (Or even do it myself ... ) :-)

Cheers,
Ben.

> Thanks,
> David
> 
> > 
> >  [<b0180e00>]rb_erase+0x1b4/0x3e8
> >  [<b00430f4>]__dequeue_entity+0x50/0xe8
> >  [<b0043304>]set_next_entity+0x178/0x1bc
> >  [<b0043440>]pick_next_task_fair+0xb0/0x118
> >  [<b02ada80>]schedule+0x500/0x614
> >  [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
> >  [<b02afca0>]rwsem_down_read_failed+0x34/0x54
> >  [<b02aed4c>]down_read+0x3c/0x54
> >  [<b0023b58>]do_page_fault+0x114/0x5e8
> >  [<b001e350>]handle_page_fault+0xc/0x80
> >  [<b0022dec>]perf_callchain+0x224/0x31c
> >  [<b009ba70>]perf_prepare_sample+0x240/0x2fc
> >  [<b009d760>]__perf_event_overflow+0x280/0x398
> >  [<b009d914>]perf_swevent_overflow+0x9c/0x10c
> >  [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
> >  [<b009dc38>]do_perf_sw_event+0x84/0xe4
> >  [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
> >  [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
> >  [<b02ad840>]schedule+0x2c0/0x614
> >  [<b0047dc0>]__cond_resched+0x34/0x90
> >  [<b02adcc8>]_cond_resched+0x4c/0x68
> >  [<b00bccf8>]move_page_tables+0xb0/0x418
> >  [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
> >  [<b0110914>]load_elf_binary+0x394/0x1208
> >  [<b00d6e28>]search_binary_handler+0xe0/0x2c4
> >  [<b00d834c>]do_execve+0x1bc/0x268
> >  [<b0015394>]sys_execve+0x84/0xc8
> >  [<b001df10>]ret_from_syscall+0x0/0x3c
> > 
> > Thanks,
> > David
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* [PATCH] perf: powerpc: Disable pagefaults during callchain stack read
From: Anton Blanchard @ 2011-07-25  0:05 UTC (permalink / raw)
  To: David Ahern; +Cc: linux-perf-users, linuxppc-dev, Paul Mackerras, LKML
In-Reply-To: <4E2C53E0.3020400@gmail.com>

Hi David,

> > I am hoping someone familiar with PPC can help understand a panic
> > that is generated when capturing callchains with context switch
> > events.
> > 
> > Call trace is below. The short of it is that walking the callchain
> > generates a page fault. To handle the page fault the mmap_sem is
> > needed, but it is currently held by setup_arg_pages.
> > setup_arg_pages calls shift_arg_pages with the mmap_sem held.
> > shift_arg_pages then calls move_page_tables which has a
> > cond_resched at the top of its for loop. If the cond_resched() is
> > removed from move_page_tables everything works beautifully - no
> > panics.
> > 
> > So, the question: is it normal for walking the stack to trigger a
> > page fault on PPC? The panic is not seen on x86 based systems.
> 
> Can anyone confirm whether page faults while walking the stack are
> normal for PPC? We really want to use the context switch event with
> callchains and need to understand whether this behavior is normal. Of
> course if it is normal, a way to address the problem without a panic
> will be needed.

I talked to Ben about this last week and he pointed me at
pagefault_disable/enable. Untested patch below.

Anton

--

We need to disable pagefaults when reading the stack otherwise
we can lock up trying to take the mmap_sem when the code we are
profiling already has a write lock taken.

This will not happen for hardware events, but could for software
events.

Reported-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: <stable@kernel.org>
---

Index: linux-powerpc/arch/powerpc/kernel/perf_callchain.c
===================================================================
--- linux-powerpc.orig/arch/powerpc/kernel/perf_callchain.c	2011-07-25 09:54:27.296757427 +1000
+++ linux-powerpc/arch/powerpc/kernel/perf_callchain.c	2011-07-25 09:56:08.828367882 +1000
@@ -154,8 +154,12 @@ static int read_user_stack_64(unsigned l
 	    ((unsigned long)ptr & 7))
 		return -EFAULT;
 
-	if (!__get_user_inatomic(*ret, ptr))
+	pagefault_disable();
+	if (!__get_user_inatomic(*ret, ptr)) {
+		pagefault_enable();
 		return 0;
+	}
+	pagefault_enable();
 
 	return read_user_stack_slow(ptr, ret, 8);
 }
@@ -166,8 +170,12 @@ static int read_user_stack_32(unsigned i
 	    ((unsigned long)ptr & 3))
 		return -EFAULT;
 
-	if (!__get_user_inatomic(*ret, ptr))
+	pagefault_disable();
+	if (!__get_user_inatomic(*ret, ptr)) {
+		pagefault_enable();
 		return 0;
+	}
+	pagefault_enable();
 
 	return read_user_stack_slow(ptr, ret, 4);
 }

^ permalink raw reply

* Re: Linux 3.0 boot failure on the Powerbook G4
From: Benjamin Herrenschmidt @ 2011-07-24 23:23 UTC (permalink / raw)
  To: Michael Büsch; +Cc: linuxppc-dev
In-Reply-To: <20110724143729.49c69ce8@maggie>

On Sun, 2011-07-24 at 14:37 +0200, Michael Büsch wrote:
> On Sun, 24 Jul 2011 22:13:34 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > > I'm booting zImage.pmac.
> > 
> > Ah that might make it easier... I don't remember where it links, can you
> > show me the program headers out of readelf -a of the zImage ?
> 
> As I recompiled stuff, here's the current failure log:
> http://bues.ch/misc/linux-3.0-pbook-2.jpg
> 
> And this is the corresponding readelf output:

Hrm.. the faulting address is outside of the zImage. Odd.

Can you try loading a plain vmlinux instead ? (feel free to strip it).

yaboot 1.3.13 might not be the best one to load a real ELF ...

On my side I'll dig out one of my old powerbooks and see if I can reproduce
(I generally tend to netboot the zImage directly, but it needs to be <
4M for that to work due to Apple OF limitations, or use yaboot with plain
vmlinux which exercises a different code path within yaboot).

Cheers,
Ben.

> mb@maggie:~$ readelf -a /boot/linux.a
> ELF Header:
>   Magic:   7f 45 4c 46 01 02 01 00 00 00 00 00 00 00 00 00 
>   Class:                             ELF32
>   Data:                              2's complement, big endian
>   Version:                           1 (current)
>   OS/ABI:                            UNIX - System V
>   ABI Version:                       0
>   Type:                              EXEC (Executable file)
>   Machine:                           PowerPC
>   Version:                           0x1
>   Entry point address:               0x400230
>   Start of program headers:          52 (bytes into file)
>   Start of section headers:          5769716 (bytes into file)
>   Flags:                             0x8000, relocatable-lib
>   Size of this header:               52 (bytes)
>   Size of program headers:           32 (bytes)
>   Number of program headers:         2
>   Size of section headers:           40 (bytes)
>   Number of section headers:         12
>   Section header string table index: 9
> 
> Section Headers:
>   [Nr] Name              Type            Addr     Off    Size   ES Flg Lk Inf Al
>   [ 0]                   NULL            00000000 000000 000000 00      0   0  0
>   [ 1] .text             PROGBITS        00400000 010000 0048b0 00  AX  0   0  4
>   [ 2] .data             PROGBITS        00405000 015000 0012f8 00  WA  0   0  4
>   [ 3] .got              PROGBITS        004062f8 0162f8 00000c 04  WA  0   0  4
>   [ 4] __builtin_cmdline PROGBITS        00406304 016304 000200 00  WA  0   0  4
>   [ 5] .kernel:vmlinux.s PROGBITS        00407000 017000 569952 00   A  0   0  1
>   [ 6] .bss              NOBITS          00971000 580952 00bc70 00  WA  0   0  4
>   [ 7] .comment          PROGBITS        00000000 580952 00001c 01  MS  0   0  1
>   [ 8] .gnu.attributes   LOOS+ffffff5    00000000 58096e 000014 00      0   0  1
>   [ 9] .shstrtab         STRTAB          00000000 580982 000072 00      0   0  1
>   [10] .symtab           SYMTAB          00000000 580bd4 000780 10     11  55  4
>   [11] .strtab           STRTAB          00000000 581354 0004f3 00      0   0  1
> Key to Flags:
>   W (write), A (alloc), X (execute), M (merge), S (strings)
>   I (info), L (link order), G (group), x (unknown)
>   O (extra OS processing required) o (OS specific), p (processor specific)
> 
> There are no section groups in this file.
> 
> Program Headers:
>   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
>   LOAD           0x010000 0x00400000 0x00400000 0x570952 0x57cc70 RWE 0x10000
>   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
> 
>  Section to Segment mapping:
>   Segment Sections...
>    00     .text .data .got __builtin_cmdline .kernel:vmlinux.strip .bss 
>    01     
> 
> There is no dynamic section in this file.
> 
> There are no relocations in this file.
> 
> There are no unwind sections in this file.
> 
> Symbol table '.symtab' contains 120 entries:
>    Num:    Value  Size Type    Bind   Vis      Ndx Name
>      0: 00000000     0 NOTYPE  LOCAL  DEFAULT  UND 
>      1: 00400000     0 SECTION LOCAL  DEFAULT    1 
>      2: 00405000     0 SECTION LOCAL  DEFAULT    2 
>      3: 004062f8     0 SECTION LOCAL  DEFAULT    3 
>      4: 00406304     0 SECTION LOCAL  DEFAULT    4 
>      5: 00407000     0 SECTION LOCAL  DEFAULT    5 
>      6: 00971000     0 SECTION LOCAL  DEFAULT    6 
>      7: 00000000     0 SECTION LOCAL  DEFAULT    7 
>      8: 00000000     0 SECTION LOCAL  DEFAULT    8 
>      9: 00000000     0 FILE    LOCAL  DEFAULT  ABS of.c
>     10: 00400000    96 FUNC    LOCAL  DEFAULT    1 of_image_hdr
>     11: 00400130   220 FUNC    LOCAL  DEFAULT    1 of_try_claim
>     12: 00971000     4 OBJECT  LOCAL  DEFAULT    6 claim_base
>     13: 00000000     0 FILE    LOCAL  DEFAULT  ABS empty.c
>     14: 0040021c     0 NOTYPE  LOCAL  DEFAULT    1 p_start
>     15: 00400220     0 NOTYPE  LOCAL  DEFAULT    1 p_etext
>     16: 00400224     0 NOTYPE  LOCAL  DEFAULT    1 p_bss_start
>     17: 00400228     0 NOTYPE  LOCAL  DEFAULT    1 p_end
>     18: 0040022c     0 NOTYPE  LOCAL  DEFAULT    1 p_pstack
>     19: 00400234     0 NOTYPE  LOCAL  DEFAULT    1 p_base
>     20: 00000007     0 NOTYPE  LOCAL  DEFAULT  ABS RELA
>     21: 6ffffff9     0 NOTYPE  LOCAL  DEFAULT  ABS RELACOUNT
>     22: 00000000     0 FILE    LOCAL  DEFAULT  ABS main.c
>     23: 0040032c   536 FUNC    LOCAL  DEFAULT    1 prep_kernel
>     24: 00971004 46960 OBJECT  LOCAL  DEFAULT    6 gzstate
>     25: 00406304   512 OBJECT  LOCAL  DEFAULT    4 cmdline
>     26: 00000000     0 FILE    LOCAL  DEFAULT  ABS gunzip_util.c
>     27: 0097c774   128 OBJECT  LOCAL  DEFAULT    6 discard_buf.1439
>     28: 00000000     0 FILE    LOCAL  DEFAULT  ABS elf_util.c
>     29: 00000000     0 FILE    LOCAL  DEFAULT  ABS inflate.c
>     30: 00400ed4   424 FUNC    LOCAL  DEFAULT    1 zlib_adler32
>     31: 004011c4   292 FUNC    LOCAL  DEFAULT    1 zlib_updatewindow
>     32: 00405484  2048 OBJECT  LOCAL  DEFAULT    2 lenfix.1147
>     33: 00405c84   128 OBJECT  LOCAL  DEFAULT    2 distfix.1148
>     34: 00405d04    38 OBJECT  LOCAL  DEFAULT    2 order.1216
>     35: 00000000     0 FILE    LOCAL  DEFAULT  ABS inftrees.c
>     36: 00405e8e    62 OBJECT  LOCAL  DEFAULT    2 lext.1062
>     37: 00405ecc    62 OBJECT  LOCAL  DEFAULT    2 lbase.1061
>     38: 00405f0a    64 OBJECT  LOCAL  DEFAULT    2 dext.1064
>     39: 00405f4a    64 OBJECT  LOCAL  DEFAULT    2 dbase.1063
>     40: 00000000     0 FILE    LOCAL  DEFAULT  ABS oflib.c
>     41: 00402a4c   432 FUNC    LOCAL  DEFAULT    1 of_call_prom_ret
>     42: 0040611c     4 OBJECT  LOCAL  DEFAULT    2 need_map
>     43: 0097c7f4     4 OBJECT  LOCAL  DEFAULT    6 prom
>     44: 0097c7f8     4 OBJECT  LOCAL  DEFAULT    6 chosen_mmu
>     45: 0097c7fc     4 OBJECT  LOCAL  DEFAULT    6 memory
>     46: 00000000     0 FILE    LOCAL  DEFAULT  ABS ofconsole.c
>     47: 004032b0   104 FUNC    LOCAL  DEFAULT    1 of_console_open
>     48: 0040325c    84 FUNC    LOCAL  DEFAULT    1 of_console_write
>     49: 0097c800     4 OBJECT  LOCAL  DEFAULT    6 of_stdout_handle
>     50: 00000000     0 FILE    LOCAL  DEFAULT  ABS stdio.c
>     51: 0040369c   848 FUNC    LOCAL  DEFAULT    1 number
>     52: 0097c804  1024 OBJECT  LOCAL  DEFAULT    6 sprint_buf
>     53: 00000000     0 FILE    LOCAL  DEFAULT  ABS inffast.c
>     54: 004062f8     0 OBJECT  LOCAL  HIDDEN    3 _GLOBAL_OFFSET_TABLE_
>     55: 00400060   208 FUNC    GLOBAL DEFAULT    1 platform_init
>     56: 00403318     0 NOTYPE  GLOBAL DEFAULT    1 strcpy
>     57: 00000000     0 NOTYPE  WEAK   DEFAULT  UND _platform_stack_top
>     58: 00400924   240 FUNC    GLOBAL DEFAULT    1 gunzip_partial
>     59: 0040413c   188 FUNC    GLOBAL DEFAULT    1 printf
>     60: 004039ec  1872 FUNC    GLOBAL DEFAULT    1 vsprintf
>     61: 0040426c     0 NOTYPE  GLOBAL DEFAULT    1 __div64_32
>     62: 00403468     0 NOTYPE  GLOBAL DEFAULT    1 memmove
>     63: 00402a10    60 FUNC    GLOBAL DEFAULT    1 of_init
>     64: 00406508     0 NOTYPE  GLOBAL DEFAULT    4 _dtb_start
>     65: 0040020c     0 NOTYPE  GLOBAL DEFAULT    1 _zimage_start_opd
>     66: 004048b0     0 NOTYPE  GLOBAL DEFAULT    1 _etext
>     67: 00402e04    72 FUNC    GLOBAL DEFAULT    1 of_finddevice
>     68: 00401088   132 FUNC    GLOBAL DEFAULT    1 zlib_inflateReset
>     69: 00403470     0 NOTYPE  GLOBAL DEFAULT    1 memcpy
>     70: 00403624     0 NOTYPE  GLOBAL DEFAULT    1 flush_cache
>     71: 0040430c  1444 FUNC    GLOBAL DEFAULT    1 inflate_fast
>     72: 00407000     0 NOTYPE  GLOBAL DEFAULT    5 _vmlinux_start
>     73: 0040110c   152 FUNC    GLOBAL DEFAULT    1 zlib_inflateInit2
>     74: 00402dac    88 FUNC    GLOBAL DEFAULT    1 of_getprop
>     75: 00400b80   484 FUNC    GLOBAL DEFAULT    1 gunzip_start
>     76: 0097cc04    20 OBJECT  GLOBAL DEFAULT    6 loader_info
>     77: 0097cc18    28 OBJECT  GLOBAL DEFAULT    6 platform_ops
>     78: 00403140   212 FUNC    GLOBAL DEFAULT    1 of_vmlinux_alloc
>     79: 00400a7c   120 FUNC    GLOBAL DEFAULT    1 gunzip_exactly
>     80: 004012e8   240 FUNC    GLOBAL DEFAULT    1 zlib_inflateIncomp
>     81: 00400d64   200 FUNC    GLOBAL DEFAULT    1 parse_elf64
>     82: 0097cc34    20 OBJECT  GLOBAL DEFAULT    6 console_ops
>     83: 00403650    76 FUNC    GLOBAL DEFAULT    1 strnlen
>     84: 00400a14   104 FUNC    GLOBAL DEFAULT    1 gunzip_finish
>     85: 00402e90   688 FUNC    GLOBAL DEFAULT    1 of_claim
>     86: 00402480  1424 FUNC    GLOBAL DEFAULT    1 zlib_inflate_table
>     87: 00400af4   140 FUNC    GLOBAL DEFAULT    1 gunzip_discard
>     88: 004013d8  4264 FUNC    GLOBAL DEFAULT    1 zlib_inflate
>     89: 00400e2c   168 FUNC    GLOBAL DEFAULT    1 parse_elf32
>     90: 0040335c     0 NOTYPE  GLOBAL DEFAULT    1 strcat
>     91: 00402e4c    68 FUNC    GLOBAL DEFAULT    1 of_exit
>     92: 004035cc     0 NOTYPE  GLOBAL DEFAULT    1 memchr
>     93: 00400000     0 NOTYPE  GLOBAL DEFAULT    1 _start
>     94: 004033cc     0 NOTYPE  GLOBAL DEFAULT    1 strncmp
>     95: 00403214    72 FUNC    GLOBAL DEFAULT    1 of_console_init
>     96: 0040107c    12 FUNC    GLOBAL DEFAULT    1 zlib_inflate_workspacesiz
>     97: 00403334     0 NOTYPE  GLOBAL DEFAULT    1 strncpy
>     98: 004035f4     0 NOTYPE  GLOBAL DEFAULT    1 memcmp
>     99: 00971000     0 NOTYPE  GLOBAL DEFAULT    5 _initrd_start
>    100: 00400230     0 NOTYPE  WEAK   DEFAULT    1 _zimage_start
>    101: 00403528     0 NOTYPE  GLOBAL DEFAULT    1 backwards_memcpy
>    102: 00971000     0 NOTYPE  GLOBAL DEFAULT    6 __bss_start
>    103: 0040340c     0 NOTYPE  GLOBAL DEFAULT    1 memset
>    104: 00406508     0 NOTYPE  GLOBAL DEFAULT    4 _dtb_end
>    105: 00971000     0 NOTYPE  GLOBAL DEFAULT    5 _initrd_end
>    106: 0097cc48    40 OBJECT  GLOBAL DEFAULT    6 dt_ops
>    107: 004033a8     0 NOTYPE  GLOBAL DEFAULT    1 strcmp
>    108: 004041f8   116 FUNC    GLOBAL DEFAULT    1 sprintf
>    109: 00971000     0 NOTYPE  GLOBAL DEFAULT    6 _edata
>    110: 0097cc70     0 NOTYPE  GLOBAL DEFAULT    6 _end
>    111: 00400544   992 FUNC    GLOBAL DEFAULT    1 start
>    112: 00970952     0 NOTYPE  GLOBAL DEFAULT    5 _vmlinux_end
>    113: 004033f4     0 NOTYPE  GLOBAL DEFAULT    1 strlen
>    114: 00403388     0 NOTYPE  GLOBAL DEFAULT    1 strchr
>    115: 00400230     0 NOTYPE  GLOBAL DEFAULT    1 _zimage_start_lib
>    116: 00406504     0 NOTYPE  GLOBAL DEFAULT    4 __dynamic_start
>    117: 004011a4    32 FUNC    GLOBAL DEFAULT    1 zlib_inflateEnd
>    118: 00402d54    88 FUNC    GLOBAL DEFAULT    1 of_setprop
>    119: 00402bfc   344 FUNC    GLOBAL DEFAULT    1 of_call_prom
> 
> No version information found in this file.
> Attribute Section: gnu
> File Attributes
>   Tag_GNU_Power_ABI_FP: Soft float
>   Tag_GNU_Power_ABI_Vector: Generic
>   Tag_GNU_Power_ABI_Struct_Return: Memory
> 
> 

^ permalink raw reply

* Re: perf PPC: kernel panic with callchains and context switch events
From: David Ahern @ 2011-07-24 17:18 UTC (permalink / raw)
  To: Anton Blanchard, Paul Mackerras, linux-perf-users, LKML,
	linuxppc-dev
In-Reply-To: <4E274F5F.7000604@gmail.com>

On 07/20/2011 03:57 PM, David Ahern wrote:
> I am hoping someone familiar with PPC can help understand a panic that
> is generated when capturing callchains with context switch events.
> 
> Call trace is below. The short of it is that walking the callchain
> generates a page fault. To handle the page fault the mmap_sem is needed,
> but it is currently held by setup_arg_pages. setup_arg_pages calls
> shift_arg_pages with the mmap_sem held. shift_arg_pages then calls
> move_page_tables which has a cond_resched at the top of its for loop. If
> the cond_resched() is removed from move_page_tables everything works
> beautifully - no panics.
> 
> So, the question: is it normal for walking the stack to trigger a page
> fault on PPC? The panic is not seen on x86 based systems.

Can anyone confirm whether page faults while walking the stack are
normal for PPC? We really want to use the context switch event with
callchains and need to understand whether this behavior is normal. Of
course if it is normal, a way to address the problem without a panic
will be needed.

Thanks,
David

> 
>  [<b0180e00>]rb_erase+0x1b4/0x3e8
>  [<b00430f4>]__dequeue_entity+0x50/0xe8
>  [<b0043304>]set_next_entity+0x178/0x1bc
>  [<b0043440>]pick_next_task_fair+0xb0/0x118
>  [<b02ada80>]schedule+0x500/0x614
>  [<b02afaa8>]rwsem_down_failed_common+0xf0/0x264
>  [<b02afca0>]rwsem_down_read_failed+0x34/0x54
>  [<b02aed4c>]down_read+0x3c/0x54
>  [<b0023b58>]do_page_fault+0x114/0x5e8
>  [<b001e350>]handle_page_fault+0xc/0x80
>  [<b0022dec>]perf_callchain+0x224/0x31c
>  [<b009ba70>]perf_prepare_sample+0x240/0x2fc
>  [<b009d760>]__perf_event_overflow+0x280/0x398
>  [<b009d914>]perf_swevent_overflow+0x9c/0x10c
>  [<b009db54>]perf_swevent_ctx_event+0x1d0/0x230
>  [<b009dc38>]do_perf_sw_event+0x84/0xe4
>  [<b009dde8>]perf_sw_event_context_switch+0x150/0x1b4
>  [<b009de90>]perf_event_task_sched_out+0x44/0x2d4
>  [<b02ad840>]schedule+0x2c0/0x614
>  [<b0047dc0>]__cond_resched+0x34/0x90
>  [<b02adcc8>]_cond_resched+0x4c/0x68
>  [<b00bccf8>]move_page_tables+0xb0/0x418
>  [<b00d7ee0>]setup_arg_pages+0x184/0x2a0
>  [<b0110914>]load_elf_binary+0x394/0x1208
>  [<b00d6e28>]search_binary_handler+0xe0/0x2c4
>  [<b00d834c>]do_execve+0x1bc/0x268
>  [<b0015394>]sys_execve+0x84/0xc8
>  [<b001df10>]ret_from_syscall+0x0/0x3c
> 
> Thanks,
> David

^ permalink raw reply

* Re: [PATCH 2/5] hugetlb: add phys addr to struct huge_bootmem_page
From: Tabi Timur-B04825 @ 2011-07-24 16:48 UTC (permalink / raw)
  To: Becky Bruce
  Cc: linuxppc-dev@lists.ozlabs.org,
	List linux-kernel@vger.kernel.org Mailing, David Gibson
In-Reply-To: <786B027A-4EC8-4175-A18D-9DA57E9549D6@kernel.crashing.org>

On Thu, Jun 30, 2011 at 1:50 PM, Becky Bruce <beckyb@kernel.crashing.org> wrote:

> Because there was no bootmem allocation in the normal case - the non-highmem
> version stores data structure in the huge page itself.  This is perfectly fine as long
> as you have a mapping.  Since this isn't true for HIGHMEM pages, I allocate
> bootmem to store the early data structure that stores information about the
> hugepage (this happens in arch-specific code in alloc_bootmem_huge_page).

I would put this text in a comment in the code.

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: Linux 3.0 boot failure on the Powerbook G4
From: Michael Büsch @ 2011-07-24 12:37 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1311509614.25044.585.camel@pasglop>

On Sun, 24 Jul 2011 22:13:34 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > I'm booting zImage.pmac.
> 
> Ah that might make it easier... I don't remember where it links, can you
> show me the program headers out of readelf -a of the zImage ?

As I recompiled stuff, here's the current failure log:
http://bues.ch/misc/linux-3.0-pbook-2.jpg

And this is the corresponding readelf output:

mb@maggie:~$ readelf -a /boot/linux.a
ELF Header:
  Magic:   7f 45 4c 46 01 02 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, big endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           PowerPC
  Version:                           0x1
  Entry point address:               0x400230
  Start of program headers:          52 (bytes into file)
  Start of section headers:          5769716 (bytes into file)
  Flags:                             0x8000, relocatable-lib
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         2
  Size of section headers:           40 (bytes)
  Number of section headers:         12
  Section header string table index: 9

Section Headers:
  [Nr] Name              Type            Addr     Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            00000000 000000 000000 00      0   0  0
  [ 1] .text             PROGBITS        00400000 010000 0048b0 00  AX  0   0  4
  [ 2] .data             PROGBITS        00405000 015000 0012f8 00  WA  0   0  4
  [ 3] .got              PROGBITS        004062f8 0162f8 00000c 04  WA  0   0  4
  [ 4] __builtin_cmdline PROGBITS        00406304 016304 000200 00  WA  0   0  4
  [ 5] .kernel:vmlinux.s PROGBITS        00407000 017000 569952 00   A  0   0  1
  [ 6] .bss              NOBITS          00971000 580952 00bc70 00  WA  0   0  4
  [ 7] .comment          PROGBITS        00000000 580952 00001c 01  MS  0   0  1
  [ 8] .gnu.attributes   LOOS+ffffff5    00000000 58096e 000014 00      0   0  1
  [ 9] .shstrtab         STRTAB          00000000 580982 000072 00      0   0  1
  [10] .symtab           SYMTAB          00000000 580bd4 000780 10     11  55  4
  [11] .strtab           STRTAB          00000000 581354 0004f3 00      0   0  1
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings)
  I (info), L (link order), G (group), x (unknown)
  O (extra OS processing required) o (OS specific), p (processor specific)

There are no section groups in this file.

Program Headers:
  Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD           0x010000 0x00400000 0x00400000 0x570952 0x57cc70 RWE 0x10000
  GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4

 Section to Segment mapping:
  Segment Sections...
   00     .text .data .got __builtin_cmdline .kernel:vmlinux.strip .bss 
   01     

There is no dynamic section in this file.

There are no relocations in this file.

There are no unwind sections in this file.

Symbol table '.symtab' contains 120 entries:
   Num:    Value  Size Type    Bind   Vis      Ndx Name
     0: 00000000     0 NOTYPE  LOCAL  DEFAULT  UND 
     1: 00400000     0 SECTION LOCAL  DEFAULT    1 
     2: 00405000     0 SECTION LOCAL  DEFAULT    2 
     3: 004062f8     0 SECTION LOCAL  DEFAULT    3 
     4: 00406304     0 SECTION LOCAL  DEFAULT    4 
     5: 00407000     0 SECTION LOCAL  DEFAULT    5 
     6: 00971000     0 SECTION LOCAL  DEFAULT    6 
     7: 00000000     0 SECTION LOCAL  DEFAULT    7 
     8: 00000000     0 SECTION LOCAL  DEFAULT    8 
     9: 00000000     0 FILE    LOCAL  DEFAULT  ABS of.c
    10: 00400000    96 FUNC    LOCAL  DEFAULT    1 of_image_hdr
    11: 00400130   220 FUNC    LOCAL  DEFAULT    1 of_try_claim
    12: 00971000     4 OBJECT  LOCAL  DEFAULT    6 claim_base
    13: 00000000     0 FILE    LOCAL  DEFAULT  ABS empty.c
    14: 0040021c     0 NOTYPE  LOCAL  DEFAULT    1 p_start
    15: 00400220     0 NOTYPE  LOCAL  DEFAULT    1 p_etext
    16: 00400224     0 NOTYPE  LOCAL  DEFAULT    1 p_bss_start
    17: 00400228     0 NOTYPE  LOCAL  DEFAULT    1 p_end
    18: 0040022c     0 NOTYPE  LOCAL  DEFAULT    1 p_pstack
    19: 00400234     0 NOTYPE  LOCAL  DEFAULT    1 p_base
    20: 00000007     0 NOTYPE  LOCAL  DEFAULT  ABS RELA
    21: 6ffffff9     0 NOTYPE  LOCAL  DEFAULT  ABS RELACOUNT
    22: 00000000     0 FILE    LOCAL  DEFAULT  ABS main.c
    23: 0040032c   536 FUNC    LOCAL  DEFAULT    1 prep_kernel
    24: 00971004 46960 OBJECT  LOCAL  DEFAULT    6 gzstate
    25: 00406304   512 OBJECT  LOCAL  DEFAULT    4 cmdline
    26: 00000000     0 FILE    LOCAL  DEFAULT  ABS gunzip_util.c
    27: 0097c774   128 OBJECT  LOCAL  DEFAULT    6 discard_buf.1439
    28: 00000000     0 FILE    LOCAL  DEFAULT  ABS elf_util.c
    29: 00000000     0 FILE    LOCAL  DEFAULT  ABS inflate.c
    30: 00400ed4   424 FUNC    LOCAL  DEFAULT    1 zlib_adler32
    31: 004011c4   292 FUNC    LOCAL  DEFAULT    1 zlib_updatewindow
    32: 00405484  2048 OBJECT  LOCAL  DEFAULT    2 lenfix.1147
    33: 00405c84   128 OBJECT  LOCAL  DEFAULT    2 distfix.1148
    34: 00405d04    38 OBJECT  LOCAL  DEFAULT    2 order.1216
    35: 00000000     0 FILE    LOCAL  DEFAULT  ABS inftrees.c
    36: 00405e8e    62 OBJECT  LOCAL  DEFAULT    2 lext.1062
    37: 00405ecc    62 OBJECT  LOCAL  DEFAULT    2 lbase.1061
    38: 00405f0a    64 OBJECT  LOCAL  DEFAULT    2 dext.1064
    39: 00405f4a    64 OBJECT  LOCAL  DEFAULT    2 dbase.1063
    40: 00000000     0 FILE    LOCAL  DEFAULT  ABS oflib.c
    41: 00402a4c   432 FUNC    LOCAL  DEFAULT    1 of_call_prom_ret
    42: 0040611c     4 OBJECT  LOCAL  DEFAULT    2 need_map
    43: 0097c7f4     4 OBJECT  LOCAL  DEFAULT    6 prom
    44: 0097c7f8     4 OBJECT  LOCAL  DEFAULT    6 chosen_mmu
    45: 0097c7fc     4 OBJECT  LOCAL  DEFAULT    6 memory
    46: 00000000     0 FILE    LOCAL  DEFAULT  ABS ofconsole.c
    47: 004032b0   104 FUNC    LOCAL  DEFAULT    1 of_console_open
    48: 0040325c    84 FUNC    LOCAL  DEFAULT    1 of_console_write
    49: 0097c800     4 OBJECT  LOCAL  DEFAULT    6 of_stdout_handle
    50: 00000000     0 FILE    LOCAL  DEFAULT  ABS stdio.c
    51: 0040369c   848 FUNC    LOCAL  DEFAULT    1 number
    52: 0097c804  1024 OBJECT  LOCAL  DEFAULT    6 sprint_buf
    53: 00000000     0 FILE    LOCAL  DEFAULT  ABS inffast.c
    54: 004062f8     0 OBJECT  LOCAL  HIDDEN    3 _GLOBAL_OFFSET_TABLE_
    55: 00400060   208 FUNC    GLOBAL DEFAULT    1 platform_init
    56: 00403318     0 NOTYPE  GLOBAL DEFAULT    1 strcpy
    57: 00000000     0 NOTYPE  WEAK   DEFAULT  UND _platform_stack_top
    58: 00400924   240 FUNC    GLOBAL DEFAULT    1 gunzip_partial
    59: 0040413c   188 FUNC    GLOBAL DEFAULT    1 printf
    60: 004039ec  1872 FUNC    GLOBAL DEFAULT    1 vsprintf
    61: 0040426c     0 NOTYPE  GLOBAL DEFAULT    1 __div64_32
    62: 00403468     0 NOTYPE  GLOBAL DEFAULT    1 memmove
    63: 00402a10    60 FUNC    GLOBAL DEFAULT    1 of_init
    64: 00406508     0 NOTYPE  GLOBAL DEFAULT    4 _dtb_start
    65: 0040020c     0 NOTYPE  GLOBAL DEFAULT    1 _zimage_start_opd
    66: 004048b0     0 NOTYPE  GLOBAL DEFAULT    1 _etext
    67: 00402e04    72 FUNC    GLOBAL DEFAULT    1 of_finddevice
    68: 00401088   132 FUNC    GLOBAL DEFAULT    1 zlib_inflateReset
    69: 00403470     0 NOTYPE  GLOBAL DEFAULT    1 memcpy
    70: 00403624     0 NOTYPE  GLOBAL DEFAULT    1 flush_cache
    71: 0040430c  1444 FUNC    GLOBAL DEFAULT    1 inflate_fast
    72: 00407000     0 NOTYPE  GLOBAL DEFAULT    5 _vmlinux_start
    73: 0040110c   152 FUNC    GLOBAL DEFAULT    1 zlib_inflateInit2
    74: 00402dac    88 FUNC    GLOBAL DEFAULT    1 of_getprop
    75: 00400b80   484 FUNC    GLOBAL DEFAULT    1 gunzip_start
    76: 0097cc04    20 OBJECT  GLOBAL DEFAULT    6 loader_info
    77: 0097cc18    28 OBJECT  GLOBAL DEFAULT    6 platform_ops
    78: 00403140   212 FUNC    GLOBAL DEFAULT    1 of_vmlinux_alloc
    79: 00400a7c   120 FUNC    GLOBAL DEFAULT    1 gunzip_exactly
    80: 004012e8   240 FUNC    GLOBAL DEFAULT    1 zlib_inflateIncomp
    81: 00400d64   200 FUNC    GLOBAL DEFAULT    1 parse_elf64
    82: 0097cc34    20 OBJECT  GLOBAL DEFAULT    6 console_ops
    83: 00403650    76 FUNC    GLOBAL DEFAULT    1 strnlen
    84: 00400a14   104 FUNC    GLOBAL DEFAULT    1 gunzip_finish
    85: 00402e90   688 FUNC    GLOBAL DEFAULT    1 of_claim
    86: 00402480  1424 FUNC    GLOBAL DEFAULT    1 zlib_inflate_table
    87: 00400af4   140 FUNC    GLOBAL DEFAULT    1 gunzip_discard
    88: 004013d8  4264 FUNC    GLOBAL DEFAULT    1 zlib_inflate
    89: 00400e2c   168 FUNC    GLOBAL DEFAULT    1 parse_elf32
    90: 0040335c     0 NOTYPE  GLOBAL DEFAULT    1 strcat
    91: 00402e4c    68 FUNC    GLOBAL DEFAULT    1 of_exit
    92: 004035cc     0 NOTYPE  GLOBAL DEFAULT    1 memchr
    93: 00400000     0 NOTYPE  GLOBAL DEFAULT    1 _start
    94: 004033cc     0 NOTYPE  GLOBAL DEFAULT    1 strncmp
    95: 00403214    72 FUNC    GLOBAL DEFAULT    1 of_console_init
    96: 0040107c    12 FUNC    GLOBAL DEFAULT    1 zlib_inflate_workspacesiz
    97: 00403334     0 NOTYPE  GLOBAL DEFAULT    1 strncpy
    98: 004035f4     0 NOTYPE  GLOBAL DEFAULT    1 memcmp
    99: 00971000     0 NOTYPE  GLOBAL DEFAULT    5 _initrd_start
   100: 00400230     0 NOTYPE  WEAK   DEFAULT    1 _zimage_start
   101: 00403528     0 NOTYPE  GLOBAL DEFAULT    1 backwards_memcpy
   102: 00971000     0 NOTYPE  GLOBAL DEFAULT    6 __bss_start
   103: 0040340c     0 NOTYPE  GLOBAL DEFAULT    1 memset
   104: 00406508     0 NOTYPE  GLOBAL DEFAULT    4 _dtb_end
   105: 00971000     0 NOTYPE  GLOBAL DEFAULT    5 _initrd_end
   106: 0097cc48    40 OBJECT  GLOBAL DEFAULT    6 dt_ops
   107: 004033a8     0 NOTYPE  GLOBAL DEFAULT    1 strcmp
   108: 004041f8   116 FUNC    GLOBAL DEFAULT    1 sprintf
   109: 00971000     0 NOTYPE  GLOBAL DEFAULT    6 _edata
   110: 0097cc70     0 NOTYPE  GLOBAL DEFAULT    6 _end
   111: 00400544   992 FUNC    GLOBAL DEFAULT    1 start
   112: 00970952     0 NOTYPE  GLOBAL DEFAULT    5 _vmlinux_end
   113: 004033f4     0 NOTYPE  GLOBAL DEFAULT    1 strlen
   114: 00403388     0 NOTYPE  GLOBAL DEFAULT    1 strchr
   115: 00400230     0 NOTYPE  GLOBAL DEFAULT    1 _zimage_start_lib
   116: 00406504     0 NOTYPE  GLOBAL DEFAULT    4 __dynamic_start
   117: 004011a4    32 FUNC    GLOBAL DEFAULT    1 zlib_inflateEnd
   118: 00402d54    88 FUNC    GLOBAL DEFAULT    1 of_setprop
   119: 00402bfc   344 FUNC    GLOBAL DEFAULT    1 of_call_prom

No version information found in this file.
Attribute Section: gnu
File Attributes
  Tag_GNU_Power_ABI_FP: Soft float
  Tag_GNU_Power_ABI_Vector: Generic
  Tag_GNU_Power_ABI_Struct_Return: Memory


-- 
Greetings, Michael.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox