LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/14] 8xx: Use a macro to simplify CPU6 errata code.
From: Joakim Tjernlund @ 2011-10-10 11:30 UTC (permalink / raw)
  To: linuxppc-dev, Scott Wood, Willy Tarreau, Dan Malek
In-Reply-To: <1318246220-4839-1-git-send-email-Joakim.Tjernlund@transmode.se>


Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---
 arch/ppc/kernel/head_8xx.S |   84 +++++++++++--------------------------------
 1 files changed, 22 insertions(+), 62 deletions(-)

diff --git a/arch/ppc/kernel/head_8xx.S b/arch/ppc/kernel/head_8xx.S
index f9a30f3..ba05a57 100644
--- a/arch/ppc/kernel/head_8xx.S
+++ b/arch/ppc/kernel/head_8xx.S
@@ -31,6 +31,15 @@
 #include <asm/ppc_asm.h>
 #include "ppc_defs.h"
 
+/* Macro to make the code more readable. */
+#ifdef CONFIG_8xx_CPU6
+  #define DO_8xx_CPU6(val, reg) \
+	li	reg, val; \
+	stw	reg, 12(r0); \
+	lwz	reg, 12(r0);
+#else
+  #define DO_8xx_CPU6(val, reg)
+#endif
 	.text
 	.globl	_stext
 _stext:
@@ -310,20 +319,14 @@ SystemCall:
 InstructionTLBMiss:
 #ifdef CONFIG_8xx_CPU6
 	stw	r3, 8(r0)
-	li	r3, 0x3f80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
 #endif
+	DO_8xx_CPU6(0x3f80, r3)
 	mtspr	M_TW, r20	/* Save a couple of working registers */
 	mfcr	r20
 	stw	r20, 0(r0)
 	stw	r21, 4(r0)
 	mfspr	r20, SRR0	/* Get effective address of fault */
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3780
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3780, r3)
 	mtspr	MD_EPN, r20	/* Have to use MD_EPN for walk, MI_EPN can't */
 	mfspr	r20, M_TWB	/* Get level 1 table entry address */
 
@@ -345,17 +348,9 @@ InstructionTLBMiss:
 	tophys(r21,r21)
 	ori	r21,r21,1		/* Set valid bit */
 	beq-	2f			/* If zero, don't try to find a pte */
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x2b80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x2b80, r3)
 	mtspr	MI_TWC, r21	/* Set segment attributes */
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3b80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	MD_TWC, r21	/* Load pte table base address */
 	mfspr	r21, MD_TWC	/* ....and get the pte address */
 	lwz	r20, 0(r21)	/* Get the pte */
@@ -371,12 +366,7 @@ InstructionTLBMiss:
 	 */
 2:	li	r21, 0x00f0
 	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */
-
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x2d80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x2d80, r3)
 	mtspr	MI_RPN, r20	/* Update TLB entry */
 
 	mfspr	r20, M_TW	/* Restore registers */
@@ -392,10 +382,8 @@ InstructionTLBMiss:
 DataStoreTLBMiss:
 #ifdef CONFIG_8xx_CPU6
 	stw	r3, 8(r0)
-	li	r3, 0x3f80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
 #endif
+	DO_8xx_CPU6(0x3f80, r3)
 	mtspr	M_TW, r20	/* Save a couple of working registers */
 	mfcr	r20
 	stw	r20, 0(r0)
@@ -419,11 +407,7 @@ DataStoreTLBMiss:
 	tophys(r21, r21)
 	ori	r21, r21, 1	/* Set valid bit in physical L2 page */
 	beq-	2f		/* If zero, don't try to find a pte */
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3b80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	MD_TWC, r21	/* Load pte table base address */
 	mfspr	r20, MD_TWC	/* ....and get the pte address */
 	lwz	r20, 0(r20)	/* Get the pte */
@@ -435,11 +419,7 @@ DataStoreTLBMiss:
 	 * above.
 	 */
 	rlwimi	r21, r20, 0, 27, 27
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3b80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	MD_TWC, r21
 
 	mfspr	r21, MD_TWC	/* get the pte address again */
@@ -454,12 +434,7 @@ DataStoreTLBMiss:
 	 */
 2:	li	r21, 0x00f0
 	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */
-
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3d80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3d80, r3)
 	mtspr	MD_RPN, r20	/* Update TLB entry */
 
 	mfspr	r20, M_TW	/* Restore registers */
@@ -491,10 +466,8 @@ InstructionTLBError:
 DataTLBError:
 #ifdef CONFIG_8xx_CPU6
 	stw	r3, 8(r0)
-	li	r3, 0x3f80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
 #endif
+	DO_8xx_CPU6(0x3f80, r3)
 	mtspr	M_TW, r20	/* Save a couple of working registers */
 	mfcr	r20
 	stw	r20, 0(r0)
@@ -527,11 +500,7 @@ DataTLBError:
 	ori	r21, r21, MD_EVALID
 	mfspr	r20, M_CASID
 	rlwimi	r21, r20, 0, 28, 31
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3780
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3780, r3)
 	mtspr	MD_EPN, r21
 
 	mfspr	r20, M_TWB	/* Get level 1 table entry address */
@@ -553,11 +522,7 @@ DataTLBError:
 	 */
 	tophys(r21, r21)
 	ori	r21, r21, 1		/* Set valid bit in physical L2 page */
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3b80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	MD_TWC, r21		/* Load pte table base address */
 	mfspr	r21, MD_TWC		/* ....and get the pte address */
 	lwz	r20, 0(r21)		/* Get the pte */
@@ -579,12 +544,7 @@ DataTLBError:
 	 */
 	li	r21, 0x00f0
 	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */
-
-#ifdef CONFIG_8xx_CPU6
-	li	r3, 0x3d80
-	stw	r3, 12(r0)
-	lwz	r3, 12(r0)
-#endif
+	DO_8xx_CPU6(0x3d80, r3)
 	mtspr	MD_RPN, r20	/* Update TLB entry */
 
 	mfspr	r20, M_TW	/* Restore registers */
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 02/14] 8xx: Tag DAR with 0x00f0 to catch buggy instructions.
From: Joakim Tjernlund @ 2011-10-10 11:30 UTC (permalink / raw)
  To: linuxppc-dev, Scott Wood, Willy Tarreau, Dan Malek
In-Reply-To: <1318246220-4839-1-git-send-email-Joakim.Tjernlund@transmode.se>

dcbz, dcbf, dcbi, dcbst and icbi do not set DAR when they
cause a DTLB Error. Detect this by tagging DAR with 0x00f0
at every exception exit that modifies DAR.
This also fixes MachineCheck to pass DAR and DSISR as well.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---
 arch/ppc/kernel/head_8xx.S |   18 +++++++++++++++++-
 1 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/arch/ppc/kernel/head_8xx.S b/arch/ppc/kernel/head_8xx.S
index ba05a57..57858ce 100644
--- a/arch/ppc/kernel/head_8xx.S
+++ b/arch/ppc/kernel/head_8xx.S
@@ -197,7 +197,17 @@ label:						\
 	STD_EXCEPTION(0x100, Reset, UnknownException)
 
 /* Machine check */
-	STD_EXCEPTION(0x200, MachineCheck, MachineCheckException)
+	. = 0x200
+MachineCheck:
+	EXCEPTION_PROLOG
+	mfspr	r20,DSISR
+	stw	r20,_DSISR(r21)
+	mfspr	r20,DAR
+	stw	r20,_DAR(r21)
+	li	r20,0x00f0
+	mtspr	DAR,r20	/* Tag DAR */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	FINISH_EXCEPTION(MachineCheckException)
 
 /* Data access exception.
  * This is "never generated" by the MPC8xx.  We jump to it for other
@@ -211,6 +221,8 @@ DataAccess:
 	mr	r5,r20
 	mfspr	r4,DAR
 	stw	r4,_DAR(r21)
+	li	r20,0x00f0
+	mtspr	DAR,r20	/* Tag DAR */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	li	r20,MSR_KERNEL
 	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
@@ -249,6 +261,8 @@ Alignment:
 	EXCEPTION_PROLOG
 	mfspr	r4,DAR
 	stw	r4,_DAR(r21)
+	li	r20,0x00f0
+	mtspr	DAR,r20	/* Tag DAR */
 	mfspr	r5,DSISR
 	stw	r5,_DSISR(r21)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
@@ -433,6 +447,7 @@ DataStoreTLBMiss:
 	 * of the MMU.
 	 */
 2:	li	r21, 0x00f0
+	mtspr	DAR, r21	/* Tag DAR */
 	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */
 	DO_8xx_CPU6(0x3d80, r3)
 	mtspr	MD_RPN, r20	/* Update TLB entry */
@@ -543,6 +558,7 @@ DataTLBError:
 	 * of the MMU.
 	 */
 	li	r21, 0x00f0
+	mtspr	DAR, r21	/* Tag DAR */
 	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */
 	DO_8xx_CPU6(0x3d80, r3)
 	mtspr	MD_RPN, r20	/* Update TLB entry */
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 00/14] Backport 8xx TLB to 2.4
From: Joakim Tjernlund @ 2011-10-10 11:30 UTC (permalink / raw)
  To: linuxppc-dev, Scott Wood, Willy Tarreau, Dan Malek

This is a backport from 2.6 which I did to overcome 8xx CPU
bugs. 8xx does not update the DAR register when taking a TLB
error caused by dcbX and icbi insns which makes it very
tricky to use these insns. Also the dcbst wrongly sets the
store bit when faulting into DTLB error.
A few more bugs were found during development.

I know 2.4 is in strict maintenance mode and 8xx is obsolete
but as it is still in use I wanted 8xx to age with grace.

Addendum:
I have now ported our 8xx custom board to 2.4.37.11 and
tested these patches there.

V2:
 - Remove mandatory pinning of kernel ITLB. It is not
   needed in 2.4

8 MB Large page support will follow.

Joakim Tjernlund (14):
  8xx: Use a macro to simplify CPU6 errata code.
  8xx: Tag DAR with 0x00f0 to catch buggy instructions.
  8xx: invalidate non present TLBs
  8xx: Fix CONFIG_PIN_TLB
  8xx: Update TLB asm so it behaves as linux mm expects.
  8xx: Fixup DAR from buggy dcbX instructions.
  8xx: CPU6 errata make DTLB error too big to fit.
  8xx: Add missing Guarded setting in DTLB Error.
  8xx: Restore _PAGE_WRITETHRU
  8xx: Set correct HW pte flags in DTLB Error too
  8xx: start using dcbX instructions in various copy routines
  8xx: Use symbolic constants in TLB asm
  8xx: Optimize TLB Miss handlers
  8xx: The TLB miss handler manages ACCESSED correctly.

 arch/ppc/kernel/head_8xx.S |  367 ++++++++++++++++++++++++++++++-------------
 arch/ppc/kernel/misc.S     |   18 --
 arch/ppc/lib/string.S      |   17 --
 include/asm-ppc/pgtable.h  |   26 +--
 4 files changed, 264 insertions(+), 164 deletions(-)

-- 
1.7.3.4

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10 10:18 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev, Yevgeny Petrilin, Eli Cohen, David Laight,
	Thadeu Lima de Souza Cascardo, linuxppc-dev
In-Reply-To: <20111010092926.GO2681@mtldesk30>

On Mon, 2011-10-10 at 11:29 +0200, Eli Cohen wrote:
> On Mon, Oct 10, 2011 at 11:24:05AM +0200, Benjamin Herrenschmidt wrote:
> > On Mon, 2011-10-10 at 11:16 +0200, Eli Cohen wrote:
> > 
> > > Until then I think we need to have the logic working right on ppc and
> > > measure if blue flame buys us any improvement in ppc. If that's not
> > > the case (e.g because write combining is not working), then maybe we
> > > should avoid using blueflame in ppc.
> > > Could any of the guys from IBM check this and give us feedback?
> > 
> > I don't have the necessary hardware myself to test that but maybe Thadeu
> > can.
> > 
> > Note that for WC to work, things must be mapped non-guarded. You can do
> > that by using ioremap_prot() with pgprot_noncached_wc(PAGE_KERNEL) or
> > ioremap_wc() (dunno how "generic" the later is).
> 
> I use the io mapping API:
> 
> at driver statrt:
>         priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
>         if (!priv->bf_mapping)
>                 err = -ENOMEM;
> 
> and then:
>         uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);
> 
>         
> Will this work on ppc?

That API has never been tested on ppc I suspect. We don't have
CONFIG_HAVE_ATOMIC_IOMAP (mostly because we never needed it, it
was designed and only ever used for Intel graphics before), so
it will fall back to:

static inline struct io_mapping *
io_mapping_create_wc(resource_size_t base, unsigned long size)
{
	return (struct io_mapping __force *) ioremap_wc(base, size);
}

Which should work (hopefully :-)

Cheers,
Ben.

^ permalink raw reply

* [PATCH 3/3] [44x] Enable CRASH_DUMP for 440x
From: Suzuki K. Poulose @ 2011-10-10  9:57 UTC (permalink / raw)
  To: linux ppc dev
  Cc: Michal Simek, tmarri, Mahesh Jagannath Salgaonkar, Dave Hansen,
	David Laight, Suzuki K. Poulose, Scott Wood, linuxppc-dev,
	Vivek Goyal
In-Reply-To: <20111010094627.16589.52367.stgit@suzukikp.in.ibm.com>

Now that we have relocatable kernel, supporting CRASH_DUMP only requires
turning the switches on for UP machines.

We don't have kexec support on 47x yet. Enabling SMP support would be done
as part of enabling the PPC_47x support.


Signed-off-by: Suzuki K. Poulose <suzuki@in.ibm.com>
Cc:	Josh Boyer <jwboyer@gmail.com>
Cc:	Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc:	linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
---

 arch/powerpc/Kconfig |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 99558d6..fc41ce5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -362,8 +362,8 @@ config KEXEC
 
 config CRASH_DUMP
 	bool "Build a kdump crash kernel"
-	depends on PPC64 || 6xx || FSL_BOOKE
-	select RELOCATABLE if PPC64 || FSL_BOOKE
+	depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
+	select RELOCATABLE if PPC64 || FSL_BOOKE || 44x
 	help
 	  Build a kernel suitable for use as a kdump capture kernel.
 	  The same kernel binary can be used as production kernel and dump

^ permalink raw reply related

* [PATCH 2/3] [44x] Enable CONFIG_RELOCATABLE for PPC44x
From: Suzuki K. Poulose @ 2011-10-10  9:56 UTC (permalink / raw)
  To: linux ppc dev
  Cc: Michal Simek, tmarri, Mahesh Jagannath Salgaonkar, Dave Hansen,
	David Laight, Suzuki K. Poulose, Scott Wood, Paul Mackerras,
	linuxppc-dev, Vivek Goyal
In-Reply-To: <20111010094627.16589.52367.stgit@suzukikp.in.ibm.com>

The following patch adds relocatable support for PPC44x kernel.

We find the runtime address of _stext and relocate ourselves based
on the following calculation.

	virtual_base = ALIGN_DOWN(KERNELBASE,256M) +
			MODULO(_stext.run,256M)

relocate() is called with the Effective Virtual Base Address (as
shown below)

            | Phys. Addr| Virt. Addr |
Page (256M) |------------------------|
Boundary    |           |            |
            |           |            |
            |           |            |
Kernel Load |___________|_ __ _ _ _ _|<- Effective
Addr(_stext)|           |      ^     |Virt. Base Addr
            |           |      |     |
            |           |      |     |
            |           |reloc_offset|
            |           |      |     |
            |           |      |     |
            |           |______v_____|<-(KERNELBASE)%256M
            |           |            |
            |           |            |
            |           |            |
Page(256M)  |-----------|------------|
Boundary    |           |            |


On BookE, we need __va() & __pa() early in the boot process to access
the device tree.

Currently this has been defined as :

#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) -
						PHYSICAL_START + KERNELBASE))
where:
 PHYSICAL_START is kernstart_addr - a variable updated at runtime.
 KERNELBASE	is the compile time Virtual base address of kernel.

This won't work for us, as kernstart_addr is dynamic and will yield different
results for __va()/__pa() for same mapping.

e.g.,

Let the kernel be loaded at 64MB and KERNELBASE be 0xc0000000 (same as
PAGE_OFFSET).

In this case, we would be mapping 0 to 0xc0000000, and kernstart_addr = 64M

Now __va(1MB) = (0x100000) - (0x4000000) + 0xc0000000
		= 0xbc100000 , which is wrong.

it should be : 0xc0000000 + 0x100000 = 0xc0100000

On PPC_47x (which is based on 44x), the kernel could be loaded at highmem.
Hence we cannot always depend on the compile time constants for mapping.

Here are the possible solutions:

1) Update kernstart_addr(PHYSICAL_START) to match the Physical address of
compile time KERNELBASE value, instead of the actual Physical_Address(_stext).

The disadvantage is that we may break other users of PHYSICAL_START. They
could be replaced with __pa(_stext).

2) Redefine __va() & __pa() with relocation offset


#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_44x)
#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) - PHYSICAL_START + (KERNELBASE + RELOC_OFFSET)))
#define __pa(x) ((unsigned long)(x) + PHYSICAL_START - (KERNELBASE + RELOC_OFFSET))
#endif

where, RELOC_OFFSET could be

  a) A variable, say relocation_offset (like kernstart_addr), updated
     at boot time. This impacts performance, as we have to load an additional
     variable from memory.

		OR

  b) #define RELOC_OFFSET ((PHYSICAL_START & PPC_PIN_SIZE_OFFSET_MASK) - \
                      (KERNELBASE & PPC_PIN_SIZE_OFFSET_MASK))

   This introduces more calculations for doing the translation.

3) Redefine __va() & __pa() with a new variable

i.e,

#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + VIRT_PHYS_OFFSET))

where VIRT_PHYS_OFFSET :

#ifdef CONFIG_44x
#define VIRT_PHYS_OFFSET virt_phys_offset
#else
#define VIRT_PHYS_OFFSET (KERNELBASE - PHYSICAL_START)
#endif /* 44x */

where virt_phys_offset is updated at runtime to :

	Effective KERNELBASE - kernstart_addr.

Taking our example, above:

virt_phys_offset = effective_kernelstart_vaddr - kernstart_addr
		 = 0xc4000000 - 0x4000000
		 = 0xc0000000
	and

	__va(0x100000) = 0xc0000000 + 0x100000 = 0xc0100000
	 which is what we want.

I have implemented (3) in the following patch which has same cost of
operation as the existing one.

I have tested the patches on 440x platforms only. However this should
work fine for PPC_47x also, as we only depend on the runtime address
and the current TLB XLAT entry for the startup code, which is available
in r25. I don't have access to a 47x board yet. So, it would be great if
somebody could test this on 47x.

Signed-off-by: Suzuki K. Poulose <suzuki@in.ibm.com>
Cc:	Paul Mackerras <paulus@samba.org>
Cc:	Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc:	Kumar Gala <galak@kernel.crashing.org>
Cc:	Tony Breeds <tony@bakeyournoodle.com>
Cc:	Josh Boyer <jwboyer@gmail.com>
Cc:	linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
---

 arch/powerpc/Kconfig            |    2 -
 arch/powerpc/Makefile           |    1 
 arch/powerpc/include/asm/page.h |   84 +++++++++++++++++++++++++++++-
 arch/powerpc/kernel/head_44x.S  |  111 ++++++++++++++++++++++++++++++++++-----
 arch/powerpc/mm/init_32.c       |    7 ++
 5 files changed, 187 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9eb2e60..99558d6 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -843,7 +843,7 @@ config LOWMEM_CAM_NUM
 
 config RELOCATABLE
 	bool "Build a relocatable kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && ADVANCED_OPTIONS && FLATMEM && (FSL_BOOKE || PPC_47x)
+	depends on EXPERIMENTAL && ADVANCED_OPTIONS && FLATMEM && (FSL_BOOKE || 44x || PPC_47x)
 	help
 	  This builds a kernel image that is capable of running at the
 	  location the kernel is loaded at (some alignment restrictions may
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 57af16e..632b3dd 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -65,6 +65,7 @@ endif
 
 LDFLAGS_vmlinux-yy := -Bstatic
 LDFLAGS_vmlinux-$(CONFIG_PPC64)$(CONFIG_RELOCATABLE) := -pie
+LDFLAGS_vmlinux-$(CONFIG_44x)$(CONFIG_RELOCATABLE) := -pie
 LDFLAGS_vmlinux	:= $(LDFLAGS_vmlinux-yy)
 
 CFLAGS-$(CONFIG_PPC64)	:= -mminimal-toc -mtraceback=no -mcall-aixdesc
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index dd9c4fd..6898542 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -97,10 +97,25 @@ extern unsigned int HPAGE_SHIFT;
 
 extern phys_addr_t memstart_addr;
 extern phys_addr_t kernstart_addr;
+
+#ifdef CONFIG_44x
+extern long long virt_phys_offset;
 #endif
+
+#endif /* __ASSEMBLY__ */
 #define PHYSICAL_START	kernstart_addr
+
+
+/* See Description below for VIRT_PHYS_OFFSET */
+#ifdef CONFIG_44x
+#define VIRT_PHYS_OFFSET virt_phys_offset
 #else
+#define VIRT_PHYS_OFFSET (KERNELBASE - PHYSICAL_START)
+#endif /* 44x */
+
+#else	/* !CONFIG_RELOCATABLE */
 #define PHYSICAL_START	ASM_CONST(CONFIG_PHYSICAL_START)
+#define VIRT_PHYS_OFFSET (KERNELBASE - PHYSICAL_START)
 #endif
 
 #ifdef CONFIG_PPC64
@@ -125,12 +140,77 @@ extern phys_addr_t kernstart_addr;
  * determine MEMORY_START until then.  However we can determine PHYSICAL_START
  * from information at hand (program counter, TLB lookup).
  *
+ *  Relocation on 44x
+ *
+ *  On 44x, we support loading the kernel at any physical address without
+ *  any restriction on the page alignment.
+ *
+ *  We find the runtime address of _stext and relocate ourselves based on 
+ *  the following calculation:
+ *
+ *  	virtual_base = ALIGN_DOWN(KERNELBASE,256M) +
+ *  				MODULO(_stext.run,256M)
+ *  and create the following mapping:
+ *
+ * 	 ALIGN_DOWN(_stext.run,256M) => ALIGN_DOWN(KERNELBASE,256M)
+ *
+ * When we process relocations, we cannot depend on the
+ * existing equation for the __va()/__pa() translations:
+ *
+ * 	 __va(x) = (x)  - PHYSICAL_START + KERNELBASE
+ *
+ *  Where:
+ *  	PHYSICAL_START = kernstart_addr = Physical address of _stext
+ *  	KERNELBASE = Compiled virtual address of _stext.
+ *
+ * This formula holds true iff, kernel load address is TLB page aligned.
+ *
+ * In our case, we need to also account for the shift in the kernel Virtual 
+ * address.
+ *
+ * E.g.,
+ *
+ * Let the kernel be loaded at 64MB and KERNELBASE be 0xc0000000 (same as PAGE_OFFSET).
+ * In this case, we would be mapping 0 to 0xc0000000, and kernstart_addr = 64M
+ *
+ * Now __va(1MB) = (0x100000) - (0x4000000) + 0xc0000000
+ *               = 0xbc100000 , which is wrong.
+ *
+ * Rather, it should be : 0xc0000000 + 0x100000 = 0xc0100000
+ * 	according to our mapping.
+ *
+ * Hence we use the following formula to get the translations right:
+ *
+ * 	__va(x) = (x) - [ PHYSICAL_START - Effective KERNELBASE ]
+ *
+ * 	Where :
+ * 		PHYSICAL_START = dynamic load address.(kernstart_addr variable)
+ * 		Effective KERNELBASE = virtual_base =
+ * 				     = ALIGN_DOWN(KERNELBASE,256M) +
+ * 						MODULO(PHYSICAL_START,256M)
+ *
+ * 	To make the cost of __va() / __pa() more light weight, we introduce
+ * 	a new variable virt_phys_offset, which will hold :
+ *
+ * 	virt_phys_offset = Effective KERNELBASE - PHYSICAL_START
+ * 			 = ALIGN_DOWN(KERNELBASE,256M) - 
+ * 			 	ALIGN_DOWN(PHYSICALSTART,256M)
+ *
+ * 	Hence :
+ *
+ * 	__va(x) = x - PHYSICAL_START + Effective KERNELBASE
+ * 		= x + virt_phys_offset
+ *
+ * 		and
+ * 	__pa(x) = x + PHYSICAL_START - Effective KERNELBASE
+ * 		= x - virt_phys_offset
+ * 		
  * On non-Book-E PPC64 PAGE_OFFSET and MEMORY_START are constants so use
  * the other definitions for __va & __pa.
  */
 #ifdef CONFIG_BOOKE
-#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) - PHYSICAL_START + KERNELBASE))
-#define __pa(x) ((unsigned long)(x) + PHYSICAL_START - KERNELBASE)
+#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + VIRT_PHYS_OFFSET))
+#define __pa(x) ((unsigned long)(x) - VIRT_PHYS_OFFSET)
 #else
 #define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START))
 #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + MEMORY_START)
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index b725dab..8f57c31 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -64,6 +64,35 @@ _ENTRY(_start);
 	mr	r31,r3		/* save device tree ptr */
 	li	r24,0		/* CPU number */
 
+#if defined(CONFIG_RELOCATABLE)
+/*
+ * Relocate ourselves to the current runtime address.
+ * This is called only by the Boot CPU.
+ * "relocate" is called with our current runtime virutal
+ * address.
+ * r21 will be loaded with the physical runtime address of _stext
+ */
+	bl	0f				/* Get our runtime address */
+0:	mflr	r21				/* Make it accessible */
+	addis	r21,r21,(_stext - 0b)@ha
+	addi	r21,r21,(_stext - 0b)@l 	/* Get our current runtime base */
+
+	/*
+	 * We have the runtime (virutal) address of our base.
+	 * We calculate our shift of offset from a 256M page.
+	 * We could map the 256M page we belong to at PAGE_OFFSET and
+	 * get going from there.
+	 */
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	rlwinm	r6,r21,0,4,31			/* r6 = PHYS_START % 256M */
+	rlwinm	r5,r4,0,4,31			/* r5 = KERNELBASE % 256M */
+	subf	r3,r5,r6			/* r3 = r6 - r5 */
+	add	r3,r4,r3			/* Required Virutal Address */
+
+	bl	relocate
+#endif
+
 	bl	init_cpu_state
 
 	/*
@@ -88,27 +117,60 @@ _ENTRY(_start);
 
 #ifdef CONFIG_RELOCATABLE
 	/*
-	 * r25 will contain RPN/ERPN for the start address of memory
-	 *
-	 * Add the difference between KERNELBASE and PAGE_OFFSET to the
-	 * start of physical memory to get kernstart_addr.
+	 * When we reach here :
+	 * r25 holds RPN/ERPN for the start address of memory
+	 * r21 contain the physical address of _stext
 	 */
 	lis	r3,kernstart_addr@ha
 	la	r3,kernstart_addr@l(r3)
 
-	lis	r4,KERNELBASE@h
-	ori	r4,r4,KERNELBASE@l
-	lis	r5,PAGE_OFFSET@h
-	ori	r5,r5,PAGE_OFFSET@l
-	subf	r4,r5,r4
-
-	rlwinm	r6,r25,0,28,31	/* ERPN */
+	/*
+	 * Compute the kernstart_addr.
+	 * kernstart_addr => (r6,r8)
+	 * kernstart_addr & ~0xfffffff => (r6,r7)
+	 */
+	rlwinm	r6,r25,0,28,31	/* ERPN. Bits 32-35 of Address */
 	rlwinm	r7,r25,0,0,3	/* RPN - assuming 256 MB page size */
-	add	r7,r7,r4
+	rlwinm	r8,r21,0,4,31	/* r8 = (_stext & 0xfffffff) */
+	or	r8,r7,r8	/* Compute the lower 32bit of kernstart_addr */
+
+	/* Store kernstart_addr */
+	stw	r6,0(r3)	/* higher 32bit */
+	stw	r8,4(r3)	/* lower 32bit  */
+
+	/* 
+	 * Compute the virt_phys_offset :
+	 * virt_phys_offset = stext.run - kernstart_addr
+	 * 
+	 * stext.run = (KERNELBASE & ~0xfffffff) + (kernstart_addr & 0xfffffff)
+	 * When we relocate, we have :
+	 *
+	 *	(kernstart_addr & 0xfffffff) = (stext.run & 0xfffffff) 
+	 *
+	 * hence:
+	 *  virt_phys_offset = (KERNELBASE & ~0xfffffff) - (kernstart_addr & ~0xfffffff)
+	 * 
+	 */
 
-	stw	r6,0(r3)
-	stw	r7,4(r3)
-#endif
+	/* KERNELBASE&~0xfffffff => (r4,r5) */
+	li	r4, 0		/* higer 32bit */
+	lis	r5,KERNELBASE@h
+	rlwinm	r5,r5,0,0,3	/* Align to 256M, lower 32bit */
+
+	/* 
+	 * 64bit subtraction.
+	 */ 
+	subfc	r5,r7,r5
+	subfe	r4,r6,r4
+
+	/* Store virt_phys_offset */
+	lis	r3,virt_phys_offset@ha
+	la	r3,virt_phys_offset@l(r3)
+
+	stw	r4,0(r3)
+	stw	r5,4(r3)
+
+#endif	/* CONFIG_RELOCATABLE */
 
 /*
  * Decide what sort of machine this is and initialize the MMU.
@@ -801,11 +863,30 @@ skpinv:	addi	r4,r4,1				/* Increment */
  * Configure and load pinned entry into TLB slot 63.
  */
 
+#ifdef CONFIG_RELOCATABLE
+	/*
+	 * Stores the XLAT entry for this code at r25.
+	 * Uses the mapping where we are loaded.
+	 */
+
+	tlbre	r25,r23,PPC44x_TLB_XLAT		/* Read our XLAT entry in r25 */
+
+	/* PAGEID fields for mapping */
+	lis	r3,KERNELBASE@h
+	rlwinm	r3,r3,0,0,3			/* Round to 256M page boundary */
+
+	/* Use the current XLAT entry */
+	mr	r4,r25
+#else
+
+
 	lis	r3,PAGE_OFFSET@h
 	ori	r3,r3,PAGE_OFFSET@l
 
 	/* Kernel is at the base of RAM */
 	li r4, 0			/* Load the kernel physical address */
+#endif
+
 
 	/* Load the kernel PID = 0 */
 	li	r0,0
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 161cefd..a249edb 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -65,6 +65,13 @@ phys_addr_t memstart_addr = (phys_addr_t)~0ull;
 EXPORT_SYMBOL(memstart_addr);
 phys_addr_t kernstart_addr;
 EXPORT_SYMBOL(kernstart_addr);
+
+#if	defined(CONFIG_44x) && defined(CONFIG_RELOCATABLE)
+/* Used in __va()/__pa() for 44x */
+long long virt_phys_offset;
+EXPORT_SYMBOL(virt_phys_offset);
+#endif
+
 phys_addr_t lowmem_end_addr;
 
 int boot_mapsize;

^ permalink raw reply related

* [PATCH 1/3] [powerpc32] Process dynamic relocations for kernel
From: Suzuki K. Poulose @ 2011-10-10  9:55 UTC (permalink / raw)
  To: linux ppc dev
  Cc: Michal Simek, tmarri, Mahesh Jagannath Salgaonkar, Alan Modra,
	Dave Hansen, David Laight, Suzuki K. Poulose, Scott Wood,
	Paul Mackerras, linuxppc-dev, Vivek Goyal
In-Reply-To: <20111010094627.16589.52367.stgit@suzukikp.in.ibm.com>

The following patch implements the dynamic relocation processing for
PPC32 kernel. relocate() accepts the target virtual address and relocates
 the kernel image to the same.

Currently the following relocation types are handled :

	R_PPC_RELATIVE
	R_PPC_ADDR16_LO
	R_PPC_ADDR16_HI
	R_PPC_ADDR16_HA

The last 3 relocations in the above list depend on the value of the symbol
whose index is encoded in the relocation entry. Hence we need the symbol
table for processing such relocations.

Note: The GNU ld for ppc32 produces buggy relocations for relocation types
that depend on symbols. The value of the symbols with STB_LOCAL scope
should be assumed to be zero. - Alan Modra

Signed-off-by: Suzuki K. Poulose <suzuki@in.ibm.com>
Cc:	Paul Mackerras <paulus@samba.org>
Cc:	Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc:	Alan Modra <amodra@au1.ibm.com>
Cc:	Kumar Gala <galak@kernel.crashing.org>
Cc:	Josh Boyer <jwboyer@gmail.com>
Cc:	linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
---

 arch/powerpc/Kconfig              |    4 +
 arch/powerpc/kernel/Makefile      |    2 
 arch/powerpc/kernel/reloc_32.S    |  194 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/vmlinux.lds.S |    8 +-
 4 files changed, 207 insertions(+), 1 deletions(-)
 create mode 100644 arch/powerpc/kernel/reloc_32.S

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8523bd1..9eb2e60 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -859,6 +859,10 @@ config RELOCATABLE
 	  setting can still be useful to bootwrappers that need to know the
 	  load location of the kernel (eg. u-boot/mkimage).
 
+config RELOCATABLE_PPC32
+	def_bool y
+	depends on PPC32 && RELOCATABLE
+
 config PAGE_OFFSET_BOOL
 	bool "Set custom page offset address"
 	depends on ADVANCED_OPTIONS
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ce4f7f1..ee728e4 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -85,6 +85,8 @@ extra-$(CONFIG_FSL_BOOKE)	:= head_fsl_booke.o
 extra-$(CONFIG_8xx)		:= head_8xx.o
 extra-y				+= vmlinux.lds
 
+obj-$(CONFIG_RELOCATABLE_PPC32)	+= reloc_32.o
+
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
new file mode 100644
index 0000000..045d61e
--- /dev/null
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -0,0 +1,194 @@
+/*
+ * Code to process dynamic relocations for PPC32.
+ *
+ * Copyrights (C) IBM Corporation, 2011.
+ *	Author: Suzuki Poulose <suzuki@in.ibm.com>
+ *
+ *  - Based on ppc64 code - reloc_64.S
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+
+/* Dynamic section table entry tags */
+DT_RELA = 7			/* Tag for Elf32_Rela section */
+DT_RELASZ = 8			/* Size of the Rela relocs */
+DT_RELAENT = 9			/* Size of one Rela reloc entry */
+
+STN_UNDEF = 0			/* Undefined symbol index */
+STB_LOCAL = 0			/* Local binding for the symbol */
+
+R_PPC_ADDR16_LO = 4		/* Lower half of (S+A) */
+R_PPC_ADDR16_HI = 5		/* Upper half of (S+A) */
+R_PPC_ADDR16_HA = 6		/* High Adjusted (S+A) */
+R_PPC_RELATIVE = 22		/* Addend (A) adjusted by the final offset */
+
+/*
+ * r3 = desired final address
+ */
+
+_GLOBAL(relocate)
+
+	mflr	r0		/* Save the caller's return address */
+	bl	0f		/* Find our current runtime address */
+0:	mflr	r12		/* r12 = runtime address of label 0 */
+	mtlr	r0		/* Restore the caller's return address */
+
+	lwz	r11, (p_dyn - 0b)(r12)
+	add	r11, r11, r12	/* runtime address of .dynamic section */
+	lwz	r9, (p_rela - 0b)(r12)
+	add	r9, r9, r12	/* runtime address of .rela.dyn section */
+	lwz	r10, (p_st - 0b)(r12)
+	add	r10, r10, r12	/* runtime address of _stext */
+	lwz	r13, (p_sym - 0b)(r12)
+	add	r13, r13, r12	/* runtime address of .dynsym section */
+
+	/*
+	 * Scan the dynamic section for RELA, RELASZ and RELAENT entries
+	 */
+	li	r6, 0		/* r6 = relaent, 0 until found */
+	li	r7, 0		/* r7 = rela.link, 0 until found */
+	li	r8, 0		/* r8 = relasz, 0 until found */
+1:	lwz	r5, 0(r11)	/* ELF_Dyn.d_tag */
+	cmpwi	r5, 0		/* End of ELF_Dyn[] */
+	beq	eodyn
+	cmpwi	r5, DT_RELA
+	bne	relasz
+	lwz	r7, 4(r11)	/* r7 = rela.link (link time address) */
+	b	skip
+relasz:
+	cmpwi	r5, DT_RELASZ
+	bne	relaent
+	lwz	r8, 4(r11)	/* r8 = Total Rela relocs size */
+	b	skip
+relaent:
+	cmpwi	r5, DT_RELAENT
+	bne	skip
+	lwz	r6, 4(r11)	/* r6 = Size of one Rela reloc */
+skip:
+	addi	r11, r11, 8	/* Advance to the next ELF_Dyn entry */
+	b	1b
+eodyn:				/* End of Dyn Table scan */
+
+	/* Return without relocating unless all three entries were found */
+	cmpwi	r7, 0
+	beq	done
+	cmpwi	r8, 0
+	beq	done
+	cmpwi	r6, 0
+	beq	done
+
+
+	/*
+	 * Work out the current offset from the link time address of .rela
+	 * section.
+	 *  cur_offset[r7] = rela.run[r9] - rela.link [r7]
+	 *  _stext.link[r10] = _stext.run[r10] - cur_offset[r7]
+	 *  final_offset[r3] = _stext.final[r3] - _stext.link[r10]
+	 */
+	subf	r7, r7, r9	/* cur_offset */
+	subf	r10, r7, r10	/* _stext.link */
+	subf	r3, r10, r3	/* final_offset */
+
+	subf	r8, r6, r8	/* relasz -= relaent (size left after this entry) */
+	/*
+	 * Scan through the .rela table and process each entry
+	 * r9	- points to the current .rela table entry
+	 * r13	- points to the symbol table
+	 */
+
+	/*
+	 * Check if we have a relocation based on symbol
+	 * r5 will hold the value of the symbol.
+	 */
+applyrela:
+	lwz	r4, 4(r9)	/* r4 = r_info */
+	srwi	r5, r4, 8		/* ELF32_R_SYM(r_info) */
+	cmpwi	r5, STN_UNDEF	/* sym == STN_UNDEF ? */
+	beq	get_type	/* value = 0 */
+	/* Find the value of the symbol at index(r5) */
+	slwi	r5, r5, 4		/* r5 = r5 * sizeof(Elf32_Sym) */
+	add	r12, r13, r5	/* r12 = &__dyn_sym[Index] */
+
+	/*
+	 * GNU ld has a bug, where dynamic relocs based on
+	 * STB_LOCAL symbols, the value should be assumed
+	 * to be zero. - Alan Modra
+	 */
+	/* XXX: Do we need to check if we are using GNU ld ? */
+	lbz	r5, 12(r12)	/* r5 = dyn_sym[Index].st_info */
+	extrwi	r5, r5, 4, 24	/* r5 = ELF32_ST_BIND(r5) */
+	cmpwi	r5, STB_LOCAL	/* st_value = 0, ld bug */
+	beq	get_type	/* We have r5 = 0 */
+	lwz	r5, 4(r12)	/* r5 = __dyn_sym[Index].st_value */
+
+get_type:
+	/* r4 holds the relocation type */
+	extrwi	r4, r4, 8, 24	/* r4 = low byte of r_info (reloc type) */
+
+	/* R_PPC_RELATIVE */
+	cmpwi	r4, R_PPC_RELATIVE
+	bne	hi16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3	/* final addend */
+	stwx	r0, r4, r7	/* memory[r4+r7] = (u32)r0 */
+	b	nxtrela		/* continue */
+
+	/* R_PPC_ADDR16_HI */
+hi16:
+	cmpwi	r4, R_PPC_ADDR16_HI
+	bne	ha16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r0, r0, 16, 0	/* r0 = (r0 >> 16) */
+	b	store_half
+
+	/* R_PPC_ADDR16_HA */
+ha16:
+	cmpwi	r4, R_PPC_ADDR16_HA
+	bne	lo16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r5, r0, 1, 16	/* Extract bit 16 (top bit of the low half) */
+	extrwi	r0, r0, 16, 0	/* r0 = (r0 >> 16) */
+	add	r0, r0, r5	/* Add it to r0 */
+	b	store_half
+
+	/* R_PPC_ADDR16_LO */
+lo16:
+	cmpwi	r4, R_PPC_ADDR16_LO
+	bne	nxtrela		/* Any other type is skipped */
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r0, r0, 16, 16	/* r0 &= 0xffff */
+	/* Fall through to */
+
+	/* Store half word */
+store_half:
+	sthx	r0, r4, r7	/* memory[r4+r7] = (u16)r0 */
+
+nxtrela:
+	cmpwi	r8, 0		/* remaining relasz <= 0 ? */
+	ble	done
+	add	r9, r9, r6	/* move to next entry in the .rela table */
+	subf	r8, r6, r8	/* relasz -= relaent */
+	b	applyrela
+
+done:	blr
+
+
+p_dyn:		.long	__dynamic_start - 0b
+p_rela:		.long	__rela_dyn_start - 0b
+p_sym:		.long	__dynamic_symtab - 0b
+p_st:		.long	_stext - 0b
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 920276c..710a540 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -170,7 +170,13 @@ SECTIONS
 	}
 #ifdef CONFIG_RELOCATABLE
 	. = ALIGN(8);
-	.dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET) { *(.dynsym) }
+	.dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET)
+	{
+#ifdef CONFIG_RELOCATABLE_PPC32
+		__dynamic_symtab = .;
+#endif
+		*(.dynsym)
+	}
 	.dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) }
 	.dynamic : AT(ADDR(.dynamic) - LOAD_OFFSET)
 	{

^ permalink raw reply related

* [PATCH 0/3] Kdump support for PPC440x
From: Suzuki K. Poulose @ 2011-10-10  9:54 UTC (permalink / raw)
  To: linux ppc dev
  Cc: Michal Simek, tmarri, Mahesh Jagannath Salgaonkar, Dave Hansen,
	David Laight, Scott Wood, Vivek Goyal

The following series implements CRASH_DUMP support for PPC440x. The
patches apply on top of power-next tree. This set also adds support
for CONFIG_RELOCATABLE on 44x.

I have tested the patches on Ebony and Virtex(QEMU Emulated). Testing
these patches would require latest snapshot of kexec-tools git tree and
(preferably) the following patch for kexec-tools:

	http://lists.infradead.org/pipermail/kexec/2011-October/005552.html

---

Suzuki K. Poulose (3):
      [44x] Enable CRASH_DUMP for 440x
      [44x] Enable CONFIG_RELOCATABLE for PPC44x
      [powerpc32] Process dynamic relocations for kernel


 arch/powerpc/Kconfig              |   10 +-
 arch/powerpc/Makefile             |    1 
 arch/powerpc/include/asm/page.h   |   84 ++++++++++++++++
 arch/powerpc/kernel/Makefile      |    2 
 arch/powerpc/kernel/head_44x.S    |  111 ++++++++++++++++++---
 arch/powerpc/kernel/reloc_32.S    |  194 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/vmlinux.lds.S |    8 +-
 arch/powerpc/mm/init_32.c         |    7 +
 8 files changed, 396 insertions(+), 21 deletions(-)
 create mode 100644 arch/powerpc/kernel/reloc_32.S

-- 
Thanks
Suzuki

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-10  9:29 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev, Yevgeny Petrilin, Eli Cohen, David Laight,
	Thadeu Lima de Souza Cascardo, linuxppc-dev
In-Reply-To: <1318238645.29415.426.camel@pasglop>

On Mon, Oct 10, 2011 at 11:24:05AM +0200, Benjamin Herrenschmidt wrote:
> On Mon, 2011-10-10 at 11:16 +0200, Eli Cohen wrote:
> 
> > Until then I think we need to have the logic working right on ppc and
> > measure if blue flame buys us any improvement in ppc. If that's not
> > the case (e.g because write combining is not working), then maybe we
> > should avoid using blueflame in ppc.
> > Could any of the guys from IBM check this and give us feedback?
> 
> I don't have the necessary hardware myself to test that but maybe Thadeu
> can.
> 
> Note that for WC to work, things must be mapped non-guarded. You can do
> that by using ioremap_prot() with pgprot_noncached_wc(PAGE_KERNEL) or
> ioremap_wc() (dunno how "generic" the later is).

I use the io mapping API:

at driver start:
        priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
        if (!priv->bf_mapping)
                err = -ENOMEM;

and then:
        uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);

        
Will this work on ppc?

> 
> >From there, you should get write combining provided that you don't have
> barriers between every access (ie those copy operations in their current
> form should do the trick).
> 
> Cheers,
> Ben.
> 
> > > Maybe it's time for us to revive those discussions about providing a
> > > good set of relaxed MMIO accessors with explicit barriers :-)
> > > 
> > > Cheers,
> > > Ben.
> > >  
> 

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10  9:24 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev, Yevgeny Petrilin, Eli Cohen, David Laight,
	Thadeu Lima de Souza Cascardo, linuxppc-dev
In-Reply-To: <20111010091611.GN2681@mtldesk30>

On Mon, 2011-10-10 at 11:16 +0200, Eli Cohen wrote:

> Until then I think we need to have the logic working right on ppc and
> measure if blue flame buys us any improvement in ppc. If that's not
> the case (e.g because write combining is not working), then maybe we
> should avoid using blueflame in ppc.
> Could any of the guys from IBM check this and give us feedback?

I don't have the necessary hardware myself to test that but maybe Thadeu
can.

Note that for WC to work, things must be mapped non-guarded. You can do
that by using ioremap_prot() with pgprot_noncached_wc(PAGE_KERNEL) or
ioremap_wc() (dunno how "generic" the latter is).

From there, you should get write combining provided that you don't have
barriers between every access (ie those copy operations in their current
form should do the trick).

Cheers,
Ben.

> > Maybe it's time for us to revive those discussions about providing a
> > good set of relaxed MMIO accessors with explicit barriers :-)
> > 
> > Cheers,
> > Ben.
> >  

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-10  9:16 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev, Yevgeny Petrilin, Eli Cohen, David Laight,
	Thadeu Lima de Souza Cascardo, linuxppc-dev
In-Reply-To: <1318237284.29415.422.camel@pasglop>

On Mon, Oct 10, 2011 at 11:01:24AM +0200, Benjamin Herrenschmidt wrote:
> 
> The case where things get a bit more nasty is when you try to use MMIO
> for low latency small-data type transfers instead of DMA, in which case
> you do want the ability for the chipset to write-combine and control the
> barriers more precisely.
> 
> However, this is hard and Linux doesn't provide very good accessors to
> do so, thus you need to be extra careful (see my example about wmb()
> 
> In the case of the iomap "copy" operations, my problem is that they
> don't properly advertise their lack of ordering since normal iomap does
> have full ordering.
> 
> I believe they should provide ordering with a barrier before & a barrier
> after, eventually with _relaxed variants or _raw variants for those who
> "know what they are doing".

Until then I think we need to have the logic working right on ppc and
measure if blue flame buys us any improvement in ppc. If that's not
the case (e.g because write combining is not working), then maybe we
should avoid using blueflame in ppc.
Could any of the guys from IBM check this and give us feedback?
> 
> Maybe it's time for us to revive those discussions about providing a
> good set of relaxed MMIO accessors with explicit barriers :-)
> 
> Cheers,
> Ben.
>  

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10  9:01 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev, Yevgeny Petrilin, Eli Cohen, David Laight,
	Thadeu Lima de Souza Cascardo, linuxppc-dev
In-Reply-To: <20111010084726.GM2681@mtldesk30>

On Mon, 2011-10-10 at 10:47 +0200, Eli Cohen wrote:
> On Mon, Oct 10, 2011 at 09:40:17AM +0100, David Laight wrote:
> > 
> > Actually memory barriers shouldn't really be added to
> > any of these 'accessor' functions.
> > (Or, at least, ones without barriers should be provided.)
> > 
> > The driver may want to to a series of writes, then a
> > single barrier, before a final write of a command (etc).
> > 
> > in_le32() from io.h is specially horrid!
> > 
> > 	David
> > 
> The driver would like to control if and when we want to put a memory
> barrier. We really don't want it to be done under the hood. In this
> respect we prefer raw functions which are still available to all
> platforms.

 ... but not necessarily the corresponding barriers.

That's why on powerpc we had to make all rmb,wmb and mb the same, aka a
full sync, because our weaker barriers don't order cachable vs.
non-cachable.

In any case, the raw functions are a bit nasty to use because they both
don't have barriers -and- don't handle endianness. So you have to be
extra careful.

In 90% of the cases, the barriers are what you want anyway. For example
in the else case of the driver, the doorbell MMIO typically wants it, so
using writel() is fine (or iowrite32be) and will have the necessary
barriers.

The case where things get a bit more nasty is when you try to use MMIO
for low latency small-data type transfers instead of DMA, in which case
you do want the ability for the chipset to write-combine and control the
barriers more precisely.

However, this is hard and Linux doesn't provide very good accessors to
do so, thus you need to be extra careful (see my example about wmb()

In the case of the iomap "copy" operations, my problem is that they
don't properly advertise their lack of ordering since normal iomap does
have full ordering.

I believe they should provide ordering with a barrier before & a barrier
after, eventually with _relaxed variants or _raw variants for those who
"know what they are doing".

Maybe it's time for us to revive those discussions about providing a
good set of relaxed MMIO accessors with explicit barriers :-)

Cheers,
Ben.
 

^ permalink raw reply

* RE: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10  8:53 UTC (permalink / raw)
  To: David Laight
  Cc: netdev, linuxppc-dev, Yevgeny Petrilin, Eli Cohen,
	Thadeu Lima de Souza Cascardo, Eli Cohen
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AE78@saturn3.aculab.com>

On Mon, 2011-10-10 at 09:40 +0100, David Laight wrote:
> > What is this __iowrite64_copy... oh I see
> > 
> > Nice, somebody _AGAIN_ added a bunch of "generic" IO 
> > accessors that are utterly wrong on all archs except
> > x86 (ok, -almost-).
> > There isn't a single bloody memory barrier in there !
> 
> Actually memory barriers shouldn't really be added to
> any of these 'accessor' functions.
> (Or, at least, ones without barriers should be provided.)

As long as they are documented to provide no guarantee of ordering
between the stores... And x86 driver writers have any clue that they
will not be ordered vs. surrounding accesses.

> The driver may want to to a series of writes, then a
> single barrier, before a final write of a command (etc).
> 
> in_le32() from io.h is specially horrid!

The reason for that is that drivers expect fully ordered writel() vs
everything (including DMA).

Unfortunately, this is how Linux defines those semantics. I would much
prefer to require barriers explicitly but the decision was made back
then simply because the vast majority of driver writers do not
understand weakly ordered memory models and "everything should be made
to look like x86".

It would be great to come up with a set of more relaxed accessors along
with the appropriate barrier to use for drivers who "know better" but so
far all attempts at doing so have failed due to the inability to agree
on their precise semantics. Tho that was a while ago, we should probably
give it a new shot.

Cheers,
Ben. 

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-10  8:47 UTC (permalink / raw)
  To: David Laight
  Cc: Yevgeny Petrilin, Eli Cohen, Thadeu Lima de Souza Cascardo,
	netdev, linuxppc-dev
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AE78@saturn3.aculab.com>

On Mon, Oct 10, 2011 at 09:40:17AM +0100, David Laight wrote:
> 
> Actually memory barriers shouldn't really be added to
> any of these 'accessor' functions.
> (Or, at least, ones without barriers should be provided.)
> 
> The driver may want to to a series of writes, then a
> single barrier, before a final write of a command (etc).
> 
> in_le32() from io.h is specially horrid!
> 
> 	David
> 
The driver would like to control if and when we want to put a memory
barrier. We really don't want it to be done under the hood. In this
respect we prefer raw functions which are still available to all
platforms.

^ permalink raw reply

* RE: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: David Laight @ 2011-10-10  8:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Eli Cohen
  Cc: netdev, Yevgeny Petrilin, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Eli Cohen
In-Reply-To: <1318145118.29415.371.camel@pasglop>

=20
> What is this __iowrite64_copy... oh I see
>
> Nice, somebody _AGAIN_ added a bunch of "generic" IO
> accessors that are utterly wrong on all archs except
> x86 (ok, -almost-).
> There isn't a single bloody memory barrier in there !

Actually memory barriers shouldn't really be added to
any of these 'accessor' functions.
(Or, at least, ones without barriers should be provided.)

The driver may want to do a series of writes, then a
single barrier, before a final write of a command (etc).

in_le32() from io.h is specially horrid!

	David

^ permalink raw reply

* RE: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10  8:29 UTC (permalink / raw)
  To: David Laight
  Cc: netdev, linuxppc-dev, Yevgeny Petrilin, Eli Cohen,
	Thadeu Lima de Souza Cascardo, Eli Cohen
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AE77@saturn3.aculab.com>

On Mon, 2011-10-10 at 09:20 +0100, David Laight wrote:
> 
> For the above I'd actually suggest making 'doorbell_qpn' have the
> correct endianness in order to avoid the (potential) swap every
> time it is set.

Well, the problem is that either you'll end up swapping on x86 or you'll
end up swapping on ppc, there is no "native" MMIO accessor that allow
you to do a no-swap access whatever the arch you are on. Or rather,
there is the __raw_ one but you shouldn't use it for most things :-)
(Because it also doesn't have the right memory barriers).

So I'd rather they do it right using the simpler method, the cost of
swap is going to be negligible, probably not even measurable, and if and
only if they think they can improve on that in a second step, then
consider doing otherwise with appropriate measurements showing a
significant difference.

> You also need to treble-check the required endianness for the
> 'vlan_tag' in the tx descriptor. What would be needed is the
> MAC PCI slave were on an x86 (LE) system.

Cheers,
Ben.

^ permalink raw reply

* RE: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: David Laight @ 2011-10-10  8:20 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Eli Cohen
  Cc: netdev, Yevgeny Petrilin, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Eli Cohen
In-Reply-To: <1318153939.29415.401.camel@pasglop>

=20
> Then, this statement:
>
> *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;

...
> instead do ... :

> 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
>
> (Also get rid of that cast and define vlan_tag as a __be32 to start
> with).

Agreed, casts that change the type of memory - *(foo *)&xxx - are
generally bad news unless you are casting a generic 'buffer' to
a specific structure.
I've seen far too much code that ends up depending on the
endianness and system word size.

For the above I'd actually suggest making 'doorbell_qpn' have the
correct endianness in order to avoid the (potential) swap every
time it is set.

You also need to treble-check the required endianness for the
'vlan_tag' in the tx descriptor. What would be needed if the
MAC PCI slave were on an x86 (LE) system?

	David

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-10  7:32 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <20111009103020.GL2681@mtldesk30>

On Sun, 2011-10-09 at 12:30 +0200, Eli Cohen wrote:

> > Ideally you want to avoid that swapping altogether and use the right
> > accessor that indicates that your register is BE to start with. IE.
> > remove the swab32 completely and then use something like 
> > iowrite32be() instead of writel().
> I agree, this looks better but does it work on memory mapped io or
> only on io pci space? All our registers are memory mapped...

The iomap functions work on both.

> > Basically, the problem you have is that writel() has an implicit "write
> > to LE register" semantic. Your register is BE. the "iomap" variants
> > provide you with more fine grained "be" variants to use in that case.
> > There's also writel_be() but that one doesn't exist on every
> > architecture afaik.
> So writel_be is the function I should use for memory mapped io? If it
> does not exist for all platforms it's a pitty :-(

Just use the iomap variant. Usually you also use pci_iomap() instead of
ioremap() but afaik, for straight MMIO, it works with normal ioremap as
well.

> > Now, once the mmio problem is out of the way, let's look back at how you
> > then use that qpn.
> > 
> > With the current code, you've generated something in memory which is
> > byte reversed, so essentially "LE" on ppc and "BE" on x86.
> > 
> > Then, this statement:
> > 
> > *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
> > 
> > Will essentially write it out as-is in memory for use by the chip. The chip,
> > from what you say, expects BE, so this will be broken on PPC.
> I see. So this field is layed in le for ppc and the rest of the
> descriptor is be. so I assum that __iowrite64_copy() does not swap
> anything but we still have tx_desc->ctrl.vlan_tag in the wrong
> endianess.

Yes because you had swapped it initially. IE your original swab32 is
what busts it for you on ppc.

> > Here too, the right solution is to instead not do that swab32 to begin
> > with (ring->doorbell_qpn remains a native endian value) and instead do,
> > in addition to the above mentioned change to the writel:
> > 
> > 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
> > 
> > (Also get rid of that cast and define vlan_tag as a __be32 to start
> > with).
> > 
> > Cheers,
> > Ben.
> 
> Thanks for your review. I will send another patch which should fix the
> deficiencies.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-09 10:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <1318153939.29415.401.camel@pasglop>

On Sun, Oct 09, 2011 at 11:52:19AM +0200, Benjamin Herrenschmidt wrote:
> 
> > > To go back to the driver code, the statements that ring a "bell" are:
> > > 
> > > 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
> > > 
> > > This doesn't look right unless "doorbell_qpn" itself is already somewhat
> > > in the appropriate byte order.
> 
> > This is something that supports my claim that the chipset swaps
> > endianess in ppc.
> 
> No the chipset doesn't swap
> 
> > Look at mlx4_en_activate_tx_ring():
> > ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
> 
> That looks gross, I think somebody writing this driver doesn't
> understand endianness.
> 
> > so in LE machines it layed as big endian in memory while in BE machines
> > it is layed as little endian in memory.
> 
> Yes which is very odd, it should be layed out the same in memory
> regardless of the machine. However in this case, this isn't accessed
> directly via DMA (this field at least), so what you appear to be doing
> here is to artificially "undo" what writel does later on (see below).
> 
> > Then we write this value to the device registers which must get it in
> > big endian otherwise it won't work - and we know this works in both
> > ppc and x86. You can ignore the case of blue flame:
> 
> Well it works because your device is odd and has BE registers :-) It's
> however not the right way to do and it's broken in your blue flame case
> (for what is now obvious reasons, see below).
> 
> What's happening basically here is that you are swapping once in
> swab32 , store that swapped value, then writel will re-swap on power and
> not swap on x86 (because the writel accessor performs swapping). So on
> x86 you basically do LE -> BE and on ppc you do BE -> LE -> BE :-)
> Pretty inefficient.
> 
> None of this has anything to do with the chipset which doesn't swap
> anything behind your back.
> 
> Ideally you want to avoid that swapping altogether and use the right
> accessor that indicates that your register is BE to start with. IE.
> remove the swab32 completely and then use something like 
> iowrite32be() instead of writel().
I agree, this looks better but does it work on memory mapped io or
only on io pci space? All our registers are memory mapped...

> 
> Basically, the problem you have is that writel() has an implicit "write
> to LE register" semantic. Your register is BE. the "iomap" variants
> provide you with more fine grained "be" variants to use in that case.
> There's also writel_be() but that one doesn't exist on every
> architecture afaik.
So writel_be is the function I should use for memory mapped io? If it
does not exist for all platforms it's a pity :-(
> 
> Now, once the mmio problem is out of the way, let's look back at how you
> then use that qpn.
> 
> With the current code, you've generated something in memory which is
> byte reversed, so essentially "LE" on ppc and "BE" on x86.
> 
> Then, this statement:
> 
> *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
> 
> Will essentially write it out as-is in memory for use by the chip. The chip,
> from what you say, expects BE, so this will be broken on PPC.
I see. So this field is laid out in LE for ppc and the rest of the
descriptor is BE. So I assume that __iowrite64_copy() does not swap
anything but we still have tx_desc->ctrl.vlan_tag in the wrong
endianness.

> 
> Here too, the right solution is to instead not do that swab32 to begin
> with (ring->doorbell_qpn remains a native endian value) and instead do,
> in addition to the above mentioned change to the writel:
> 
> 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
> 
> (Also get rid of that cast and define vlan_tag as a __be32 to start
> with).
> 
> Cheers,
> Ben.

Thanks for your review. I will send another patch which should fix the
deficiencies.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-09  9:52 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <20111009092102.GK2681@mtldesk30>


> > To go back to the driver code, the statements that ring a "bell" are:
> > 
> > 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
> > 
> > This doesn't look right unless "doorbell_qpn" itself is already somewhat
> > in the appropriate byte order.

> This is something that supports my claim that the chipset swaps
> endianess in ppc.

No the chipset doesn't swap

> Look at mlx4_en_activate_tx_ring():
> ring->doorbell_qpn = swab32(ring->qp.qpn << 8);

That looks gross, I think somebody writing this driver doesn't
understand endianness.

> so in LE machines it layed as big endian in memory while in BE machines
> it is layed as little endian in memory.

Yes which is very odd, it should be laid out the same in memory
regardless of the machine. However in this case, this isn't accessed
directly via DMA (this field at least), so what you appear to be doing
here is to artificially "undo" what writel does later on (see below).

> Then we write this value to the device registers which must get it in
> big endian otherwise it won't work - and we know this works in both
> ppc and x86. You can ignore the case of blue flame:

Well it works because your device is odd and has BE registers :-) It's
however not the right way to do and it's broken in your blue flame case
(for what is now obvious reasons, see below).

What's happening basically here is that you are swapping once in
swab32 , store that swapped value, then writel will re-swap on power and
not swap on x86 (because the writel accessor performs swapping). So on
x86 you basically do LE -> BE and on ppc you do BE -> LE -> BE :-)
Pretty inefficient.

None of this has anything to do with the chipset which doesn't swap
anything behind your back.

Ideally you want to avoid that swapping altogether and use the right
accessor that indicates that your register is BE to start with. IE.
remove the swab32 completely and then use something like 
iowrite32be() instead of writel().

Basically, the problem you have is that writel() has an implicit "write
to LE register" semantic. Your register is BE. the "iomap" variants
provide you with more fine grained "be" variants to use in that case.
There's also writel_be() but that one doesn't exist on every
architecture afaik.

Now, once the mmio problem is out of the way, let's look back at how you
then use that qpn.

With the current code, you've generated something in memory which is
byte reversed, so essentially "LE" on ppc and "BE" on x86.

Then, this statement:

*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;

Will essentially write it out as-is in memory for use by the chip. The chip,
from what you say, expects BE, so this will be broken on PPC.

Here too, the right solution is to instead not do that swab32 to begin
with (ring->doorbell_qpn remains a native endian value) and instead do,
in addition to the above mentioned change to the writel:

	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);

(Also get rid of that cast and define vlan_tag as a __be32 to start
with).

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-09  9:21 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <1318149536.29415.384.camel@pasglop>

On Sun, Oct 09, 2011 at 10:38:56AM +0200, Benjamin Herrenschmidt wrote:
> On Sun, 2011-10-09 at 10:07 +0200, Eli Cohen wrote:
> 
> > > Well, first, what do you mean by "swapped" ? :-) But no, it won't for
> > > all intend and purpose, this is a copy routine, copy routines never
> > > swap, neither do fifo accesses for example.
> > When I say swapped, I mean not necessairliy by software. I think that
> > the chipset will swap the the data. The reason I think so is that the
> > CPU arch is big endian, while PCI bus is defined as little endian.
> > That's why I think a swap will occur in ppc and not in x86.
> 
> No it won't "swap the data". The wiring between PCI and the CPU bus is
> done in a way called "byte address invariant", and there is some kind of
> flip of byte lanes related essentially to ensure that, but for all
> intend and purpose it's transparent.
> 
> > It's a special descriptor that resides both in memory and also written
> > to the device's register. An it contains both data and control
> > informartion.
> 
> Data should not be swapped then. Only individual bits of control
> information. In any case, the buffer should be generated in the right
> format in memory to start with. The copy routine doesn't need to swap.
> 
> To go back to the driver code, the statements that ring a "bell" are:
> 
> 	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
> 
> This doesn't look right unless "doorbell_qpn" itself is already somewhat
> in the appropriate byte order.
This is something that supports my claim that the chipset swaps
endianness on ppc.
Look at mlx4_en_activate_tx_ring():
ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
so on LE machines it is laid out as big endian in memory, while on BE
machines it is laid out as little endian in memory.
Then we write this value to the device registers which must get it in
big endian otherwise it won't work - and we know this works in both
ppc and x86. You can ignore the case of blue flame:

        } else if (nreq) {
                qp->sq.head += nreq;

                /*
                 * Make sure that descriptors are written before
                 * doorbell record.
                 */
                wmb();

                writel(qp->doorbell_qpn, qp->bf.uar->map +
MLX4_SEND_DOORBELL); <==  remember that it is layed in little endian
but the device must get it in big endian.

                /*
                 * Make sure doorbells don't leak out of SQ spinlock
                 * and reach the HCA out of order.
                 */
                mmiowb();

        }




> 
> Is that vlan_tag a big or little endian quantity ? Either way, this
> looks broken in either x86 or ppc unless doorbell_qpn itself is already
> in the right endian.
> 
> But since later I see
> 
> 	writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
> 
> That rings a nasty bell, it looks like doorbell_pqn is in CPU (native)
> endian, so it should have to be swapped before being OR'ed into the
> descriptor, either that or the HW does some black magic I don't
> understand.
> 
> Cheers,
> Ben.
> 

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-09  8:38 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <20111009080747.GJ2681@mtldesk30>

On Sun, 2011-10-09 at 10:07 +0200, Eli Cohen wrote:

> > Well, first, what do you mean by "swapped" ? :-) But no, it won't for
> > all intend and purpose, this is a copy routine, copy routines never
> > swap, neither do fifo accesses for example.
> When I say swapped, I mean not necessairliy by software. I think that
> the chipset will swap the the data. The reason I think so is that the
> CPU arch is big endian, while PCI bus is defined as little endian.
> That's why I think a swap will occur in ppc and not in x86.

No, it won't "swap the data". The wiring between PCI and the CPU bus is
done in a way called "byte address invariant", and there is some kind of
flip of byte lanes related essentially to ensure that, but for all
intents and purposes it's transparent.

> It's a special descriptor that resides both in memory and also written
> to the device's register. An it contains both data and control
> informartion.

Data should not be swapped then. Only individual bits of control
information. In any case, the buffer should be generated in the right
format in memory to start with. The copy routine doesn't need to swap.

To go back to the driver code, the statements that ring a "bell" are:

	*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;

This doesn't look right unless "doorbell_qpn" itself is already somewhat
in the appropriate byte order.

Is that vlan_tag a big or little endian quantity ? Either way, this
looks broken in either x86 or ppc unless doorbell_qpn itself is already
in the right endian.

But since later I see

	writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);

That rings a nasty bell, it looks like doorbell_qpn is in CPU (native)
endian, so it should have to be swapped before being OR'ed into the
descriptor, either that or the HW does some black magic I don't
understand.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-09  8:07 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <1318147254.29415.377.camel@pasglop>

On Sun, Oct 09, 2011 at 10:00:54AM +0200, Benjamin Herrenschmidt wrote:
> On Sun, 2011-10-09 at 09:35 +0200, Eli Cohen wrote:
> > On Sun, Oct 09, 2011 at 09:25:18AM +0200, Benjamin Herrenschmidt wrote:
> > > On Thu, 2011-10-06 at 15:57 +0200, Eli Cohen wrote:
> > > > On Wed, Oct 05, 2011 at 10:15:02AM +0200, Eli Cohen wrote:
> > > > 
> > > > How about this patch - can you give it a try?
> > > > 
> > > > 
> > > > >From dee60547aa9e35a02835451d9e694cd80dd3072f Mon Sep 17 00:00:00 2001
> > > > From: Eli Cohen <eli@mellanox.co.il>
> > > > Date: Thu, 6 Oct 2011 15:50:02 +0200
> > > > Subject: [PATCH] mlx4_en: Fix blue flame on powerpc
> > > > 
> > > > The source buffer used for copying into the blue flame register is already in
> > > > big endian. However, when copying to device on powerpc, the endianess is
> > > > swapped so the data reaches th device in little endian which is wrong. On x86
> > > > based platform no swapping occurs so it reaches the device with the correct
> > > > endianess. Fix this by calling le32_to_cpu() on the buffer. On LE systems there
> > > > is no change; on BE there will be a swap.
> > > 
> > > That looks wrong.
> > Not sure I understand: are you saying that on ppc, when you call
> > __iowrite64_copy, it will not reach the device swapped?
> 
> Well, first, what do you mean by "swapped" ? :-) But no, it won't for
> all intend and purpose, this is a copy routine, copy routines never
> swap, neither do fifo accesses for example.
When I say swapped, I mean not necessarily by software. I think that
the chipset will swap the data. The reason I think so is that the
CPU arch is big endian, while the PCI bus is defined as little endian.
That's why I think a swap will occur on ppc and not on x86.
> 
> > The point is that we must always have the buffer ready in big endian
> > in memory. In the case of blue flame, we must also copy it to the
> > device registers in pci memory space. So if we use the buffer we
> > already prepared, we must have another swap. I can think of a nicer
> > way to implement this functionality but the question is do you think
> > my observation above is wrong and why.
> 
> No. If it's in memory BE then the copy routine will keep it BE. A copy
> routine doesn't swap and doesn't affect endianness.
> 
> Additionally, a swapping phase like you proposed doing 32-bit swaps
> means that you know for sure that the buffer is made of 32-bit
> quantities, is that the case ?
Yes

> Even if you had needed that swap, if your
> buffer had contained 16-bit or 64-bit quantities, you're toast.
> 
> What is this buffer anyway ? A descriptor or a network packet ?
It's a special descriptor that both resides in memory and is also written
to the device's register. And it contains both data and control
information.
> 
> If it's a packet, then it's data, endianness has no meaning (or rather
> it has for individual fields of the packets but they are already in the
> right format and a 32-bit swap will never be right).
>  
> It's almost never right to perform swapping when copying data (or
> reading/writing a FIFO).
> 
> Cheers,
> Ben.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Benjamin Herrenschmidt @ 2011-10-09  8:00 UTC (permalink / raw)
  To: Eli Cohen
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <20111009073546.GI2681@mtldesk30>

On Sun, 2011-10-09 at 09:35 +0200, Eli Cohen wrote:
> On Sun, Oct 09, 2011 at 09:25:18AM +0200, Benjamin Herrenschmidt wrote:
> > On Thu, 2011-10-06 at 15:57 +0200, Eli Cohen wrote:
> > > On Wed, Oct 05, 2011 at 10:15:02AM +0200, Eli Cohen wrote:
> > > 
> > > How about this patch - can you give it a try?
> > > 
> > > 
> > > >From dee60547aa9e35a02835451d9e694cd80dd3072f Mon Sep 17 00:00:00 2001
> > > From: Eli Cohen <eli@mellanox.co.il>
> > > Date: Thu, 6 Oct 2011 15:50:02 +0200
> > > Subject: [PATCH] mlx4_en: Fix blue flame on powerpc
> > > 
> > > The source buffer used for copying into the blue flame register is already in
> > > big endian. However, when copying to device on powerpc, the endianess is
> > > swapped so the data reaches th device in little endian which is wrong. On x86
> > > based platform no swapping occurs so it reaches the device with the correct
> > > endianess. Fix this by calling le32_to_cpu() on the buffer. On LE systems there
> > > is no change; on BE there will be a swap.
> > 
> > That looks wrong.
> Not sure I understand: are you saying that on ppc, when you call
> __iowrite64_copy, it will not reach the device swapped?

Well, first, what do you mean by "swapped"? :-) But no, it won't; for
all intents and purposes, this is a copy routine, and copy routines never
swap, neither do FIFO accesses for example.

> The point is that we must always have the buffer ready in big endian
> in memory. In the case of blue flame, we must also copy it to the
> device registers in pci memory space. So if we use the buffer we
> already prepared, we must have another swap. I can think of a nicer
> way to implement this functionality but the question is do you think
> my observation above is wrong and why.

No. If it's in memory BE then the copy routine will keep it BE. A copy
routine doesn't swap and doesn't affect endianness.

Additionally, a swapping phase like you proposed doing 32-bit swaps
means that you know for sure that the buffer is made of 32-bit
quantities, is that the case ? Even if you had needed that swap, if your
buffer had contained 16-bit or 64-bit quantities, you're toast.

What is this buffer anyway ? A descriptor or a network packet ?

If it's a packet, then it's data, endianness has no meaning (or rather
it has for individual fields of the packets but they are already in the
right format and a 32-bit swap will never be right).
 
It's almost never right to perform swapping when copying data (or
reading/writing a FIFO).

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] mlx4_en: fix transmit of packages when blue frame is enabled
From: Eli Cohen @ 2011-10-09  7:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: netdev@vger.kernel.org, Eli Cohen, linuxppc-dev,
	Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
In-Reply-To: <1318145118.29415.371.camel@pasglop>

On Sun, Oct 09, 2011 at 09:25:18AM +0200, Benjamin Herrenschmidt wrote:
> On Thu, 2011-10-06 at 15:57 +0200, Eli Cohen wrote:
> > On Wed, Oct 05, 2011 at 10:15:02AM +0200, Eli Cohen wrote:
> > 
> > How about this patch - can you give it a try?
> > 
> > 
> > >From dee60547aa9e35a02835451d9e694cd80dd3072f Mon Sep 17 00:00:00 2001
> > From: Eli Cohen <eli@mellanox.co.il>
> > Date: Thu, 6 Oct 2011 15:50:02 +0200
> > Subject: [PATCH] mlx4_en: Fix blue flame on powerpc
> > 
> > The source buffer used for copying into the blue flame register is already in
> > big endian. However, when copying to device on powerpc, the endianess is
> > swapped so the data reaches th device in little endian which is wrong. On x86
> > based platform no swapping occurs so it reaches the device with the correct
> > endianess. Fix this by calling le32_to_cpu() on the buffer. On LE systems there
> > is no change; on BE there will be a swap.
> 
> That looks wrong.
Not sure I understand: are you saying that on ppc, when you call
__iowrite64_copy, it will not reach the device swapped?

The point is that we must always have the buffer ready in big endian
in memory. In the case of blue flame, we must also copy it to the
device registers in pci memory space. So if we use the buffer we
already prepared, we must have another swap. I can think of a nicer
way to implement this functionality but the question is do you think
my observation above is wrong and why.

> 
> What is this __iowrite64_copy... oh I see
> 
> Nice, somebody _AGAIN_ added a bunch of "generic" IO accessors that are
> utterly wrong on all archs except x86 (ok, -almost-). There isn't a
> single bloody memory barrier in there !
> 
> So, __iowrite64_copy is doing raw_writel which will -not- swap, so your
> buffer is going to have the same endianness in the destination than it
> has in the source. This is _NOT_ the right place to do a swap.
> 
> It's the original construction of the descriptor that needs change. The
> data itself should never need to be affected accross a copy operation
> (unless your HW is terminally busted).
> 
> Cheers,
> Ben.
> 
> > Signed-off-by: Eli Cohen <eli@mellanox.co.il>
> > ---
> >  drivers/net/mlx4/en_tx.c |   10 ++++++++++
> >  1 files changed, 10 insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
> > index 16337fb..3743acc 100644
> > --- a/drivers/net/mlx4/en_tx.c
> > +++ b/drivers/net/mlx4/en_tx.c
> > @@ -601,6 +601,16 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb)
> >  
> >  static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
> >  {
> > +	int i;
> > +	__le32 *psrc = (__le32 *)src;
> > +
> > +	/*
> > +	 * the buffer is already in big endian. For little endian machines that's
> > +	 * fine. For big endain machines we must swap since the chipset swaps again
> > +	 */
> > +	for (i = 0; i < bytecnt / 4; ++i)
> > +		psrc[i] = le32_to_cpu(psrc[i]);
> > +
> >  	__iowrite64_copy(dst, src, bytecnt / 8);
> >  }
> >  
> > -- 
> > 1.7.7.rc0.70.g82660
> > 
> > 
> > 
> > > On Tue, Oct 04, 2011 at 05:26:20PM -0300, Thadeu Lima de Souza Cascardo wrote:
> > > 
> > > I believe we have an endianess problem here. The source buffer is in
> > > big endian - in x86 archs, it will rich the pci device unswapped since
> > > both x86 and pci are little endian. In ppc, it wil be swapped by the
> > > chipset so it will reach the device in little endian which is wrong.
> > > So, in mlx4_bf_copy, you could loop over the buffer and swap32 the all
> > > the dwords before the call to __iowrite64_copy. Of course which should
> > > fix this in an arch independent manner. Let me know this works for
> > > you.
> > > 
> > > > On Tue, Oct 04, 2011 at 08:02:12AM +0200, Benjamin Herrenschmidt wrote:
> > > > > On Mon, 2011-10-03 at 17:53 -0300, Thadeu Lima de Souza Cascardo wrote:
> > > > > 
> > > > >  .../...
> > > > > 
> > > > > > > Can you also send me the output of ethtool -i?
> > > > > > > It seems that there is a problem with write combining on Power processors, we will check this issue.
> > > > > > > 
> > > > > > > Yevgeny
> > > > > > 
> > > > > > Hello, Yevgeny.
> > > > > > 
> > > > > > You will find the output of ethtool -i below.
> > > > > > 
> > > > > > I am copying Ben and powerpc list, in case this is an issue with Power
> > > > > > processors. They can provide us some more insight into this.
> > > > > 
> > > > > May I get some background please ? :-)
> > > > > 
> > > > > I'm not aware of a specific issue with write combining but I'd need to
> > > > > know more about what you are doing and the code to do it to comment on
> > > > > whether it should work or not.
> > > > > 
> > > > > Cheers,
> > > > > Ben.
> > > > > 
> > > > > 
> > > > 
> > > > Hello, Ben.
> > > > 
> > > > Sorry for that. I am testing mlx4_en driver on a POWER. Yevgeny has
> > > > added blue frame support, that does not require writing to the device
> > > > memory to indicate a new packet (the doorbell register as it is called).
> > > > 
> > > > Well, the ring is getting full with no interrupt or packets transmitted.
> > > > I simply added a write to the doorbell register and it works for me.
> > > > Yevgeny says this is not the right fix, claiming there is a problem with
> > > > write combining on POWER. The code uses memory barriers, so I don't know
> > > > why there is any problem.
> > > > 
> > > > I am posting the code here to show better what the situation is.
> > > > Yevgeny can tell more about the device and the driver.
> > > > 
> > > > The code below is the driver as of now, including a diff with what I
> > > > changed and had resulted OK for me. Before the blue frame support, the
> > > > only code executed was the else part. I can't tell much what the device
> > > > should be seeing and doing after the blue frame part of the code is
> > > > executed. But it does send the packet if I write to the doorbell
> > > > register.
> > > > 
> > > > Yevgeny, can you tell us what the device should be doing and why you
> > > > think this is a problem on POWER? Is it possible that this is simply a
> > > > problem with the firmware version?
> > > > 
> > > > Thanks,
> > > > Cascardo.
> > > > 
> > > > ---
> > > >         if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
> > > > !vlan_tag) {
> > > >                 *(u32 *) (&tx_desc->ctrl.vlan_tag) |=
> > > > ring->doorbell_qpn;
> > > >                 op_own |= htonl((bf_index & 0xffff) << 8);
> > > >                 /* Ensure new descirptor hits memory
> > > >                 * before setting ownership of this descriptor to HW */
> > > >                 wmb();
> > > >                 tx_desc->ctrl.owner_opcode = op_own;
> > > > 
> > > >                 wmb();
> > > > 
> > > >                 mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned
> > > > long *) &tx_desc->ctrl,
> > > >                      desc_size);
> > > > 
> > > >                 wmb();
> > > > 
> > > >                 ring->bf.offset ^= ring->bf.buf_size;
> > > >         } else {
> > > >                 /* Ensure new descirptor hits memory
> > > >                 * before setting ownership of this descriptor to HW */
> > > >                 wmb();
> > > >                 tx_desc->ctrl.owner_opcode = op_own;
> > > > -               wmb();
> > > > -               writel(ring->doorbell_qpn, ring->bf.uar->map +
> > > > MLX4_SEND_DOORBELL);
> > > >         }
> > > > 
> > > > +       wmb();
> > > > +       writel(ring->doorbell_qpn, ring->bf.uar->map +
> > > > MLX4_SEND_DOORBELL);
> > > > +
> > > > ---
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox