LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

Add APIs for setting a wakeup source and lossless Ethernet in low power modes.
These APIs can be used by the wake-on-packet feature.

Signed-off-by: Dave Liu <daveliu@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
Changes for v2:
 - rename functions
 - add pmc_flag

 arch/powerpc/sysdev/fsl_pmc.c |   72 ++++++++++++++++++++++++++++++++++++++++-
 arch/powerpc/sysdev/fsl_soc.h |    9 +++++
 2 files changed, 80 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c
index d6c65a7..6a2f8b4 100644
--- a/arch/powerpc/sysdev/fsl_pmc.c
+++ b/arch/powerpc/sysdev/fsl_pmc.c
@@ -40,13 +40,83 @@ static unsigned int pmc_flag;
 
 #define PMC_SLEEP	0x1
 #define PMC_DEEP_SLEEP	0x2
+#define PMC_LOSSLESS	0x4
 
 #define POWMGTCSR_SLP_MASK	0x00020000
+#define POWMGTCSR_LOSSLESS_MASK	0x00400000
 
 /* Cast the ccsrbar to 64-bit parameter so that the assembly
  * code can be compatible with both 32-bit & 36-bit */
 extern void mpc85xx_enter_deep_sleep(u64 ccsrbar);
 
+#ifdef CONFIG_FSL_PMC
+/**
+ * mpc85xx_pmc_set_wake - enable OF device as wakeup event source
+ * @pdev: platform device affected
+ * @enable: True to enable event generation; false to disable
+ *
+ * This enables the device as a wakeup event source, or disables it.
+ *
+ * RETURN VALUE:
+ * 0 is returned on success
+ * -ENODEV is returned if the PMC is unavailable
+ * -EINVAL is returned if enabling a device that may not wake up the system,
+ * or if its "clk-handle" node or the "fsl,pmcdr-mask" property is missing
+ */
+int mpc85xx_pmc_set_wake(struct platform_device *pdev, bool enable)
+{
+	int ret = 0;
+	struct device_node *clk_np;
+	u32 pmcdr_mask;
+
+	if (!pmc_regs) {
+		pr_err("%s: PMC is unavailable\n", __func__);
+		return -ENODEV;
+	}
+
+	if (enable && !device_may_wakeup(&pdev->dev))
+		return -EINVAL;
+
+	clk_np = of_parse_phandle(pdev->dev.of_node, "clk-handle", 0);
+	if (!clk_np)
+		return -EINVAL;
+
+	if (of_property_read_u32(clk_np, "fsl,pmcdr-mask", &pmcdr_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (enable)
+		/* clear to enable clock in low power mode */
+		clrbits32(&pmc_regs->pmcdr, pmcdr_mask);
+	else
+		setbits32(&pmc_regs->pmcdr, pmcdr_mask);
+
+out:
+	of_node_put(clk_np);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mpc85xx_pmc_set_wake);
+
+/**
+ * mpc85xx_pmc_set_lossless_ethernet - enable lossless ethernet
+ * in (deep) sleep mode
+ * @enable: nonzero to enable lossless ethernet; zero to disable
+ */
+void mpc85xx_pmc_set_lossless_ethernet(int enable)
+{
+	if (pmc_flag & PMC_LOSSLESS) {
+		if (enable)
+			setbits32(&pmc_regs->powmgtcsr,
+					POWMGTCSR_LOSSLESS_MASK);
+		else
+			clrbits32(&pmc_regs->powmgtcsr,
+					POWMGTCSR_LOSSLESS_MASK);
+	}
+}
+EXPORT_SYMBOL_GPL(mpc85xx_pmc_set_lossless_ethernet);
+#endif
+
 static int pmc_suspend_enter(suspend_state_t state)
 {
 	int ret = 0;
@@ -120,7 +190,7 @@ static int pmc_probe(struct platform_device *pdev)
 		pmc_flag |= PMC_DEEP_SLEEP;
 
 	if (of_device_is_compatible(np, "fsl,p1022-pmc"))
-		pmc_flag |= PMC_DEEP_SLEEP;
+		pmc_flag |= PMC_DEEP_SLEEP | PMC_LOSSLESS;
 
 	suspend_set_ops(&pmc_suspend_ops);
 
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index c6d0073..3422b0d 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #include <asm/mmu.h>
+#include <linux/platform_device.h>
 
 struct spi_device;
 
@@ -21,6 +22,14 @@ struct device_node;
 
 extern void fsl_rstcr_restart(char *cmd);
 
+#ifdef CONFIG_FSL_PMC
+int mpc85xx_pmc_set_wake(struct platform_device *pdev, bool enable);
+void mpc85xx_pmc_set_lossless_ethernet(int enable);
+#else
+#define mpc85xx_pmc_set_wake(pdev, enable)
+#define mpc85xx_pmc_set_lossless_ethernet(enable)
+#endif
+
 #if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
 
 /* The different ports that the DIU can be connected to */
-- 
1.6.4.1

^ permalink raw reply related

* [PATCH v2 2/7] powerpc/85xx: add HOTPLUG_CPU support
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

From: Li Yang <leoli@freescale.com>

Add support to disable and re-enable individual cores at runtime
on MPC85xx/QorIQ SMP machines. Currently the e500v2 core is supported.

MPC85xx machines use ePAPR spin-table in boot page for CPU kick-off.
This patch uses the boot page from the bootloader to boot a core at runtime.
It supports 32-bit and 36-bit physical addresses.

Add generic_set_cpu_up() to set cpu_state as CPU_UP_PREPARE in kick_cpu().

Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
Changes for v2:
 - fix a sync issue by generic_set_cpu_up()
 - put the dying core in nap state
 - remove smp_85xx_unmap_bootpg() and smp_85xx_reset_core()
 - use mpic_reset_core() to reset core

 arch/powerpc/Kconfig                 |    5 +-
 arch/powerpc/include/asm/smp.h       |    2 +
 arch/powerpc/kernel/head_fsl_booke.S |   28 ++++++
 arch/powerpc/kernel/smp.c            |   10 ++
 arch/powerpc/platforms/85xx/smp.c    |  170 +++++++++++++++++++++++++++-------
 5 files changed, 179 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b177caa..afe1682 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -212,7 +212,7 @@ config ARCH_HIBERNATION_POSSIBLE
 config ARCH_SUSPEND_POSSIBLE
 	def_bool y
 	depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || \
-		   (PPC_85xx && !SMP) || PPC_86xx || PPC_PSERIES || 44x || 40x
+		   PPC_85xx || PPC_86xx || PPC_PSERIES || 44x || 40x
 
 config PPC_DCR_NATIVE
 	bool
@@ -323,7 +323,8 @@ config SWIOTLB
 
 config HOTPLUG_CPU
 	bool "Support for enabling/disabling CPUs"
-	depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_POWERNV)
+	depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || \
+		PPC_PMAC || PPC_POWERNV || E500)
 	---help---
 	  Say Y here to be able to disable and re-enable individual
 	  CPUs at runtime on SMP machines.
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index adba970..7517863 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -65,6 +65,7 @@ int generic_cpu_disable(void);
 void generic_cpu_die(unsigned int cpu);
 void generic_mach_cpu_die(void);
 void generic_set_cpu_dead(unsigned int cpu);
+void generic_set_cpu_up(unsigned int cpu);
 int generic_check_cpu_restart(unsigned int cpu);
 #endif
 
@@ -191,6 +192,7 @@ extern unsigned long __secondary_hold_spinloop;
 extern unsigned long __secondary_hold_acknowledge;
 extern char __secondary_hold;
 
+extern void __early_start(void);
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 9f5d210..1d93272 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -1004,6 +1004,34 @@ _GLOBAL(flush_dcache_L1)
 
 	blr
 
+/* Flush L1 d-cache, invalidate and disable d-cache and i-cache */
+_GLOBAL(flush_disable_L1)
+	mflr	r10
+	bl	flush_dcache_L1	/* Flush L1 d-cache */
+	mtlr	r10
+
+	mfspr	r4, SPRN_L1CSR0	/* Invalidate and disable d-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	msync
+	isync
+	mtspr	SPRN_L1CSR0, r4
+	isync
+
+1:	mfspr	r4, SPRN_L1CSR0	/* Wait for the invalidate to finish */
+	andi.	r4, r4, 2
+	bne	1b
+
+	mfspr	r4, SPRN_L1CSR1	/* Invalidate and disable i-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	mtspr	SPRN_L1CSR1, r4
+	isync
+
+	blr
+
 #ifdef CONFIG_SMP
 /* When we get here, r24 needs to hold the CPU # */
 	.globl __secondary_start
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6df7090..e2d4401 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -422,6 +422,16 @@ void generic_set_cpu_dead(unsigned int cpu)
 	per_cpu(cpu_state, cpu) = CPU_DEAD;
 }
 
+/*
+ * The cpu_state should be set to CPU_UP_PREPARE in kick_cpu(), otherwise
+ * the cpu_state is always CPU_DEAD after calling generic_set_cpu_dead(),
+ * which makes the delay in generic_cpu_die() not happen.
+ */
+void generic_set_cpu_up(unsigned int cpu)
+{
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+}
+
 int generic_check_cpu_restart(unsigned int cpu)
 {
 	return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 6834447..78732a5 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -17,6 +17,7 @@
 #include <linux/of.h>
 #include <linux/kexec.h>
 #include <linux/highmem.h>
+#include <linux/cpu.h>
 
 #include <asm/machdep.h>
 #include <asm/pgtable.h>
@@ -28,28 +29,78 @@
 #include <sysdev/fsl_soc.h>
 #include <sysdev/mpic.h>
 
-extern void __early_start(void);
-
-#define BOOT_ENTRY_ADDR_UPPER	0
-#define BOOT_ENTRY_ADDR_LOWER	1
-#define BOOT_ENTRY_R3_UPPER	2
-#define BOOT_ENTRY_R3_LOWER	3
-#define BOOT_ENTRY_RESV		4
-#define BOOT_ENTRY_PIR		5
-#define BOOT_ENTRY_R6_UPPER	6
-#define BOOT_ENTRY_R6_LOWER	7
-#define NUM_BOOT_ENTRY		8
-#define SIZE_BOOT_ENTRY		(NUM_BOOT_ENTRY * sizeof(u32))
-
-static int __init
-smp_85xx_kick_cpu(int nr)
+#define MPC85xx_BPTR_OFF		0x00020
+#define MPC85xx_BPTR_EN			0x80000000
+#define MPC85xx_BPTR_BOOT_PAGE_MASK	0x00ffffff
+
+struct epapr_spin_table {
+	u32	addr_h;
+	u32	addr_l;
+	u32	r3_h;
+	u32	r3_l;
+	u32	reserved;
+	u32	pir;
+};
+
+static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
+
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)
+static __iomem u32 *bptr;
+
+extern void flush_disable_L1(void);
+
+static void __cpuinit smp_85xx_mach_cpu_die(void)
+{
+	unsigned int cpu = smp_processor_id();
+	register u32 tmp;
+
+	local_irq_disable();
+	idle_task_exit();
+	generic_set_cpu_dead(cpu);
+	mb();
+
+	mtspr(SPRN_TCR, 0);
+	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
+
+	flush_disable_L1();
+
+	if (cpu_has_feature(CPU_FTR_CAN_NAP)) {
+		tmp = (mfspr(SPRN_HID0) & ~(HID0_DOZE|HID0_SLEEP)) | HID0_NAP;
+		mb();
+		isync();
+		mtspr(SPRN_HID0, tmp);
+		isync();
+
+		tmp = mfmsr();
+		tmp |= MSR_WE;
+		mb();
+		mtmsr(tmp);
+		isync();
+	}
+
+	for (;;);
+}
+
+static void __cpuinit smp_85xx_set_bootpg(u32 page)
+{
+	if (bptr != NULL) {
+		/* Set the BPTR to the boot page */
+		out_be32(bptr,
+		    MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK));
+	}
+}
+#endif
+
+static int __cpuinit smp_85xx_kick_cpu(int nr)
+
 {
 	unsigned long flags;
 	const u64 *cpu_rel_addr;
-	__iomem u32 *bptr_vaddr;
+	__iomem struct epapr_spin_table *epapr;
 	struct device_node *np;
 	int n = 0, hw_cpu = get_hard_smp_processor_id(nr);
 	int ioremappable;
+	int ret = 0;
 
 	WARN_ON(nr < 0 || nr >= NR_CPUS);
 	WARN_ON(hw_cpu < 0 || hw_cpu >= NR_CPUS);
@@ -57,10 +108,11 @@ smp_85xx_kick_cpu(int nr)
 	pr_debug("smp_85xx_kick_cpu: kick CPU #%d\n", nr);
 
 	np = of_get_cpu_node(nr, NULL);
-	cpu_rel_addr = of_get_property(np, "cpu-release-addr", NULL);
 
+	cpu_rel_addr = of_get_property(np, "cpu-release-addr", NULL);
 	if (cpu_rel_addr == NULL) {
-		printk(KERN_ERR "No cpu-release-addr for cpu %d\n", nr);
+		pr_err("%s: No cpu-release-addr for cpu %d\n",
+					__func__, nr);
 		return -ENOENT;
 	}
 
@@ -74,46 +126,83 @@ smp_85xx_kick_cpu(int nr)
 
 	/* Map the spin table */
 	if (ioremappable)
-		bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY);
+		epapr = ioremap(*cpu_rel_addr, sizeof(struct epapr_spin_table));
 	else
-		bptr_vaddr = phys_to_virt(*cpu_rel_addr);
+		epapr = phys_to_virt(*cpu_rel_addr);
 
 	local_irq_save(flags);
 
-	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, hw_cpu);
+	out_be32(&epapr->pir, hw_cpu);
 #ifdef CONFIG_PPC32
-	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
+#ifdef CONFIG_HOTPLUG_CPU
+	/* Corresponding to generic_set_cpu_dead() */
+	generic_set_cpu_up(nr);
+
+	if (system_state == SYSTEM_RUNNING) {
+		out_be32(&epapr->addr_l, 0);
+
+		smp_85xx_set_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));
+		mpic_reset_core(hw_cpu);
+
+		/* wait until core is ready... */
+		n = 0;
+		while ((in_be32(&epapr->addr_l) != 1) && (++n < 1000))
+			udelay(100);
+		if (n >= 1000) {
+			pr_err("%s: timeout waiting for core %d to reset\n",
+							__func__, hw_cpu);
+			ret = -ENOENT;
+			goto out;
+		}
+		/*  clear the acknowledge status */
+		__secondary_hold_acknowledge = -1;
+	}
+#endif
+	out_be32(&epapr->addr_l, __pa(__early_start));
 
 	if (!ioremappable)
-		flush_dcache_range((ulong)bptr_vaddr,
-				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
+		flush_dcache_range((ulong)epapr,
+			(ulong)epapr + sizeof(struct epapr_spin_table));
 
 	/* Wait a bit for the CPU to ack. */
+	n = 0;
 	while ((__secondary_hold_acknowledge != hw_cpu) && (++n < 1000))
 		mdelay(1);
+	if (n >= 1000) {
+		pr_err("%s: timeout waiting for core %d to ack\n",
+						__func__, hw_cpu);
+		ret = -ENOENT;
+		goto out;
+	}
+out:
 #else
 	smp_generic_kick_cpu(nr);
 
-	out_be64((u64 *)(bptr_vaddr + BOOT_ENTRY_ADDR_UPPER),
-		__pa((u64)*((unsigned long long *) generic_secondary_smp_init)));
+	out_be64((u64 *)(&epapr->addr_h),
+	  __pa((u64)*((unsigned long long *) generic_secondary_smp_init)));
 
 	if (!ioremappable)
-		flush_dcache_range((ulong)bptr_vaddr,
-				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
+		flush_dcache_range((ulong)epapr,
+			(ulong)epapr + sizeof(struct epapr_spin_table));
 #endif
 
 	local_irq_restore(flags);
 
 	if (ioremappable)
-		iounmap(bptr_vaddr);
+		iounmap(epapr);
 
 	pr_debug("waited %d msecs for CPU #%d.\n", n, nr);
 
-	return 0;
+	return ret;
 }
 
 struct smp_ops_t smp_85xx_ops = {
 	.kick_cpu = smp_85xx_kick_cpu,
+	.setup_cpu	= smp_85xx_setup_cpu,
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_disable	= generic_cpu_disable,
+	.cpu_die	= generic_cpu_die,
+#endif
 	.give_timebase	= smp_generic_give_timebase,
 	.take_timebase	= smp_generic_take_timebase,
 };
@@ -215,8 +304,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC */
 
-static void __init
-smp_85xx_setup_cpu(int cpu_nr)
+static void __cpuinit smp_85xx_setup_cpu(int cpu_nr)
 {
 	if (smp_85xx_ops.probe == smp_mpic_probe)
 		mpic_setup_this_cpu();
@@ -229,14 +317,24 @@ void __init mpc85xx_smp_init(void)
 {
 	struct device_node *np;
 
-	smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
-
 	np = of_find_node_by_type(NULL, "open-pic");
 	if (np) {
 		smp_85xx_ops.probe = smp_mpic_probe;
 		smp_85xx_ops.message_pass = smp_mpic_message_pass;
 	}
 
+	of_node_put(np);
+#ifdef CONFIG_HOTPLUG_CPU
+	bptr = NULL;
+	np = of_find_node_by_name(NULL, "ecm-law");
+	if (!np) {
+		pr_err("%s: can't find ecm-law node in dts\n", __func__);
+		return;
+	}
+	bptr = of_iomap(np, 0) + MPC85xx_BPTR_OFF;
+	of_node_put(np);
+#endif
+
 	if (cpu_has_feature(CPU_FTR_DBELL)) {
 		/*
 		 * If left NULL, .message_pass defaults to
@@ -246,6 +344,10 @@ void __init mpc85xx_smp_init(void)
 		smp_85xx_ops.cause_ipi = doorbell_cause_ipi;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	ppc_md.cpu_die		= smp_85xx_mach_cpu_die;
+#endif
+
 	smp_ops = &smp_85xx_ops;
 
 #ifdef CONFIG_KEXEC
-- 
1.6.4.1

^ permalink raw reply related

* [PATCH v2 3/7] powerpc/85xx: add sleep and deep sleep support
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

From: Li Yang <leoli@freescale.com>

Some Freescale chips like MPC8536 and P1022 have a deep sleep PM mode
in addition to the sleep PM mode.

In sleep PM mode, the clocks of the e500 core and unused IP blocks are
turned off. IP blocks which are allowed to wake up the processor
are still running.

While in deep sleep PM mode, additionally, the power supply is
removed from e500 core and most IP blocks. Only the blocks needed
to wake up the chip out of deep sleep are ON.

This patch supports 32-bit and 36-bit address space.

The deep sleep mode is equal to the Suspend-to-RAM state of Linux
Power Management.

Command to enter deep sleep mode.
  echo mem > /sys/power/state

Signed-off-by: Dave Liu <daveliu@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Jerry Huang <Chang-Ming.Huang@freescale.com>
Cc: Scott Wood <scottwood@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
Changes for V2:
 - add enable_kernel_fp()
 - rename functions and variables
 - add pmc_flag
 - remove the second parameter of mpc85xx_enter_deep_sleep()
 - keep other bits intact when setting the deep sleep bit of
   POWMGTCSR in mpc85xx_enter_deep_sleep()

 arch/powerpc/kernel/Makefile         |    1 +
 arch/powerpc/kernel/l2cr_85xx.S      |   53 +++
 arch/powerpc/platforms/85xx/Makefile |    1 +
 arch/powerpc/platforms/85xx/sleep.S  |  605 ++++++++++++++++++++++++++++++++++
 arch/powerpc/sysdev/fsl_pmc.c        |   98 +++++--
 5 files changed, 738 insertions(+), 20 deletions(-)
 create mode 100644 arch/powerpc/kernel/l2cr_85xx.S
 create mode 100644 arch/powerpc/platforms/85xx/sleep.S

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ce4f7f1..d5cc385 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 ifeq ($(CONFIG_PPC32),y)
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
+obj-$(CONFIG_PPC_85xx)		+= l2cr_85xx.o
 obj-$(CONFIG_6xx)		+= idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o
 obj-$(CONFIG_TAU)		+= tau_6xx.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o suspend.o
diff --git a/arch/powerpc/kernel/l2cr_85xx.S b/arch/powerpc/kernel/l2cr_85xx.S
new file mode 100644
index 0000000..95dfef0
--- /dev/null
+++ b/arch/powerpc/kernel/l2cr_85xx.S
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2009-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *	Scott Wood <scottwood@freescale.com>
+ *	Dave Liu <daveliu@freescale.com>
+ * implement the L2 cache operations of e500 based L2 controller
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section .text
+
+	/* r3 = virtual address of L2 controller, WIMG = 01xx */
+_GLOBAL(flush_disable_L2)
+	/* It's a write-through cache, so only invalidation is needed. */
+	mbar
+	isync
+	lwz	r4, 0(r3)
+	li	r5, 1
+	rlwimi	r4, r5, 30, 0xc0000000
+	stw	r4, 0(r3)
+
+	/* Wait for the invalidate to finish */
+1:	lwz	r4, 0(r3)
+	andis.	r4, r4, 0x4000
+	bne	1b
+	mbar
+
+	blr
+
+	/* r3 = virtual address of L2 controller, WIMG = 01xx */
+_GLOBAL(invalidate_enable_L2)
+	mbar
+	isync
+	lwz	r4, 0(r3)
+	li	r5, 3
+	rlwimi	r4, r5, 30, 0xc0000000
+	stw	r4, 0(r3)
+
+	/* Wait for the invalidate to finish */
+1:	lwz	r4, 0(r3)
+	andis.	r4, r4, 0x4000
+	bne	1b
+	mbar
+
+	blr
diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile
index bc5acb9..cec54c7 100644
--- a/arch/powerpc/platforms/85xx/Makefile
+++ b/arch/powerpc/platforms/85xx/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the PowerPC 85xx linux kernel.
 #
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SUSPEND)	+= sleep.o
 
 obj-$(CONFIG_MPC8540_ADS) += mpc85xx_ads.o
 obj-$(CONFIG_MPC8560_ADS) += mpc85xx_ads.o
diff --git a/arch/powerpc/platforms/85xx/sleep.S b/arch/powerpc/platforms/85xx/sleep.S
new file mode 100644
index 0000000..3923bab
--- /dev/null
+++ b/arch/powerpc/platforms/85xx/sleep.S
@@ -0,0 +1,605 @@
+/*
+ * Enter and leave deep sleep/sleep state on MPC85xx
+ *
+ * Author: Scott Wood <scottwood@freescale.com>
+ *
+ * Copyright (C) 2006-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+#include <asm/asm-offsets.h>
+
+#define SS_TB		0x00
+#define SS_HID		0x08 /* 2 HIDs */
+#define SS_IAC		0x10 /* 2 IACs */
+#define SS_DAC		0x18 /* 2 DACs */
+#define SS_DBCR		0x20 /* 3 DBCRs */
+#define SS_PID		0x2c /* 3 PIDs */
+#define SS_SPRG		0x38 /* 8 SPRGs */
+#define SS_IVOR		0x58 /* 20 interrupt vectors */
+#define SS_TCR		0xa8
+#define SS_BUCSR	0xac
+#define SS_L1CSR	0xb0 /* 2 L1CSRs */
+#define SS_MSR		0xb8
+#define SS_USPRG	0xbc
+#define SS_GPREG	0xc0 /* r12-r31 */
+#define SS_LR		0x110
+#define SS_CR		0x114
+#define SS_SP		0x118
+#define SS_CURRENT	0x11c
+#define SS_IVPR		0x120
+#define SS_BPTR		0x124
+
+#define POWMGTCSR_DPSLP_MASK    0x00100000
+
+#define STATE_SAVE_SIZE 0x128
+
+	.section .data
+	.align	5
+mpc85xx_sleep_save_area:
+	.space	STATE_SAVE_SIZE
+ccsrbase_low:
+	.long	0
+ccsrbase_high:
+	.long	0
+powmgtreq:
+	.long	POWMGTCSR_DPSLP_MASK
+
+	.section .text
+	.align	12
+
+	/*
+	 * r3 = high word of physical address of CCSR
+	 * r4 = low word of physical address of CCSR
+	 */
+_GLOBAL(mpc85xx_enter_deep_sleep)
+	lis	r6, ccsrbase_low@ha
+	stw	r4, ccsrbase_low@l(r6)
+	lis	r6, ccsrbase_high@ha
+	stw	r3, ccsrbase_high@l(r6)
+
+	lis	r10, mpc85xx_sleep_save_area@h
+	ori	r10, r10, mpc85xx_sleep_save_area@l
+
+	mfspr	r5, SPRN_HID0
+	mfspr	r6, SPRN_HID1
+
+	stw	r5, SS_HID+0(r10)
+	stw	r6, SS_HID+4(r10)
+
+	mfspr	r4, SPRN_IAC1
+	mfspr	r5, SPRN_IAC2
+	mfspr	r6, SPRN_DAC1
+	mfspr	r7, SPRN_DAC2
+
+	stw	r4, SS_IAC+0(r10)
+	stw	r5, SS_IAC+4(r10)
+	stw	r6, SS_DAC+0(r10)
+	stw	r7, SS_DAC+4(r10)
+
+	mfspr	r4, SPRN_DBCR0
+	mfspr	r5, SPRN_DBCR1
+	mfspr	r6, SPRN_DBCR2
+
+	stw	r4, SS_DBCR+0(r10)
+	stw	r5, SS_DBCR+4(r10)
+	stw	r6, SS_DBCR+8(r10)
+
+	mfspr	r4, SPRN_PID0
+	mfspr	r5, SPRN_PID1
+	mfspr	r6, SPRN_PID2
+
+	stw	r4, SS_PID+0(r10)
+	stw	r5, SS_PID+4(r10)
+	stw	r6, SS_PID+8(r10)
+
+	mfspr	r4, SPRN_SPRG0
+	mfspr	r5, SPRN_SPRG1
+	mfspr	r6, SPRN_SPRG2
+	mfspr	r7, SPRN_SPRG3
+
+	stw	r4, SS_SPRG+0x00(r10)
+	stw	r5, SS_SPRG+0x04(r10)
+	stw	r6, SS_SPRG+0x08(r10)
+	stw	r7, SS_SPRG+0x0c(r10)
+
+	mfspr	r4, SPRN_SPRG4
+	mfspr	r5, SPRN_SPRG5
+	mfspr	r6, SPRN_SPRG6
+	mfspr	r7, SPRN_SPRG7
+
+	stw	r4, SS_SPRG+0x10(r10)
+	stw	r5, SS_SPRG+0x14(r10)
+	stw	r6, SS_SPRG+0x18(r10)
+	stw	r7, SS_SPRG+0x1c(r10)
+
+	mfspr	r4, SPRN_IVPR
+	stw	r4, SS_IVPR(r10)
+
+	mfspr	r4, SPRN_IVOR0
+	mfspr	r5, SPRN_IVOR1
+	mfspr	r6, SPRN_IVOR2
+	mfspr	r7, SPRN_IVOR3
+
+	stw	r4, SS_IVOR+0x00(r10)
+	stw	r5, SS_IVOR+0x04(r10)
+	stw	r6, SS_IVOR+0x08(r10)
+	stw	r7, SS_IVOR+0x0c(r10)
+
+	mfspr	r4, SPRN_IVOR4
+	mfspr	r5, SPRN_IVOR5
+	mfspr	r6, SPRN_IVOR6
+	mfspr	r7, SPRN_IVOR7
+
+	stw	r4, SS_IVOR+0x10(r10)
+	stw	r5, SS_IVOR+0x14(r10)
+	stw	r6, SS_IVOR+0x18(r10)
+	stw	r7, SS_IVOR+0x1c(r10)
+
+	mfspr	r4, SPRN_IVOR8
+	mfspr	r5, SPRN_IVOR9
+	mfspr	r6, SPRN_IVOR10
+	mfspr	r7, SPRN_IVOR11
+
+	stw	r4, SS_IVOR+0x20(r10)
+	stw	r5, SS_IVOR+0x24(r10)
+	stw	r6, SS_IVOR+0x28(r10)
+	stw	r7, SS_IVOR+0x2c(r10)
+
+	mfspr	r4, SPRN_IVOR12
+	mfspr	r5, SPRN_IVOR13
+	mfspr	r6, SPRN_IVOR14
+	mfspr	r7, SPRN_IVOR15
+
+	stw	r4, SS_IVOR+0x30(r10)
+	stw	r5, SS_IVOR+0x34(r10)
+	stw	r6, SS_IVOR+0x38(r10)
+	stw	r7, SS_IVOR+0x3c(r10)
+
+	mfspr	r4, SPRN_IVOR32
+	mfspr	r5, SPRN_IVOR33
+	mfspr	r6, SPRN_IVOR34
+	mfspr	r7, SPRN_IVOR35
+
+	stw	r4, SS_IVOR+0x40(r10)
+	stw	r5, SS_IVOR+0x44(r10)
+	stw	r6, SS_IVOR+0x48(r10)
+	stw	r7, SS_IVOR+0x4c(r10)
+
+	mfspr	r4, SPRN_TCR
+	mfspr	r5, SPRN_BUCSR
+	mfspr	r6, SPRN_L1CSR0
+	mfspr	r7, SPRN_L1CSR1
+	mfspr	r8, SPRN_USPRG0
+
+	stw	r4, SS_TCR(r10)
+	stw	r5, SS_BUCSR(r10)
+	stw	r6, SS_L1CSR+0(r10)
+	stw	r7, SS_L1CSR+4(r10)
+	stw	r8, SS_USPRG+0(r10)
+
+	stmw	r12, SS_GPREG(r10)
+
+	mfmsr	r4
+	mflr	r5
+	mfcr	r6
+
+	stw	r4, SS_MSR(r10)
+	stw	r5, SS_LR(r10)
+	stw	r6, SS_CR(r10)
+	stw	r1, SS_SP(r10)
+	stw	r2, SS_CURRENT(r10)
+
+1:	mftbu	r4
+	mftb	r5
+	mftbu	r6
+	cmpw	r4, r6
+	bne	1b
+
+	stw	r4, SS_TB+0(r10)
+	stw	r5, SS_TB+4(r10)
+
+	lis	r5, ccsrbase_low@ha
+	lwz	r4, ccsrbase_low@l(r5)
+	lis	r5, ccsrbase_high@ha
+	lwz	r3, ccsrbase_high@l(r5)
+
+	/* Disable machine checks and critical exceptions */
+	mfmsr	r5
+	rlwinm	r5, r5, 0, ~MSR_CE
+	rlwinm	r5, r5, 0, ~MSR_ME
+	mtmsr	r5
+	isync
+
+	/* Use TLB1[15] to map the CCSR at 0xf0000000 */
+	lis	r5, 0x100f
+	mtspr	SPRN_MAS0, r5
+	lis	r5, 0xc000
+	ori	r5, r5, 0x0500
+	mtspr	SPRN_MAS1, r5
+	lis	r5, 0xf000
+	ori	r5, r5, 0x000a
+	mtspr	SPRN_MAS2, r5
+	rlwinm	r5, r4, 0, 0xfffff000
+	ori	r5, r5, 0x0005
+	mtspr	SPRN_MAS3, r5
+	mtspr	SPRN_MAS7, r3
+	isync
+	tlbwe
+	isync
+
+	lis	r3, 0xf000
+	lwz	r4, 0x20(r3)
+	stw	r4, SS_BPTR(r10)
+
+	lis	r3, 0xf002	/* L2 cache controller at CCSR+0x20000 */
+	bl	flush_disable_L2
+	bl	flush_disable_L1
+
+	/* Enable I-cache, so as not to upset the bus
+	 * with our loop.
+	 */
+
+	mfspr	r4, SPRN_L1CSR1
+	ori	r4, r4, 1
+	mtspr	SPRN_L1CSR1, r4
+	isync
+
+	/* Set boot page translation */
+	lis	r3, 0xf000
+	lis	r4, (mpc85xx_deep_resume - PAGE_OFFSET)@h
+	ori	r4, r4, (mpc85xx_deep_resume - PAGE_OFFSET)@l
+	rlwinm	r4, r4, 20, 0x000fffff
+	oris	r4, r4, 0x8000
+	stw	r4, 0x20(r3)
+	lwz	r4, 0x20(r3)		/* read-back to flush write */
+	twi	0, r4, 0
+	isync
+
+	/* Disable the decrementer */
+	mfspr	r4, SPRN_TCR
+	rlwinm	r4, r4, 0, ~TCR_DIE
+	mtspr	SPRN_TCR, r4
+
+	mfspr	r4, SPRN_TSR
+	oris	r4, r4, TSR_DIS@h
+	mtspr	SPRN_TSR, r4
+
+	/* set PMRCCR[VRCNT] to wait power stable for 40ms */
+	lis	r3, 0xf00e
+	lwz	r4, 0x84(r3)
+	clrlwi	r4, r4, 16
+	oris	r4, r4, 0x12a3
+	stw	r4, 0x84(r3)
+	lwz	r4, 0x84(r3)
+
+	/* set deep sleep bit in POWMGTCSR */
+	lis	r3, powmgtreq@ha
+	lwz	r8, powmgtreq@l(r3)
+
+	lis	r3, 0xf00e
+	lwz	r4, 0x80(r3)
+	or	r4, r4, r8
+	stw	r4, 0x80(r3)
+	lwz	r4, 0x80(r3)		/* read-back to flush write */
+	twi	0, r4, 0
+	isync
+
+	mftb	r5
+1:	/* spin until either we enter deep sleep, or the sleep process is
+	 * aborted due to a pending wakeup event.  Wait some time between
+	 * accesses, so we don't flood the bus and prevent the pmc from
+	 * detecting an idle system.
+	 */
+
+	mftb	r4
+	subf	r7, r5, r4
+	cmpwi	r7, 1000
+	blt	1b
+	mr	r5, r4
+
+	lwz	r6, 0x80(r3)
+	andis.	r6, r6, 0x0010
+	bne	1b
+	b	2f
+
+2:	mfspr	r4, SPRN_PIR
+	andi.	r4, r4, 1
+99:	bne	99b
+
+	/* Establish a temporary 64MB 0->0 mapping in TLB1[1]. */
+	lis	r4, 0x1001
+	mtspr	SPRN_MAS0, r4
+	lis	r4, 0xc000
+	ori	r4, r4, 0x0800
+	mtspr	SPRN_MAS1, r4
+	li	r4, 0
+	mtspr	SPRN_MAS2, r4
+	li	r4, 0x0015
+	mtspr	SPRN_MAS3, r4
+	li	r4, 0
+	mtspr	SPRN_MAS7, r4
+	isync
+	tlbwe
+	isync
+
+	lis	r3, (3f - PAGE_OFFSET)@h
+	ori	r3, r3, (3f - PAGE_OFFSET)@l
+	mtctr	r3
+	bctr
+
+	/* Locate the resume vector in the last word of the current page. */
+	. = mpc85xx_enter_deep_sleep + 0xffc
+mpc85xx_deep_resume:
+	b	2b
+
+3:
+	/* Restore the contents of TLB1[0].  It is assumed that it covers
+	 * the currently executing code and the sleep save area, and that
+	 * it does not alias our temporary mapping (which is at virtual zero).
+	 */
+	lis	r3, (TLBCAM - PAGE_OFFSET)@h
+	ori	r3, r3, (TLBCAM - PAGE_OFFSET)@l
+
+	lwz	r4, 0(r3)
+	lwz	r5, 4(r3)
+	lwz	r6, 8(r3)
+	lwz	r7, 12(r3)
+	lwz	r8, 16(r3)
+
+	mtspr	SPRN_MAS0, r4
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS7, r8
+
+	isync
+	tlbwe
+	isync
+
+	/* Access the ccsrbase address with TLB1[0] */
+	lis	r5, ccsrbase_low@ha
+	lwz	r4, ccsrbase_low@l(r5)
+	lis	r5, ccsrbase_high@ha
+	lwz	r3, ccsrbase_high@l(r5)
+
+	/* Use TLB1[15] to map the CCSR at 0xf0000000 */
+	lis	r5, 0x100f
+	mtspr	SPRN_MAS0, r5
+	lis	r5, 0xc000
+	ori	r5, r5, 0x0500
+	mtspr	SPRN_MAS1, r5
+	lis	r5, 0xf000
+	ori	r5, r5, 0x000a
+	mtspr	SPRN_MAS2, r5
+	rlwinm	r5, r4, 0, 0xfffff000
+	ori	r5, r5, 0x0005
+	mtspr	SPRN_MAS3, r5
+	mtspr	SPRN_MAS7, r3
+	isync
+	tlbwe
+	isync
+
+	lis	r3, 0xf002	/* L2 cache controller at CCSR+0x20000 */
+	bl	invalidate_enable_L2
+
+	/* Access the MEM(r10) with TLB1[0] */
+	lis	r10, mpc85xx_sleep_save_area@h
+	ori	r10, r10, mpc85xx_sleep_save_area@l
+
+	lis	r3, 0xf000
+	lwz	r4, SS_BPTR(r10)
+	stw	r4, 0x20(r3)		/* restore BPTR */
+
+	/* Program shift running space to PAGE_OFFSET */
+	mfmsr	r3
+	lis	r4, 1f@h
+	ori	r4, r4, 1f@l
+
+	mtsrr1	r3
+	mtsrr0	r4
+	rfi
+
+1:	/* Restore the rest of TLB1, in ascending order so that
+	 * the TLB1[1] gets invalidated first.
+	 *
+	 * XXX: It's better to invalidate the temporary mapping
+	 * TLB1[15] for CCSR before restoring any TLB1 entry, including 0.
+	 */
+	lis	r4, 0x100f
+	mtspr	SPRN_MAS0, r4
+	lis	r4, 0
+	mtspr	SPRN_MAS1, r4
+	isync
+	tlbwe
+	isync
+
+	lis	r3, (TLBCAM + 5*4 - 4)@h
+	ori	r3, r3, (TLBCAM + 5*4 - 4)@l
+	li	r4, 15
+	mtctr	r4
+
+2:
+	lwz	r5, 4(r3)
+	lwz	r6, 8(r3)
+	lwz	r7, 12(r3)
+	lwz	r8, 16(r3)
+	lwzu	r9, 20(r3)
+
+	mtspr	SPRN_MAS0, r5
+	mtspr	SPRN_MAS1, r6
+	mtspr	SPRN_MAS2, r7
+	mtspr	SPRN_MAS3, r8
+	mtspr	SPRN_MAS7, r9
+
+	isync
+	tlbwe
+	isync
+	bdnz	2b
+
+	lis	r10, mpc85xx_sleep_save_area@h
+	ori	r10, r10, mpc85xx_sleep_save_area@l
+
+	lwz	r5, SS_HID+0(r10)
+	lwz	r6, SS_HID+4(r10)
+
+	isync
+	mtspr	SPRN_HID0, r5
+	isync
+
+	msync
+	mtspr	SPRN_HID1, r6
+	isync
+
+	lwz	r4, SS_IAC+0(r10)
+	lwz	r5, SS_IAC+4(r10)
+	lwz	r6, SS_DAC+0(r10)
+	lwz	r7, SS_DAC+4(r10)
+
+	mtspr	SPRN_IAC1, r4
+	mtspr	SPRN_IAC2, r5
+	mtspr	SPRN_DAC1, r6
+	mtspr	SPRN_DAC2, r7
+
+	lwz	r4, SS_DBCR+0(r10)
+	lwz	r5, SS_DBCR+4(r10)
+	lwz	r6, SS_DBCR+8(r10)
+
+	mtspr	SPRN_DBCR0, r4
+	mtspr	SPRN_DBCR1, r5
+	mtspr	SPRN_DBCR2, r6
+
+	lwz	r4, SS_PID+0(r10)
+	lwz	r5, SS_PID+4(r10)
+	lwz	r6, SS_PID+8(r10)
+
+	mtspr	SPRN_PID0, r4
+	mtspr	SPRN_PID1, r5
+	mtspr	SPRN_PID2, r6
+
+	lwz	r4, SS_SPRG+0x00(r10)
+	lwz	r5, SS_SPRG+0x04(r10)
+	lwz	r6, SS_SPRG+0x08(r10)
+	lwz	r7, SS_SPRG+0x0c(r10)
+
+	mtspr	SPRN_SPRG0, r4
+	mtspr	SPRN_SPRG1, r5
+	mtspr	SPRN_SPRG2, r6
+	mtspr	SPRN_SPRG3, r7
+
+	lwz	r4, SS_SPRG+0x10(r10)
+	lwz	r5, SS_SPRG+0x14(r10)
+	lwz	r6, SS_SPRG+0x18(r10)
+	lwz	r7, SS_SPRG+0x1c(r10)
+
+	mtspr	SPRN_SPRG4, r4
+	mtspr	SPRN_SPRG5, r5
+	mtspr	SPRN_SPRG6, r6
+	mtspr	SPRN_SPRG7, r7
+
+	lwz	r4, SS_IVPR(r10)
+	mtspr	SPRN_IVPR, r4
+
+	lwz	r4, SS_IVOR+0x00(r10)
+	lwz	r5, SS_IVOR+0x04(r10)
+	lwz	r6, SS_IVOR+0x08(r10)
+	lwz	r7, SS_IVOR+0x0c(r10)
+
+	mtspr	SPRN_IVOR0, r4
+	mtspr	SPRN_IVOR1, r5
+	mtspr	SPRN_IVOR2, r6
+	mtspr	SPRN_IVOR3, r7
+
+	lwz	r4, SS_IVOR+0x10(r10)
+	lwz	r5, SS_IVOR+0x14(r10)
+	lwz	r6, SS_IVOR+0x18(r10)
+	lwz	r7, SS_IVOR+0x1c(r10)
+
+	mtspr	SPRN_IVOR4, r4
+	mtspr	SPRN_IVOR5, r5
+	mtspr	SPRN_IVOR6, r6
+	mtspr	SPRN_IVOR7, r7
+
+	lwz	r4, SS_IVOR+0x20(r10)
+	lwz	r5, SS_IVOR+0x24(r10)
+	lwz	r6, SS_IVOR+0x28(r10)
+	lwz	r7, SS_IVOR+0x2c(r10)
+
+	mtspr	SPRN_IVOR8, r4
+	mtspr	SPRN_IVOR9, r5
+	mtspr	SPRN_IVOR10, r6
+	mtspr	SPRN_IVOR11, r7
+
+	lwz	r4, SS_IVOR+0x30(r10)
+	lwz	r5, SS_IVOR+0x34(r10)
+	lwz	r6, SS_IVOR+0x38(r10)
+	lwz	r7, SS_IVOR+0x3c(r10)
+
+	mtspr	SPRN_IVOR12, r4
+	mtspr	SPRN_IVOR13, r5
+	mtspr	SPRN_IVOR14, r6
+	mtspr	SPRN_IVOR15, r7
+
+	lwz	r4, SS_IVOR+0x40(r10)
+	lwz	r5, SS_IVOR+0x44(r10)
+	lwz	r6, SS_IVOR+0x48(r10)
+	lwz	r7, SS_IVOR+0x4c(r10)
+
+	mtspr	SPRN_IVOR32, r4
+	mtspr	SPRN_IVOR33, r5
+	mtspr	SPRN_IVOR34, r6
+	mtspr	SPRN_IVOR35, r7
+
+	lwz	r4, SS_TCR(r10)
+	lwz	r5, SS_BUCSR(r10)
+	lwz	r6, SS_L1CSR+0(r10)
+	lwz	r7, SS_L1CSR+4(r10)
+	lwz	r8, SS_USPRG+0(r10)
+
+	mtspr	SPRN_TCR, r4
+	mtspr	SPRN_BUCSR, r5
+
+	msync
+	isync
+	mtspr	SPRN_L1CSR0, r6
+	isync
+
+	mtspr	SPRN_L1CSR1, r7
+	isync
+
+	mtspr	SPRN_USPRG0, r8
+
+	lmw	r12, SS_GPREG(r10)
+
+	lwz	r1, SS_SP(r10)
+	lwz	r2, SS_CURRENT(r10)
+	lwz	r4, SS_MSR(r10)
+	lwz	r5, SS_LR(r10)
+	lwz	r6, SS_CR(r10)
+
+	msync
+	mtmsr	r4
+	isync
+
+	mtlr	r5
+	mtcr	r6
+
+	li	r4, 0
+	mtspr	SPRN_TBWL, r4
+
+	lwz	r4, SS_TB+0(r10)
+	lwz	r5, SS_TB+4(r10)
+
+	mtspr	SPRN_TBWU, r4
+	mtspr	SPRN_TBWL, r5
+
+	lis	r3, 1
+	mtdec	r3
+
+	blr
diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c
index 592a0f8..d6c65a7 100644
--- a/arch/powerpc/sysdev/fsl_pmc.c
+++ b/arch/powerpc/sysdev/fsl_pmc.c
@@ -2,6 +2,7 @@
  * Suspend/resume support
  *
  * Copyright 2009  MontaVista Software, Inc.
+ * Copyright 2007-2011 Freescale Semiconductor Inc.
  *
  * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
  *
@@ -19,39 +20,86 @@
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/of_platform.h>
+#include <linux/pm.h>
+#include <linux/interrupt.h>
+
+#include <sysdev/fsl_soc.h>
 
 struct pmc_regs {
 	__be32 devdisr;
-	__be32 devdisr2;
-	__be32 :32;
-	__be32 :32;
-	__be32 pmcsr;
-#define PMCSR_SLP	(1 << 17)
+	__be32 res1;	/* occupies the slot of the removed devdisr2 field */
+	__be32 res2;
+	__be32 pmjcr;
+	__be32 powmgtcsr;	/* same offset as the removed pmcsr field */
+	__be32 res4;
+	__be32 res5;
+	__be32 pmcdr;
 };
-
-static struct device *pmc_dev;
 static struct pmc_regs __iomem *pmc_regs;
+static unsigned int pmc_flag;	/* PMC_* capability bits, filled in by pmc_probe() */
+
+#define PMC_SLEEP	0x1	/* PM_SUSPEND_STANDBY is accepted */
+#define PMC_DEEP_SLEEP	0x2	/* PM_SUSPEND_MEM is accepted */
+
+#define POWMGTCSR_SLP_MASK	0x00020000	/* same bit as the removed PMCSR_SLP (1 << 17) */
+
+/* Cast the ccsrbar to 64-bit parameter so that the assembly
+ * code can be compatible with both 32-bit & 36-bit */
+extern void mpc85xx_enter_deep_sleep(u64 ccsrbar);
 
 static int pmc_suspend_enter(suspend_state_t state)
 {
-	int ret;
+	int ret = 0;
+	unsigned long flags;
+
+	switch (state) {
+	case PM_SUSPEND_MEM:
+#ifdef CONFIG_SPE
+		enable_kernel_spe();
+#endif
+		enable_kernel_fp();
+
+		pr_debug("%s: Entering deep sleep\n", __func__);
+
+		local_irq_save(flags);
+		mpc85xx_enter_deep_sleep(get_immrbase());
+		local_irq_restore(flags);
 
-	setbits32(&pmc_regs->pmcsr, PMCSR_SLP);
-	/* At this point, the CPU is asleep. */
+		pr_debug("%s: Resumed from deep sleep\n", __func__);
+		break;
 
-	/* Upon resume, wait for SLP bit to be clear. */
-	ret = spin_event_timeout((in_be32(&pmc_regs->pmcsr) & PMCSR_SLP) == 0,
-				 10000, 10) ? 0 : -ETIMEDOUT;
-	if (ret)
-		dev_err(pmc_dev, "tired waiting for SLP bit to clear\n");
+	case PM_SUSPEND_STANDBY:
+		local_irq_save(flags);
+
+		setbits32(&pmc_regs->powmgtcsr, POWMGTCSR_SLP_MASK);
+		/* At this point, the CPU is asleep. */
+
+		local_irq_restore(flags);
+		/* Upon resume, wait for SLP bit to be clear. */
+		ret = spin_event_timeout(
+		  (in_be32(&pmc_regs->powmgtcsr) & POWMGTCSR_SLP_MASK) == 0,
+		  10000, 10);
+		if (!ret) {
+			pr_err("%s: timeout waiting for SLP bit "
+				"to be cleared\n", __func__);
+			ret = -EINVAL;
+		}
+		break;
+
+	default:
+		ret = -EINVAL;
+
+	}
 	return ret;
 }
 
 static int pmc_suspend_valid(suspend_state_t state)
 {
-	if (state != PM_SUSPEND_STANDBY)
+	if (((pmc_flag & PMC_SLEEP) && (state == PM_SUSPEND_STANDBY)) ||
+	    ((pmc_flag & PMC_DEEP_SLEEP) && (state == PM_SUSPEND_MEM)))
+		return 1;	/* state is backed by a capability bit in pmc_flag */
+	else
 		return 0;
-	return 1;
 }
 
 static const struct platform_suspend_ops pmc_suspend_ops = {
@@ -59,14 +107,24 @@ static const struct platform_suspend_ops pmc_suspend_ops = {
 	.enter = pmc_suspend_enter,
 };
 
-static int pmc_probe(struct platform_device *ofdev)
+static int pmc_probe(struct platform_device *pdev)
 {
-	pmc_regs = of_iomap(ofdev->dev.of_node, 0);
+	struct device_node *node = pdev->dev.of_node;
+
+	pmc_regs = of_iomap(node, 0);
 	if (!pmc_regs)
 		return -ENOMEM;
 
-	pmc_dev = &ofdev->dev;
+	/* Standby is always offered; deep sleep is added only for the
+	 * mpc8536 and p1022 compatible strings. */
+	pmc_flag = PMC_SLEEP;
+	if (of_device_is_compatible(node, "fsl,mpc8536-pmc") ||
+	    of_device_is_compatible(node, "fsl,p1022-pmc"))
+		pmc_flag |= PMC_DEEP_SLEEP;
+
 	suspend_set_ops(&pmc_suspend_ops);
+
+	pr_info("Freescale PMC driver\n");
 	return 0;
 }
 
-- 
1.6.4.1

^ permalink raw reply related

* [PATCH v2 1/7] powerpc/85xx: re-enable timebase sync disabled by KEXEC patch
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood

From: Li Yang <leoli@freescale.com>

The timebase sync is not only necessary when using KEXEC. It should also
be used by normal boot up and cpu hotplug. Remove the ifdef added by
the KEXEC patch.

Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
---
 arch/powerpc/platforms/85xx/smp.c |    2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 2df4785..6834447 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -114,10 +114,8 @@ smp_85xx_kick_cpu(int nr)
 
 struct smp_ops_t smp_85xx_ops = {
 	.kick_cpu = smp_85xx_kick_cpu,
-#ifdef CONFIG_KEXEC
 	.give_timebase	= smp_generic_give_timebase,
 	.take_timebase	= smp_generic_take_timebase,
-#endif
 };
 
 #ifdef CONFIG_KEXEC
-- 
1.6.4.1

^ permalink raw reply related

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Paul Mackerras @ 2011-11-16  4:40 UTC (permalink / raw)
  To: Moffett, Kyle D
  Cc: B04825@freescale.com, linux-kernel@vger.kernel.org,
	paul.gortmaker@windriver.com, scottwood@freescale.com,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <022078B9-CD41-4D24-B44A-F85256A69D0A@boeing.com>

On Tue, Nov 15, 2011 at 04:45:18PM -0600, Moffett, Kyle D wrote:
> On Nov 15, 2011, at 17:29, Benjamin Herrenschmidt wrote:
> > On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
> >> Unfortunately, I've been staring at PPC asm for long enough that I
> >> have a migraine headache and I'm going to have to stop here for now.
> >> If somebody else wants to tackle fixing up the 32-bit copy_page() and
> >> __copy_tofrom_user() routines it would be highly appreciated. 
> > 
> > Yeah that's the one everybody's avoiding :-)
> > 
> > What about my idea of instead compiling it multiple times with a
> > different size and fixing up the branch to call the right one ?
> 
> I guess that's doable, although I have to admit that idea almost gives
> me more of a headache than trying to fix up the 32-bit ASM.
> 
> One thing that bothers me in particular is that both 32/64 versions of
> __copy_tofrom_user() are dramatically overcomplicated for what they
> ought to be doing.
> 
> It would seem that if we get a page fault during an unaligned copy, we
> ought to just give up and fall back to a simple byte-by-byte copy loop
> from wherever we left off.  That would eliminate 90% of the ugly
> special cases without actually hurting performance, right?

That's basically what we do, IIRC, and most of the complexity comes
from working out where we were up to.  We could probably use a simpler
approximation that means we might copy some bytes twice.  In fact the
greatest simplification would probably be to implement range entries
in the exception table so we can just have one entry for all the loads
and stores instead of an entry for each individual load and store.

> For a page-fault during a cacheline-aligned copy, we should be able to
> handle the exception and retry from the last cacheline without much
> logic, again with good performance.
> 
> With that said, I'm curious about the origin of the PPC32 ASM.  In
> particular, it looks like it was generated by GCC at some point in the
> distant past, and I'm wondering if there's a good way to rewrite that
> file in C and trick GCC into generating the relevant exception tables
> for it?

Why do you think it was generated by gcc?  I wrote the original
version, but I think it got extended and macro-ized by others.

Paul.

^ permalink raw reply

* RE: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe
From: Zang Roy-R61911 @ 2011-11-16  4:27 UTC (permalink / raw)
  To: Wood Scott-B07421, Kumar Gala; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <4EC2E42B.1090600@freescale.com>



> -----Original Message-----
> From: Wood Scott-B07421
> Sent: Wednesday, November 16, 2011 6:14 AM
> To: Kumar Gala
> Cc: Zang Roy-R61911; linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level
> sensitive for PCIe
>
> On 11/15/2011 03:51 PM, Kumar Gala wrote:
> >
> > On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:
> >
> > Should be setting ALL PCIe interrupts to '2'?  As I think in general
> > we say these PCIe are 'active high'.  The only reason I would think
> > we would NOT do this is if they are shared with some external device
> > that is 'active low'.  If so we should comment that somewhere (maybe
> > in the .dts, maybe just in the commit message).
>
> I'd assume the ones that are pinned out are pulled high on the board.
Yes. The boards pull up the shared IRQs. The PCIe specification does not
specify 'active low' or 'active high', but for PCI, INTx is 'active low'.

> Active-low is normal, it's these non-pinned-out "external" interrupts
> that are pulled low inside the SoC that are weird.
I agree here. Do you want me to add something to point out the "weird" in
the commit message?
Thanks.
Roy

^ permalink raw reply

* [PATCH v2] powerpc/powernv: PCI support for p7IOC under OPAL v2
From: Benjamin Herrenschmidt @ 2011-11-16  3:29 UTC (permalink / raw)
  To: linuxppc-dev

This adds support for p7IOC (and possibly other IODA v1 IO Hubs)
using OPAL v2 interfaces.

We completely take over resource assignment and assign them using an
algorithm that hands out device BARs in a way that makes them fit in
individual segments of the M32 window of the bridge, which enables us
to assign individual PEs to devices and functions.

The current implementation gives out a PE per function on PCIe, and a
PE for the entire bridge for PCIe to PCI-X bridges.

This can be adjusted / fine tuned later.

We also setup DMA resources (32-bit only for now) and MSIs (both 32-bit
and 64-bit MSI are supported).

The DMA allocation tries to divide the available 256M segments of the
32-bit DMA address space "fairly" among PEs. This is done using a
"weight" heuristic which assigns less value to things like OHCI USB
controllers than, for example SCSI RAID controllers. This algorithm
will probably want some fine tuning for specific devices or device
types.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2. Small fixes from Gavin Shan in the resource assignment code

 arch/powerpc/include/asm/pci-bridge.h     |    6 +-
 arch/powerpc/kernel/pci_dn.c              |    3 +
 arch/powerpc/platforms/powernv/Makefile   |    2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 1320 +++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c      |   20 +-
 arch/powerpc/platforms/powernv/pci.h      |   84 ++
 6 files changed, 1429 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-ioda.c

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 56b879a..882b6aa 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -153,8 +153,8 @@ struct pci_dn {
 
 	int	pci_ext_config_space;	/* for pci devices */
 
-#ifdef CONFIG_EEH
 	struct	pci_dev *pcidev;	/* back-pointer to the pci device */
+#ifdef CONFIG_EEH
 	int	class_code;		/* pci device class */
 	int	eeh_mode;		/* See eeh.h for possible EEH_MODEs */
 	int	eeh_config_addr;
@@ -164,6 +164,10 @@ struct pci_dn {
 	int	eeh_false_positives;	/* # times this device reported #ff's */
 	u32	config_space[16];	/* saved PCI config space */
 #endif
+#define IODA_INVALID_PE		(-1)
+#ifdef CONFIG_PPC_POWERNV
+	int	pe_number;
+#endif
 };
 
 /* Get the pointer to a device_node's pci_dn */
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 4e69deb..dd9e4a0 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -50,6 +50,9 @@ void * __devinit update_dn_pci_info(struct device_node *dn, void *data)
 	dn->data = pdn;
 	pdn->node = dn;
 	pdn->phb = phb;
+#ifdef CONFIG_PPC_POWERNV
+	pdn->pe_number = IODA_INVALID_PE;
+#endif
 	regs = of_get_property(dn, "reg", NULL);
 	if (regs) {
 		/* First register entry is addr (00BBSS00)  */
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 3185300..bcc3cb48 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,4 +2,4 @@ obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o
 obj-y			+= opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)	+= smp.o
-obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o
+obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 0000000..8857d9b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,1320 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/abs_addr.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+struct resource_wrap {
+	struct list_head	link;	/* Entry in the per-bus list, kept sorted by alignment */
+	resource_size_t		size;	/* Space needed by the device or bridge */
+	resource_size_t		align;	/* Alignment its start offset is rounded up to */
+	struct pci_dev		*dev;	/* Set if it's a device */
+	struct pci_bus		*bus;	/* Set if it's a bridge */
+};
+
+/* Common back-end of the pe_err/pe_warn/pe_info helpers: emit one
+ * message prefixed with either the name of the PE's device or, for a
+ * bus PE, the domain and bus number, followed by the PE number.
+ */
+static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe,
+		       struct va_format *vaf)
+{
+	char prefix[32];
+
+	if (!pe->pdev) {
+		/* Bus PE: identify it by "domain:bus" */
+		sprintf(prefix, "%04x:%02x     ",
+			pci_domain_nr(pe->pbus), pe->pbus->number);
+	} else {
+		strlcpy(prefix, dev_name(&pe->pdev->dev), sizeof(prefix));
+	}
+
+	return printk("pci %s%s: [PE# %.3d] %pV",
+		      level, prefix, pe->pe_number, vaf);
+}
+
+/* Stamp out a printf-style helper called @func that logs at @kern_level
+ * through __pe_printk(), so every message carries the PE prefix.
+ */
+#define define_pe_printk_level(func, kern_level)		\
+static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...)	\
+{								\
+	va_list ap;						\
+	struct va_format vf;					\
+	int ret;						\
+								\
+	va_start(ap, fmt);					\
+	vf.fmt = fmt;						\
+	vf.va = &ap;						\
+	ret = __pe_printk(kern_level, pe, &vf);			\
+	va_end(ap);						\
+								\
+	return ret;						\
+}
+
+define_pe_printk_level(pe_err, KERN_ERR);
+define_pe_printk_level(pe_warn, KERN_WARNING);
+define_pe_printk_level(pe_info, KERN_INFO);
+
+
+/* Work out how much space, and what alignment, a single device needs
+ * for the resources matching @flags. As a side effect all matching
+ * resources of the device are re-based: the biggest one is placed at
+ * offset 0 and the others follow in decreasing order of size.
+ *
+ * *size receives the sum of the resource sizes, *align the size of
+ * the biggest resource.
+ */
+static void __devinit pnv_ioda_calc_dev(struct pci_dev *dev, unsigned int flags,
+					resource_size_t *size,
+					resource_size_t *align)
+{
+	resource_size_t next_off = 0;
+	struct resource *res;
+	int idx;
+
+	pr_devel("  -> CDR %s\n", pci_name(dev));
+
+	*size = *align = 0;
+
+	/* Pass 1: rebase every matching resource to 0 and mark it unset */
+	for (idx = 0; idx <= PCI_ROM_RESOURCE; idx++) {
+		res = &dev->resource[idx];
+		if (res->flags & flags) {
+			if (res->start) {
+				res->end -= res->start;
+				res->start = 0;
+			}
+			res->flags |= IORESOURCE_UNSET;
+		}
+	}
+
+	/* We currently keep all memory resources together, we
+	 * will handle prefetch & 64-bit separately in the future
+	 * but for now we stick everybody in M32
+	 */
+
+	/* Pass 2: repeatedly pick the biggest resource that is still
+	 * unset and place it right after the previous pick.
+	 */
+	while (1) {
+		resource_size_t biggest = 0;
+		int pick = -1;
+
+		for (idx = 0; idx <= PCI_ROM_RESOURCE; idx++) {
+			res = &dev->resource[idx];
+			if ((res->flags & IORESOURCE_UNSET) &&
+			    (res->flags & flags) &&
+			    resource_size(res) > biggest) {
+				biggest = resource_size(res);
+				pick = idx;
+			}
+		}
+		if (pick < 0)
+			break;
+
+		res = &dev->resource[pick];
+		res->start = next_off;
+		res->end = next_off + biggest - 1;
+		res->flags &= ~IORESOURCE_UNSET;
+		next_off += biggest;
+
+		*size += biggest;
+		if (biggest > *align)
+			*align = biggest;
+
+		pr_devel("  ->     R%d %016llx..%016llx\n",
+			 pick, res->start, res->end);
+	}
+	pr_devel("  <- CDR %s size=%llx align=%llx\n",
+		 pci_name(dev), *size, *align);
+}
+
+/* Allocate a resource "wrap" for a given device or bridge and
+ * insert it at the right position in the sorted list.
+ *
+ * @list is kept in decreasing order of alignment: the new entry goes
+ * in front of the first existing entry with a smaller alignment, or
+ * at the tail if there is none. Exactly one of @bus / @dev is expected
+ * to be set.
+ *
+ * If the allocation fails the entry is dropped with an error message,
+ * which leaves that device or bridge out of the layout pass.
+ */
+static void __devinit pnv_ioda_add_wrap(struct list_head *list,
+					struct pci_bus *bus,
+					struct pci_dev *dev,
+					resource_size_t size,
+					resource_size_t align)
+{
+	struct resource_wrap *w1, *w = kzalloc(sizeof(*w), GFP_KERNEL);
+
+	/* kzalloc() may fail; don't dereference a NULL wrap */
+	if (!w) {
+		pr_err("%s: Out of memory, resources not assigned\n",
+		       dev ? pci_name(dev) :
+		       (bus && bus->self) ? pci_name(bus->self) : "root");
+		return;
+	}
+
+	w->size = size;
+	w->align = align;
+	w->dev = dev;
+	w->bus = bus;
+
+	list_for_each_entry(w1, list, link) {
+		if (w1->align < align) {
+			/* list_add_tail() on an entry inserts before it */
+			list_add_tail(&w->link, &w1->link);
+			return;
+		}
+	}
+	list_add_tail(&w->link, list);
+}
+
+/* Shift every resource of @dev whose type matches @flags up by
+ * @offset; resources of other types are left alone.
+ */
+static void __devinit pnv_ioda_offset_dev(struct pci_dev *dev,
+					  unsigned int flags,
+					  resource_size_t offset)
+{
+	struct resource *res;
+	int idx;
+
+	pr_devel("  -> ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset);
+
+	for (idx = 0; idx <= PCI_ROM_RESOURCE; idx++) {
+		res = &dev->resource[idx];
+		if (!(res->flags & flags))
+			continue;
+		res->start += offset;
+		res->end += offset;
+	}
+
+	pr_devel("  <- ODR %s [%x] +%016llx\n", pci_name(dev), flags, offset);
+}
+
+/* Shift the resources of @bus matching @flags up by @offset, then do
+ * the same for every device on the bus and, recursively, for every
+ * child bus.
+ */
+static void __devinit pnv_ioda_offset_bus(struct pci_bus *bus,
+					  unsigned int flags,
+					  resource_size_t offset)
+{
+	struct pci_bus *child_bus;
+	struct pci_dev *child_dev;
+	struct resource *res;
+	int idx;
+
+	pr_devel("  -> OBR %s [%x] +%016llx\n",
+		 bus->self ? pci_name(bus->self) : "root", flags, offset);
+
+	/* Only the first two bus resources are considered here */
+	for (idx = 0; idx < 2; idx++) {
+		res = bus->resource[idx];
+		if (!res || !(res->flags & flags))
+			continue;
+		res->start += offset;
+		res->end += offset;
+	}
+
+	list_for_each_entry(child_dev, &bus->devices, bus_list)
+		pnv_ioda_offset_dev(child_dev, flags, offset);
+
+	list_for_each_entry(child_bus, &bus->children, node)
+		pnv_ioda_offset_bus(child_bus, flags, offset);
+
+	pr_devel("  <- OBR %s [%x]\n",
+		 bus->self ? pci_name(bus->self) : "root", flags);
+}
+
+/* This is the guts of our IODA resource allocation. This is called
+ * recursively for each bus in the system. It calculates all the
+ * necessary size and requirements for children and assigns them
+ * resources such that:
+ *
+ *   - Each function fits in its own contiguous set of IO/M32
+ *     segments
+ *
+ *   - All segments behind a P2P bridge are contiguous and obey
+ *     alignment constraints of those bridges
+ *
+ * On return *size and *align describe what this bus needs for
+ * resources of type @flags (both are 0 for a bus with no children).
+ */
+static void __devinit pnv_ioda_calc_bus(struct pci_bus *bus, unsigned int flags,
+					resource_size_t *size,
+					resource_size_t *align)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pnv_phb *phb = hose->private_data;
+	resource_size_t dev_size, dev_align, start;
+	resource_size_t min_align, min_balign, bridge_align;
+	struct pci_dev *cdev;
+	struct pci_bus *cbus;
+	struct list_head head;
+	struct resource_wrap *w;
+	unsigned int bres;
+
+	*size = *align = 0;
+
+	pr_devel("-> CBR %s [%x]\n",
+		 bus->self ? pci_name(bus->self) : "root", flags);
+
+	/* Calculate alignment requirements based on the type
+	 * of resource we are working on
+	 */
+	if (flags & IORESOURCE_IO) {
+		bres = 0;
+		min_align = phb->ioda.io_segsize;
+		min_balign = 0x1000;
+	} else {
+		bres = 1;
+		min_align = phb->ioda.m32_segsize;
+		min_balign = 0x100000;
+	}
+
+	/* Gather all our children resources ordered by alignment */
+	INIT_LIST_HEAD(&head);
+
+	/*   - Busses */
+	list_for_each_entry(cbus, &bus->children, node) {
+		pnv_ioda_calc_bus(cbus, flags, &dev_size, &dev_align);
+		pnv_ioda_add_wrap(&head, cbus, NULL, dev_size, dev_align);
+	}
+
+	/*   - Devices */
+	list_for_each_entry(cdev, &bus->devices, bus_list) {
+		pnv_ioda_calc_dev(cdev, flags, &dev_size, &dev_align);
+		/* Align them to segment size */
+		if (dev_align < min_align)
+			dev_align = min_align;
+		pnv_ioda_add_wrap(&head, NULL, cdev, dev_size, dev_align);
+	}
+	if (list_empty(&head))
+		goto empty;
+
+	/* Now we can do two things: assign offsets to them within that
+	 * level and get our total alignment & size requirements. The
+	 * assignment algorithm is going to be uber-trivial for now, we
+	 * can try to be smarter later at filling out holes.
+	 */
+	start = bus->self ? 0 : bus->resource[bres]->start;
+
+	/* Don't hand out IO 0 */
+	if ((flags & IORESOURCE_IO) && !bus->self)
+		start += 0x1000;
+
+	while (!list_empty(&head)) {
+		w = list_first_entry(&head, struct resource_wrap, link);
+		list_del(&w->link);
+		if (w->size) {
+			/* Children were laid out from 0, so nothing needs
+			 * moving while we are still at offset 0
+			 */
+			if (start) {
+				start = ALIGN(start, w->align);
+				if (w->dev)
+					pnv_ioda_offset_dev(w->dev, flags, start);
+				else if (w->bus)
+					pnv_ioda_offset_bus(w->bus, flags, start);
+			}
+			if (w->align > *align)
+				*align = w->align;
+		}
+		start += w->size;
+		kfree(w);
+	}
+	*size = start;
+
+	/* Align and setup bridge resources */
+	bridge_align = max_t(resource_size_t, min_align, min_balign);
+	*align = max_t(resource_size_t, *align, bridge_align);
+	*size = ALIGN(*size, bridge_align);
+ empty:
+	/* Only setup P2P's, not the PHB itself */
+	if (bus->self) {
+		struct resource *bridge_res = bus->resource[bres];
+
+		/* Warn about a missing bridge resource, but don't follow
+		 * the warning with a NULL dereference
+		 */
+		if (!WARN_ON(bridge_res == NULL)) {
+			bridge_res->start = 0;
+			bridge_res->flags = (*size) ? flags : 0;
+			bridge_res->end = (*size) ? (*size - 1) : 0;
+		}
+
+		/* Clear prefetch bus resources for now */
+		if (bus->resource[2])
+			bus->resource[2]->flags = 0;
+	}
+
+	pr_devel("<- CBR %s [%x] *size=%016llx *align=%016llx\n",
+		 bus->self ? pci_name(bus->self) : "root", flags, *size, *align);
+}
+
+/* Return the pci_dn attached to the device-tree node of @dev, or NULL
+ * when the device has no device-tree node.
+ */
+static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev)
+{
+	struct device_node *node = pci_device_to_OF_node(dev);
+
+	return node ? PCI_DN(node) : NULL;
+}
+
+/* Map every IO and M32 segment covered by the resources of @dev to
+ * the PE the device belongs to, recording the owner in the PHB's
+ * io_segmap / m32_segmap arrays.
+ *
+ * A device without a device-tree node is put in PE#0. A device whose
+ * node carries no valid PE number is left unmapped rather than
+ * letting IODA_INVALID_PE reach the segment maps and OPAL.
+ */
+static void __devinit pnv_ioda_setup_pe_segments(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+	unsigned int pe, i;
+	resource_size_t pos;
+	struct resource io_res;
+	struct resource m32_res;
+	struct pci_bus_region region;
+	int rc;
+
+	/* Anything not referenced in the device-tree gets PE#0 */
+	if (!pdn) {
+		pe = 0;
+	} else if (pdn->pe_number == IODA_INVALID_PE) {
+		/* pe is unsigned: -1 would turn into a huge bogus PE# */
+		pr_warning("%s: No PE assigned, not mapping segments\n",
+			   pci_name(dev));
+		return;
+	} else {
+		pe = pdn->pe_number;
+	}
+
+	/* Calculate the device min/max */
+	io_res.start = m32_res.start = (resource_size_t)-1;
+	io_res.end = m32_res.end = 0;
+	io_res.flags = IORESOURCE_IO;
+	m32_res.flags = IORESOURCE_MEM;
+
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		struct resource *r = NULL;
+		if (dev->resource[i].flags & IORESOURCE_IO)
+			r = &io_res;
+		if (dev->resource[i].flags & IORESOURCE_MEM)
+			r = &m32_res;
+		if (!r)
+			continue;
+		if (dev->resource[i].start < r->start)
+			r->start = dev->resource[i].start;
+		if (dev->resource[i].end > r->end)
+			r->end = dev->resource[i].end;
+	}
+
+	/* Setup IO segments */
+	if (io_res.start < io_res.end) {
+		pcibios_resource_to_bus(dev, &region, &io_res);
+		pos = region.start;
+		i = pos / phb->ioda.io_segsize;
+		while (i < phb->ioda.total_pe && pos <= region.end) {
+			if (phb->ioda.io_segmap[i]) {
+				pr_err("%s: Trying to use IO seg #%d which is"
+				       " already used by PE# %d\n",
+				       pci_name(dev), i,
+				       phb->ioda.io_segmap[i]);
+				/* XXX DO SOMETHING TO DISABLE DEVICE ? */
+				break;
+			}
+			phb->ioda.io_segmap[i] = pe;
+			rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe,
+							 OPAL_IO_WINDOW_TYPE,
+							 0, i);
+			if (rc != OPAL_SUCCESS) {
+				pr_err("%s: OPAL error %d setting up mapping"
+				       " for IO seg# %d\n",
+				       pci_name(dev), rc, i);
+				/* XXX DO SOMETHING TO DISABLE DEVICE ? */
+				break;
+			}
+			pos += phb->ioda.io_segsize;
+			i++;
+		}
+	}
+
+	/* Setup M32 segments */
+	if (m32_res.start < m32_res.end) {
+		pcibios_resource_to_bus(dev, &region, &m32_res);
+		pos = region.start;
+		i = pos / phb->ioda.m32_segsize;
+		while (i < phb->ioda.total_pe && pos <= region.end) {
+			if (phb->ioda.m32_segmap[i]) {
+				pr_err("%s: Trying to use M32 seg #%d which is"
+				       " already used by PE# %d\n",
+				       pci_name(dev), i,
+				       phb->ioda.m32_segmap[i]);
+				/* XXX DO SOMETHING TO DISABLE DEVICE ? */
+				break;
+			}
+			phb->ioda.m32_segmap[i] = pe;
+			rc = opal_pci_map_pe_mmio_window(phb->opal_id, pe,
+							 OPAL_M32_WINDOW_TYPE,
+							 0, i);
+			if (rc != OPAL_SUCCESS) {
+				pr_err("%s: OPAL error %d setting up mapping"
+				       " for M32 seg# %d\n",
+				       pci_name(dev), rc, i);
+				/* XXX DO SOMETHING TO DISABLE DEVICE ? */
+				break;
+			}
+			pos += phb->ioda.m32_segsize;
+			i++;
+		}
+	}
+}
+
+/* Tell whether resource @r lies within the IO window or the first
+ * memory window of @hose, depending on its type.
+ *
+ * Returns 1 if it fits (or if it is neither IO nor memory). Otherwise
+ * the resource's flags are cleared and 0 is returned.
+ */
+static int __devinit pnv_ioda_resource_fit(struct pci_controller *hose,
+					   struct resource *r)
+{
+	struct resource *window;
+
+	if (r->flags & IORESOURCE_IO)
+		window = &hose->io_resource;
+	else if (r->flags & IORESOURCE_MEM)
+		window = &hose->mem_resources[0];
+	else
+		return 1;
+
+	if (r->start < window->start || r->end > window->end) {
+		/* Out of range: disable the resource */
+		r->flags = 0;
+		return 0;
+	}
+	return 1;
+}
+
+/* Apply the computed layout for @bus and everything below it: turn
+ * off decoding on the devices, program the bridge windows, assign the
+ * PE segments and write the device BARs, then recurse into the child
+ * busses. Resources that do not fit in the PHB windows get disabled.
+ */
+static void __devinit pnv_ioda_update_resources(struct pci_bus *bus)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pci_bus *child_bus;
+	struct pci_dev *pdev;
+	unsigned int n;
+	u16 command;
+
+	/* Clear all device enables  */
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		pci_read_config_word(pdev, PCI_COMMAND, &command);
+		command &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
+			     PCI_COMMAND_MASTER);
+		pci_write_config_word(pdev, PCI_COMMAND, command);
+	}
+
+	/* Bridge windows only exist when we are not the PHB itself */
+	if (bus->self) {
+		/* Check if bus resources fit in our IO or M32 range */
+		for (n = 0; n < 2; n++) {
+			struct resource *res = bus->resource[n];
+
+			if (!res || pnv_ioda_resource_fit(hose, res))
+				continue;
+			pr_err("%s: Bus %d resource %d disabled, no room\n",
+			       pci_name(bus->self), bus->number, n);
+		}
+
+		pci_setup_bridge(bus);
+	}
+
+	/* Update child devices */
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		/* Check if resource fits, if not, disable it */
+		for (n = 0; n <= PCI_ROM_RESOURCE; n++) {
+			if (pnv_ioda_resource_fit(hose, &pdev->resource[n]))
+				continue;
+			pr_err("%s: Resource %d disabled, no room\n",
+			       pci_name(pdev), n);
+		}
+
+		/* Assign segments */
+		pnv_ioda_setup_pe_segments(pdev);
+
+		/* Update HW BARs */
+		for (n = 0; n <= PCI_ROM_RESOURCE; n++)
+			pci_update_resource(pdev, n);
+	}
+
+	/* Update child busses */
+	list_for_each_entry(child_bus, &bus->children, node)
+		pnv_ioda_update_resources(child_bus);
+}
+
+/* Grab a free PE number from the PHB's allocation bitmap.
+ *
+ * Returns the PE number, with the matching pe_array entry tagged with
+ * it, or IODA_INVALID_PE when no PE is left.
+ */
+static int __devinit pnv_ioda_alloc_pe(struct pnv_phb *phb)
+{
+	unsigned long candidate;
+
+	for (;;) {
+		candidate = find_next_zero_bit(phb->ioda.pe_alloc,
+					       phb->ioda.total_pe, 0);
+		if (candidate >= phb->ioda.total_pe)
+			return IODA_INVALID_PE;
+
+		/* Retry if the bit got set between the scan and the claim */
+		if (!test_and_set_bit(candidate, phb->ioda.pe_alloc))
+			break;
+	}
+
+	phb->ioda.pe_array[candidate].pe_number = candidate;
+	return candidate;
+}
+
+/* Give PE number @pe back: wipe its pe_array entry and clear its bit
+ * in the allocation bitmap. Warns if the entry still references a
+ * device.
+ */
+static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+{
+	struct pnv_ioda_pe *slot = &phb->ioda.pe_array[pe];
+
+	WARN_ON(slot->pdev);
+
+	memset(slot, 0, sizeof(*slot));
+	clear_bit(pe, phb->ioda.pe_alloc);
+}
+
+/* Look up the PE directly recorded for @dev in its pci_dn. Returns
+ * NULL when the device has no pci_dn or no PE number assigned.
+ */
+static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return NULL;
+
+	return &phb->ioda.pe_array[pdn->pe_number];
+}
+
+/* Find the PE responsible for @dev: its own PE if it has one,
+ * otherwise the bus_pe of the nearest upstream bridge that has a PE
+ * with bus_pe set. Returns NULL if nothing is found up to the root.
+ */
+static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
+{
+	struct pnv_ioda_pe *found = __pnv_ioda_get_one_pe(dev);
+
+	while (!found && dev->bus->self) {
+		/* Step up to the bridge leading to this bus */
+		dev = dev->bus->self;
+		found = __pnv_ioda_get_one_pe(dev);
+		found = found ? found->bus_pe : NULL;
+	}
+	return found;
+}
+
+/* Program @pe into the PHB through OPAL: set up its PELT entry, clear
+ * any freeze state, add it to the PELT-V of every upstream bridge that
+ * has a PE, fill the RID reverse map and set up its MVE.
+ *
+ * Returns 0, or -ENXIO if the PELT entry cannot be set. MVE failures
+ * are logged and leave pe->mve_number at -1 but do not fail the call.
+ */
+static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
+					   struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *upstream;
+	uint8_t bus_cmp, dev_cmp, fn_cmp;
+	long rc, rid, rid_limit;
+
+	if (!pe->pbus) {
+		/* Device PE: match the full bus/device/function */
+		upstream = pe->pdev->bus->self;
+		bus_cmp = OpalPciBusAll;
+		dev_cmp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fn_cmp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_limit = pe->rid + 1;
+	} else {
+		/* Bus PE: match on (part of) the bus number only */
+		int nbus;
+
+		dev_cmp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fn_cmp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		upstream = pe->pbus->self;
+		nbus = pe->pbus->subordinate - pe->pbus->secondary + 1;
+		switch (nbus) {
+		case 1:
+			bus_cmp = OpalPciBusAll;
+			break;
+		case 2:
+			bus_cmp = OpalPciBus7Bits;
+			break;
+		case 4:
+			bus_cmp = OpalPciBus6Bits;
+			break;
+		case 8:
+			bus_cmp = OpalPciBus5Bits;
+			break;
+		case 16:
+			bus_cmp = OpalPciBus4Bits;
+			break;
+		case 32:
+			bus_cmp = OpalPciBus3Bits;
+			break;
+		default:
+			pr_err("%s: Number of subordinate busses %d"
+			       " unsupported\n",
+			       pci_name(pe->pbus->self), nbus);
+			/* Do an exact match only */
+			bus_cmp = OpalPciBusAll;
+		}
+		rid_limit = pe->rid + (nbus << 8);
+	}
+
+	/* Associate PE in PELT */
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bus_cmp, dev_cmp, fn_cmp, OPAL_MAP_PE);
+	if (rc) {
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+		return -ENXIO;
+	}
+	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Add to all parents PELT-V */
+	for (; upstream; upstream = upstream->bus->self) {
+		struct pci_dn *pdn = pnv_ioda_get_pdn(upstream);
+
+		if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+			continue;
+		rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+					pe->pe_number, 1);
+		/* XXX What to do in case of error ? */
+	}
+
+	/* Setup reverse map */
+	rid = pe->rid;
+	while (rid < rid_limit) {
+		phb->ioda.pe_rmap[rid] = pe->pe_number;
+		rid++;
+	}
+
+	if (phb->type == PNV_PHB_IODA2) {
+		pe->mve_number = 0;
+	} else if (phb->type == PNV_PHB_IODA1) {
+		/* Setup one MVTs on IODA1 */
+		pe->mve_number = pe->pe_number;
+		rc = opal_pci_set_mve(phb->opal_id, pe->mve_number,
+				      pe->pe_number);
+		if (!rc) {
+			rc = opal_pci_set_mve_enable(phb->opal_id,
+						     pe->mve_number, 1);
+			if (rc) {
+				pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+				       rc, pe->mve_number);
+				pe->mve_number = -1;
+			}
+		} else {
+			pe_err(pe, "OPAL error %ld setting up MVE %d\n",
+			       rc, pe->mve_number);
+			pe->mve_number = -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Insert @pe in the PHB's PE list, which is kept in decreasing order
+ * of DMA weight: the PE goes in front of the first entry lighter than
+ * it, or at the tail when there is none.
+ */
+static void __devinit pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
+						 struct pnv_ioda_pe *pe)
+{
+	struct list_head *insert_before = &phb->ioda.pe_list;
+	struct pnv_ioda_pe *cur;
+
+	list_for_each_entry(cur, &phb->ioda.pe_list, link) {
+		if (cur->dma_weight < pe->dma_weight) {
+			insert_before = &cur->link;
+			break;
+		}
+	}
+	list_add_tail(&pe->link, insert_before);
+}
+
+/* Heuristic DMA weight of a device, used to share out DMA space.
+ *
+ * This is quite simplistic. The "base" weight of a device is 10,
+ * 0 means no DMA is to be accounted for it.
+ */
+static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
+{
+	/* If it's a bridge, no DMA */
+	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return 0;
+
+	switch (dev->class) {
+	case PCI_CLASS_SERIAL_USB_UHCI:
+	case PCI_CLASS_SERIAL_USB_OHCI:
+	case PCI_CLASS_SERIAL_USB_EHCI:
+		/* Reduce the weight of slow USB controllers */
+		return 3;
+	default:
+		break;
+	}
+
+	/* Increase the weight of RAID (includes Obsidian); everything
+	 * else gets the default
+	 */
+	return ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) ? 15 : 10;
+}
+
+/* Create and configure a PE for a single device.
+ *
+ * Devices on bus 0 use the pre-set PE#0; any other device gets a
+ * freshly allocated PE. On success the PE is given a DMA weight,
+ * linked into the PHB's PE list and returned.
+ *
+ * Returns NULL if the device has no pci_dn, already has a PE, no PE
+ * number is available, or the PE cannot be configured.
+ */
+static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+	struct pnv_ioda_pe *pe;
+	int pe_num;
+
+	if (!pdn) {
+		pr_err("%s: Device tree node not associated properly\n",
+			   pci_name(dev));
+		return NULL;
+	}
+	if (pdn->pe_number != IODA_INVALID_PE)
+		return NULL;
+
+	/* PE#0 has been pre-set */
+	if (dev->bus->number == 0)
+		pe_num = 0;
+	else
+		pe_num = pnv_ioda_alloc_pe(phb);
+	if (pe_num == IODA_INVALID_PE) {
+		pr_warning("%s: Not enough PE# available, disabling device\n",
+			   pci_name(dev));
+		return NULL;
+	}
+
+	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
+	 * pointer in the PE data structure, both should be destroyed at the
+	 * same time. However, this needs to be looked at more closely again
+	 * once we actually start removing things (Hotplug, SR-IOV, ...)
+	 *
+	 * At some point we want to remove the PDN completely anyways
+	 */
+	pe = &phb->ioda.pe_array[pe_num];
+	pci_dev_get(dev);
+	pdn->pcidev = dev;
+	pdn->pe_number = pe_num;
+	pe->pdev = dev;
+	pe->pbus = NULL;
+	pe->tce32_seg = -1;
+	pe->mve_number = -1;
+	pe->rid = dev->bus->number << 8 | pdn->devfn;
+
+	pe_info(pe, "Associated device to PE\n");
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+
+		/* Drop both back-pointers before releasing anything:
+		 * pnv_ioda_free_pe() warns if pe->pdev is still set, and
+		 * pdn->pcidev must not outlive the reference we put below.
+		 */
+		pe->pdev = NULL;
+		pdn->pcidev = NULL;
+		pdn->pe_number = IODA_INVALID_PE;
+		if (pe_num)
+			pnv_ioda_free_pe(phb, pe_num);
+		pci_dev_put(dev);
+		return NULL;
+	}
+
+	/* Assign a DMA weight to the device */
+	pe->dma_weight = pnv_ioda_dma_weight(dev);
+	if (pe->dma_weight != 0) {
+		phb->ioda.dma_weight += pe->dma_weight;
+		phb->ioda.dma_pe_count++;
+	}
+
+	/* Link the PE */
+	pnv_ioda_link_pe_by_weight(phb, pe);
+
+	return pe;
+}
+
+static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+		if (pdn == NULL) {
+			pr_warn("%s: No device node associated with device !\n",
+				pci_name(dev));
+			continue;
+		}
+		pci_dev_get(dev);
+		pdn->pcidev = dev;
+		pdn->pe_number = pe->pe_number;
+		pe->dma_weight += pnv_ioda_dma_weight(dev);
+		if (dev->subordinate)
+			pnv_ioda_setup_same_PE(dev->subordinate, pe);
+	}
+}
+
+static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
+					    struct pnv_ioda_pe *ppe)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_bus *bus = dev->subordinate;
+	struct pnv_ioda_pe *pe;
+	int pe_num;
+
+	if (!bus) {
+		pr_warning("%s: Bridge without a subordinate bus !\n",
+			   pci_name(dev));
+		return;
+	}
+	pe_num = pnv_ioda_alloc_pe(phb);
+	if (pe_num == IODA_INVALID_PE) {
+		pr_warning("%s: Not enough PE# available, disabling bus\n",
+			   pci_name(dev));
+		return;
+	}
+
+	pe = &phb->ioda.pe_array[pe_num];
+	ppe->bus_pe = pe;
+	pe->pbus = bus;
+	pe->pdev = NULL;
+	pe->tce32_seg = -1;
+	pe->mve_number = -1;
+	pe->rid = bus->secondary << 8;
+	pe->dma_weight = 0;
+
+	pe_info(pe, "Secondary busses %d..%d associated with PE\n",
+		bus->secondary, bus->subordinate);
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+		if (pe_num)
+			pnv_ioda_free_pe(phb, pe_num);
+		pe->pbus = NULL;
+		return;
+	}
+
+	/* Associate it with all child devices */
+	pnv_ioda_setup_same_PE(bus, pe);
+
+	/* Account for one DMA PE if at least one DMA capable device exist
+	 * below the bridge
+	 */
+	if (pe->dma_weight != 0) {
+		phb->ioda.dma_weight += pe->dma_weight;
+		phb->ioda.dma_pe_count++;
+	}
+
+	/* Link the PE */
+	pnv_ioda_link_pe_by_weight(phb, pe);
+}
+
+static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		pe = pnv_ioda_setup_dev_PE(dev);
+		if (pe == NULL)
+			continue;
+		/* Leaving the PCIe domain ... single PE# */
+		if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+			pnv_ioda_setup_bus_PE(dev, pe);
+		else if (dev->subordinate)
+			pnv_ioda_setup_PEs(dev->subordinate);
+	}
+}
+
+static void __devinit pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb,
+						 struct pci_dev *dev)
+{
+	/* We delay DMA setup after we have assigned all PE# */
+}
+
+static void __devinit pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
+					     struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		set_iommu_table_base(&dev->dev, &pe->tce32_table);
+		if (dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+	}
+}
+
+static void __devinit pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
+						struct pnv_ioda_pe *pe,
+						unsigned int base,
+						unsigned int segs)
+{
+
+	struct page *tce_mem = NULL;
+	const __be64 *swinvp;
+	struct iommu_table *tbl;
+	unsigned int i;
+	int64_t rc;
+	void *addr;
+
+	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
+
+	/* XXX FIXME: Handle 64-bit only DMA devices */
+	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+	/* XXX FIXME: Allocate multi-level tables on PHB3 */
+
+	/* We shouldn't already have a 32-bit DMA associated */
+	if (WARN_ON(pe->tce32_seg >= 0))
+		return;
+
+	/* Grab a 32-bit TCE table */
+	pe->tce32_seg = base;
+	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+		(base << 28), ((base + segs) << 28) - 1);
+
+	/* XXX Currently, we allocate one big contiguous table for the
+	 * TCEs. We only really need one chunk per 256M of TCE space
+	 * (ie per segment) but that's an optimization for later, it
+	 * requires some added smarts with our get/put_tce implementation
+	 */
+	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+				   get_order(TCE32_TABLE_SIZE * segs));
+	if (!tce_mem) {
+		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+		goto fail;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, TCE32_TABLE_SIZE * segs);
+
+	/* Configure HW */
+	for (i = 0; i < segs; i++) {
+		rc = opal_pci_map_pe_dma_window(phb->opal_id,
+					      pe->pe_number,
+					      base + i, 1,
+					      __pa(addr) + TCE32_TABLE_SIZE * i,
+					      TCE32_TABLE_SIZE, 0x1000);
+		if (rc) {
+			pe_err(pe, " Failed to configure 32-bit TCE table,"
+			       " err %ld\n", rc);
+			goto fail;
+		}
+	}
+
+	/* Setup linux iommu table */
+	tbl = &pe->tce32_table;
+	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
+				  base << 28);
+
+	/* OPAL variant of P7IOC SW invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (swinvp) {
+		/* We need a couple more fields -- an address and a data
+		 * to or.  Since the bus is only printed out on table free
+		 * errors, and on the first pass the data will be a relative
+		 * bus number, print that out instead.
+		 */
+		tbl->it_busno = 0;
+		tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8);
+		tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE
+			| TCE_PCI_SWINV_PAIR;
+	}
+	iommu_init_table(tbl, phb->hose->node);
+
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+	else
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
+	return;
+ fail:
+	/* XXX Failure: Try to fallback to 64-bit only ? */
+	if (pe->tce32_seg >= 0)
+		pe->tce32_seg = -1;
+	if (tce_mem)
+		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+}
+
+static void __devinit pnv_ioda_setup_dma(struct pnv_phb *phb)
+{
+	struct pci_controller *hose = phb->hose;
+	unsigned int residual, remaining, segs, tw, base;
+	struct pnv_ioda_pe *pe;
+
+	/* If we have more PE# than segments available, hand out one
+	 * per PE until we run out and let the rest fail. If not,
+	 * then we assign at least one segment per PE, plus more based
+	 * on the amount of devices under that PE
+	 */
+	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
+		residual = 0;
+	else
+		residual = phb->ioda.tce32_count -
+			phb->ioda.dma_pe_count;
+
+	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
+		hose->global_number, phb->ioda.tce32_count);
+	pr_info("PCI: %d PE# for a total weight of %d\n",
+		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+
+	/* Walk our PE list and configure their DMA segments, hand them
+	 * out one base segment plus any residual segments based on
+	 * weight
+	 */
+	remaining = phb->ioda.tce32_count;
+	tw = phb->ioda.dma_weight;
+	base = 0;
+	list_for_each_entry(pe, &phb->ioda.pe_list, link) {
+		if (!pe->dma_weight)
+			continue;
+		if (!remaining) {
+			pe_warn(pe, "No DMA32 resources available\n");
+			continue;
+		}
+		segs = 1;
+		if (residual) {
+			segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
+			if (segs > remaining)
+				segs = remaining;
+		}
+		pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
+			pe->dma_weight, segs);
+		pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+		remaining -= segs;
+		base += segs;
+	}
+}
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+				  unsigned int hwirq, unsigned int is_64,
+				  struct msi_msg *msg)
+{
+	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+	unsigned int xive_num = hwirq - phb->msi_base;
+	uint64_t addr64;
+	uint32_t addr32, data;
+	int rc;
+
+	/* No PE assigned ? bail out ... no MSI for you ! */
+	if (pe == NULL)
+		return -ENXIO;
+
+	/* Check if we have an MVE */
+	if (pe->mve_number < 0)
+		return -ENXIO;
+
+	/* Assign XIVE to PE */
+	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+	if (rc) {
+		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+			pci_name(dev), rc, xive_num);
+		return -EIO;
+	}
+
+	if (is_64) {
+		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr64, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = addr64 >> 32;
+		msg->address_lo = addr64 & 0xfffffffful;
+	} else {
+		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr32, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = 0;
+		msg->address_lo = addr32;
+	}
+	msg->data = data;
+
+	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+		 " address=%x_%08x data=%x PE# %d\n",
+		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+		 msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+	return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+	unsigned int bmap_size;
+	const __be32 *prop = of_get_property(phb->hose->dn,
+					     "ibm,opal-msi-ranges", NULL);
+	if (!prop) {
+		/* BML Fallback */
+		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+	}
+	if (!prop)
+		return;
+
+	phb->msi_base = be32_to_cpup(prop);
+	phb->msi_count = be32_to_cpup(prop + 1);
+	bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long);
+	phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL);
+	if (!phb->msi_map) {
+		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+		       phb->hose->global_number);
+		return;
+	}
+	phb->msi_setup = pnv_pci_ioda_msi_setup;
+	phb->msi32_support = 1;
+	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+		phb->msi_count, phb->msi_base);
+}
+#else /* !CONFIG_PCI_MSI: no-op stub, name must match the caller's */
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+/* This is the starting point of our IODA specific resource
+ * allocation process
+ */
+static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose)
+{
+	resource_size_t size, align;
+	struct pci_bus *child;
+
+	/* Associate PEs per functions */
+	pnv_ioda_setup_PEs(hose->bus);
+
+	/* Calculate all resources */
+	pnv_ioda_calc_bus(hose->bus, IORESOURCE_IO, &size, &align);
+	pnv_ioda_calc_bus(hose->bus, IORESOURCE_MEM, &size, &align);
+
+	/* Apply them to HW */
+	pnv_ioda_update_resources(hose->bus);
+
+	/* Setup DMA */
+	pnv_ioda_setup_dma(hose->private_data);
+
+	/* Configure PCI Express settings */
+	list_for_each_entry(child, &hose->bus->children, node) {
+		struct pci_dev *self = child->self;
+		if (!self)
+			continue;
+		pcie_bus_configure_settings(child, self->pcie_mpss);
+	}
+}
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static int __devinit pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+	struct pci_dn *pdn = pnv_ioda_get_pdn(dev);
+
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return -EINVAL;
+	return 0;
+}
+
+static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
+			       u32 devfn)
+{
+	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
+}
+
+void __init pnv_pci_init_ioda1_phb(struct device_node *np)
+{
+	struct pci_controller *hose;
+	static int primary = 1;
+	struct pnv_phb *phb;
+	unsigned long size, m32map_off, iomap_off, pemap_off;
+	const u64 *prop64;
+	u64 phb_id;
+	void *aux;
+	long rc;
+
+	pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+	if (!prop64) {
+		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
+		return;
+	}
+	phb_id = be64_to_cpup(prop64);
+	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
+
+	phb = alloc_bootmem(sizeof(struct pnv_phb));
+	if (phb) {
+		memset(phb, 0, sizeof(struct pnv_phb));
+		phb->hose = hose = pcibios_alloc_controller(np);
+	}
+	if (!phb || !phb->hose) {
+		pr_err("PCI: Failed to allocate PCI controller for %s\n",
+		       np->full_name);
+		return;
+	}
+
+	spin_lock_init(&phb->lock);
+	/* XXX Use device-tree */
+	hose->first_busno = 0;
+	hose->last_busno = 0xff;
+	hose->private_data = phb;
+	phb->opal_id = phb_id;
+	phb->type = PNV_PHB_IODA1;
+
+	/* We parse "ranges" now since we need to deduce the register base
+	 * from the IO base
+	 */
+	pci_process_bridge_OF_ranges(phb->hose, np, primary);
+	primary = 0;
+
+	/* Magic formula from Milton */
+	phb->regs = of_iomap(np, 0);
+	if (phb->regs == NULL)
+		pr_err("  Failed to map registers !\n");
+
+
+	/* XXX This is hack-a-thon. This needs to be changed so that:
+	 *  - we obtain stuff like PE# etc... from device-tree
+	 *  - we properly re-allocate M32 ourselves
+	 *    (the OFW one isn't very good)
+	 */
+
+	/* Initialize more IODA stuff */
+	phb->ioda.total_pe = 128;
+
+	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+	/* OFW Has already off top 64k of M32 space (MSI space) */
+	phb->ioda.m32_size += 0x10000;
+
+	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+	phb->ioda.m32_pci_base = hose->mem_resources[0].start -
+		hose->pci_mem_offset;
+	phb->ioda.io_size = hose->pci_io_size;
+	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+	/* Allocate aux data & arrays */
+	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+	m32map_off = size;
+	size += phb->ioda.total_pe;
+	iomap_off = size;
+	size += phb->ioda.total_pe;
+	pemap_off = size;
+	size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+	aux = alloc_bootmem(size);
+	memset(aux, 0, size);
+	phb->ioda.pe_alloc = aux;
+	phb->ioda.m32_segmap = aux + m32map_off;
+	phb->ioda.io_segmap = aux + iomap_off;
+	phb->ioda.pe_array = aux + pemap_off;
+	set_bit(0, phb->ioda.pe_alloc);
+
+	INIT_LIST_HEAD(&phb->ioda.pe_list);
+
+	/* Calculate how many 32-bit TCE segments we have */
+	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+
+	/* Clear unusable m64 */
+	hose->mem_resources[1].flags = 0;
+	hose->mem_resources[1].start = 0;
+	hose->mem_resources[1].end = 0;
+	hose->mem_resources[2].flags = 0;
+	hose->mem_resources[2].start = 0;
+	hose->mem_resources[2].end = 0;
+
+#if 0
+	rc = opal_pci_set_phb_mem_window(opal->phb_id,
+					 window_type,
+					 window_num,
+					 starting_real_address,
+					 starting_pci_address,
+					 segment_size);
+#endif
+
+	pr_info("  %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n",
+		phb->ioda.total_pe,
+		phb->ioda.m32_size, phb->ioda.m32_segsize,
+		phb->ioda.io_size, phb->ioda.io_segsize);
+
+	if (phb->regs)  {
+		pr_devel(" BUID     = 0x%016llx\n", in_be64(phb->regs + 0x100));
+		pr_devel(" PHB2_CR  = 0x%016llx\n", in_be64(phb->regs + 0x160));
+		pr_devel(" IO_BAR   = 0x%016llx\n", in_be64(phb->regs + 0x170));
+		pr_devel(" IO_BAMR  = 0x%016llx\n", in_be64(phb->regs + 0x178));
+		pr_devel(" IO_SAR   = 0x%016llx\n", in_be64(phb->regs + 0x180));
+		pr_devel(" M32_BAR  = 0x%016llx\n", in_be64(phb->regs + 0x190));
+		pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198));
+		pr_devel(" M32_SAR  = 0x%016llx\n", in_be64(phb->regs + 0x1a0));
+	}
+	phb->hose->ops = &pnv_pci_ops;
+
+	/* Setup RID -> PE mapping function */
+	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
+
+	/* Setup TCEs */
+	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+
+	/* Setup MSI support */
+	pnv_pci_init_ioda_msis(phb);
+
+	/* We set both probe_only and PCI_REASSIGN_ALL_RSRC. This is an
+	 * odd combination which essentially means that we skip all resource
+	 * fixups and assignments in the generic code, and do it all
+	 * ourselves here
+	 */
+	pci_probe_only = 1;
+	ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb;
+	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
+	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+	/* Reset IODA tables to a clean state */
+	rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_RESET, OPAL_ASSERT_RESET);
+	if (rc)
+		pr_warning("  OPAL Error %ld performing IODA reset !\n", rc);
+	opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE);
+}
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+	struct device_node *phbn;
+	const u64 *prop64;
+	u64 hub_id;
+
+	pr_info("Probing IODA IO-Hub %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+	if (!prop64) {
+		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+		return;
+	}
+	hub_id = be64_to_cpup(prop64);
+	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+	/* Count child PHBs */
+	for_each_child_of_node(np, phbn) {
+		/* Look for IODA1 PHBs */
+		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+			pnv_pci_init_ioda1_phb(phbn);
+	}
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index baef772..c0ed379 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -467,12 +467,24 @@ void __init pnv_pci_init(void)
 		init_pci_config_tokens();
 		find_and_init_phbs();
 #endif /* CONFIG_PPC_POWERNV_RTAS */
-	} else {
-		/* OPAL is here, do our normal stuff */
+	}
+	/* OPAL is here, do our normal stuff */
+	else {
+		int found_ioda = 0;
+
+		/* Look for IODA IO-Hubs. We don't support mixing IODA
+		 * and p5ioc2 due to the need to change some global
+		 * probing flags
+		 */
+		for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+			pnv_pci_init_ioda_hub(np);
+			found_ioda = 1;
+		}
 
 		/* Look for p5ioc2 IO-Hubs */
-		for_each_compatible_node(np, NULL, "ibm,p5ioc2")
-			pnv_pci_init_p5ioc2_hub(np);
+		if (!found_ioda)
+			for_each_compatible_node(np, NULL, "ibm,p5ioc2")
+				pnv_pci_init_p5ioc2_hub(np);
 	}
 
 	/* Setup the linkage between OF nodes and PHBs */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index d4dbc49..28ae4ca 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -9,6 +9,50 @@ enum pnv_phb_type {
 	PNV_PHB_IODA2,
 };
 
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_ioda_pe {
+	/* A PE can be associated with a single device or an
+	 * entire bus (& children). In the former case, pdev
+	 * is populated, in the latter case, pbus is.
+	 */
+	struct pci_dev		*pdev;
+	struct pci_bus		*pbus;
+
+	/* Effective RID (device RID for a device PE and base bus
+	 * RID with devfn 0 for a bus PE)
+	 */
+	unsigned int		rid;
+
+	/* PE number */
+	unsigned int		pe_number;
+
+	/* "Weight" assigned to the PE for the sake of DMA resource
+	 * allocations
+	 */
+	unsigned int		dma_weight;
+
+	/* This is a PCI-E -> PCI-X bridge, this points to the
+	 * corresponding bus PE
+	 */
+	struct pnv_ioda_pe	*bus_pe;
+
+	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+	int			tce32_seg;
+	int			tce32_segcount;
+	struct iommu_table	tce32_table;
+
+	/* XXX TODO: Add support for additional 64-bit iommus */
+
+	/* MSIs. MVE index is identical for 32 and 64 bit MSI
+	 * and -1 if not supported. (It's actually identical to the
+	 * PE number)
+	 */
+	int			mve_number;
+
+	/* Link in list of PE#s */
+	struct list_head	link;
+};
+
 struct pnv_phb {
 	struct pci_controller	*hose;
 	enum pnv_phb_type	type;
@@ -34,6 +78,45 @@ struct pnv_phb {
 		struct {
 			struct iommu_table iommu_table;
 		} p5ioc2;
+
+		struct {
+			/* Global bridge info */
+			unsigned int		total_pe;
+			unsigned int		m32_size;
+			unsigned int		m32_segsize;
+			unsigned int		m32_pci_base;
+			unsigned int		io_size;
+			unsigned int		io_segsize;
+			unsigned int		io_pci_base;
+
+			/* PE allocation bitmap */
+			unsigned long		*pe_alloc;
+
+			/* M32 & IO segment maps */
+			unsigned int		*m32_segmap;
+			unsigned int		*io_segmap;
+			struct pnv_ioda_pe	*pe_array;
+
+			/* Reverse map of PEs, will have to extend if
+			 * we are to support more than 256 PEs, indexed
+			 * by { bus, devfn }
+			 */
+			unsigned char		pe_rmap[0x10000];
+
+			/* 32-bit TCE tables allocation */
+			unsigned long		tce32_count;
+
+			/* Total "weight" for the sake of DMA resources
+			 * allocation
+			 */
+			unsigned int		dma_weight;
+			unsigned int		dma_pe_count;
+
+			/* Sorted list of used PE's, sorted at
+			 * boot for resource allocation purposes
+			 */
+			struct list_head	pe_list;
+		} ioda;
 	};
 };
 
@@ -43,6 +126,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset);
 extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
 
 
 #endif /* __POWERNV_PCI_H */

^ permalink raw reply related

* [PATCH] powerpc: Fix atomic_xxx_return barrier semantics
From: Benjamin Herrenschmidt @ 2011-11-16  3:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Paul E. McKenney, Anton Blanchard, Paul Mackerras

The Documentation/memory-barriers.txt document requires that atomic
operations that return a value act as a memory barrier both before
and after the actual atomic operation.

Our current implementation doesn't guarantee this. More specifically,
while a load following the isync can not be issued before stwcx. has
completed, that completion doesn't architecturally mean that the
result of stwcx. is visible to other processors (or any previous stores
for that matter) (typically, the other processors L1 caches can still
hold the old value).

This has caused an actual crash in RCU torture testing on Power 7.

This fixes it by changing those atomic ops to use new macros instead
of RELEASE/ACQUIRE barriers, called ATOMIC_ENTRY and ATOMIC_EXIT barriers,
which are then defined respectively to lwsync and sync.

I haven't had a chance to measure the performance impact (or rather
what I measured with kernel compiles is in the noise; I have yet to
find a more precise benchmark).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index e2a4c26..02e41b5 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -49,13 +49,13 @@ static __inline__ int atomic_add_return(int a, atomic_t *v)
 	int t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%2		# atomic_add_return\n\
 	add	%0,%1,%0\n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%0,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (a), "r" (&v->counter)
 	: "cc", "memory");
@@ -85,13 +85,13 @@ static __inline__ int atomic_sub_return(int a, atomic_t *v)
 	int t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%2		# atomic_sub_return\n\
 	subf	%0,%1,%0\n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%0,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (a), "r" (&v->counter)
 	: "cc", "memory");
@@ -119,13 +119,13 @@ static __inline__ int atomic_inc_return(atomic_t *v)
 	int t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%1		# atomic_inc_return\n\
 	addic	%0,%0,1\n"
 	PPC405_ERR77(0,%1)
 "	stwcx.	%0,0,%1 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (&v->counter)
 	: "cc", "xer", "memory");
@@ -163,13 +163,13 @@ static __inline__ int atomic_dec_return(atomic_t *v)
 	int t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%1		# atomic_dec_return\n\
 	addic	%0,%0,-1\n"
 	PPC405_ERR77(0,%1)
 "	stwcx.	%0,0,%1\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (&v->counter)
 	: "cc", "xer", "memory");
@@ -194,7 +194,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
 	int t;
 
 	__asm__ __volatile__ (
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%1		# __atomic_add_unless\n\
 	cmpw	0,%0,%3 \n\
 	beq-	2f \n\
@@ -202,7 +202,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
 	PPC405_ERR77(0,%2)
 "	stwcx.	%0,0,%1 \n\
 	bne-	1b \n"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 "	subf	%0,%2,%0 \n\
 2:"
 	: "=&r" (t)
@@ -226,7 +226,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
 	int t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%1		# atomic_dec_if_positive\n\
 	cmpwi	%0,1\n\
 	addi	%0,%0,-1\n\
@@ -234,7 +234,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
 	PPC405_ERR77(0,%1)
 "	stwcx.	%0,0,%1\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	"\n\
 2:"	: "=&b" (t)
 	: "r" (&v->counter)
@@ -285,12 +285,12 @@ static __inline__ long atomic64_add_return(long a, atomic64_t *v)
 	long t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%2		# atomic64_add_return\n\
 	add	%0,%1,%0\n\
 	stdcx.	%0,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (a), "r" (&v->counter)
 	: "cc", "memory");
@@ -319,12 +319,12 @@ static __inline__ long atomic64_sub_return(long a, atomic64_t *v)
 	long t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%2		# atomic64_sub_return\n\
 	subf	%0,%1,%0\n\
 	stdcx.	%0,0,%2 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (a), "r" (&v->counter)
 	: "cc", "memory");
@@ -351,12 +351,12 @@ static __inline__ long atomic64_inc_return(atomic64_t *v)
 	long t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%1		# atomic64_inc_return\n\
 	addic	%0,%0,1\n\
 	stdcx.	%0,0,%1 \n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (&v->counter)
 	: "cc", "xer", "memory");
@@ -393,12 +393,12 @@ static __inline__ long atomic64_dec_return(atomic64_t *v)
 	long t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%1		# atomic64_dec_return\n\
 	addic	%0,%0,-1\n\
 	stdcx.	%0,0,%1\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (t)
 	: "r" (&v->counter)
 	: "cc", "xer", "memory");
@@ -418,13 +418,13 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 	long t;
 
 	__asm__ __volatile__(
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%1		# atomic64_dec_if_positive\n\
 	addic.	%0,%0,-1\n\
 	blt-	2f\n\
 	stdcx.	%0,0,%1\n\
 	bne-	1b"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 	"\n\
 2:"	: "=&r" (t)
 	: "r" (&v->counter)
@@ -450,14 +450,14 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 	long t;
 
 	__asm__ __volatile__ (
-	PPC_RELEASE_BARRIER
+	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%1		# __atomic_add_unless\n\
 	cmpd	0,%0,%3 \n\
 	beq-	2f \n\
 	add	%0,%2,%0 \n"
 "	stdcx.	%0,0,%1 \n\
 	bne-	1b \n"
-	PPC_ACQUIRE_BARRIER
+	PPC_ATOMIC_EXIT_BARRIER
 "	subf	%0,%2,%0 \n\
 2:"
 	: "=&r" (t)
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index e137afc..efdc926 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -124,14 +124,14 @@ static __inline__ unsigned long fn(			\
 	return (old & mask);				\
 }
 
-DEFINE_TESTOP(test_and_set_bits, or, PPC_RELEASE_BARRIER,
-	      PPC_ACQUIRE_BARRIER, 0)
+DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
+	      PPC_ATOMIC_EXIT_BARRIER, 0)
 DEFINE_TESTOP(test_and_set_bits_lock, or, "",
 	      PPC_ACQUIRE_BARRIER, 1)
-DEFINE_TESTOP(test_and_clear_bits, andc, PPC_RELEASE_BARRIER,
-	      PPC_ACQUIRE_BARRIER, 0)
-DEFINE_TESTOP(test_and_change_bits, xor, PPC_RELEASE_BARRIER,
-	      PPC_ACQUIRE_BARRIER, 0)
+DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
+	      PPC_ATOMIC_EXIT_BARRIER, 0)
+DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
+	      PPC_ATOMIC_EXIT_BARRIER, 0)
 
 static __inline__ int test_and_set_bit(unsigned long nr,
 				       volatile unsigned long *addr)
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index c94e4a3..2a9cf84 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -11,12 +11,13 @@
 
 #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
   __asm__ __volatile ( \
-	PPC_RELEASE_BARRIER \
+	PPC_ATOMIC_ENTRY_BARRIER \
 "1:	lwarx	%0,0,%2\n" \
 	insn \
 	PPC405_ERR77(0, %2) \
 "2:	stwcx.	%1,0,%2\n" \
 	"bne-	1b\n" \
+	PPC_ATOMIC_EXIT_BARRIER \
 	"li	%1,0\n" \
 "3:	.section .fixup,\"ax\"\n" \
 "4:	li	%1,%3\n" \
@@ -92,14 +93,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 		return -EFAULT;
 
         __asm__ __volatile__ (
-        PPC_RELEASE_BARRIER
+        PPC_ATOMIC_ENTRY_BARRIER
 "1:     lwarx   %1,0,%3         # futex_atomic_cmpxchg_inatomic\n\
         cmpw    0,%1,%4\n\
         bne-    3f\n"
         PPC405_ERR77(0,%3)
 "2:     stwcx.  %5,0,%3\n\
         bne-    1b\n"
-        PPC_ACQUIRE_BARRIER
+        PPC_ATOMIC_EXIT_BARRIER
 "3:	.section .fixup,\"ax\"\n\
 4:	li	%0,%6\n\
 	b	3b\n\
diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h
index d7cab44..24fc618 100644
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -41,11 +41,15 @@ static inline void isync(void)
 	START_LWSYNC_SECTION(97);			\
 	isync;						\
 	MAKE_LWSYNC_SECTION_ENTRY(97, __lwsync_fixup);
-#define PPC_ACQUIRE_BARRIER	"\n" stringify_in_c(__PPC_ACQUIRE_BARRIER)
-#define PPC_RELEASE_BARRIER	stringify_in_c(LWSYNC) "\n"
+#define PPC_ACQUIRE_BARRIER	 "\n" stringify_in_c(__PPC_ACQUIRE_BARRIER)
+#define PPC_RELEASE_BARRIER	 stringify_in_c(LWSYNC) "\n"
+#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(LWSYNC) "\n"
+#define PPC_ATOMIC_EXIT_BARRIER	 "\n" stringify_in_c(sync) "\n"
 #else
 #define PPC_ACQUIRE_BARRIER
 #define PPC_RELEASE_BARRIER
+#define PPC_ATOMIC_ENTRY_BARRIER
+#define PPC_ATOMIC_EXIT_BARRIER
 #endif
 
 #endif /* __KERNEL__ */

^ permalink raw reply related

* Re: [PATCH] P1021: set IReady in QE Microcode Upload
From: Tabi Timur-B04825 @ 2011-11-16  3:02 UTC (permalink / raw)
  To: Kokoris, Ioannis
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <026483A88B848047A08C3F03E20822D0267B1F4BDB@MCHP058A.global-ad.net>

On Mon, Nov 14, 2011 at 2:55 AM, Kokoris, Ioannis
<ioannis.kokoris@siemens-enterprise.com> wrote:
> Ready register is needed for ROM-less devices such as P1021, MPC859, MPC8306 etc.
> For ROM-based devices such as MPC8323 the Ready register does not exist.
> Is there a global definition for conditionally including this code?

I'll have to check.  But this patch can't be applied as-is unless it's
proven safe for all QE-enabled chips.

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Moffett, Kyle D @ 2011-11-16  0:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Anton Blanchard, B04825@freescale.com,
	linux-kernel@vger.kernel.org, paul.gortmaker@windriver.com,
	scottwood@freescale.com, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1321400792.3170.17.camel@pasglop>

On Nov 15, 2011, at 18:46, Benjamin Herrenschmidt wrote:
> On Tue, 2011-11-15 at 16:45 -0600, Moffett, Kyle D wrote:
>> 
>> With that said, I'm curious about the origin of the PPC32 ASM.  In
>> particular, it looks like it was generated by GCC at some point in the
>> distant past, and I'm wondering if there's a good way to rewrite that
>> file in C and trick GCC into generating the relevant exception tables
>> for it?
> 
> There is some serious history in there :-)
> 
> I would check with Anton, he's been doing some performance work on those
> lately (the 64-bit ones).
> 
> It's probably worth throwing a proof-of-concept simpler variant for
> 32-bit at least on the table and have people compare the perfs
> (typically network perfs). I can test on a range of ppc32 here (6xx,
> 7xxx, 4xx).

Ok, so there's not really a good way to make GCC generate the exception
tables itself.  I've come up with several overly-clever ways to do most
of what we would want using "asm goto" except that (1) "asm goto" cannot
have register outputs, and (2) "asm goto" is only available in GCC 4.5+

I could easily work around the former by putting the code into its own
file and creating a "global" register variable just for that file, but
the GCC 4.5+ dependency is a total nonstarter.

I'm trying to see if I can make it look better than it does now with
some judicious use of inline ASM.  At the very least, it should be
possible to have a wrapper function written in C which calls the ASM
guts with the correct cache params.

More importantly, the ASM code needs to use something other than
totally arbitrary numbers for labels.  :-D

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/

^ permalink raw reply

* Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree
From: Tabi Timur-B04825 @ 2011-11-15 23:49 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev@ozlabs.org
In-Reply-To: <1320941653-29797-16-git-send-email-galak@kernel.crashing.org>

On Thu, Nov 10, 2011 at 10:13 AM, Kumar Gala <galak@kernel.crashing.org> wrote:
>
> +       lbc: localbus@fffe05000 {
>                reg = <0 0xffe05000 0 0x1000>;
> -               interrupts = <19 2 0 0>;

I just noticed this bug in the original p1022ds.dts, and I see you're
carrying it over here.  The reg property should look like this:

reg = <0xf 0xffe05000 0 0x1000>;
       ^^^

Do you want to fix this here, or do you want me to submit a patch that
fixes the original p1022ds.dts?

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Benjamin Herrenschmidt @ 2011-11-15 23:46 UTC (permalink / raw)
  To: Moffett, Kyle D
  Cc: Anton Blanchard, B04825@freescale.com,
	linux-kernel@vger.kernel.org, paul.gortmaker@windriver.com,
	scottwood@freescale.com, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <022078B9-CD41-4D24-B44A-F85256A69D0A@boeing.com>

On Tue, 2011-11-15 at 16:45 -0600, Moffett, Kyle D wrote:

> I guess that's doable, although I have to admit that idea almost gives
> me more of a headache than trying to fix up the 32-bit ASM.
> 
> One thing that bothers me in particular is that both 32/64 versions of
> __copy_tofrom_user() are dramatically overcomplicated for what they
> ought to be doing.
> 
> It would seem that if we get a page fault during an unaligned copy, we
> ought to just give up and fall back to a simple byte-by-byte copy loop
> from wherever we left off.  That would eliminate 90% of the ugly
> special cases without actually hurting performance, right?
> 
> For a page-fault during a cacheline-aligned copy, we should be able to
> handle the exception and retry from the last cacheline without much
> logic, again with good performance.
> 
> With that said, I'm curious about the origin of the PPC32 ASM.  In
> particular, it looks like it was generated by GCC at some point in the
> distant past, and I'm wondering if there's a good way to rewrite that
> file in C and trick GCC into generating the relevant exception tables
> for it?

There is some serious history in there :-)

I would check with Anton, he's been doing some performance work on those
lately (the 64-bit ones).

It's probably worth throwing a proof-of-concept simpler variant for
32-bit at least on the table and have people compare the perfs
(typically network perfs). I can test on a range of ppc32 here (6xx,
7xxx, 4xx).

Cheers,
Ben.

^ permalink raw reply

* [patch 1/1] drivers/edac/mpc85xx_edac.c: fix memory controller compatible for edac
From: akpm @ 2011-11-15 22:52 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, akpm, Shaohui.Xie

From: Shaohui Xie <Shaohui.Xie@freescale.com>
Subject: drivers/edac/mpc85xx_edac.c: fix memory controller compatible for edac

compatible in dts has been changed, so the driver needs to be updated
accordingly.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 drivers/edac/mpc85xx_edac.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff -puN drivers/edac/mpc85xx_edac.c~drivers-edac-mpc85xx_edacc-fix-memory-controller-compatible-for-edac drivers/edac/mpc85xx_edac.c
--- a/drivers/edac/mpc85xx_edac.c~drivers-edac-mpc85xx_edacc-fix-memory-controller-compatible-for-edac
+++ a/drivers/edac/mpc85xx_edac.c
@@ -1128,7 +1128,7 @@ static struct of_device_id mpc85xx_mc_er
 	{ .compatible = "fsl,p1020-memory-controller", },
 	{ .compatible = "fsl,p1021-memory-controller", },
 	{ .compatible = "fsl,p2020-memory-controller", },
-	{ .compatible = "fsl,p4080-memory-controller", },
+	{ .compatible = "fsl,qoriq-memory-controller", },
 	{},
 };
 MODULE_DEVICE_TABLE(of, mpc85xx_mc_err_of_match);
_

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Moffett, Kyle D @ 2011-11-15 22:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: B04825@freescale.com, linux-kernel@vger.kernel.org,
	paul.gortmaker@windriver.com, scottwood@freescale.com,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1321396146.3170.11.camel@pasglop>

On Nov 15, 2011, at 17:29, Benjamin Herrenschmidt wrote:
> On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
>> Unfortunately, I've been staring at PPC asm for long enough that I
>> have a migraine headache and I'm going to have to stop here for now.
>> If somebody else wants to tackle fixing up the 32-bit copy_page() and
>> __copy_tofrom_user() routines it would be highly appreciated. 
> 
> Yeah that's the one everybody's avoiding :-)
> 
> What about my idea of instead compiling it multiple times with a
> different size and fixing up the branch to call the right one ?

I guess that's doable, although I have to admit that idea almost gives
me more of a headache than trying to fix up the 32-bit ASM.

One thing that bothers me in particular is that both 32/64 versions of
__copy_tofrom_user() are dramatically overcomplicated for what they
ought to be doing.

It would seem that if we get a page fault during an unaligned copy, we
ought to just give up and fall back to a simple byte-by-byte copy loop
from wherever we left off.  That would eliminate 90% of the ugly
special cases without actually hurting performance, right?

For a page-fault during a cacheline-aligned copy, we should be able to
handle the exception and retry from the last cacheline without much
logic, again with good performance.

With that said, I'm curious about the origin of the PPC32 ASM.  In
particular, it looks like it was generated by GCC at some point in the
distant past, and I'm wondering if there's a good way to rewrite that
file in C and trick GCC into generating the relevant exception tables
for it?

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/

^ permalink raw reply

* Re: [RFC PATCH 2/2] WIP: PowerPC cache cleanup
From: Benjamin Herrenschmidt @ 2011-11-15 22:42 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: B04825, linux-kernel, paul.gortmaker, scottwood, linuxppc-dev
In-Reply-To: <1321370524-2740-1-git-send-email-Kyle.D.Moffett@boeing.com>

On Tue, 2011-11-15 at 10:22 -0500, Kyle Moffett wrote:
> [My apologies for the resend, it does not seem to have hit the MLs.
> I think my git send-email "cc-cmd" may have broken somehow, oops.]

Or the ML took a while because it's big :-) I got both.

I'll try to review this week. Probably won't get to it today tho.

Thanks for looking at this !

Cheers,
Ben.

> This badly needs breaking up, and a better changelog... oh well...
> 
> The big changes:
> 
> * The "ppc64_caches" structure is now "powerpc_caches" and is used on
>   both PPC32 and PPC64.  I hated staring at the pages and pages of
>   assembly code, so nearly all of the functions are now C with tiny
>   snippets of inline ASM in the loops.
> 
> * Lots of ugly assembly functions in arch/powerpc/kernel/misc_*.S were
>   rewritten as cleaner inline ASM in arch/powerpc/mm/cache.c
> 
> * I'm not sure that the physical address functions from those files
>   actually came out cleaner, but they are now more correct.
> 
> * I'm not 100% sure I like the new FOR_EACH_CACHELINE() macro, but it
>   sure does make a lot of the other code much cleaner.
> 
> * I have a bit of a temptation to try to merge the 32/64-bit variants
>   of copy_page() into a single C function.  A quick test seems to show
>   that I can get nearly identical output to the 64-bit ASM with very
>   little work.
> 
> 
> ---
>  arch/powerpc/include/asm/cache.h             |  155 ++++++++++++---
>  arch/powerpc/include/asm/cacheflush.h        |    3 -
>  arch/powerpc/include/asm/page.h              |    6 +
>  arch/powerpc/include/asm/page_32.h           |    4 +-
>  arch/powerpc/include/asm/page_64.h           |   17 --
>  arch/powerpc/kernel/align.c                  |    7 +-
>  arch/powerpc/kernel/asm-offsets.c            |   13 +-
>  arch/powerpc/kernel/head_32.S                |    9 +-
>  arch/powerpc/kernel/head_64.S                |    2 +-
>  arch/powerpc/kernel/misc_32.S                |  193 ------------------
>  arch/powerpc/kernel/misc_64.S                |  182 -----------------
>  arch/powerpc/kernel/ppc_ksyms.c              |    3 -
>  arch/powerpc/kernel/setup-common.c           |  103 ++++++++++
>  arch/powerpc/kernel/setup.h                  |    1 +
>  arch/powerpc/kernel/setup_32.c               |   11 +-
>  arch/powerpc/kernel/setup_64.c               |  118 +----------
>  arch/powerpc/kernel/vdso.c                   |   27 +--
>  arch/powerpc/lib/copypage_64.S               |   10 +-
>  arch/powerpc/mm/Makefile                     |    2 +-
>  arch/powerpc/mm/cache.c                      |  279 ++++++++++++++++++++++++++
>  arch/powerpc/mm/dma-noncoherent.c            |    2 +-
>  arch/powerpc/platforms/52xx/lite5200_sleep.S |    9 +-
>  arch/powerpc/platforms/powermac/pci.c        |    2 +-
>  arch/powerpc/xmon/xmon.c                     |   53 +++---
>  drivers/macintosh/smu.c                      |    8 +-
>  25 files changed, 599 insertions(+), 620 deletions(-)
>  create mode 100644 arch/powerpc/mm/cache.c
> 
> diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
> index 4b50941..b1dc08f 100644
> --- a/arch/powerpc/include/asm/cache.h
> +++ b/arch/powerpc/include/asm/cache.h
> @@ -3,47 +3,142 @@
>  
>  #ifdef __KERNEL__
>  
> -
> -/* bytes per L1 cache line */
> -#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
> -#define L1_CACHE_SHIFT		4
> -#define MAX_COPY_PREFETCH	1
> +/*
> + * Various PowerPC CPUs which are otherwise compatible have different L1
> + * cache line sizes.
> + *
> + * Unfortunately, lots of kernel code assumes that L1_CACHE_BYTES and
> + * L1_CACHE_SHIFT are compile-time constants that can be used to align
> + * data-structures to avoid false cacheline sharing, so we can't just
> + * compute them at runtime from the cputable values.
> + *
> + * So for alignment purposes, we will compute these values as safe maximums
> + * of all the CPU support compiled into the kernel.
> + */
> +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_47x)
> +# define L1_CACHE_SHIFT_MAX 7 /* 128-byte cache blocks */
>  #elif defined(CONFIG_PPC_E500MC)
> -#define L1_CACHE_SHIFT		6
> -#define MAX_COPY_PREFETCH	4
> -#elif defined(CONFIG_PPC32)
> -#define MAX_COPY_PREFETCH	4
> -#if defined(CONFIG_PPC_47x)
> -#define L1_CACHE_SHIFT		7
> +# define L1_CACHE_SHIFT_MAX 6 /* 64-byte cache blocks */
>  #else
> -#define L1_CACHE_SHIFT		5
> +# define L1_CACHE_SHIFT_MAX 5 /* 32-byte cache blocks */
>  #endif
> +#define L1_CACHE_BYTES_MAX (1 << L1_CACHE_SHIFT_MAX)
> +
> +#define L1_CACHE_SHIFT  L1_CACHE_SHIFT_MAX
> +#define L1_CACHE_BYTES  L1_CACHE_BYTES_MAX
> +#define SMP_CACHE_BYTES L1_CACHE_BYTES_MAX
> +
> +/*
> + * Unfortunately, for other purposes, we can't just use a safe maximum value
> + * because it gets used in loops when invalidating or clearing cachelines and
> + * it would be very bad to only flush/invalidate/zero/etc every 4th one.
> + *
> + * During early initialization we load these values from the device-tree and
> + * the cputable into the powerpc_caches structure, but we need to be able to
> + * clear pages before that occurs, so these need sane default values.
> + *
> + * As explained in the powerpc_caches structure definition, the defaults
> + * should be safe minimums, so that's what we compute here.
> + */
> +#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
> +# define L1_CACHE_SHIFT_MIN 4 /* 16-byte cache blocks */
> +#elif defined(CONFIG_PPC32)
> +# define L1_CACHE_SHIFT_MIN 5 /* 32-byte cache blocks */
>  #else /* CONFIG_PPC64 */
> -#define L1_CACHE_SHIFT		7
> +# define L1_CACHE_SHIFT_MIN 6 /* 64-byte cache blocks */
>  #endif
> +#define L1_CACHE_BYTES_MIN (1 << L1_CACHE_SHIFT_MIN)
>  
> -#define	L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
> +/*
> + * Apparently the 8xx and the 403GCX have tiny caches, so they never prefetch
> + * more than a single cacheline in the ASM memory copy functions.
> + *
> + * All other 32-bit CPUs prefetch 4 cachelines, and the 64-bit CPUs have
> + * their own copy routines which prefetch the entire page.
> + */
> +#ifdef PPC32
> +# if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
> +#  define MAX_COPY_PREFETCH 1
> +# else
> +#  define MAX_COPY_PREFETCH 4
> +# endif
> +#endif
>  
> -#define	SMP_CACHE_BYTES		L1_CACHE_BYTES
> +#ifndef __ASSEMBLY__
>  
> -#if defined(__powerpc64__) && !defined(__ASSEMBLY__)
> -struct ppc64_caches {
> -	u32	dsize;			/* L1 d-cache size */
> -	u32	dline_size;		/* L1 d-cache line size	*/
> -	u32	log_dline_size;
> -	u32	dlines_per_page;
> -	u32	isize;			/* L1 i-cache size */
> -	u32	iline_size;		/* L1 i-cache line size	*/
> -	u32	log_iline_size;
> -	u32	ilines_per_page;
> -};
> +/*
> + * A handy macro to iterate over all the cachelines referring to memory from
> + * "START" through "STOP - 1", inclusive.
> + */
> +#define FOR_EACH_CACHELINE(LINE, START, STOP, CACHE)			\
> +	for (u32 linesize__ = powerpc_caches.CACHE##_block_bytes,	\
> +			(LINE) = (START) & ~(linesize__ - 1);		\
> +			(LINE) < (STOP); (LINE) += linesize__)
> +
> +/* Write out a data cache block if it is dirty */
> +static inline void dcbst(unsigned long addr)
> +{
> +	asm volatile("dcbst %y0" :: "Z"(addr) : "memory");
> +}
>  
> -extern struct ppc64_caches ppc64_caches;
> -#endif /* __powerpc64__ && ! __ASSEMBLY__ */
> +/* Invalidate a data cache block (will lose data if dirty!) */
> +static inline void dcbi(unsigned long addr)
> +{
> +	asm volatile("dcbi %y0" :: "Z"(addr) : "memory");
> +}
> +
> +/* Write out (if dirty) and invalidate a data cache block */
> +static inline void dcbf(unsigned long addr)
> +{
> +	asm volatile("dcbf %y0" :: "Z"(addr) : "memory");
> +}
> +
> +/* Populate a data cache block with zeros */
> +static inline void dcbz(unsigned long addr)
> +{
> +	asm volatile("dcbz %y0" :: "Z"(addr) : "memory");
> +}
> +
> +/* Invalidate an instruction cache block */
> +static inline void icbi(unsigned long addr)
> +{
> +	asm volatile("icbi %y0" :: "Z"(addr) : "memory");
> +}
> +
> +/*
> + * This structure contains the various PowerPC cache parameters computed
> + * shortly after the device-tree has been unflattened during boot.
> + *
> + * Prior to that they have statically initialized values from L1_CACHE_*_MIN
> + * computed above.
> + *
> + * NOTE: If the dcache/icache are separate then ucache_* should be zeroed,
> + *       otherwise dcache == icache == ucache.
> + */
> +struct powerpc_caches {
> +	/* Data cache parameters */
> +	u32 dcache_total_bytes;
> +	u32 dcache_block_bytes;
> +	u32 dcache_block_shift;
> +	u32 dcache_blocks_per_page;
> +
> +	/* Instruction cache parameters */
> +	u32 icache_total_bytes;
> +	u32 icache_block_bytes;
> +	u32 icache_block_shift;
> +	u32 icache_blocks_per_page;
> +
> +	/* Unified cache parameters (If != 0, all 3 caches must be equal) */
> +	u32 ucache_total_bytes;
> +	u32 ucache_block_bytes;
> +	u32 ucache_block_shift;
> +	u32 ucache_blocks_per_page;
> +};
> +extern struct powerpc_caches powerpc_caches;
>  
> -#if !defined(__ASSEMBLY__)
>  #define __read_mostly __attribute__((__section__(".data..read_mostly")))
> -#endif
> +
> +#endif /* not __ASSEMBLY__ */
>  
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_CACHE_H */
> diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
> index ab9e402..8646443 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -47,12 +47,9 @@ extern void __flush_dcache_icache_phys(unsigned long physaddr);
>  #endif /* CONFIG_PPC32 && !CONFIG_BOOKE */
>  
>  extern void flush_dcache_range(unsigned long start, unsigned long stop);
> -#ifdef CONFIG_PPC32
>  extern void clean_dcache_range(unsigned long start, unsigned long stop);
>  extern void invalidate_dcache_range(unsigned long start, unsigned long stop);
> -#endif /* CONFIG_PPC32 */
>  #ifdef CONFIG_PPC64
> -extern void flush_inval_dcache_range(unsigned long start, unsigned long stop);
>  extern void flush_dcache_phys_range(unsigned long start, unsigned long stop);
>  #endif
>  
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index dd9c4fd..b2e24ce 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -286,11 +286,17 @@ static inline int hugepd_ok(hugepd_t hpd)
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
>  struct page;
> +extern void clear_pages(void *page, int order);
>  extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
>  extern void copy_user_page(void *to, void *from, unsigned long vaddr,
>  		struct page *p);
>  extern int page_is_ram(unsigned long pfn);
>  
> +static inline void clear_page(void *page)
> +{
> +	clear_pages(page, 0);
> +}
> +
>  #ifdef CONFIG_PPC_SMLPAR
>  void arch_free_page(struct page *page, int order);
>  #define HAVE_ARCH_FREE_PAGE
> diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
> index 68d73b2..12ae694 100644
> --- a/arch/powerpc/include/asm/page_32.h
> +++ b/arch/powerpc/include/asm/page_32.h
> @@ -10,7 +10,7 @@
>  #define VM_DATA_DEFAULT_FLAGS	VM_DATA_DEFAULT_FLAGS32
>  
>  #ifdef CONFIG_NOT_COHERENT_CACHE
> -#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
> +#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES_MAX
>  #endif
>  
>  #ifdef CONFIG_PTE_64BIT
> @@ -37,8 +37,6 @@ typedef unsigned long pte_basic_t;
>  #endif
>  
>  struct page;
> -extern void clear_pages(void *page, int order);
> -static inline void clear_page(void *page) { clear_pages(page, 0); }
>  extern void copy_page(void *to, void *from);
>  
>  #include <asm-generic/getorder.h>
> diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
> index fb40ede..7e156f6 100644
> --- a/arch/powerpc/include/asm/page_64.h
> +++ b/arch/powerpc/include/asm/page_64.h
> @@ -42,23 +42,6 @@
>  
>  typedef unsigned long pte_basic_t;
>  
> -static __inline__ void clear_page(void *addr)
> -{
> -	unsigned long lines, line_size;
> -
> -	line_size = ppc64_caches.dline_size;
> -	lines = ppc64_caches.dlines_per_page;
> -
> -	__asm__ __volatile__(
> -	"mtctr	%1	# clear_page\n\
> -1:      dcbz	0,%0\n\
> -	add	%0,%0,%3\n\
> -	bdnz+	1b"
> -        : "=r" (addr)
> -        : "r" (lines), "0" (addr), "r" (line_size)
> -	: "ctr", "memory");
> -}
> -
>  extern void copy_page(void *to, void *from);
>  
>  /* Log 2 of page table size */
> diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
> index 8184ee9..debfb99 100644
> --- a/arch/powerpc/kernel/align.c
> +++ b/arch/powerpc/kernel/align.c
> @@ -233,14 +233,9 @@ static inline unsigned make_dsisr(unsigned instr)
>   */
>  static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
>  {
> +	int i, size = powerpc_caches.dcache_block_bytes;
>  	long __user *p;
> -	int i, size;
>  
> -#ifdef __powerpc64__
> -	size = ppc64_caches.dline_size;
> -#else
> -	size = L1_CACHE_BYTES;
> -#endif
>  	p = (long __user *) (regs->dar & -size);
>  	if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size))
>  		return -EFAULT;
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 7c5324f..505b25a 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -126,13 +126,14 @@ int main(void)
>  	DEFINE(TI_TASK, offsetof(struct thread_info, task));
>  	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
>  
> +	DEFINE(DCACHE_BLOCK_SHIFT,	offsetof(struct powerpc_caches, dcache_block_shift));
> +	DEFINE(DCACHE_BLOCK_BYTES,	offsetof(struct powerpc_caches, dcache_block_bytes));
> +	DEFINE(DCACHE_BLOCKS_PER_PAGE,	offsetof(struct powerpc_caches, dcache_blocks_per_page));
> +	DEFINE(ICACHE_BLOCK_SHIFT,	offsetof(struct powerpc_caches, icache_block_shift));
> +	DEFINE(ICACHE_BLOCK_BYTES,	offsetof(struct powerpc_caches, icache_block_bytes));
> +	DEFINE(ICACHE_BLOCKS_PER_PAGE,	offsetof(struct powerpc_caches, icache_blocks_per_page));
> +
>  #ifdef CONFIG_PPC64
> -	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
> -	DEFINE(DCACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_dline_size));
> -	DEFINE(DCACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, dlines_per_page));
> -	DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size));
> -	DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size));
> -	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
>  	/* paca */
>  	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
>  	DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
> diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
> index 0654dba..8abc44a 100644
> --- a/arch/powerpc/kernel/head_32.S
> +++ b/arch/powerpc/kernel/head_32.S
> @@ -786,7 +786,14 @@ relocate_kernel:
>  _ENTRY(copy_and_flush)
>  	addi	r5,r5,-4
>  	addi	r6,r6,-4
> -4:	li	r0,L1_CACHE_BYTES/4
> +4:	li	r0,L1_CACHE_BYTES_MIN/4	/* Use the smallest common	*/
> +					/* denominator cache line	*/
> +					/* size.  This results in	*/
> +					/* extra cache line flushes	*/
> +					/* but operation is correct.	*/
> +					/* Can't get cache line size	*/
> +					/* from device-tree yet		*/
> +
>  	mtctr	r0
>  3:	addi	r6,r6,4			/* copy a cache line */
>  	lwzx	r0,r6,r4
> diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
> index 06c7251..183d371 100644
> --- a/arch/powerpc/kernel/head_64.S
> +++ b/arch/powerpc/kernel/head_64.S
> @@ -480,7 +480,7 @@ p_end:	.llong	_end - _stext
>  _GLOBAL(copy_and_flush)
>  	addi	r5,r5,-8
>  	addi	r6,r6,-8
> -4:	li	r0,8			/* Use the smallest common	*/
> +4:	li	r0,L1_CACHE_BYTES_MIN/8	/* Use the smallest common	*/
>  					/* denominator cache line	*/
>  					/* size.  This results in	*/
>  					/* extra cache line flushes	*/
> diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
> index f7d760a..ee61600 100644
> --- a/arch/powerpc/kernel/misc_32.S
> +++ b/arch/powerpc/kernel/misc_32.S
> @@ -321,199 +321,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
>  	blr
>  
>  /*
> - * Write any modified data cache blocks out to memory
> - * and invalidate the corresponding instruction cache blocks.
> - * This is a no-op on the 601.
> - *
> - * flush_icache_range(unsigned long start, unsigned long stop)
> - */
> -_KPROBE(__flush_icache_range)
> -BEGIN_FTR_SECTION
> -	blr				/* for 601, do nothing */
> -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
> -	li	r5,L1_CACHE_BYTES-1
> -	andc	r3,r3,r5
> -	subf	r4,r3,r4
> -	add	r4,r4,r5
> -	srwi.	r4,r4,L1_CACHE_SHIFT
> -	beqlr
> -	mtctr	r4
> -	mr	r6,r3
> -1:	dcbst	0,r3
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync				/* wait for dcbst's to get to ram */
> -#ifndef CONFIG_44x
> -	mtctr	r4
> -2:	icbi	0,r6
> -	addi	r6,r6,L1_CACHE_BYTES
> -	bdnz	2b
> -#else
> -	/* Flash invalidate on 44x because we are passed kmapped addresses and
> -	   this doesn't work for userspace pages due to the virtually tagged
> -	   icache.  Sigh. */
> -	iccci	0, r0
> -#endif
> -	sync				/* additional sync needed on g4 */
> -	isync
> -	blr
> -/*
> - * Write any modified data cache blocks out to memory.
> - * Does not invalidate the corresponding cache lines (especially for
> - * any corresponding instruction cache).
> - *
> - * clean_dcache_range(unsigned long start, unsigned long stop)
> - */
> -_GLOBAL(clean_dcache_range)
> -	li	r5,L1_CACHE_BYTES-1
> -	andc	r3,r3,r5
> -	subf	r4,r3,r4
> -	add	r4,r4,r5
> -	srwi.	r4,r4,L1_CACHE_SHIFT
> -	beqlr
> -	mtctr	r4
> -
> -1:	dcbst	0,r3
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync				/* wait for dcbst's to get to ram */
> -	blr
> -
> -/*
> - * Write any modified data cache blocks out to memory and invalidate them.
> - * Does not invalidate the corresponding instruction cache blocks.
> - *
> - * flush_dcache_range(unsigned long start, unsigned long stop)
> - */
> -_GLOBAL(flush_dcache_range)
> -	li	r5,L1_CACHE_BYTES-1
> -	andc	r3,r3,r5
> -	subf	r4,r3,r4
> -	add	r4,r4,r5
> -	srwi.	r4,r4,L1_CACHE_SHIFT
> -	beqlr
> -	mtctr	r4
> -
> -1:	dcbf	0,r3
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync				/* wait for dcbst's to get to ram */
> -	blr
> -
> -/*
> - * Like above, but invalidate the D-cache.  This is used by the 8xx
> - * to invalidate the cache so the PPC core doesn't get stale data
> - * from the CPM (no cache snooping here :-).
> - *
> - * invalidate_dcache_range(unsigned long start, unsigned long stop)
> - */
> -_GLOBAL(invalidate_dcache_range)
> -	li	r5,L1_CACHE_BYTES-1
> -	andc	r3,r3,r5
> -	subf	r4,r3,r4
> -	add	r4,r4,r5
> -	srwi.	r4,r4,L1_CACHE_SHIFT
> -	beqlr
> -	mtctr	r4
> -
> -1:	dcbi	0,r3
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync				/* wait for dcbi's to get to ram */
> -	blr
> -
> -/*
> - * Flush a particular page from the data cache to RAM.
> - * Note: this is necessary because the instruction cache does *not*
> - * snoop from the data cache.
> - * This is a no-op on the 601 which has a unified cache.
> - *
> - *	void __flush_dcache_icache(void *page)
> - */
> -_GLOBAL(__flush_dcache_icache)
> -BEGIN_FTR_SECTION
> -	blr
> -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
> -	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
> -	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
> -	mtctr	r4
> -	mr	r6,r3
> -0:	dcbst	0,r3				/* Write line to ram */
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	0b
> -	sync
> -#ifdef CONFIG_44x
> -	/* We don't flush the icache on 44x. Those have a virtual icache
> -	 * and we don't have access to the virtual address here (it's
> -	 * not the page vaddr but where it's mapped in user space). The
> -	 * flushing of the icache on these is handled elsewhere, when
> -	 * a change in the address space occurs, before returning to
> -	 * user space
> -	 */
> -BEGIN_MMU_FTR_SECTION
> -	blr
> -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
> -#endif /* CONFIG_44x */
> -	mtctr	r4
> -1:	icbi	0,r6
> -	addi	r6,r6,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync
> -	isync
> -	blr
> -
> -#ifndef CONFIG_BOOKE
> -/*
> - * Flush a particular page from the data cache to RAM, identified
> - * by its physical address.  We turn off the MMU so we can just use
> - * the physical address (this may be a highmem page without a kernel
> - * mapping).
> - *
> - *	void __flush_dcache_icache_phys(unsigned long physaddr)
> - */
> -_GLOBAL(__flush_dcache_icache_phys)
> -BEGIN_FTR_SECTION
> -	blr					/* for 601, do nothing */
> -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
> -	mfmsr	r10
> -	rlwinm	r0,r10,0,28,26			/* clear DR */
> -	mtmsr	r0
> -	isync
> -	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
> -	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
> -	mtctr	r4
> -	mr	r6,r3
> -0:	dcbst	0,r3				/* Write line to ram */
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	0b
> -	sync
> -	mtctr	r4
> -1:	icbi	0,r6
> -	addi	r6,r6,L1_CACHE_BYTES
> -	bdnz	1b
> -	sync
> -	mtmsr	r10				/* restore DR */
> -	isync
> -	blr
> -#endif /* CONFIG_BOOKE */
> -
> -/*
> - * Clear pages using the dcbz instruction, which doesn't cause any
> - * memory traffic (except to write out any cache lines which get
> - * displaced).  This only works on cacheable memory.
> - *
> - * void clear_pages(void *page, int order) ;
> - */
> -_GLOBAL(clear_pages)
> -	li	r0,PAGE_SIZE/L1_CACHE_BYTES
> -	slw	r0,r0,r4
> -	mtctr	r0
> -1:	dcbz	0,r3
> -	addi	r3,r3,L1_CACHE_BYTES
> -	bdnz	1b
> -	blr
> -
> -/*
>   * Copy a whole page.  We use the dcbz instruction on the destination
>   * to reduce memory traffic (it eliminates the unnecessary reads of
>   * the destination into cache).  This requires that the destination
> diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
> index 616921e..500fd61 100644
> --- a/arch/powerpc/kernel/misc_64.S
> +++ b/arch/powerpc/kernel/misc_64.S
> @@ -53,188 +53,6 @@ _GLOBAL(call_handle_irq)
>  	mtlr	r0
>  	blr
>  
> -	.section	".toc","aw"
> -PPC64_CACHES:
> -	.tc		ppc64_caches[TC],ppc64_caches
> -	.section	".text"
> -
> -/*
> - * Write any modified data cache blocks out to memory
> - * and invalidate the corresponding instruction cache blocks.
> - *
> - * flush_icache_range(unsigned long start, unsigned long stop)
> - *
> - *   flush all bytes from start through stop-1 inclusive
> - */
> -
> -_KPROBE(__flush_icache_range)
> -
> -/*
> - * Flush the data cache to memory 
> - * 
> - * Different systems have different cache line sizes
> - * and in some cases i-cache and d-cache line sizes differ from
> - * each other.
> - */
> - 	ld	r10,PPC64_CACHES@toc(r2)
> -	lwz	r7,DCACHEL1LINESIZE(r10)/* Get cache line size */
> -	addi	r5,r7,-1
> -	andc	r6,r3,r5		/* round low to line bdy */
> -	subf	r8,r6,r4		/* compute length */
> -	add	r8,r8,r5		/* ensure we get enough */
> -	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of cache line size */
> -	srw.	r8,r8,r9		/* compute line count */
> -	beqlr				/* nothing to do? */
> -	mtctr	r8
> -1:	dcbst	0,r6
> -	add	r6,r6,r7
> -	bdnz	1b
> -	sync
> -
> -/* Now invalidate the instruction cache */
> -	
> -	lwz	r7,ICACHEL1LINESIZE(r10)	/* Get Icache line size */
> -	addi	r5,r7,-1
> -	andc	r6,r3,r5		/* round low to line bdy */
> -	subf	r8,r6,r4		/* compute length */
> -	add	r8,r8,r5
> -	lwz	r9,ICACHEL1LOGLINESIZE(r10)	/* Get log-2 of Icache line size */
> -	srw.	r8,r8,r9		/* compute line count */
> -	beqlr				/* nothing to do? */
> -	mtctr	r8
> -2:	icbi	0,r6
> -	add	r6,r6,r7
> -	bdnz	2b
> -	isync
> -	blr
> -	.previous .text
> -/*
> - * Like above, but only do the D-cache.
> - *
> - * flush_dcache_range(unsigned long start, unsigned long stop)
> - *
> - *    flush all bytes from start to stop-1 inclusive
> - */
> -_GLOBAL(flush_dcache_range)
> -
> -/*
> - * Flush the data cache to memory 
> - * 
> - * Different systems have different cache line sizes
> - */
> - 	ld	r10,PPC64_CACHES@toc(r2)
> -	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
> -	addi	r5,r7,-1
> -	andc	r6,r3,r5		/* round low to line bdy */
> -	subf	r8,r6,r4		/* compute length */
> -	add	r8,r8,r5		/* ensure we get enough */
> -	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
> -	srw.	r8,r8,r9		/* compute line count */
> -	beqlr				/* nothing to do? */
> -	mtctr	r8
> -0:	dcbst	0,r6
> -	add	r6,r6,r7
> -	bdnz	0b
> -	sync
> -	blr
> -
> -/*
> - * Like above, but works on non-mapped physical addresses.
> - * Use only for non-LPAR setups ! It also assumes real mode
> - * is cacheable. Used for flushing out the DART before using
> - * it as uncacheable memory 
> - *
> - * flush_dcache_phys_range(unsigned long start, unsigned long stop)
> - *
> - *    flush all bytes from start to stop-1 inclusive
> - */
> -_GLOBAL(flush_dcache_phys_range)
> - 	ld	r10,PPC64_CACHES@toc(r2)
> -	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
> -	addi	r5,r7,-1
> -	andc	r6,r3,r5		/* round low to line bdy */
> -	subf	r8,r6,r4		/* compute length */
> -	add	r8,r8,r5		/* ensure we get enough */
> -	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
> -	srw.	r8,r8,r9		/* compute line count */
> -	beqlr				/* nothing to do? */
> -	mfmsr	r5			/* Disable MMU Data Relocation */
> -	ori	r0,r5,MSR_DR
> -	xori	r0,r0,MSR_DR
> -	sync
> -	mtmsr	r0
> -	sync
> -	isync
> -	mtctr	r8
> -0:	dcbst	0,r6
> -	add	r6,r6,r7
> -	bdnz	0b
> -	sync
> -	isync
> -	mtmsr	r5			/* Re-enable MMU Data Relocation */
> -	sync
> -	isync
> -	blr
> -
> -_GLOBAL(flush_inval_dcache_range)
> - 	ld	r10,PPC64_CACHES@toc(r2)
> -	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
> -	addi	r5,r7,-1
> -	andc	r6,r3,r5		/* round low to line bdy */
> -	subf	r8,r6,r4		/* compute length */
> -	add	r8,r8,r5		/* ensure we get enough */
> -	lwz	r9,DCACHEL1LOGLINESIZE(r10)/* Get log-2 of dcache line size */
> -	srw.	r8,r8,r9		/* compute line count */
> -	beqlr				/* nothing to do? */
> -	sync
> -	isync
> -	mtctr	r8
> -0:	dcbf	0,r6
> -	add	r6,r6,r7
> -	bdnz	0b
> -	sync
> -	isync
> -	blr
> -
> -
> -/*
> - * Flush a particular page from the data cache to RAM.
> - * Note: this is necessary because the instruction cache does *not*
> - * snoop from the data cache.
> - *
> - *	void __flush_dcache_icache(void *page)
> - */
> -_GLOBAL(__flush_dcache_icache)
> -/*
> - * Flush the data cache to memory 
> - * 
> - * Different systems have different cache line sizes
> - */
> -
> -/* Flush the dcache */
> - 	ld	r7,PPC64_CACHES@toc(r2)
> -	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
> -	lwz	r4,DCACHEL1LINESPERPAGE(r7)	/* Get # dcache lines per page */
> -	lwz	r5,DCACHEL1LINESIZE(r7)		/* Get dcache line size */
> -	mr	r6,r3
> -	mtctr	r4
> -0:	dcbst	0,r6
> -	add	r6,r6,r5
> -	bdnz	0b
> -	sync
> -
> -/* Now invalidate the icache */	
> -
> -	lwz	r4,ICACHEL1LINESPERPAGE(r7)	/* Get # icache lines per page */
> -	lwz	r5,ICACHEL1LINESIZE(r7)		/* Get icache line size */
> -	mtctr	r4
> -1:	icbi	0,r3
> -	add	r3,r3,r5
> -	bdnz	1b
> -	isync
> -	blr
> -
> -
>  #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
>  /*
>   * Do an IO access in real mode
> diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
> index acba8ce..ccdceb7 100644
> --- a/arch/powerpc/kernel/ppc_ksyms.c
> +++ b/arch/powerpc/kernel/ppc_ksyms.c
> @@ -53,7 +53,6 @@ extern void program_check_exception(struct pt_regs *regs);
>  extern void single_step_exception(struct pt_regs *regs);
>  extern int sys_sigreturn(struct pt_regs *regs);
>  
> -EXPORT_SYMBOL(clear_pages);
>  EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
>  EXPORT_SYMBOL(DMA_MODE_READ);
>  EXPORT_SYMBOL(DMA_MODE_WRITE);
> @@ -113,8 +112,6 @@ EXPORT_SYMBOL(giveup_spe);
>  #ifndef CONFIG_PPC64
>  EXPORT_SYMBOL(flush_instruction_cache);
>  #endif
> -EXPORT_SYMBOL(__flush_icache_range);
> -EXPORT_SYMBOL(flush_dcache_range);
>  
>  #ifdef CONFIG_SMP
>  #ifdef CONFIG_PPC32
> diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
> index 77bb77d..3abfea4 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -83,6 +83,54 @@ unsigned long klimit = (unsigned long) _end;
>  char cmd_line[COMMAND_LINE_SIZE];
>  
>  /*
> + * Initialize these values to minimum safe defaults in case they need to be
> + * used early during the boot process.  While this may not seem safe, it is
> + * actually safe in practice, because all of the kernel loops that use this
> + * data operate on whole pages.
> + *
> + * The PowerPC Book III-E spec documents that the pagesize is an even
> + * multiple of the cache block size and the cache blocks are always
> + * page-aligned.
> + *
> + * So, for example, when clearing a whole page there are only two things that
> + * can be done wrong with "dcbz":
> + *
> + *   (1) Call "dcbz" with an address outside the page you want to zero.
> + *
> + *   (2) Call "dcbz" too few times to actually hit all of the cachelines,
> + *       i.e. use a too-large cacheline stride.
> + *
> + * So as long as we ensure that this number is small enough for the current
> + * CPU everything will operate correctly, albeit with a slight performance
> + * hit, until we get a chance to parse the device-tree for the right value.
> + *
> + * NOTE: Userspace expects an exact value, so none of the above applies after
> + * the device tree has been unflattened and actual values computed.
> + *
> + * See arch/powerpc/include/asm/cache.h for more information.
> + */
> +struct powerpc_caches powerpc_caches = {
> +	/* Data cache sizes */
> +	.dcache_total_bytes  = 0, /* Unknown */
> +	.dcache_block_bytes = L1_CACHE_BYTES_MIN,
> +	.dcache_block_shift = L1_CACHE_SHIFT_MIN,
> +	.dcache_blocks_per_page = (PAGE_SIZE >> L1_CACHE_SHIFT_MIN),
> +
> +	/* Instruction cache sizes */
> +	.icache_total_bytes = 0,
> +	.icache_block_bytes = L1_CACHE_BYTES_MIN,
> +	.icache_block_shift = L1_CACHE_SHIFT_MIN,
> +	.icache_blocks_per_page = (PAGE_SIZE >> L1_CACHE_SHIFT_MIN),
> +
> +	/* Unified cache (assume cache is split by default) */
> +	.ucache_total_bytes = 0,
> +	.ucache_block_bytes = 0,
> +	.ucache_block_shift = 0,
> +	.ucache_blocks_per_page = 0,
> +};
> +EXPORT_SYMBOL_GPL(powerpc_caches);
> +
> +/*
>   * This still seems to be needed... -- paulus
>   */ 
>  struct screen_info screen_info = {
> @@ -349,6 +397,61 @@ const struct seq_operations cpuinfo_op = {
>  	.show =	show_cpuinfo,
>  };
>  
> +/* Helper functions to compute various values from a cache block size */
> +static void __init set_dcache_block_data(u32 bytes)
> +{
> +	u32 shift = __ilog2(bytes);
> +	powerpc_caches.dcache_block_bytes = bytes;
> +	powerpc_caches.dcache_block_shift = shift;
> +	powerpc_caches.dcache_blocks_per_page = (PAGE_SIZE >> shift);
> +}
> +static void __init set_icache_block_data(u32 bytes)
> +{
> +	u32 shift = __ilog2(bytes);
> +	powerpc_caches.icache_block_bytes = bytes;
> +	powerpc_caches.icache_block_shift = shift;
> +	powerpc_caches.icache_blocks_per_page = (PAGE_SIZE >> shift);
> +}
> +
> +/*
> + * Preinitialize the powerpc_caches structure from the cputable.  We will
> + * later scan the device-tree for this information, which may be more
> + * accurate.
> + */
> +void __init initialize_early_cache_info(void)
> +{
> +	set_dcache_block_data(cur_cpu_spec->dcache_bsize);
> +	set_icache_block_data(cur_cpu_spec->icache_bsize);
> +}
> +
> +/*
> + * Initialize the powerpc_caches structure from the device-tree for use by
> + * copy_page(), cache flush routines, and AT_DCACHEBSIZE elf headers.
> + *
> + * In the unlikely event that the device-tree doesn't have this information,
> + * the defaults loaded by initialize_early_cache_info() from the cputable
> + * will be used.
> + */
> +void __init initialize_cache_info(void)
> +{
> +	/* Assume that the cache properties are the same across all nodes */
> +	struct device_node *np = of_find_node_by_type(NULL, "cpu");
> +	u32 value = 0;
> +
> +	/* First check data/instruction cache block sizes */
> +	if (	!of_property_read_u32(np, "d-cache-block-size", &value) ||
> +		!of_property_read_u32(np, "d-cache-line-size", &value))
> +		set_dcache_block_data(value);
> +
> +	if (	!of_property_read_u32(np, "i-cache-block-size", &value) ||
> +		!of_property_read_u32(np, "i-cache-line-size", &value))
> +		set_icache_block_data(value);
> +
> +	/* Also read total cache sizes (no defaults here) */
> +	of_property_read_u32(np, "d-cache-size", &powerpc_caches.dcache_total_bytes);
> +	of_property_read_u32(np, "i-cache-size", &powerpc_caches.icache_total_bytes);
> +}
> +
>  void __init check_for_initrd(void)
>  {
>  #ifdef CONFIG_BLK_DEV_INITRD
> diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
> index 4c67ad7..1ae16ec 100644
> --- a/arch/powerpc/kernel/setup.h
> +++ b/arch/powerpc/kernel/setup.h
> @@ -1,6 +1,7 @@
>  #ifndef _POWERPC_KERNEL_SETUP_H
>  #define _POWERPC_KERNEL_SETUP_H
>  
> +void initialize_cache_info(void);
>  void check_for_initrd(void);
>  void do_init_bootmem(void);
>  void setup_panic(void);
> diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
> index c1ce863..1db2bfb 100644
> --- a/arch/powerpc/kernel/setup_32.c
> +++ b/arch/powerpc/kernel/setup_32.c
> @@ -63,14 +63,6 @@ EXPORT_SYMBOL(vgacon_remap_base);
>  #endif
>  
>  /*
> - * These are used in binfmt_elf.c to put aux entries on the stack
> - * for each elf executable being started.
> - */
> -int dcache_bsize;
> -int icache_bsize;
> -int ucache_bsize;
> -
> -/*
>   * We're called here very early in the boot.  We determine the machine
>   * type and call the appropriate low-level setup functions.
>   *  -- Cort <cort@fsmlabs.com>
> @@ -286,10 +278,13 @@ void __init setup_arch(char **cmdline_p)
>  {
>  	*cmdline_p = cmd_line;
>  
> +	initialize_early_cache_info();
> +
>  	/* so udelay does something sensible, assume <= 1000 bogomips */
>  	loops_per_jiffy = 500000000 / HZ;
>  
>  	unflatten_device_tree();
> +	initialize_cache_info();
>  	check_for_initrd();
>  
>  	if (ppc_md.init_early)
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 1a9dea8..bb686de 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -77,25 +77,6 @@ int boot_cpuid = 0;
>  int __initdata spinning_secondaries;
>  u64 ppc64_pft_size;
>  
> -/* Pick defaults since we might want to patch instructions
> - * before we've read this from the device tree.
> - */
> -struct ppc64_caches ppc64_caches = {
> -	.dline_size = 0x40,
> -	.log_dline_size = 6,
> -	.iline_size = 0x40,
> -	.log_iline_size = 6
> -};
> -EXPORT_SYMBOL_GPL(ppc64_caches);
> -
> -/*
> - * These are used in binfmt_elf.c to put aux entries on the stack
> - * for each elf executable being started.
> - */
> -int dcache_bsize;
> -int icache_bsize;
> -int ucache_bsize;
> -
>  #ifdef CONFIG_SMP
>  
>  static char *smt_enabled_cmdline;
> @@ -265,82 +246,6 @@ void smp_release_cpus(void)
>  #endif /* CONFIG_SMP || CONFIG_KEXEC */
>  
>  /*
> - * Initialize some remaining members of the ppc64_caches and systemcfg
> - * structures
> - * (at least until we get rid of them completely). This is mostly some
> - * cache informations about the CPU that will be used by cache flush
> - * routines and/or provided to userland
> - */
> -static void __init initialize_cache_info(void)
> -{
> -	struct device_node *np;
> -	unsigned long num_cpus = 0;
> -
> -	DBG(" -> initialize_cache_info()\n");
> -
> -	for_each_node_by_type(np, "cpu") {
> -		num_cpus += 1;
> -
> -		/*
> -		 * We're assuming *all* of the CPUs have the same
> -		 * d-cache and i-cache sizes... -Peter
> -		 */
> -		if (num_cpus == 1) {
> -			const u32 *sizep, *lsizep;
> -			u32 size, lsize;
> -
> -			size = 0;
> -			lsize = cur_cpu_spec->dcache_bsize;
> -			sizep = of_get_property(np, "d-cache-size", NULL);
> -			if (sizep != NULL)
> -				size = *sizep;
> -			lsizep = of_get_property(np, "d-cache-block-size",
> -						 NULL);
> -			/* fallback if block size missing */
> -			if (lsizep == NULL)
> -				lsizep = of_get_property(np,
> -							 "d-cache-line-size",
> -							 NULL);
> -			if (lsizep != NULL)
> -				lsize = *lsizep;
> -			if (sizep == 0 || lsizep == 0)
> -				DBG("Argh, can't find dcache properties ! "
> -				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
> -
> -			ppc64_caches.dsize = size;
> -			ppc64_caches.dline_size = lsize;
> -			ppc64_caches.log_dline_size = __ilog2(lsize);
> -			ppc64_caches.dlines_per_page = PAGE_SIZE / lsize;
> -
> -			size = 0;
> -			lsize = cur_cpu_spec->icache_bsize;
> -			sizep = of_get_property(np, "i-cache-size", NULL);
> -			if (sizep != NULL)
> -				size = *sizep;
> -			lsizep = of_get_property(np, "i-cache-block-size",
> -						 NULL);
> -			if (lsizep == NULL)
> -				lsizep = of_get_property(np,
> -							 "i-cache-line-size",
> -							 NULL);
> -			if (lsizep != NULL)
> -				lsize = *lsizep;
> -			if (sizep == 0 || lsizep == 0)
> -				DBG("Argh, can't find icache properties ! "
> -				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
> -
> -			ppc64_caches.isize = size;
> -			ppc64_caches.iline_size = lsize;
> -			ppc64_caches.log_iline_size = __ilog2(lsize);
> -			ppc64_caches.ilines_per_page = PAGE_SIZE / lsize;
> -		}
> -	}
> -
> -	DBG(" <- initialize_cache_info()\n");
> -}
> -
> -
> -/*
>   * Do some initial setup of the system.  The parameters are those which 
>   * were passed in from the bootloader.
>   */
> @@ -365,10 +270,7 @@ void __init setup_system(void)
>  	 */
>  	unflatten_device_tree();
>  
> -	/*
> -	 * Fill the ppc64_caches & systemcfg structures with informations
> - 	 * retrieved from the device-tree.
> -	 */
> +	/* Fill the powerpc_caches structure with device-tree data */
>  	initialize_cache_info();
>  
>  #ifdef CONFIG_PPC_RTAS
> @@ -423,12 +325,10 @@ void __init setup_system(void)
>  	printk("-----------------------------------------------------\n");
>  	printk("ppc64_pft_size                = 0x%llx\n", ppc64_pft_size);
>  	printk("physicalMemorySize            = 0x%llx\n", memblock_phys_mem_size());
> -	if (ppc64_caches.dline_size != 0x80)
> -		printk("ppc64_caches.dcache_line_size = 0x%x\n",
> -		       ppc64_caches.dline_size);
> -	if (ppc64_caches.iline_size != 0x80)
> -		printk("ppc64_caches.icache_line_size = 0x%x\n",
> -		       ppc64_caches.iline_size);
> +	if (powerpc_caches.dcache_block_bytes != 0x80)
> +		printk("dcache_block_bytes = 0x%x\n", powerpc_caches.dcache_block_bytes);
> +	if (powerpc_caches.icache_block_bytes != 0x80)
> +		printk("icache_block_bytes = 0x%x\n", powerpc_caches.icache_block_bytes);
>  #ifdef CONFIG_PPC_STD_MMU_64
>  	if (htab_address)
>  		printk("htab_address                  = 0x%p\n", htab_address);
> @@ -545,13 +445,7 @@ void __init setup_arch(char **cmdline_p)
>  
>  	*cmdline_p = cmd_line;
>  
> -	/*
> -	 * Set cache line size based on type of cpu as a default.
> -	 * Systems with OF can look in the properties on the cpu node(s)
> -	 * for a possibly more accurate value.
> -	 */
> -	dcache_bsize = ppc64_caches.dline_size;
> -	icache_bsize = ppc64_caches.iline_size;
> +	initialize_early_cache_info();
>  
>  	/* reboot on panic */
>  	panic_timeout = 180;
> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
> index 7d14bb6..4a038fb 100644
> --- a/arch/powerpc/kernel/vdso.c
> +++ b/arch/powerpc/kernel/vdso.c
> @@ -726,6 +726,7 @@ static int __init vdso_init(void)
>  	vdso_data->version.major = SYSTEMCFG_MAJOR;
>  	vdso_data->version.minor = SYSTEMCFG_MINOR;
>  	vdso_data->processor = mfspr(SPRN_PVR);
> +
>  	/*
>  	 * Fake the old platform number for pSeries and iSeries and add
>  	 * in LPAR bit if necessary
> @@ -734,29 +735,25 @@ static int __init vdso_init(void)
>  	if (firmware_has_feature(FW_FEATURE_LPAR))
>  		vdso_data->platform |= 1;
>  	vdso_data->physicalMemorySize = memblock_phys_mem_size();
> -	vdso_data->dcache_size = ppc64_caches.dsize;
> -	vdso_data->dcache_line_size = ppc64_caches.dline_size;
> -	vdso_data->icache_size = ppc64_caches.isize;
> -	vdso_data->icache_line_size = ppc64_caches.iline_size;
>  
> -	/* XXXOJN: Blocks should be added to ppc64_caches and used instead */
> -	vdso_data->dcache_block_size = ppc64_caches.dline_size;
> -	vdso_data->icache_block_size = ppc64_caches.iline_size;
> -	vdso_data->dcache_log_block_size = ppc64_caches.log_dline_size;
> -	vdso_data->icache_log_block_size = ppc64_caches.log_iline_size;
> +	/* There are more cache parameters saved for 64-bit than 32-bit */
> +	vdso_data->dcache_size           = powerpc_caches.dcache_total_size;
> +	vdso_data->icache_size           = powerpc_caches.icache_total_size;
> +	vdso_data->dcache_line_size      = powerpc_caches.dcache_block_bytes;
> +	vdso_data->icache_line_size      = powerpc_caches.icache_block_bytes;
>  
>  	/*
>  	 * Calculate the size of the 64 bits vDSO
>  	 */
>  	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
>  	DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
> -#else
> -	vdso_data->dcache_block_size = L1_CACHE_BYTES;
> -	vdso_data->dcache_log_block_size = L1_CACHE_SHIFT;
> -	vdso_data->icache_block_size = L1_CACHE_BYTES;
> -	vdso_data->icache_log_block_size = L1_CACHE_SHIFT;
> -#endif /* CONFIG_PPC64 */
> +#endif
>  
> +	/* Save the cache-block sizes for the VDSO */
> +	vdso_data->dcache_block_size     = powerpc_caches.dcache_block_bytes;
> +	vdso_data->icache_block_size     = powerpc_caches.icache_block_bytes;
> +	vdso_data->dcache_log_block_size = powerpc_caches.dcache_block_shift;
> +	vdso_data->icache_log_block_size = powerpc_caches.icache_block_shift;
>  
>  	/*
>  	 * Calculate the size of the 32 bits vDSO
> diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
> index 53dcb6b..c466977 100644
> --- a/arch/powerpc/lib/copypage_64.S
> +++ b/arch/powerpc/lib/copypage_64.S
> @@ -12,17 +12,17 @@
>  #include <asm/asm-offsets.h>
>  
>          .section        ".toc","aw"
> -PPC64_CACHES:
> -        .tc             ppc64_caches[TC],ppc64_caches
> +POWERPC_CACHES:
> +        .tc             powerpc_caches[TC],powerpc_caches
>          .section        ".text"
>  
>  _GLOBAL(copy_page)
>  	lis	r5,PAGE_SIZE@h
>  	ori	r5,r5,PAGE_SIZE@l
>  BEGIN_FTR_SECTION
> -	ld      r10,PPC64_CACHES@toc(r2)
> -	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
> -	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
> +	ld      r10,POWERPC_CACHES@toc(r2)
> +	lwz	r11,DCACHE_BLOCK_SHIFT(r10)	/* log2 of cache line size */
> +	lwz     r12,DCACHE_BLOCK_BYTES(r10)	/* get cache line size */
>  	li	r9,0
>  	srd	r8,r5,r11
>  
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index 991ee81..8ad36a9 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -6,7 +6,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
>  
>  ccflags-$(CONFIG_PPC64)	:= -mno-minimal-toc
>  
> -obj-y				:= fault.o mem.o pgtable.o gup.o \
> +obj-y				:= cache.o fault.o mem.o pgtable.o gup.o \
>  				   init_$(CONFIG_WORD_SIZE).o \
>  				   pgtable_$(CONFIG_WORD_SIZE).o
>  obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
> diff --git a/arch/powerpc/mm/cache.c b/arch/powerpc/mm/cache.c
> new file mode 100644
> index 0000000..0fbf2d6
> --- /dev/null
> +++ b/arch/powerpc/mm/cache.c
> @@ -0,0 +1,279 @@
> +#include <linux/kprobes.h>
> +#include <linux/export.h>
> +#include <linux/types.h>
> +
> +#include <asm/cputable.h>
> +#include <asm/system.h>
> +#include <asm/cache.h>
> +#include <asm/page.h>
> +#include <asm/mmu.h>
> +
> +/*
> + * Write any modified data cache blocks out to memory.
> + * Does not invalidate the corresponding cache lines (especially for
> + * any corresponding instruction cache).
> + */
> +void clean_dcache_range(unsigned long start, unsigned long stop)
> +{
> +	unsigned long addr;
> +	FOR_EACH_CACHELINE(addr, start, stop, dcache)
> +		dcbst(addr);
> +	mb();
> +}
> +
> +/*
> + * Write any modified data cache blocks out to memory and invalidate them.
> + * Does not invalidate the corresponding instruction cache blocks.
> + */
> +void flush_dcache_range(unsigned long start, unsigned long stop)
> +{
> +	unsigned long addr;
> +	FOR_EACH_CACHELINE(addr, start, stop, dcache)
> +		dcbf(addr);
> +	mb();
> +}
> +EXPORT_SYMBOL(flush_dcache_range);
> +
> +/*
> + * Like above, but invalidate the D-cache.  This is used by the 8xx
> + * to invalidate the cache so the PPC core doesn't get stale data
> + * from the CPM (no cache snooping here :-).
> + *
> + * invalidate_dcache_range(unsigned long start, unsigned long stop)
> + */
> +void invalidate_dcache_range(unsigned long start, unsigned long stop)
> +{
> +	unsigned long addr;
> +	FOR_EACH_CACHELINE(addr, start, stop, dcache)
> +		dcbi(addr);
> +	mb();
> +}
> +
> +/*
> + * Write any modified data cache blocks out to memory and invalidate the
> + * corresponding instruction cache blocks.
> + *
> + * On 44x we cannot flush individual chunks of the icache, because we are
> + * passed kmapped addresses and the icache is virtually tagged.  The only
> + * workaround is to invalidate the whole icache.  The CPU does not use the
> + * operands of the "iccci" instruction, so they are passed as dummies.
> + */
> +__kprobes void __flush_icache_range(unsigned long start, unsigned long stop)
> +{
> +	unsigned long addr;
> +
> +	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> +		return;
> +
> +	/* First ensure that data has been written to memory */
> +	FOR_EACH_CACHELINE(addr, start, stop, dcache)
> +		dcbst(addr);
> +	mb();
> +
> +#ifdef CONFIG_44x
> +	if (mmu_has_feature(MMU_FTR_TYPE_44x)) {
> +		asm volatile("iccci 0, r0" ::: "memory");
> +		return;
> +	}
> +#endif
> +
> +	/* Now discard the corresponding icache */
> +	FOR_EACH_CACHELINE(addr, start, stop, icache)
> +		icbi(addr);
> +	mb();
> +	isync();
> +}
> +EXPORT_SYMBOL(__flush_icache_range);
> +
> +/*
> + * Flush a particular page from the data cache to RAM.
> + * Note: this is necessary because the instruction cache does *not*
> + * snoop from the data cache.
> + * This is a no-op on the 601 which has a unified cache.
> + *
> + *	void __flush_dcache_icache(void *page)
> + */
> +void __flush_dcache_icache(void *page)
> +{
> +	unsigned long base = ((unsigned long)page) & ~(PAGE_SIZE-1);
> +	unsigned long addr;
> +
> +	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> +		return;
> +
> +	/* First ensure that data has been written to memory */
> +	FOR_EACH_CACHELINE(addr, base, base + PAGE_SIZE, dcache)
> +		dcbst(addr);
> +
> +#ifdef CONFIG_44x
> +	/*
> +	 * We don't flush the icache on 44x. Those have a virtual icache and
> +	 * we don't have access to the virtual address here (it's not the
> +	 * page vaddr but where it's mapped in user space). The flushing of
> +	 * the icache on these is handled elsewhere, when a change in the
> +	 * address space occurs, before returning to user space.
> +	 */
> +	if (mmu_has_feature(MMU_FTR_TYPE_44x))
> +		return;
> +#endif
> +
> +	FOR_EACH_CACHELINE(addr, base, base + PAGE_SIZE, icache)
> +		icbi(addr);
> +
> +	mb();
> +	isync();
> +}
> +
> +/*
> + * Clear pages using the dcbz instruction, which doesn't cause any
> + * memory traffic (except to write out any cache lines which get
> + * displaced).  This only works on cacheable memory.
> + *
> + */
> +void clear_pages(void *page, int order)
> +{
> +	unsigned long addr, base = (unsigned long)page;
> +	FOR_EACH_CACHELINE(addr, base, base + (PAGE_SIZE << order), dcache)
> +		dcbz(addr);
> +}
> +EXPORT_SYMBOL(clear_pages);
> +
> +#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> +/*
> + * Flush a particular page from the data cache to RAM, identified
> + * by its physical address.  We turn off the MMU so we can just use
> + * the physical address (this may be a highmem page without a kernel
> + * mapping).
> + */
> +void __flush_dcache_icache_phys(unsigned long phys_page)
> +{
> +	u32 d_size	= powerpc_caches.dcache_block_bytes;
> +	u32 i_size	= powerpc_caches.icache_block_bytes;
> +	u32 d_per_page	= powerpc_caches.dcache_blocks_per_page;
> +	u32 i_per_page	= powerpc_caches.icache_blocks_per_page;
> +
> +	/* Temporary registers for the ASM to use */
> +	unsigned long old_msr, tmp_msr, d_phys_page, i_phys_page;
> +
> +	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> +		return;
> +
> +	/* Page base address (used in 2 different loops) */
> +	d_phys_page = i_phys_page = phys_page & ~(PAGE_SIZE - 1);
> +
> +	/*
> +	 * This part needs to be 100% ASM because we disable the MMU, and we
> +	 * can't accidentally let some C code go poking at memory while the
> +	 * MMU isn't enabled.
> +	 *
> +	 * NOTE: This looks blatantly unsafe with respect to interrupts.
> +	 *       Hopefully all the callers provide sufficient protection?
> +	 */
> +	asm volatile(
> +		/* First disable the MMU */
> +		"mfmsr %[old_msr]\n\t"
> +		"rlwinm %[tmp_msr], %[old_msr], 0, 28, 26\n\t"
> +		"mtmsr %[tmp_msr]\n\t"
> +		"isync\n\t"
> +
> +		/* Clean the data cache */
> +		"mtctr %[d_per_page]\n"
> +	"0:	dcbst 0, %[d_phys_page]\n\t"
> +		"add %[d_phys_page], %[d_phys_page], %[d_size]\n\t"
> +		"bdnz 0b\n\t"
> +		"sync\n\t"
> +
> +		/* Invalidate the instruction cache */
> +		"mtctr %[i_per_page]\n"
> +	"0:	icbi 0, %[i_phys_page]\n\t"
> +		"add %[i_phys_page], %[i_phys_page], %[i_size]\n\t"
> +		"bdnz 0b\n\t"
> +
> +		/* Finally, re-enable the MMU */
> +		"sync\n\t"
> +		"mtmsr %[old_msr]\n\t"
> +		"isync\n\t"
> +
> +		/* Outputs: temporaries and in/out operands */
> +		: [old_msr]    "=&r" (old_msr),
> +		  [tmp_msr]    "=&r" (tmp_msr),
> +		  [d_phys_page] "=b" (d_phys_page),
> +		  [i_phys_page] "=b" (i_phys_page)
> +
> +		/* Inputs */
> +		: [d_size]     "b" (d_size),
> +		  [i_size]     "b" (i_size),
> +		  [d_per_page] "b" (d_per_page),
> +		  [i_per_page] "b" (i_per_page),
> +		  "[d_phys_page]"  (d_phys_page),
> +		  "[i_phys_page]"  (i_phys_page)
> +
> +		/* Clobbers */
> +		: "memory", "c"
> +	);
> +}
> +#endif /* CONFIG_PPC32 && !CONFIG_BOOKE */
> +
> +#ifdef CONFIG_PPC64
> +/*
> + * Data cache flush that works on non-mapped physical addresses.
> + * Use only for non-LPAR setups ! It also assumes real mode
> + * is cacheable. Used for flushing out the DART before using
> + * it as uncacheable memory 
> + */
> +void flush_dcache_phys_range(unsigned long start, unsigned long stop)
> +{
> +	/* System data cache block size */
> +	unsigned long bytes = powerpc_caches.dcache_block_bytes;
> +	unsigned long shift = powerpc_caches.dcache_block_shift;
> +
> +	/* Temporary registers for the ASM to use */
> +	unsigned long old_msr, tmp_msr;
> +
> +	/* Compute a start address and number of cachelines */
> +	unsigned long phys_addr = start & ~(bytes - 1);
> +	unsigned long nr_lines = ((stop - phys_addr) + (bytes - 1)) >> shift;
> +
> +	/*
> +	 * This part needs to be 100% ASM because we disable the MMU, and we
> +	 * can't accidentally let some C code go poking at memory while the
> +	 * MMU isn't enabled.
> +	 *
> +	 * NOTE: This looks blatantly unsafe with respect to interrupts.
> +	 *       Hopefully all the callers provide sufficient protection?
> +	 */
> +	asm volatile(
> +		/* First disable the MMU */
> +		"mfmsr %[old_msr]\n\t"
> +		"rlwinm %[tmp_msr], %[old_msr], 0, 28, 26\n\t"
> +		"mtmsr %[tmp_msr]\n\t"
> +		"isync\n\t"
> +
> +		/* Clean the data cache */
> +		"mtctr %[nr_lines]\n"
> +	"0:	dcbst 0, %[phys_addr]\n\t"
> +		"add %[phys_addr], %[phys_addr], %[bytes]\n\t"
> +		"bdnz 0b\n\t"
> +		"sync\n\t"
> +		"isync\n\t"
> +
> +		/* Finally, re-enable the MMU */
> +		"mtmsr %[old_msr]\n\t"
> +		"sync\n\t"
> +		"isync\n\t"
> +
> +		/* Outputs: temporaries and in/out operands */
> +		: [old_msr]  "=&r" (old_msr),
> +		  [tmp_msr]  "=&r" (tmp_msr),
> +		  [phys_addr] "=b" (phys_addr)
> +
> +		/* Inputs */
> +		: [bytes]    "b" (bytes),
> +		  [nr_lines] "b" (nr_lines),
> +		  "[phys_addr]"  (phys_addr)
> +
> +		/* Clobbers */
> +		: "memory", "c"
> +	);
> +}
> +#endif /* CONFIG_PPC64 */
> diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
> index 329be36..3823f64 100644
> --- a/arch/powerpc/mm/dma-noncoherent.c
> +++ b/arch/powerpc/mm/dma-noncoherent.c
> @@ -328,7 +328,7 @@ void __dma_sync(void *vaddr, size_t size, int direction)
>  		 * invalidate only when cache-line aligned otherwise there is
>  		 * the potential for discarding uncommitted data from the cache
>  		 */
> -		if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
> +		if ((start | size) & (powerpc_caches.dcache_block_bytes - 1))
>  			flush_dcache_range(start, end);
>  		else
>  			invalidate_dcache_range(start, end);
> diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S
> index 08ab6fe..ac285d9 100644
> --- a/arch/powerpc/platforms/52xx/lite5200_sleep.S
> +++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S
> @@ -394,11 +394,16 @@ restore_regs:
>  
> 
>  /* cache flushing code. copied from arch/ppc/boot/util.S */
> -#define NUM_CACHE_LINES (128*8)
> +#define NUM_CACHE_LINES ((128 * 8) << (L1_CACHE_SHIFT_MAX - L1_CACHE_SHIFT_MIN))
>  
>  /*
>   * Flush data cache
>   * Do this by just reading lots of stuff into the cache.
> + *
> + * NOTE: This does not handle variable-sized cachelines properly, but since
> + *       we are just trying to flush the data cache by reading lots of data,
> + *       this works anyway.  We just make sure we read as many cachelines
> + *       as we could possibly need to overflow the cache on any hardware.
>   */
>  flush_data_cache:
>  	lis	r3,CONFIG_KERNEL_START@h
> @@ -407,6 +412,6 @@ flush_data_cache:
>  	mtctr	r4
>  1:
>  	lwz	r4,0(r3)
> -	addi	r3,r3,L1_CACHE_BYTES	/* Next line, please */
> +	addi	r3,r3,L1_CACHE_BYTES_MIN /* Next line, please */
>  	bdnz	1b
>  	blr
> diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
> index 31a7d3a..8503e38 100644
> --- a/arch/powerpc/platforms/powermac/pci.c
> +++ b/arch/powerpc/platforms/powermac/pci.c
> @@ -1135,7 +1135,7 @@ int pmac_pci_enable_device_hook(struct pci_dev *dev)
>  		pci_write_config_byte(dev, PCI_LATENCY_TIMER, 16);
>  
>  		pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE,
> -				      L1_CACHE_BYTES >> 2);
> +				powerpc_caches.dcache_block_bytes >> 2);
>  	}
>  
>  	return 0;
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 03a217a..c537d49 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -26,6 +26,7 @@
>  
>  #include <asm/ptrace.h>
>  #include <asm/string.h>
> +#include <asm/cache.h>
>  #include <asm/prom.h>
>  #include <asm/machdep.h>
>  #include <asm/xmon.h>
> @@ -254,16 +255,6 @@ static inline void store_inst(void *p)
>  	asm volatile ("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r" (p));
>  }
>  
> -static inline void cflush(void *p)
> -{
> -	asm volatile ("dcbf 0,%0; icbi 0,%0" : : "r" (p));
> -}
> -
> -static inline void cinval(void *p)
> -{
> -	asm volatile ("dcbi 0,%0; icbi 0,%0" : : "r" (p));
> -}
> -
>  /*
>   * Disable surveillance (the service processor watchdog function)
>   * while we are in xmon.
> @@ -1513,10 +1504,9 @@ static void prregs(struct pt_regs *fp)
>  
>  static void cacheflush(void)
>  {
> -	int cmd;
> -	unsigned long nflush;
> +	unsigned long nflush, i;
>  
> -	cmd = inchar();
> +	int cmd = inchar();
>  	if (cmd != 'i')
>  		termch = cmd;
>  	scanhex((void *)&adrs);
> @@ -1524,23 +1514,30 @@ static void cacheflush(void)
>  		termch = 0;
>  	nflush = 1;
>  	scanhex(&nflush);
> -	nflush = (nflush + L1_CACHE_BYTES - 1) / L1_CACHE_BYTES;
> -	if (setjmp(bus_error_jmp) == 0) {
> -		catch_memory_errors = 1;
> -		sync();
>  
> -		if (cmd != 'i') {
> -			for (; nflush > 0; --nflush, adrs += L1_CACHE_BYTES)
> -				cflush((void *) adrs);
> -		} else {
> -			for (; nflush > 0; --nflush, adrs += L1_CACHE_BYTES)
> -				cinval((void *) adrs);
> -		}
> -		sync();
> -		/* wait a little while to see if we get a machine check */
> -		__delay(200);
> +	if (setjmp(bus_error_jmp) != 0) {
> +		catch_memory_errors = 0;
> +		return;
>  	}
> -	catch_memory_errors = 0;
> +	catch_memory_errors = 1;
> +	sync();
> +
> +	/* First flush/invalidate data caches */
> +	if (cmd != 'i') {
> +		FOR_EACH_CACHELINE(i, adrs, adrs + nflush, dcache)
> +			dcbf(i);
> +	} else {
> +		FOR_EACH_CACHELINE(i, adrs, adrs + nflush, dcache)
> +			dcbi(i);
> +	}
> +
> +	/* Now invalidate instruction caches */
> +	FOR_EACH_CACHELINE(i, adrs, adrs + nflush, icache)
> +		icbi(i);
> +
> +	sync();
> +	/* wait a little while to see if we get a machine check */
> +	__delay(200);
>  }
>  
>  static unsigned long
> diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
> index 116a49c..04ead15 100644
> --- a/drivers/macintosh/smu.c
> +++ b/drivers/macintosh/smu.c
> @@ -136,7 +136,9 @@ static void smu_start_cmd(void)
>  	/* Flush command and data to RAM */
>  	faddr = (unsigned long)smu->cmd_buf;
>  	fend = faddr + smu->cmd_buf->length + 2;
> -	flush_inval_dcache_range(faddr, fend);
> +	flush_dcache_range(faddr, fend);
> +	mb();
> +	isync();
>  
> 
>  	/* We also disable NAP mode for the duration of the command
> @@ -198,7 +200,9 @@ static irqreturn_t smu_db_intr(int irq, void *arg)
>  		 * reply length (it's only 2 cache lines anyway)
>  		 */
>  		faddr = (unsigned long)smu->cmd_buf;
> -		flush_inval_dcache_range(faddr, faddr + 256);
> +		flush_dcache_range(faddr, faddr + 256);
> +		mb();
> +		isync();
>  
>  		/* Now check ack */
>  		ack = (~cmd->cmd) & 0xff;

^ permalink raw reply

* Re: [RFC PATCH 00/17] powerpc/e500: separate e500 from e500mc
From: Benjamin Herrenschmidt @ 2011-11-15 22:41 UTC (permalink / raw)
  To: Moffett, Kyle D
  Cc: Timur Tabi, linux-kernel@vger.kernel.org, Paul Gortmaker,
	Scott Wood, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <4A0007C2-1C2C-4162-98E8-ACCA4E673AFE@boeing.com>

On Mon, 2011-11-14 at 20:36 -0600, Moffett, Kyle D wrote:
> So when you are clearing a whole page, there are only 2 things you can do
> wrong with "dcbz":
> 
>   (1) Call "dcbz" with an address outside of the page you want to zero.
> 
>   (2) Omit calls to "dcbz" for some physical cachelines in the page.
> 
> Now, that's a totally different story from the userspace memset() calls
> that caused the problem originally, because they were frequently given
> memory much smaller than a page to clear, and if you didn't know exactly
> how many bytes a "dcbz" was going to clear you couldn't use it at all.

Right. That's why we pass the cache line sizes to userspace via the elf
AUX table so they don't do stupid things like that :-)

> But the kernel doesn't do that anywhere, it just uses it for page clears. 

Right, so we could easily precalc the count & increment and use a "soft"
loop.

Cheers,
Ben.

^ permalink raw reply

* Re: [RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions
From: Benjamin Herrenschmidt @ 2011-11-15 22:31 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: Mike Frysinger, Ian Campbell, Eric Dumazet, Jiri Pirko, netdev,
	B04825, linux-kernel, Milton Miller, paul.gortmaker,
	Paul Mackerras, Anton Blanchard, Oleg Nesterov, scottwood,
	Andrew Morton, linuxppc-dev, David S. Miller, Jeff Kirsher
In-Reply-To: <1321324332-22964-2-git-send-email-Kyle.D.Moffett@boeing.com>

On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
> These functions are only used from one place each.  If the cacheable_*
> versions really are more efficient, then those changes should be
> migrated into the common code instead.
> 
> NOTE: The old routines are just flat buggy on kernels that support
>       hardware with different cacheline sizes.
> 
> Signed-off-by: Kyle Moffett <Kyle.D.Moffett@boeing.com>
> ---

Right, considering where those are used, I think we can safely remove
them. Thanks.

Ben.

>  arch/powerpc/include/asm/system.h    |    2 -
>  arch/powerpc/kernel/ppc_ksyms.c      |    2 -
>  arch/powerpc/lib/copy_32.S           |  127 ----------------------------------
>  arch/powerpc/mm/ppc_mmu_32.c         |    2 +-
>  drivers/net/ethernet/ibm/emac/core.c |   12 +---
>  5 files changed, 3 insertions(+), 142 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
> index e30a13d..25389d1 100644
> --- a/arch/powerpc/include/asm/system.h
> +++ b/arch/powerpc/include/asm/system.h
> @@ -189,8 +189,6 @@ static inline void flush_spe_to_thread(struct task_struct *t)
>  #endif
>  
>  extern int call_rtas(const char *, int, int, unsigned long *, ...);
> -extern void cacheable_memzero(void *p, unsigned int nb);
> -extern void *cacheable_memcpy(void *, const void *, unsigned int);
>  extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
>  extern void bad_page_fault(struct pt_regs *, unsigned long, int);
>  extern int die(const char *, struct pt_regs *, long);
> diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
> index d3114a7..acba8ce 100644
> --- a/arch/powerpc/kernel/ppc_ksyms.c
> +++ b/arch/powerpc/kernel/ppc_ksyms.c
> @@ -159,8 +159,6 @@ EXPORT_SYMBOL(screen_info);
>  #ifdef CONFIG_PPC32
>  EXPORT_SYMBOL(timer_interrupt);
>  EXPORT_SYMBOL(tb_ticks_per_jiffy);
> -EXPORT_SYMBOL(cacheable_memcpy);
> -EXPORT_SYMBOL(cacheable_memzero);
>  #endif
>  
>  #ifdef CONFIG_PPC32
> diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
> index 55f19f9..6813f80 100644
> --- a/arch/powerpc/lib/copy_32.S
> +++ b/arch/powerpc/lib/copy_32.S
> @@ -69,54 +69,6 @@ CACHELINE_BYTES = L1_CACHE_BYTES
>  LG_CACHELINE_BYTES = L1_CACHE_SHIFT
>  CACHELINE_MASK = (L1_CACHE_BYTES-1)
>  
> -/*
> - * Use dcbz on the complete cache lines in the destination
> - * to set them to zero.  This requires that the destination
> - * area is cacheable.  -- paulus
> - */
> -_GLOBAL(cacheable_memzero)
> -	mr	r5,r4
> -	li	r4,0
> -	addi	r6,r3,-4
> -	cmplwi	0,r5,4
> -	blt	7f
> -	stwu	r4,4(r6)
> -	beqlr
> -	andi.	r0,r6,3
> -	add	r5,r0,r5
> -	subf	r6,r0,r6
> -	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
> -	add	r8,r7,r5
> -	srwi	r9,r8,LG_CACHELINE_BYTES
> -	addic.	r9,r9,-1	/* total number of complete cachelines */
> -	ble	2f
> -	xori	r0,r7,CACHELINE_MASK & ~3
> -	srwi.	r0,r0,2
> -	beq	3f
> -	mtctr	r0
> -4:	stwu	r4,4(r6)
> -	bdnz	4b
> -3:	mtctr	r9
> -	li	r7,4
> -10:	dcbz	r7,r6
> -	addi	r6,r6,CACHELINE_BYTES
> -	bdnz	10b
> -	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
> -	addi	r5,r5,4
> -2:	srwi	r0,r5,2
> -	mtctr	r0
> -	bdz	6f
> -1:	stwu	r4,4(r6)
> -	bdnz	1b
> -6:	andi.	r5,r5,3
> -7:	cmpwi	0,r5,0
> -	beqlr
> -	mtctr	r5
> -	addi	r6,r6,3
> -8:	stbu	r4,1(r6)
> -	bdnz	8b
> -	blr
> -
>  _GLOBAL(memset)
>  	rlwimi	r4,r4,8,16,23
>  	rlwimi	r4,r4,16,0,15
> @@ -142,85 +94,6 @@ _GLOBAL(memset)
>  	bdnz	8b
>  	blr
>  
> -/*
> - * This version uses dcbz on the complete cache lines in the
> - * destination area to reduce memory traffic.  This requires that
> - * the destination area is cacheable.
> - * We only use this version if the source and dest don't overlap.
> - * -- paulus.
> - */
> -_GLOBAL(cacheable_memcpy)
> -	add	r7,r3,r5		/* test if the src & dst overlap */
> -	add	r8,r4,r5
> -	cmplw	0,r4,r7
> -	cmplw	1,r3,r8
> -	crand	0,0,4			/* cr0.lt &= cr1.lt */
> -	blt	memcpy			/* if regions overlap */
> -
> -	addi	r4,r4,-4
> -	addi	r6,r3,-4
> -	neg	r0,r3
> -	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
> -	beq	58f
> -
> -	cmplw	0,r5,r0			/* is this more than total to do? */
> -	blt	63f			/* if not much to do */
> -	andi.	r8,r0,3			/* get it word-aligned first */
> -	subf	r5,r0,r5
> -	mtctr	r8
> -	beq+	61f
> -70:	lbz	r9,4(r4)		/* do some bytes */
> -	stb	r9,4(r6)
> -	addi	r4,r4,1
> -	addi	r6,r6,1
> -	bdnz	70b
> -61:	srwi.	r0,r0,2
> -	mtctr	r0
> -	beq	58f
> -72:	lwzu	r9,4(r4)		/* do some words */
> -	stwu	r9,4(r6)
> -	bdnz	72b
> -
> -58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
> -	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
> -	li	r11,4
> -	mtctr	r0
> -	beq	63f
> -53:
> -	dcbz	r11,r6
> -	COPY_16_BYTES
> -#if L1_CACHE_BYTES >= 32
> -	COPY_16_BYTES
> -#if L1_CACHE_BYTES >= 64
> -	COPY_16_BYTES
> -	COPY_16_BYTES
> -#if L1_CACHE_BYTES >= 128
> -	COPY_16_BYTES
> -	COPY_16_BYTES
> -	COPY_16_BYTES
> -	COPY_16_BYTES
> -#endif
> -#endif
> -#endif
> -	bdnz	53b
> -
> -63:	srwi.	r0,r5,2
> -	mtctr	r0
> -	beq	64f
> -30:	lwzu	r0,4(r4)
> -	stwu	r0,4(r6)
> -	bdnz	30b
> -
> -64:	andi.	r0,r5,3
> -	mtctr	r0
> -	beq+	65f
> -40:	lbz	r0,4(r4)
> -	stb	r0,4(r6)
> -	addi	r4,r4,1
> -	addi	r6,r6,1
> -	bdnz	40b
> -65:	blr
> -
>  _GLOBAL(memmove)
>  	cmplw	0,r3,r4
>  	bgt	backwards_memcpy
> diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
> index 11571e1..9f16b9f 100644
> --- a/arch/powerpc/mm/ppc_mmu_32.c
> +++ b/arch/powerpc/mm/ppc_mmu_32.c
> @@ -224,7 +224,7 @@ void __init MMU_init_hw(void)
>  	 */
>  	if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
>  	Hash = __va(memblock_alloc(Hash_size, Hash_size));
> -	cacheable_memzero(Hash, Hash_size);
> +	memset(Hash, 0, Hash_size);
>  	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
>  
>  	Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
> diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
> index ed79b2d..be214ad 100644
> --- a/drivers/net/ethernet/ibm/emac/core.c
> +++ b/drivers/net/ethernet/ibm/emac/core.c
> @@ -77,13 +77,6 @@ MODULE_AUTHOR
>      ("Eugene Surovegin <eugene.surovegin@zultys.com> or <ebs@ebshome.net>");
>  MODULE_LICENSE("GPL");
>  
> -/*
> - * PPC64 doesn't (yet) have a cacheable_memcpy
> - */
> -#ifdef CONFIG_PPC64
> -#define cacheable_memcpy(d,s,n) memcpy((d),(s),(n))
> -#endif
> -
>  /* minimum number of free TX descriptors required to wake up TX process */
>  #define EMAC_TX_WAKEUP_THRESH		(NUM_TX_BUFF / 4)
>  
> @@ -1637,7 +1630,7 @@ static inline int emac_rx_sg_append(struct emac_instance *dev, int slot)
>  			dev_kfree_skb(dev->rx_sg_skb);
>  			dev->rx_sg_skb = NULL;
>  		} else {
> -			cacheable_memcpy(skb_tail_pointer(dev->rx_sg_skb),
> +			memcpy(skb_tail_pointer(dev->rx_sg_skb),
>  					 dev->rx_skb[slot]->data, len);
>  			skb_put(dev->rx_sg_skb, len);
>  			emac_recycle_rx_skb(dev, slot, len);
> @@ -1694,8 +1687,7 @@ static int emac_poll_rx(void *param, int budget)
>  				goto oom;
>  
>  			skb_reserve(copy_skb, EMAC_RX_SKB_HEADROOM + 2);
> -			cacheable_memcpy(copy_skb->data - 2, skb->data - 2,
> -					 len + 2);
> +			memcpy(copy_skb->data - 2, skb->data - 2, len + 2);
>  			emac_recycle_rx_skb(dev, slot, len);
>  			skb = copy_skb;
>  		} else if (unlikely(emac_alloc_rx_skb(dev, slot, GFP_ATOMIC)))

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Benjamin Herrenschmidt @ 2011-11-15 22:29 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: B04825, linux-kernel, paul.gortmaker, scottwood, linuxppc-dev
In-Reply-To: <1321324332-22964-1-git-send-email-Kyle.D.Moffett@boeing.com>

On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
> Unfortunately, I've been staring at PPC asm for long enough that I
> have a migraine headache and I'm going to have to stop here for now.
> If somebody else wants to tackle fixing up the 32-bit copy_page() and
> __copy_tofrom_user() routines it would be highly appreciated. 

Yeah that's the one everybody's avoiding :-)

What about my idea of instead compiling it multiple times with a
different size and fixing up the branch to call the right one ?

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe
From: Scott Wood @ 2011-11-15 22:14 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev
In-Reply-To: <45E28F47-CC83-4059-959F-E890049F416B@kernel.crashing.org>

On 11/15/2011 03:51 PM, Kumar Gala wrote:
> 
> On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:
> 
> Should be setting ALL PCIe interrupts to '2'?  As I think in general
> we say these PCIe are 'active high'.  The only reason I would think
> we would NOT do this is if they are shared with some external device
> that is 'active low'.  If so we should comment that somewhere (maybe
> in the .dts, maybe just in the commit message).

I'd assume the ones that are pinned out are pulled high on the board.
Active-low is normal, it's these non-pinned-out "external" interrupts
that are pulled low inside the SoC that are weird.

-Scott

^ permalink raw reply

* Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe
From: Kumar Gala @ 2011-11-15 21:51 UTC (permalink / raw)
  To: Roy Zang; +Cc: linuxppc-dev
In-Reply-To: <1320654778-3294-1-git-send-email-tie-fei.zang@freescale.com>


On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:

> P1023 external IRQ[4:6, 11] do not pin out, but the interrupts are
> shared with PCIe controller.
> The silicon internally ties the interrupts to L, so change the
> IRQ[4:6,11] to high level sensitive for PCIe.
> 
> Signed-off-by: Roy Zang <tie-fei.zang@freescale.com>
> ---
> arch/powerpc/boot/dts/p1023rds.dts |    8 ++++----
> 1 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/boot/dts/p1023rds.dts b/arch/powerpc/boot/dts/p1023rds.dts
> index d9b7767..66bf804 100644
> --- a/arch/powerpc/boot/dts/p1023rds.dts
> +++ b/arch/powerpc/boot/dts/p1023rds.dts
> @@ -490,9 +490,9 @@
> 			interrupt-map-mask = <0xf800 0 0 7>;
> 			interrupt-map = <
> 				/* IDSEL 0x0 */
> -				0000 0 0 1 &mpic 4 1
> -				0000 0 0 2 &mpic 5 1
> -				0000 0 0 3 &mpic 6 1
> +				0000 0 0 1 &mpic 4 2
> +				0000 0 0 2 &mpic 5 2
> +				0000 0 0 3 &mpic 6 2
> 				0000 0 0 4 &mpic 7 1
> 				>;
> 			ranges = <0x2000000 0x0 0xa0000000
> @@ -532,7 +532,7 @@
> 				0000 0 0 1 &mpic 8 1
> 				0000 0 0 2 &mpic 9 1
> 				0000 0 0 3 &mpic 10 1
> -				0000 0 0 4 &mpic 11 1
> +				0000 0 0 4 &mpic 11 2
> 				>;
> 			ranges = <0x2000000 0x0 0x80000000
> 				  0x2000000 0x0 0x80000000
> -- 
> 1.6.0.6
> 

Should be setting ALL PCIe interrupts to '2'?  As I think in general we
say these PCIe are 'active high'.  The only reason I would think we
would NOT do this is if they are shared with some external device that
is 'active low'.  If so we should comment that somewhere (maybe in the
.dts, maybe just in the commit message).

- k

^ permalink raw reply

* Re: [PATCH] net: fsl_pq_mdio: fix non tbi phy access
From: Baruch Siach @ 2011-11-15 15:44 UTC (permalink / raw)
  To: Andy Fleming; +Cc: netdev@vger.kernel.org, linuxppc-dev
In-Reply-To: <74631EEB-F6F8-4969-AD05-81DEAFB0EAB4@freescale.com>

Hi Andy,

On Tue, Nov 15, 2011 at 09:06:03AM -0600, Andy Fleming wrote:
> On Nov 14, 2011, at 11:17 PM, Baruch Siach wrote:
> > On Mon, Nov 14, 2011 at 09:04:47PM +0000, Fleming Andy-AFLEMING wrote:

[snip]

> >> And looking at the p1010si.dtsi, I see that it's automatically there for 
> >> you.
> >> 
> >> How were you breaking?
> > 
> > Adding linuxppc to Cc.
> > 
> > My board is P1011 based, the single core version of P1020, not P1010. In 
> > p1020si.dtsi I see no tbi node. In p1020rdb.dts I see a tbi node but only for 
> > mdio@25000, not mdio@24000, which is what I'm using.
> > 
> > Am I missing something?
> 
> Well, that's a bug. In truth, the silicon dtsi trees should not have tbi 
> nodes, as that's highly machine-specific. The p1020rdb is apparently relying 
> on the old behavior, which is broken, and due to the fact that the first 
> ethernet interface doesn't *use* the TBI PHY.
> 
> You should add this to your board tree:
> 
>                 mdio@24000 {
> 
>                         tbi0: tbi-phy@11 {
>                                 reg = <0x11>;
>                                 device_type = "tbi-phy";
>                         };
>                 };
> 
> And add the PHYs you use, as well as set reg (and the value after the "@") 
> to something that makes sense for your board.

Thanks for your detailed explanation and prompt response. I've added a tbi 
node, dropped my patch, and now my board works as expected.

> I am going to go right now, and add tbi nodes for all of the Freescale 
> platforms. I will also modify the fsl_pq_mdio code to be more explicit about 
> its reason for failure.

Please Cc me on these.

Thanks,
baruch

-- 
                                                     ~. .~   Tk Open Systems
=}------------------------------------------------ooO--U--Ooo------------{=
   - baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -

^ permalink raw reply

* [RFC PATCH v5 2/9] fadump: Reserve the memory for firmware assisted dump.
From: Mahesh J Salgaonkar @ 2011-11-15 15:13 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111115151145.16533.16384.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Reserve the memory during early boot to preserve CPU state data, HPTE region
and RMR region data in case of kernel crash. At the time of crash, powerpc
firmware will store CPU state data, HPTE region data and move RMR region
data to the reserved memory area.

If the firmware-assisted dump fails to reserve the memory, then fall back
to the existing kexec-based kdump.

Most of the code that reserves the memory has been adapted from the
phyp-assisted dump implementation written by Linas Vepstas and
Manish Ahuja.

Change in v5:
- Merged patch 10/10 which introduces a config option CONFIG_FA_DUMP
  for firmware assisted dump feature on Powerpc (ppc64) architecture.
- Increased MIN_BOOT_MEM by 64M to avoid OOM issue during network
  dump capture. When kdump infrastructure is configured to save vmcore
  over network, we run into OOM issue while loading modules related to
  network setup.

Change in v2:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  "ibm,configure-kernel-dump-sizes" property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig              |   13 ++
 arch/powerpc/include/asm/fadump.h |   69 ++++++++++
 arch/powerpc/kernel/Makefile      |    1 
 arch/powerpc/kernel/fadump.c      |  250 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/prom.c        |   15 ++
 5 files changed, 347 insertions(+), 1 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 arch/powerpc/kernel/fadump.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6926b61..7ce773c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -379,6 +379,19 @@ config PHYP_DUMP
 
 	  If unsure, say "N"
 
+config FA_DUMP
+	bool "Firmware-assisted dump"
+	depends on PPC64 && PPC_RTAS && CRASH_DUMP
+	help
+	  A robust mechanism to get reliable kernel crash dump with
+	  assistance from firmware. This approach does not use kexec,
+	  instead firmware assists in booting the kdump kernel
+	  while preserving memory contents. Firmware-assisted dump
+	  is meant to be a kdump replacement offering robustness and
+	  speed not possible without system firmware assistance.
+
+	  If unsure, say "N"
+
 config PPCBUG_NVRAM
 	bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC
 	default y if PPC_PREP
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
new file mode 100644
index 0000000..86b17e8
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump.h
@@ -0,0 +1,69 @@
+/*
+ * Firmware Assisted dump header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __PPC64_FA_DUMP_H__
+#define __PPC64_FA_DUMP_H__
+
+#ifdef CONFIG_FA_DUMP
+
+/*
+ * The RMR region will be saved for later dumping when kernel crashes.
+ * Set this to RMO size.
+ */
+#define RMR_START	0x0
+#define RMR_END		(ppc64_rma_size)
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully. When kdump infrastructure is
+ * configured to save vmcore over network, we run into OOM issue while
+ * loading modules related to network setup. Hence we need additional 64M
+ * of memory to avoid OOM issue.
+ */
+#define MIN_BOOT_MEM	(((RMR_END < (0x1UL << 28)) ? (0x1UL << 28) : RMR_END) \
+			+ (0x1UL << 26))
+
+/* Firmware provided dump sections */
+#define FADUMP_CPU_STATE_DATA	0x0001
+#define FADUMP_HPTE_REGION	0x0002
+#define FADUMP_REAL_MODE_REGION	0x0011
+
+struct fw_dump {
+	unsigned long	cpu_state_data_size;
+	unsigned long	hpte_region_size;
+	unsigned long	boot_memory_size;
+	unsigned long	reserve_dump_area_start;
+	unsigned long	reserve_dump_area_size;
+	/* cmd line option during boot */
+	unsigned long	reserve_bootvar;
+
+	int		ibm_configure_kernel_dump;
+
+	unsigned long	fadump_enabled:1;
+	unsigned long	fadump_supported:1;
+	unsigned long	dump_active:1;
+};
+
+extern int early_init_dt_scan_fw_dump(unsigned long node,
+		const char *uname, int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
+#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ce4f7f1..59b549c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
+obj-$(CONFIG_FA_DUMP)		+= fadump.o
 ifeq ($(CONFIG_PPC32),y)
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 0000000..d94fc0e
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,250 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. Most of the implementation has been adapted from the
+ * phyp-assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+
+/*
+ * The RTAS property "ibm,configure-kernel-dump-sizes" returns dump
+ * sizes for the firmware provided dump sections (cpu state data
+ * and hpte region).
+ */
+struct dump_section {
+	u32		dump_section;
+	unsigned long	section_size;
+} __packed;
+
+static struct fw_dump fw_dump;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	const struct dump_section *sections;
+	int i, num_sections;
+	unsigned long size;
+	const int *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 0;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = *token;
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL))
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 0;
+
+	num_sections = size / sizeof(struct dump_section);
+
+	for (i = 0; i < num_sections; i++) {
+		switch (sections[i].dump_section) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size = sections[i].section_size;
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size = sections[i].section_size;
+			break;
+		}
+	}
+	return 1;
+}
+
+/**
+ * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long fadump_calculate_reserve_size(void)
+{
+	unsigned long size;
+
+	/*
+	 * Check if the size is specified through fadump_reserve_mem= cmdline
+	 * option. If yes, then use that.
+	 */
+	if (fw_dump.reserve_bootvar)
+		return fw_dump.reserve_bootvar;
+
+	/* divide by 20 to get 5% of value */
+	size = memblock_end_of_DRAM();
+	do_div(size, 20);
+
+	/* round it down in multiples of 256 */
+	size = size & ~0x0FFFFFFFUL;
+
+	/* Truncate to memory_limit. We don't want to over reserve the memory.*/
+	if (memory_limit && size > memory_limit)
+		size = memory_limit;
+
+	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_fadump_area_size(void)
+{
+	unsigned long size = 0;
+
+	size += fw_dump.cpu_state_data_size;
+	size += fw_dump.hpte_region_size;
+	size += fw_dump.boot_memory_size;
+
+	size = PAGE_ALIGN(size);
+	return size;
+}
+
+int __init fadump_reserve_mem(void)
+{
+	unsigned long base, size, memory_boundary;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_ERR "Firmware-assisted dump is not supported on"
+				" this hardware\n");
+		fw_dump.fadump_enabled = 0;
+		return 0;
+	}
+	/* Initialize boot memory size */
+	fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+
+	/*
+	 * Calculate the memory boundary.
+	 * If memory_limit is less than actual memory boundary then reserve
+	 * the memory for fadump beyond the memory_limit and adjust the
+	 * memory_limit accordingly, so that the running kernel can run with
+	 * specified memory_limit.
+	 */
+	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+		size = get_fadump_area_size();
+		if ((memory_limit + size) < memblock_end_of_DRAM())
+			memory_limit += size;
+		else
+			memory_limit = memblock_end_of_DRAM();
+		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+				" dump, now %#016llx\n",
+				(unsigned long long)memory_limit);
+	}
+	if (memory_limit)
+		memory_boundary = memory_limit;
+	else
+		memory_boundary = memblock_end_of_DRAM();
+
+	if (fw_dump.dump_active) {
+		printk(KERN_INFO "Firmware-assisted dump is active.\n");
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot_memory_size so that we don't touch it until
+		 * dump is written to disk by userspace tool. This memory
+		 * will be released for general use once the dump is saved.
+		 */
+		base = fw_dump.boot_memory_size;
+		size = memory_boundary - base;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for saving crash dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	} else {
+		/* Reserve the memory at the top of memory. */
+		size = get_fadump_area_size();
+		base = memory_boundary - size;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for firmware-assisted dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	}
+	fw_dump.reserve_dump_area_start = base;
+	fw_dump.reserve_dump_area_size = size;
+	return 1;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+	if (!p)
+		return 1;
+
+	if (p[0] == '1')
+		fw_dump.fadump_enabled = 1;
+	else if (p[0] == '0')
+		fw_dump.fadump_enabled = 0;
+
+	return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/* Look for fadump_reserve_mem= cmdline option */
+static int __init early_fadump_reserve_mem(char *p)
+{
+	if (p)
+		fw_dump.reserve_bootvar = memparse(p, &p);
+	return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 174e1e9..3fe75eb 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -54,6 +54,7 @@
 #include <asm/pci-bridge.h>
 #include <asm/phyp_dump.h>
 #include <asm/kexec.h>
+#include <asm/fadump.h>
 #include <mm/mmu_decl.h>
 
 #ifdef DEBUG
@@ -712,6 +713,11 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL);
 #endif
 
+#ifdef CONFIG_FA_DUMP
+	/* scan tree to see if dump is active during last boot */
+	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
+#endif
+
 	/* Retrieve various informations from the /chosen node of the
 	 * device-tree, including the platform type, initrd location and
 	 * size, TCE reserve, and more ...
@@ -735,7 +741,14 @@ void __init early_init_devtree(void *params)
 	if (PHYSICAL_START > MEMORY_START)
 		memblock_reserve(MEMORY_START, 0x8000);
 	reserve_kdump_trampoline();
-	reserve_crashkernel();
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * If we fail to reserve memory for firmware-assisted dump then
+	 * fallback to kexec based kdump.
+	 */
+	if (fadump_reserve_mem() == 0)
+#endif
+		reserve_crashkernel();
 	early_reserve_mem();
 	phyp_dump_reserve_mem();
 

^ permalink raw reply related

* [RFC PATCH 2/2] WIP: PowerPC cache cleanup
From: Kyle Moffett @ 2011-11-15 15:22 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: B04825, linux-kernel, paul.gortmaker, Kyle Moffett, scottwood
In-Reply-To: <1320986410.21206.camel@pasglop>

[My apologies for the resend, it does not seem to have hit the MLs.
I think my git send-email "cc-cmd" may have broken somehow, oops.]

This badly needs breaking up, and a better changelog... oh well...

The big changes:

* The "ppc64_caches" structure is now "powerpc_caches" and is used on
  both PPC32 and PPC64.  I hated staring at the pages and pages of
  assembly code, so nearly all of the functions are now C with tiny
  snippets of inline ASM in the loops.

* Lots of ugly assembly functions in arch/powerpc/kernel/misc_*.S were
  rewritten as cleaner inline ASM in arch/powerpc/mm/cache.c

* I'm not sure that the physical address functions from those files
  actually came out cleaner, but they are now more correct.

* I'm not 100% sure I like the new FOR_EACH_CACHELINE() macro, but it
  sure does make a lot of the other code much cleaner.

* I have a bit of a temptation to try to merge the 32/64-bit variants
  of copy_page() into a single C function.  A quick test seems to show
  that I can get nearly identical output to the 64-bit ASM with very
  little work.


---
 arch/powerpc/include/asm/cache.h             |  155 ++++++++++++---
 arch/powerpc/include/asm/cacheflush.h        |    3 -
 arch/powerpc/include/asm/page.h              |    6 +
 arch/powerpc/include/asm/page_32.h           |    4 +-
 arch/powerpc/include/asm/page_64.h           |   17 --
 arch/powerpc/kernel/align.c                  |    7 +-
 arch/powerpc/kernel/asm-offsets.c            |   13 +-
 arch/powerpc/kernel/head_32.S                |    9 +-
 arch/powerpc/kernel/head_64.S                |    2 +-
 arch/powerpc/kernel/misc_32.S                |  193 ------------------
 arch/powerpc/kernel/misc_64.S                |  182 -----------------
 arch/powerpc/kernel/ppc_ksyms.c              |    3 -
 arch/powerpc/kernel/setup-common.c           |  103 ++++++++++
 arch/powerpc/kernel/setup.h                  |    1 +
 arch/powerpc/kernel/setup_32.c               |   11 +-
 arch/powerpc/kernel/setup_64.c               |  118 +----------
 arch/powerpc/kernel/vdso.c                   |   27 +--
 arch/powerpc/lib/copypage_64.S               |   10 +-
 arch/powerpc/mm/Makefile                     |    2 +-
 arch/powerpc/mm/cache.c                      |  279 ++++++++++++++++++++++++++
 arch/powerpc/mm/dma-noncoherent.c            |    2 +-
 arch/powerpc/platforms/52xx/lite5200_sleep.S |    9 +-
 arch/powerpc/platforms/powermac/pci.c        |    2 +-
 arch/powerpc/xmon/xmon.c                     |   53 +++---
 drivers/macintosh/smu.c                      |    8 +-
 25 files changed, 599 insertions(+), 620 deletions(-)
 create mode 100644 arch/powerpc/mm/cache.c

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 4b50941..b1dc08f 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -3,47 +3,142 @@
 
 #ifdef __KERNEL__
 
-
-/* bytes per L1 cache line */
-#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
-#define L1_CACHE_SHIFT		4
-#define MAX_COPY_PREFETCH	1
+/*
+ * Various PowerPC CPUs which are otherwise compatible have different L1
+ * cache line sizes.
+ *
+ * Unfortunately, lots of kernel code assumes that L1_CACHE_BYTES and
+ * L1_CACHE_SHIFT are compile-time constants that can be used to align
+ * data-structures to avoid false cacheline sharing, so we can't just
+ * compute them at runtime from the cputable values.
+ *
+ * So for alignment purposes, we will compute these values as safe maximums
+ * of all the CPU support compiled into the kernel.
+ */
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_47x)
+# define L1_CACHE_SHIFT_MAX 7 /* 128-byte cache blocks */
 #elif defined(CONFIG_PPC_E500MC)
-#define L1_CACHE_SHIFT		6
-#define MAX_COPY_PREFETCH	4
-#elif defined(CONFIG_PPC32)
-#define MAX_COPY_PREFETCH	4
-#if defined(CONFIG_PPC_47x)
-#define L1_CACHE_SHIFT		7
+# define L1_CACHE_SHIFT_MAX 6 /* 64-byte cache blocks */
 #else
-#define L1_CACHE_SHIFT		5
+# define L1_CACHE_SHIFT_MAX 5 /* 32-byte cache blocks */
 #endif
+#define L1_CACHE_BYTES_MAX (1 << L1_CACHE_SHIFT_MAX)
+
+#define L1_CACHE_SHIFT  L1_CACHE_SHIFT_MAX
+#define L1_CACHE_BYTES  L1_CACHE_BYTES_MAX
+#define SMP_CACHE_BYTES L1_CACHE_BYTES_MAX
+
+/*
+ * Unfortunately, for other purposes, we can't just use a safe maximum value
+ * because it gets used in loops when invalidating or clearing cachelines and
+ * it would be very bad to only flush/invalidate/zero/etc every 4th one.
+ *
+ * During early initialization we load these values from the device-tree and
+ * the cputable into the powerpc_caches structure, but we need to be able to
+ * clear pages before that occurs, so these need sane default values.
+ *
+ * As explained in the powerpc_caches structure definition, the defaults
+ * should be safe minimums, so that's what we compute here.
+ */
+#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
+# define L1_CACHE_SHIFT_MIN 4 /* 16-byte cache blocks */
+#elif defined(CONFIG_PPC32)
+# define L1_CACHE_SHIFT_MIN 5 /* 32-byte cache blocks */
 #else /* CONFIG_PPC64 */
-#define L1_CACHE_SHIFT		7
+# define L1_CACHE_SHIFT_MIN 6 /* 64-byte cache blocks */
 #endif
+#define L1_CACHE_BYTES_MIN (1 << L1_CACHE_SHIFT_MIN)
 
-#define	L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
+/*
+ * Apparently the 8xx and the 403GCX have tiny caches, so they never prefetch
+ * more than a single cacheline in the ASM memory copy functions.
+ *
+ * All other 32-bit CPUs prefetch 4 cachelines, and the 64-bit CPUs have
+ * their own copy routines which prefetch the entire page.
+ */
+#ifdef CONFIG_PPC32 /* 64-bit has its own copy routines */
+# if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
+#  define MAX_COPY_PREFETCH 1
+# else
+#  define MAX_COPY_PREFETCH 4
+# endif
+#endif /* CONFIG_PPC32 */
 
-#define	SMP_CACHE_BYTES		L1_CACHE_BYTES
+#ifndef __ASSEMBLY__
 
-#if defined(__powerpc64__) && !defined(__ASSEMBLY__)
-struct ppc64_caches {
-	u32	dsize;			/* L1 d-cache size */
-	u32	dline_size;		/* L1 d-cache line size	*/
-	u32	log_dline_size;
-	u32	dlines_per_page;
-	u32	isize;			/* L1 i-cache size */
-	u32	iline_size;		/* L1 i-cache line size	*/
-	u32	log_iline_size;
-	u32	ilines_per_page;
-};
+/*
+ * A handy macro to iterate over all the cachelines referring to memory from
+ * "START" through "STOP - 1", inclusive.
+ */
+#define FOR_EACH_CACHELINE(LINE, START, STOP, CACHE)			\
+	for (u32 linesize__ = powerpc_caches.CACHE##_block_bytes,	\
+			(LINE) = (START) & ~(linesize__ - 1);		\
+			(LINE) < (STOP); (LINE) += linesize__)
+
+/* Write out a data cache block if it is dirty */
+static inline void dcbst(unsigned long addr)
+{
+	asm volatile("dcbst %y0" :: "Z"(addr) : "memory");
+}
 
-extern struct ppc64_caches ppc64_caches;
-#endif /* __powerpc64__ && ! __ASSEMBLY__ */
+/* Invalidate a data cache block (will lose data if dirty!) */
+static inline void dcbi(unsigned long addr)
+{
+	asm volatile("dcbi %y0" :: "Z"(addr) : "memory");
+}
+
+/* Write out (if dirty) and invalidate a data cache block */
+static inline void dcbf(unsigned long addr)
+{
+	asm volatile("dcbf %y0" :: "Z"(addr) : "memory");
+}
+
+/* Populate a data cache block with zeros */
+static inline void dcbz(unsigned long addr)
+{
+	asm volatile("dcbz %y0" :: "Z"(addr) : "memory");
+}
+
+/* Invalidate an instruction cache block */
+static inline void icbi(unsigned long addr)
+{
+	asm volatile("icbi %y0" :: "Z"(addr) : "memory");
+}
+
+/*
+ * This structure contains the various PowerPC cache parameters computed
+ * shortly after the device-tree has been unflattened during boot.
+ *
+ * Prior to that they have statically initialized values from L1_CACHE_*_MIN
+ * computed above.
+ *
+ * NOTE: If the dcache/icache are separate then ucache_* should be zeroed,
+ *       otherwise dcache == icache == ucache.
+ */
+struct powerpc_caches {
+	/* Data cache parameters */
+	u32 dcache_total_bytes;
+	u32 dcache_block_bytes;
+	u32 dcache_block_shift;
+	u32 dcache_blocks_per_page;
+
+	/* Instruction cache parameters */
+	u32 icache_total_bytes;
+	u32 icache_block_bytes;
+	u32 icache_block_shift;
+	u32 icache_blocks_per_page;
+
+	/* Unified cache parameters (If != 0, all 3 caches must be equal) */
+	u32 ucache_total_bytes;
+	u32 ucache_block_bytes;
+	u32 ucache_block_shift;
+	u32 ucache_blocks_per_page;
+};
+extern struct powerpc_caches powerpc_caches;
 
-#if !defined(__ASSEMBLY__)
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
-#endif
+
+#endif /* not __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_CACHE_H */
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index ab9e402..8646443 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -47,12 +47,9 @@ extern void __flush_dcache_icache_phys(unsigned long physaddr);
 #endif /* CONFIG_PPC32 && !CONFIG_BOOKE */
 
 extern void flush_dcache_range(unsigned long start, unsigned long stop);
-#ifdef CONFIG_PPC32
 extern void clean_dcache_range(unsigned long start, unsigned long stop);
 extern void invalidate_dcache_range(unsigned long start, unsigned long stop);
-#endif /* CONFIG_PPC32 */
 #ifdef CONFIG_PPC64
-extern void flush_inval_dcache_range(unsigned long start, unsigned long stop);
 extern void flush_dcache_phys_range(unsigned long start, unsigned long stop);
 #endif
 
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index dd9c4fd..b2e24ce 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -286,11 +286,17 @@ static inline int hugepd_ok(hugepd_t hpd)
 #endif /* CONFIG_HUGETLB_PAGE */
 
 struct page;
+extern void clear_pages(void *page, int order);
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
 		struct page *p);
 extern int page_is_ram(unsigned long pfn);
 
+static inline void clear_page(void *page)
+{
+	clear_pages(page, 0);
+}
+
 #ifdef CONFIG_PPC_SMLPAR
 void arch_free_page(struct page *page, int order);
 #define HAVE_ARCH_FREE_PAGE
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index 68d73b2..12ae694 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -10,7 +10,7 @@
 #define VM_DATA_DEFAULT_FLAGS	VM_DATA_DEFAULT_FLAGS32
 
 #ifdef CONFIG_NOT_COHERENT_CACHE
-#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES_MAX
 #endif
 
 #ifdef CONFIG_PTE_64BIT
@@ -37,8 +37,6 @@ typedef unsigned long pte_basic_t;
 #endif
 
 struct page;
-extern void clear_pages(void *page, int order);
-static inline void clear_page(void *page) { clear_pages(page, 0); }
 extern void copy_page(void *to, void *from);
 
 #include <asm-generic/getorder.h>
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index fb40ede..7e156f6 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -42,23 +42,6 @@
 
 typedef unsigned long pte_basic_t;
 
-static __inline__ void clear_page(void *addr)
-{
-	unsigned long lines, line_size;
-
-	line_size = ppc64_caches.dline_size;
-	lines = ppc64_caches.dlines_per_page;
-
-	__asm__ __volatile__(
-	"mtctr	%1	# clear_page\n\
-1:      dcbz	0,%0\n\
-	add	%0,%0,%3\n\
-	bdnz+	1b"
-        : "=r" (addr)
-        : "r" (lines), "0" (addr), "r" (line_size)
-	: "ctr", "memory");
-}
-
 extern void copy_page(void *to, void *from);
 
 /* Log 2 of page table size */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 8184ee9..debfb99 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -233,14 +233,9 @@ static inline unsigned make_dsisr(unsigned instr)
  */
 static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
 {
+	int i, size = powerpc_caches.dcache_block_bytes;
 	long __user *p;
-	int i, size;
 
-#ifdef __powerpc64__
-	size = ppc64_caches.dline_size;
-#else
-	size = L1_CACHE_BYTES;
-#endif
 	p = (long __user *) (regs->dar & -size);
 	if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size))
 		return -EFAULT;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7c5324f..505b25a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -126,13 +126,14 @@ int main(void)
 	DEFINE(TI_TASK, offsetof(struct thread_info, task));
 	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
 
+	DEFINE(DCACHE_BLOCK_SHIFT,	offsetof(struct powerpc_caches, dcache_block_shift));
+	DEFINE(DCACHE_BLOCK_BYTES,	offsetof(struct powerpc_caches, dcache_block_bytes));
+	DEFINE(DCACHE_BLOCKS_PER_PAGE,	offsetof(struct powerpc_caches, dcache_blocks_per_page));
+	DEFINE(ICACHE_BLOCK_SHIFT,	offsetof(struct powerpc_caches, icache_block_shift));
+	DEFINE(ICACHE_BLOCK_BYTES,	offsetof(struct powerpc_caches, icache_block_bytes));
+	DEFINE(ICACHE_BLOCKS_PER_PAGE,	offsetof(struct powerpc_caches, icache_blocks_per_page));
+
 #ifdef CONFIG_PPC64
-	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
-	DEFINE(DCACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_dline_size));
-	DEFINE(DCACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, dlines_per_page));
-	DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size));
-	DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size));
-	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
 	DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 0654dba..8abc44a 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -786,7 +786,14 @@ relocate_kernel:
 _ENTRY(copy_and_flush)
 	addi	r5,r5,-4
 	addi	r6,r6,-4
-4:	li	r0,L1_CACHE_BYTES/4
+4:	li	r0,L1_CACHE_BYTES_MIN/4	/* Use the smallest common	*/
+					/* denominator cache line	*/
+					/* size.  This results in	*/
+					/* extra cache line flushes	*/
+					/* but operation is correct.	*/
+					/* Can't get cache line size	*/
+					/* from device-tree yet		*/
+
 	mtctr	r0
 3:	addi	r6,r6,4			/* copy a cache line */
 	lwzx	r0,r6,r4
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 06c7251..183d371 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -480,7 +480,7 @@ p_end:	.llong	_end - _stext
 _GLOBAL(copy_and_flush)
 	addi	r5,r5,-8
 	addi	r6,r6,-8
-4:	li	r0,8			/* Use the smallest common	*/
+4:	li	r0,L1_CACHE_BYTES_MIN/8	/* Use the smallest common	*/
 					/* denominator cache line	*/
 					/* size.  This results in	*/
 					/* extra cache line flushes	*/
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index f7d760a..ee61600 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -321,199 +321,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
 	blr
 
 /*
- * Write any modified data cache blocks out to memory
- * and invalidate the corresponding instruction cache blocks.
- * This is a no-op on the 601.
- *
- * flush_icache_range(unsigned long start, unsigned long stop)
- */
-_KPROBE(__flush_icache_range)
-BEGIN_FTR_SECTION
-	blr				/* for 601, do nothing */
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
-	li	r5,L1_CACHE_BYTES-1
-	andc	r3,r3,r5
-	subf	r4,r3,r4
-	add	r4,r4,r5
-	srwi.	r4,r4,L1_CACHE_SHIFT
-	beqlr
-	mtctr	r4
-	mr	r6,r3
-1:	dcbst	0,r3
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	1b
-	sync				/* wait for dcbst's to get to ram */
-#ifndef CONFIG_44x
-	mtctr	r4
-2:	icbi	0,r6
-	addi	r6,r6,L1_CACHE_BYTES
-	bdnz	2b
-#else
-	/* Flash invalidate on 44x because we are passed kmapped addresses and
-	   this doesn't work for userspace pages due to the virtually tagged
-	   icache.  Sigh. */
-	iccci	0, r0
-#endif
-	sync				/* additional sync needed on g4 */
-	isync
-	blr
-/*
- * Write any modified data cache blocks out to memory.
- * Does not invalidate the corresponding cache lines (especially for
- * any corresponding instruction cache).
- *
- * clean_dcache_range(unsigned long start, unsigned long stop)
- */
-_GLOBAL(clean_dcache_range)
-	li	r5,L1_CACHE_BYTES-1
-	andc	r3,r3,r5
-	subf	r4,r3,r4
-	add	r4,r4,r5
-	srwi.	r4,r4,L1_CACHE_SHIFT
-	beqlr
-	mtctr	r4
-
-1:	dcbst	0,r3
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	1b
-	sync				/* wait for dcbst's to get to ram */
-	blr
-
-/*
- * Write any modified data cache blocks out to memory and invalidate them.
- * Does not invalidate the corresponding instruction cache blocks.
- *
- * flush_dcache_range(unsigned long start, unsigned long stop)
- */
-_GLOBAL(flush_dcache_range)
-	li	r5,L1_CACHE_BYTES-1
-	andc	r3,r3,r5
-	subf	r4,r3,r4
-	add	r4,r4,r5
-	srwi.	r4,r4,L1_CACHE_SHIFT
-	beqlr
-	mtctr	r4
-
-1:	dcbf	0,r3
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	1b
-	sync				/* wait for dcbst's to get to ram */
-	blr
-
-/*
- * Like above, but invalidate the D-cache.  This is used by the 8xx
- * to invalidate the cache so the PPC core doesn't get stale data
- * from the CPM (no cache snooping here :-).
- *
- * invalidate_dcache_range(unsigned long start, unsigned long stop)
- */
-_GLOBAL(invalidate_dcache_range)
-	li	r5,L1_CACHE_BYTES-1
-	andc	r3,r3,r5
-	subf	r4,r3,r4
-	add	r4,r4,r5
-	srwi.	r4,r4,L1_CACHE_SHIFT
-	beqlr
-	mtctr	r4
-
-1:	dcbi	0,r3
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	1b
-	sync				/* wait for dcbi's to get to ram */
-	blr
-
-/*
- * Flush a particular page from the data cache to RAM.
- * Note: this is necessary because the instruction cache does *not*
- * snoop from the data cache.
- * This is a no-op on the 601 which has a unified cache.
- *
- *	void __flush_dcache_icache(void *page)
- */
-_GLOBAL(__flush_dcache_icache)
-BEGIN_FTR_SECTION
-	blr
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
-	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
-	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
-	mtctr	r4
-	mr	r6,r3
-0:	dcbst	0,r3				/* Write line to ram */
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	0b
-	sync
-#ifdef CONFIG_44x
-	/* We don't flush the icache on 44x. Those have a virtual icache
-	 * and we don't have access to the virtual address here (it's
-	 * not the page vaddr but where it's mapped in user space). The
-	 * flushing of the icache on these is handled elsewhere, when
-	 * a change in the address space occurs, before returning to
-	 * user space
-	 */
-BEGIN_MMU_FTR_SECTION
-	blr
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
-#endif /* CONFIG_44x */
-	mtctr	r4
-1:	icbi	0,r6
-	addi	r6,r6,L1_CACHE_BYTES
-	bdnz	1b
-	sync
-	isync
-	blr
-
-#ifndef CONFIG_BOOKE
-/*
- * Flush a particular page from the data cache to RAM, identified
- * by its physical address.  We turn off the MMU so we can just use
- * the physical address (this may be a highmem page without a kernel
- * mapping).
- *
- *	void __flush_dcache_icache_phys(unsigned long physaddr)
- */
-_GLOBAL(__flush_dcache_icache_phys)
-BEGIN_FTR_SECTION
-	blr					/* for 601, do nothing */
-END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
-	mfmsr	r10
-	rlwinm	r0,r10,0,28,26			/* clear DR */
-	mtmsr	r0
-	isync
-	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
-	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
-	mtctr	r4
-	mr	r6,r3
-0:	dcbst	0,r3				/* Write line to ram */
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	0b
-	sync
-	mtctr	r4
-1:	icbi	0,r6
-	addi	r6,r6,L1_CACHE_BYTES
-	bdnz	1b
-	sync
-	mtmsr	r10				/* restore DR */
-	isync
-	blr
-#endif /* CONFIG_BOOKE */
-
-/*
- * Clear pages using the dcbz instruction, which doesn't cause any
- * memory traffic (except to write out any cache lines which get
- * displaced).  This only works on cacheable memory.
- *
- * void clear_pages(void *page, int order) ;
- */
-_GLOBAL(clear_pages)
-	li	r0,PAGE_SIZE/L1_CACHE_BYTES
-	slw	r0,r0,r4
-	mtctr	r0
-1:	dcbz	0,r3
-	addi	r3,r3,L1_CACHE_BYTES
-	bdnz	1b
-	blr
-
-/*
  * Copy a whole page.  We use the dcbz instruction on the destination
  * to reduce memory traffic (it eliminates the unnecessary reads of
  * the destination into cache).  This requires that the destination
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 616921e..500fd61 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -53,188 +53,6 @@ _GLOBAL(call_handle_irq)
 	mtlr	r0
 	blr
 
-	.section	".toc","aw"
-PPC64_CACHES:
-	.tc		ppc64_caches[TC],ppc64_caches
-	.section	".text"
-
-/*
- * Write any modified data cache blocks out to memory
- * and invalidate the corresponding instruction cache blocks.
- *
- * flush_icache_range(unsigned long start, unsigned long stop)
- *
- *   flush all bytes from start through stop-1 inclusive
- */
-
-_KPROBE(__flush_icache_range)
-
-/*
- * Flush the data cache to memory 
- * 
- * Different systems have different cache line sizes
- * and in some cases i-cache and d-cache line sizes differ from
- * each other.
- */
- 	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)/* Get cache line size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of cache line size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	mtctr	r8
-1:	dcbst	0,r6
-	add	r6,r6,r7
-	bdnz	1b
-	sync
-
-/* Now invalidate the instruction cache */
-	
-	lwz	r7,ICACHEL1LINESIZE(r10)	/* Get Icache line size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5
-	lwz	r9,ICACHEL1LOGLINESIZE(r10)	/* Get log-2 of Icache line size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	mtctr	r8
-2:	icbi	0,r6
-	add	r6,r6,r7
-	bdnz	2b
-	isync
-	blr
-	.previous .text
-/*
- * Like above, but only do the D-cache.
- *
- * flush_dcache_range(unsigned long start, unsigned long stop)
- *
- *    flush all bytes from start to stop-1 inclusive
- */
-_GLOBAL(flush_dcache_range)
-
-/*
- * Flush the data cache to memory 
- * 
- * Different systems have different cache line sizes
- */
- 	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	mtctr	r8
-0:	dcbst	0,r6
-	add	r6,r6,r7
-	bdnz	0b
-	sync
-	blr
-
-/*
- * Like above, but works on non-mapped physical addresses.
- * Use only for non-LPAR setups ! It also assumes real mode
- * is cacheable. Used for flushing out the DART before using
- * it as uncacheable memory 
- *
- * flush_dcache_phys_range(unsigned long start, unsigned long stop)
- *
- *    flush all bytes from start to stop-1 inclusive
- */
-_GLOBAL(flush_dcache_phys_range)
- 	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	mfmsr	r5			/* Disable MMU Data Relocation */
-	ori	r0,r5,MSR_DR
-	xori	r0,r0,MSR_DR
-	sync
-	mtmsr	r0
-	sync
-	isync
-	mtctr	r8
-0:	dcbst	0,r6
-	add	r6,r6,r7
-	bdnz	0b
-	sync
-	isync
-	mtmsr	r5			/* Re-enable MMU Data Relocation */
-	sync
-	isync
-	blr
-
-_GLOBAL(flush_inval_dcache_range)
- 	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)/* Get log-2 of dcache line size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	sync
-	isync
-	mtctr	r8
-0:	dcbf	0,r6
-	add	r6,r6,r7
-	bdnz	0b
-	sync
-	isync
-	blr
-
-
-/*
- * Flush a particular page from the data cache to RAM.
- * Note: this is necessary because the instruction cache does *not*
- * snoop from the data cache.
- *
- *	void __flush_dcache_icache(void *page)
- */
-_GLOBAL(__flush_dcache_icache)
-/*
- * Flush the data cache to memory 
- * 
- * Different systems have different cache line sizes
- */
-
-/* Flush the dcache */
- 	ld	r7,PPC64_CACHES@toc(r2)
-	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
-	lwz	r4,DCACHEL1LINESPERPAGE(r7)	/* Get # dcache lines per page */
-	lwz	r5,DCACHEL1LINESIZE(r7)		/* Get dcache line size */
-	mr	r6,r3
-	mtctr	r4
-0:	dcbst	0,r6
-	add	r6,r6,r5
-	bdnz	0b
-	sync
-
-/* Now invalidate the icache */	
-
-	lwz	r4,ICACHEL1LINESPERPAGE(r7)	/* Get # icache lines per page */
-	lwz	r5,ICACHEL1LINESIZE(r7)		/* Get icache line size */
-	mtctr	r4
-1:	icbi	0,r3
-	add	r3,r3,r5
-	bdnz	1b
-	isync
-	blr
-
-
 #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
 /*
  * Do an IO access in real mode
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index acba8ce..ccdceb7 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -53,7 +53,6 @@ extern void program_check_exception(struct pt_regs *regs);
 extern void single_step_exception(struct pt_regs *regs);
 extern int sys_sigreturn(struct pt_regs *regs);
 
-EXPORT_SYMBOL(clear_pages);
 EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
 EXPORT_SYMBOL(DMA_MODE_READ);
 EXPORT_SYMBOL(DMA_MODE_WRITE);
@@ -113,8 +112,6 @@ EXPORT_SYMBOL(giveup_spe);
 #ifndef CONFIG_PPC64
 EXPORT_SYMBOL(flush_instruction_cache);
 #endif
-EXPORT_SYMBOL(__flush_icache_range);
-EXPORT_SYMBOL(flush_dcache_range);
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 77bb77d..3abfea4 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -83,6 +83,54 @@ unsigned long klimit = (unsigned long) _end;
 char cmd_line[COMMAND_LINE_SIZE];
 
 /*
+ * Initialize these values to minimum safe defaults in case they need to be
+ * used early during the boot process.  While this may not seem safe, it is
+ * actually safe in practice, because all of the kernel loops that use this
+ * data operate on whole pages.
+ *
+ * The PowerPC Book III-E spec documents that the pagesize is an even
+ * multiple of the cache block size and the cache blocks are always
+ * page-aligned.
+ *
+ * So, for example, when clearing a whole page there are only two things that
+ * can be done wrong with "dcbz":
+ *
+ *   (1) Call "dcbz" with an address outside the page you want to zero.
+ *
+ *   (2) Call "dcbz" too few times to actually hit all of the cachelines,
+ *       IE: Use a too-large cacheline stride.
+ *
+ * So as long as we ensure that this number is small enough for the current
+ * CPU everything will operate correctly, albeit with a slight performance
+ * hit, until we get a chance to parse the device-tree for the right value.
+ *
+ * NOTE: Userspace expects an exact value, so none of the above applies after
+ * the device tree has been unflattened and actual values computed.
+ *
+ * See arch/powerpc/include/asm/cache.h for more information.
+ */
+struct powerpc_caches powerpc_caches = {
+	/* Data cache sizes */
+	.dcache_total_bytes  = 0, /* Unknown */
+	.dcache_block_bytes = L1_CACHE_BYTES_MIN,
+	.dcache_block_shift = L1_CACHE_SHIFT_MIN,
+	.dcache_blocks_per_page = (PAGE_SIZE >> L1_CACHE_SHIFT_MIN),
+
+	/* Instruction cache sizes */
+	.icache_total_bytes = 0,
+	.icache_block_bytes = L1_CACHE_BYTES_MIN,
+	.icache_block_shift = L1_CACHE_SHIFT_MIN,
+	.icache_blocks_per_page = (PAGE_SIZE >> L1_CACHE_SHIFT_MIN),
+
+	/* Unified cache (assume cache is split by default) */
+	.ucache_total_bytes = 0,
+	.ucache_block_bytes = 0,
+	.ucache_block_shift = 0,
+	.ucache_blocks_per_page = 0,
+};
+EXPORT_SYMBOL_GPL(powerpc_caches);
+
+/*
  * This still seems to be needed... -- paulus
  */ 
 struct screen_info screen_info = {
@@ -349,6 +397,61 @@ const struct seq_operations cpuinfo_op = {
 	.show =	show_cpuinfo,
 };
 
+/* Helper functions to compute various values from a cache block size */
+static void __init set_dcache_block_data(u32 bytes)
+{
+	u32 shift = __ilog2(bytes);
+	powerpc_caches.dcache_block_bytes = bytes;
+	powerpc_caches.dcache_block_shift = shift;
+	powerpc_caches.dcache_blocks_per_page = (PAGE_SIZE >> shift);
+}
+static void __init set_icache_block_data(u32 bytes)
+{
+	u32 shift = __ilog2(bytes);
+	powerpc_caches.icache_block_bytes = bytes;
+	powerpc_caches.icache_block_shift = shift;
+	powerpc_caches.icache_blocks_per_page = (PAGE_SIZE >> shift);
+}
+
+/*
+ * Preinitialize the powerpc_caches structure from the cputable.  We will
+ * later scan the device-tree for this information, which may be more
+ * accurate.
+ */
+void __init initialize_early_cache_info(void)
+{
+	set_dcache_block_data(cur_cpu_spec->dcache_bsize);
+	set_icache_block_data(cur_cpu_spec->icache_bsize);
+}
+
+/*
+ * Initialize the powerpc_caches structure from the device-tree for use by
+ * copy_page(), cache flush routines, and AT_DCACHEBSIZE elf headers.
+ *
+ * In the unlikely event that the device-tree doesn't have this information,
+ * the defaults loaded by initialize_early_cache_info() from the cputable
+ * will be used.
+ */
+void __init initialize_cache_info(void)
+{
+	/* Assume that the cache properties are the same across all nodes */
+	struct device_node *np = of_find_node_by_type(NULL, "cpu");
+	u32 value = 0;
+
+	/* First check data/instruction cache block sizes */
+	if (	!of_property_read_u32(np, "d-cache-block-size", &value) ||
+		!of_property_read_u32(np, "d-cache-line-size", &value))
+		set_dcache_block_data(value);
+
+	if (	!of_property_read_u32(np, "i-cache-block-size", &value) ||
+		!of_property_read_u32(np, "i-cache-line-size", &value))
+		set_icache_block_data(value);
+
+	/* Also read total cache sizes (no defaults here) */
+	of_property_read_u32(np, "d-cache-size", &powerpc_caches.dcache_total_bytes);
+	of_property_read_u32(np, "i-cache-size", &powerpc_caches.icache_total_bytes);
+}
+
 void __init check_for_initrd(void)
 {
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index 4c67ad7..1ae16ec 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -1,6 +1,7 @@
 #ifndef _POWERPC_KERNEL_SETUP_H
 #define _POWERPC_KERNEL_SETUP_H
 
+void initialize_cache_info(void);
 void check_for_initrd(void);
 void do_init_bootmem(void);
 void setup_panic(void);
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index c1ce863..1db2bfb 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -63,14 +63,6 @@ EXPORT_SYMBOL(vgacon_remap_base);
 #endif
 
 /*
- * These are used in binfmt_elf.c to put aux entries on the stack
- * for each elf executable being started.
- */
-int dcache_bsize;
-int icache_bsize;
-int ucache_bsize;
-
-/*
  * We're called here very early in the boot.  We determine the machine
  * type and call the appropriate low-level setup functions.
  *  -- Cort <cort@fsmlabs.com>
@@ -286,10 +278,13 @@ void __init setup_arch(char **cmdline_p)
 {
 	*cmdline_p = cmd_line;
 
+	initialize_early_cache_info();
+
 	/* so udelay does something sensible, assume <= 1000 bogomips */
 	loops_per_jiffy = 500000000 / HZ;
 
 	unflatten_device_tree();
+	initialize_cache_info();
 	check_for_initrd();
 
 	if (ppc_md.init_early)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 1a9dea8..bb686de 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -77,25 +77,6 @@ int boot_cpuid = 0;
 int __initdata spinning_secondaries;
 u64 ppc64_pft_size;
 
-/* Pick defaults since we might want to patch instructions
- * before we've read this from the device tree.
- */
-struct ppc64_caches ppc64_caches = {
-	.dline_size = 0x40,
-	.log_dline_size = 6,
-	.iline_size = 0x40,
-	.log_iline_size = 6
-};
-EXPORT_SYMBOL_GPL(ppc64_caches);
-
-/*
- * These are used in binfmt_elf.c to put aux entries on the stack
- * for each elf executable being started.
- */
-int dcache_bsize;
-int icache_bsize;
-int ucache_bsize;
-
 #ifdef CONFIG_SMP
 
 static char *smt_enabled_cmdline;
@@ -265,82 +246,6 @@ void smp_release_cpus(void)
 #endif /* CONFIG_SMP || CONFIG_KEXEC */
 
 /*
- * Initialize some remaining members of the ppc64_caches and systemcfg
- * structures
- * (at least until we get rid of them completely). This is mostly some
- * cache informations about the CPU that will be used by cache flush
- * routines and/or provided to userland
- */
-static void __init initialize_cache_info(void)
-{
-	struct device_node *np;
-	unsigned long num_cpus = 0;
-
-	DBG(" -> initialize_cache_info()\n");
-
-	for_each_node_by_type(np, "cpu") {
-		num_cpus += 1;
-
-		/*
-		 * We're assuming *all* of the CPUs have the same
-		 * d-cache and i-cache sizes... -Peter
-		 */
-		if (num_cpus == 1) {
-			const u32 *sizep, *lsizep;
-			u32 size, lsize;
-
-			size = 0;
-			lsize = cur_cpu_spec->dcache_bsize;
-			sizep = of_get_property(np, "d-cache-size", NULL);
-			if (sizep != NULL)
-				size = *sizep;
-			lsizep = of_get_property(np, "d-cache-block-size",
-						 NULL);
-			/* fallback if block size missing */
-			if (lsizep == NULL)
-				lsizep = of_get_property(np,
-							 "d-cache-line-size",
-							 NULL);
-			if (lsizep != NULL)
-				lsize = *lsizep;
-			if (sizep == 0 || lsizep == 0)
-				DBG("Argh, can't find dcache properties ! "
-				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
-
-			ppc64_caches.dsize = size;
-			ppc64_caches.dline_size = lsize;
-			ppc64_caches.log_dline_size = __ilog2(lsize);
-			ppc64_caches.dlines_per_page = PAGE_SIZE / lsize;
-
-			size = 0;
-			lsize = cur_cpu_spec->icache_bsize;
-			sizep = of_get_property(np, "i-cache-size", NULL);
-			if (sizep != NULL)
-				size = *sizep;
-			lsizep = of_get_property(np, "i-cache-block-size",
-						 NULL);
-			if (lsizep == NULL)
-				lsizep = of_get_property(np,
-							 "i-cache-line-size",
-							 NULL);
-			if (lsizep != NULL)
-				lsize = *lsizep;
-			if (sizep == 0 || lsizep == 0)
-				DBG("Argh, can't find icache properties ! "
-				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
-
-			ppc64_caches.isize = size;
-			ppc64_caches.iline_size = lsize;
-			ppc64_caches.log_iline_size = __ilog2(lsize);
-			ppc64_caches.ilines_per_page = PAGE_SIZE / lsize;
-		}
-	}
-
-	DBG(" <- initialize_cache_info()\n");
-}
-
-
-/*
  * Do some initial setup of the system.  The parameters are those which 
  * were passed in from the bootloader.
  */
@@ -365,10 +270,7 @@ void __init setup_system(void)
 	 */
 	unflatten_device_tree();
 
-	/*
-	 * Fill the ppc64_caches & systemcfg structures with informations
- 	 * retrieved from the device-tree.
-	 */
+	/* Fill the powerpc_caches structure with device-tree data */
 	initialize_cache_info();
 
 #ifdef CONFIG_PPC_RTAS
@@ -423,12 +325,10 @@ void __init setup_system(void)
 	printk("-----------------------------------------------------\n");
 	printk("ppc64_pft_size                = 0x%llx\n", ppc64_pft_size);
 	printk("physicalMemorySize            = 0x%llx\n", memblock_phys_mem_size());
-	if (ppc64_caches.dline_size != 0x80)
-		printk("ppc64_caches.dcache_line_size = 0x%x\n",
-		       ppc64_caches.dline_size);
-	if (ppc64_caches.iline_size != 0x80)
-		printk("ppc64_caches.icache_line_size = 0x%x\n",
-		       ppc64_caches.iline_size);
+	if (powerpc_caches.dcache_block_bytes != 0x80)
+		printk("dcache_block_bytes = 0x%x\n", powerpc_caches.dcache_block_bytes);
+	if (powerpc_caches.icache_block_bytes != 0x80)
+		printk("icache_block_bytes = 0x%x\n", powerpc_caches.icache_block_bytes);
 #ifdef CONFIG_PPC_STD_MMU_64
 	if (htab_address)
 		printk("htab_address                  = 0x%p\n", htab_address);
@@ -545,13 +445,7 @@ void __init setup_arch(char **cmdline_p)
 
 	*cmdline_p = cmd_line;
 
-	/*
-	 * Set cache line size based on type of cpu as a default.
-	 * Systems with OF can look in the properties on the cpu node(s)
-	 * for a possibly more accurate value.
-	 */
-	dcache_bsize = ppc64_caches.dline_size;
-	icache_bsize = ppc64_caches.iline_size;
+	initialize_early_cache_info();
 
 	/* reboot on panic */
 	panic_timeout = 180;
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 7d14bb6..4a038fb 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -726,6 +726,7 @@ static int __init vdso_init(void)
 	vdso_data->version.major = SYSTEMCFG_MAJOR;
 	vdso_data->version.minor = SYSTEMCFG_MINOR;
 	vdso_data->processor = mfspr(SPRN_PVR);
+
 	/*
 	 * Fake the old platform number for pSeries and iSeries and add
 	 * in LPAR bit if necessary
@@ -734,29 +735,25 @@ static int __init vdso_init(void)
 	if (firmware_has_feature(FW_FEATURE_LPAR))
 		vdso_data->platform |= 1;
 	vdso_data->physicalMemorySize = memblock_phys_mem_size();
-	vdso_data->dcache_size = ppc64_caches.dsize;
-	vdso_data->dcache_line_size = ppc64_caches.dline_size;
-	vdso_data->icache_size = ppc64_caches.isize;
-	vdso_data->icache_line_size = ppc64_caches.iline_size;
 
-	/* XXXOJN: Blocks should be added to ppc64_caches and used instead */
-	vdso_data->dcache_block_size = ppc64_caches.dline_size;
-	vdso_data->icache_block_size = ppc64_caches.iline_size;
-	vdso_data->dcache_log_block_size = ppc64_caches.log_dline_size;
-	vdso_data->icache_log_block_size = ppc64_caches.log_iline_size;
+	/* There are more cache parameters saved for 64-bit than 32-bit */
+	vdso_data->dcache_size           = powerpc_caches.dcache_total_size;
+	vdso_data->icache_size           = powerpc_caches.icache_total_size;
+	vdso_data->dcache_line_size      = powerpc_caches.dcache_block_bytes;
+	vdso_data->icache_line_size      = powerpc_caches.icache_block_bytes;
 
 	/*
 	 * Calculate the size of the 64 bits vDSO
 	 */
 	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
 	DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
-#else
-	vdso_data->dcache_block_size = L1_CACHE_BYTES;
-	vdso_data->dcache_log_block_size = L1_CACHE_SHIFT;
-	vdso_data->icache_block_size = L1_CACHE_BYTES;
-	vdso_data->icache_log_block_size = L1_CACHE_SHIFT;
-#endif /* CONFIG_PPC64 */
+#endif
 
+	/* Save the cache-block sizes for the VDSO */
+	vdso_data->dcache_block_size     = powerpc_caches.dcache_block_bytes;
+	vdso_data->icache_block_size     = powerpc_caches.icache_block_bytes;
+	vdso_data->dcache_log_block_size = powerpc_caches.dcache_block_shift;
+	vdso_data->icache_log_block_size = powerpc_caches.icache_block_shift;
 
 	/*
 	 * Calculate the size of the 32 bits vDSO
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 53dcb6b..c466977 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -12,17 +12,17 @@
 #include <asm/asm-offsets.h>
 
         .section        ".toc","aw"
-PPC64_CACHES:
-        .tc             ppc64_caches[TC],ppc64_caches
+POWERPC_CACHES:
+        .tc             powerpc_caches[TC],powerpc_caches
         .section        ".text"
 
 _GLOBAL(copy_page)
 	lis	r5,PAGE_SIZE@h
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
-	ld      r10,PPC64_CACHES@toc(r2)
-	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
-	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
+	ld      r10,POWERPC_CACHES@toc(r2)
+	lwz	r11,DCACHE_BLOCK_SHIFT(r10)	/* log2 of cache line size */
+	lwz     r12,DCACHE_BLOCK_BYTES(r10)	/* get cache line size */
 	li	r9,0
 	srd	r8,r5,r11
 
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 991ee81..8ad36a9 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,7 +6,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)	:= -mno-minimal-toc
 
-obj-y				:= fault.o mem.o pgtable.o gup.o \
+obj-y				:= cache.o fault.o mem.o pgtable.o gup.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
diff --git a/arch/powerpc/mm/cache.c b/arch/powerpc/mm/cache.c
new file mode 100644
index 0000000..0fbf2d6
--- /dev/null
+++ b/arch/powerpc/mm/cache.c
@@ -0,0 +1,279 @@
+#include <linux/kprobes.h>
+#include <linux/export.h>
+#include <linux/types.h>
+
+#include <asm/cputable.h>
+#include <asm/system.h>
+#include <asm/cache.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+
+/*
+ * Write any modified data cache blocks out to memory.
+ * Does not invalidate the corresponding cache lines (especially for
+ * any corresponding instruction cache).
+ */
+void clean_dcache_range(unsigned long start, unsigned long stop)
+{
+	unsigned long addr;
+	FOR_EACH_CACHELINE(addr, start, stop, dcache)
+		dcbst(addr);
+	mb();
+}
+
+/*
+ * Write any modified data cache blocks out to memory and invalidate them.
+ * Does not invalidate the corresponding instruction cache blocks.
+ */
+void flush_dcache_range(unsigned long start, unsigned long stop)
+{
+	unsigned long addr;
+	FOR_EACH_CACHELINE(addr, start, stop, dcache)
+		dcbf(addr);
+	mb();
+}
+EXPORT_SYMBOL(flush_dcache_range);
+
+/*
+ * Like above, but invalidate the D-cache.  This is used by the 8xx
+ * to invalidate the cache so the PPC core doesn't get stale data
+ * from the CPM (no cache snooping here :-).
+ *
+ * invalidate_dcache_range(unsigned long start, unsigned long stop)
+ */
+void invalidate_dcache_range(unsigned long start, unsigned long stop)
+{
+	unsigned long addr;
+	FOR_EACH_CACHELINE(addr, start, stop, dcache)
+		dcbi(addr);
+	mb();
+}
+
+/*
+ * Unfortunately, we cannot flush individual chunks of the icache on 44x as
+ * we are passed kmapped addresses and we have a virtually-tagged icache.
+ *
+ * The only workaround is to invalidate the whole icache.
+ *
+ * NOTE: The CPU does not use the operands for this instruction, so
+ *       they are passed as dummies.
+ */
+__kprobes void __flush_icache_range(unsigned long start, unsigned long stop)
+{
+	unsigned long addr;
+
+	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		return;
+
+	/* First ensure that data has been written to memory */
+	FOR_EACH_CACHELINE(addr, start, stop, dcache)
+		dcbst(addr);
+	mb();
+
+#ifdef CONFIG_44x
+	if (mmu_has_feature(MMU_FTR_TYPE_44x)) {
+		asm volatile("iccci 0, r0" ::: "memory");
+		return;
+	}
+#endif
+
+	/* Now discard the corresponding icache */
+	FOR_EACH_CACHELINE(addr, start, stop, icache)
+		icbi(addr);
+	mb();
+	isync();
+}
+EXPORT_SYMBOL(__flush_icache_range);
+
+/*
+ * Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ * This is a no-op on the 601 which has a unified cache.
+ * @page may be any address inside the page; it is rounded down to the base.
+ */
+void __flush_dcache_icache(void *page)
+{
+	unsigned long base = ((unsigned long)page) & ~(PAGE_SIZE-1);
+	unsigned long addr;
+
+	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		return;
+
+	/* First ensure that data has been written to memory */
+	FOR_EACH_CACHELINE(addr, base, base + PAGE_SIZE, dcache)
+		dcbst(addr);
+	mb();	/* order the dcbst stores ahead of icbi and the 44x return */
+
+#ifdef CONFIG_44x
+	/*
+	 * We don't flush the icache on 44x. Those have a virtual icache and
+	 * we don't have access to the virtual address here (it's not the
+	 * page vaddr but where it's mapped in user space). The flushing of
+	 * the icache on these is handled elsewhere, when a change in the
+	 * address space occurs, before returning to user space.
+	 */
+	if (mmu_has_feature(MMU_FTR_TYPE_44x))
+		return;
+#endif
+
+	FOR_EACH_CACHELINE(addr, base, base + PAGE_SIZE, icache)
+		icbi(addr);
+
+	mb();
+	isync();
+}
+
+/*
+ * Clear pages using the dcbz instruction, which doesn't cause any
+ * memory traffic (except to write out any cache lines which get
+ * displaced).  This only works on cacheable memory.
+ *
+ */
+void clear_pages(void *page, int order)
+{
+	unsigned long addr, base = (unsigned long)page;
+	FOR_EACH_CACHELINE(addr, base, base + (PAGE_SIZE << order), dcache)
+		dcbz(addr);
+}
+EXPORT_SYMBOL(clear_pages);
+
+#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
+/*
+ * Flush a particular page from the data cache to RAM, identified
+ * by its physical address.  We turn off the MMU so we can just use
+ * the physical address (this may be a highmem page without a kernel
+ * mapping).
+ */
+void __flush_dcache_icache_phys(unsigned long phys_page)
+{
+	u32 d_size	= powerpc_caches.dcache_block_bytes;
+	u32 i_size	= powerpc_caches.icache_block_bytes;
+	u32 d_per_page	= powerpc_caches.dcache_blocks_per_page;
+	u32 i_per_page	= powerpc_caches.icache_blocks_per_page;
+
+	/* Temporary registers for the ASM to use */
+	unsigned long old_msr, tmp_msr, d_phys_page, i_phys_page;
+
+	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		return;
+
+	/* Page base address (used in 2 different loops) */
+	d_phys_page = i_phys_page = phys_page & ~(PAGE_SIZE - 1);
+
+	/*
+	 * This part needs to be 100% ASM because we disable the MMU, and we
+	 * can't accidentally let some C code go poking at memory while the
+	 * MMU isn't enabled.
+	 *
+	 * NOTE: This looks blatantly unsafe with respect to interrupts.
+	 *       Hopefully all the callers provide sufficient protection?
+	 */
+	asm volatile(
+		/* First disable the MMU */
+		"mfmsr %[old_msr]\n\t"
+		"rlwinm %[tmp_msr], %[old_msr], 0, 28, 26\n\t"
+		"mtmsr %[tmp_msr]\n\t"
+		"isync\n\t"
+
+		/* Clean the data cache */
+		"mtctr %[d_per_page]\n"
+	"0:	dcbst 0, %[d_phys_page]\n\t"
+		"add %[d_phys_page], %[d_phys_page], %[d_size]\n\t"
+		"bdnz 0b\n\t"
+		"sync\n\t"
+
+		/* Invalidate the instruction cache */
+		"mtctr %[i_per_page]\n"
+	"0:	icbi 0, %[i_phys_page]\n\t"
+		"add %[i_phys_page], %[i_phys_page], %[i_size]\n\t"
+		"bdnz 0b\n\t"
+
+		/* Finally, re-enable the MMU */
+		"sync\n\t"
+		"mtmsr %[old_msr]\n\t"
+		"isync\n\t"
+
+		/* Temporary variables and inputs */
+		: [old_msr]    "=&r" (old_msr),
+		  [tmp_msr]    "=&r" (tmp_msr),
+		  [d_phys_page] "=b" (d_phys_page),
+		  [i_phys_page] "=b" (i_phys_page)
+
+		/* Inputs */
+		: [d_size]     "b" (d_size),
+		  [i_size]     "b" (i_size),
+		  [d_per_page] "b" (d_per_page),
+		  [i_per_page] "b" (i_per_page),
+		  "[d_phys_page]"  (d_phys_page),
+		  "[i_phys_page]"  (i_phys_page)
+
+		/* Clobbers */
+		: "memory", "c"
+	);
+}
+#endif /* CONFIG_PPC32 && !CONFIG_BOOKE */
+
+#ifdef CONFIG_PPC64
+/*
+ * Data cache flush that works on non-mapped physical addresses.
+ * Use only for non-LPAR setups ! It also assumes real mode
+ * is cacheable. Used for flushing out the DART before using
+ * it as uncacheable memory 
+ */
+void flush_dcache_phys_range(unsigned long start, unsigned long stop)
+{
+	/* System data cache block size */
+	unsigned long bytes = powerpc_caches.dcache_block_bytes;
+	unsigned long shift = powerpc_caches.dcache_block_shift;
+
+	/* Temporary registers for the ASM to use */
+	unsigned long old_msr, tmp_msr;
+
+	/* Compute a start address and number of cachelines */
+	unsigned long phys_addr = start & ~(bytes - 1);
+	unsigned long nr_lines = ((stop - phys_addr) + (bytes - 1)) >> shift;
+
+	/*
+	 * This part needs to be 100% ASM because we disable the MMU, and we
+	 * can't accidentally let some C code go poking at memory while the
+	 * MMU isn't enabled.
+	 *
+	 * NOTE: This looks blatantly unsafe with respect to interrupts.
+	 *       Hopefully all the callers provide sufficient protection?
+	 */
+	asm volatile(
+		/* First disable the MMU */
+		"mfmsr %[old_msr]\n\t"
+		"rlwinm %[tmp_msr], %[old_msr], 0, 28, 26\n\t"
+		"mtmsr %[tmp_msr]\n\t"
+		"isync\n\t"
+
+		/* Clean the data cache */
+		"mtctr %[nr_lines]\n"
+	"0:	dcbst 0, %[phys_addr]\n\t"
+		"add %[phys_addr], %[phys_addr], %[bytes]\n\t"
+		"bdnz 0b\n\t"
+		"sync\n\t"
+		"isync\n\t"
+
+		/* Finally, re-enable the MMU */
+		"mtmsr %[old_msr]\n\t"
+		"sync\n\t"
+		"isync\n\t"
+
+		/* Temporary variables and inputs */
+		: [old_msr]  "=&r" (old_msr),
+		  [tmp_msr]  "=&r" (tmp_msr),
+		  [phys_addr] "=b" (phys_addr)
+
+		/* Inputs */
+		: [bytes]    "b" (bytes),
+		  [nr_lines] "b" (nr_lines),
+		  "[phys_addr]"  (phys_addr)
+
+		/* Clobbers */
+		: "memory", "c"
+	);
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 329be36..3823f64 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -328,7 +328,7 @@ void __dma_sync(void *vaddr, size_t size, int direction)
 		 * invalidate only when cache-line aligned otherwise there is
 		 * the potential for discarding uncommitted data from the cache
 		 */
-		if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
+		if ((start | size) & (powerpc_caches.dcache_block_bytes - 1))
 			flush_dcache_range(start, end);
 		else
 			invalidate_dcache_range(start, end);
diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S
index 08ab6fe..ac285d9 100644
--- a/arch/powerpc/platforms/52xx/lite5200_sleep.S
+++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S
@@ -394,11 +394,16 @@ restore_regs:
 
 
 /* cache flushing code. copied from arch/ppc/boot/util.S */
-#define NUM_CACHE_LINES (128*8)
+#define NUM_CACHE_LINES ((128 * 8) << (L1_CACHE_SHIFT_MAX - L1_CACHE_SHIFT_MIN))
 
 /*
  * Flush data cache
  * Do this by just reading lots of stuff into the cache.
+ *
+ * NOTE: This does not handle variable-sized cachelines properly, but since
+ *       we are just trying to flush the data cache by reading lots of data,
+ *       this works anyways.  We just make sure we read as many cachelines
+ *       as we could possibly need to overflow the cache on any hardware.
  */
 flush_data_cache:
 	lis	r3,CONFIG_KERNEL_START@h
@@ -407,6 +412,6 @@ flush_data_cache:
 	mtctr	r4
 1:
 	lwz	r4,0(r3)
-	addi	r3,r3,L1_CACHE_BYTES	/* Next line, please */
+	addi	r3,r3,L1_CACHE_BYTES_MIN /* Next line, please */
 	bdnz	1b
 	blr
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index 31a7d3a..8503e38 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -1135,7 +1135,7 @@ int pmac_pci_enable_device_hook(struct pci_dev *dev)
 		pci_write_config_byte(dev, PCI_LATENCY_TIMER, 16);
 
 		pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE,
-				      L1_CACHE_BYTES >> 2);
+				powerpc_caches.dcache_block_bytes >> 2);
 	}
 
 	return 0;
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 03a217a..c537d49 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -26,6 +26,7 @@
 
 #include <asm/ptrace.h>
 #include <asm/string.h>
+#include <asm/cache.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/xmon.h>
@@ -254,16 +255,6 @@ static inline void store_inst(void *p)
 	asm volatile ("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r" (p));
 }
 
-static inline void cflush(void *p)
-{
-	asm volatile ("dcbf 0,%0; icbi 0,%0" : : "r" (p));
-}
-
-static inline void cinval(void *p)
-{
-	asm volatile ("dcbi 0,%0; icbi 0,%0" : : "r" (p));
-}
-
 /*
  * Disable surveillance (the service processor watchdog function)
  * while we are in xmon.
@@ -1513,10 +1504,9 @@ static void prregs(struct pt_regs *fp)
 
 static void cacheflush(void)
 {
-	int cmd;
-	unsigned long nflush;
+	unsigned long nflush, i;
 
-	cmd = inchar();
+	int cmd = inchar();
 	if (cmd != 'i')
 		termch = cmd;
 	scanhex((void *)&adrs);
@@ -1524,23 +1514,30 @@ static void cacheflush(void)
 		termch = 0;
 	nflush = 1;
 	scanhex(&nflush);
-	nflush = (nflush + L1_CACHE_BYTES - 1) / L1_CACHE_BYTES;
-	if (setjmp(bus_error_jmp) == 0) {
-		catch_memory_errors = 1;
-		sync();
 
-		if (cmd != 'i') {
-			for (; nflush > 0; --nflush, adrs += L1_CACHE_BYTES)
-				cflush((void *) adrs);
-		} else {
-			for (; nflush > 0; --nflush, adrs += L1_CACHE_BYTES)
-				cinval((void *) adrs);
-		}
-		sync();
-		/* wait a little while to see if we get a machine check */
-		__delay(200);
+	if (setjmp(bus_error_jmp) != 0) {
+		catch_memory_errors = 0;
+		return;
 	}
-	catch_memory_errors = 0;
+	catch_memory_errors = 1;
+	sync();
+
+	/* First flush/invalidate data caches */
+	if (cmd != 'i') {
+		FOR_EACH_CACHELINE(i, adrs, adrs + nflush, dcache)
+			dcbf(i);
+	} else {
+		FOR_EACH_CACHELINE(i, adrs, adrs + nflush, dcache)
+			dcbi(i);
+	}
+
+	/* Now invalidate instruction caches */
+	FOR_EACH_CACHELINE(i, adrs, adrs + nflush, icache)
+		icbi(i);
+	sync();
+	/* wait a little while for a machine check, then stop catching faults */
+	__delay(200);
+	catch_memory_errors = 0;
 }
 
 static unsigned long
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 116a49c..04ead15 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -136,7 +136,9 @@ static void smu_start_cmd(void)
 	/* Flush command and data to RAM */
 	faddr = (unsigned long)smu->cmd_buf;
 	fend = faddr + smu->cmd_buf->length + 2;
-	flush_inval_dcache_range(faddr, fend);
+	flush_dcache_range(faddr, fend);
+	mb();
+	isync();
 
 
 	/* We also disable NAP mode for the duration of the command
@@ -198,7 +200,9 @@ static irqreturn_t smu_db_intr(int irq, void *arg)
 		 * reply length (it's only 2 cache lines anyway)
 		 */
 		faddr = (unsigned long)smu->cmd_buf;
-		flush_inval_dcache_range(faddr, faddr + 256);
+		flush_dcache_range(faddr, faddr + 256);
+		mb();
+		isync();
 
 		/* Now check ack */
 		ack = (~cmd->cmd) & 0xff;
-- 
1.7.2.5

^ permalink raw reply related

* [RFC PATCH v5 5/9] fadump: Convert firmware-assisted cpu state dump data into elf notes.
From: Mahesh J Salgaonkar @ 2011-11-15 15:14 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111115151145.16533.16384.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

When registered for firmware assisted dump on powerpc, firmware preserves
the registers for the active CPUs during a system crash. This patch reads
the cpu register data stored in Firmware-assisted dump format (except for
crashing cpu) and converts it into elf notes and updates the PT_NOTE program
header accordingly. The exact register state for crashing cpu is saved to
fadump crash info structure in scratch area during crash_fadump() and read
during second kernel boot.

Change in v5:
- Added 'fadump_' prefix to all static function defined.

Change in v4:
- Fixes an issue where memblock_free() is invoked from build_cpu_notes()
  function during error_out path. Invoke cpu_notes_buf_free() in error_out
  path instead of memblock_free().

Change in v2:
- Moved the crash_fadump() invocation from generic code to panic notifier.
- Introduced cpu_notes_buf_alloc() function to allocate cpu notes buffer
  using get_free_pages(). The reason is, with the use of subsys_initcall
  the setup_fadump() is now called after mem_init(). Hence use of
  get_free_pages() to allocate memory is more appropriate than using
  memblock_alloc().

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h  |   44 +++++
 arch/powerpc/kernel/fadump.c       |  314 ++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/setup-common.c |    6 +
 arch/powerpc/kernel/traps.c        |    3 
 4 files changed, 365 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index c022d5c..72908e3 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -63,6 +63,18 @@
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG	0x2000
 
+#define FADUMP_CPU_ID_MASK	((1UL << 32) - 1)
+
+#define CPU_UNKNOWN		(~((u32)0))
+
+/* Advance reg_entry (an lvalue) to just past the next "CPUEND" entry. */
+#define SKIP_TO_NEXT_CPU(reg_entry)			\
+({							\
+	while ((reg_entry)->reg_id != REG_ID("CPUEND"))	\
+		(reg_entry)++;				\
+	(reg_entry)++;					\
+})
+
 /* Kernel Dump section info */
 struct fadump_section {
 	u32	request_flag;
@@ -117,6 +129,9 @@ struct fw_dump {
 	unsigned long	reserve_bootvar;
 
 	unsigned long	fadumphdr_addr;
+	unsigned long	cpu_notes_buf;
+	unsigned long	cpu_notes_buf_size;
+
 	int		ibm_configure_kernel_dump;
 
 	unsigned long	fadump_enabled:1;
@@ -141,13 +156,40 @@ static inline u64 str_to_u64(const char *str)
 	return val;
 }
 #define STR_TO_HEX(x)	str_to_u64(x)
+#define REG_ID(x)	str_to_u64(x)
 
 #define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
+#define REGSAVE_AREA_MAGIC		STR_TO_HEX("REGSAVE")
+
+/* The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND".
+ */
+
+/* Register save area header. */
+struct fadump_reg_save_area_header {
+	u64		magic_number;
+	u32		version;
+	u32		num_cpu_offset;
+};
+
+/* Register entry. */
+struct fadump_reg_entry {
+	u64		reg_id;
+	u64		reg_value;
+};
 
 /* fadump crash info structure */
 struct fadump_crash_info_header {
 	u64		magic_number;
 	u64		elfcorehdr_addr;
+	u32		crashing_cpu;
+	struct pt_regs	regs;
+	struct cpumask	cpu_online_mask;
 };
 
 /* Crash memory ranges */
@@ -163,7 +205,9 @@ extern int early_init_dt_scan_fw_dump(unsigned long node,
 extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
+extern void crash_fadump(struct pt_regs *, const char *);
 #else	/* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
+static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 #endif
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 082f85a..1879ddf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -244,6 +244,7 @@ static unsigned long get_fadump_area_size(void)
 	size += fw_dump.boot_memory_size;
 	size += sizeof(struct fadump_crash_info_header);
 	size += sizeof(struct elfhdr); /* ELF core header.*/
+	size += sizeof(struct elf_phdr); /* place holder for cpu notes */
 	/* Program headers for crash memory regions. */
 	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
@@ -397,6 +398,285 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 	}
 }
 
+void crash_fadump(struct pt_regs *regs, const char *str)
+{
+	struct fadump_crash_info_header *fdh = NULL;
+
+	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+		return;
+
+	fdh = __va(fw_dump.fadumphdr_addr);
+	crashing_cpu = smp_processor_id();
+	fdh->crashing_cpu = crashing_cpu;
+	crash_save_vmcoreinfo();
+
+	if (regs)
+		fdh->regs = *regs;
+	else
+		ppc_save_regs(&fdh->regs);
+
+	fdh->cpu_online_mask = *cpu_online_mask;
+
+	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
+	rtas_os_term((char *)str);
+}
+
+#define GPR_MASK	0xffffff0000000000
+static inline int fadump_gpr_index(u64 id)
+{
+	int i = -1;
+	char str[3];
+
+	if ((id & GPR_MASK) == REG_ID("GPR")) {
+		/* get the digits at the end */
+		id &= ~GPR_MASK;
+		id >>= 24;
+		str[2] = '\0';
+		str[1] = id & 0xff;
+		str[0] = (id >> 8) & 0xff;
+		sscanf(str, "%d", &i);
+		if (i > 31)
+			i = -1;
+	}
+	return i;
+}
+
+static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
+								u64 reg_val)
+{
+	int i;
+
+	i = fadump_gpr_index(reg_id);
+	if (i >= 0)
+		regs->gpr[i] = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("NIA"))
+		regs->nip = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("MSR"))
+		regs->msr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CTR"))
+		regs->ctr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("LR"))
+		regs->link = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("XER"))
+		regs->xer = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CR"))
+		regs->ccr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DAR"))
+		regs->dar = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DSISR"))
+		regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct fadump_reg_entry*
+fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	while (reg_entry->reg_id != REG_ID("CPUEND")) {
+		fadump_set_regval(regs, reg_entry->reg_id,
+					reg_entry->reg_value);
+		reg_entry++;
+	}
+	reg_entry++;
+	return reg_entry;
+}
+
+static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type,
+						void *data, size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void fadump_final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+	struct elf_prstatus prstatus;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	/*
+	 * FIXME: How do i get PID? Do I really need it?
+	 * prstatus.pr_pid = ????
+	 */
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+	return buf;
+}
+
+/*
+ * Point the placeholder PT_NOTE program header, which immediately follows
+ * the ELF header at @bufp, at the cpu notes buffer recorded in fw_dump.
+ * Nothing is changed if that first program header is not a PT_NOTE.
+ */
+static void fadump_update_elfcore_header(char *bufp)
+{
+	struct elf_phdr *phdr;
+
+	/* First note is a place holder for cpu notes info. */
+	phdr = (struct elf_phdr *)(bufp + sizeof(struct elfhdr));
+
+	if (phdr->p_type == PT_NOTE) {
+		phdr->p_paddr	= fw_dump.cpu_notes_buf;
+		phdr->p_offset	= phdr->p_paddr;
+		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
+		phdr->p_memsz	= fw_dump.cpu_notes_buf_size;
+	}
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+	void *vaddr;
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!vaddr)
+		return NULL;
+
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		SetPageReserved(page + i);
+	return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+	struct fadump_reg_save_area_header *reg_header;
+	struct fadump_reg_entry *reg_entry;
+	struct fadump_crash_info_header *fdh = NULL;
+	void *vaddr;
+	u32 num_cpus, *note_buf;
+	struct pt_regs regs;
+	int i, rc = 0, cpu = 0;
+
+	if (!fdm->cpu_state_data.bytes_dumped)
+		return -EINVAL;
+
+	vaddr = __va(fdm->cpu_state_data.destination_address);
+
+	reg_header = vaddr;
+	if (reg_header->magic_number != REGSAVE_AREA_MAGIC) {
+		printk(KERN_ERR "Unable to read register save area.\n");
+		return -ENOENT;
+	}
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", reg_header->magic_number);
+	pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset);
+
+	vaddr += reg_header->num_cpu_offset;
+	num_cpus = *((u32 *)(vaddr));
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct fadump_reg_entry *)vaddr;
+
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+	if (!note_buf) {
+		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fw_dump.cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+			(num_cpus * sizeof(note_buf_t)), note_buf);
+
+	/* fdh stays NULL without a crash info header; guard every use below */
+	if (fw_dump.fadumphdr_addr)
+		fdh = __va(fw_dump.fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (reg_entry->reg_id != REG_ID("CPUSTRT")) {
+			printk(KERN_ERR "Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK;
+		if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+			SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = fadump_read_registers(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	fadump_final_note(note_buf);
+	if (!fdh)
+		return 0;
+
+	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+							fdh->elfcorehdr_addr);
+	fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+	return 0;
+
+error_out:
+	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+	return rc;
+}
+
 /*
  * Validate and process the dump data stored by firmware before exporting
  * it through '/proc/vmcore'.
@@ -404,18 +684,21 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
 {
 	struct fadump_crash_info_header *fdh;
+	int rc = 0;
 
 	if (!fdm_active || !fw_dump.fadumphdr_addr)
 		return -EINVAL;
 
 	/* Check if the dump data is valid. */
 	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+			(fdm_active->cpu_state_data.error_flags != 0) ||
 			(fdm_active->rmr_region.error_flags != 0)) {
 		printk(KERN_ERR "Dump taken by platform is not valid\n");
 		return -EINVAL;
 	}
-	if (fdm_active->rmr_region.bytes_dumped !=
-			fdm_active->rmr_region.source_len) {
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
 		printk(KERN_ERR "Dump taken by platform is incomplete\n");
 		return -EINVAL;
 	}
@@ -427,6 +710,10 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
 		return -EINVAL;
 	}
 
+	rc = fadump_build_cpu_notes(fdm_active);
+	if (rc)
+		return rc;
+
 	/*
 	 * We are done validating dump info and elfcore header is now ready
 	 * to be exported. set elfcorehdr_addr so that vmcore module will
@@ -541,6 +828,27 @@ static int fadump_create_elfcore_headers(char *bufp)
 	elf = (struct elfhdr *)bufp;
 	bufp += sizeof(struct elfhdr);
 
+	/*
+	 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
+	 * will be populated during second kernel boot after crash. Hence
+	 * this PT_NOTE will always be the first elf note.
+	 *
+	 * NOTE: Any new ELF note addition should be placed after this note.
+	 */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type = PT_NOTE;
+	phdr->p_flags = 0;
+	phdr->p_vaddr = 0;
+	phdr->p_align = 0;
+
+	phdr->p_offset = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = 0;
+	phdr->p_memsz = 0;
+
+	(elf->e_phnum)++;
+
 	/* setup PT_LOAD sections. */
 
 	for (i = 0; i < crash_mem_ranges; i++) {
@@ -592,6 +900,8 @@ static unsigned long init_fadump_header(unsigned long addr)
 	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
 	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
 	fdh->elfcorehdr_addr = addr;
+	/* We will set the crashing cpu id in crash_fadump() during crash. */
+	fdh->crashing_cpu = CPU_UNKNOWN;
 
 	return addr;
 }
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index b1d738d..87d2465 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -61,6 +61,7 @@
 #include <asm/xmon.h>
 #include <asm/cputhreads.h>
 #include <mm/mmu_decl.h>
+#include <asm/fadump.h>
 
 #include "setup.h"
 
@@ -639,6 +640,11 @@ EXPORT_SYMBOL(check_legacy_ioport);
 static int ppc_panic_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
 {
+	/*
+	 * If firmware-assisted dump has been registered then trigger
+	 * firmware-assisted dump and let firmware handle everything else.
+	 */
+	crash_fadump(NULL, ptr);
 	ppc_md.panic(ptr);  /* May not return */
 	return NOTIFY_DONE;
 }
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f19d977..c2ba6ed 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -57,6 +57,7 @@
 #include <asm/kexec.h>
 #include <asm/ppc-opcode.h>
 #include <asm/rio.h>
+#include <asm/fadump.h>
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 int (*__debugger)(struct pt_regs *regs) __read_mostly;
@@ -160,6 +161,8 @@ int die(const char *str, struct pt_regs *regs, long err)
 	add_taint(TAINT_DIE);
 	raw_spin_unlock_irqrestore(&die.lock, flags);
 
+	crash_fadump(regs, str);
+
 	if (kexec_should_crash(current) ||
 		kexec_sr_activated(smp_processor_id()))
 		crash_kexec(regs);

^ permalink raw reply related

* [RFC PATCH v5 4/9] fadump: Initialize elfcore header and add PT_LOAD program headers.
From: Mahesh J Salgaonkar @ 2011-11-15 15:13 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111115151145.16533.16384.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Build the crash memory range list by traversing through system memory during
the first kernel before we register for firmware-assisted dump. After the
successful dump registration, initialize the elfcore header and populate
PT_LOAD program headers with crash memory ranges. The elfcore header is
saved in the scratch area within the reserved memory. The scratch area starts
at the end of the memory reserved for saving RMR region contents. The
scratch area contains fadump crash info structure that contains magic number
for fadump validation and physical address where the elfcore header can be
found. This structure will also be used to pass some important crash info
data to the second kernel which will help second kernel to populate ELF core
header with correct data before it gets exported through /proc/vmcore. Since
the firmware preserves the entire partition memory at the time of crash, the
contents of the scratch area will be preserved till second kernel boot.

NOTE: The current design implementation does not address a possibility of
introducing additional fields (in future) to this structure without affecting
compatibility. It's on TODO list to come up with better approach to
address this.

Reserved dump area start => +-------------------------------------+
                            |  CPU state dump data                |
                            +-------------------------------------+
                            |  HPTE region data                   |
                            +-------------------------------------+
                            |  RMR region data                    |
Scratch area start       => +-------------------------------------+
                            |  fadump crash info structure {      |
                            |     magic number                    |
                     +------|---- elfcorehdr_addr                 |
                     |      |  }                                  |
                     +----> +-------------------------------------+
                            |  ELF core header                    |
Reserved dump area end   => +-------------------------------------+

Change in v5:
- Added 'fadump_' prefix to all static functions defined.

Change in v4:
- Move the init_elfcore_header() function and 'memblock_num_regions' macro
  from generic code to power specific code as these are used only by
  firmware assisted dump implementation which is power specific feature.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |   43 +++++++
 arch/powerpc/kernel/fadump.c      |  233 +++++++++++++++++++++++++++++++++++++
 2 files changed, 275 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index c2951b2..c022d5c 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -41,6 +41,12 @@
 #define MIN_BOOT_MEM	(((RMR_END < (0x1UL << 28)) ? (0x1UL << 28) : RMR_END) \
 			+ (0x1UL << 26))
 
+#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS 0
+#endif
+
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA	0x0001
 #define FADUMP_HPTE_REGION	0x0002
@@ -54,6 +60,9 @@
 #define FADUMP_UNREGISTER	2
 #define FADUMP_INVALIDATE	3
 
+/* Dump status flag */
+#define FADUMP_ERROR_FLAG	0x2000
+
 /* Kernel Dump section info */
 struct fadump_section {
 	u32	request_flag;
@@ -107,6 +116,7 @@ struct fw_dump {
 	/* cmd line option during boot */
 	unsigned long	reserve_bootvar;
 
+	unsigned long	fadumphdr_addr;
 	int		ibm_configure_kernel_dump;
 
 	unsigned long	fadump_enabled:1;
@@ -115,6 +125,39 @@ struct fw_dump {
 	unsigned long	dump_registered:1;
 };
 
+/*
+ * Copy the ascii values for first 8 characters from a string into u64
+ * variable at their respective indexes.
+ * e.g.
+ *  The string "FADMPINF" will be converted into 0x4641444d50494e46
+ */
+static inline u64 str_to_u64(const char *str)
+{
+	u64 val = 0;
+	int i;
+
+	for (i = 0; i < sizeof(val); i++)
+		val = (*str) ? (val << 8) | *str++ : val << 8;
+	return val;
+}
+#define STR_TO_HEX(x)	str_to_u64(x)
+
+#define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
+
+/* fadump crash info structure */
+struct fadump_crash_info_header {
+	u64		magic_number;
+	u64		elfcorehdr_addr;
+};
+
+/* Crash memory ranges */
+#define INIT_CRASHMEM_RANGES	(INIT_MEMBLOCK_REGIONS + 2)
+
+struct fad_crash_memory_ranges {
+	unsigned long long	base;
+	unsigned long long	size;
+};
+
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 15f4751..082f85a 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -32,6 +32,7 @@
 #include <linux/delay.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -53,6 +54,8 @@ static struct fadump_mem_struct fdm;
 static const struct fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+int crash_mem_ranges;
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
@@ -239,6 +242,10 @@ static unsigned long get_fadump_area_size(void)
 	size += fw_dump.cpu_state_data_size;
 	size += fw_dump.hpte_region_size;
 	size += fw_dump.boot_memory_size;
+	size += sizeof(struct fadump_crash_info_header);
+	size += sizeof(struct elfhdr); /* ELF core header.*/
+	/* Program headers for crash memory regions. */
+	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
 	size = PAGE_ALIGN(size);
 	return size;
@@ -304,6 +311,12 @@ int __init fadump_reserve_mem(void)
 				"for saving crash dump\n",
 				(unsigned long)(size >> 20),
 				(unsigned long)(base >> 20));
+
+		fw_dump.fadumphdr_addr =
+				fdm_active->rmr_region.destination_address +
+				fdm_active->rmr_region.source_len;
+		pr_debug("fadumphdr_addr = %p\n",
+				(void *) fw_dump.fadumphdr_addr);
 	} else {
 		/* Reserve the memory at the top of memory. */
 		size = get_fadump_area_size();
@@ -384,8 +397,210 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 	}
 }
 
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!fdm_active || !fw_dump.fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
+	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+			(fdm_active->rmr_region.error_flags != 0)) {
+		printk(KERN_ERR "Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if (fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) {
+		printk(KERN_ERR "Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fw_dump.fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		printk(KERN_ERR "Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
+}
+
+static inline void fadump_add_crash_memory(unsigned long long base,
+					unsigned long long end)
+{
+	if (base == end)
+		return;
+
+	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		crash_mem_ranges, base, end - 1, (end - base));
+	crash_memory_ranges[crash_mem_ranges].base = base;
+	crash_memory_ranges[crash_mem_ranges].size = end - base;
+	crash_mem_ranges++;
+}
+
+static void fadump_exclude_reserved_area(unsigned long long start,
+					unsigned long long end)
+{
+	unsigned long long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	if ((ra_start < end) && (ra_end > start)) {
+		if ((start < ra_start) && (end > ra_end)) {
+			fadump_add_crash_memory(start, ra_start);
+			fadump_add_crash_memory(ra_end, end);
+		} else if (start < ra_start) {
+			fadump_add_crash_memory(start, ra_start);
+		} else if (ra_end < end) {
+			fadump_add_crash_memory(ra_end, end);
+		}
+	} else
+		fadump_add_crash_memory(start, end);
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = ELF_CORE_EFLAGS;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	return 0;
+}
+
+/*
+ * Traverse through memblock structure and setup crash memory ranges. These
+ * ranges will be used to create PT_LOAD program headers in elfcore header.
+ */
+static void fadump_setup_crash_memory_ranges(void)
+{
+	struct memblock_region *reg;
+	unsigned long long start, end;
+
+	pr_debug("Setup crash memory ranges.\n");
+	crash_mem_ranges = 0;
+	/*
+	 * add the first memory chunk (RMR_START through boot_memory_size) as
+	 * a separate memory chunk. The reason is that, at the time of crash,
+	 * firmware will move the content of this memory chunk to a different
+	 * location specified during fadump registration. We need to create a
+	 * separate program header for this chunk with the correct offset.
+	 */
+	fadump_add_crash_memory(RMR_START, fw_dump.boot_memory_size);
+
+	for_each_memblock(memory, reg) {
+		start = (unsigned long long)reg->base;
+		end = start + (unsigned long long)reg->size;
+		if (start == RMR_START && end >= fw_dump.boot_memory_size)
+			start = fw_dump.boot_memory_size;
+
+		/* add this range excluding the reserved dump area. */
+		fadump_exclude_reserved_area(start, end);
+	}
+}
+
+static int fadump_create_elfcore_headers(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+	int i;
+
+	fadump_init_elfcore_header(bufp);
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* setup PT_LOAD sections. */
+
+	for (i = 0; i < crash_mem_ranges; i++) {
+		unsigned long long mbase, msize;
+		mbase = crash_memory_ranges[i].base;
+		msize = crash_memory_ranges[i].size;
+
+		if (!msize)
+			continue;
+
+		phdr = (struct elf_phdr *)bufp;
+		bufp += sizeof(struct elf_phdr);
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= mbase;
+
+		if (mbase == RMR_START) {
+			/*
+			 * The entire RMR region will be moved by firmware
+			 * to the specified destination_address. Hence set
+			 * the correct offset.
+			 */
+			phdr->p_offset = fdm.rmr_region.destination_address;
+		}
+
+		phdr->p_paddr = mbase;
+		phdr->p_vaddr = (unsigned long)__va(mbase);
+		phdr->p_filesz = msize;
+		phdr->p_memsz = msize;
+		phdr->p_align = 0;
+
+		/* Increment number of program headers. */
+		(elf->e_phnum)++;
+	}
+	return 0;
+}
+
+static unsigned long init_fadump_header(unsigned long addr)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!addr)
+		return 0;
+
+	fw_dump.fadumphdr_addr = addr;
+	fdh = __va(addr);
+	addr += sizeof(struct fadump_crash_info_header);
+
+	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+	fdh->elfcorehdr_addr = addr;
+
+	return addr;
+}
+
 static void register_fadump(void)
 {
+	unsigned long addr;
+	void *vaddr;
+
 	/*
 	 * If no memory is reserved then we can not register for firmware-
 	 * assisted dump.
@@ -393,6 +608,16 @@ static void register_fadump(void)
 	if (!fw_dump.reserve_dump_area_size)
 		return;
 
+	fadump_setup_crash_memory_ranges();
+
+	addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len;
+	/* Initialize fadump crash info header. */
+	addr = init_fadump_header(addr);
+	vaddr = __va(addr);
+
+	pr_debug("Creating ELF core headers at %#016lx\n", addr);
+	fadump_create_elfcore_headers(vaddr);
+
 	/* register the future kernel dump with firmware. */
 	register_fw_dump(&fdm);
 }
@@ -586,8 +811,14 @@ int __init setup_fadump(void)
 	}
 
 	fadump_show_config();
+	/*
+	 * If dump data is available then see if it is valid and prepare for
+	 * saving it to the disk.
+	 */
+	if (fw_dump.dump_active)
+		process_fadump(fdm_active);
 	/* Initialize the kernel dump memory structure for FAD registration. */
-	if (fw_dump.reserve_dump_area_size)
+	else if (fw_dump.reserve_dump_area_size)
 		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
 	fadump_init_files();
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox