LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] Add cpufreq driver for Momentum Maple boards
From: Dmitry Eremin-Solenikov @ 2011-06-17  8:49 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: davej, paulus, cpufreq
In-Reply-To: <1308284088.32158.0.camel@pasglop>

Add simple cpufreq driver for Maple-based boards (ppc970fx evaluation
kit and others). Driver is based on a cpufreq driver for 64-bit powermac
boxes with all pmac-dependant features removed and simple cleanup
applied.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
---
 arch/powerpc/kernel/misc_64.S          |    4 +-
 arch/powerpc/platforms/Kconfig         |    8 +
 arch/powerpc/platforms/maple/Makefile  |    1 +
 arch/powerpc/platforms/maple/cpufreq.c |  317 ++++++++++++++++++++++++++++++++
 4 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/maple/cpufreq.c

diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index e89df59..616921e 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -339,7 +339,7 @@ _GLOBAL(real_205_writeb)
 #endif /* CONFIG_PPC_PASEMI */
 
 
-#ifdef CONFIG_CPU_FREQ_PMAC64
+#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE)
 /*
  * SCOM access functions for 970 (FX only for now)
  *
@@ -408,7 +408,7 @@ _GLOBAL(scom970_write)
 	/* restore interrupts */
 	mtmsrd	r5,1
 	blr
-#endif /* CONFIG_CPU_FREQ_PMAC64 */
+#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */
 
 
 /*
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f970ca2..96ab3ab 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -204,6 +204,14 @@ config PPC_PASEMI_CPUFREQ
 	  This adds the support for frequency switching on PA Semi
 	  PWRficient processors.
 
+config CPU_FREQ_MAPLE
+	bool "Support for Maple 970FX Evaluation Board"
+	depends on PPC_MAPLE
+	select CPU_FREQ_TABLE
+	help
+	  This adds support for frequency switching on Maple 970FX
+	  Evaluation Board and compatible boards (IBM JS2x blades).
+
 endmenu
 
 config PPC601_SYNC_FIX
diff --git a/arch/powerpc/platforms/maple/Makefile b/arch/powerpc/platforms/maple/Makefile
index 1be1a99..0b3e3e3 100644
--- a/arch/powerpc/platforms/maple/Makefile
+++ b/arch/powerpc/platforms/maple/Makefile
@@ -1 +1,2 @@
 obj-y	+= setup.o pci.o time.o
+obj-$(CONFIG_CPU_FREQ_MAPLE) += cpufreq.o
diff --git a/arch/powerpc/platforms/maple/cpufreq.c b/arch/powerpc/platforms/maple/cpufreq.c
new file mode 100644
index 0000000..1f706c1
--- /dev/null
+++ b/arch/powerpc/platforms/maple/cpufreq.c
@@ -0,0 +1,317 @@
+/*
+ *  Copyright (C) 2011 Dmitry Eremin-Solenikov
+ *  Copyright (C) 2002 - 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *  and                       Markus Demleitner <msdemlei@cl.uni-heidelberg.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This driver adds basic cpufreq support for SMU & 970FX based G5 Macs,
+ * that is iMac G5 and latest single CPU desktop.
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/cpufreq.h>
+#include <linux/init.h>
+#include <linux/completion.h>
+#include <linux/mutex.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/sections.h>
+#include <asm/cputable.h>
+#include <asm/time.h>
+
+#define DBG(fmt...) pr_debug(fmt)
+
+/* see 970FX user manual */
+
+#define SCOM_PCR 0x0aa001			/* PCR scom addr */
+
+#define PCR_HILO_SELECT		0x80000000U	/* 1 = PCR, 0 = PCRH */
+#define PCR_SPEED_FULL		0x00000000U	/* 1:1 speed value */
+#define PCR_SPEED_HALF		0x00020000U	/* 1:2 speed value */
+#define PCR_SPEED_QUARTER	0x00040000U	/* 1:4 speed value */
+#define PCR_SPEED_MASK		0x000e0000U	/* speed mask */
+#define PCR_SPEED_SHIFT		17
+#define PCR_FREQ_REQ_VALID	0x00010000U	/* freq request valid */
+#define PCR_VOLT_REQ_VALID	0x00008000U	/* volt request valid */
+#define PCR_TARGET_TIME_MASK	0x00006000U	/* target time */
+#define PCR_STATLAT_MASK	0x00001f00U	/* STATLAT value */
+#define PCR_SNOOPLAT_MASK	0x000000f0U	/* SNOOPLAT value */
+#define PCR_SNOOPACC_MASK	0x0000000fU	/* SNOOPACC value */
+
+#define SCOM_PSR 0x408001			/* PSR scom addr */
+/* warning: PSR is a 64 bits register */
+#define PSR_CMD_RECEIVED	0x2000000000000000U   /* command received */
+#define PSR_CMD_COMPLETED	0x1000000000000000U   /* command completed */
+#define PSR_CUR_SPEED_MASK	0x0300000000000000U   /* current speed */
+#define PSR_CUR_SPEED_SHIFT	(56)
+
+/*
+ * The G5 only supports two frequencies (Quarter speed is not supported)
+ */
+#define CPUFREQ_HIGH                  0
+#define CPUFREQ_LOW                   1
+
+static struct cpufreq_frequency_table maple_cpu_freqs[] = {
+	{CPUFREQ_HIGH, 		0},
+	{CPUFREQ_LOW,		0},
+	{0,			CPUFREQ_TABLE_END},
+};
+
+static struct freq_attr* maple_cpu_freqs_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+/* Power mode data is an array of the 32 bits PCR values to use for
+ * the various frequencies, retrieved from the device-tree
+ */
+static int maple_pmode_cur;
+
+static DEFINE_MUTEX(maple_switch_mutex);
+
+static const u32 *maple_pmode_data;
+static int maple_pmode_max;
+
+/*
+ * Fake voltage switching for platforms with missing support
+ */
+
+static void maple_dummy_switch_volt(int speed_mode)
+{
+}
+
+/*
+ * SCOM based frequency switching for 970FX rev3
+ */
+static int maple_scom_switch_freq(int speed_mode)
+{
+	unsigned long flags;
+	int to;
+
+	/* If frequency is going up, first ramp up the voltage */
+	if (speed_mode < maple_pmode_cur)
+		maple_dummy_switch_volt(speed_mode);
+
+	local_irq_save(flags);
+
+	/* Clear PCR high */
+	scom970_write(SCOM_PCR, 0);
+	/* Clear PCR low */
+	scom970_write(SCOM_PCR, PCR_HILO_SELECT | 0);
+	/* Set PCR low */
+	scom970_write(SCOM_PCR, PCR_HILO_SELECT |
+		      maple_pmode_data[speed_mode]);
+
+	/* Wait for completion */
+	for (to = 0; to < 10; to++) {
+		unsigned long psr = scom970_read(SCOM_PSR);
+
+		if ((psr & PSR_CMD_RECEIVED) == 0 &&
+		    (((psr >> PSR_CUR_SPEED_SHIFT) ^
+		      (maple_pmode_data[speed_mode] >> PCR_SPEED_SHIFT)) & 0x3)
+		    == 0)
+			break;
+		if (psr & PSR_CMD_COMPLETED)
+			break;
+		udelay(100);
+	}
+
+	local_irq_restore(flags);
+
+	/* If frequency is going down, last ramp the voltage */
+	if (speed_mode > maple_pmode_cur)
+		maple_dummy_switch_volt(speed_mode);
+
+	maple_pmode_cur = speed_mode;
+	ppc_proc_freq = maple_cpu_freqs[speed_mode].frequency * 1000ul;
+
+	return 0;
+}
+
+static int maple_scom_query_freq(void)
+{
+	unsigned long psr = scom970_read(SCOM_PSR);
+	int i;
+
+	for (i = 0; i <= maple_pmode_max; i++)
+		if ((((psr >> PSR_CUR_SPEED_SHIFT) ^
+		      (maple_pmode_data[i] >> PCR_SPEED_SHIFT)) & 0x3) == 0)
+			break;
+	return i;
+}
+
+/*
+ * Common interface to the cpufreq core
+ */
+
+static int maple_cpufreq_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, maple_cpu_freqs);
+}
+
+static int maple_cpufreq_target(struct cpufreq_policy *policy,
+	unsigned int target_freq, unsigned int relation)
+{
+	unsigned int newstate = 0;
+	struct cpufreq_freqs freqs;
+	int rc;
+
+	if (cpufreq_frequency_table_target(policy, maple_cpu_freqs,
+			target_freq, relation, &newstate))
+		return -EINVAL;
+
+	if (maple_pmode_cur == newstate)
+		return 0;
+
+	mutex_lock(&maple_switch_mutex);
+
+	freqs.old = maple_cpu_freqs[maple_pmode_cur].frequency;
+	freqs.new = maple_cpu_freqs[newstate].frequency;
+	freqs.cpu = 0;
+
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	rc = maple_scom_switch_freq(newstate);
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+
+	mutex_unlock(&maple_switch_mutex);
+
+	return rc;
+}
+
+static unsigned int maple_cpufreq_get_speed(unsigned int cpu)
+{
+	return maple_cpu_freqs[maple_pmode_cur].frequency;
+}
+
+static int maple_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	policy->cpuinfo.transition_latency = 12000;
+	policy->cur = maple_cpu_freqs[maple_scom_query_freq()].frequency;
+	/* secondary CPUs are tied to the primary one by the
+	 * cpufreq core if in the secondary policy we tell it that
+	 * it actually must be one policy together with all others. */
+	cpumask_copy(policy->cpus, cpu_online_mask);
+	cpufreq_frequency_table_get_attr(maple_cpu_freqs, policy->cpu);
+
+	return cpufreq_frequency_table_cpuinfo(policy,
+		maple_cpu_freqs);
+}
+
+
+static struct cpufreq_driver maple_cpufreq_driver = {
+	.name		= "maple",
+	.owner		= THIS_MODULE,
+	.flags		= CPUFREQ_CONST_LOOPS,
+	.init		= maple_cpufreq_cpu_init,
+	.verify		= maple_cpufreq_verify,
+	.target		= maple_cpufreq_target,
+	.get		= maple_cpufreq_get_speed,
+	.attr		= maple_cpu_freqs_attr,
+};
+
+static int __init maple_cpufreq_init(void)
+{
+	struct device_node *cpus;
+	struct device_node *cpunode;
+	unsigned int psize;
+	unsigned long max_freq;
+	const u32 *valp;
+	u32 pvr_hi;
+	int rc = -ENODEV;
+
+	cpus = of_find_node_by_path("/cpus");
+	if (cpus == NULL) {
+		DBG("No /cpus node !\n");
+		return -ENODEV;
+	}
+
+	/* Get first CPU node */
+	for (cpunode = NULL;
+	     (cpunode = of_get_next_child(cpus, cpunode)) != NULL;) {
+		const u32 *reg = of_get_property(cpunode, "reg", NULL);
+		if (reg == NULL || (*reg) != 0)
+			continue;
+		if (!strcmp(cpunode->type, "cpu"))
+			break;
+	}
+	if (cpunode == NULL) {
+		printk(KERN_ERR "cpufreq: Can't find any CPU 0 node\n");
+		goto bail_cpus;
+	}
+
+	/* Check 970FX for now */
+	/* we actually don't care on which CPU to access PVR */
+	pvr_hi = PVR_VER(mfspr(SPRN_PVR));
+	if (pvr_hi != 0x3c && pvr_hi != 0x44) {
+		printk(KERN_ERR "cpufreq: Unsupported CPU version (%x)\n", pvr_hi);
+		goto bail_noprops;
+	}
+
+	/* Look for the powertune data in the device-tree */
+	maple_pmode_data = of_get_property(cpunode, "power-mode-data",&psize);
+	if (!maple_pmode_data) {
+		DBG("No power-mode-data !\n");
+		goto bail_noprops;
+	}
+	maple_pmode_max = psize / sizeof(u32) - 1;
+
+	/*
+	 * From what I see, clock-frequency is always the maximal frequency.
+	 * The current driver can not slew sysclk yet, so we really only deal
+	 * with powertune steps for now. We also only implement full freq and
+	 * half freq in this version. So far, I haven't yet seen a machine
+	 * supporting anything else.
+	 */
+	valp = of_get_property(cpunode, "clock-frequency", NULL);
+	if (!valp)
+		return -ENODEV;
+	max_freq = (*valp)/1000;
+	maple_cpu_freqs[0].frequency = max_freq;
+	maple_cpu_freqs[1].frequency = max_freq/2;
+
+	/* Force apply current frequency to make sure everything is in
+	 * sync (voltage is right for example). Firmware may leave us with
+	 * a strange setting ...
+	 */
+	maple_dummy_switch_volt(CPUFREQ_HIGH);
+	msleep(10);
+	maple_pmode_cur = -1;
+	maple_scom_switch_freq(maple_scom_query_freq());
+
+	printk(KERN_INFO "Registering G5 CPU frequency driver\n");
+	printk(KERN_INFO "Frequency method: SCOM, Voltage method: none\n");
+	printk(KERN_INFO "Low: %d Mhz, High: %d Mhz, Cur: %d MHz\n",
+		maple_cpu_freqs[1].frequency/1000,
+		maple_cpu_freqs[0].frequency/1000,
+		maple_cpu_freqs[maple_pmode_cur].frequency/1000);
+
+	rc = cpufreq_register_driver(&maple_cpufreq_driver);
+
+	of_node_put(cpunode);
+	of_node_put(cpus);
+
+	return rc;
+
+bail_noprops:
+	of_node_put(cpunode);
+bail_cpus:
+	of_node_put(cpus);
+
+	return rc;
+}
+
+module_init(maple_cpufreq_init);
+
+
+MODULE_LICENSE("GPL");
-- 
1.7.5.3

^ permalink raw reply related

* Re: [PATCH 2/3] powerpc: POWER7 optimised memcpy using VMX
From: Gabriel Paubert @ 2011-06-17  7:12 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: mikey, paulus, linuxppc-dev
In-Reply-To: <20110617045421.615463021@samba.org>

On Fri, Jun 17, 2011 at 02:54:00PM +1000, Anton Blanchard wrote:
> Implement a POWER7 optimised memcpy using VMX. For large aligned
> copies this new loop is over 10% faster and for large unaligned
> copies it is over 200% faster.
> 
> On POWER7 unaligned stores rarely slow down - they only flush when
> a store crosses a 4KB page boundary. Furthermore this flush is
> handled completely in hardware and should be 20-30 cycles.
> 
> Unaligned loads on the other hand flush much more often - whenever
> crossing a 128 byte cache line, or a 32 byte sector if either sector
> is an L1 miss.
> 
> Considering this information we really want to get the loads aligned
> and not worry about the alignment of the stores. Microbenchmarks
> confirm that this approach is much faster than the current unaligned
> copy loop that uses shifts and rotates to ensure both loads and
> stores are aligned.
> 
> We also want to try and do the stores in cacheline aligned, cacheline
> sized chunks. If the store queue is unable to merge an entire
> cacheline of stores then the L2 cache will have to do a
> read/modify/write. Even worse, we will serialise this with the stores
> in the next iteration of the copy loop since both iterations hit
> the same cacheline.
> 
> Based on this, the new loop does the following things:
> 
> 
> 1 - 127 bytes
> Get the source 8 byte aligned and use 8 byte loads and stores. Pretty
> boring and similar to how the current loop works.
> 
> 128 - 4095 bytes
> Get the source 8 byte aligned and use 8 byte loads and stores,
> 1 cacheline at a time. We aren't doing the stores in cacheline
> aligned chunks so we will potentially serialise once per cacheline.
> Even so it is much better than the loop we have today.
> 
> 4096 - bytes
> If both source and destination have the same alignment get them both
> 16 byte aligned, then get the destination cacheline aligned. Do
> cacheline sized loads and stores using VMX.
> 
> If source and destination do not have the same alignment, we get the
> destination cacheline aligned, and use permute to do aligned loads.
> 
> In both cases the VMX loop should be optimal - we always do aligned
> loads and stores and are always doing stores in cacheline aligned,
> cacheline sized chunks.
> 
> 
> The VMX breakpoint of 4096 bytes was chosen using this microbenchmark:
> 
> http://ozlabs.org/~anton/junkcode/copy_to_user.c
> 
> (Note that the breakpoint analysis was done with the copy_tofrom_user
> version of the loop and using varying sizes and alignments to read(). 
> It's much easier to create a benchmark using read() that can control
> the size and alignment of a kernel copy loop and synchronise it with
> userspace doing optional VMX instructions).
> 
> Since we are using VMX and there is a cost to saving and restoring
> the user VMX state there are two broad cases we need to benchmark:
> 
> - Best case - userspace never uses VMX
> 
> - Worst case - userspace always uses VMX
> 
> In reality a userspace process will sit somewhere between these two
> extremes. Since we need to test both aligned and unaligned copies we
> end up with 4 combinations. The point at which the VMX loop begins to
> win is:
> 
> 0% VMX
> aligned		2048 bytes
> unaligned	2048 bytes
> 
> 100% VMX
> aligned		16384 bytes
> unaligned	8192 bytes
> 
> Considering this is a microbenchmark, the data is hot in cache and
> the VMX loop has better store queue merging properties we set the
> breakpoint to 4096 bytes, a little below the unaligned breakpoints.
> 
> Some future optimisations we can look at:
> 
> - Looking at the perf data, a significant part of the cost when a task
>   is always using VMX is the extra exception we take to restore the
>   VMX state. As such we should do something similar to the x86
>   optimisation that restores FPU state for heavy users. ie:
> 
>         /*
>          * If the task has used fpu the last 5 timeslices, just do a full
>          * restore of the math state immediately to avoid the trap; the
>          * chances of needing FPU soon are obviously high now
>          */
>         preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
> 
>   and 
> 
>         /*
>          * fpu_counter contains the number of consecutive context switches
>          * that the FPU is used. If this is over a threshold, the lazy fpu
>          * saving becomes unlazy to save the trap. This is an unsigned char
>          * so that after 256 times the counter wraps and the behavior turns
>          * lazy again; this to deal with bursty apps that only use FPU for
>          * a short time
>          */
> 
> - We could create a paca bit to mirror the VMX enabled MSR bit and check
>   that first, avoiding multiple calls to calling enable_kernel_altivec.
> 
> - We could have two VMX breakpoints, one for when we know the user VMX
>   state is loaded into the registers and one when it isn't. This could
>   be a second bit in the paca so we can calculate the break points quickly.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>
> ---
> 
> Index: linux-powerpc/arch/powerpc/lib/Makefile
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-06-17 08:38:25.786110167 +1000
> +++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 14:05:30.023020417 +1000
> @@ -17,7 +17,7 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
>  obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
>  			   memcpy_64.o usercopy_64.o mem_64.o string.o \
>  			   checksum_wrappers_64.o hweight_64.o \
> -			   copypage_power7.o
> +			   copypage_power7.o memcpy_power7.o
>  obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
> Index: linux-powerpc/arch/powerpc/lib/memcpy_64.S
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/memcpy_64.S	2011-06-17 08:32:33.670110896 +1000
> +++ linux-powerpc/arch/powerpc/lib/memcpy_64.S	2011-06-17 08:38:25.806110507 +1000
> @@ -11,7 +11,11 @@
>  
>  	.align	7
>  _GLOBAL(memcpy)
> +BEGIN_FTR_SECTION
>  	std	r3,48(r1)	/* save destination pointer for return value */
> +FTR_SECTION_ELSE
> +	b	memcpy_power7
> +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POWER7)
>  	PPC_MTOCRF	0x01,r5
>  	cmpldi	cr1,r5,16
>  	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
> Index: linux-powerpc/arch/powerpc/lib/memcpy_power7.S
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ linux-powerpc/arch/powerpc/lib/memcpy_power7.S	2011-06-17 08:38:25.806110507 +1000
> @@ -0,0 +1,596 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2011
> + *
> + * Author: Anton Blanchard <anton@au.ibm.com>
> + */
> +#include <asm/ppc_asm.h>
> +
> +#define STACKFRAMESIZE	256
> +#define STK_REG(i)	(112 + ((i)-14)*8)
> +
> +_GLOBAL(memcpy_power7)
> +	cmpldi	r5,16
> +	cmpldi	cr1,r5,4096
> +
> +	std	r3,48(r1)
> +
> +	blt	.Lshort_copy
> +	bgt	cr1,.Lvmx_copy
> +
> +	/* Get the source 8B aligned */
> +	neg	r6,r4
> +	mtocrf	0x01,r6
> +	clrldi	r6,r6,(64-3)
> +
> +	bf	cr7*4+3,1f
> +	lbz	r0,0(r4)
> +	addi	r4,r4,1
> +	stb	r0,0(r3)
> +	addi	r3,r3,1
> +
> +1:	bf	cr7*4+2,2f
> +	lhz	r0,0(r4)
> +	addi	r4,r4,2
> +	sth	r0,0(r3)
> +	addi	r3,r3,2
> +
> +2:	bf	cr7*4+1,3f
> +	lwz	r0,0(r4)
> +	addi	r4,r4,4
> +	stw	r0,0(r3)
> +	addi	r3,r3,4
> +
> +3:	sub	r5,r5,r6
> +	cmpldi	r5,128
> +	blt	5f
> +
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +	std	r14,STK_REG(r14)(r1)
> +	std	r15,STK_REG(r15)(r1)
> +	std	r16,STK_REG(r16)(r1)
> +	std	r17,STK_REG(r17)(r1)
> +	std	r18,STK_REG(r18)(r1)
> +	std	r19,STK_REG(r19)(r1)
> +	std	r20,STK_REG(r20)(r1)
> +	std	r21,STK_REG(r21)(r1)
> +	std	r22,STK_REG(r22)(r1)
> +
> +	srdi	r6,r5,7
> +	mtctr	r6
> +
> +	/* Now do cacheline (128B) sized loads and stores. */
> +	.align	5
> +4:	ld	r0,0(r4)
> +	ld	r6,8(r4)
> +	ld	r7,16(r4)
> +	ld	r8,24(r4)
> +	ld	r9,32(r4)
> +	ld	r10,40(r4)
> +	ld	r11,48(r4)
> +	ld	r12,56(r4)
> +	ld	r14,64(r4)
> +	ld	r15,72(r4)
> +	ld	r16,80(r4)
> +	ld	r17,88(r4)
> +	ld	r18,96(r4)
> +	ld	r19,104(r4)
> +	ld	r20,112(r4)
> +	ld	r21,120(r4)
> +	addi	r4,r4,128
> +	std	r0,0(r3)
> +	std	r6,8(r3)
> +	std	r7,16(r3)
> +	std	r8,24(r3)
> +	std	r9,32(r3)
> +	std	r10,40(r3)
> +	std	r11,48(r3)
> +	std	r12,56(r3)
> +	std	r14,64(r3)
> +	std	r15,72(r3)
> +	std	r16,80(r3)
> +	std	r17,88(r3)
> +	std	r18,96(r3)
> +	std	r19,104(r3)
> +	std	r20,112(r3)
> +	std	r21,120(r3)
> +	addi	r3,r3,128
> +	bdnz	4b
> +
> +	clrldi	r5,r5,(64-7)
> +
> +	ld	r14,STK_REG(r14)(r1)
> +	ld	r15,STK_REG(r15)(r1)
> +	ld	r16,STK_REG(r16)(r1)
> +	ld	r17,STK_REG(r17)(r1)
> +	ld	r18,STK_REG(r18)(r1)
> +	ld	r19,STK_REG(r19)(r1)
> +	ld	r20,STK_REG(r20)(r1)
> +	ld	r21,STK_REG(r21)(r1)
> +	ld	r22,STK_REG(r22)(r1)
> +	addi	r1,r1,STACKFRAMESIZE
> +
> +	/* Up to 127B to go */
> +5:	srdi	r6,r5,4
> +	mtocrf	0x01,r6
> +
> +6:	bf	cr7*4+1,7f
> +	ld	r0,0(r4)
> +	ld	r6,8(r4)
> +	ld	r7,16(r4)
> +	ld	r8,24(r4)
> +	ld	r9,32(r4)
> +	ld	r10,40(r4)
> +	ld	r11,48(r4)
> +	ld	r12,56(r4)
> +	addi	r4,r4,64
> +	std	r0,0(r3)
> +	std	r6,8(r3)
> +	std	r7,16(r3)
> +	std	r8,24(r3)
> +	std	r9,32(r3)
> +	std	r10,40(r3)
> +	std	r11,48(r3)
> +	std	r12,56(r3)
> +	addi	r3,r3,64
> +
> +	/* Up to 63B to go */
> +7:	bf	cr7*4+2,8f
> +	ld	r0,0(r4)
> +	ld	r6,8(r4)
> +	ld	r7,16(r4)
> +	ld	r8,24(r4)
> +	addi	r4,r4,32
> +	std	r0,0(r3)
> +	std	r6,8(r3)
> +	std	r7,16(r3)
> +	std	r8,24(r3)
> +	addi	r3,r3,32
> +
> +	/* Up to 31B to go */
> +8:	bf	cr7*4+3,9f
> +	ld	r0,0(r4)
> +	ld	r6,8(r4)
> +	addi	r4,r4,16
> +	std	r0,0(r3)
> +	std	r6,8(r3)
> +	addi	r3,r3,16
> +
> +9:	clrldi	r5,r5,(64-4)

I fail to see the point of that instruction: after that
you move r5 to cr7 and only test the 4 LSB, so clearing
the higher order bits looks superfluous.

There are other places where I think that you can save
a few instructions, but that one stands out as being
completely useless, unless I miss something really subtle.

And no, I don't have a Power7. I wish I had one, or 3...

BTW: do you have any statistics on the size distribution
of memcpy memcpy_to_from_usr?

My gut feeling is that the intermediate case is the most
important, and the short case the less critical (drowned
in overhead's noise) but that's the kind of things on which
I've often been wrong.

Do you really need to save and restore all the 32 VMX registers 
(1/2 kB) or would it be possible (in a later step) to ony save 
and restore the actually used ones (and no CSR either) ? 

> +
> +	/* Up to 15B to go */
> +.Lshort_copy:
> +	mtocrf	0x01,r5
> +	bf	cr7*4+0,12f
> +	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
> +	lwz	r6,4(r4)
> +	addi	r4,r4,8
> +	stw	r0,0(r3)
> +	stw	r6,4(r3)
> +	addi	r3,r3,8
> +
> +12:	bf	cr7*4+1,13f
> +	lwz	r0,0(r4)
> +	addi	r4,r4,4
> +	stw	r0,0(r3)
> +	addi	r3,r3,4
> +
> +13:	bf	cr7*4+2,14f
> +	lhz	r0,0(r4)
> +	addi	r4,r4,2
> +	sth	r0,0(r3)
> +	addi	r3,r3,2
> +
> +14:	bf	cr7*4+3,15f
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +
> +15:	ld	r3,48(r1)
> +	blr
> +

	Regards,
	Gabriel

^ permalink raw reply

* Re: linux-next: build failure after merge of the final tree (Linus' tree related)
From: KAMEZAWA Hiroyuki @ 2011-06-17  5:34 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Linus, linux-kernel, linux-next, Andrew Morton, ppc-dev,
	David S. Miller
In-Reply-To: <20110617153809.471769a1.sfr@canb.auug.org.au>

On Fri, 17 Jun 2011 15:38:09 +1000
Stephen Rothwell <sfr@canb.auug.org.au> wrote:

> Hi all,
> 
> After merging the final tree, today's linux-next build (powerpc
> allyesconfig) failed like this:
> 
> mm/page_cgroup.c: In function 'page_cgroup_init':
> mm/page_cgroup.c:309:13: error: 'pg_data_t' has no member named 'node_end_pfn'
> 
> Caused by commit 37573e8c7182 ("memcg: fix init_page_cgroup nid with
> sparsemem").  On powerpc, node_end_pfn() is defined to be (NODE_DATA
> (nid)->node_end_pfn) where NODE_DATA(nid) is (node_data[nid]) and
> node_data is struct pglist_data *node_data[].  As far as I can see,
> struct pglist_data has never had a member called node_end_pfn.
> 
> This commit introduces the only use of node_end_pfn() in the generic
> kernel code.  Presumably the powerpc definition needs to be fixed (to
> maybe something like the x86 version).  It looks like the sparc version
> is broken as well.
> 

Sorry, here is a fix I posted today. but no ack yet.
==
>From 507cc95c5ba2351bff16c5421255d1395a3b555b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 16 Jun 2011 17:28:07 +0900
Subject: [PATCH] Fix node_start/end_pfn() definition for mm/page_cgroup.c

commit 21a3c96 uses node_start/end_pfn(nid) for detection start/end
of nodes. But, it's not defined in linux/mmzone.h but defined in
/arch/???/include/mmzone.h which is included only under
CONFIG_NEED_MULTIPLE_NODES=y.

Then, we see
mm/page_cgroup.c: In function 'page_cgroup_init':
mm/page_cgroup.c:308: error: implicit declaration of function 'node_start_pfn'
mm/page_cgroup.c:309: error: implicit declaration of function 'node_end_pfn'

So, fixiing page_cgroup.c is an idea...

But node_start_pfn()/node_end_pfn() is a very generic macro and
should be implemented in the same manner for all archs.
(m32r has different implementation...)

This patch removes definitions of node_start/end_pfn() in each archs
and defines a unified one in linux/mmzone.h. It's not under
CONFIG_NEED_MULTIPLE_NODES, now.

A result of macro expansion is here (mm/page_cgroup.c)

for !NUMA
 start_pfn = ((&contig_page_data)->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (&contig_page_data); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

for NUMA (x86-64)
  start_pfn = ((node_data[nid])->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (node_data[nid]); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Changelog:
 - fixed to avoid using "nid" twice in node_end_pfn() macro.
---
 arch/alpha/include/asm/mmzone.h   |    1 -
 arch/m32r/include/asm/mmzone.h    |    8 +-------
 arch/parisc/include/asm/mmzone.h  |    7 -------
 arch/powerpc/include/asm/mmzone.h |    7 -------
 arch/sh/include/asm/mmzone.h      |    4 ----
 arch/sparc/include/asm/mmzone.h   |    2 --
 arch/tile/include/asm/mmzone.h    |   11 -----------
 arch/x86/include/asm/mmzone_32.h  |   11 -----------
 arch/x86/include/asm/mmzone_64.h  |    3 ---
 include/linux/mmzone.h            |    7 +++++++
 10 files changed, 8 insertions(+), 53 deletions(-)

diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h
index 8af56ce..445dc42 100644
--- a/arch/alpha/include/asm/mmzone.h
+++ b/arch/alpha/include/asm/mmzone.h
@@ -56,7 +56,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 
 /*
  * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory
diff --git a/arch/m32r/include/asm/mmzone.h b/arch/m32r/include/asm/mmzone.h
index 9f3b5ac..115ced3 100644
--- a/arch/m32r/include/asm/mmzone.h
+++ b/arch/m32r/include/asm/mmzone.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;	\
-})
 
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 /*
@@ -44,7 +38,7 @@ static __inline__ int pfn_to_nid(unsigned long pfn)
 	int node;
 
 	for (node = 0 ; node < MAX_NUMNODES ; node++)
-		if (pfn >= node_start_pfn(node) && pfn <= node_end_pfn(node))
+		if (pfn >= node_start_pfn(node) && pfn < node_end_pfn(node))
 			break;
 
 	return node;
diff --git a/arch/parisc/include/asm/mmzone.h b/arch/parisc/include/asm/mmzone.h
index 9608d2c..e67eb9c 100644
--- a/arch/parisc/include/asm/mmzone.h
+++ b/arch/parisc/include/asm/mmzone.h
@@ -14,13 +14,6 @@ extern struct node_map_data node_data[];
 
 #define NODE_DATA(nid)          (&node_data[nid].pg_data)
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 /* We have these possible memory map layouts:
  * Astro: 0-3.75, 67.75-68, 4-64
  * zx1: 0-1, 257-260, 4-256
diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
index fd3fd58..7b58917 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -38,13 +38,6 @@ u64 memory_hotplug_max(void);
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
-
 #else
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h
index 8887baf..15a8496 100644
--- a/arch/sh/include/asm/mmzone.h
+++ b/arch/sh/include/asm/mmzone.h
@@ -9,10 +9,6 @@
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_start_pfn + \
-				 NODE_DATA(nid)->node_spanned_pages)
-
 static inline int pfn_to_nid(unsigned long pfn)
 {
 	int nid;
diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h
index e8c6487..99d9b9f 100644
--- a/arch/sparc/include/asm/mmzone.h
+++ b/arch/sparc/include/asm/mmzone.h
@@ -8,8 +8,6 @@
 extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)		(node_data[nid])
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
 extern int numa_cpu_lookup_table[];
 extern cpumask_t numa_cpumask_lookup_table[];
diff --git a/arch/tile/include/asm/mmzone.h b/arch/tile/include/asm/mmzone.h
index c6344c4..9d3dbce 100644
--- a/arch/tile/include/asm/mmzone.h
+++ b/arch/tile/include/asm/mmzone.h
@@ -40,17 +40,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	return highbits_to_node[__pfn_to_highbits(pfn)];
 }
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 #define kern_addr_valid(kaddr)	virt_addr_valid((void *)kaddr)
 
 static inline int pfn_valid(int pfn)
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 5e83a41..224e8c5 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -48,17 +48,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 static inline int pfn_valid(int pfn)
 {
 	int nid = pfn_to_nid(pfn);
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index b3f88d7..129d9aa 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -13,8 +13,5 @@ extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn +	\
-				 NODE_DATA(nid)->node_spanned_pages)
 #endif
 #endif /* _ASM_X86_MMZONE_64_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c928dac..9f7c3eb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -647,6 +647,13 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
+#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
+
+#define node_end_pfn(nid) ({\
+	pg_data_t *__pgdat = NODE_DATA(nid);\
+	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;\
+})
+
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
-- 
1.7.4.1

^ permalink raw reply related

* RE: powerpc: Add printk companion for ppc_md.progress
From: Benjamin Herrenschmidt @ 2011-06-17  6:21 UTC (permalink / raw)
  To: dtcarroll; +Cc: linuxppc-dev, miltonm
In-Reply-To: <1308289144.32158.33.camel@pasglop>

On Fri, 2011-06-17 at 15:39 +1000, Benjamin Herrenschmidt wrote:
> (Original mail lost in my email cleanup so this isn't a proper reply)
> 
> I'll apply that for now, but I'd very much like somebody to just get rid
> of the whole ppc_md.progress business.
> 
> We have printk working early enough nowadays (and we can use udbg for
> debugging).
> 
> It was meant to display magic numbers on the panel of IBM machines, I
> don't think it was every useful...

Hrm.. none of your 2 pending patches applies cleanly (factoring
free_initmem & that one). Haven't had a chance to check why yet, can you
check on your side ?

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 3/3] powerpc: POWER7 optimised copy_to_user/copy_from_user using VMX
From: Benjamin Herrenschmidt @ 2011-06-17  5:58 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev, mikey, paulus
In-Reply-To: <20110617045421.700411944@samba.org>

On Fri, 2011-06-17 at 14:54 +1000, Anton Blanchard wrote:
> plain text document attachment (power7_copy_tofrom_user)
> Implement a POWER7 optimised copy_to_user/copy_from_user using VMX.
> For large aligned copies this new loop is over 10% faster, and for
> large unaligned copies it is over 200% faster.
> 
> If we take a fault we fall back to the old version, this keeps
> things relatively simple and easy to verify.

Same re-entrancy comment as the other ones. preempt & interrupts...
Except here is worse since you may page fault and thus lose the vmx
state completely.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 2/3] powerpc: POWER7 optimised memcpy using VMX
From: Benjamin Herrenschmidt @ 2011-06-17  5:57 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev, mikey, paulus
In-Reply-To: <20110617045421.615463021@samba.org>

O
> +.Lvmx_copy:
> +	mflr	r0
> +	std	r4,56(r1)
> +	std	r5,64(r1)
> +	std	r0,16(r1)
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +	bl	.enable_kernel_altivec
> +	ld	r0,STACKFRAMESIZE+16(r1)
> +	ld	r3,STACKFRAMESIZE+48(r1)
> +	ld	r4,STACKFRAMESIZE+56(r1)
> +	ld	r5,STACKFRAMESIZE+64(r1)
> +	mtlr	r0

Disable interrupts ? We wont save the VMX state on interrupts and memcpy
is definitely re-entrant.

Or only run the optimization when not at interrupt time....

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH 1/3] powerpc: POWER7 optimised copy_page using VMX
From: Benjamin Herrenschmidt @ 2011-06-17  5:53 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev, mikey, paulus
In-Reply-To: <20110617045421.538184870@samba.org>

On Fri, 2011-06-17 at 14:53 +1000, Anton Blanchard wrote:

> +#include <asm/page.h>
> +#include <asm/ppc_asm.h>
> +
> +#define STACKFRAMESIZE	112
> +
> +_GLOBAL(copypage_power7)
> +	mflr	r0
> +	std	r3,48(r1)
> +	std	r4,56(r1)
> +	std	r0,16(r1)
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +
> +	bl	.enable_kernel_altivec

Don't you need to preempt disable ? Or even irq disable ? Or do we know
copy page will never called at irq time ?

Also I wonder if you wouldn't be better to instead just manually enable
it MSR and save some VRs (if no current thread regs is attached) ? That
would be re-entrant.

> +	ld	r12,STACKFRAMESIZE+16(r1)
> +	ld	r4,STACKFRAMESIZE+56(r1)
> +	li	r0,(PAGE_SIZE/128)
> +	li	r6,16
> +	ld	r3,STACKFRAMESIZE+48(r1)
> +	li	r7,32
> +	li	r8,48
> +	mtctr	r0
> +	li	r9,64
> +	li	r10,80
> +	mtlr	r12
> +	li	r11,96
> +	li	r12,112
> +	addi	r1,r1,STACKFRAMESIZE
> +
> +	.align	5

Do we know that the blank will be filled with something harmless ?

> +1:	lvx	vr7,r0,r4
> +	lvx	vr6,r4,r6
> +	lvx	vr5,r4,r7
> +	lvx	vr4,r4,r8
> +	lvx	vr3,r4,r9
> +	lvx	vr2,r4,r10
> +	lvx	vr1,r4,r11
> +	lvx	vr0,r4,r12
> +	addi	r4,r4,128
> +	stvx	vr7,r0,r3
> +	stvx	vr6,r3,r6
> +	stvx	vr5,r3,r7
> +	stvx	vr4,r3,r8
> +	stvx	vr3,r3,r9
> +	stvx	vr2,r3,r10
> +	stvx	vr1,r3,r11
> +	stvx	vr0,r3,r12
> +	addi	r3,r3,128
> +	bdnz	1b

What about lvxl ? You aren't likely to re-use the source data soon
right ?

Hrm... re-reading the arch, it looks like the "l" variant is quirky,
should really only used on the last load of a cache block, but in your
case that should be ok to put it on the last accesses since we know the
alignment.

> +	blr
> Index: linux-powerpc/arch/powerpc/lib/Makefile
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-05-19 19:57:38.058570608 +1000
> +++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 07:39:58.996165527 +1000
> @@ -16,7 +16,8 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
>  
>  obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
>  			   memcpy_64.o usercopy_64.o mem_64.o string.o \
> -			   checksum_wrappers_64.o hweight_64.o
> +			   checksum_wrappers_64.o hweight_64.o \
> +			   copypage_power7.o
>  obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
> Index: linux-powerpc/arch/powerpc/lib/copypage_64.S
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/copypage_64.S	2011-06-06 08:07:35.000000000 +1000
> +++ linux-powerpc/arch/powerpc/lib/copypage_64.S	2011-06-17 07:39:58.996165527 +1000
> @@ -17,7 +17,11 @@ PPC64_CACHES:
>          .section        ".text"
>  
>  _GLOBAL(copy_page)
> +BEGIN_FTR_SECTION
>  	lis	r5,PAGE_SIZE@h
> +FTR_SECTION_ELSE
> +        b       .copypage_power7
> +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POWER7)
>  	ori	r5,r5,PAGE_SIZE@l
>  BEGIN_FTR_SECTION
>  	ld      r10,PPC64_CACHES@toc(r2)
> 

^ permalink raw reply

* Re: [PATCH 1/3] powerpc: POWER7 optimised copy_page using VMX
From: Benjamin Herrenschmidt @ 2011-06-17  5:43 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev, mikey, paulus
In-Reply-To: <20110617045421.538184870@samba.org>

On Fri, 2011-06-17 at 14:53 +1000, Anton Blanchard wrote:
> plain text document attachment (power7_copypage)
> Implement a POWER7 optimised copy_page using VMX. We copy a cacheline
> at a time using VMX loads and stores.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>
> ---
> 
> How do we want to handle per machine optimised functions? I create
> yet another feature bit, but feature bits might get out of control
> at some point.

I've been wondering about that for some time.... The feature bit itself
isn't a big deal, for the in-kernel feature it's easy to split that into
separate masks (CPU features, cache features, debug features,
whatever...) but I don't like much the branch tricks, that won't scale
much when we have 4 or 5 versions....

What I really want is a way to patch the call sites to branch to an
alternate function.

We've looked at that with Michael a while back when pondering about
merging book3e/s but never got to something satisfactory, but maybe we
didn't look hard enough at what our toolchain is capable of...

Cheers,
Ben.

> Index: linux-powerpc/arch/powerpc/include/asm/cputable.h
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/include/asm/cputable.h	2011-06-06 08:07:35.128707749 +1000
> +++ linux-powerpc/arch/powerpc/include/asm/cputable.h	2011-06-17 07:39:58.996165527 +1000
> @@ -200,6 +200,7 @@ extern const char *powerpc_base_platform
>  #define CPU_FTR_POPCNTB			LONG_ASM_CONST(0x0400000000000000)
>  #define CPU_FTR_POPCNTD			LONG_ASM_CONST(0x0800000000000000)
>  #define CPU_FTR_ICSWX			LONG_ASM_CONST(0x1000000000000000)
> +#define CPU_FTR_POWER7			LONG_ASM_CONST(0x2000000000000000)
>  
>  #ifndef __ASSEMBLY__
>  
> @@ -423,7 +424,7 @@ extern const char *powerpc_base_platform
>  	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
>  	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
>  	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
> -	    CPU_FTR_ICSWX | CPU_FTR_CFAR)
> +	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_POWER7)
>  #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
>  	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
>  	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
> Index: linux-powerpc/arch/powerpc/lib/copypage_power7.S
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ linux-powerpc/arch/powerpc/lib/copypage_power7.S	2011-06-17 07:39:58.996165527 +1000
> @@ -0,0 +1,70 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) IBM Corporation, 2011
> + *
> + * Author: Anton Blanchard <anton@au.ibm.com>
> + */
> +#include <asm/page.h>
> +#include <asm/ppc_asm.h>
> +
> +#define STACKFRAMESIZE	112
> +
> +_GLOBAL(copypage_power7)
> +	mflr	r0
> +	std	r3,48(r1)
> +	std	r4,56(r1)
> +	std	r0,16(r1)
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +
> +	bl	.enable_kernel_altivec
> +
> +	ld	r12,STACKFRAMESIZE+16(r1)
> +	ld	r4,STACKFRAMESIZE+56(r1)
> +	li	r0,(PAGE_SIZE/128)
> +	li	r6,16
> +	ld	r3,STACKFRAMESIZE+48(r1)
> +	li	r7,32
> +	li	r8,48
> +	mtctr	r0
> +	li	r9,64
> +	li	r10,80
> +	mtlr	r12
> +	li	r11,96
> +	li	r12,112
> +	addi	r1,r1,STACKFRAMESIZE
> +
> +	.align	5
> +1:	lvx	vr7,r0,r4
> +	lvx	vr6,r4,r6
> +	lvx	vr5,r4,r7
> +	lvx	vr4,r4,r8
> +	lvx	vr3,r4,r9
> +	lvx	vr2,r4,r10
> +	lvx	vr1,r4,r11
> +	lvx	vr0,r4,r12
> +	addi	r4,r4,128
> +	stvx	vr7,r0,r3
> +	stvx	vr6,r3,r6
> +	stvx	vr5,r3,r7
> +	stvx	vr4,r3,r8
> +	stvx	vr3,r3,r9
> +	stvx	vr2,r3,r10
> +	stvx	vr1,r3,r11
> +	stvx	vr0,r3,r12
> +	addi	r3,r3,128
> +	bdnz	1b
> +
> +	blr
> Index: linux-powerpc/arch/powerpc/lib/Makefile
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-05-19 19:57:38.058570608 +1000
> +++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 07:39:58.996165527 +1000
> @@ -16,7 +16,8 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
>  
>  obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
>  			   memcpy_64.o usercopy_64.o mem_64.o string.o \
> -			   checksum_wrappers_64.o hweight_64.o
> +			   checksum_wrappers_64.o hweight_64.o \
> +			   copypage_power7.o
>  obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
> Index: linux-powerpc/arch/powerpc/lib/copypage_64.S
> ===================================================================
> --- linux-powerpc.orig/arch/powerpc/lib/copypage_64.S	2011-06-06 08:07:35.000000000 +1000
> +++ linux-powerpc/arch/powerpc/lib/copypage_64.S	2011-06-17 07:39:58.996165527 +1000
> @@ -17,7 +17,11 @@ PPC64_CACHES:
>          .section        ".text"
>  
>  _GLOBAL(copy_page)
> +BEGIN_FTR_SECTION
>  	lis	r5,PAGE_SIZE@h
> +FTR_SECTION_ELSE
> +        b       .copypage_power7
> +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POWER7)
>  	ori	r5,r5,PAGE_SIZE@l
>  BEGIN_FTR_SECTION
>  	ld      r10,PPC64_CACHES@toc(r2)
> 

^ permalink raw reply

* RE: powerpc: Add printk companion for ppc_md.progress
From: Benjamin Herrenschmidt @ 2011-06-17  5:39 UTC (permalink / raw)
  To: dtcarroll; +Cc: linuxppc-dev, miltonm

(Original mail lost in my email cleanup so this isn't a proper reply)

I'll apply that for now, but I'd very much like somebody to just get rid
of the whole ppc_md.progress business.

We have printk working early enough nowadays (and we can use udbg for
debugging).

It was meant to display magic numbers on the panel of IBM machines, I
don't think it was every useful...

Cheers,
Ben.

^ permalink raw reply

* linux-next: build failure after merge of the final tree (Linus' tree related)
From: Stephen Rothwell @ 2011-06-17  5:38 UTC (permalink / raw)
  To: Linus
  Cc: linux-kernel, linux-next, Andrew Morton, ppc-dev, David S. Miller,
	KAMEZAWA Hiroyuki

[-- Attachment #1: Type: text/plain, Size: 1032 bytes --]

Hi all,

After merging the final tree, today's linux-next build (powerpc
allyesconfig) failed like this:

mm/page_cgroup.c: In function 'page_cgroup_init':
mm/page_cgroup.c:309:13: error: 'pg_data_t' has no member named 'node_end_pfn'

Caused by commit 37573e8c7182 ("memcg: fix init_page_cgroup nid with
sparsemem").  On powerpc, node_end_pfn() is defined to be (NODE_DATA
(nid)->node_end_pfn) where NODE_DATA(nid) is (node_data[nid]) and
node_data is struct pglist_data *node_data[].  As far as I can see,
struct pglist_data has never had a member called node_end_pfn.

This commit introduces the only use of node_end_pfn() in the generic
kernel code.  Presumably the powerpc definition needs to be fixed (to
maybe something like the x86 version).  It looks like the sparc version
is broken as well.

I have left the powerpc allyesconfig build broken for today (as it is
also broken by other changes).
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: [PATCH v3 2/2] powerpc: add support for MPIC message register API
From: Benjamin Herrenschmidt @ 2011-06-17  5:33 UTC (permalink / raw)
  To: Meador Inge
  Cc: openmcapi-dev, Hollis Blanchard, devicetree-discuss, linuxppc-dev
In-Reply-To: <1306869543-18812-3-git-send-email-meador_inge@mentor.com>

On Tue, 2011-05-31 at 14:19 -0500, Meador Inge wrote:
> Some MPIC implementations contain one or more blocks of message registers
> that are used to send messages between cores via IPIs.  A simple API has
> been added to access (get/put, read, write, etc ...) these message registers.
> The available message registers are initially discovered via nodes in the
> device tree.  A separate commit contains a binding for the message register
> nodes.

Ok, so I finally got to look at that in a bit more details...
> +#ifndef _ASM_MPIC_MSGR_H
> +#define _ASM_MPIC_MSGR_H
> +
> +#include <linux/types.h>
> +
> +struct mpic_msgr {
> +	u32 __iomem *addr;
> +	u32 __iomem *mer;
> +	u32 __iomem	*msr;
> +	int irq;
> +	atomic_t in_use;
> +	int num;
> +};

General comment... I'm really not fan of "msgr", I'd rather see
"mpic_message_*", it's a tad more verbose but looks a lot better, no ?

Also do you need those 3 iomem pointers ? Not just one with fixed
offsets ? Or do they come from vastly different sources ?

atomic_t in_use looks fishy, but let's see how you use it...

> +extern struct mpic_msgr* mpic_msgr_get(unsigned int reg_num);
> +extern void mpic_msgr_put(struct mpic_msgr* msgr);
> +extern void mpic_msgr_enable(struct mpic_msgr *msgr);
> +extern void mpic_msgr_disable(struct mpic_msgr *msgr);
> +extern void mpic_msgr_write(struct mpic_msgr *msgr, u32 message);
> +extern u32 mpic_msgr_read(struct mpic_msgr *msgr);
> +extern void mpic_msgr_clear(struct mpic_msgr *msgr);
> +extern void mpic_msgr_set_destination(struct mpic_msgr *msgr, u32 cpu_num);
> +extern int mpic_msgr_get_irq(struct mpic_msgr *msgr);

Documentation of the API please.

> +#endif
> diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
> index f7b0772..4d65593 100644
> --- a/arch/powerpc/platforms/Kconfig
> +++ b/arch/powerpc/platforms/Kconfig
> @@ -78,6 +78,14 @@ config MPIC_WEIRD
>  	bool
>  	default n
>  
> +config MPIC_MSGR
> +	bool "MPIC message register support"
> +	depends on MPIC
> +	default n
> +	help
> +	  Enables support for the MPIC message registers.  These
> +	  registers are used for inter-processor communication.
> +
>  config PPC_I8259
>  	bool
>  	default n
> diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
> index 1e0c933..6d40185 100644
> --- a/arch/powerpc/sysdev/Makefile
> +++ b/arch/powerpc/sysdev/Makefile
> @@ -3,7 +3,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
>  ccflags-$(CONFIG_PPC64)		:= -mno-minimal-toc
>  
>  mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
> -obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y)
> +mpic-msgr-obj-$(CONFIG_MPIC_MSGR)	+= mpic_msgr.o
> +obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y)
>  fsl-msi-obj-$(CONFIG_PCI_MSI)	+= fsl_msi.o
>  obj-$(CONFIG_PPC_MSI_BITMAP)	+= msi_bitmap.o
>  
> diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c
> new file mode 100644
> index 0000000..bfa0612
> --- /dev/null
> +++ b/arch/powerpc/sysdev/mpic_msgr.c
> @@ -0,0 +1,279 @@
> +/*
> + * Copyright 2011-2012, Meador Inge, Mentor Graphics Corporation.
> + *
> + * Some ideas based on un-pushed work done by Vivek Mahajan, Jason Jin, and
> + * Mingkai Hu from Freescale Semiconductor, Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; version 2 of the
> + * License.
> + *
> + */
> +
> +#include <linux/list.h>
> +#include <linux/of_platform.h>
> +#include <linux/errno.h>
> +#include <asm/prom.h>
> +#include <asm/hw_irq.h>
> +#include <asm/ppc-pci.h>
> +#include <asm/mpic_msgr.h>
> +
> +#define MPIC_MSGR_REGISTERS_PER_BLOCK 4
> +#define MSGR_INUSE 0
> +#define MSGR_FREE 1
> +
> +/* Internal structure used *only* for IO mapping register blocks. */
> +struct mpic_msgr_block {
> +	struct msgr {
> +		u32 msgr;
> +		u8 res[12];
> +	} msgrs[MPIC_MSGR_REGISTERS_PER_BLOCK];
> +	u8 res0[192];
> +	u32 mer;
> +	u8 res1[12];
> +	u32 msr;
> +};

So this represent HW registers ? Please make it clear in the comment.
I'm not a terrible fan of using structures to map HW especially with so
few registers.

> +static struct mpic_msgr **mpic_msgrs = 0;
> +static unsigned int mpic_msgr_count = 0;
> +
> +struct mpic_msgr* mpic_msgr_get(unsigned int reg_num)
> +{
> +	struct mpic_msgr* msgr;
> +
> +	if (reg_num >= mpic_msgr_count)
> +		return ERR_PTR(-ENODEV);
> +
> +	msgr = mpic_msgrs[reg_num];

No locking on the array access, might be ok if those things are never
plugged in/out I suppose...

> +	if (atomic_cmpxchg(&msgr->in_use, MSGR_FREE, MSGR_INUSE) == MSGR_FREE)
> +		return msgr;
> +
> +	return ERR_PTR(-EBUSY);
> +}
> +EXPORT_SYMBOL(mpic_msgr_get);

So how are those things intended to be used ? Clients get a fixed
"register" number to use ? It looks like this stuff would have been
better off using some kind of allocator no ? A bitmap would have done
the job... or do you really need to have some kind of fixed numbering
associated with clients ?

> +void mpic_msgr_put(struct mpic_msgr* msgr)
> +{
> +	atomic_set(&msgr->in_use, MSGR_FREE);
> +}
> +EXPORT_SYMBOL(mpic_msgr_put);

Shouldn't it be disabled too while at it ?

> +void mpic_msgr_enable(struct mpic_msgr *msgr)
> +{
> +	out_be32(msgr->mer, in_be32(msgr->mer) | (1 << msgr->num));
> +}
> +EXPORT_SYMBOL(mpic_msgr_enable);

Why are all those exported non-GPL ? We have a policy of making new
in-kernel APIs generally GPL only.

> +void mpic_msgr_disable(struct mpic_msgr *msgr)
> +{
> +	out_be32(msgr->mer, in_be32(msgr->mer) & ~(1 << msgr->num));
> +}
> +EXPORT_SYMBOL(mpic_msgr_disable);

I see read-modify-write cycles here with no synchronization/locking
whatsoever... Might be ok as long as you document it in the doc of the
API, ie, the caller is required to synchronize.

> +void mpic_msgr_write(struct mpic_msgr *msgr, u32 message)
> +{
> +	out_be32(msgr->addr, message);
> +}
> +EXPORT_SYMBOL(mpic_msgr_write);
> +
> +u32 mpic_msgr_read(struct mpic_msgr *msgr)
> +{
> +	return in_be32(msgr->addr);
> +}
> +EXPORT_SYMBOL(mpic_msgr_read);

Those look like significant bloat/overhead for what is just an MMIO
wrapper on a register access, why not make them static inline's
instead ?

> +void mpic_msgr_clear(struct mpic_msgr *msgr)
> +{
> +	(void) mpic_msgr_read(msgr);
> +}
> +EXPORT_SYMBOL(mpic_msgr_clear);
> +
> +void mpic_msgr_set_destination(struct mpic_msgr *msgr, u32 cpu_num)
> +{
> +	out_be32(msgr->addr, 1 << cpu_num);
> +}
> +EXPORT_SYMBOL(mpic_msgr_set_destination);

Please make it clear that this takes a HW CPU number. How is that going
to be used ? Wouldn't you be better off having a __ variant taking the
HW CPU number an the "public" variant doing the conversion from a Linux
number ?

> +int mpic_msgr_get_irq(struct mpic_msgr *msgr)
> +{
> +	return msgr->irq;
> +}
> +EXPORT_SYMBOL(mpic_msgr_get_irq);
> +
> +/* The following three functions are used to compute the order and number of
> + * the message register blocks.  They are clearly very inefficent.  However,
> + * they are called *only* a few times during device initialization.
> + */
> +static unsigned int mpic_msgr_number_of_blocks(void)
> +{
> +	unsigned int count;
> +	struct device_node *aliases;
> +
> +	count = 0;
> +	aliases = of_find_node_by_name(NULL, "aliases");
> +
> +	if (aliases) {
> +		char buf[32];
> +
> +		for (;;) {
> +			snprintf(buf, sizeof(buf), "mpic-msgr-block%d", count);
> +			if (!of_find_property(aliases, buf, NULL))
> +				break;
> +
> +			count += 1;
> +		}
> +	}
> +
> +	return count;
> +}

I don't like relying on aliases as a way to figure out what HW is
present, that's not right. You should find the actual devices themselves
based on their compatible properties.

All of that stuff also assume these things are never going to be
hot-plugged. IE. will they never go in/out of partition in hypervisors
(or even be virtualized) ?

> +static unsigned int mpic_msgr_number_of_registers(void)
> +{
> +	return mpic_msgr_number_of_blocks() * MPIC_MSGR_REGISTERS_PER_BLOCK;
> +}
> +
> +static int mpic_msgr_block_number(struct device_node *node)
> +{
> +	struct device_node *aliases;
> +	unsigned int index, number_of_blocks;
> +	char buf[64];
> +
> +	number_of_blocks = mpic_msgr_number_of_blocks();
> +	aliases = of_find_node_by_name(NULL, "aliases");
> +	if (!aliases)
> +		return -1;
> +
> +	for (index = 0; index < number_of_blocks; ++index) {
> +		struct property *prop;
> +
> +		snprintf(buf, sizeof(buf), "mpic-msgr-block%d", index);
> +		prop = of_find_property(aliases, buf, NULL);
> +		if (node == of_find_node_by_path(prop->value))
> +			break;
> +	}
> +
> +	return index == number_of_blocks ? -1 : index;
> +}
> +
> +/* The probe function for a single message register block.
> + */
> +static __devinit int mpic_msgr_probe(struct platform_device *dev)
> +{
> +	struct mpic_msgr_block __iomem *msgr_block;
> +	int block_number;
> +	struct resource rsrc;
> +	unsigned int i;
> +	unsigned int irq_index;
> +	struct device_node *np = dev->dev.of_node;
> +	unsigned int receive_mask;
> +	const unsigned int *prop;
> +
> +	if (!np) {
> +		dev_err(&dev->dev, "Device OF-Node is NULL");
> +		return -EFAULT;
> +	}
> +
> +	/* Allocate the message register array upon the first device
> +	 * registered.
> +	 */
> +	if (!mpic_msgrs) {
> +		mpic_msgr_count = mpic_msgr_number_of_registers();
> +		dev_info(&dev->dev, "Found %d message registers\n", mpic_msgr_count);
> +
> +		mpic_msgrs = kzalloc(sizeof(struct mpic_msgr) * mpic_msgr_count,
> +							 GFP_KERNEL);
> +		if (!mpic_msgrs) {
> +			dev_err(&dev->dev, "No memory for message register blocks\n");
> +			return -ENOMEM;
> +		}
> +	}
> +	dev_info(&dev->dev, "Of-device full name %s\n", np->full_name);

Ok, so we have another scheme of:

 - Count all devices in the system of a given type
 - Assign them numbers
 - API uses number

That sucks... unless you have an allocator. And even then.

I'd rather clients use something like struct mpic_msgr (or msg_reg or
message_reg) as the "handle" to one of these things.

It can be obtained via an allocator or a device tree parsing routine if
there's a fixed relationship between clients and registers, I don't
really know how that stuff is to be used, but in any case, the whole
thing stinks as it is.

> +	/* IO map the message register block. */
> +	of_address_to_resource(np, 0, &rsrc);
> +	msgr_block = ioremap(rsrc.start, rsrc.end - rsrc.start);
> +	if (!msgr_block) {
> +		dev_err(&dev->dev, "Failed to iomap MPIC message registers");
> +		return -EFAULT;
> +	}
> +
> +	/* Ensure the block has a defined order. */
> +	block_number = mpic_msgr_block_number(np);
> +	if (block_number < 0) {
> +		dev_err(&dev->dev, "Failed to find message register block alias\n");
> +		return -ENODEV;
> +	}
> +	dev_info(&dev->dev, "Setting up message register block %d\n", block_number);
> +
> +	/* Grab the receive mask which specifies what registers can receive
> +	 * interrupts.
> +	 */
> +	prop = of_get_property(np, "mpic-msgr-receive-mask", NULL);
> +	receive_mask = (prop) ? *prop : 0xF;
> +
> +	/* Build up the appropriate message register data structures. */
> +	for (i = 0, irq_index = 0; i < MPIC_MSGR_REGISTERS_PER_BLOCK; ++i) {
> +		struct mpic_msgr *msgr;
> +		unsigned int reg_number;
> +
> +		msgr = kzalloc(sizeof(struct mpic_msgr), GFP_KERNEL);
> +		if (!msgr) {
> +			dev_err(&dev->dev, "No memory for message register\n");
> +			return -ENOMEM;
> +		}
> +
> +		reg_number = block_number * MPIC_MSGR_REGISTERS_PER_BLOCK + i;
> +		msgr->addr = &msgr_block->msgrs[i].msgr;
> +		msgr->mer = &msgr_block->mer;
> +		msgr->msr = &msgr_block->msr;
> +		atomic_set(&msgr->in_use, MSGR_FREE);
> +		msgr->num = reg_number;
> +
> +		if (receive_mask & (1 << i)) {
> +			struct resource irq;
> +
> +			if (of_irq_to_resource(np, irq_index, &irq) == NO_IRQ) {
> +				dev_err(&dev->dev, "Missing interrupt specifier");
> +				kfree(msgr);
> +				return -EFAULT;
> +			}
> +			msgr->irq = irq.start;
> +			irq_index += 1;
> +		} else {
> +			msgr->irq = NO_IRQ;
> +		}
> +
> +		mpic_msgrs[reg_number] = msgr;
> +		mpic_msgr_disable(msgr);
> +		dev_info(&dev->dev, "Register %d initialized: irq %d\n",
> +				 msgr->num, msgr->irq);
> +	
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct of_device_id mpic_msgr_ids[] = {
> +	{
> +		.compatible = "fsl,mpic-v3.1-msgr",
> +		.data = NULL,
> +	},
> +	{}
> +};
> +
> +static struct platform_driver mpic_msgr_driver = {
> +	.driver = {
> +		.name = "mpic-msgr",
> +		.owner = THIS_MODULE,
> +		.of_match_table = mpic_msgr_ids,
> +	},
> +	.probe = mpic_msgr_probe,
> +};
> +
> +static __init int mpic_msgr_init(void)
> +{
> +	return platform_driver_register(&mpic_msgr_driver);
> +}
> +subsys_initcall(mpic_msgr_init);

Ben.

^ permalink raw reply

* Re: [PATCH 1/3] powerpc: POWER7 optimised copy_page using VMX
From: Anton Blanchard @ 2011-06-17  5:26 UTC (permalink / raw)
  To: Michael Neuling; +Cc: paulus, linuxppc-dev
In-Reply-To: <27834.1308287767@neuling.org>

Hi,

> Yeah, I'm pretty against CPU_FTR_POWER7.  Every loon is going to
> attach anything POWER7 to it.  
> 
> I'm keen to see it setup in  __setup_cpu_power7.  Either a function
> pointer or use the patch_instruction infrastructure to avoid indirect
> function calls on small copies.  

Instruction patching in __setup_cpu_power7 could work. We might want to
have a nop at the start of the base functions and a label at the start
of the next instruction so we can easily override the base function and
jump back to it if things are too hard (like I do in the
copy_tofrom_user patch).

Anton

^ permalink raw reply

* Re: [PATCH 1/3] powerpc: POWER7 optimised copy_page using VMX
From: Michael Neuling @ 2011-06-17  5:16 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: paulus, linuxppc-dev
In-Reply-To: <20110617045421.538184870@samba.org>

> Implement a POWER7 optimised copy_page using VMX. We copy a cacheline
> at a time using VMX loads and stores.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>
> ---
> 
> How do we want to handle per machine optimised functions? I create
> yet another feature bit, but feature bits might get out of control
> at some point.

Yeah, I'm pretty against CPU_FTR_POWER7.  Every loon is going to attach
anything POWER7 to it.  

I'm keen to see it setup in  __setup_cpu_power7.  Either a function
pointer or use the patch_instruction infrastructure to avoid indirect
function calls on small copies.  

Mikey

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/book3e-64: reraise doorbell when masked by soft-irq-disable
From: Benjamin Herrenschmidt @ 2011-06-17  5:12 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev
In-Reply-To: <1306183896.7481.209.camel@pasglop>

On Tue, 2011-05-24 at 06:51 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2011-05-23 at 15:26 -0500, Scott Wood wrote:
> > On Sat, 21 May 2011 08:32:58 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > 
> > > On Fri, 2011-05-20 at 14:00 -0500, Scott Wood wrote:
> > > > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > > > ---
> > > >  arch/powerpc/kernel/exceptions-64e.S |   22 +++++++++++++++++++++-
> > > >  1 files changed, 21 insertions(+), 1 deletions(-)
> > > 
> > > You can probably remove the doorbell re-check when enabling interrupts
> > > now, can't you ?
> > 
> > Ah, so that's how it currently gets away without re-raising when the
> > interrupt happens. :-)
> > 
> > I'll remove it.
> 
> Yup, I was too lazy to make a special case in the exception handlers :-)

Are you going to send a re-spin ?

Cheers,
Ben.

^ permalink raw reply

* [PATCH 3/3] powerpc: POWER7 optimised copy_to_user/copy_from_user using VMX
From: Anton Blanchard @ 2011-06-17  4:54 UTC (permalink / raw)
  To: benh, paulus, mikey; +Cc: linuxppc-dev
In-Reply-To: <20110617045358.544896830@samba.org>

Implement a POWER7 optimised copy_to_user/copy_from_user using VMX.
For large aligned copies this new loop is over 10% faster, and for
large unaligned copies it is over 200% faster.

If we take a fault we fall back to the old version, this keeps
things relatively simple and easy to verify.

(The detailed comments below are copied from the POWER7 optimised
memcpy patch for completeness).

On POWER7 unaligned stores rarely slow down - they only flush when
a store crosses a 4KB page boundary. Furthermore this flush is
handled completely in hardware and should be 20-30 cycles.

Unaligned loads on the other hand flush much more often - whenever
crossing a 128 byte cache line, or a 32 byte sector if either sector
is an L1 miss.

Considering this information we really want to get the loads aligned
and not worry about the alignment of the stores. Microbenchmarks
confirm that this approach is much faster than the current unaligned
copy loop that uses shifts and rotates to ensure both loads and
stores are aligned.

We also want to try and do the stores in cacheline aligned, cacheline
sized chunks. If the store queue is unable to merge an entire
cacheline of stores then the L2 cache will have to do a
read/modify/write. Even worse, we will serialise this with the stores
in the next iteration of the copy loop since both iterations hit
the same cacheline.

Based on this, the new loop does the following things:


1 - 127 bytes
Get the source 8 byte aligned and use 8 byte loads and stores. Pretty
boring and similar to how the current loop works.

128 - 4095 bytes
Get the source 8 byte aligned and use 8 byte loads and stores,
1 cacheline at a time. We aren't doing the stores in cacheline
aligned chunks so we will potentially serialise once per cacheline.
Even so it is much better than the loop we have today.

4096 - bytes
If both source and destination have the same alignment get them both
16 byte aligned, then get the destination cacheline aligned. Do
cacheline sized loads and stores using VMX.

If source and destination do not have the same alignment, we get the
destination cacheline aligned, and use permute to do aligned loads.

In both cases the VMX loop should be optimal - we always do aligned
loads and stores and are always doing stores in cacheline aligned,
cacheline sized chunks.


The VMX breakpoint of 4096 bytes was chosen using this microbenchmark:

http://ozlabs.org/~anton/junkcode/copy_to_user.c

Since we are using VMX and there is a cost to saving and restoring
the user VMX state there are two broad cases we need to benchmark:

- Best case - userspace never uses VMX

- Worst case - userspace always uses VMX

In reality a userspace process will sit somewhere between these two
extremes. Since we need to test both aligned and unaligned copies we
end up with 4 combinations. The point at which the VMX loop begins to
win is:

0% VMX
aligned		2048 bytes
unaligned	2048 bytes

100% VMX
aligned		16384 bytes
unaligned	8192 bytes

Considering this is a microbenchmark, the data is hot in cache and
the VMX loop has better store queue merging properties we set the
breakpoint to 4096 bytes, a little below the unaligned breakpoints.

Some future optimisations we can look at:

- Looking at the perf data, a significant part of the cost when a task
  is always using VMX is the extra exception we take to restore the
  VMX state. As such we should do something similar to the x86
  optimisation that restores FPU state for heavy users. ie:

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

  and 

        /*
         * fpu_counter contains the number of consecutive context switches
         * that the FPU is used. If this is over a threshold, the lazy fpu
         * saving becomes unlazy to save the trap. This is an unsigned char
         * so that after 256 times the counter wraps and the behavior turns
         * lazy again; this to deal with bursty apps that only use FPU for
         * a short time
         */

- We could create a paca bit to mirror the VMX enabled MSR bit and check
  that first, avoiding multiple calls to calling enable_kernel_altivec.

- We could have two VMX breakpoints, one for when we know the user VMX
  state is loaded into the registers and one when it isn't. This could
  be a second bit in the paca so we can calculate the break points quickly.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-powerpc/arch/powerpc/lib/copyuser_64.S
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/copyuser_64.S	2011-06-17 14:05:30.013020235 +1000
+++ linux-powerpc/arch/powerpc/lib/copyuser_64.S	2011-06-17 14:27:43.026572962 +1000
@@ -11,6 +11,10 @@
 
 	.align	7
 _GLOBAL(__copy_tofrom_user)
+BEGIN_FTR_SECTION
+	b	__copy_tofrom_user_power7
+END_FTR_SECTION_IFSET(CPU_FTR_POWER7)
+_GLOBAL(__copy_tofrom_user_base)
 	/* first check for a whole page copy on a page boundary */
 	cmpldi	cr1,r5,16
 	cmpdi	cr6,r5,4096
Index: linux-powerpc/arch/powerpc/lib/copyuser_power7.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-powerpc/arch/powerpc/lib/copyuser_power7.S	2011-06-17 14:41:47.901277096 +1000
@@ -0,0 +1,654 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+	.macro err4
+400:
+	.section __ex_table,"a"
+	.align 3
+	.llong 400b,.Ldo_err4
+	.previous
+	.endm
+
+
+.Ldo_err2:
+	ld	r22,STK_REG(r22)(r1)
+	ld	r21,STK_REG(r21)(r1)
+	ld	r20,STK_REG(r20)(r1)
+	ld	r19,STK_REG(r19)(r1)
+	ld	r18,STK_REG(r18)(r1)
+	ld	r17,STK_REG(r17)(r1)
+.Ldo_err4:
+	ld	r16,STK_REG(r16)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r14,STK_REG(r14)(r1)
+.Ldo_err3:
+	addi	r1,r1,STACKFRAMESIZE
+.Ldo_err1:
+	ld	r3,48(r1)
+	ld	r4,56(r1)
+	ld	r5,64(r1)
+	b	__copy_tofrom_user_base
+
+
+_GLOBAL(__copy_tofrom_user_power7)
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,48(r1)
+	std	r4,56(r1)
+	std	r5,64(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+err1;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+	std	r17,STK_REG(r17)(r1)
+	std	r18,STK_REG(r18)(r1)
+	std	r19,STK_REG(r19)(r1)
+	std	r20,STK_REG(r20)(r1)
+	std	r21,STK_REG(r21)(r1)
+	std	r22,STK_REG(r22)(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+err2;	ld	r0,0(r4)
+err2;	ld	r6,8(r4)
+err2;	ld	r7,16(r4)
+err2;	ld	r8,24(r4)
+err2;	ld	r9,32(r4)
+err2;	ld	r10,40(r4)
+err2;	ld	r11,48(r4)
+err2;	ld	r12,56(r4)
+err2;	ld	r14,64(r4)
+err2;	ld	r15,72(r4)
+err2;	ld	r16,80(r4)
+err2;	ld	r17,88(r4)
+err2;	ld	r18,96(r4)
+err2;	ld	r19,104(r4)
+err2;	ld	r20,112(r4)
+err2;	ld	r21,120(r4)
+	addi	r4,r4,128
+err2;	std	r0,0(r3)
+err2;	std	r6,8(r3)
+err2;	std	r7,16(r3)
+err2;	std	r8,24(r3)
+err2;	std	r9,32(r3)
+err2;	std	r10,40(r3)
+err2;	std	r11,48(r3)
+err2;	std	r12,56(r3)
+err2;	std	r14,64(r3)
+err2;	std	r15,72(r3)
+err2;	std	r16,80(r3)
+err2;	std	r17,88(r3)
+err2;	std	r18,96(r3)
+err2;	std	r19,104(r3)
+err2;	std	r20,112(r3)
+err2;	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	ld	r17,STK_REG(r17)(r1)
+	ld	r18,STK_REG(r18)(r1)
+	ld	r19,STK_REG(r19)(r1)
+	ld	r20,STK_REG(r20)(r1)
+	ld	r21,STK_REG(r21)(r1)
+	ld	r22,STK_REG(r22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+err1;	ld	r7,16(r4)
+err1;	ld	r8,24(r4)
+err1;	ld	r9,32(r4)
+err1;	ld	r10,40(r4)
+err1;	ld	r11,48(r4)
+err1;	ld	r12,56(r4)
+	addi	r4,r4,64
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+err1;	std	r7,16(r3)
+err1;	std	r8,24(r3)
+err1;	std	r9,32(r3)
+err1;	std	r10,40(r3)
+err1;	std	r11,48(r3)
+err1;	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+err1;	ld	r7,16(r4)
+err1;	ld	r8,24(r4)
+	addi	r4,r4,32
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+err1;	std	r7,16(r3)
+err1;	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+	addi	r4,r4,16
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err1;	lwz	r6,4(r4)
+	addi	r4,r4,8
+err1;	stw	r0,0(r3)
+err1;	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err1;	lbz	r0,0(r4)
+err1;	stb	r0,0(r3)
+
+15:	li	r3,0
+	blr
+
+.Lvmx_copy:
+	mflr	r0
+	std	r4,56(r1)
+	std	r5,64(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enable_kernel_altivec
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	ld	r5,STACKFRAMESIZE+64(r1)
+	mtlr	r0
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+err3;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+err3;	ld	r0,0(r4)
+	addi	r4,r4,8
+err3;	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+err3;	lvx	vr1,r0,r4
+	addi	r4,r4,16
+err3;	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+err3;	lvx	vr1,r0,r4
+err3;	lvx	vr0,r4,r9
+	addi	r4,r4,32
+err3;	stvx	vr1,r0,r3
+err3;	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+err3;	lvx	vr3,r0,r4
+err3;	lvx	vr2,r4,r9
+err3;	lvx	vr1,r4,r10
+err3;	lvx	vr0,r4,r11
+	addi	r4,r4,64
+err3;	stvx	vr3,r0,r3
+err3;	stvx	vr2,r3,r9
+err3;	stvx	vr1,r3,r10
+err3;	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+err4;	lvx	vr7,r0,r4
+err4;	lvx	vr6,r4,r9
+err4;	lvx	vr5,r4,r10
+err4;	lvx	vr4,r4,r11
+err4;	lvx	vr3,r4,r12
+err4;	lvx	vr2,r4,r14
+err4;	lvx	vr1,r4,r15
+err4;	lvx	vr0,r4,r16
+	addi	r4,r4,128
+err4;	stvx	vr7,r0,r3
+err4;	stvx	vr6,r3,r9
+err4;	stvx	vr5,r3,r10
+err4;	stvx	vr4,r3,r11
+err4;	stvx	vr3,r3,r12
+err4;	stvx	vr2,r3,r14
+err4;	stvx	vr1,r3,r15
+err4;	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+err3;	lvx	vr3,r0,r4
+err3;	lvx	vr2,r4,r9
+err3;	lvx	vr1,r4,r10
+err3;	lvx	vr0,r4,r11
+	addi	r4,r4,64
+err3;	stvx	vr3,r0,r3
+err3;	stvx	vr2,r3,r9
+err3;	stvx	vr1,r3,r10
+err3;	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+err3;	lvx	vr1,r0,r4
+err3;	lvx	vr0,r4,r9
+	addi	r4,r4,32
+err3;	stvx	vr1,r0,r3
+err3;	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+err3;	lvx	vr1,r0,r4
+	addi	r4,r4,16
+err3;	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err3;	ld	r0,0(r4)
+	addi	r4,r4,8
+err3;	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err3;	lbz	r0,0(r4)
+err3;	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	li	r3,0
+	blr
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+err3;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err3;	lwz	r7,4(r4)
+	addi	r4,r4,8
+err3;	stw	r0,0(r3)
+err3;	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	lvsl	vr16,0,r4	/* Setup permute control vector */
+err3;	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+err3;	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+err3;	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+err3;	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+err3;	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+err3;	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+err3;	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+err3;	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+err3;	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+err3;	stvx	vr10,r3,r10
+err3;	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+err4;	lvx	vr7,r0,r4
+	vperm	vr8,vr0,vr7,vr16
+err4;	lvx	vr6,r4,r9
+	vperm	vr9,vr7,vr6,vr16
+err4;	lvx	vr5,r4,r10
+	vperm	vr10,vr6,vr5,vr16
+err4;	lvx	vr4,r4,r11
+	vperm	vr11,vr5,vr4,vr16
+err4;	lvx	vr3,r4,r12
+	vperm	vr12,vr4,vr3,vr16
+err4;	lvx	vr2,r4,r14
+	vperm	vr13,vr3,vr2,vr16
+err4;	lvx	vr1,r4,r15
+	vperm	vr14,vr2,vr1,vr16
+err4;	lvx	vr0,r4,r16
+	vperm	vr15,vr1,vr0,vr16
+	addi	r4,r4,128
+err4;	stvx	vr8,r0,r3
+err4;	stvx	vr9,r3,r9
+err4;	stvx	vr10,r3,r10
+err4;	stvx	vr11,r3,r11
+err4;	stvx	vr12,r3,r12
+err4;	stvx	vr13,r3,r14
+err4;	stvx	vr14,r3,r15
+err4;	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+err3;	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+err3;	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+err3;	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+err3;	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+err3;	stvx	vr10,r3,r10
+err3;	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+err3;	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+err3;	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+err3;	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+err3;	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err3;	lwz	r6,4(r4)
+	addi	r4,r4,8
+err3;	stw	r0,0(r3)
+err3;	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err3;	lbz	r0,0(r4)
+err3;	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	li	r3,0
+	blr
Index: linux-powerpc/arch/powerpc/lib/Makefile
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-06-17 14:27:42.396562026 +1000
+++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 14:27:43.026572962 +1000
@@ -17,7 +17,7 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
 			   checksum_wrappers_64.o hweight_64.o \
-			   copypage_power7.o memcpy_power7.o
+			   copypage_power7.o memcpy_power7.o copyuser_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o

^ permalink raw reply

* [PATCH 2/3] powerpc: POWER7 optimised memcpy using VMX
From: Anton Blanchard @ 2011-06-17  4:54 UTC (permalink / raw)
  To: benh, paulus, mikey; +Cc: linuxppc-dev
In-Reply-To: <20110617045358.544896830@samba.org>

Implement a POWER7 optimised memcpy using VMX. For large aligned
copies this new loop is over 10% faster and for large unaligned
copies it is over 200% faster.

On POWER7 unaligned stores rarely slow down - they only flush when
a store crosses a 4KB page boundary. Furthermore this flush is
handled completely in hardware and should be 20-30 cycles.

Unaligned loads on the other hand flush much more often - whenever
crossing a 128 byte cache line, or a 32 byte sector if either sector
is an L1 miss.

Considering this information we really want to get the loads aligned
and not worry about the alignment of the stores. Microbenchmarks
confirm that this approach is much faster than the current unaligned
copy loop that uses shifts and rotates to ensure both loads and
stores are aligned.

We also want to try and do the stores in cacheline aligned, cacheline
sized chunks. If the store queue is unable to merge an entire
cacheline of stores then the L2 cache will have to do a
read/modify/write. Even worse, we will serialise this with the stores
in the next iteration of the copy loop since both iterations hit
the same cacheline.

Based on this, the new loop does the following things:


1 - 127 bytes
Get the source 8 byte aligned and use 8 byte loads and stores. Pretty
boring and similar to how the current loop works.

128 - 4095 bytes
Get the source 8 byte aligned and use 8 byte loads and stores,
1 cacheline at a time. We aren't doing the stores in cacheline
aligned chunks so we will potentially serialise once per cacheline.
Even so it is much better than the loop we have today.

4096 - bytes
If both source and destination have the same alignment get them both
16 byte aligned, then get the destination cacheline aligned. Do
cacheline sized loads and stores using VMX.

If source and destination do not have the same alignment, we get the
destination cacheline aligned, and use permute to do aligned loads.

In both cases the VMX loop should be optimal - we always do aligned
loads and stores and are always doing stores in cacheline aligned,
cacheline sized chunks.


The VMX breakpoint of 4096 bytes was chosen using this microbenchmark:

http://ozlabs.org/~anton/junkcode/copy_to_user.c

(Note that the breakpoint analysis was done with the copy_tofrom_user
version of the loop and using varying sizes and alignments to read(). 
It's much easier to create a benchmark using read() that can control
the size and alignment of a kernel copy loop and synchronise it with
userspace doing optional VMX instructions).

Since we are using VMX and there is a cost to saving and restoring
the user VMX state there are two broad cases we need to benchmark:

- Best case - userspace never uses VMX

- Worst case - userspace always uses VMX

In reality a userspace process will sit somewhere between these two
extremes. Since we need to test both aligned and unaligned copies we
end up with 4 combinations. The point at which the VMX loop begins to
win is:

0% VMX
aligned		2048 bytes
unaligned	2048 bytes

100% VMX
aligned		16384 bytes
unaligned	8192 bytes

Considering this is a microbenchmark, the data is hot in cache and
the VMX loop has better store queue merging properties we set the
breakpoint to 4096 bytes, a little below the unaligned breakpoints.

Some future optimisations we can look at:

- Looking at the perf data, a significant part of the cost when a task
  is always using VMX is the extra exception we take to restore the
  VMX state. As such we should do something similar to the x86
  optimisation that restores FPU state for heavy users. ie:

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

  and 

        /*
         * fpu_counter contains the number of consecutive context switches
         * that the FPU is used. If this is over a threshold, the lazy fpu
         * saving becomes unlazy to save the trap. This is an unsigned char
         * so that after 256 times the counter wraps and the behavior turns
         * lazy again; this to deal with bursty apps that only use FPU for
         * a short time
         */

- We could create a paca bit to mirror the VMX enabled MSR bit and check
  that first, avoiding multiple calls to calling enable_kernel_altivec.

- We could have two VMX breakpoints, one for when we know the user VMX
  state is loaded into the registers and one when it isn't. This could
  be a second bit in the paca so we can calculate the break points quickly.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-powerpc/arch/powerpc/lib/Makefile
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-06-17 08:38:25.786110167 +1000
+++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 14:05:30.023020417 +1000
@@ -17,7 +17,7 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
 			   checksum_wrappers_64.o hweight_64.o \
-			   copypage_power7.o
+			   copypage_power7.o memcpy_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
Index: linux-powerpc/arch/powerpc/lib/memcpy_64.S
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/memcpy_64.S	2011-06-17 08:32:33.670110896 +1000
+++ linux-powerpc/arch/powerpc/lib/memcpy_64.S	2011-06-17 08:38:25.806110507 +1000
@@ -11,7 +11,11 @@
 
 	.align	7
 _GLOBAL(memcpy)
+BEGIN_FTR_SECTION
 	std	r3,48(r1)	/* save destination pointer for return value */
+FTR_SECTION_ELSE
+	b	memcpy_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POWER7)
 	PPC_MTOCRF	0x01,r5
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
Index: linux-powerpc/arch/powerpc/lib/memcpy_power7.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-powerpc/arch/powerpc/lib/memcpy_power7.S	2011-06-17 08:38:25.806110507 +1000
@@ -0,0 +1,596 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+_GLOBAL(memcpy_power7)
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+	std	r17,STK_REG(r17)(r1)
+	std	r18,STK_REG(r18)(r1)
+	std	r19,STK_REG(r19)(r1)
+	std	r20,STK_REG(r20)(r1)
+	std	r21,STK_REG(r21)(r1)
+	std	r22,STK_REG(r22)(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	ld	r14,64(r4)
+	ld	r15,72(r4)
+	ld	r16,80(r4)
+	ld	r17,88(r4)
+	ld	r18,96(r4)
+	ld	r19,104(r4)
+	ld	r20,112(r4)
+	ld	r21,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	std	r14,64(r3)
+	std	r15,72(r3)
+	std	r16,80(r3)
+	std	r17,88(r3)
+	std	r18,96(r3)
+	std	r19,104(r3)
+	std	r20,112(r3)
+	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	ld	r17,STK_REG(r17)(r1)
+	ld	r18,STK_REG(r18)(r1)
+	ld	r19,STK_REG(r19)(r1)
+	ld	r20,STK_REG(r20)(r1)
+	ld	r21,STK_REG(r21)(r1)
+	ld	r22,STK_REG(r22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	addi	r4,r4,64
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	addi	r4,r4,32
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	addi	r4,r4,16
+	std	r0,0(r3)
+	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	ld	r3,48(r1)
+	blr
+
+.Lvmx_copy:
+	mflr	r0
+	std	r4,56(r1)
+	std	r5,64(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enable_kernel_altivec
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	ld	r5,STACKFRAMESIZE+64(r1)
+	mtlr	r0
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r9
+	lvx	vr5,r4,r10
+	lvx	vr4,r4,r11
+	lvx	vr3,r4,r12
+	lvx	vr2,r4,r14
+	lvx	vr1,r4,r15
+	lvx	vr0,r4,r16
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r9
+	stvx	vr5,r3,r10
+	stvx	vr4,r3,r11
+	stvx	vr3,r3,r12
+	stvx	vr2,r3,r14
+	stvx	vr1,r3,r15
+	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	blr
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r7,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	lvsl	vr16,0,r4	/* Setup permute control vector */
+	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:	lvx	vr7,r0,r4
+	vperm	vr8,vr0,vr7,vr16
+	lvx	vr6,r4,r9
+	vperm	vr9,vr7,vr6,vr16
+	lvx	vr5,r4,r10
+	vperm	vr10,vr6,vr5,vr16
+	lvx	vr4,r4,r11
+	vperm	vr11,vr5,vr4,vr16
+	lvx	vr3,r4,r12
+	vperm	vr12,vr4,vr3,vr16
+	lvx	vr2,r4,r14
+	vperm	vr13,vr3,vr2,vr16
+	lvx	vr1,r4,r15
+	vperm	vr14,vr2,vr1,vr16
+	lvx	vr0,r4,r16
+	vperm	vr15,vr1,vr0,vr16
+	addi	r4,r4,128
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	stvx	vr12,r3,r12
+	stvx	vr13,r3,r14
+	stvx	vr14,r3,r15
+	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	blr

^ permalink raw reply

* [PATCH 1/3] powerpc: POWER7 optimised copy_page using VMX
From: Anton Blanchard @ 2011-06-17  4:53 UTC (permalink / raw)
  To: benh, paulus, mikey; +Cc: linuxppc-dev
In-Reply-To: <20110617045358.544896830@samba.org>

Implement a POWER7 optimised copy_page using VMX. We copy a cacheline
at a time using VMX loads and stores.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

How do we want to handle per machine optimised functions? I create
yet another feature bit, but feature bits might get out of control
at some point.

Index: linux-powerpc/arch/powerpc/include/asm/cputable.h
===================================================================
--- linux-powerpc.orig/arch/powerpc/include/asm/cputable.h	2011-06-06 08:07:35.128707749 +1000
+++ linux-powerpc/arch/powerpc/include/asm/cputable.h	2011-06-17 07:39:58.996165527 +1000
@@ -200,6 +200,7 @@ extern const char *powerpc_base_platform
 #define CPU_FTR_POPCNTB			LONG_ASM_CONST(0x0400000000000000)
 #define CPU_FTR_POPCNTD			LONG_ASM_CONST(0x0800000000000000)
 #define CPU_FTR_ICSWX			LONG_ASM_CONST(0x1000000000000000)
+#define CPU_FTR_POWER7			LONG_ASM_CONST(0x2000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -423,7 +424,7 @@ extern const char *powerpc_base_platform
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR)
+	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_POWER7)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
Index: linux-powerpc/arch/powerpc/lib/copypage_power7.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-powerpc/arch/powerpc/lib/copypage_power7.S	2011-06-17 07:39:58.996165527 +1000
@@ -0,0 +1,70 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+#define STACKFRAMESIZE	112
+
+_GLOBAL(copypage_power7)
+	mflr	r0
+	std	r3,48(r1)
+	std	r4,56(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+
+	bl	.enable_kernel_altivec
+
+	ld	r12,STACKFRAMESIZE+16(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	li	r0,(PAGE_SIZE/128)
+	li	r6,16
+	ld	r3,STACKFRAMESIZE+48(r1)
+	li	r7,32
+	li	r8,48
+	mtctr	r0
+	li	r9,64
+	li	r10,80
+	mtlr	r12
+	li	r11,96
+	li	r12,112
+	addi	r1,r1,STACKFRAMESIZE
+
+	.align	5
+1:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r6
+	lvx	vr5,r4,r7
+	lvx	vr4,r4,r8
+	lvx	vr3,r4,r9
+	lvx	vr2,r4,r10
+	lvx	vr1,r4,r11
+	lvx	vr0,r4,r12
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r6
+	stvx	vr5,r3,r7
+	stvx	vr4,r3,r8
+	stvx	vr3,r3,r9
+	stvx	vr2,r3,r10
+	stvx	vr1,r3,r11
+	stvx	vr0,r3,r12
+	addi	r3,r3,128
+	bdnz	1b
+
+	blr
Index: linux-powerpc/arch/powerpc/lib/Makefile
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/Makefile	2011-05-19 19:57:38.058570608 +1000
+++ linux-powerpc/arch/powerpc/lib/Makefile	2011-06-17 07:39:58.996165527 +1000
@@ -16,7 +16,8 @@ obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
-			   checksum_wrappers_64.o hweight_64.o
+			   checksum_wrappers_64.o hweight_64.o \
+			   copypage_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
Index: linux-powerpc/arch/powerpc/lib/copypage_64.S
===================================================================
--- linux-powerpc.orig/arch/powerpc/lib/copypage_64.S	2011-06-06 08:07:35.000000000 +1000
+++ linux-powerpc/arch/powerpc/lib/copypage_64.S	2011-06-17 07:39:58.996165527 +1000
@@ -17,7 +17,11 @@ PPC64_CACHES:
         .section        ".text"
 
 _GLOBAL(copy_page)
+BEGIN_FTR_SECTION
 	lis	r5,PAGE_SIZE@h
+FTR_SECTION_ELSE
+        b       .copypage_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POWER7)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
 	ld      r10,PPC64_CACHES@toc(r2)

^ permalink raw reply

* [PATCH 0/3] POWER7 optimised copy loops
From: Anton Blanchard @ 2011-06-17  4:53 UTC (permalink / raw)
  To: benh, paulus, mikey; +Cc: linuxppc-dev

Here are POWER7 optimised versions of copy_page, memcpy and
copy_tofrom_user.

Anton

^ permalink raw reply

* Re: [RFC PATCH V1 5/7] cpuidle: (POWER) cpuidle driver for pSeries
From: Benjamin Herrenschmidt @ 2011-06-17  4:36 UTC (permalink / raw)
  To: Trinabh Gupta; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <20110607163012.6848.19268.stgit@tringupt.in.ibm.com>

On Tue, 2011-06-07 at 22:00 +0530, Trinabh Gupta wrote:

> +static int snooze_loop(struct cpuidle_device *dev,
> +			struct cpuidle_driver *drv,
> +			int index)
> +{
> +	unsigned long in_purr, out_purr;
> +	ktime_t kt_before, kt_after;
> +	s64 usec_delta;
> +
> +	/*
> +	 * Indicate to the HV that we are idle. Now would be
> +	 * a good time to find other work to dispatch.
> +	 */
> +	get_lppaca()->idle = 1;
> +	get_lppaca()->donate_dedicated_cpu = 1;
> +	in_purr = mfspr(SPRN_PURR);
> +
> +	kt_before = ktime_get_real();

Don't you want to timestamp before you tell the HV that you are idle ?
Or is the above stuff only polled by phyp when partition interrupts are
enabled ?

> +	local_irq_enable();
> +	set_thread_flag(TIF_POLLING_NRFLAG);
> +	while (!need_resched()) {
> +		ppc64_runlatch_off();
> +		HMT_low();
> +		HMT_very_low();
> +	}
> +	HMT_medium();
> +	clear_thread_flag(TIF_POLLING_NRFLAG);
> +	smp_mb();
> +	local_irq_disable();
> +
> +	kt_after = ktime_get_real();
> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
> +
> +	out_purr = mfspr(SPRN_PURR);
> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
> +	get_lppaca()->donate_dedicated_cpu = 0;
> +	get_lppaca()->idle = 0;
> +
> +	dev->last_residency = (int)usec_delta;
> +
> +	return index;
> +}
> +
> +static int dedicated_cede_loop(struct cpuidle_device *dev,
> +				struct cpuidle_driver *drv,
> +				int index)
> +{
> +	unsigned long in_purr, out_purr;
> +	ktime_t kt_before, kt_after;
> +	s64 usec_delta;
> +
> +	/*
> +	 * Indicate to the HV that we are idle. Now would be
> +	 * a good time to find other work to dispatch.
> +	 */
> +	get_lppaca()->idle = 1;
> +	get_lppaca()->donate_dedicated_cpu = 1;
> +	in_purr = mfspr(SPRN_PURR);
> +
> +	kt_before = ktime_get_real();

There's a bit too much code duplication for my taste here between the
two functions. Not sure if it can be helped, maybe with some inlines
for the prolog/epilogue ... Looks like stuff that's easy to "fix" in one
place and forget the other...

> +	ppc64_runlatch_off();
> +	HMT_medium();
> +	cede_processor();
> +
> +	kt_after = ktime_get_real();
> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
> +
> +	out_purr = mfspr(SPRN_PURR);
> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
> +	get_lppaca()->donate_dedicated_cpu = 0;
> +	get_lppaca()->idle = 0;
> +
> +	dev->last_residency = (int)usec_delta;
> +
> +	return index;
> +}
> +
> +static int shared_cede_loop(struct cpuidle_device *dev,
> +			struct cpuidle_driver *drv,
> +			int index)
> +{
> +	unsigned long in_purr, out_purr;
> +	ktime_t kt_before, kt_after;
> +	s64 usec_delta;
> +
> +	/*
> +	 * Indicate to the HV that we are idle. Now would be
> +	 * a good time to find other work to dispatch.
> +	 */
> +	get_lppaca()->idle = 1;
> +	get_lppaca()->donate_dedicated_cpu = 1;
> +	in_purr = mfspr(SPRN_PURR);
> +
> +	kt_before = ktime_get_real();
> +	/*
> +	 * Yield the processor to the hypervisor.  We return if
> +	 * an external interrupt occurs (which are driven prior
> +	 * to returning here) or if a prod occurs from another
> +	 * processor. When returning here, external interrupts
> +	 * are enabled.
> +	 */
> +	cede_processor();
> +
> +	kt_after = ktime_get_real();
> +
> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
> +
> +	out_purr = mfspr(SPRN_PURR);
> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
> +	get_lppaca()->donate_dedicated_cpu = 0;
> +	get_lppaca()->idle = 0;
> +
> +	dev->last_residency = (int)usec_delta;
> +
> +	return index;
> +}
> +
> +/*
> + * States for dedicated partition case.
> + */
> +static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
> +	{ /* Snooze */
> +		.name = "snooze",
> +		.desc = "snooze",
> +		.flags = CPUIDLE_FLAG_TIME_VALID,
> +		.exit_latency = 0,
> +		.target_residency = 0,
> +		.enter = &snooze_loop },
> +	{ /* CEDE */
> +		.name = "CEDE",
> +		.desc = "CEDE",
> +		.flags = CPUIDLE_FLAG_TIME_VALID,
> +		.exit_latency = 1,
> +		.target_residency = 10,
> +		.enter = &dedicated_cede_loop },
> +};
> +
> +/*
> + * States for shared partition case.
> + */
> +static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
> +	{ /* Shared Cede */
> +		.name = "Shared Cede",
> +		.desc = "Shared Cede",
> +		.flags = CPUIDLE_FLAG_TIME_VALID,
> +		.exit_latency = 0,
> +		.target_residency = 0,
> +		.enter = &shared_cede_loop },
> +};
> +
> +int pseries_notify_cpuidle_add_cpu(int cpu)
> +{
> +	struct cpuidle_device *dev =
> +			per_cpu_ptr(pseries_idle_cpuidle_devices, cpu);
> +	if (dev && cpuidle_get_driver()) {
> +		cpuidle_disable_device(dev);
> +		cpuidle_enable_device(dev);
> +	}
> +	return 0;
> +}
> +
> +/*
> + * pseries_idle_cpuidle_driver_init()
> + */
> +static int pseries_idle_cpuidle_driver_init(void)
> +{
> +	int cstate;
> +	struct cpuidle_driver *drv = &pseries_idle_driver;
> +
> +	drv->state_count = 0;
> +
> +	for (cstate = 0; cstate < MAX_IDLE_STATE_COUNT; ++cstate) {
> +
> +		if (cstate > max_cstate)
> +			break;
> +
> +		/* is the state not enabled? */
> +		if (cpuidle_state_table[cstate].enter == NULL)
> +			continue;
> +
> +		drv->states[drv->state_count] =	/* structure copy */
> +			cpuidle_state_table[cstate];
> +
> +		if (cpuidle_state_table == dedicated_states)
> +			drv->states[drv->state_count].target_residency =
> +				__get_cpu_var(smt_snooze_delay);
> +
> +		drv->state_count += 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/* pseries_idle_devices_uninit(void)
> + * unregister cpuidle devices and de-allocate memory
> + */
> +static void pseries_idle_devices_uninit(void)
> +{
> +	int i;
> +	struct cpuidle_device *dev;
> +
> +	for_each_possible_cpu(i) {
> +		dev = per_cpu_ptr(pseries_idle_cpuidle_devices, i);
> +		cpuidle_unregister_device(dev);
> +	}
> +
> +	free_percpu(pseries_idle_cpuidle_devices);
> +	return;
> +}
> +
> +/* pseries_idle_devices_init()
> + * allocate, initialize and register cpuidle device
> + */
> +static int pseries_idle_devices_init(void)
> +{
> +	int i;
> +	struct cpuidle_driver *drv = &pseries_idle_driver;
> +	struct cpuidle_device *dev;
> +
> +	pseries_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
> +	if (pseries_idle_cpuidle_devices == NULL)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(i) {
> +		dev = per_cpu_ptr(pseries_idle_cpuidle_devices, i);
> +		dev->state_count = drv->state_count;
> +		dev->cpu = i;
> +		if (cpuidle_register_device(dev)) {
> +			printk(KERN_DEBUG "cpuidle_register_device %d failed!\n",
> +				 i);
> +			return -EIO;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * pseries_idle_probe()
> + * Choose state table for shared versus dedicated partition
> + */
> +static int pseries_idle_probe(void)
> +{
> +	if (max_cstate == 0) {
> +		printk(KERN_DEBUG "pseries processor idle disabled.\n");
> +		return -EPERM;
> +	}
> +
> +	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
> +		printk(KERN_DEBUG "Using default idle\n");
> +		return -ENODEV;
> +	}
> +
> +	if (get_lppaca()->shared_proc)
> +		cpuidle_state_table = shared_states;
> +	else
> +		cpuidle_state_table = dedicated_states;
> +
> +	return 0;
> +}
> +
> +static int __init pseries_processor_idle_init(void)
> +{
> +	int retval;
> +
> +	retval = pseries_idle_probe();
> +	if (retval)
> +		return retval;
> +
> +	pseries_idle_cpuidle_driver_init();
> +	retval = cpuidle_register_driver(&pseries_idle_driver);
> +	if (retval) {
> +		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
> +		return retval;
> +	}
> +
> +	retval = pseries_idle_devices_init();
> +	if (retval) {
> +		pseries_idle_devices_uninit();
> +		cpuidle_unregister_driver(&pseries_idle_driver);
> +		return retval;
> +	}
> +
> +	printk(KERN_DEBUG "pseries_idle_driver registered\n");
> +
> +	return 0;
> +}
> +
> +device_initcall(pseries_processor_idle_init);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index e9f6d28..7c60380 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -56,4 +56,7 @@ extern struct device_node *dlpar_configure_connector(u32);
>  extern int dlpar_attach_node(struct device_node *);
>  extern int dlpar_detach_node(struct device_node *);
>  
> +/* Snooze Delay, pseries_idle */
> +DECLARE_PER_CPU(long, smt_snooze_delay);
> +
>  #endif /* _PSERIES_PSERIES_H */
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 593acce..6893a0c 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -584,9 +584,6 @@ static int __init pSeries_probe(void)
>  	return 1;
>  }
>  
> -
> -DECLARE_PER_CPU(long, smt_snooze_delay);
> -
>  static void pseries_dedicated_idle_sleep(void)
>  { 
>  	unsigned int cpu = smp_processor_id();
> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
> index fbffd7e..2e46883 100644
> --- a/arch/powerpc/platforms/pseries/smp.c
> +++ b/arch/powerpc/platforms/pseries/smp.c
> @@ -150,6 +150,7 @@ static void __devinit smp_xics_setup_cpu(int cpu)
>  	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
>  	set_default_offline_state(cpu);
>  #endif
> +	pseries_notify_cpuidle_add_cpu(cpu);
>  }
>  
>  static int __devinit smp_pSeries_kick_cpu(int nr)
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* Re: [RFC PATCH V1 4/7] cpuidle: (powerpc) Add cpu_idle_wait() to allow switching idle routines
From: Benjamin Herrenschmidt @ 2011-06-17  4:32 UTC (permalink / raw)
  To: Trinabh Gupta; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <20110607162959.6848.26918.stgit@tringupt.in.ibm.com>

On Tue, 2011-06-07 at 22:00 +0530, Trinabh Gupta wrote:

> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
> index 39a2baa..932392b 100644
> --- a/arch/powerpc/kernel/idle.c
> +++ b/arch/powerpc/kernel/idle.c
> @@ -102,6 +102,24 @@ void cpu_idle(void)
>  	}
>  }
>  
> +static void do_nothing(void *unused)
> +{
> +}
> +
> +/*
> + * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
> + * idle loop and start using the new idle loop.
> + * Required while changing idle handler on SMP systems.
> + * Caller must have changed idle handler to the new value before the call.
> + */
> +void cpu_idle_wait(void)
> +{
> +	smp_mb();
> +	/* kick all the CPUs so that they exit out of old idle routine */
> +	smp_call_function(do_nothing, NULL, 1);
> +}
> +EXPORT_SYMBOL_GPL(cpu_idle_wait);
> +
>  int powersave_nap;
>  
>  #ifdef CONFIG_SYSCTL

This is gross :-)

Do you need to absolutely ensure the idle task has changed or just
kicking it with a send reschedule is enough ?

Cheers,
Ben.

^ permalink raw reply

* Re: [RFC PATCH V1 1/7] cpuidle: create bootparam "cpuidle.off=1"
From: Benjamin Herrenschmidt @ 2011-06-17  4:29 UTC (permalink / raw)
  To: Trinabh Gupta; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <20110607162922.6848.71783.stgit@tringupt.in.ibm.com>

On Tue, 2011-06-07 at 21:59 +0530, Trinabh Gupta wrote:
> From: Len Brown <len.brown@intel.com>
> 
> useful for disabling cpuidle to fall back
> to architecture-default idle loop
> 
> cpuidle drivers and governors will fail to register.
> on x86 they'll say so:
> 
> intel_idle: intel_idle yielding to (null)
> ACPI: acpi_idle yielding to (null)
> 
> Signed-off-by: Len Brown <len.brown@intel.com>
> ---

When you carry over somebody's patch like this you need to also add your
own signed-off-by.

Have those generic changes been reviewed by whoever is in charge of that
cpuidle framework ?

Cheers,
Ben.

>  Documentation/kernel-parameters.txt |    3 +++
>  drivers/cpuidle/cpuidle.c           |   10 ++++++++++
>  drivers/cpuidle/cpuidle.h           |    1 +
>  drivers/cpuidle/driver.c            |    3 +++
>  drivers/cpuidle/governor.c          |    3 +++
>  5 files changed, 20 insertions(+), 0 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index d9a203b..5697faf 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -546,6 +546,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>  			/proc/<pid>/coredump_filter.
>  			See also Documentation/filesystems/proc.txt.
>  
> +	cpuidle.off=1	[CPU_IDLE]
> +			disable the cpuidle sub-system
> +
>  	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
>  			Format:
>  			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 406be83..a171b9e 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -28,6 +28,12 @@ LIST_HEAD(cpuidle_detected_devices);
>  static void (*pm_idle_old)(void);
>  
>  static int enabled_devices;
> +static int off __read_mostly;
> +
> +int cpuidle_disabled(void)
> +{
> +	return off;
> +}
>  
>  #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
>  static void cpuidle_kick_cpus(void)
> @@ -397,6 +403,9 @@ static int __init cpuidle_init(void)
>  {
>  	int ret;
>  
> +	if (cpuidle_disabled())
> +		return -ENODEV;
> +
>  	pm_idle_old = pm_idle;
>  
>  	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
> @@ -408,4 +417,5 @@ static int __init cpuidle_init(void)
>  	return 0;
>  }
>  
> +module_param(off, int, 0444);
>  core_initcall(cpuidle_init);
> diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
> index 33e50d5..38c3fd8 100644
> --- a/drivers/cpuidle/cpuidle.h
> +++ b/drivers/cpuidle/cpuidle.h
> @@ -13,6 +13,7 @@ extern struct list_head cpuidle_governors;
>  extern struct list_head cpuidle_detected_devices;
>  extern struct mutex cpuidle_lock;
>  extern spinlock_t cpuidle_driver_lock;
> +extern int cpuidle_disabled(void);
>  
>  /* idle loop */
>  extern void cpuidle_install_idle_handler(void);
> diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
> index 33e3189..284d7af 100644
> --- a/drivers/cpuidle/driver.c
> +++ b/drivers/cpuidle/driver.c
> @@ -50,6 +50,9 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
>  	if (!drv)
>  		return -EINVAL;
>  
> +	if (cpuidle_disabled())
> +		return -ENODEV;
> +
>  	spin_lock(&cpuidle_driver_lock);
>  	if (cpuidle_curr_driver) {
>  		spin_unlock(&cpuidle_driver_lock);
> diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
> index 724c164..ea2f8e7 100644
> --- a/drivers/cpuidle/governor.c
> +++ b/drivers/cpuidle/governor.c
> @@ -81,6 +81,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
>  	if (!gov || !gov->select)
>  		return -EINVAL;
>  
> +	if (cpuidle_disabled())
> +		return -ENODEV;
> +
>  	mutex_lock(&cpuidle_lock);
>  	if (__cpuidle_find_governor(gov->name) == NULL) {
>  		ret = 0;
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* Re: [PATCH] perf_events: Enable idle state tracing for pseries (ppc64)
From: Benjamin Herrenschmidt @ 2011-06-17  4:24 UTC (permalink / raw)
  To: deepthi; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <20110601123554.GA12492@deepthi.in.ibm.com>

On Wed, 2011-06-01 at 18:05 +0530, Deepthi Dharwar wrote:
> Hi,
> 
> Please find below a patch, which has perf_events added for pseries (ppc64)
> platform in order to emit the trace required for perf timechart. 
> It essentially enables perf timechart for pseries platfrom to analyse
> power savings events like cpuidle states.

Unless I'm mistaken, you added traces to dedicated CPU idle sleep but
not shared processor. Any reason ?

Also I don't really know that tracing stuff but what's the point of
having start/end _and trace_cpu_idle if you're going to always start &
end around a single occurence of trace_cpu_idle ?

Wouldn't there be a way to start/end and then trace the snooze and
subsequent cede within the same start/end section or that makes no
sense ?

Also would there be any interest in doing the tracing more generically
in idle.c ?

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] Add cpufreq driver for Momentum Maple boards
From: Benjamin Herrenschmidt @ 2011-06-17  4:14 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov; +Cc: Dave Jones, Paul Mackerras, linuxppc-dev, cpufreq
In-Reply-To: <1305973726-28006-1-git-send-email-dbaryshkov@gmail.com>

On Sat, 2011-05-21 at 14:28 +0400, Dmitry Eremin-Solenikov wrote:
> Add simple cpufreq driver for Maple-based boards (ppc970fx evaluation
> kit and others). Driver is based on a cpufreq driver for 64-bit powermac
> boxes with all pmac-dependant features removed and simple cleanup
> applied.

No special comment other than please replace all the g5_* with maple_
for consistency.

Cheers,
Ben.

> Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
> ---
>  arch/powerpc/kernel/misc_64.S          |    4 +-
>  arch/powerpc/platforms/Kconfig         |    8 +
>  arch/powerpc/platforms/maple/Makefile  |    1 +
>  arch/powerpc/platforms/maple/cpufreq.c |  317 ++++++++++++++++++++++++++++++++
>  4 files changed, 328 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/platforms/maple/cpufreq.c
> 
> diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
> index 206a321..c442aae 100644
> --- a/arch/powerpc/kernel/misc_64.S
> +++ b/arch/powerpc/kernel/misc_64.S
> @@ -339,7 +339,7 @@ _GLOBAL(real_205_writeb)
>  #endif /* CONFIG_PPC_PASEMI */
>  
> 
> -#ifdef CONFIG_CPU_FREQ_PMAC64
> +#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE)
>  /*
>   * SCOM access functions for 970 (FX only for now)
>   *
> @@ -408,7 +408,7 @@ _GLOBAL(scom970_write)
>  	/* restore interrupts */
>  	mtmsrd	r5,1
>  	blr
> -#endif /* CONFIG_CPU_FREQ_PMAC64 */
> +#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */
>  
> 
>  /*
> diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
> index f7b0772..4c5eb5b 100644
> --- a/arch/powerpc/platforms/Kconfig
> +++ b/arch/powerpc/platforms/Kconfig
> @@ -187,6 +187,14 @@ config PPC_PASEMI_CPUFREQ
>  	  This adds the support for frequency switching on PA Semi
>  	  PWRficient processors.
>  
> +config CPU_FREQ_MAPLE
> +	bool "Support for Maple 970FX Evaluation Board"
> +	depends on PPC_MAPLE
> +	select CPU_FREQ_TABLE
> +	help
> +	  This adds support for frequency switching on Maple 970FX
> +	  Evaluation Board and compatible boards (IBM JS2x blades).
> +
>  endmenu
>  
>  config PPC601_SYNC_FIX
> diff --git a/arch/powerpc/platforms/maple/Makefile b/arch/powerpc/platforms/maple/Makefile
> index 1be1a99..0b3e3e3 100644
> --- a/arch/powerpc/platforms/maple/Makefile
> +++ b/arch/powerpc/platforms/maple/Makefile
> @@ -1 +1,2 @@
>  obj-y	+= setup.o pci.o time.o
> +obj-$(CONFIG_CPU_FREQ_MAPLE) += cpufreq.o
> diff --git a/arch/powerpc/platforms/maple/cpufreq.c b/arch/powerpc/platforms/maple/cpufreq.c
> new file mode 100644
> index 0000000..854adfa
> --- /dev/null
> +++ b/arch/powerpc/platforms/maple/cpufreq.c
> @@ -0,0 +1,317 @@
> +/*
> + *  Copyright (C) 2011 Dmitry Eremin-Solenikov
> + *  Copyright (C) 2002 - 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>
> + *  and                       Markus Demleitner <msdemlei@cl.uni-heidelberg.de>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This driver adds basic cpufreq support for SMU & 970FX based G5 Macs,
> + * that is iMac G5 and latest single CPU desktop.
> + */
> +
> +#undef DEBUG
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/delay.h>
> +#include <linux/sched.h>
> +#include <linux/cpufreq.h>
> +#include <linux/init.h>
> +#include <linux/completion.h>
> +#include <linux/mutex.h>
> +#include <asm/prom.h>
> +#include <asm/machdep.h>
> +#include <asm/irq.h>
> +#include <asm/sections.h>
> +#include <asm/cputable.h>
> +#include <asm/time.h>
> +
> +#define DBG(fmt...) pr_debug(fmt)
> +
> +/* see 970FX user manual */
> +
> +#define SCOM_PCR 0x0aa001			/* PCR scom addr */
> +
> +#define PCR_HILO_SELECT		0x80000000U	/* 1 = PCR, 0 = PCRH */
> +#define PCR_SPEED_FULL		0x00000000U	/* 1:1 speed value */
> +#define PCR_SPEED_HALF		0x00020000U	/* 1:2 speed value */
> +#define PCR_SPEED_QUARTER	0x00040000U	/* 1:4 speed value */
> +#define PCR_SPEED_MASK		0x000e0000U	/* speed mask */
> +#define PCR_SPEED_SHIFT		17
> +#define PCR_FREQ_REQ_VALID	0x00010000U	/* freq request valid */
> +#define PCR_VOLT_REQ_VALID	0x00008000U	/* volt request valid */
> +#define PCR_TARGET_TIME_MASK	0x00006000U	/* target time */
> +#define PCR_STATLAT_MASK	0x00001f00U	/* STATLAT value */
> +#define PCR_SNOOPLAT_MASK	0x000000f0U	/* SNOOPLAT value */
> +#define PCR_SNOOPACC_MASK	0x0000000fU	/* SNOOPACC value */
> +
> +#define SCOM_PSR 0x408001			/* PSR scom addr */
> +/* warning: PSR is a 64 bits register */
> +#define PSR_CMD_RECEIVED	0x2000000000000000U   /* command received */
> +#define PSR_CMD_COMPLETED	0x1000000000000000U   /* command completed */
> +#define PSR_CUR_SPEED_MASK	0x0300000000000000U   /* current speed */
> +#define PSR_CUR_SPEED_SHIFT	(56)
> +
> +/*
> + * The G5 only supports two frequencies (Quarter speed is not supported)
> + */
> +#define CPUFREQ_HIGH                  0
> +#define CPUFREQ_LOW                   1
> +
> +static struct cpufreq_frequency_table g5_cpu_freqs[] = {
> +	{CPUFREQ_HIGH, 		0},
> +	{CPUFREQ_LOW,		0},
> +	{0,			CPUFREQ_TABLE_END},
> +};
> +
> +static struct freq_attr* g5_cpu_freqs_attr[] = {
> +	&cpufreq_freq_attr_scaling_available_freqs,
> +	NULL,
> +};
> +
> +/* Power mode data is an array of the 32 bits PCR values to use for
> + * the various frequencies, retrieved from the device-tree
> + */
> +static int g5_pmode_cur;
> +
> +static DEFINE_MUTEX(g5_switch_mutex);
> +
> +static const u32 *g5_pmode_data;
> +static int g5_pmode_max;
> +
> +/*
> + * Fake voltage switching for platforms with missing support
> + */
> +
> +static void g5_dummy_switch_volt(int speed_mode)
> +{
> +}
> +
> +/*
> + * SCOM based frequency switching for 970FX rev3
> + */
> +static int g5_scom_switch_freq(int speed_mode)
> +{
> +	unsigned long flags;
> +	int to;
> +
> +	/* If frequency is going up, first ramp up the voltage */
> +	if (speed_mode < g5_pmode_cur)
> +		g5_dummy_switch_volt(speed_mode);
> +
> +	local_irq_save(flags);
> +
> +	/* Clear PCR high */
> +	scom970_write(SCOM_PCR, 0);
> +	/* Clear PCR low */
> +	scom970_write(SCOM_PCR, PCR_HILO_SELECT | 0);
> +	/* Set PCR low */
> +	scom970_write(SCOM_PCR, PCR_HILO_SELECT |
> +		      g5_pmode_data[speed_mode]);
> +
> +	/* Wait for completion */
> +	for (to = 0; to < 10; to++) {
> +		unsigned long psr = scom970_read(SCOM_PSR);
> +
> +		if ((psr & PSR_CMD_RECEIVED) == 0 &&
> +		    (((psr >> PSR_CUR_SPEED_SHIFT) ^
> +		      (g5_pmode_data[speed_mode] >> PCR_SPEED_SHIFT)) & 0x3)
> +		    == 0)
> +			break;
> +		if (psr & PSR_CMD_COMPLETED)
> +			break;
> +		udelay(100);
> +	}
> +
> +	local_irq_restore(flags);
> +
> +	/* If frequency is going down, last ramp the voltage */
> +	if (speed_mode > g5_pmode_cur)
> +		g5_dummy_switch_volt(speed_mode);
> +
> +	g5_pmode_cur = speed_mode;
> +	ppc_proc_freq = g5_cpu_freqs[speed_mode].frequency * 1000ul;
> +
> +	return 0;
> +}
> +
> +static int g5_scom_query_freq(void)
> +{
> +	unsigned long psr = scom970_read(SCOM_PSR);
> +	int i;
> +
> +	for (i = 0; i <= g5_pmode_max; i++)
> +		if ((((psr >> PSR_CUR_SPEED_SHIFT) ^
> +		      (g5_pmode_data[i] >> PCR_SPEED_SHIFT)) & 0x3) == 0)
> +			break;
> +	return i;
> +}
> +
> +/*
> + * Common interface to the cpufreq core
> + */
> +
> +static int g5_cpufreq_verify(struct cpufreq_policy *policy)
> +{
> +	return cpufreq_frequency_table_verify(policy, g5_cpu_freqs);
> +}
> +
> +static int g5_cpufreq_target(struct cpufreq_policy *policy,
> +	unsigned int target_freq, unsigned int relation)
> +{
> +	unsigned int newstate = 0;
> +	struct cpufreq_freqs freqs;
> +	int rc;
> +
> +	if (cpufreq_frequency_table_target(policy, g5_cpu_freqs,
> +			target_freq, relation, &newstate))
> +		return -EINVAL;
> +
> +	if (g5_pmode_cur == newstate)
> +		return 0;
> +
> +	mutex_lock(&g5_switch_mutex);
> +
> +	freqs.old = g5_cpu_freqs[g5_pmode_cur].frequency;
> +	freqs.new = g5_cpu_freqs[newstate].frequency;
> +	freqs.cpu = 0;
> +
> +	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
> +	rc = g5_scom_switch_freq(newstate);
> +	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
> +
> +	mutex_unlock(&g5_switch_mutex);
> +
> +	return rc;
> +}
> +
> +static unsigned int g5_cpufreq_get_speed(unsigned int cpu)
> +{
> +	return g5_cpu_freqs[g5_pmode_cur].frequency;
> +}
> +
> +static int g5_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +	policy->cpuinfo.transition_latency = 12000;
> +	policy->cur = g5_cpu_freqs[g5_scom_query_freq()].frequency;
> +	/* secondary CPUs are tied to the primary one by the
> +	 * cpufreq core if in the secondary policy we tell it that
> +	 * it actually must be one policy together with all others. */
> +	cpumask_copy(policy->cpus, cpu_online_mask);
> +	cpufreq_frequency_table_get_attr(g5_cpu_freqs, policy->cpu);
> +
> +	return cpufreq_frequency_table_cpuinfo(policy,
> +		g5_cpu_freqs);
> +}
> +
> +
> +static struct cpufreq_driver g5_cpufreq_driver = {
> +	.name		= "maple",
> +	.owner		= THIS_MODULE,
> +	.flags		= CPUFREQ_CONST_LOOPS,
> +	.init		= g5_cpufreq_cpu_init,
> +	.verify		= g5_cpufreq_verify,
> +	.target		= g5_cpufreq_target,
> +	.get		= g5_cpufreq_get_speed,
> +	.attr		= g5_cpu_freqs_attr,
> +};
> +
> +static int __init g5_cpufreq_init(void)
> +{
> +	struct device_node *cpus;
> +	struct device_node *cpunode;
> +	unsigned int psize;
> +	unsigned long max_freq;
> +	const u32 *valp;
> +	u32 pvr_hi;
> +	int rc = -ENODEV;
> +
> +	cpus = of_find_node_by_path("/cpus");
> +	if (cpus == NULL) {
> +		DBG("No /cpus node !\n");
> +		return -ENODEV;
> +	}
> +
> +	/* Get first CPU node */
> +	for (cpunode = NULL;
> +	     (cpunode = of_get_next_child(cpus, cpunode)) != NULL;) {
> +		const u32 *reg = of_get_property(cpunode, "reg", NULL);
> +		if (reg == NULL || (*reg) != 0)
> +			continue;
> +		if (!strcmp(cpunode->type, "cpu"))
> +			break;
> +	}
> +	if (cpunode == NULL) {
> +		printk(KERN_ERR "cpufreq: Can't find any CPU 0 node\n");
> +		goto bail_cpus;
> +	}
> +
> +	/* Check 970FX for now */
> +	/* we actually don't care on which CPU to access PVR */
> +	pvr_hi = PVR_VER(mfspr(SPRN_PVR));
> +	if (pvr_hi != 0x3c && pvr_hi != 0x44) {
> +		printk(KERN_ERR "cpufreq: Unsupported CPU version (%x)\n", pvr_hi);
> +		goto bail_noprops;
> +	}
> +
> +	/* Look for the powertune data in the device-tree */
> +	g5_pmode_data = of_get_property(cpunode, "power-mode-data",&psize);
> +	if (!g5_pmode_data) {
> +		DBG("No power-mode-data !\n");
> +		goto bail_noprops;
> +	}
> +	g5_pmode_max = psize / sizeof(u32) - 1;
> +
> +	/*
> +	 * From what I see, clock-frequency is always the maximal frequency.
> +	 * The current driver can not slew sysclk yet, so we really only deal
> +	 * with powertune steps for now. We also only implement full freq and
> +	 * half freq in this version. So far, I haven't yet seen a machine
> +	 * supporting anything else.
> +	 */
> +	valp = of_get_property(cpunode, "clock-frequency", NULL);
> +	if (!valp)
> +		return -ENODEV;
> +	max_freq = (*valp)/1000;
> +	g5_cpu_freqs[0].frequency = max_freq;
> +	g5_cpu_freqs[1].frequency = max_freq/2;
> +
> +	/* Force apply current frequency to make sure everything is in
> +	 * sync (voltage is right for example). Firmware may leave us with
> +	 * a strange setting ...
> +	 */
> +	g5_dummy_switch_volt(CPUFREQ_HIGH);
> +	msleep(10);
> +	g5_pmode_cur = -1;
> +	g5_scom_switch_freq(g5_scom_query_freq());
> +
> +	printk(KERN_INFO "Registering G5 CPU frequency driver\n");
> +	printk(KERN_INFO "Frequency method: SCOM, Voltage method: none\n");
> +	printk(KERN_INFO "Low: %d Mhz, High: %d Mhz, Cur: %d MHz\n",
> +		g5_cpu_freqs[1].frequency/1000,
> +		g5_cpu_freqs[0].frequency/1000,
> +		g5_cpu_freqs[g5_pmode_cur].frequency/1000);
> +
> +	rc = cpufreq_register_driver(&g5_cpufreq_driver);
> +
> +	of_node_put(cpunode);
> +	of_node_put(cpus);
> +
> +	return rc;
> +
> +bail_noprops:
> +	of_node_put(cpunode);
> +bail_cpus:
> +	of_node_put(cpus);
> +
> +	return rc;
> +}
> +
> +module_init(g5_cpufreq_init);
> +
> +
> +MODULE_LICENSE("GPL");

^ permalink raw reply

* Re: [PATCH] powerpc/book3e-64: use a separate TLB handler when linear map is bolted
From: Benjamin Herrenschmidt @ 2011-06-17  2:00 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev
In-Reply-To: <20110603221232.GA29809@schlenkerla.am.freescale.net>

On Fri, 2011-06-03 at 17:12 -0500, Scott Wood wrote:
> On MMUs such as FSL where we can guarantee the entire linear mapping is
> bolted, we don't need to worry about linear TLB misses.  If on top of
> that we do a full table walk, we get rid of all recursive TLB faults, and
> can dispense with some state saving.  This gains a few percent on
> TLB-miss-heavy workloads, and around 50% on a benchmark that had a high
> rate of virtual page table faults under the normal handler.
> 
> While touching the EX_TLB layout, remove EX_TLB_MMUCR0, EX_TLB_SRR0, and
> EX_TLB_SRR1 as they're not used.
> 
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
> This turned out to be a little faster than the virtual pmd approach
> on the sort benchmark as well as lmbench's lat_mem_rd with page stride.
> 
> It's slightly slower than virtual pmd (around 1%), but still faster than
> current code, on linear tests such as lmbench's bw_mem cp.

Does this completely replace your previous series of 7 patches ? (IE.
Should I ditch them in patchwork ?) Or does it apply on top of them ?

Some comments inline...

>  #define SET_IVOR(vector_number, vector_offset)	\
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index a73668a..9d9e444 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -54,6 +54,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  	/* 64-bit Book3E keeps track of current PGD in the PACA */
>  #ifdef CONFIG_PPC_BOOK3E_64
>  	get_paca()->pgd = next->pgd;
> +	get_paca()->extlb[0][EX_TLB_PGD / 8] = (unsigned long)next->pgd;
>  #endif
>  	/* Nothing else to do if we aren't actually switching */
>  	if (prev == next)
> @@ -110,6 +111,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
>  	/* 64-bit Book3E keeps track of current PGD in the PACA */
>  #ifdef CONFIG_PPC_BOOK3E_64
>  	get_paca()->pgd = NULL;
> +	get_paca()->extlb[0][EX_TLB_PGD / 8] = 0;
>  #endif
>  }

Why do you keep a copy of the pgd there since it's in the PACA already
and you have r13 setup in your handlers ?

> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
> index af08922..0f4ab86 100644
> --- a/arch/powerpc/mm/tlb_low_64e.S
> +++ b/arch/powerpc/mm/tlb_low_64e.S
> @@ -30,6 +30,212 @@
>  #define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
>  #define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
>  
> +/**********************************************************************
> + *                                                                    *
> + * TLB miss handling for Book3E with a bolted linear mapping          *
> + * No virtual page table, no nested TLB misses                        *
> + *                                                                    *
> + **********************************************************************/
> +
> +.macro tlb_prolog_bolted addr
> +	mtspr	SPRN_SPRG_TLB_SCRATCH,r13
> +	mfspr	r13,SPRN_SPRG_PACA
> +	std	r10,PACA_EXTLB+EX_TLB_R10(r13)
> +	mfcr	r10
> +	std	r11,PACA_EXTLB+EX_TLB_R11(r13)
> +	mfspr	r11,SPRN_SPRG_TLB_SCRATCH

Do you need that ? Can't you leave r13 in scratch the whole way and
just pop it out in the error case when branching to DSI/ISI ? The only
thing is that TLB_SCRATCH needs to be saved/restored by
crit/debug/mcheck but thats worth saving cycles in the TLB miss handler
no ?

> +	std	r16,PACA_EXTLB+EX_TLB_R16(r13)
> +	mfspr	r16,\addr		/* get faulting address */
> +	std	r14,PACA_EXTLB+EX_TLB_R14(r13)
> +	ld	r14,PACA_EXTLB+EX_TLB_PGD(r13)

Why not get PGD from paca ?

> +	std	r15,PACA_EXTLB+EX_TLB_R15(r13)
> +	std	r10,PACA_EXTLB+EX_TLB_CR(r13)
> +	std	r11,PACA_EXTLB+EX_TLB_R13(r13)
> +	TLB_MISS_PROLOG_STATS_BOLTED
> +.endm
> +
> +.macro tlb_epilog_bolted
> +	ld	r14,PACA_EXTLB+EX_TLB_CR(r13)
> +	ld	r10,PACA_EXTLB+EX_TLB_R10(r13)
> +	ld	r11,PACA_EXTLB+EX_TLB_R11(r13)
> +	mtcr	r14
> +	ld	r14,PACA_EXTLB+EX_TLB_R14(r13)
> +	ld	r15,PACA_EXTLB+EX_TLB_R15(r13)
> +	TLB_MISS_RESTORE_STATS_BOLTED
> +	ld	r16,PACA_EXTLB+EX_TLB_R16(r13)
> +	ld	r13,PACA_EXTLB+EX_TLB_R13(r13)
> +.endm
> +
> +/* Data TLB miss */
> +	START_EXCEPTION(data_tlb_miss_bolted)
> +	tlb_prolog_bolted SPRN_DEAR
> +
> +	/* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
> +
> +	/* We do the user/kernel test for the PID here along with the RW test
> +	 */
> +	/* We pre-test some combination of permissions to avoid double
> +	 * faults:
> +	 *
> +	 * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
> +	 * ESR_ST   is 0x00800000
> +	 * _PAGE_BAP_SW is 0x00000010
> +	 * So the shift is >> 19. This tests for supervisor writeability.
> +	 * If the page happens to be supervisor writeable and not user
> +	 * writeable, we will take a new fault later, but that should be
> +	 * a rare enough case.
> +	 *
> +	 * We also move ESR_ST in _PAGE_DIRTY position
> +	 * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
> +	 *
> +	 * MAS1 is preset for all we need except for TID that needs to
> +	 * be cleared for kernel translations
> +	 */
> +
> +	mfspr	r11,SPRN_ESR
> +
> +	srdi	r15,r16,60		/* get region */
> +	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
> +	bne-	dtlb_miss_fault_bolted

Ok so I'm not familiar with your pipeline here, but wouldn't it be
better to move the srdi to after the bne above and make it srdi., thus
avoiding the compare below ?

> +	rlwinm	r10,r11,32-19,27,27
> +	rlwimi	r10,r11,32-16,19,19
> +	cmpwi	r15,0
> +	ori	r10,r10,_PAGE_PRESENT
> +	oris	r11,r10,_PAGE_ACCESSED@h
> +
> +	TLB_MISS_STATS_SAVE_INFO_BOLTED
> +	bne	tlb_miss_kernel_bolted
> +

Cheers,
Ben.

^ permalink raw reply

* [PATCH 4/5] powerpc/pseries: Re-implement HVSI as part of hvc_vio
From: Benjamin Herrenschmidt @ 2011-06-17  1:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel@vger.kernel.org

On pseries machines, consoles are provided by the hypervisor using
a low level get_chars/put_chars type interface. However, this is
really just a transport to the service processor which implements
them either as "raw" console (networked consoles, HMC, ...) or as
"hvsi" serial ports.

The later is a simple packet protocol on top of the raw character
interface that is supposed to convey additional "serial port" style
semantics. In practice however, all it does is provide a way to
read the CD line and set/clear our DTR line, that's it.

We currently implement the "raw" protocol as an hvc console backend
(/dev/hvcN) and the "hvsi" protocol using a separate tty driver
(/dev/hvsi0).

However this is quite impractical. The arbitrary difference between
the two type of devices has been a major source of user (and distro)
confusion. Additionally, there's an additional mini -hvsi implementation
in the pseries platform code for our low level debug console and early
boot kernel messages, which means code duplication, though that low
level variant is impractical as it's incapable of doing the initial
protocol negociation to establish the link to the FSP.

This essentially replaces the dedicated hvsi driver and the platform
udbg code completely by extending the existing hvc_vio backend used
in "raw" mode so that:

 - It now supports HVSI as well
 - We add support for hvc backend providing tiocm{get,set}
 - It also provides a udbg interface for early debug and boot console

This is overall less code, though this will only be obvious once we
remove the old "hvsi" driver, which is still available for now. When
the old driver is enabled, the new code still kicks in for the low
level udbg console, replacing the old mini implementation in the
platform
code, it just doesn't provide the higher level "hvc" interface.

In addition to producing generally simler code, this has several
benefits
over our current situation:

 - The user/distro only has to deal with /dev/hvcN for the hypervisor
console, avoiding all sort of confusion that has plagued us in the past

 - The tty, kernel and low level debug console all use the same code
base which supports the full protocol establishment process, thus the
console is now available much earlier than it used to be with the
old HVSI driver. The kernel console works much earlier and udbg is
available much earlier too. Hackers can enable a hard coded very-early
debug console as well that works with HVSI (previously that was only
supported for the "raw" mode).

I've tried to keep the same semantics as hvsi relative to how I react
to things like CD changes, with some subtle differences though:

 - I clear DTR on close if HUPCL is set

 - Current hvsi triggers a hangup if it detects a up->down transition
   on CD (you can still open a console with CD down). My new
implementation
   triggers a hangup if the link to the FSP is severed, and severs it
upon
   detecting a up->down transition on CD.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/Kconfig.debug               |   15 +
 arch/powerpc/include/asm/udbg.h          |    1 +
 arch/powerpc/kernel/udbg.c               |    3 +
 arch/powerpc/platforms/pseries/lpar.c    |  189 --------
 arch/powerpc/platforms/pseries/pseries.h |    3 +-
 arch/powerpc/platforms/pseries/setup.c   |    5 +-
 drivers/tty/hvc/Kconfig                  |    5 +
 drivers/tty/hvc/Makefile                 |    3 +-
 drivers/tty/hvc/hvc_console.c            |   23 +-
 drivers/tty/hvc/hvc_console.h            |    4 +
 drivers/tty/hvc/hvc_vio.c                |  725
++++++++++++++++++++++++++++--
 11 files changed, 749 insertions(+), 227 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index e72dcf6..067cb84 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -167,6 +167,13 @@ config PPC_EARLY_DEBUG_LPAR
 	  Select this to enable early debugging for a machine with a HVC
 	  console on vterm 0.
 
+config PPC_EARLY_DEBUG_LPAR_HVSI
+	bool "LPAR HVSI Console"
+	depends on PPC_PSERIES
+	help
+	  Select this to enable early debugging for a machine with a HVSI
+	  console on a specified vterm.
+
 config PPC_EARLY_DEBUG_G5
 	bool "Apple G5"
 	depends on PPC_PMAC64
@@ -253,6 +260,14 @@ config PPC_EARLY_DEBUG_WSP
 
 endchoice
 
+config PPC_EARLY_DEBUG_HVSI_VTERMNO
+	hex "vterm number to use with early debug HVSI"
+	depends on PPC_EARLY_DEBUG_LPAR_HVSI
+	default "0x30000000"
+	help
+	  You probably want 0x30000000 for your first serial port and
+	  0x30000001 for your second one
+
 config PPC_EARLY_DEBUG_44x_PHYSLOW
 	hex "Low 32 bits of early debug UART physical address"
 	depends on PPC_EARLY_DEBUG_44x
diff --git a/arch/powerpc/include/asm/udbg.h
b/arch/powerpc/include/asm/udbg.h
index 58580e9..93e05d1 100644
--- a/arch/powerpc/include/asm/udbg.h
+++ b/arch/powerpc/include/asm/udbg.h
@@ -40,6 +40,7 @@ extern void udbg_adb_init_early(void);
 
 extern void __init udbg_early_init(void);
 extern void __init udbg_init_debug_lpar(void);
+extern void __init udbg_init_debug_lpar_hvsi(void);
 extern void __init udbg_init_pmac_realmode(void);
 extern void __init udbg_init_maple_realmode(void);
 extern void __init udbg_init_pas_realmode(void);
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index a57e61e..faa82c1 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -31,6 +31,9 @@ void __init udbg_early_init(void)
 #if defined(CONFIG_PPC_EARLY_DEBUG_LPAR)
 	/* For LPAR machines that have an HVC console on vterm 0 */
 	udbg_init_debug_lpar();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI)
+	/* For LPAR machines that have an HVSI console on vterm 0 */
+	udbg_init_debug_lpar_hvsi();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_G5)
 	/* For use on Apple G5 machines */
 	udbg_init_pmac_realmode();
diff --git a/arch/powerpc/platforms/pseries/lpar.c
b/arch/powerpc/platforms/pseries/lpar.c
index e3a96c4..f7205d3 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -52,195 +52,6 @@ EXPORT_SYMBOL(plpar_hcall_norets);
 
 extern void pSeries_find_serial_port(void);
 
-
-static int vtermno;	/* virtual terminal# for udbg  */
-
-#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
-static void udbg_hvsi_putc(char c)
-{
-	/* packet's seqno isn't used anyways */
-	uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c };
-	int rc;
-
-	if (c == '\n')
-		udbg_hvsi_putc('\r');
-
-	do {
-		rc = plpar_put_term_char(vtermno, sizeof(packet), packet);
-	} while (rc == H_BUSY);
-}
-
-static long hvsi_udbg_buf_len;
-static uint8_t hvsi_udbg_buf[256];
-
-static int udbg_hvsi_getc_poll(void)
-{
-	unsigned char ch;
-	int rc, i;
-
-	if (hvsi_udbg_buf_len == 0) {
-		rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf);
-		if (rc != H_SUCCESS || hvsi_udbg_buf[0] != 0xff) {
-			/* bad read or non-data packet */
-			hvsi_udbg_buf_len = 0;
-		} else {
-			/* remove the packet header */
-			for (i = 4; i < hvsi_udbg_buf_len; i++)
-				hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i];
-			hvsi_udbg_buf_len -= 4;
-		}
-	}
-
-	if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) {
-		/* no data ready */
-		hvsi_udbg_buf_len = 0;
-		return -1;
-	}
-
-	ch = hvsi_udbg_buf[0];
-	/* shift remaining data down */
-	for (i = 1; i < hvsi_udbg_buf_len; i++) {
-		hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i];
-	}
-	hvsi_udbg_buf_len--;
-
-	return ch;
-}
-
-static int udbg_hvsi_getc(void)
-{
-	int ch;
-	for (;;) {
-		ch = udbg_hvsi_getc_poll();
-		if (ch == -1) {
-			/* This shouldn't be needed...but... */
-			volatile unsigned long delay;
-			for (delay=0; delay < 2000000; delay++)
-				;
-		} else {
-			return ch;
-		}
-	}
-}
-
-static void udbg_putcLP(char c)
-{
-	char buf[16];
-	unsigned long rc;
-
-	if (c == '\n')
-		udbg_putcLP('\r');
-
-	buf[0] = c;
-	do {
-		rc = plpar_put_term_char(vtermno, 1, buf);
-	} while(rc == H_BUSY);
-}
-
-/* Buffered chars getc */
-static long inbuflen;
-static long inbuf[2];	/* must be 2 longs */
-
-static int udbg_getc_pollLP(void)
-{
-	/* The interface is tricky because it may return up to 16 chars.
-	 * We save them statically for future calls to udbg_getc().
-	 */
-	char ch, *buf = (char *)inbuf;
-	int i;
-	long rc;
-	if (inbuflen == 0) {
-		/* get some more chars. */
-		inbuflen = 0;
-		rc = plpar_get_term_char(vtermno, &inbuflen, buf);
-		if (rc != H_SUCCESS)
-			inbuflen = 0;	/* otherwise inbuflen is garbage */
-	}
-	if (inbuflen <= 0 || inbuflen > 16) {
-		/* Catch error case as well as other oddities (corruption) */
-		inbuflen = 0;
-		return -1;
-	}
-	ch = buf[0];
-	for (i = 1; i < inbuflen; i++)	/* shuffle them down. */
-		buf[i-1] = buf[i];
-	inbuflen--;
-	return ch;
-}
-
-static int udbg_getcLP(void)
-{
-	int ch;
-	for (;;) {
-		ch = udbg_getc_pollLP();
-		if (ch == -1) {
-			/* This shouldn't be needed...but... */
-			volatile unsigned long delay;
-			for (delay=0; delay < 2000000; delay++)
-				;
-		} else {
-			return ch;
-		}
-	}
-}
-
-/* call this from early_init() for a working debug console on
- * vterm capable LPAR machines
- */
-void __init udbg_init_debug_lpar(void)
-{
-	vtermno = 0;
-	udbg_putc = udbg_putcLP;
-	udbg_getc = udbg_getcLP;
-	udbg_getc_poll = udbg_getc_pollLP;
-}
-
-/* returns 0 if couldn't find or use /chosen/stdout as console */
-void __init find_udbg_vterm(void)
-{
-	struct device_node *stdout_node;
-	const u32 *termno;
-	const char *name;
-
-	/* find the boot console from /chosen/stdout */
-	if (!of_chosen)
-		return;
-	name = of_get_property(of_chosen, "linux,stdout-path", NULL);
-	if (name == NULL)
-		return;
-	stdout_node = of_find_node_by_path(name);
-	if (!stdout_node)
-		return;
-	name = of_get_property(stdout_node, "name", NULL);
-	if (!name) {
-		printk(KERN_WARNING "stdout node missing 'name' property!\n");
-		goto out;
-	}
-
-	/* Check if it's a virtual terminal */
-	if (strncmp(name, "vty", 3) != 0)
-		goto out;
-	termno = of_get_property(stdout_node, "reg", NULL);
-	if (termno == NULL)
-		goto out;
-	vtermno = termno[0];
-
-	if (of_device_is_compatible(stdout_node, "hvterm1")) {
-		udbg_putc = udbg_putcLP;
-		udbg_getc = udbg_getcLP;
-		udbg_getc_poll = udbg_getc_pollLP;
-		add_preferred_console("hvc", termno[0] & 0xff, NULL);
-	} else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) {
-		vtermno = termno[0];
-		udbg_putc = udbg_hvsi_putc;
-		udbg_getc = udbg_hvsi_getc;
-		udbg_getc_poll = udbg_hvsi_getc_poll;
-		add_preferred_console("hvsi", termno[0] & 0xff, NULL);
-	}
-out:
-	of_node_put(stdout_node);
-}
-
 void vpa_init(int cpu)
 {
 	int hwcpu = get_hard_smp_processor_id(cpu);
diff --git a/arch/powerpc/platforms/pseries/pseries.h
b/arch/powerpc/platforms/pseries/pseries.h
index e9f6d28..24c7162 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -47,7 +47,8 @@ extern void pSeries_final_fixup(void);
 /* Poweron flag used for enabling auto ups restart */
 extern unsigned long rtas_poweron_auto;
 
-extern void find_udbg_vterm(void);
+/* Provided by HVC VIO */
+extern void hvc_vio_init_early(void);
 
 /* Dynamic logical Partitioning/Mobility */
 extern void dlpar_free_cc_nodes(struct device_node *);
diff --git a/arch/powerpc/platforms/pseries/setup.c
b/arch/powerpc/platforms/pseries/setup.c
index 593acce..d00e529 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -512,9 +512,10 @@ static void __init pSeries_init_early(void)
 {
 	pr_debug(" -> pSeries_init_early()\n");
 
+#ifdef CONFIG_HVC_CONSOLE
 	if (firmware_has_feature(FW_FEATURE_LPAR))
-		find_udbg_vterm();
-
+		hvc_vio_init_early();
+#endif
 	if (firmware_has_feature(FW_FEATURE_DABR))
 		ppc_md.set_dabr = pseries_set_dabr;
 	else if (firmware_has_feature(FW_FEATURE_XDABR))
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig
index 6f2c980..e371753 100644
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -19,6 +19,11 @@ config HVC_CONSOLE
 	  console. This driver allows each pSeries partition to have a console
 	  which is accessed via the HMC.
 
+config HVC_OLD_HVSI
+	bool "Old driver for pSeries serial port (/dev/hvsi*)"
+	depends on HVC_CONSOLE
+	default n
+
 config HVC_ISERIES
 	bool "iSeries Hypervisor Virtual Console support"
 	depends on PPC_ISERIES
diff --git a/drivers/tty/hvc/Makefile b/drivers/tty/hvc/Makefile
index 40a25d9..69a444b 100644
--- a/drivers/tty/hvc/Makefile
+++ b/drivers/tty/hvc/Makefile
@@ -1,4 +1,5 @@
-obj-$(CONFIG_HVC_CONSOLE)	+= hvc_vio.o hvsi.o
+obj-$(CONFIG_HVC_CONSOLE)	+= hvc_vio.o
+obj-$(CONFIG_HVC_OLD_HVSI)	+= hvsi.o
 obj-$(CONFIG_HVC_ISERIES)	+= hvc_iseries.o
 obj-$(CONFIG_HVC_RTAS)		+= hvc_rtas.o
 obj-$(CONFIG_HVC_TILE)		+= hvc_tile.o
diff --git a/drivers/tty/hvc/hvc_console.c
b/drivers/tty/hvc/hvc_console.c
index e9cba13..f8ff6f5 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -184,7 +184,7 @@ static struct tty_driver *hvc_console_device(struct
console *c, int *index)
 }
 
 static int __init hvc_console_setup(struct console *co, char *options)
-{
+{	
 	if (co->index < 0 || co->index >= MAX_NR_HVC_CONSOLES)
 		return -ENODEV;
 
@@ -745,6 +745,25 @@ static int khvcd(void *unused)
 	return 0;
 }
 
+static int hvc_tiocmget(struct tty_struct *tty)
+{
+	struct hvc_struct *hp = tty->driver_data;
+
+	if (!hp || !hp->ops->tiocmget)
+		return -EINVAL;
+	return hp->ops->tiocmget(hp);
+}
+
+static int hvc_tiocmset(struct tty_struct *tty,
+			unsigned int set, unsigned int clear)
+{
+	struct hvc_struct *hp = tty->driver_data;
+
+	if (!hp || !hp->ops->tiocmset)
+		return -EINVAL;
+	return hp->ops->tiocmset(hp, set, clear);
+}
+
 static const struct tty_operations hvc_ops = {
 	.open = hvc_open,
 	.close = hvc_close,
@@ -753,6 +772,8 @@ static const struct tty_operations hvc_ops = {
 	.unthrottle = hvc_unthrottle,
 	.write_room = hvc_write_room,
 	.chars_in_buffer = hvc_chars_in_buffer,
+	.tiocmget = hvc_tiocmget,
+	.tiocmset = hvc_tiocmset,
 };
 
 struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
diff --git a/drivers/tty/hvc/hvc_console.h
b/drivers/tty/hvc/hvc_console.h
index 54381eba..c335a14 100644
--- a/drivers/tty/hvc/hvc_console.h
+++ b/drivers/tty/hvc/hvc_console.h
@@ -73,6 +73,10 @@ struct hv_ops {
 	int (*notifier_add)(struct hvc_struct *hp, int irq);
 	void (*notifier_del)(struct hvc_struct *hp, int irq);
 	void (*notifier_hangup)(struct hvc_struct *hp, int irq);
+
+	/* tiocmget/set implementation */
+	int (*tiocmget)(struct hvc_struct *hp);
+	int (*tiocmset)(struct hvc_struct *hp, unsigned int set, unsigned int
clear);
 };
 
 /* Register a vterm and a slot index for use as a console
(console_init) */
diff --git a/drivers/tty/hvc/hvc_vio.c b/drivers/tty/hvc/hvc_vio.c
index e6eea14..d4e0850 100644
--- a/drivers/tty/hvc/hvc_vio.c
+++ b/drivers/tty/hvc/hvc_vio.c
@@ -27,15 +27,27 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
USA
+ *
+ * TODO:
+ *
+ *   - handle error in sending hvsi protocol packets
+ *   - retry nego on subsequent sends ?
  */
 
+#undef DEBUG
+
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/console.h>
 
 #include <asm/hvconsole.h>
 #include <asm/vio.h>
 #include <asm/prom.h>
 #include <asm/firmware.h>
+#include <asm/hvsi.h>
+#include <asm/udbg.h>
 
 #include "hvc_console.h"
 
@@ -43,14 +55,47 @@ static const char hvc_driver_name[] = "hvc_console";
 
 static struct vio_device_id hvc_driver_table[] __devinitdata = {
 	{"serial", "hvterm1"},
+#ifndef HVC_OLD_HVSI
+	{"serial", "hvterm-protocol"},
+#endif
 	{ "", "" }
 };
 MODULE_DEVICE_TABLE(vio, hvc_driver_table);
 
-static int filtered_get_chars(uint32_t vtermno, char *buf, int count)
+typedef enum hv_protocol {
+	HV_PROTOCOL_RAW,
+	HV_PROTOCOL_HVSI
+} hv_protocol_t;
+
+#define HV_INBUF_SIZE		255
+
+struct hvterm_priv {
+	u32		termno;		/* HV term number */
+	hv_protocol_t	proto;		/* Raw data or HVSI packets */
+	unsigned int	inbuf_len;	/* Data in input buffer */
+	unsigned char	inbuf[HV_INBUF_SIZE];
+	unsigned int	inbuf_cur;	/* Cursor in input buffer */
+	unsigned int	inbuf_pktlen;	/* HVSI packet lenght from cursor */
+	atomic_t	seqno;		/* HVSI packet sequence number */
+	unsigned int	opened:1;	/* HVSI driver opened */
+	unsigned int	established:1;	/* HVSI protocol established */
+	unsigned int 	is_console:1;	/* Used as a kernel console device */
+	unsigned int	mctrl_update:1;	/* HVSI modem control updated */
+	unsigned short	mctrl;		/* HVSI modem control */
+	struct tty_struct *tty;		/* TTY structure */
+};
+static struct hvterm_priv *hvterm_privs[MAX_NR_HVC_CONSOLES];
+
+/* For early boot console */
+static struct hvterm_priv hvterm_priv0;
+
+static int hvterm_raw_get_chars(uint32_t vtermno, char *buf, int count)
 {
-	unsigned long got;
-	int i;
+	struct hvterm_priv *pv = hvterm_privs[vtermno];
+	unsigned long got, i;
+
+	if (WARN_ON(!pv))
+		return 0;
 
 	/*
 	 * Vio firmware will read up to SIZE_VIO_GET_CHARS at its own
discretion
@@ -60,7 +105,7 @@ static int filtered_get_chars(uint32_t vtermno, char
*buf, int count)
 	if (count < SIZE_VIO_GET_CHARS)
 		return -EAGAIN;
 
-	got = hvc_get_chars(vtermno, buf, count);
+	got = hvc_get_chars(pv->termno, buf, count);
 
 	/*
 	 * Work around a HV bug where it gives us a null
@@ -70,32 +115,527 @@ static int filtered_get_chars(uint32_t vtermno,
char *buf, int count)
 		if (buf[i] == 0 && buf[i-1] == '\r') {
 			--got;
 			if (i < got)
-				memmove(&buf[i], &buf[i+1],
-					got - i);
+				memmove(&buf[i], &buf[i+1], got - i);
 		}
 	}
 	return got;
 }
 
-static const struct hv_ops hvc_get_put_ops = {
-	.get_chars = filtered_get_chars,
-	.put_chars = hvc_put_chars,
+static int hvterm_raw_put_chars(uint32_t vtermno, const char *buf, int
count)
+{
+	struct hvterm_priv *pv = hvterm_privs[vtermno];
+
+	if (WARN_ON(!pv))
+		return 0;
+
+	return hvc_put_chars(pv->termno, buf, count);
+}
+
+static const struct hv_ops hvterm_raw_ops = {
+	.get_chars = hvterm_raw_get_chars,
+	.put_chars = hvterm_raw_put_chars,
 	.notifier_add = notifier_add_irq,
 	.notifier_del = notifier_del_irq,
 	.notifier_hangup = notifier_hangup_irq,
 };
 
+static int hvterm_hvsi_send_packet(struct hvterm_priv *pv, struct
hvsi_header *packet)
+{
+	packet->seqno = atomic_inc_return(&pv->seqno);
+
+	/* Assumes that always succeeds, works in practice */
+	return hvc_put_chars(pv->termno, (char *)packet, packet->len);
+}
+
+static void hvterm_hvsi_start_handshake(struct hvterm_priv *pv)
+{
+	struct hvsi_query q;
+
+	/* Reset state */
+	pv->established = 0;
+	atomic_set(&pv->seqno, 0);
+
+	pr_devel("HVSI@%x: Handshaking started\n", pv->termno);
+
+	/* Send version query */
+	q.hdr.type = VS_QUERY_PACKET_HEADER;
+	q.hdr.len = sizeof(struct hvsi_query);
+	q.verb = VSV_SEND_VERSION_NUMBER;
+	hvterm_hvsi_send_packet(pv, &q.hdr);
+}
+
+static int hvterm_hvsi_send_close(struct hvterm_priv *pv)
+{
+	struct hvsi_control ctrl;
+
+	pv->established = 0;
+
+	ctrl.hdr.type = VS_CONTROL_PACKET_HEADER;
+	ctrl.hdr.len = sizeof(struct hvsi_control);
+	ctrl.verb = VSV_CLOSE_PROTOCOL;
+	return hvterm_hvsi_send_packet(pv, &ctrl.hdr);
+}
+
+static void hvterm_cd_change(struct hvterm_priv *pv, int cd)
+{
+	if (cd)
+		pv->mctrl |= TIOCM_CD;
+	else {
+		pv->mctrl &= ~TIOCM_CD;
+
+		/* We copy the existing hvsi driver semantics
+		 * here which are to trigger a hangup when
+		 * we get a carrier loss.
+		 * Closing our connection to the server will
+		 * do just that.
+		 */
+		if (!pv->is_console && pv->opened) {
+			pr_devel("HVSI@%x Carrier lost, hanging up !\n",
+				 pv->termno);
+			hvterm_hvsi_send_close(pv);
+		}
+	}
+}
+
+static void hvterm_hvsi_got_control(struct hvterm_priv *pv)
+{
+	struct hvsi_control *pkt = (struct hvsi_control *)pv->inbuf;
+
+	switch (pkt->verb) {
+	case VSV_CLOSE_PROTOCOL:
+		/* We restart the handshaking */
+		hvterm_hvsi_start_handshake(pv);
+		break;
+	case VSV_MODEM_CTL_UPDATE:
+		/* Transition of carrier detect */
+		hvterm_cd_change(pv, pkt->word & HVSI_TSCD);
+		break;
+	}
+}
+
+static void hvterm_hvsi_got_query(struct hvterm_priv *pv)
+{
+	struct hvsi_query *pkt = (struct hvsi_query *)pv->inbuf;
+	struct hvsi_query_response r;
+
+	/* We only handle version queries */
+	if (pkt->verb != VSV_SEND_VERSION_NUMBER)
+		return;
+
+	pr_devel("HVSI@%x: Got version query, sending response...\n",
+		 pv->termno);
+
+	/* Send version response */
+	r.hdr.type = VS_QUERY_RESPONSE_PACKET_HEADER;
+	r.hdr.len = sizeof(struct hvsi_query_response);
+	r.verb = VSV_SEND_VERSION_NUMBER;
+	r.u.version = HVSI_VERSION;
+	r.query_seqno = pkt->hdr.seqno;
+	hvterm_hvsi_send_packet(pv, &r.hdr);
+
+	/* Assume protocol is open now */
+	pv->established = 1;
+}
+
+static void hvterm_hvsi_got_response(struct hvterm_priv *pv)
+{
+	struct hvsi_query_response *r = (struct hvsi_query_response
*)pv->inbuf;
+
+	switch(r->verb) {
+	case VSV_SEND_MODEM_CTL_STATUS:
+		hvterm_cd_change(pv, r->u.mctrl_word & HVSI_TSCD);
+		pv->mctrl_update = 1;
+		break;
+	}
+}
+
+static int hvterm_hvsi_check_packet(struct hvterm_priv *pv)
+{
+	u8 len, type;
+
+	/* Check header validity. If it's invalid, we ditch
+	 * the whole buffer and hope we eventually resync
+	 */
+	if (pv->inbuf[0] < 0xfc) {
+		pv->inbuf_len = pv->inbuf_pktlen = 0;
+		return 0;
+	}
+	type = pv->inbuf[0];
+	len = pv->inbuf[1];
+
+	/* Packet incomplete ? */
+	if (pv->inbuf_len < len)
+		return 0;
+
+	pr_devel("HVSI@%x: Got packet type %x len %d bytes:\n",
+		 pv->termno, type, len);
+
+	/* We have a packet, yay ! Handle it */
+	switch(type) {
+	case VS_DATA_PACKET_HEADER:
+		pv->inbuf_pktlen = len - 4;
+		pv->inbuf_cur = 4;
+		return 1;
+	case VS_CONTROL_PACKET_HEADER:
+		hvterm_hvsi_got_control(pv);
+		break;
+	case VS_QUERY_PACKET_HEADER:
+		hvterm_hvsi_got_query(pv);
+		break;
+	case VS_QUERY_RESPONSE_PACKET_HEADER:
+		hvterm_hvsi_got_response(pv);
+		break;
+	}
+
+	/* Swallow packet and retry */
+	pv->inbuf_len -= len;
+	memmove(pv->inbuf, &pv->inbuf[len], pv->inbuf_len);
+	return 1;
+}
+
+static int hvterm_hvsi_get_packet(struct hvterm_priv *pv)
+{
+	/* If we have room in the buffer, ask HV for more */
+	if (pv->inbuf_len < HV_INBUF_SIZE)
+		pv->inbuf_len += hvc_get_chars(pv->termno,
+					       &pv->inbuf[pv->inbuf_len],
+					       HV_INBUF_SIZE - pv->inbuf_len);
+	/*
+	 * If we have at least 4 bytes in the buffer, check for
+	 * a full packet and retry
+	 */
+	if (pv->inbuf_len >= 4)
+		return hvterm_hvsi_check_packet(pv);
+	return 0;
+}
+
+static int hvterm_hvsi_get_chars(uint32_t vtermno, char *buf, int
count)
+{
+	struct hvterm_priv *pv = hvterm_privs[vtermno];
+	unsigned int tries, read = 0;
+
+	if (WARN_ON(!pv))
+		return 0;
+
+	/* If we aren't open, dont do anything in order to avoid races
+	 * with connection establishment. The hvc core will call this
+	 * before we have returned from notifier_add(), and we need to
+	 * avoid multiple users playing with the receive buffer
+	 */
+	if (!pv->opened)
+		return 0;
+
+	/* We try twice, once with what data we have and once more
+	 * after we try to fetch some more from the hypervisor
+	 */
+	for (tries = 1; count && tries < 2; tries++) {
+		/* Consume existing data packet */
+		if (pv->inbuf_pktlen) {
+			unsigned int l = min(count, (int)pv->inbuf_pktlen);
+			memcpy(&buf[read], &pv->inbuf[pv->inbuf_cur], l);
+			pv->inbuf_cur += l;
+			pv->inbuf_pktlen -= l;
+			count -= l;
+			read += l;
+		}
+		if (count == 0)
+			break;
+
+		/* Data packet fully consumed, move down remaning data */
+		if (pv->inbuf_cur) {
+			pv->inbuf_len -= pv->inbuf_cur;
+			memmove(pv->inbuf, &pv->inbuf[pv->inbuf_cur], pv->inbuf_len);
+			pv->inbuf_cur = 0;
+		}
+
+		/* Try to get another packet */
+		if (hvterm_hvsi_get_packet(pv))
+			tries--;
+	}
+	if (!pv->established) {
+		pr_devel("HVSI@%x: returning -EPIPE\n", pv->termno);
+		return -EPIPE;
+	}
+	return read;
+}
+
+static int hvterm_hvsi_put_chars(uint32_t vtermno, const char *buf, int
count)
+{
+	struct hvterm_priv *pv = hvterm_privs[vtermno];
+	struct hvsi_data dp;
+	int rc, adjcount = min(count, HVSI_MAX_OUTGOING_DATA);
+
+	if (WARN_ON(!pv))
+		return 0;
+
+	dp.hdr.type = VS_DATA_PACKET_HEADER;
+	dp.hdr.len = adjcount + sizeof(struct hvsi_header);
+	memcpy(dp.data, buf, adjcount);
+	rc = hvterm_hvsi_send_packet(pv, &dp.hdr);
+	if (rc <= 0)
+		return rc;
+	return adjcount;
+}
+
+static void maybe_msleep(unsigned long ms)
+{
+	/* During early boot, IRQs are disabled, use mdelay */
+	if (irqs_disabled())
+		mdelay(ms);
+	else
+		msleep(ms);
+}
+
+static int hvterm_hvsi_read_mctrl(struct hvterm_priv *pv)
+{
+	struct hvsi_query q;
+	int rc, timeout;
+
+	pr_devel("HVSI@%x: Querying modem control status...\n",
+		 pv->termno);
+
+	pv->mctrl_update = 0;
+	q.hdr.type = VS_QUERY_PACKET_HEADER;
+	q.hdr.len = sizeof(struct hvsi_query);
+	q.hdr.seqno = atomic_inc_return(&pv->seqno);
+	q.verb = VSV_SEND_MODEM_CTL_STATUS;
+	rc = hvterm_hvsi_send_packet(pv, &q.hdr);
+	if (rc <= 0) {
+		pr_devel("HVSI@%x: Error %d...\n", pv->termno, rc);
+		return rc;
+	}
+
+	/* Try for up to 1s */
+	for (timeout = 0; timeout < 1000; timeout++) {
+		if (!pv->established)
+			return -ENXIO;
+		if (pv->mctrl_update)
+			return 0;
+		if (!hvterm_hvsi_get_packet(pv))
+			maybe_msleep(1);
+	}
+	return -EIO;
+}
+
+static int hvterm_hvsi_write_mctrl(struct hvterm_priv *pv, int dtr)
+{
+	struct hvsi_control ctrl;
+
+	pr_devel("HVSI@%x: %s DTR...\n", pv->termno,
+		 dtr ? "Setting" : "Clearing");
+
+	ctrl.hdr.type = VS_CONTROL_PACKET_HEADER,
+	ctrl.hdr.len = sizeof(struct hvsi_control);
+	ctrl.verb = VSV_SET_MODEM_CTL;
+	ctrl.mask = HVSI_TSDTR;
+	ctrl.word = dtr ? HVSI_TSDTR : 0;
+	if (dtr)
+		pv->mctrl |= TIOCM_DTR;
+	else
+		pv->mctrl &= ~TIOCM_DTR;
+	return hvterm_hvsi_send_packet(pv, &ctrl.hdr);
+}
+
+static void hvterm_hvsi_establish(struct hvterm_priv *pv)
+{
+	int timeout;
+
+	/* Try for up to 10ms, there can be a packet to
+	 * start the process waiting for us...
+	 */
+	for (timeout = 0; timeout < 10; timeout++) {
+		if (pv->established)
+			goto established;
+		if (!hvterm_hvsi_get_packet(pv))
+			maybe_msleep(1);
+	}
+
+	/* Failed, send a close connection packet just
+	 * in case
+	 */
+	hvterm_hvsi_send_close(pv);
+
+	/* Then restart handshake */
+	hvterm_hvsi_start_handshake(pv);
+
+	/* Try for up to 100ms */
+	for (timeout = 0; timeout < 100; timeout++) {
+		if (pv->established)
+			goto established;
+		if (!hvterm_hvsi_get_packet(pv))
+			maybe_msleep(1);
+	}
+
+	if (!pv->established) {
+		pr_devel("HVSI@%x: Timeout handshaking, giving up !\n",
+			 pv->termno);
+		return;
+	}
+ established:
+	/* Query modem control lines */
+	hvterm_hvsi_read_mctrl(pv);
+
+	/* Set our own DTR */
+	hvterm_hvsi_write_mctrl(pv, 1);
+
+	/* Set the opened flag so reads are allowed */
+	wmb();
+	pv->opened = 1;
+}
+
+static int hvterm_hvsi_open(struct hvc_struct *hp, int data)
+{
+	struct hvterm_priv *pv = hvterm_privs[hp->vtermno];
+	int rc;
+
+	pr_devel("HVSI@%x: open !\n", pv->termno);
+
+	rc = notifier_add_irq(hp, data);
+	if (rc)
+		return rc;
+
+	/* Keep track of the tty data structure */
+	pv->tty = tty_kref_get(hp->tty);
+
+	hvterm_hvsi_establish(pv);
+	return 0;
+}
+
+static void hvterm_hvsi_shutdown(struct hvc_struct *hp, struct
hvterm_priv *pv)
+{
+	unsigned long flags;
+
+	if (!pv->is_console) {
+		pr_devel("HVSI@%x: Not a console, tearing down\n",
+			 pv->termno);
+
+		/* Clear opened, synchronize with khvcd */
+		spin_lock_irqsave(&hp->lock, flags);
+		pv->opened = 0;
+		spin_unlock_irqrestore(&hp->lock, flags);
+
+		/* Clear our own DTR */
+		if (!pv->tty || (pv->tty->termios->c_cflag & HUPCL))
+			hvterm_hvsi_write_mctrl(pv, 0);
+
+		/* Tear down the connection */
+		hvterm_hvsi_send_close(pv);
+	}
+
+	if (pv->tty)
+		tty_kref_put(pv->tty);
+	pv->tty = NULL;
+}
+
+static void hvterm_hvsi_close(struct hvc_struct *hp, int data)
+{
+	struct hvterm_priv *pv = hvterm_privs[hp->vtermno];
+
+	pr_devel("HVSI@%x: close !\n", pv->termno);
+
+	hvterm_hvsi_shutdown(hp, pv);
+
+	notifier_del_irq(hp, data);
+}
+
+void hvterm_hvsi_hangup(struct hvc_struct *hp, int data)
+{
+	struct hvterm_priv *pv = hvterm_privs[hp->vtermno];
+
+	pr_devel("HVSI@%x: hangup !\n", pv->termno);
+
+	hvterm_hvsi_shutdown(hp, pv);
+
+	notifier_hangup_irq(hp, data);
+}
+
+static int hvterm_hvsi_tiocmget(struct hvc_struct *hp)
+{
+	struct hvterm_priv *pv = hvterm_privs[hp->vtermno];
+
+	if (!pv)
+		return -EINVAL;
+	return pv->mctrl;
+}
+
+static int hvterm_hvsi_tiocmset(struct hvc_struct *hp, unsigned int
set,
+				unsigned int clear)
+{
+	struct hvterm_priv *pv = hvterm_privs[hp->vtermno];
+
+	pr_devel("HVSI@%x: Set modem control, set=%x,clr=%x\n",
+		 pv->termno, set, clear);
+
+	if (set & TIOCM_DTR)
+		hvterm_hvsi_write_mctrl(pv, 1);
+	else if (clear & TIOCM_DTR)
+		hvterm_hvsi_write_mctrl(pv, 0);
+
+	return 0;
+}
+
+static const struct hv_ops hvterm_hvsi_ops = {
+	.get_chars = hvterm_hvsi_get_chars,
+	.put_chars = hvterm_hvsi_put_chars,
+	.notifier_add = hvterm_hvsi_open,
+	.notifier_del = hvterm_hvsi_close,
+	.notifier_hangup = hvterm_hvsi_hangup,
+	.tiocmget = hvterm_hvsi_tiocmget,
+	.tiocmset = hvterm_hvsi_tiocmset,
+};
+
 static int __devinit hvc_vio_probe(struct vio_dev *vdev,
-				const struct vio_device_id *id)
+				   const struct vio_device_id *id)
 {
+	const struct hv_ops *ops;
 	struct hvc_struct *hp;
+	struct hvterm_priv *pv;
+	hv_protocol_t proto;
+	int i, termno = -1;
 
 	/* probed with invalid parameters. */
 	if (!vdev || !id)
 		return -EPERM;
 
-	hp = hvc_alloc(vdev->unit_address, vdev->irq, &hvc_get_put_ops,
-			MAX_VIO_PUT_CHARS);
+	if (of_device_is_compatible(vdev->dev.of_node, "hvterm1")) {
+		proto = HV_PROTOCOL_RAW;
+		ops = &hvterm_raw_ops;
+	} else if (of_device_is_compatible(vdev->dev.of_node,
"hvterm-protocol")) {
+		proto = HV_PROTOCOL_HVSI;
+		ops = &hvterm_hvsi_ops;
+	} else {
+		pr_err("hvc_vio: Unkown protocol for %s\n",
vdev->dev.of_node->full_name);
+		return -ENXIO;
+	}
+
+	pr_devel("hvc_vio_probe() device %s, using %s protocol\n",
+		 vdev->dev.of_node->full_name,
+		 proto == HV_PROTOCOL_RAW ? "raw" : "hvsi");
+
+	/* Is it our boot one ? */
+	if (hvterm_privs[0] == &hvterm_priv0 &&
+	    vdev->unit_address == hvterm_priv0.termno) {
+		pv = hvterm_privs[0];
+		termno = 0;
+		pr_devel("->boot console, using termno 0\n");
+	}
+	/* nope, allocate a new one */
+	else {
+		for (i = 0; i < MAX_NR_HVC_CONSOLES && termno < 0; i++)
+			if (!hvterm_privs[i])
+				termno = i;
+		pr_devel("->non-boot console, using termno %d\n", termno);
+		if (termno < 0)
+			return -ENODEV;
+		pv = kzalloc(sizeof(struct hvterm_priv), GFP_KERNEL);
+		if (!pv)
+			return -ENOMEM;
+		pv->termno = vdev->unit_address;
+		pv->proto = proto;
+		hvterm_privs[termno] = pv;
+	}
+
+	hp = hvc_alloc(termno, vdev->irq, ops, MAX_VIO_PUT_CHARS);
 	if (IS_ERR(hp))
 		return PTR_ERR(hp);
 	dev_set_drvdata(&vdev->dev, hp);
@@ -106,8 +646,16 @@ static int __devinit hvc_vio_probe(struct vio_dev
*vdev,
 static int __devexit hvc_vio_remove(struct vio_dev *vdev)
 {
 	struct hvc_struct *hp = dev_get_drvdata(&vdev->dev);
+	int rc, termno;
 
-	return hvc_remove(hp);
+	termno = hp->vtermno;
+	rc = hvc_remove(hp);
+	if (rc == 0) {
+		if (hvterm_privs[termno] != &hvterm_priv0)
+			kfree(hvterm_privs[termno]);
+		hvterm_privs[termno] = NULL;
+	}
+	return rc;
 }
 
 static struct vio_driver hvc_vio_driver = {
@@ -140,34 +688,145 @@ static void __exit hvc_vio_exit(void)
 }
 module_exit(hvc_vio_exit);
 
-/* the device tree order defines our numbering */
-static int hvc_find_vtys(void)
+static void udbg_hvc_putc(char c)
 {
-	struct device_node *vty;
-	int num_found = 0;
+	int count = -1;
 
-	for (vty = of_find_node_by_name(NULL, "vty"); vty != NULL;
-			vty = of_find_node_by_name(vty, "vty")) {
-		const uint32_t *vtermno;
+	if (c == '\n')
+		udbg_hvc_putc('\r');
 
-		/* We have statically defined space for only a certain number
-		 * of console adapters.
-		 */
-		if (num_found >= MAX_NR_HVC_CONSOLES) {
-			of_node_put(vty);
+	do {
+		switch(hvterm_priv0.proto) {
+		case HV_PROTOCOL_RAW:
+			count = hvterm_raw_put_chars(0, &c, 1);
+			break;
+		case HV_PROTOCOL_HVSI:
+			count = hvterm_hvsi_put_chars(0, &c, 1);
 			break;
 		}
+	} while(count == 0);
+}
+
+static int udbg_hvc_getc_poll(void)
+{
+	int rc = 0;
+	char c;
 
-		vtermno = of_get_property(vty, "reg", NULL);
-		if (!vtermno)
-			continue;
+	switch(hvterm_priv0.proto) {
+	case HV_PROTOCOL_RAW:
+		rc = hvterm_raw_get_chars(0, &c, 1);
+		break;
+	case HV_PROTOCOL_HVSI:
+		rc = hvterm_hvsi_get_chars(0, &c, 1);
+		break;
+	}
+	if (!rc)
+		return -1;
+	return c;
+}
 
-		if (of_device_is_compatible(vty, "hvterm1")) {
-			hvc_instantiate(*vtermno, num_found, &hvc_get_put_ops);
-			++num_found;
+static int udbg_hvc_getc(void)
+{
+	int ch;
+	for (;;) {
+		ch = udbg_hvc_getc_poll();
+		if (ch == -1) {
+			/* This shouldn't be needed...but... */
+			volatile unsigned long delay;
+			for (delay=0; delay < 2000000; delay++)
+				;
+		} else {
+			return ch;
 		}
 	}
+}
+
+void __init hvc_vio_init_early(void)
+{
+	struct device_node *stdout_node;
+	const u32 *termno;
+	const char *name;
+	const struct hv_ops *ops;
+
+	/* find the boot console from /chosen/stdout */
+	if (!of_chosen)
+		return;
+	name = of_get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL)
+		return;
+	stdout_node = of_find_node_by_path(name);
+	if (!stdout_node)
+		return;
+	name = of_get_property(stdout_node, "name", NULL);
+	if (!name) {
+		printk(KERN_WARNING "stdout node missing 'name' property!\n");
+		goto out;
+	}
+
+	/* Check if it's a virtual terminal */
+	if (strncmp(name, "vty", 3) != 0)
+		goto out;
+	termno = of_get_property(stdout_node, "reg", NULL);
+	if (termno == NULL)
+		goto out;
+	hvterm_priv0.termno = *termno;
+	hvterm_priv0.is_console = 1;
+	hvterm_privs[0] = &hvterm_priv0;
+
+	/* Check the protocol */
+	if (of_device_is_compatible(stdout_node, "hvterm1")) {
+		hvterm_priv0.proto = HV_PROTOCOL_RAW;
+		ops = &hvterm_raw_ops;
+	}
+	else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) {
+		hvterm_priv0.proto = HV_PROTOCOL_HVSI;
+		ops = &hvterm_hvsi_ops;
+		/* HVSI, perform the handshake now */
+		hvterm_hvsi_establish(&hvterm_priv0);
+	} else
+		goto out;
+	udbg_putc = udbg_hvc_putc;
+	udbg_getc = udbg_hvc_getc;
+	udbg_getc_poll = udbg_hvc_getc_poll;
+#ifdef HVC_OLD_HVSI
+	/* When using the old HVSI driver don't register the HVC
+	 * backend for HVSI, only do udbg
+	 */
+	if (hvterm_priv0.proto == HV_PROTOCOL_HVSI)
+		goto out;
+#endif
+	add_preferred_console("hvc", 0, NULL);
+	hvc_instantiate(0, 0, ops);
+out:
+	of_node_put(stdout_node);
+}
+
+/* call this from early_init() for a working debug console on
+ * vterm capable LPAR machines
+ */
+#ifdef CONFIG_PPC_EARLY_DEBUG_LPAR
+void __init udbg_init_debug_lpar(void)
+{
+	hvterm_privs[0] = &hvterm_priv0;
+	hvterm_priv0.termno = 0;
+	hvterm_priv0.proto = HV_PROTOCOL_RAW;
+	hvterm_priv0.is_console = 1;
+	udbg_putc = udbg_hvc_putc;
+	udbg_getc = udbg_hvc_getc;
+	udbg_getc_poll = udbg_hvc_getc_poll;
+}
+#endif /* CONFIG_PPC_EARLY_DEBUG_LPAR */
 
-	return num_found;
+#ifdef CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI
+void __init udbg_init_debug_lpar_hvsi(void)
+{
+	hvterm_privs[0] = &hvterm_priv0;
+	hvterm_priv0.termno = CONFIG_PPC_EARLY_DEBUG_HVSI_VTERMNO;
+	hvterm_priv0.proto = HV_PROTOCOL_HVSI;
+	hvterm_priv0.is_console = 1;
+	udbg_putc = udbg_hvc_putc;
+	udbg_getc = udbg_hvc_getc;
+	udbg_getc_poll = udbg_hvc_getc_poll;
+	hvterm_hvsi_establish(&hvterm_priv0);
 }
-console_initcall(hvc_find_vtys);
+#endif /* CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI */

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox