LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC v9 4/6] dma: of: Add common xlate function for matching by channel id
From: Alexander Popov @ 2014-03-12 11:47 UTC (permalink / raw)
  To: Gerhard Sittig, Dan Williams, Vinod Koul, Lars-Peter Clausen,
	Arnd Bergmann, Anatolij Gustschin, Andy Shevchenko,
	Alexander Popov, linuxppc-dev, dmaengine
  Cc: devicetree
In-Reply-To: <1394624875-24411-1-git-send-email-a13xp0p0v88@gmail.com>

This patch adds a new common OF dma xlate callback function which will match a
channel by it's id. The binding expects one integer argument which it will use to
lookup the channel by the id.

Unlike of_dma_simple_xlate this function is able to handle a system with
multiple DMA controllers. When registering the of dma provider with
of_dma_controller_register a pointer to the dma_device struct which is
associated with the dt node needs to passed as the data parameter.
New function will use this pointer to match only channels which belong to the
specified DMA controller.

Signed-off-by: Alexander Popov <a13xp0p0v88@gmail.com>
---
 drivers/dma/of-dma.c   | 35 +++++++++++++++++++++++++++++++++++
 include/linux/of_dma.h |  4 ++++
 2 files changed, 39 insertions(+)

diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index e8fe9dc..d5fbeaa 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -218,3 +218,38 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 			&dma_spec->args[0]);
 }
 EXPORT_SYMBOL_GPL(of_dma_simple_xlate);
+
+/**
+ * of_dma_xlate_by_chan_id - Translate dt property to DMA channel by channel id
+ * @dma_spec:	pointer to DMA specifier as found in the device tree
+ * @of_dma:	pointer to DMA controller data
+ *
+ * This function can be used as the of xlate callback for DMA driver which wants
+ * to match the channel based on the channel id. When using this xlate function
+ * the #dma-cells propety of the DMA controller dt node needs to be set to 1.
+ * The data parameter of of_dma_controller_register must be a pointer to the
+ * dma_device struct the function should match upon.
+ *
+ * Returns pointer to appropriate dma channel on success or NULL on error.
+ */
+struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+					 struct of_dma *ofdma)
+{
+	struct dma_device *dev = ofdma->of_dma_data;
+	struct dma_chan *chan, *candidate = NULL;
+
+	if (!dev || dma_spec->args_count != 1)
+		return NULL;
+
+	list_for_each_entry(chan, &dev->channels, device_node)
+		if (chan->chan_id == dma_spec->args[0]) {
+			candidate = chan;
+			break;
+		}
+
+	if (!candidate)
+		return NULL;
+
+	return dma_get_slave_channel(candidate);
+}
+EXPORT_SYMBOL_GPL(of_dma_xlate_by_chan_id);
diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h
index ae36298..56bc026 100644
--- a/include/linux/of_dma.h
+++ b/include/linux/of_dma.h
@@ -41,6 +41,8 @@ extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
 						     const char *name);
 extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 		struct of_dma *ofdma);
+extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+		struct of_dma *ofdma);
 #else
 static inline int of_dma_controller_register(struct device_node *np,
 		struct dma_chan *(*of_dma_xlate)
@@ -66,6 +68,8 @@ static inline struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_s
 	return NULL;
 }
 
+#define of_dma_xlate_by_chan_id NULL
+
 #endif
 
 #endif /* __LINUX_OF_DMA_H */
-- 
1.8.4.2

^ permalink raw reply related

* [PATCH RFC v9 5/6] dma: mpc512x: add device tree binding document
From: Alexander Popov @ 2014-03-12 11:47 UTC (permalink / raw)
  To: Gerhard Sittig, Dan Williams, Vinod Koul, Lars-Peter Clausen,
	Arnd Bergmann, Anatolij Gustschin, Andy Shevchenko,
	Alexander Popov, linuxppc-dev, dmaengine
  Cc: devicetree
In-Reply-To: <1394624875-24411-1-git-send-email-a13xp0p0v88@gmail.com>

From: Gerhard Sittig <gsi@denx.de>

introduce a device tree binding document for the MPC512x DMA controller

Signed-off-by: Gerhard Sittig <gsi@denx.de>
[ a13xp0p0v88@gmail.com: turn this into a separate patch ]
---
 .../devicetree/bindings/dma/mpc512x-dma.txt        | 55 ++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/mpc512x-dma.txt

diff --git a/Documentation/devicetree/bindings/dma/mpc512x-dma.txt b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
new file mode 100644
index 0000000..a4867d5
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
@@ -0,0 +1,55 @@
+* Freescale MPC512x DMA Controller
+
+The DMA controller in the Freescale MPC512x SoC can move blocks of
+memory contents between memory and peripherals or memory to memory.
+
+Refer to the "Generic DMA Controller and DMA request bindings" description
+in the dma.txt file for a more detailled discussion of the binding.  The
+MPC512x DMA engine binding follows the common scheme, but doesn't provide
+support for the optional channels and requests counters (those values are
+derived from the detected hardware features) and has a fixed client
+specifier length of 1 integer cell (the value is the DMA channel, since
+the DMA controller uses a fixed assignment of request lines per channel).
+
+
+DMA controller node properties:
+
+Required properties:
+- compatible:		should be "fsl,mpc5121-dma"
+- reg:			address and size of the DMA controller's register set
+- interrupts:		interrupt spec for the DMA controller
+
+Optional properties:
+- #dma-cells:		must be <1>, describes the number of integer cells
+			needed to specify the 'dmas' property in client nodes,
+			strongly recommended since common client helper code
+			uses this property
+
+Example:
+
+	dma0: dma@14000 {
+		compatible = "fsl,mpc5121-dma";
+		reg = <0x14000 0x1800>;
+		interrupts = <65 0x8>;
+		#dma-cells = <1>;
+	};
+
+
+Client node properties:
+
+Required properties:
+- dmas:			list of DMA specifiers, consisting each of a handle
+			for the DMA controller and integer cells to specify
+			the channel used within the DMA controller
+- dma-names:		list of identifier strings for the DMA specifiers,
+			client device driver code uses these strings to
+			have DMA channels looked up at the controller
+
+Example:
+
+	sdhc@1500 {
+		compatible = "fsl,mpc5121-sdhc";
+		/* ... */
+		dmas = <&dma0 30>;
+		dma-names = "rx-tx";
+	};
-- 
1.8.4.2

^ permalink raw reply related

* [PATCH RFC v9 6/6] dma: mpc512x: register for device tree channel lookup
From: Alexander Popov @ 2014-03-12 11:47 UTC (permalink / raw)
  To: Gerhard Sittig, Dan Williams, Vinod Koul, Lars-Peter Clausen,
	Arnd Bergmann, Anatolij Gustschin, Andy Shevchenko,
	Alexander Popov, linuxppc-dev, dmaengine
  Cc: devicetree
In-Reply-To: <1394624875-24411-1-git-send-email-a13xp0p0v88@gmail.com>

Register the controller for device tree based lookup of DMA channels
(non-fatal for backwards compatibility with older device trees) and
provide the '#dma-cells' property in the shared mpc5121.dtsi file

Signed-off-by: Gerhard Sittig <gsi@denx.de>
Signed-off-by: Alexander Popov <a13xp0p0v88@gmail.com>
---
 arch/powerpc/boot/dts/mpc5121.dtsi |  1 +
 drivers/dma/mpc512x_dma.c          | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/boot/dts/mpc5121.dtsi b/arch/powerpc/boot/dts/mpc5121.dtsi
index 2c0e155..7f9d14f 100644
--- a/arch/powerpc/boot/dts/mpc5121.dtsi
+++ b/arch/powerpc/boot/dts/mpc5121.dtsi
@@ -498,6 +498,7 @@
 			compatible = "fsl,mpc5121-dma";
 			reg = <0x14000 0x1800>;
 			interrupts = <65 0x8>;
+			#dma-cells = <1>;
 		};
 	};
 
diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c
index ff7f678..453b1cb 100644
--- a/drivers/dma/mpc512x_dma.c
+++ b/drivers/dma/mpc512x_dma.c
@@ -52,6 +52,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
+#include <linux/of_dma.h>
 #include <linux/of_platform.h>
 
 #include <linux/random.h>
@@ -1018,11 +1019,23 @@ static int mpc_dma_probe(struct platform_device *op)
 	/* Register DMA engine */
 	dev_set_drvdata(dev, mdma);
 	retval = dma_async_device_register(dma);
-	if (retval) {
-		free_irq(mdma->irq, mdma);
-		irq_dispose_mapping(mdma->irq);
+	if (retval)
+		goto out_irq;
+
+	/* Register with OF helpers for DMA lookups (nonfatal) */
+	if (dev->of_node) {
+		retval = of_dma_controller_register(dev->of_node,
+						of_dma_xlate_by_chan_id, mdma);
+		if (retval)
+			dev_warn(dev, "could not register for OF lookup\n");
 	}
 
+	return 0;
+
+out_irq:
+	free_irq(mdma->irq, mdma);
+	irq_dispose_mapping(mdma->irq);
+
 	return retval;
 }
 
@@ -1031,6 +1044,8 @@ static int mpc_dma_remove(struct platform_device *op)
 	struct device *dev = &op->dev;
 	struct mpc_dma *mdma = dev_get_drvdata(dev);
 
+	if (dev->of_node)
+		of_dma_controller_free(dev->of_node);
 	dma_async_device_unregister(&mdma->dma);
 	free_irq(mdma->irq, mdma);
 	irq_dispose_mapping(mdma->irq);
-- 
1.8.4.2

^ permalink raw reply related

* Re: Node 0 not necessary for powerpc?
From: Christoph Lameter @ 2014-03-12 13:41 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm, linuxppc-dev, anton, rientjes
In-Reply-To: <20140311195632.GA946@linux.vnet.ibm.com>

On Tue, 11 Mar 2014, Nishanth Aravamudan wrote:
> I have a P7 system that has no node0, but a node0 shows up in numactl
> --hardware, which has no cpus and no memory (and no PCI devices):

Well as you see from the code there has been so far the assumption that
node 0 has memory. I have never run a machine that has no node 0 memory.

^ permalink raw reply

* Re: [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040
From: Scott Wood @ 2014-03-12 17:43 UTC (permalink / raw)
  To: Kevin Hao; +Cc: linuxppc-dev, Chenhui Zhao, Jason.Jin, linux-kernel
In-Reply-To: <20140312055755.GA17203@pek-khao-d1.corp.ad.wrs.com>

On Wed, 2014-03-12 at 13:57 +0800, Kevin Hao wrote:
> On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote:
> > > +	FSL_DIS_ALL_IRQ
> > > +
> > > +	/*
> > > +	 * Place DDR controller in self refresh mode.
> > > +	 * From here on, DDR can't be access any more.
> > > +	 */
> > > +	lwz	r10, 0(r13)
> > > +	oris	r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h
> > > +	stw	r10, 0(r13)
> > > +
> > > +	/* can't call udelay() here, so use a macro to delay */
> > > +	FSLDELAY(50)
> > 
> > A timebase loop doesn't require accessing DDR.
> > 
> > You also probably want to do a "sync, readback, data dependency, isync"
> > sequence to make sure that the store has hit CCSR before you begin your
> > delay (or is a delay required at all if you do that?).
> 
> Shouldn't we use "readback, sync" here? The following is quoted form t4240RM:
>   To guarantee that the results of any sequence of writes to configuration
>   registers are in effect, the final configuration register write should be
>   immediately followed by a read of the same register, and that should be
>   followed by a SYNC instruction. Then accesses can safely be made to memory
>   regions affected by the configuration register write.

I agree that the sync before the readback is probably not necessary,
since transactions to the same address should already be ordered.

A sync after the readback helps if you're trying to order the readback
with subsequent memory accesses, though in that case wouldn't a sync
alone (no readback) be adequate?  Though maybe not always -- see the
comment near the end of fsl_elbc_write_buf() in
drivers/mtd/nand_fsl_elbc.c.  I guess the readback does more than just
make sure the device has seen the write, ensuring that the device has
finished the transaction to the point of acting on another one.

The data dependency plus isync sequence, which is done by the normal I/O
accessors used from C code, orders the readback versus all future
instructions (not just I/O).  The delay loop is not I/O.

> > > +	/* Enable SCU15 to trigger on RCPM Concentrator 0 */
> > > +	lwz	r10, 0(r15)
> > > +	oris	r10, r10, DCSR_EPU_EPECR15_IC0@h
> > > +	stw	r10, 0(r15)
> > > +
> > > +	/* put Core0 in PH15 mode, trigger EPU FSM */
> > > +	lwz	r10, 0(r12)
> > > +	ori	r10, r10, CCSR_RCPM_PCPH15SETR_CORE0
> > > +	stw	r10, 0(r12)
> > 
> > Shouldn't there be a sync to ensure that the previous I/O happens before
> > the final store to enter PH15?
> 
> Do we really need a sync here? According to the PowerISA, the above stores
> should be performed in program order.
>   If two Store instructions or two Load instructions
>   specify storage locations that are both Caching
>   Inhibited and Guarded, the corresponding storage
>   accesses are performed in program order with
>   respect to any processor or mechanism.

OK, wasn't aware of that.

-Scott

^ permalink raw reply

* [PATCH] powerpc: Update ppc4xx maintainer
From: Josh Boyer @ 2014-03-12 18:21 UTC (permalink / raw)
  To: benh; +Cc: Alistair Popple, linuxppc-dev

Alistair Popple has volunteered to take over maintainership of the ppc4xx
stuff upstream.  Switch the MAINTAINERS entry over to him.

Signed-off-by: Josh Boyer <jwboyer@gmail.com>
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1ecfde1..6d220c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5245,11 +5245,10 @@ F:	arch/powerpc/platforms/512x/
 F:	arch/powerpc/platforms/52xx/
 
 LINUX FOR POWERPC EMBEDDED PPC4XX
-M:	Josh Boyer <jwboyer@gmail.com>
+M:  Alistair Popple <alistair@popple.id.au>
 M:	Matt Porter <mporter@kernel.crashing.org>
 W:	http://www.penguinppc.org/
 L:	linuxppc-dev@lists.ozlabs.org
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jwboyer/powerpc-4xx.git
 S:	Maintained
 F:	arch/powerpc/platforms/40x/
 F:	arch/powerpc/platforms/44x/
-- 
1.8.5.3

^ permalink raw reply related

* Re: [PATCH] T1040RDB: add qe node for T1040RDB dts
From: Scott Wood @ 2014-03-12 18:36 UTC (permalink / raw)
  To: Zhao Qiang; +Cc: B07421, R63061, linuxppc-dev
In-Reply-To: <1394612762-36308-1-git-send-email-B45475@freescale.com>

On Wed, 2014-03-12 at 16:26 +0800, Zhao Qiang wrote:
> Signed-off-by: Zhao Qiang <B45475@freescale.com>
> ---
>  arch/powerpc/boot/dts/t1040rdb.dts | 43 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 43 insertions(+)
> 
> diff --git a/arch/powerpc/boot/dts/t1040rdb.dts b/arch/powerpc/boot/dts/t1040rdb.dts
> index e2eee18..6ff0412 100644
> --- a/arch/powerpc/boot/dts/t1040rdb.dts
> +++ b/arch/powerpc/boot/dts/t1040rdb.dts
> @@ -268,6 +268,49 @@
>  			fsl,fman-mac = <&enet4>;
>  		};
>  	};
> +
> +	qe: qe@ffe139999 {
> +		ranges = <0x0 0xf 0xfe140000 0x40000>;
> +		reg = <0xf 0xfe140000 0 0x480>;

reg does not match unit address

Missing compatible

> +		si1: si@700 {
> +			#address-cells = <1>;
> +			#size-cells = <0>;
> +			compatible = "fsl,qe-si";
> +			reg = <0x700 0x80>;
> +		};

Missing binding

> +
> +		siram1: siram@1000 {
> +			#address-cells = <1>;
> +			#size-cells = <1>;
> +			compatible = "fsl,qe-siram";
> +			reg = <0x1000 0x800>;
> +		};

Missing binding

> +
> +		tdma: ucc@2000 {
> +			compatible = "fsl,ucc-tdm";
> +			rx-clock-name = "clk3";
> +			tx-clock-name = "clk4";
> +			fsl,rx-sync-clock = "rsync_pin";
> +			fsl,tx-sync-clock = "tsync_pin";
> +			fsl,tx-timeslot = <0xfffffffe>;
> +			fsl,rx-timeslot = <0xfffffffe>;
> +			fsl,tdm-framer-type = "e1";
> +			fsl,tdm-mode = "normal";
> +			fsl,tdm-id = <0>;
> +			fsl,siram-entry-id = <0>;
> +		};

Missing binding

> +		serial: ucc@2200 {
> +			device_type = "serial";
> +			compatible = "ucc_uart";
> +			port-number = <1>;
> +			rx-clock-name = "brg2";
> +			tx-clock-name = "brg2";
> +		};

Missing binding

-Scott

^ permalink raw reply

* Re: [PATCH] T1040RDB: add qe node for T1040RDB dts
From: Scott Wood @ 2014-03-12 18:45 UTC (permalink / raw)
  To: Zhao Qiang; +Cc: B07421, R63061, linuxppc-dev
In-Reply-To: <1394612762-36308-1-git-send-email-B45475@freescale.com>

On Wed, 2014-03-12 at 16:26 +0800, Zhao Qiang wrote:
> Signed-off-by: Zhao Qiang <B45475@freescale.com>
> ---
>  arch/powerpc/boot/dts/t1040rdb.dts | 43 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 43 insertions(+)

Presumably this is on top of this patch:
http://patchwork.ozlabs.org/patch/314138/

...since there's no existing t1040 device tree support.  Always mention
when your patch is on top of a patch that hasn't yet been merged and
isn't in the same patch set.

At least some of this stuff seems like it should be in t1040si-post.dts
(or a file included by it), rather than the board dts.

> +		tdma: ucc@2000 {
> +			compatible = "fsl,ucc-tdm";
> +			rx-clock-name = "clk3";
> +			tx-clock-name = "clk4";
> +			fsl,rx-sync-clock = "rsync_pin";
> +			fsl,tx-sync-clock = "tsync_pin";
> +			fsl,tx-timeslot = <0xfffffffe>;
> +			fsl,rx-timeslot = <0xfffffffe>;
> +			fsl,tdm-framer-type = "e1";
> +			fsl,tdm-mode = "normal";
> +			fsl,tdm-id = <0>;
> +			fsl,siram-entry-id = <0>;
> +		};
> +
> +		serial: ucc@2200 {
> +			device_type = "serial";
> +			compatible = "ucc_uart";
> +			port-number = <1>;
> +			rx-clock-name = "brg2";
> +			tx-clock-name = "brg2";
> +		};

Missing reg.

-Scott

^ permalink raw reply

* Re: [PATCH v3 00/52] CPU hotplug: Fix issues with callback registration
From: Srivatsa S. Bhat @ 2014-03-12 20:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-arch, ego, walken, linux, linux-pm, peterz, rusty, rjw,
	oleg, linux-kernel, linuxppc-dev, paulus, tj, tglx, paulmck,
	mingo
In-Reply-To: <20140311150733.efcc594dd7fe59c9c5fe9325@linux-foundation.org>

On 03/12/2014 03:37 AM, Andrew Morton wrote:
> On Tue, 11 Mar 2014 02:03:52 +0530 "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> wrote:
> 
>> Hi,
>>
>> Many subsystems and drivers have the need to register CPU hotplug callbacks
>> from their init routines and also perform initialization for the CPUs that are
>> already online. But unfortunately there is no race-free way to achieve this
>> today.
>>
>> For example, consider this piece of code:
>>
>> 	get_online_cpus();
>>
>> 	for_each_online_cpu(cpu)
>> 		init_cpu(cpu);
>>
>> 	register_cpu_notifier(&foobar_cpu_notifier);
>>
>> 	put_online_cpus();
>>
>> This is not safe because there is a possibility of an ABBA deadlock involving
>> the cpu_add_remove_lock and the cpu_hotplug.lock.
>>
>>           CPU 0                                         CPU 1
>>           -----                                         -----
>>
>>    Acquire cpu_hotplug.lock
>>    [via get_online_cpus()]
>>
>>                                               CPU online/offline operation
>>                                               takes cpu_add_remove_lock
>>                                               [via cpu_maps_update_begin()]
>>
>>    Try to acquire
>>    cpu_add_remove_lock
>>    [via register_cpu_notifier()]
>>
>>                                               CPU online/offline operation
>>                                               tries to acquire cpu_hotplug.lock
>>                                               [via cpu_hotplug_begin()]
> 
> Can't we fix this by using a different (ie: new) lock to protect
> cpu_chain?
> 

No, that won't be a better solution than this one :-( The reason is that
CPU_POST_DEAD notifiers are invoked with the cpu_hotplug.lock dropped (by
design). So if we introduce the new lock, the locking would look as shown
below at the CPU hotplug side:

[ Note that it is unsafe to acquire and release the cpu-chain lock separately
for each invocation of the notifiers, because that would allow manipulations
of the cpu-chain in between two sets of notifications (such as CPU_DOWN_PREPARE
and CPU_DEAD, corresponding to the same CPU hotplug operation), which is
clearly wrong. So we need to acquire the new lock at the very beginning of
the hotplug operation and release it at the very end, after all notifiers
have been invoked.]


cpu_maps_update_begin(); //acquire cpu_add_remove_lock
    cpu_hotplug_begin(); //acquire cpu_hotplug.lock

        cpu_chain_lock(); //acquire a new lock that protects the cpu_chain

		Invoke CPU_DOWN_PREPARE notifiers
		//take cpu offline using stop-machine
		Invoke CPU_DEAD notifiers

    cpu_hotplug_done(); //release cpu_hotplug.lock

		Invoke CPU_POST_DEAD notifiers

        cpu_chain_unlock(); //release a new lock that protects the cpu_chain

cpu_maps_update_done(); //release cpu_add_remove_lock


So, if you observe the nesting of locks, it looks weird, because
cpu_hotplug.lock is acquired first, followed by cpu_chain_lock,
but they are released in the same order! IOW, they don't nest "properly".

To avoid this, if we reorder the locks in such a way that cpu_chain_lock
is the outer lock compared to cpu_hotplug.lock, then it becomes exactly
same as cpu_add_remove_lock. In other words, we can reuse the
cpu_add_remove_lock for this very purpose of protecting the cpu-chains
without adding any new lock to the CPU hotplug core code. And this is
what the existing code already does. I just utilize this fact and make
sure that we don't deadlock in the scenarios mentioned in the cover-letter
of this patchset.

Regards,
Srivatsa S. Bhat

^ permalink raw reply

* Re: [PATCH v3 32/52] powercap, intel-rapl: Fix CPU hotplug callback registration
From: Jacob Pan @ 2014-03-12 22:27 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: linux-arch, ego, walken, linux, akpm, linux-pm, peterz, rusty,
	rjw, oleg, linux-kernel, linuxppc-dev, paulus,
	Srinivas Pandruvada, tj, tglx, paulmck, mingo
In-Reply-To: <20140310203926.10746.11524.stgit@srivatsabhat.in.ibm.com>

On Tue, 11 Mar 2014 02:09:26 +0530
"Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> wrote:

> Subsystems that want to register CPU hotplug callbacks, as well as
> perform initialization for the CPUs that are already online, often do
> it as shown below:
> 
> 	get_online_cpus();
> 
> 	for_each_online_cpu(cpu)
> 		init_cpu(cpu);
> 
> 	register_cpu_notifier(&foobar_cpu_notifier);
> 
> 	put_online_cpus();
> 
> This is wrong, since it is prone to ABBA deadlocks involving the
> cpu_add_remove_lock and the cpu_hotplug.lock (when running
> concurrently with CPU hotplug operations).
> 
> Instead, the correct and race-free way of performing the callback
> registration is:
> 
> 	cpu_notifier_register_begin();
> 
> 	for_each_online_cpu(cpu)
> 		init_cpu(cpu);
> 
> 	/* Note the use of the double underscored version of the API
> */ __register_cpu_notifier(&foobar_cpu_notifier);
> 
> 	cpu_notifier_register_done();
> 
> 
> Fix the intel-rapl code in the powercap driver by using this latter
> form of callback registration. But retain the calls to
> get/put_online_cpus(), since they also protect the function
> rapl_cleanup_data(). By nesting get/put_online_cpus() *inside*
> cpu_notifier_register_begin/done(), we avoid the ABBA deadlock
> possibility mentioned above.
> 
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> ---
> 

Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>

>  drivers/powercap/intel_rapl.c |   10 ++++++++--
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/powercap/intel_rapl.c
> b/drivers/powercap/intel_rapl.c index 3c67683..d6c74c1 100644
> --- a/drivers/powercap/intel_rapl.c
> +++ b/drivers/powercap/intel_rapl.c
> @@ -1369,6 +1369,9 @@ static int __init rapl_init(void)
>  
>  		return -ENODEV;
>  	}
> +
> +	cpu_notifier_register_begin();
> +
>  	/* prevent CPU hotplug during detection */
>  	get_online_cpus();
>  	ret = rapl_detect_topology();
> @@ -1380,20 +1383,23 @@ static int __init rapl_init(void)
>  		ret = -ENODEV;
>  		goto done;
>  	}
> -	register_hotcpu_notifier(&rapl_cpu_notifier);
> +	__register_hotcpu_notifier(&rapl_cpu_notifier);
>  done:
>  	put_online_cpus();
> +	cpu_notifier_register_done();
>  
>  	return ret;
>  }
>  
>  static void __exit rapl_exit(void)
>  {
> +	cpu_notifier_register_begin();
>  	get_online_cpus();
> -	unregister_hotcpu_notifier(&rapl_cpu_notifier);
> +	__unregister_hotcpu_notifier(&rapl_cpu_notifier);
>  	rapl_unregister_powercap();
>  	rapl_cleanup_data();
>  	put_online_cpus();
> +	cpu_notifier_register_done();
>  }
>  
>  module_init(rapl_init);
> 

[Jacob Pan]

^ permalink raw reply

* Re: [PATCH v3 10/52] arm, kvm: Fix CPU hotplug callback registration
From: Christoffer Dall @ 2014-03-12 23:21 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: ego, kvm, peterz, linux-kernel, linuxppc-dev, paulus, walken,
	kvmarm, linux-arch, linux, mingo, marc.zyngier, paulmck, linux-pm,
	Gleb Natapov, rusty, tglx, linux-arm-kernel, rjw, oleg, tj,
	Paolo Bonzini, akpm
In-Reply-To: <20140310203538.10746.25364.stgit@srivatsabhat.in.ibm.com>

On Tue, Mar 11, 2014 at 02:05:38AM +0530, Srivatsa S. Bhat wrote:
> Subsystems that want to register CPU hotplug callbacks, as well as perform
> initialization for the CPUs that are already online, often do it as shown
> below:
> 
> 	get_online_cpus();
> 
> 	for_each_online_cpu(cpu)
> 		init_cpu(cpu);
> 
> 	register_cpu_notifier(&foobar_cpu_notifier);
> 
> 	put_online_cpus();
> 
> This is wrong, since it is prone to ABBA deadlocks involving the
> cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently
> with CPU hotplug operations).
> 
> Instead, the correct and race-free way of performing the callback
> registration is:
> 
> 	cpu_notifier_register_begin();
> 
> 	for_each_online_cpu(cpu)
> 		init_cpu(cpu);
> 
> 	/* Note the use of the double underscored version of the API */
> 	__register_cpu_notifier(&foobar_cpu_notifier);
> 
> 	cpu_notifier_register_done();
> 
> 
> Fix the kvm code in arm by using this latter form of callback registration.
> 
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Gleb Natapov <gleb@kernel.org>
> Cc: Russell King <linux@arm.linux.org.uk>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: kvmarm@lists.cs.columbia.edu
> Cc: kvm@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> ---
> 
>  arch/arm/kvm/arm.c |    7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index bd18bb8..f0e50a0 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -1051,21 +1051,26 @@ int kvm_arch_init(void *opaque)
>  		}
>  	}
>  
> +	cpu_notifier_register_begin();
> +
>  	err = init_hyp_mode();
>  	if (err)
>  		goto out_err;
>  
> -	err = register_cpu_notifier(&hyp_init_cpu_nb);
> +	err = __register_cpu_notifier(&hyp_init_cpu_nb);
>  	if (err) {
>  		kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
>  		goto out_err;
>  	}
>  
> +	cpu_notifier_register_done();
> +
>  	hyp_cpu_pm_init();
>  
>  	kvm_coproc_table_init();
>  	return 0;
>  out_err:
> +	cpu_notifier_register_done();
>  	return err;
>  }
>  
> 

Just so we're clear, the existing code was simply racy as not prone to
deadlocks, right?

This makes it clear that the test above for compatible CPUs can be quite
easily evaded by using CPU hotplug, but we don't really have a good
solution for handling that yet...  Hmmm, grumble grumble, I guess if you
hotplug unsupported CPUs on a KVM/ARM system for now, stuff will break.

In any case:
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>

^ permalink raw reply

* RE: [PATCH] T1040RDB: add qe node for T1040RDB dts
From: qiang.zhao @ 2014-03-13  1:56 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev@lists.ozlabs.org, Xiaobo Xie
In-Reply-To: <1394649930.13761.154.camel@snotra.buserror.net>

T24gV2VkLCAyMDE0LTAzLTEzIGF0IDI6NDYgQU0sIFNjb3R0IHdyb3RlOg0KDQo+IC0tLS0tT3Jp
Z2luYWwgTWVzc2FnZS0tLS0tDQo+IEZyb206IFdvb2QgU2NvdHQtQjA3NDIxDQo+IFNlbnQ6IFRo
dXJzZGF5LCBNYXJjaCAxMywgMjAxNCAyOjQ2IEFNDQo+IFRvOiBaaGFvIFFpYW5nLUI0NTQ3NQ0K
PiBDYzogbGludXhwcGMtZGV2QGxpc3RzLm96bGFicy5vcmc7IFdvb2QgU2NvdHQtQjA3NDIxOyBY
aWUgWGlhb2JvLVI2MzA2MQ0KPiBTdWJqZWN0OiBSZTogW1BBVENIXSBUMTA0MFJEQjogYWRkIHFl
IG5vZGUgZm9yIFQxMDQwUkRCIGR0cw0KPiANCj4gT24gV2VkLCAyMDE0LTAzLTEyIGF0IDE2OjI2
ICswODAwLCBaaGFvIFFpYW5nIHdyb3RlOg0KPiA+IFNpZ25lZC1vZmYtYnk6IFpoYW8gUWlhbmcg
PEI0NTQ3NUBmcmVlc2NhbGUuY29tPg0KPiA+IC0tLQ0KPiA+ICBhcmNoL3Bvd2VycGMvYm9vdC9k
dHMvdDEwNDByZGIuZHRzIHwgNDMNCj4gPiArKysrKysrKysrKysrKysrKysrKysrKysrKysrKysr
KysrKysrKw0KPiA+ICAxIGZpbGUgY2hhbmdlZCwgNDMgaW5zZXJ0aW9ucygrKQ0KPiANCj4gUHJl
c3VtYWJseSB0aGlzIGlzIG9uIHRvcCBvZiB0aGlzIHBhdGNoOg0KPiBodHRwOi8vcGF0Y2h3b3Jr
Lm96bGFicy5vcmcvcGF0Y2gvMzE0MTM4Lw0KPiANCj4gLi4uc2luY2UgdGhlcmUncyBubyBleGlz
dGluZyB0MTA0MCBkZXZpY2UgdHJlZSBzdXBwb3J0LiAgQWx3YXlzIG1lbnRpb24NCj4gd2hlbiB5
b3VyIHBhdGNoIGlzIG9uIHRvcCBvZiBhIHBhdGNoIHRoYXQgaGFzbid0IHlldCBiZWVuIG1lcmdl
ZCBhbmQNCj4gaXNuJ3QgaW4gdGhlIHNhbWUgcGF0Y2ggc2V0Lg0KPiANCj4gQXQgbGVhc3Qgc29t
ZSBvZiB0aGlzIHN0dWZmIHNlZW1zIGxpa2UgaXQgc2hvdWxkIGJlIGluIHQxMDQwc2ktcG9zdC5k
dHMNCj4gKG9yIGEgZmlsZSBpbmNsdWRlZCBieSBpdCksIHJhdGhlciB0aGFuIHRoZSBib2FyZCBk
dHMuDQoNCkV2ZXJ5IGJvYXJkIGNhbiB1c2UgdWNjIGRpZmZlcmVudGx5LCBJdCBpcyBub3QgY29y
cmVjdCB0byBwdXQgdGhpcyBub2RlIGludG8gdDEwNDBzaS1wb3N0LmR0c2kuDQpGb3IgZXhhbXBs
ZSB0MTA0MHFkcyBjYW4gdXNlIHVjYzEgdG8gdGRtIHdoaWxlIG1heWJlIHQxMDQwcmRiIHVzZSB1
Y2MxIHRvIHVhcnQuDQoNCj4gDQo+ID4gKwkJdGRtYTogdWNjQDIwMDAgew0KPiA+ICsJCQljb21w
YXRpYmxlID0gImZzbCx1Y2MtdGRtIjsNCj4gPiArCQkJcngtY2xvY2stbmFtZSA9ICJjbGszIjsN
Cj4gPiArCQkJdHgtY2xvY2stbmFtZSA9ICJjbGs0IjsNCj4gPiArCQkJZnNsLHJ4LXN5bmMtY2xv
Y2sgPSAicnN5bmNfcGluIjsNCj4gPiArCQkJZnNsLHR4LXN5bmMtY2xvY2sgPSAidHN5bmNfcGlu
IjsNCj4gPiArCQkJZnNsLHR4LXRpbWVzbG90ID0gPDB4ZmZmZmZmZmU+Ow0KPiA+ICsJCQlmc2ws
cngtdGltZXNsb3QgPSA8MHhmZmZmZmZmZT47DQo+ID4gKwkJCWZzbCx0ZG0tZnJhbWVyLXR5cGUg
PSAiZTEiOw0KPiA+ICsJCQlmc2wsdGRtLW1vZGUgPSAibm9ybWFsIjsNCj4gPiArCQkJZnNsLHRk
bS1pZCA9IDwwPjsNCj4gPiArCQkJZnNsLHNpcmFtLWVudHJ5LWlkID0gPDA+Ow0KPiA+ICsJCX07
DQo+ID4gKw0KPiA+ICsJCXNlcmlhbDogdWNjQDIyMDAgew0KPiA+ICsJCQlkZXZpY2VfdHlwZSA9
ICJzZXJpYWwiOw0KPiA+ICsJCQljb21wYXRpYmxlID0gInVjY191YXJ0IjsNCj4gPiArCQkJcG9y
dC1udW1iZXIgPSA8MT47DQo+ID4gKwkJCXJ4LWNsb2NrLW5hbWUgPSAiYnJnMiI7DQo+ID4gKwkJ
CXR4LWNsb2NrLW5hbWUgPSAiYnJnMiI7DQo+ID4gKwkJfTsNCj4gDQo+IE1pc3NpbmcgcmVnLg0K
PiANCj4gLVNjb3R0DQo+IA0KDQo=

^ permalink raw reply

* Re: [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040
From: Kevin Hao @ 2014-03-13  7:46 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, Chenhui Zhao, Jason.Jin, linux-kernel
In-Reply-To: <1394646185.13761.145.camel@snotra.buserror.net>

[-- Attachment #1: Type: text/plain, Size: 1829 bytes --]

On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote:
> > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM:
> >   To guarantee that the results of any sequence of writes to configuration
> >   registers are in effect, the final configuration register write should be
> >   immediately followed by a read of the same register, and that should be
> >   followed by a SYNC instruction. Then accesses can safely be made to memory
> >   regions affected by the configuration register write.
> 
> I agree that the sync before the readback is probably not necessary,
> since transactions to the same address should already be ordered.
> 
> A sync after the readback helps if you're trying to order the readback
> with subsequent memory accesses, though in that case wouldn't a sync
> alone (no readback) be adequate?

No, we don't just want to order the subsequent memory access here.
The 'write, readback, sync' is the required sequence if we want to make
sure that the writing to CCSR register does really take effect.

>  Though maybe not always -- see the
> comment near the end of fsl_elbc_write_buf() in
> drivers/mtd/nand_fsl_elbc.c.  I guess the readback does more than just
> make sure the device has seen the write, ensuring that the device has
> finished the transaction to the point of acting on another one.

Agree.

> 
> The data dependency plus isync sequence, which is done by the normal I/O
> accessors used from C code, orders the readback versus all future
> instructions (not just I/O).  The delay loop is not I/O.

According to the PowerISA, the sequence 'load, date dependency, isync' only
order the load accesses. So if we want to order all the storage access as well
as execution synchronization, we should choose sync here.

Thanks,
Kevin

[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* [PATCH] powerpc/perf: Fix handling of L3 events with bank == 1
From: Michael Ellerman @ 2014-03-13  8:30 UTC (permalink / raw)
  To: linuxppc-dev

Currently we reject events which have the L3 bank == 1, such as
0x000084918F, because the cache field is non-zero.

However that is incorrect, because although the bank is non-zero, the
value we would write into MMCRC is zero, and so we can count the event.

So fix the check to ignore the bank selector when checking whether the
cache selector is non-zero.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/power8-pmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 3ad363d..fe2763b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -325,9 +325,10 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
 		 * HV writable, and there is no API for guest kernels to modify
 		 * it. The solution is for the hypervisor to initialise the
 		 * field to zeroes, and for us to only ever allow events that
-		 * have a cache selector of zero.
+		 * have a cache selector of zero. The bank selector (bit 3) is
+		 * irrelevant, as long as the rest of the value is 0.
 		 */
-		if (cache)
+		if (cache & 0x7)
 			return -1;
 
 	} else if (event & EVENT_IS_L1) {
-- 
1.8.3.2

^ permalink raw reply related

* Re: Node 0 not necessary for powerpc?
From: Nishanth Aravamudan @ 2014-03-13 16:48 UTC (permalink / raw)
  To: David Rientjes; +Cc: linux-mm, cl, linuxppc-dev, anton
In-Reply-To: <alpine.DEB.2.02.1403111900100.19193@chino.kir.corp.google.com>

On 11.03.2014 [19:02:17 -0700], David Rientjes wrote:
> On Tue, 11 Mar 2014, Nishanth Aravamudan wrote:
> 
> > I have a P7 system that has no node0, but a node0 shows up in numactl
> > --hardware, which has no cpus and no memory (and no PCI devices):
> > 
> > numactl --hardware
> > available: 4 nodes (0-3)
> > node 0 cpus:
> > node 0 size: 0 MB
> > node 0 free: 0 MB
> > node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11
> > node 1 size: 0 MB
> > node 1 free: 0 MB
> > node 2 cpus:
> > node 2 size: 7935 MB
> > node 2 free: 7716 MB
> > node 3 cpus:
> > node 3 size: 8395 MB
> > node 3 free: 8015 MB
> > node distances:
> > node   0   1   2   3 
> >   0:  10  20  10  20 
> >   1:  20  10  20  20 
> >   2:  10  20  10  20 
> >   3:  20  20  20  10 
> > 
> > This is because we statically initialize N_ONLINE to be [0] in
> > mm/page_alloc.c:
> > 
> >         [N_ONLINE] = { { [0] = 1UL } },
> > 
> > I'm not sure what the architectural requirements are here, but at least
> > on this test system, removing this initialization, it boots fine and is
> > running. I've not yet tried stress tests, but it's survived the
> > beginnings of kernbench so far.
> > 
> > numactl --hardware
> > available: 3 nodes (1-3)
> > node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11
> > node 1 size: 0 MB
> > node 1 free: 0 MB
> > node 2 cpus:
> > node 2 size: 7935 MB
> > node 2 free: 7479 MB
> > node 3 cpus:
> > node 3 size: 8396 MB
> > node 3 free: 8375 MB
> > node distances:
> > node   1   2   3 
> >   1:  10  20  20 
> >   2:  20  10  20 
> >   3:  20  20  10
> > 
> > Perhaps we could put in a ARCH_DOES_NOT_NEED_NODE0 and only define it on
> > powerpc for now, conditionalizing the above initialization on that?
> > 
> 
> I don't know if anything has recently changed in the past year or so, but 
> I've booted x86 machines with a hacked BIOS so that all memory on node 0 
> is hotpluggable and offline, so I believe this is possible on x86 as well.

Good to know, thanks! This is also certainly not very common on powerpc,
but it is possible -- and the topology ends up being inaccurate because
of the static initialization.

Thanks,
Nish

^ permalink raw reply

* Re: Node 0 not necessary for powerpc?
From: Nishanth Aravamudan @ 2014-03-13 16:49 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm, linuxppc-dev, anton, rientjes
In-Reply-To: <alpine.DEB.2.10.1403120839110.6865@nuc>

On 12.03.2014 [08:41:40 -0500], Christoph Lameter wrote:
> On Tue, 11 Mar 2014, Nishanth Aravamudan wrote:
> > I have a P7 system that has no node0, but a node0 shows up in numactl
> > --hardware, which has no cpus and no memory (and no PCI devices):
> 
> Well as you see from the code there has been so far the assumption that
> node 0 has memory. I have never run a machine that has no node 0 memory.

Do you mean beyond the initialization? I didn't see anything obvious so
far in the code itself that assumes a given node has memory (in the
sense of the nid). What are your thoughts about how best to support
this?

Thanks,
Nish

^ permalink raw reply

* Re: [RFC PATCH 2/3] topology: support node_numa_mem() for determining the fallback node
From: Nishanth Aravamudan @ 2014-03-13 16:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Han Pingtian, Matt Mackall, Pekka Enberg,
	Linux Memory Management List, Paul Mackerras, Anton Blanchard,
	David Rientjes, Joonsoo Kim, linuxppc-dev, Wanpeng Li
In-Reply-To: <alpine.DEB.2.10.1402241353070.20839@nuc>

On 24.02.2014 [13:54:35 -0600], Christoph Lameter wrote:
> On Mon, 24 Feb 2014, Joonsoo Kim wrote:
> 
> > > It will not common get there because of the tracking. Instead a per cpu
> > > object will be used.
> > > > get_partial_node() always fails even if there are some partial slab on
> > > > memoryless node's neareast node.
> > >
> > > Correct and that leads to a page allocator action whereupon the node will
> > > be marked as empty.
> >
> > Why do we need to request to a page allocator if there is partial slab?
> > Checking whether node is memoryless or not is really easy, so we don't need
> > to skip this. To skip this is suboptimal solution.
> 
> The page allocator action is also used to determine to which other node we
> should fall back if the node is empty. So we need to call the page
> allocator when the per cpu slab is exhaused with the node of the
> memoryless node to get memory from the proper fallback node.

Where do we stand with these patches? I feel like no resolution was
really found...

Thanks,
Nish

^ permalink raw reply

* Re: Bug in reclaim logic with exhausted nodes?
From: Nishanth Aravamudan @ 2014-03-13 17:01 UTC (permalink / raw)
  To: linux-mm; +Cc: mgorman, cl, linuxppc-dev, anton, rientjes
In-Reply-To: <20140311210614.GB946@linux.vnet.ibm.com>

There might have been an error in my original mail, so resending...

On 11.03.2014 [14:06:14 -0700], Nishanth Aravamudan wrote:
> We have seen the following situation on a test system:
> 
> 2-node system, each node has 32GB of memory.
> 
> 2 gigantic (16GB) pages reserved at boot-time, both of which are
> allocated from node 1.
> 
> SLUB notices this:
> 
> [    0.000000] SLUB: Unable to allocate memory from node 1
> [    0.000000] SLUB: Allocating a useless per node structure in order to
> be able to continue
> 
> After boot, user then did:
> 
> echo 24 > /proc/sys/vm/nr_hugepages
> 
> And tasks are stuck:
> 
> [<c0000000010980b8>] kexec_stack+0xb8/0x8000
> [<c0000000000144d0>] .__switch_to+0x1c0/0x390
> [<c0000000001ac708>] .throttle_direct_reclaim.isra.31+0x238/0x2c0
> [<c0000000001b0b34>] .try_to_free_pages+0xb4/0x210
> [<c0000000001a2f1c>] .__alloc_pages_nodemask+0x75c/0xb00
> [<c0000000001eafb0>] .alloc_fresh_huge_page+0x70/0x150
> [<c0000000001eb2d0>] .set_max_huge_pages.part.37+0x130/0x2f0
> [<c0000000001eb7c8>] .hugetlb_sysctl_handler_common+0x168/0x180
> [<c0000000002ae21c>] .proc_sys_call_handler+0xfc/0x120
> [<c00000000021dcc0>] .vfs_write+0xe0/0x260
> [<c00000000021e8c8>] .SyS_write+0x58/0xd0
> [<c000000000009e7c>] syscall_exit+0x0/0x7c
> 
> [<c00000004f9334b0>] 0xc00000004f9334b0
> [<c0000000000144d0>] .__switch_to+0x1c0/0x390
> [<c0000000001ac708>] .throttle_direct_reclaim.isra.31+0x238/0x2c0
> [<c0000000001b0b34>] .try_to_free_pages+0xb4/0x210
> [<c0000000001a2f1c>] .__alloc_pages_nodemask+0x75c/0xb00
> [<c0000000001eafb0>] .alloc_fresh_huge_page+0x70/0x150
> [<c0000000001eb2d0>] .set_max_huge_pages.part.37+0x130/0x2f0
> [<c0000000001eb7c8>] .hugetlb_sysctl_handler_common+0x168/0x180
> [<c0000000002ae21c>] .proc_sys_call_handler+0xfc/0x120
> [<c00000000021dcc0>] .vfs_write+0xe0/0x260
> [<c00000000021e8c8>] .SyS_write+0x58/0xd0
> [<c000000000009e7c>] syscall_exit+0x0/0x7c
> 
> [<c00000004f91f440>] 0xc00000004f91f440
> [<c0000000000144d0>] .__switch_to+0x1c0/0x390
> [<c0000000001ac708>] .throttle_direct_reclaim.isra.31+0x238/0x2c0
> [<c0000000001b0b34>] .try_to_free_pages+0xb4/0x210
> [<c0000000001a2f1c>] .__alloc_pages_nodemask+0x75c/0xb00
> [<c0000000001eafb0>] .alloc_fresh_huge_page+0x70/0x150
> [<c0000000001eb2d0>] .set_max_huge_pages.part.37+0x130/0x2f0
> [<c0000000001eb54c>] .nr_hugepages_store_common.isra.39+0xbc/0x1b0
> [<c0000000003662cc>] .kobj_attr_store+0x2c/0x50
> [<c0000000002b2c2c>] .sysfs_write_file+0xec/0x1c0
> [<c00000000021dcc0>] .vfs_write+0xe0/0x260
> [<c00000000021e8c8>] .SyS_write+0x58/0xd0
> [<c000000000009e7c>] syscall_exit+0x0/0x7c
> 
> kswapd1 is also pegged at this point at 100% cpu.
> 
> If we go in and manually:
> 
> echo 24 >
> /sys/devices/system/node/node0/hugepages/hugepages-16384kB/nr_hugepages
> 
> rather than relying on the interleaving allocator from the sysctl, the
> allocation succeeds (and the echo returns immediately).
> 
> I think we are hitting the following:
> 
> mm/hugetlb.c::alloc_fresh_huge_page_node():
> 
>         page = alloc_pages_exact_node(nid,
>                 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
>                                                 __GFP_REPEAT|__GFP_NOWARN,
>                 huge_page_order(h));
> 
> include/linux/gfp.h:
> 
> #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
> 
> and mm/page_alloc.c::__alloc_pages_slowpath():
> 
>         /*
>          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
>          * __GFP_NOWARN set) should not cause reclaim since the subsystem
>          * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
>          * using a larger set of nodes after it has established that the
>          * allowed per node queues are empty and that nodes are
>          * over allocated.
>          */
>         if (IS_ENABLED(CONFIG_NUMA) &&
>                         (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
>                 goto nopage;
> 
> so we *do* reclaim in this callpath. Under my reading, since node1 is
> exhausted, no matter how much work kswapd1 does, it will never reclaim
> memory from node1 to satisfy a 16M page allocation request (or any
> other, for that matter).
> 
> I see the following possible changes/fixes, but am unsure if
> a) my analysis is right
> b) which is best.
> 
> 1) Since we did notice early in boot that (in this case) node 1 was
> exhausted, perhaps we should mark it as such there somehow, and if a
> __GFP_THISNODE allocation request comes through on such a node, we
> immediately fallthrough to nopage?
> 
> 2) There is the following check
>         /*
>          * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
>          * specified, then we retry until we no longer reclaim any pages
>          * (above), or we've reclaimed an order of pages at least as
>          * large as the allocation's order. In both cases, if the
>          * allocation still fails, we stop retrying.
>          */
>         if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
>                 return 1;
> 
> I wonder if we should add a check to also be sure that the pages we are
> reclaiming, if __GFP_THISNODE is set, are from the right node?
> 
>        if (gfp_mask & __GFP_THISNODE && the progress we have made is on
>        		the node requested?)
> 
> 3) did_some_progress could be updated to track where the progress is
> occuring, and if we are in __GFP_THISNODE allocation request and we
> didn't make any progress on the correct node, we fail the allocation?
> 
> I think this situation could be reproduced (and am working on it) by
> exhausting a NUMA node with 16M hugepages and then using the generic
> RR allocator to ask for more. Other node exhaustion cases probably
> exist, but since we can't swap the hugepages, it seems like the most
> straightforward way to try and reproduce it.
> 
> Any thoughts on this? Am I way off base?
> 
> Thanks,
> Nish
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* Re: [PATCH RFC v9 5/6] dma: mpc512x: add device tree binding document
From: Mark Rutland @ 2014-03-13 18:09 UTC (permalink / raw)
  To: Alexander Popov
  Cc: devicetree@vger.kernel.org, Lars-Peter Clausen, Arnd Bergmann,
	Vinod Koul, Gerhard Sittig, Andy Shevchenko,
	dmaengine@vger.kernel.org, Dan Williams, Anatolij Gustschin,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1394624875-24411-6-git-send-email-a13xp0p0v88@gmail.com>

On Wed, Mar 12, 2014 at 11:47:54AM +0000, Alexander Popov wrote:
> From: Gerhard Sittig <gsi@denx.de>
> 
> introduce a device tree binding document for the MPC512x DMA controller
> 
> Signed-off-by: Gerhard Sittig <gsi@denx.de>
> [ a13xp0p0v88@gmail.com: turn this into a separate patch ]
> ---
>  .../devicetree/bindings/dma/mpc512x-dma.txt        | 55 ++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/dma/mpc512x-dma.txt
> 
> diff --git a/Documentation/devicetree/bindings/dma/mpc512x-dma.txt b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
> new file mode 100644
> index 0000000..a4867d5
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
> @@ -0,0 +1,55 @@
> +* Freescale MPC512x DMA Controller
> +
> +The DMA controller in the Freescale MPC512x SoC can move blocks of
> +memory contents between memory and peripherals or memory to memory.
> +
> +Refer to the "Generic DMA Controller and DMA request bindings" description
> +in the dma.txt file for a more detailled discussion of the binding.  The
> +MPC512x DMA engine binding follows the common scheme, but doesn't provide
> +support for the optional channels and requests counters (those values are
> +derived from the detected hardware features) and has a fixed client
> +specifier length of 1 integer cell (the value is the DMA channel, since
> +the DMA controller uses a fixed assignment of request lines per channel).
> +
> +
> +DMA controller node properties:
> +
> +Required properties:
> +- compatible:		should be "fsl,mpc5121-dma"
> +- reg:			address and size of the DMA controller's register set
> +- interrupts:		interrupt spec for the DMA controller
> +
> +Optional properties:
> +- #dma-cells:		must be <1>, describes the number of integer cells
> +			needed to specify the 'dmas' property in client nodes,
> +			strongly recommended since common client helper code
> +			uses this property

Describe what you expect this cell to contain, not the #dma-cells
binding in general. The DMA bindings already cover that.

What are valid value that clients may use, and what do they mean?

> +
> +Example:
> +
> +	dma0: dma@14000 {
> +		compatible = "fsl,mpc5121-dma";
> +		reg = <0x14000 0x1800>;
> +		interrupts = <65 0x8>;
> +		#dma-cells = <1>;
> +	};
> +
> +
> +Client node properties:
> +
> +Required properties:
> +- dmas:			list of DMA specifiers, consisting each of a handle
> +			for the DMA controller and integer cells to specify
> +			the channel used within the DMA controller
> +- dma-names:		list of identifier strings for the DMA specifiers,
> +			client device driver code uses these strings to
> +			have DMA channels looked up at the controller

List the exact names you expect, or the dma-names property is useless.

Thanks,
Mark.

> +
> +Example:
> +
> +	sdhc@1500 {
> +		compatible = "fsl,mpc5121-sdhc";
> +		/* ... */
> +		dmas = <&dma0 30>;
> +		dma-names = "rx-tx";
> +	};
> -- 
> 1.8.4.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH 00/10] powerpc/booke64: critical and mcheck support
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Tiejun Chen, linuxppc-dev

This patchset adds the state saving required to safely take
critical and machine check exceptions on 64-bit booke, including
TLB misses from inside such exceptions.  Previously, the kernel simply
hung when encountering such an exception.

Scott Wood (8):
  powerpc/booke64: Fix exception numbers
  powerpc/e6500: Make TLB lock recursive
  powerpc/booke64: Use SPRG7 for VDSO
  powerpc/booke64: Use SPRG_TLB_EXFRAME on bolted handlers
  powerpc/booke64: Remove ints from EXCEPTION_COMMON
  powerpc/booke64: Add crit/mc/debug support to EXCEPTION_COMMON
  powerpc/booke64: Critical and machine check exception support
  Revert "powerpc/watchdog: Don't enable interrupt on PPC64 BookE"

Tiejun Chen (2):
  powerpc/book3e: initialize crit/mc/dbg kernel stack pointers
  powerpc/book3e: store crit/mc/dbg exception thread info

 arch/powerpc/include/asm/exception-64e.h    |  15 +-
 arch/powerpc/include/asm/kvm_booke_hv_asm.h |  17 +-
 arch/powerpc/include/asm/mmu-book3e.h       |   9 +-
 arch/powerpc/include/asm/paca.h             |   9 +-
 arch/powerpc/include/asm/reg.h              |  13 +-
 arch/powerpc/kernel/asm-offsets.c           |   2 +-
 arch/powerpc/kernel/exceptions-64e.S        | 435 +++++++++++++++++++++-------
 arch/powerpc/kernel/setup_64.c              |  20 +-
 arch/powerpc/kernel/vdso.c                  |   8 +-
 arch/powerpc/kernel/vdso32/getcpu.S         |   2 +-
 arch/powerpc/kernel/vdso64/getcpu.S         |   2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S     |   4 +-
 arch/powerpc/kvm/book3s_interrupts.S        |   4 +-
 arch/powerpc/kvm/bookehv_interrupts.S       |  24 +-
 arch/powerpc/mm/tlb_low_64e.S               |  63 ++--
 arch/powerpc/mm/tlb_nohash.c                |  11 +
 drivers/watchdog/booke_wdt.c                |   8 -
 17 files changed, 461 insertions(+), 185 deletions(-)

-- 
1.8.3.2

^ permalink raw reply

* [PATCH 01/10] powerpc/book3e: initialize crit/mc/dbg kernel stack pointers
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, Tiejun Chen, linuxppc-dev
In-Reply-To: <1394755249-8856-1-git-send-email-scottwood@freescale.com>

From: Tiejun Chen <tiejun.chen@windriver.com>

We already allocated critical/machine/debug check exceptions, but
we also should initialize those associated kernel stack pointers
for use by special exceptions in the PACA.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/kernel/setup_64.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index f5f11a7..da9c42f 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -552,14 +552,20 @@ static void __init irqstack_early_init(void)
 static void __init exc_lvl_early_init(void)
 {
 	unsigned int i;
+	unsigned long sp;
 
 	for_each_possible_cpu(i) {
-		critirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
-		dbgirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
-		mcheckirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		critirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].crit_kstack = __va(sp + THREAD_SIZE);
+
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		dbgirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].dbg_kstack = __va(sp + THREAD_SIZE);
+
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		mcheckirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].mc_kstack = __va(sp + THREAD_SIZE);
 	}
 
 	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 02/10] powerpc/book3e: store crit/mc/dbg exception thread info
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, Tiejun Chen, linuxppc-dev
In-Reply-To: <1394755249-8856-1-git-send-email-scottwood@freescale.com>

From: Tiejun Chen <tiejun.chen@windriver.com>

We need to store thread info to these exception thread info like something
we already did for PPC32.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/kernel/exceptions-64e.S | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 063b65d..6772512 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -36,6 +36,19 @@
  */
 #define	SPECIAL_EXC_FRAME_SIZE	INT_FRAME_SIZE
 
+/* Now we only store something to exception thread info */
+#define	EXC_LEVEL_EXCEPTION_PROLOG(type)				\
+	ld	r14,PACAKSAVE(r13);					\
+	CURRENT_THREAD_INFO(r14, r14);					\
+	CURRENT_THREAD_INFO(r15, r1);					\
+	ld	r10,TI_FLAGS(r14);		     			\
+	std	r10,TI_FLAGS(r15);			     		\
+	ld	r10,TI_PREEMPT(r14);		     			\
+	std	r10,TI_PREEMPT(r15);		     			\
+	ld	r10,TI_TASK(r14);			     		\
+	std	r10,TI_TASK(r15);
+
+
 /* Exception prolog code for all exceptions */
 #define EXCEPTION_PROLOG(n, intnum, type, addition)	    		    \
 	mtspr	SPRN_SPRG_##type##_SCRATCH,r13;	/* get spare registers */   \
@@ -69,19 +82,22 @@
 
 #define CRIT_SET_KSTACK						            \
 	ld	r1,PACA_CRIT_STACK(r13);				    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(CRIT);
 #define SPRN_CRIT_SRR0	SPRN_CSRR0
 #define SPRN_CRIT_SRR1	SPRN_CSRR1
 
 #define DBG_SET_KSTACK						            \
 	ld	r1,PACA_DBG_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(DBG);
 #define SPRN_DBG_SRR0	SPRN_DSRR0
 #define SPRN_DBG_SRR1	SPRN_DSRR1
 
 #define MC_SET_KSTACK						            \
 	ld	r1,PACA_MC_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(MC);
 #define SPRN_MC_SRR0	SPRN_MCSRR0
 #define SPRN_MC_SRR1	SPRN_MCSRR1
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 03/10] powerpc/booke64: Fix exception numbers
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, Tiejun Chen, linuxppc-dev
In-Reply-To: <1394755249-8856-1-git-send-email-scottwood@freescale.com>

altivec_unavailable was commented as 0xf20 but the code uses 0x200.
Note that 0xf20 is also used by ap_unavailable.

altivec_assist was commented as 0x1700 but the code uses 0x220.

critical_input was commented as 0x580 but the code uses 0x100.

machine_check was commented and implemented as 0x200, which conflicts
with altivec_assist (it only builds because MC_EXCEPTION_PROLOG is
commented out).  Changed to the fixed IVOR value of 0x000.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/kernel/exceptions-64e.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 6772512..41380a4 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -299,8 +299,8 @@ exception_marker:
 	.balign	0x1000
 	.globl interrupt_base_book3e
 interrupt_base_book3e:					/* fake trap */
-	EXCEPTION_STUB(0x000, machine_check)		/* 0x0200 */
-	EXCEPTION_STUB(0x020, critical_input)		/* 0x0580 */
+	EXCEPTION_STUB(0x000, machine_check)
+	EXCEPTION_STUB(0x020, critical_input)		/* 0x0100 */
 	EXCEPTION_STUB(0x040, debug_crit)		/* 0x0d00 */
 	EXCEPTION_STUB(0x060, data_storage)		/* 0x0300 */
 	EXCEPTION_STUB(0x080, instruction_storage)	/* 0x0400 */
@@ -315,8 +315,8 @@ interrupt_base_book3e:					/* fake trap */
 	EXCEPTION_STUB(0x1a0, watchdog)			/* 0x09f0 */
 	EXCEPTION_STUB(0x1c0, data_tlb_miss)
 	EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
-	EXCEPTION_STUB(0x200, altivec_unavailable)	/* 0x0f20 */
-	EXCEPTION_STUB(0x220, altivec_assist)		/* 0x1700 */
+	EXCEPTION_STUB(0x200, altivec_unavailable)
+	EXCEPTION_STUB(0x220, altivec_assist)
 	EXCEPTION_STUB(0x260, perfmon)
 	EXCEPTION_STUB(0x280, doorbell)
 	EXCEPTION_STUB(0x2a0, doorbell_crit)
@@ -343,9 +343,9 @@ interrupt_end_book3e:
 
 /* Machine Check Interrupt */
 	START_EXCEPTION(machine_check);
-	MC_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_MACHINE_CHECK,
+	MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK,
 			    PROLOG_ADDITION_NONE)
-//	EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE)
+//	EXCEPTION_COMMON(0x000, PACA_EXMC, INTS_DISABLE)
 //	bl	special_reg_save_mc
 //	addi	r3,r1,STACK_FRAME_OVERHEAD
 //	CHECK_NAPPING();
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 04/10] powerpc/e6500: Make TLB lock recursive
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, Tiejun Chen, linuxppc-dev
In-Reply-To: <1394755249-8856-1-git-send-email-scottwood@freescale.com>

Once special level interrupts are supported, we may take nested TLB
misses -- so allow the same thread to acquire the lock recursively.

The lock will not be effective against the nested TLB miss handler
trying to write the same entry as the interrupted TLB miss handler, but
that's also a problem on non-threaded CPUs that lack TLB write
conditional.  This will be addressed in the patch that enables crit/mc
support by invalidating the TLB on return from level exceptions.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/mmu-book3e.h |  9 ++++++---
 arch/powerpc/kernel/setup_64.c        |  2 ++
 arch/powerpc/mm/tlb_low_64e.S         | 19 ++++++++++++-------
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 89b785d..901dac6 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -287,11 +287,14 @@ extern int mmu_linear_psize;
 extern int mmu_vmemmap_psize;
 
 struct tlb_core_data {
+	/*
+	 * Per-core spinlock for e6500 TLB handlers (no tlbsrx.)
+	 * Must be the first struct element.
+	 */
+	u8 lock;
+
 	/* For software way selection, as on Freescale TLB1 */
 	u8 esel_next, esel_max, esel_first;
-
-	/* Per-core spinlock for e6500 TLB handlers (no tlbsrx.) */
-	u8 lock;
 };
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index da9c42f..4933909 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -102,6 +102,8 @@ static void setup_tlb_core_data(void)
 {
 	int cpu;
 
+	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);
+
 	for_each_possible_cpu(cpu) {
 		int first = cpu_first_thread_sibling(cpu);
 
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 6bf5050..1e50249 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -284,7 +284,7 @@ itlb_miss_fault_bolted:
  * r14 = page table base
  * r13 = PACA
  * r11 = tlb_per_core ptr
- * r10 = crap (free to use)
+ * r10 = cpu number
  */
 tlb_miss_common_e6500:
 	/*
@@ -293,15 +293,18 @@ tlb_miss_common_e6500:
 	 *
 	 * MAS6:IND should be already set based on MAS4
 	 */
-	addi	r10,r11,TCD_LOCK
-1:	lbarx	r15,0,r10
+1:	lbarx	r15,0,r11
+	lhz	r10,PACAPACAINDEX(r13)
 	cmpdi	r15,0
+	cmpdi	cr1,r15,1	/* set cr1.eq = 0 for non-recursive */
 	bne	2f
-	li	r15,1
-	stbcx.	r15,0,r10
+	stbcx.	r10,0,r11
 	bne	1b
+3:
 	.subsection 1
-2:	lbz	r15,0(r10)
+2:	cmpd	cr1,r15,r10	/* recursive lock due to mcheck/crit/etc? */
+	beq	cr1,3b		/* unlock will happen if cr1.eq = 0 */
+	lbz	r15,0(r11)
 	cmpdi	r15,0
 	bne	2b
 	b	1b
@@ -379,9 +382,11 @@ tlb_miss_common_e6500:
 
 tlb_miss_done_e6500:
 	.macro	tlb_unlock_e6500
+	beq	cr1,1f		/* no unlock if lock was recursively grabbed */
 	li	r15,0
 	isync
-	stb	r15,TCD_LOCK(r11)
+	stb	r15,0(r11)
+1:
 	.endm
 
 	tlb_unlock_e6500
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 05/10] powerpc/booke64: Use SPRG7 for VDSO
From: Scott Wood @ 2014-03-14  0:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: kvm-ppc, Tiejun Chen, Paul Mackerras, Anton Blanchard, Scott Wood,
	Mihai Caraman, linuxppc-dev
In-Reply-To: <1394755249-8856-1-git-send-email-scottwood@freescale.com>

Previously SPRG3 was marked for use by both VDSO and critical
interrupts (though critical interrupts were not fully implemented).

In commit 8b64a9dfb091f1eca8b7e58da82f1e7d1d5fe0ad ("powerpc/booke64:
Use SPRG0/3 scratch for bolted TLB miss & crit int"), Mihai Caraman
made an attempt to resolve this conflict by restoring the VDSO value
early in the critical interrupt, but this has some issues:

 - It's incompatible with EXCEPTION_COMMON which restores r13 from the
   by-then-overwritten scratch (this cost me some debugging time).
 - It forces critical exceptions to be a special case handled
   differently from even machine check and debug level exceptions.
 - It didn't occur to me that it was possible to make this work at all
   (by doing a final "ld r13, PACA_EXCRIT+EX_R13(r13)") until after
   I made (most of) this patch. :-)

It might be worth investigating using a load rather than SPRG on return
from all exceptions (except TLB misses where the scratch never leaves
the SPRG) -- it could save a few cycles.  Until then, let's stick with
SPRG for all exceptions.

Since we cannot use SPRG4-7 for scratch without corrupting the state of
a KVM guest, move VDSO to SPRG7 on book3e.  Since neither SPRG4-7 nor
critical interrupts exist on book3s, SPRG3 is still used for VDSO
there.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Mihai Caraman <mihai.caraman@freescale.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: kvm-ppc@vger.kernel.org
---
 arch/powerpc/include/asm/exception-64e.h    |  5 ++---
 arch/powerpc/include/asm/kvm_booke_hv_asm.h |  9 +--------
 arch/powerpc/include/asm/paca.h             |  2 +-
 arch/powerpc/include/asm/reg.h              | 13 ++++++++++---
 arch/powerpc/kernel/asm-offsets.c           |  2 +-
 arch/powerpc/kernel/exceptions-64e.S        | 19 ++-----------------
 arch/powerpc/kernel/vdso.c                  |  8 ++++----
 arch/powerpc/kernel/vdso32/getcpu.S         |  2 +-
 arch/powerpc/kernel/vdso64/getcpu.S         |  2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S     |  4 ++--
 arch/powerpc/kvm/book3s_interrupts.S        |  4 ++--
 arch/powerpc/kvm/bookehv_interrupts.S       | 10 ++++++----
 12 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index 51fa43e..e73452f 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -46,9 +46,8 @@
 #define EX_CR		(1 * 8)
 #define EX_R10		(2 * 8)
 #define EX_R11		(3 * 8)
-#define EX_R13		(4 * 8)
-#define EX_R14		(5 * 8)
-#define EX_R15		(6 * 8)
+#define EX_R14		(4 * 8)
+#define EX_R15		(5 * 8)
 
 /*
  * The TLB miss exception uses different slots.
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 3a79f53..c3e3fd5 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -36,20 +36,13 @@
  *   *(r8 + GPR11) = saved r11
  *
  * 64-bit host
- * Expected inputs (GEN/GDBELL/DBG/MC exception types):
+ * Expected inputs (GEN/GDBELL/DBG/CRIT/MC exception types):
  *  r10 = saved CR
  *  r13 = PACA_POINTER
  *  *(r13 + PACA_EX##type + EX_R10) = saved r10
  *  *(r13 + PACA_EX##type + EX_R11) = saved r11
  *  SPRN_SPRG_##type##_SCRATCH = saved r13
  *
-  * Expected inputs (CRIT exception type):
- *  r10 = saved CR
- *  r13 = PACA_POINTER
- *  *(r13 + PACA_EX##type + EX_R10) = saved r10
- *  *(r13 + PACA_EX##type + EX_R11) = saved r11
- *  *(r13 + PACA_EX##type + EX_R13) = saved r13
- *
  * Expected inputs (TLB exception type):
  *  r10 = saved CR
  *  r13 = PACA_POINTER
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9c5dbc3..948f01a 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -146,7 +146,7 @@ struct paca_struct {
 	u8 io_sync;			/* writel() needs spin_unlock sync */
 	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
 	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
-	u64 sprg3;			/* Saved user-visible sprg */
+	u64 sprg_vdso;			/* Saved user-visible sprg */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	u64 tm_scratch;                 /* TM scratch area for reclaim */
 #endif
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 90c06ec..b9ac329 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -577,9 +577,13 @@
 #define SPRN_SPRG3	0x113	/* Special Purpose Register General 3 */
 #define SPRN_USPRG3	0x103	/* SPRG3 userspace read */
 #define SPRN_SPRG4	0x114	/* Special Purpose Register General 4 */
+#define SPRN_USPRG4	0x104	/* SPRG4 userspace read */
 #define SPRN_SPRG5	0x115	/* Special Purpose Register General 5 */
+#define SPRN_USPRG5	0x105	/* SPRG5 userspace read */
 #define SPRN_SPRG6	0x116	/* Special Purpose Register General 6 */
+#define SPRN_USPRG6	0x106	/* SPRG6 userspace read */
 #define SPRN_SPRG7	0x117	/* Special Purpose Register General 7 */
+#define SPRN_USPRG7	0x107	/* SPRG7 userspace read */
 #define SPRN_SRR0	0x01A	/* Save/Restore Register 0 */
 #define SPRN_SRR1	0x01B	/* Save/Restore Register 1 */
 #define   SRR1_ISI_NOPT		0x40000000 /* ISI: Not found in hash */
@@ -879,11 +883,10 @@
  * 64-bit embedded
  *	- SPRG0 generic exception scratch
  *	- SPRG2 TLB exception stack
- *	- SPRG3 critical exception scratch and
- *        CPU and NUMA node for VDSO getcpu (user visible)
+ *	- SPRG3 critical exception scratch (user visible, sorry!)
  *	- SPRG4 unused (user visible)
  *	- SPRG6 TLB miss scratch (user visible, sorry !)
- *	- SPRG7 critical exception scratch
+ *	- SPRG7 CPU and NUMA node for VDSO getcpu (user visible)
  *	- SPRG8 machine check exception scratch
  *	- SPRG9 debug exception scratch
  *
@@ -940,6 +943,8 @@
 #define SPRN_SPRG_SCRATCH0	SPRN_SPRG2
 #define SPRN_SPRG_HPACA		SPRN_HSPRG0
 #define SPRN_SPRG_HSCRATCH0	SPRN_HSPRG1
+#define SPRN_SPRG_VDSO_READ	SPRN_USPRG3
+#define SPRN_SPRG_VDSO_WRITE	SPRN_SPRG3
 
 #define GET_PACA(rX)					\
 	BEGIN_FTR_SECTION_NESTED(66);			\
@@ -983,6 +988,8 @@
 #define SPRN_SPRG_TLB_SCRATCH	SPRN_SPRG6
 #define SPRN_SPRG_GEN_SCRATCH	SPRN_SPRG0
 #define SPRN_SPRG_GDBELL_SCRATCH SPRN_SPRG_GEN_SCRATCH
+#define SPRN_SPRG_VDSO_READ	SPRN_USPRG7
+#define SPRN_SPRG_VDSO_WRITE	SPRN_SPRG7
 
 #define SET_PACA(rX)	mtspr	SPRN_SPRG_PACA,rX
 #define GET_PACA(rX)	mfspr	rX,SPRN_SPRG_PACA
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b5aacf7..dba8140 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -253,7 +253,7 @@ int main(void)
 	DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
 	DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost));
-	DEFINE(PACA_SPRG3, offsetof(struct paca_struct, sprg3));
+	DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso));
 #endif /* CONFIG_PPC64 */
 
 	/* RTAS */
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 41380a4..89e1133 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -55,7 +55,6 @@
 	mfspr	r13,SPRN_SPRG_PACA;	/* get PACA */			    \
 	std	r10,PACA_EX##type+EX_R10(r13);				    \
 	std	r11,PACA_EX##type+EX_R11(r13);				    \
-	PROLOG_STORE_RESTORE_SCRATCH_##type;				    \
 	mfcr	r10;			/* save CR */			    \
 	mfspr	r11,SPRN_##type##_SRR1;/* what are we coming from */	    \
 	DO_KVM	intnum,SPRN_##type##_SRR1;    /* KVM hook */		    \
@@ -116,20 +115,6 @@
 #define GDBELL_EXCEPTION_PROLOG(n, intnum, addition)			    \
 	EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n))
 
-/*
- * Store user-visible scratch in PACA exception slots and restore proper value
- */
-#define PROLOG_STORE_RESTORE_SCRATCH_GEN
-#define PROLOG_STORE_RESTORE_SCRATCH_GDBELL
-#define PROLOG_STORE_RESTORE_SCRATCH_DBG
-#define PROLOG_STORE_RESTORE_SCRATCH_MC
-
-#define PROLOG_STORE_RESTORE_SCRATCH_CRIT				    \
-	mfspr	r10,SPRN_SPRG_CRIT_SCRATCH;	/* get r13 */		    \
-	std	r10,PACA_EXCRIT+EX_R13(r13);				    \
-	ld	r11,PACA_SPRG3(r13);					    \
-	mtspr	SPRN_SPRG_CRIT_SCRATCH,r11;
-
 /* Variants of the "addition" argument for the prolog
  */
 #define PROLOG_ADDITION_NONE_GEN(n)
@@ -529,7 +514,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	mtcr	r10
 	ld	r10,PACA_EXCRIT+EX_R10(r13)	/* restore registers */
 	ld	r11,PACA_EXCRIT+EX_R11(r13)
-	ld	r13,PACA_EXCRIT+EX_R13(r13)
+	mfspr	r13,SPRN_SPRG_CRIT_SCRATCH
 	rfci
 
 	/* Normal debug exception */
@@ -542,7 +527,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	/* Now we mash up things to make it look like we are coming on a
 	 * normal exception
 	 */
-	ld	r15,PACA_EXCRIT+EX_R13(r13)
+	mfspr	r15,SPRN_SPRG_CRIT_SCRATCH
 	mtspr	SPRN_SPRG_GEN_SCRATCH,r15
 	mfspr	r14,SPRN_DBSR
 	EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 094e45c..ce74c33 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -715,8 +715,8 @@ int vdso_getcpu_init(void)
 	unsigned long cpu, node, val;
 
 	/*
-	 * SPRG3 contains the CPU in the bottom 16 bits and the NUMA node in
-	 * the next 16 bits. The VDSO uses this to implement getcpu().
+	 * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node
+	 * in the next 16 bits.  The VDSO uses this to implement getcpu().
 	 */
 	cpu = get_cpu();
 	WARN_ON_ONCE(cpu > 0xffff);
@@ -725,8 +725,8 @@ int vdso_getcpu_init(void)
 	WARN_ON_ONCE(node > 0xffff);
 
 	val = (cpu & 0xfff) | ((node & 0xffff) << 16);
-	mtspr(SPRN_SPRG3, val);
-	get_paca()->sprg3 = val;
+	mtspr(SPRN_SPRG_VDSO_WRITE, val);
+	get_paca()->sprg_vdso = val;
 
 	put_cpu();
 
diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S
index 47afd08..23eb9a9 100644
--- a/arch/powerpc/kernel/vdso32/getcpu.S
+++ b/arch/powerpc/kernel/vdso32/getcpu.S
@@ -29,7 +29,7 @@
  */
 V_FUNCTION_BEGIN(__kernel_getcpu)
   .cfi_startproc
-	mfspr	r5,SPRN_USPRG3
+	mfspr	r5,SPRN_SPRG_VDSO_READ
 	cmpdi	cr0,r3,0
 	cmpdi	cr1,r4,0
 	clrlwi  r6,r5,16
diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S
index 47afd08..23eb9a9 100644
--- a/arch/powerpc/kernel/vdso64/getcpu.S
+++ b/arch/powerpc/kernel/vdso64/getcpu.S
@@ -29,7 +29,7 @@
  */
 V_FUNCTION_BEGIN(__kernel_getcpu)
   .cfi_startproc
-	mfspr	r5,SPRN_USPRG3
+	mfspr	r5,SPRN_SPRG_VDSO_READ
 	cmpdi	cr0,r3,0
 	cmpdi	cr1,r4,0
 	clrlwi  r6,r5,16
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e66d4ec..fbfca57 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -75,8 +75,8 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 	/* Restore SPRG3 */
-	ld	r3,PACA_SPRG3(r13)
-	mtspr	SPRN_SPRG3,r3
+	ld	r3,PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE,r3
 
 	/* Reload the host's PMU registers */
 	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index f779450..3533c99 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -153,8 +153,8 @@ kvm_start_lightweight:
 	 * Reload kernel SPRG3 value.
 	 * No need to save guest value as usermode can't modify SPRG3.
 	 */
-	ld	r3, PACA_SPRG3(r13)
-	mtspr	SPRN_SPRG3, r3
+	ld	r3, PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 	/* R7 = vcpu */
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index e4185f6..99635a3 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -229,11 +229,7 @@
 	stw	r10, VCPU_CR(r4)
 	PPC_STL r11, VCPU_GPR(R4)(r4)
 	PPC_STL	r5, VCPU_GPR(R5)(r4)
-	.if \type == EX_CRIT
-	PPC_LL	r5, (\paca_ex + EX_R13)(r13)
-	.else
 	mfspr	r5, \scratch
-	.endif
 	PPC_STL	r6, VCPU_GPR(R6)(r4)
 	PPC_STL	r8, VCPU_GPR(R8)(r4)
 	PPC_STL	r9, VCPU_GPR(R9)(r4)
@@ -435,10 +431,16 @@ _GLOBAL(kvmppc_resume_host)
 	PPC_STL	r5, VCPU_LR(r4)
 	mfspr	r7, SPRN_SPRG5
 	stw	r3, VCPU_VRSAVE(r4)
+#ifdef CONFIG_64BIT
+	PPC_LL	r3, PACA_SPRG_VDSO(r13)
+#endif
 	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
 	mfspr	r8, SPRN_SPRG6
 	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
 	mfspr	r9, SPRN_SPRG7
+#ifdef CONFIG_64BIT
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif
 	PPC_STD(r8, VCPU_SHARED_SPRG6, r11)
 	mfxer	r3
 	PPC_STD(r9, VCPU_SHARED_SPRG7, r11)
-- 
1.8.3.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox