LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 15/17] net: can: mscan: remove non-CCF code for MPC512x
From: Gerhard Sittig @ 2013-11-17 23:06 UTC (permalink / raw)
  To: linuxppc-dev, linux-arm-kernel, Anatolij Gustschin,
	Mike Turquette
  Cc: Detlev Zundel, Gerhard Sittig, linux-can, Marc Kleine-Budde,
	Scott Wood, Wolfgang Grandegger
In-Reply-To: <1384729577-7336-1-git-send-email-gsi@denx.de>

transition to the common clock framework has completed and the PPC_CLOCK
is no longer available for the MPC512x platform, remove the now obsolete
code path of the mpc5xxx mscan driver which accessed clock control module
registers directly

Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: linux-can@vger.kernel.org
Signed-off-by: Gerhard Sittig <gsi@denx.de>
---
 drivers/net/can/mscan/mpc5xxx_can.c |  141 -----------------------------------
 1 file changed, 141 deletions(-)

diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c
index f48f1297ff30..6b0c9958d824 100644
--- a/drivers/net/can/mscan/mpc5xxx_can.c
+++ b/drivers/net/can/mscan/mpc5xxx_can.c
@@ -109,9 +109,6 @@ static u32 mpc52xx_can_get_clock(struct platform_device *ofdev,
 #endif /* CONFIG_PPC_MPC52xx */
 
 #ifdef CONFIG_PPC_MPC512x
-
-#if IS_ENABLED(CONFIG_COMMON_CLK)
-
 static u32 mpc512x_can_get_clock(struct platform_device *ofdev,
 				 const char *clock_source, int *mscan_clksrc)
 {
@@ -277,144 +274,6 @@ static void mpc512x_can_put_clock(struct platform_device *ofdev)
 	if (priv->clk_ipg)
 		clk_disable_unprepare(priv->clk_ipg);
 }
-
-#else	/* COMMON_CLK */
-
-struct mpc512x_clockctl {
-	u32 spmr;		/* System PLL Mode Reg */
-	u32 sccr[2];		/* System Clk Ctrl Reg 1 & 2 */
-	u32 scfr1;		/* System Clk Freq Reg 1 */
-	u32 scfr2;		/* System Clk Freq Reg 2 */
-	u32 reserved;
-	u32 bcr;		/* Bread Crumb Reg */
-	u32 pccr[12];		/* PSC Clk Ctrl Reg 0-11 */
-	u32 spccr;		/* SPDIF Clk Ctrl Reg */
-	u32 cccr;		/* CFM Clk Ctrl Reg */
-	u32 dccr;		/* DIU Clk Cnfg Reg */
-	u32 mccr[4];		/* MSCAN Clk Ctrl Reg 1-3 */
-};
-
-static struct of_device_id mpc512x_clock_ids[] = {
-	{ .compatible = "fsl,mpc5121-clock", },
-	{}
-};
-
-static u32 mpc512x_can_get_clock(struct platform_device *ofdev,
-				 const char *clock_name, int *mscan_clksrc)
-{
-	struct mpc512x_clockctl __iomem *clockctl;
-	struct device_node *np_clock;
-	struct clk *sys_clk, *ref_clk;
-	int plen, clockidx, clocksrc = -1;
-	u32 sys_freq, val, clockdiv = 1, freq = 0;
-	const u32 *pval;
-
-	np_clock = of_find_matching_node(NULL, mpc512x_clock_ids);
-	if (!np_clock) {
-		dev_err(&ofdev->dev, "couldn't find clock node\n");
-		return 0;
-	}
-	clockctl = of_iomap(np_clock, 0);
-	if (!clockctl) {
-		dev_err(&ofdev->dev, "couldn't map clock registers\n");
-		goto exit_put;
-	}
-
-	/* Determine the MSCAN device index from the peripheral's
-	 * physical address. Register address offsets against the
-	 * IMMR base are:  0x1300, 0x1380, 0x2300, 0x2380
-	 */
-	pval = of_get_property(ofdev->dev.of_node, "reg", &plen);
-	BUG_ON(!pval || plen < sizeof(*pval));
-	clockidx = (*pval & 0x80) ? 1 : 0;
-	if (*pval & 0x2000)
-		clockidx += 2;
-
-	/*
-	 * Clock source and divider selection: 3 different clock sources
-	 * can be selected: "ip", "ref" or "sys". For the latter two, a
-	 * clock divider can be defined as well. If the clock source is
-	 * not specified by the device tree, we first try to find an
-	 * optimal CAN source clock based on the system clock. If that
-	 * is not posslible, the reference clock will be used.
-	 */
-	if (clock_name && !strcmp(clock_name, "ip")) {
-		*mscan_clksrc = MSCAN_CLKSRC_IPS;
-		freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node);
-	} else {
-		*mscan_clksrc = MSCAN_CLKSRC_BUS;
-
-		pval = of_get_property(ofdev->dev.of_node,
-				       "fsl,mscan-clock-divider", &plen);
-		if (pval && plen == sizeof(*pval))
-			clockdiv = *pval;
-		if (!clockdiv)
-			clockdiv = 1;
-
-		if (!clock_name || !strcmp(clock_name, "sys")) {
-			sys_clk = devm_clk_get(&ofdev->dev, "sys_clk");
-			if (IS_ERR(sys_clk)) {
-				dev_err(&ofdev->dev, "couldn't get sys_clk\n");
-				goto exit_unmap;
-			}
-			/* Get and round up/down sys clock rate */
-			sys_freq = 1000000 *
-				((clk_get_rate(sys_clk) + 499999) / 1000000);
-
-			if (!clock_name) {
-				/* A multiple of 16 MHz would be optimal */
-				if ((sys_freq % 16000000) == 0) {
-					clocksrc = 0;
-					clockdiv = sys_freq / 16000000;
-					freq = sys_freq / clockdiv;
-				}
-			} else {
-				clocksrc = 0;
-				freq = sys_freq / clockdiv;
-			}
-		}
-
-		if (clocksrc < 0) {
-			ref_clk = devm_clk_get(&ofdev->dev, "ref_clk");
-			if (IS_ERR(ref_clk)) {
-				dev_err(&ofdev->dev, "couldn't get ref_clk\n");
-				goto exit_unmap;
-			}
-			clocksrc = 1;
-			freq = clk_get_rate(ref_clk) / clockdiv;
-		}
-	}
-
-	/* Disable clock */
-	out_be32(&clockctl->mccr[clockidx], 0x0);
-	if (clocksrc >= 0) {
-		/* Set source and divider */
-		val = (clocksrc << 14) | ((clockdiv - 1) << 17);
-		out_be32(&clockctl->mccr[clockidx], val);
-		/* Enable clock */
-		out_be32(&clockctl->mccr[clockidx], val | 0x10000);
-	}
-
-	/* Enable MSCAN clock domain */
-	val = in_be32(&clockctl->sccr[1]);
-	if (!(val & (1 << 25)))
-		out_be32(&clockctl->sccr[1], val | (1 << 25));
-
-	dev_dbg(&ofdev->dev, "using '%s' with frequency divider %d\n",
-		*mscan_clksrc == MSCAN_CLKSRC_IPS ? "ips_clk" :
-		clocksrc == 1 ? "ref_clk" : "sys_clk", clockdiv);
-
-exit_unmap:
-	iounmap(clockctl);
-exit_put:
-	of_node_put(np_clock);
-	return freq;
-}
-
-#define mpc512x_can_put_clock NULL
-
-#endif	/* COMMON_CLK */
-
 #else /* !CONFIG_PPC_MPC512x */
 static u32 mpc512x_can_get_clock(struct platform_device *ofdev,
 				 const char *clock_name, int *mscan_clksrc)
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v5 16/17] powerpc/mpc512x: improve DIU related clock setup
From: Gerhard Sittig @ 2013-11-17 23:06 UTC (permalink / raw)
  To: linuxppc-dev, linux-arm-kernel, Anatolij Gustschin,
	Mike Turquette
  Cc: Scott Wood, Gerhard Sittig, Detlev Zundel
In-Reply-To: <1384729577-7336-1-git-send-email-gsi@denx.de>

adapt the DIU clock initialization to the COMMON_CLK approach:
device tree based clock lookup, prepare and unprepare for clocks,
work with frequencies not dividers, call the appropriate clk_*()
routines and don't access CCM registers

the "best clock" determination now completely relies on the
platform's clock driver to pick a frequency close to what the
caller requests, and merely checks whether the desired frequency
was met (fits the tolerance of the monitor)

this approach shall succeed upon first try in the usual case,
will test a few less desirable yet acceptable frequencies in
edge cases, and will fallback to "best effort" if none of the
previously tried frequencies pass the test

provide a fallback clock lookup approach in case the OF based clock
lookup for the DIU fails, this allows for successful operation in
the presence of an outdated device tree which lacks clock specs

Cc: Anatolij Gustschin <agust@denx.de>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Gerhard Sittig <gsi@denx.de>
---
 arch/powerpc/platforms/512x/mpc512x_shared.c |  169 ++++++++++++++------------
 1 file changed, 92 insertions(+), 77 deletions(-)

diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index a82a41b4fd91..d8f172b710a8 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -12,6 +12,7 @@
  * (at your option) any later version.
  */
 
+#include <linux/clk.h>
 #include <linux/kernel.h>
 #include <linux/io.h>
 #include <linux/irq.h>
@@ -70,98 +71,112 @@ struct fsl_diu_shared_fb {
 	bool		in_use;
 };
 
-#define DIU_DIV_MASK	0x000000ff
+/* receives a pixel clock spec in pico seconds, adjusts the DIU clock rate */
 void mpc512x_set_pixel_clock(unsigned int pixclock)
 {
-	unsigned long bestval, bestfreq, speed, busfreq;
-	unsigned long minpixclock, maxpixclock, pixval;
-	struct mpc512x_ccm __iomem *ccm;
 	struct device_node *np;
-	u32 temp;
-	long err;
-	int i;
+	struct clk *clk_diu;
+	unsigned long epsilon, minpixclock, maxpixclock;
+	unsigned long offset, want, got, delta;
 
-	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-clock");
+	/* lookup and enable the DIU clock */
+	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-diu");
 	if (!np) {
-		pr_err("Can't find clock control module.\n");
+		pr_err("Could not find DIU device tree node.\n");
 		return;
 	}
-
-	ccm = of_iomap(np, 0);
+	clk_diu = of_clk_get(np, 0);
+	if (IS_ERR(clk_diu)) {
+		/* backwards compat with device trees that lack clock specs */
+		clk_diu = clk_get_sys(np->name, "ipg");
+	}
 	of_node_put(np);
-	if (!ccm) {
-		pr_err("Can't map clock control module reg.\n");
+	if (IS_ERR(clk_diu)) {
+		pr_err("Could not lookup DIU clock.\n");
 		return;
 	}
-
-	np = of_find_node_by_type(NULL, "cpu");
-	if (np) {
-		const unsigned int *prop =
-			of_get_property(np, "bus-frequency", NULL);
-
-		of_node_put(np);
-		if (prop) {
-			busfreq = *prop;
-		} else {
-			pr_err("Can't get bus-frequency property\n");
-			return;
-		}
-	} else {
-		pr_err("Can't find 'cpu' node.\n");
+	if (clk_prepare_enable(clk_diu)) {
+		pr_err("Could not enable DIU clock.\n");
 		return;
 	}
 
-	/* Pixel Clock configuration */
-	pr_debug("DIU: Bus Frequency = %lu\n", busfreq);
-	speed = busfreq * 4; /* DIU_DIV ratio is 4 * CSB_CLK / DIU_CLK */
-
-	/* Calculate the pixel clock with the smallest error */
-	/* calculate the following in steps to avoid overflow */
-	pr_debug("DIU pixclock in ps - %d\n", pixclock);
-	temp = (1000000000 / pixclock) * 1000;
-	pixclock = temp;
-	pr_debug("DIU pixclock freq - %u\n", pixclock);
-
-	temp = temp / 20; /* pixclock * 0.05 */
-	pr_debug("deviation = %d\n", temp);
-	minpixclock = pixclock - temp;
-	maxpixclock = pixclock + temp;
-	pr_debug("DIU minpixclock - %lu\n", minpixclock);
-	pr_debug("DIU maxpixclock - %lu\n", maxpixclock);
-	pixval = speed/pixclock;
-	pr_debug("DIU pixval = %lu\n", pixval);
-
-	err = LONG_MAX;
-	bestval = pixval;
-	pr_debug("DIU bestval = %lu\n", bestval);
-
-	bestfreq = 0;
-	for (i = -1; i <= 1; i++) {
-		temp = speed / (pixval+i);
-		pr_debug("DIU test pixval i=%d, pixval=%lu, temp freq. = %u\n",
-			i, pixval, temp);
-		if ((temp < minpixclock) || (temp > maxpixclock))
-			pr_debug("DIU exceeds monitor range (%lu to %lu)\n",
-				minpixclock, maxpixclock);
-		else if (abs(temp - pixclock) < err) {
-			pr_debug("Entered the else if block %d\n", i);
-			err = abs(temp - pixclock);
-			bestval = pixval + i;
-			bestfreq = temp;
-		}
+	/*
+	 * convert the picoseconds spec into the desired clock rate,
+	 * determine the acceptable clock range for the monitor (+/- 5%),
+	 * do the calculation in steps to avoid integer overflow
+	 */
+	pr_debug("DIU pixclock in ps - %u\n", pixclock);
+	pixclock = (1000000000 / pixclock) * 1000;
+	pr_debug("DIU pixclock freq  - %u\n", pixclock);
+	epsilon = pixclock / 20; /* pixclock * 0.05 */
+	pr_debug("DIU deviation      - %lu\n", epsilon);
+	minpixclock = pixclock - epsilon;
+	maxpixclock = pixclock + epsilon;
+	pr_debug("DIU minpixclock    - %lu\n", minpixclock);
+	pr_debug("DIU maxpixclock    - %lu\n", maxpixclock);
+
+	/*
+	 * check whether the DIU supports the desired pixel clock
+	 *
+	 * - simply request the desired clock and see what the
+	 *   platform's clock driver will make of it, assuming that it
+	 *   will setup the best approximation of the requested value
+	 * - try other candidate frequencies in the order of decreasing
+	 *   preference (i.e. with increasing distance from the desired
+	 *   pixel clock, and checking the lower frequency before the
+	 *   higher frequency to not overload the hardware) until the
+	 *   first match is found -- any potential subsequent match
+	 *   would only be as good as the former match or typically
+	 *   would be less preferable
+	 *
+	 * the offset increment of pixelclock divided by 64 is an
+	 * arbitrary choice -- it's simple to calculate, in the typical
+	 * case we expect the first check to succeed already, in the
+	 * worst case seven frequencies get tested (the exact center and
+	 * three more values each to the left and to the right) before
+	 * the 5% tolerance window is exceeded, resulting in fast enough
+	 * execution yet high enough probability of finding a suitable
+	 * value, while the error rate will be in the order of single
+	 * percents
+	 */
+	for (offset = 0; offset <= epsilon; offset += pixclock / 64) {
+		want = pixclock - offset;
+		pr_debug("DIU checking clock - %lu\n", want);
+		clk_set_rate(clk_diu, want);
+		got = clk_get_rate(clk_diu);
+		delta = abs(pixclock - got);
+		if (delta < epsilon)
+			break;
+		if (!offset)
+			continue;
+		want = pixclock + offset;
+		pr_debug("DIU checking clock - %lu\n", want);
+		clk_set_rate(clk_diu, want);
+		got = clk_get_rate(clk_diu);
+		delta = abs(pixclock - got);
+		if (delta < epsilon)
+			break;
 	}
+	if (offset <= epsilon) {
+		pr_debug("DIU clock accepted - %lu\n", want);
+		pr_debug("DIU pixclock want %u, got %lu, delta %lu, eps %lu\n",
+			 pixclock, got, delta, epsilon);
+		return;
+	}
+	pr_warn("DIU pixclock auto search unsuccessful\n");
 
-	pr_debug("DIU chose = %lx\n", bestval);
-	pr_debug("DIU error = %ld\n NomPixClk ", err);
-	pr_debug("DIU: Best Freq = %lx\n", bestfreq);
-	/* Modify DIU_DIV in CCM SCFR1 */
-	temp = in_be32(&ccm->scfr1);
-	pr_debug("DIU: Current value of SCFR1: 0x%08x\n", temp);
-	temp &= ~DIU_DIV_MASK;
-	temp |= (bestval & DIU_DIV_MASK);
-	out_be32(&ccm->scfr1, temp);
-	pr_debug("DIU: Modified value of SCFR1: 0x%08x\n", temp);
-	iounmap(ccm);
+	/*
+	 * what is the most appropriate action to take when the search
+	 * for an available pixel clock which is acceptable to the
+	 * monitor has failed?  disable the DIU (clock) or just provide
+	 * a "best effort"?  we go with the latter
+	 */
+	pr_warn("DIU pixclock best effort fallback (backend's choice)\n");
+	clk_set_rate(clk_diu, pixclock);
+	got = clk_get_rate(clk_diu);
+	delta = abs(pixclock - got);
+	pr_debug("DIU pixclock want %u, got %lu, delta %lu, eps %lu\n",
+		 pixclock, got, delta, epsilon);
 }
 
 enum fsl_diu_monitor_port
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH v5 17/17] clk: mpc512x: remove migration support workarounds
From: Gerhard Sittig @ 2013-11-17 23:06 UTC (permalink / raw)
  To: linuxppc-dev, linux-arm-kernel, Anatolij Gustschin,
	Mike Turquette
  Cc: Scott Wood, Gerhard Sittig, Detlev Zundel
In-Reply-To: <1384729577-7336-1-git-send-email-gsi@denx.de>

this change removes workarounds which have become obsolete after
migration to common clock support has completed
- remove clkdev registration calls (compatibility clock item aliases)
  after all peripheral drivers were adjusted for device tree based
  clock lookup
- remove pre-enable workarounds after all peripheral drivers were
  adjusted to acquire their respective clock items

workarounds for these clock items get removed:  FEC (ethernet), I2C,
PSC (UART, SPI), PSC FIFO, USB, NFC (NAND flash), VIU (video capture),
BDLC (CAN), CAN MCLK, DIU (video output)

these clkdev registered names won't be provided any longer by the
MPC512x platform's clock driver:  "psc%d_mclk", "mscan%d_mclk",
"usb%d_clk", "nfc_clk", "viu_clk", "sys_clk", "ref_clk"

the pre-enable workaround for PCI remains, but depends on the presence
of PCI related device tree nodes (disables the PCI clock in the absence
of PCI nodes, keeps the PCI clock enabled in the presence of nodes) --
moving clock acquisition into the peripheral driver isn't possible for
PCI because its initialization takes place before the platform clock
driver gets initialized, thus the clock provider isn't available then

Cc: Mike Turquette <mturquette@linaro.org>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Gerhard Sittig <gsi@denx.de>
---
 arch/powerpc/platforms/512x/clock-commonclk.c |   50 ++++++++-----------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c
index c1faf3a9ea1f..b1fbdcbc68b7 100644
--- a/arch/powerpc/platforms/512x/clock-commonclk.c
+++ b/arch/powerpc/platforms/512x/clock-commonclk.c
@@ -695,46 +695,28 @@ static void mpc5121_clk_register_of_provider(struct device_node *np)
  */
 static void mpc5121_clk_provide_migration_support(void)
 {
-	int idx;
-	char name[32];
-
-	/*
-	 * provide "pre-CCF" alias clock names for peripheral drivers
-	 * which have not yet been adjusted to do OF based clock lookups
-	 */
-	clk_register_clkdev(clks[MPC512x_CLK_REF], "ref_clk", NULL);
-	clk_register_clkdev(clks[MPC512x_CLK_SYS], "sys_clk", NULL);
-	clk_register_clkdev(clks[MPC512x_CLK_VIU], "viu_clk", NULL);
-	clk_register_clkdev(clks[MPC512x_CLK_NFC], "nfc_clk", NULL);
-	clk_register_clkdev(clks[MPC512x_CLK_USB1], "usb1_clk", NULL);
-	clk_register_clkdev(clks[MPC512x_CLK_USB2], "usb2_clk", NULL);
-	for (idx = 0; idx < NR_PSCS; idx++) {
-		snprintf(name, sizeof(name), "psc%d_mclk", idx);
-		clk_register_clkdev(clks[MPC512x_CLK_PSC0_MCLK + idx],
-				    name, NULL);
-	}
-	for (idx = 0; idx < NR_MSCANS; idx++) {
-		snprintf(name, sizeof(name), "mscan%d_mclk", idx);
-		clk_register_clkdev(clks[MPC512x_CLK_MSCAN0_MCLK + idx],
-				    name, NULL);
-	}
-	clk_register_clkdev(clks[MPC512x_CLK_SPDIF_MCLK], "spdif_mclk", NULL);
 
 	/*
 	 * pre-enable those clock items which are not yet appropriately
 	 * acquired by their peripheral driver
+	 *
+	 * the PCI clock cannot get acquired by its peripheral driver,
+	 * because for this platform the driver won't probe(), instead
+	 * initialization is done from within the .setup_arch() routine
+	 * at a point in time where the clock provider has not been
+	 * setup yet and thus isn't available yet
+	 *
+	 * so we "pre-enable" the clock here, to not have the clock
+	 * subsystem automatically disable this item in a late init call
+	 *
+	 * this PCI clock pre-enable workaround only applies when there
+	 * are device tree nodes for PCI and thus the peripheral driver
+	 * has attached to bridges, otherwise the PCI clock remains
+	 * unused and so it gets disabled
 	 */
-	clk_prepare_enable(clks[MPC512x_CLK_PSC_FIFO]);
 	clk_prepare_enable(clks[MPC512x_CLK_PSC3_MCLK]);/* serial console */
-	clk_prepare_enable(clks[MPC512x_CLK_FEC]);	/* network, NFS */
-	clk_prepare_enable(clks[MPC512x_CLK_DIU]);	/* display */
-	clk_prepare_enable(clks[MPC512x_CLK_I2C]);	/* I2C */
-	for (idx = 0; idx < NR_PSCS; idx++)		/* PSC ipg */
-		clk_prepare_enable(clks[MPC512x_CLK_PSC0 + idx]);
-	clk_prepare_enable(clks[MPC512x_CLK_BDLC]);	/* MSCAN ipg */
-	for (idx = 0; idx < NR_MSCANS; idx++)		/* MSCAN mclk */
-		clk_prepare_enable(clks[MPC512x_CLK_MSCAN0_MCLK + idx]);
-	clk_prepare_enable(clks[MPC512x_CLK_PCI]);	/* PCI */
+	if (of_find_compatible_node(NULL, "pci", "fsl,mpc5121-pci"))
+		clk_prepare_enable(clks[MPC512x_CLK_PCI]);
 }
 
 /*
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH v2] powerpc: kvm: optimize "sc 1" as fast return
From: liu ping fan @ 2013-11-18  1:06 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Alexander Graf, kvm-ppc
In-Reply-To: <20131116070056.GB18339@iris.ozlabs.ibm.com>

On Sat, Nov 16, 2013 at 3:00 PM, Paul Mackerras <paulus@samba.org> wrote:
> On Fri, Nov 15, 2013 at 04:35:01PM +0800, Liu Ping Fan wrote:
>>
>> +sc_1_fast_return:
>> +     mtspr   SPRN_SRR0,r10
>> +     mtspr   SPRN_SRR1,r11
>> +     li      r10, BOOK3S_INTERRUPT_SYSCALL
>> +     li      r11, (MSR_ME << 1) | 1  /* synthesize MSR_SF | MSR_ME */
>> +     rotldi  r11, r11, 63
>
> You need a "mr r4, r9" instruction here, because fast_guest_return
> needs the vcpu pointer in r4.  Apart from that this looks fine.
>
Will fix it.

Thanks and regards,
Pingfan
>> +     b       fast_guest_return
>
> Paul.

^ permalink raw reply

* [PATCH v3] powerpc: kvm: optimize "sc 1" as fast return
From: Liu Ping Fan @ 2013-11-18  1:09 UTC (permalink / raw)
  To: linuxppc-dev, kvm-ppc; +Cc: Paul Mackerras, Alexander Graf

In some scenarios, e.g. OpenStack CI, a PR guest can trigger "sc 1" frequently.
This patch optimizes the path by directly delivering BOOK3S_INTERRUPT_SYSCALL
to the HV guest, so powernv can return to the HV guest without a heavy exit,
i.e. there is no need to swap TLB, HTAB, etc.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_hv.c            |  6 ------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++++++++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 62a2b5a..73dc852 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -628,12 +628,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* hcall - punt to userspace */
 		int i;
 
-		if (vcpu->arch.shregs.msr & MSR_PR) {
-			/* sc 1 from userspace - reflect to guest syscall */
-			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
-			r = RESUME_GUEST;
-			break;
-		}
 		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
 		for (i = 0; i < 9; ++i)
 			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c71103b..0d1e2c2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1388,7 +1388,8 @@ kvmppc_hisi:
 hcall_try_real_mode:
 	ld	r3,VCPU_GPR(R3)(r9)
 	andi.	r0,r11,MSR_PR
-	bne	guest_exit_cont
+	/* sc 1 from userspace - reflect to guest syscall */
+	bne	sc_1_fast_return
 	clrrdi	r3,r3,2
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
 	bge	guest_exit_cont
@@ -1409,6 +1410,15 @@ hcall_try_real_mode:
 	ld	r11,VCPU_MSR(r4)
 	b	fast_guest_return
 
+sc_1_fast_return:
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10, BOOK3S_INTERRUPT_SYSCALL
+	li	r11, (MSR_ME << 1) | 1  /* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+	mr	r4,r9
+	b	fast_guest_return
+
 	/* We've attempted a real mode hcall, but it's punted it back
 	 * to userspace.  We need to restore some clobbered volatiles
 	 * before resuming the pass-it-to-qemu path */
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH] powerpc: Only print PACATMSCRATCH in oops when TM is active
From: Anton Blanchard @ 2013-11-18  2:19 UTC (permalink / raw)
  To: benh, paulus, mikey; +Cc: linuxppc-dev


If TM is not active there is no need to print PACATMSCRATCH
so we can save ourselves a line.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: b/arch/powerpc/kernel/process.c
===================================================================
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -871,7 +871,8 @@ void show_regs(struct pt_regs * regs)
 	printk("SOFTE: %ld ", regs->softe);
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
+	if (MSR_TM_ACTIVE(regs->msr))
+		printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
 #endif
 
 	for (i = 0;  i < 32;  i++) {

^ permalink raw reply

* Re: [PATCH] powerpc: Only print PACATMSCRATCH in oops when TM is active
From: Michael Neuling @ 2013-11-18  2:38 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: paulus, linuxppc-dev
In-Reply-To: <20131118131917.6718ff5b@kryten>

Anton Blanchard <anton@samba.org> wrote:

> 
> If TM is not active there is no need to print PACATMSCRATCH
> so we can save ourselves a line.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>

Acked-by: Michael Neuling <mikey@neuling.org>

> ---
> 
> Index: b/arch/powerpc/kernel/process.c
> ===================================================================
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -871,7 +871,8 @@ void show_regs(struct pt_regs * regs)
>  	printk("SOFTE: %ld ", regs->softe);
>  #endif
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> -	printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
> +	if (MSR_TM_ACTIVE(regs->msr))
> +		printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
>  #endif
>  
>  	for (i = 0;  i < 32;  i++) {
> 

^ permalink raw reply

* [PATCH] powerpc: ppc64 address space capped at 32TB, mmap randomisation disabled
From: Anton Blanchard @ 2013-11-18  3:55 UTC (permalink / raw)
  To: benh, paulus, walken, aneesh.kumar; +Cc: linuxppc-dev


Commit fba2369e6ceb (mm: use vm_unmapped_area() on powerpc architecture)
has a bug in slice_scan_available() where we compare an unsigned long
(high_slices) against a shifted int. As a result, comparisons against
the top 32 bits of high_slices (representing the top 32TB) always
return 0 and the top of our mmap region is clamped at 32TB.

This also breaks mmap randomisation since the randomised address is
always up near the top of the address space and it gets clamped down
to 32TB.

Cc: stable@vger.kernel.org # v3.10+
Signed-off-by: Anton Blanchard <anton@samba.org>
---

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 3e99c14..7ce9cf3 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -258,7 +258,7 @@ static bool slice_scan_available(unsigned long addr,
 		slice = GET_HIGH_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) ?
 			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-		return !!(available.high_slices & (1u << slice));
+		return !!(available.high_slices & (1ul << slice));
 	}
 }
 

^ permalink raw reply related

* Re: [PATCH] powerpc: ppc64 address space capped at 32TB, mmap randomisation disabled
From: Michel Lespinasse @ 2013-11-18  3:57 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev, paulus, aneesh.kumar
In-Reply-To: <20131118145528.2329a1ab@kryten>

On Sun, Nov 17, 2013 at 7:55 PM, Anton Blanchard <anton@samba.org> wrote:
>
> Commit fba2369e6ceb (mm: use vm_unmapped_area() on powerpc architecture)
> has a bug in slice_scan_available() where we compare an unsigned long
> (high_slices) against a shifted int. As a result, comparisons against
> the top 32 bits of high_slices (representing the top 32TB) always
> returns 0 and the top of our mmap region is clamped at 32TB
>
> This also breaks mmap randomisation since the randomised address is
> always up near the top of the address space and it gets clamped down
> to 32TB.
>
> Cc: stable@vger.kernel.org # v3.10+
> Signed-off-by: Anton Blanchard <anton@samba.org>
> ---
>
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 3e99c14..7ce9cf3 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -258,7 +258,7 @@ static bool slice_scan_available(unsigned long addr,
>                 slice = GET_HIGH_SLICE_INDEX(addr);
>                 *boundary_addr = (slice + end) ?
>                         ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
> -               return !!(available.high_slices & (1u << slice));
> +               return !!(available.high_slices & (1ul << slice));
>         }
>  }
>

Good catch, sorry about that...

Acked-by: Michel Lespinasse <walken@google.com>

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply

* [PATCH v2] powerpc/powernv: infrastructure to read opal messages in generic format.
From: Mahesh J Salgaonkar @ 2013-11-18  4:32 UTC (permalink / raw)
  To: linuxppc-dev, Benjamin Herrenschmidt

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Opal now has a new messaging infrastructure to push messages to
Linux in a generic format for different types of messages using only one
event bit. The format of the opal message is as follows:

struct opal_msg {
        uint32_t msg_type;
	uint32_t reserved;
	uint64_t params[8];
};

This patch allows clients to subscribe to notifications for a specific
message type. It is up to the subscriber that registered interest in a
given message type to decipher the messages of that type.

The interface to subscribe for notification is:

	int opal_message_notifier_register(enum OpalMessageType msg_type,
                                        struct notifier_block *nb)


The notifier will fetch the opal message when available and notify the
subscriber with the message type and the opal message. It is the subscriber's
responsibility to copy the message data before returning from the notifier
callback.

I will post a separate patch series for fsp memory handling which uses
this new messaging channel to pull fsp memory errors.

Changes in v2:
- Fixed opal token numbers to match the latest changes in opal.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/opal.h                |   23 ++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S |    2 +
 arch/powerpc/platforms/powernv/opal.c          |   90 ++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index c5cd728..926bb6a 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -129,6 +129,8 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_LPC_READ				67
 #define OPAL_LPC_WRITE				68
 #define OPAL_RETURN_CPU				69
+#define OPAL_GET_MSG				85
+#define OPAL_CHECK_ASYNC_COMPLETION		86
 
 #ifndef __ASSEMBLY__
 
@@ -208,7 +210,16 @@ enum OpalPendingState {
 	OPAL_EVENT_ERROR_LOG		= 0x40,
 	OPAL_EVENT_EPOW			= 0x80,
 	OPAL_EVENT_LED_STATUS		= 0x100,
-	OPAL_EVENT_PCI_ERROR		= 0x200
+	OPAL_EVENT_PCI_ERROR		= 0x200,
+	OPAL_EVENT_PENDING_MSGS		= 0x400,
+};
+
+enum OpalMessageType {
+	OPAL_MSG_ASYNC_COMP		= 0,
+	OPAL_MSG_MEM_ERR,
+	OPAL_MSG_EPOW,
+	OPAL_MSG_SHUTDOWN,
+	OPAL_MSG_TYPE_MAX,
 };
 
 /* Machine check related definitions */
@@ -353,6 +364,12 @@ enum OpalLPCAddressType {
 	OPAL_LPC_FW	= 2,
 };
 
+struct opal_msg {
+	uint32_t msg_type;
+	uint32_t reserved;
+	uint64_t params[8];
+};
+
 struct opal_machine_check_event {
 	enum OpalMCE_Version	version:8;	/* 0x00 */
 	uint8_t			in_use;		/* 0x01 */
@@ -656,6 +673,8 @@ int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		       uint32_t addr, uint32_t data, uint32_t sz);
 int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		      uint32_t addr, uint32_t *data, uint32_t sz);
+int64_t opal_get_msg(uint64_t buffer, size_t size);
+int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
@@ -670,6 +689,8 @@ extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
 
 extern int opal_notifier_register(struct notifier_block *nb);
+extern int opal_message_notifier_register(enum OpalMessageType msg_type,
+						struct notifier_block *nb);
 extern void opal_notifier_enable(void);
 extern void opal_notifier_disable(void);
 extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8f38445..4c2e19a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -116,3 +116,5 @@ OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
 OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
 OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
 OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
+OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2911abe..d3759f7 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -33,6 +33,7 @@ extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
 static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static DEFINE_SPINLOCK(opal_notifier_lock);
 static uint64_t last_notified_mask = 0x0ul;
 static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
@@ -162,6 +163,95 @@ void opal_notifier_disable(void)
 	atomic_set(&opal_notifier_hold, 1);
 }
 
+/*
+ * Opal message notifier based on message type. Allow subscribers to get
+ * notified for specific messgae type.
+ */
+int opal_message_notifier_register(enum OpalMessageType msg_type,
+					struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+	if (msg_type > OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Invalid message type argument (%d)\n",
+			   __func__, msg_type);
+		return -EINVAL;
+	}
+	return atomic_notifier_chain_register(
+				&opal_msg_notifier_head[msg_type], nb);
+}
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+	/* notify subscribers */
+	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type, msg);
+}
+
+static void opal_message_handle_event(void)
+{
+	s64 ret;
+	/*
+	 * TODO: pre-allocate a message buffer depending on opal-msg-size
+	 * value in /proc/device-tree.
+	 */
+	static struct opal_msg msg;
+
+	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	/* No opal message pending. */
+	if (ret == OPAL_RESOURCE)
+		return;
+
+	/* check for errors. */
+	if (ret) {
+		pr_warning("%s: Failed to retrive opal message, err=%lld\n",
+				__func__, ret);
+		return;
+	}
+
+	/* Sanity check */
+	if (msg.msg_type > OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Unknown message type: %u\n",
+				__func__, msg.msg_type);
+		return;
+	}
+	opal_message_do_notify(msg.msg_type, (void *)&msg);
+}
+
+static int opal_message_notify(struct notifier_block *nb,
+			  unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_PENDING_MSGS)
+		opal_message_handle_event();
+	return 0;
+}
+
+static struct notifier_block opal_message_nb = {
+	.notifier_call	= opal_message_notify,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_message_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+	ret = opal_notifier_register(&opal_message_nb);
+	if (ret) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		       __func__, ret);
+		return ret;
+	}
+	return 0;
+}
+early_initcall(opal_message_init);
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 len, rc;

^ permalink raw reply related

* [PATCH] powerpc: Don't use ELFv2 ABI to build the kernel
From: Alistair Popple @ 2013-11-18  6:21 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alistair Popple

The kernel doesn't build correctly using the ELFv2 ABI.  This patch
ensures that the ELFv1 ABI is used when building a kernel with an
ELFv2 enabled compiler.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
---
 arch/powerpc/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 607acf5..8a24636 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -111,6 +111,7 @@ endif
 endif
 
 CFLAGS-$(CONFIG_PPC64)	:= -mtraceback=no -mcall-aixdesc
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv1)
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcmodel=medium,-mminimal-toc)
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC32)	:= -ffixed-r2 $(MULTIPLEWORD)
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH V3] powerpc: Add a vga alias node for P1022
From: Jason Jin @ 2013-11-18  6:48 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev

In U-Boot, when the video device is set as the console, the name 'vga'
is used as a general name for the video device. During the
fdt_fixup_stdout process, the 'vga' name is used to search the dtb in
order to set up the 'linux,stdout-path' node. Though the P1022 DIU is
not a VGA-compatible device, the vga alias node is added for P1022 in
this patch to match the 'vga' name used in U-Boot. At the same time, a
display alias is also added so that no other components grow
dependencies on the vga alias node.

Signed-off-by: Jason Jin <Jason.Jin@freescale.com>
---
V2: Update the description and also add a display alias.
V3: Add a code comment for the inaccurate vga name. Thanks to Scott for the input.

 arch/powerpc/boot/dts/fsl/p1022si-post.dtsi | 2 +-
 arch/powerpc/boot/dts/fsl/p1022si-pre.dtsi  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi
index e179803..be49300 100644
--- a/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi
@@ -174,7 +174,7 @@
 
 /include/ "pq3-gpio-0.dtsi"
 
-	display@10000 {
+	display: display@10000 {
 		compatible = "fsl,diu", "fsl,p1022-diu";
 		reg = <0x10000 1000>;
 		interrupts = <64 2 0 0>;
diff --git a/arch/powerpc/boot/dts/fsl/p1022si-pre.dtsi b/arch/powerpc/boot/dts/fsl/p1022si-pre.dtsi
index 1956dea..362ac51 100644
--- a/arch/powerpc/boot/dts/fsl/p1022si-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p1022si-pre.dtsi
@@ -50,6 +50,9 @@
 		pci0 = &pci0;
 		pci1 = &pci1;
 		pci2 = &pci2;
+		/*inaccurate vga name for U-Boot compatibility*/
+		vga = &display;
+		display = &display;
 	};
 
 	cpus {
-- 
1.8.0

^ permalink raw reply related

* [PATCH] powerpc: booke: Fix build failures
From: Aneesh Kumar K.V @ 2013-11-18  9:20 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

arch/powerpc/platforms/wsp/wsp.c: In function ‘wsp_probe_devices’:
arch/powerpc/platforms/wsp/wsp.c:76:3: error: implicit declaration of function ‘of_address_to_resource’ [-Werror=implicit-function-declaration]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/wsp/chroma.c   | 1 +
 arch/powerpc/platforms/wsp/h8.c       | 1 +
 arch/powerpc/platforms/wsp/ics.c      | 2 ++
 arch/powerpc/platforms/wsp/opb_pic.c  | 2 ++
 arch/powerpc/platforms/wsp/psr2.c     | 1 +
 arch/powerpc/platforms/wsp/scom_wsp.c | 1 +
 arch/powerpc/platforms/wsp/wsp.c      | 1 +
 7 files changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/wsp/chroma.c b/arch/powerpc/platforms/wsp/chroma.c
index 8ef53bc2e70e..aaa46b353715 100644
--- a/arch/powerpc/platforms/wsp/chroma.c
+++ b/arch/powerpc/platforms/wsp/chroma.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/wsp/h8.c b/arch/powerpc/platforms/wsp/h8.c
index d18e6cc19df3..a3c87f395750 100644
--- a/arch/powerpc/platforms/wsp/h8.c
+++ b/arch/powerpc/platforms/wsp/h8.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/io.h>
+#include <linux/of_address.h>
 
 #include "wsp.h"
 
diff --git a/arch/powerpc/platforms/wsp/ics.c b/arch/powerpc/platforms/wsp/ics.c
index 2d3b1dd9571d..9cd92e645028 100644
--- a/arch/powerpc/platforms/wsp/ics.c
+++ b/arch/powerpc/platforms/wsp/ics.c
@@ -18,6 +18,8 @@
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c
index cb565bf93650..3f6729807938 100644
--- a/arch/powerpc/platforms/wsp/opb_pic.c
+++ b/arch/powerpc/platforms/wsp/opb_pic.c
@@ -15,6 +15,8 @@
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/reg_a2.h>
 #include <asm/irq.h>
diff --git a/arch/powerpc/platforms/wsp/psr2.c b/arch/powerpc/platforms/wsp/psr2.c
index 508ec8282b96..a87b414c766a 100644
--- a/arch/powerpc/platforms/wsp/psr2.c
+++ b/arch/powerpc/platforms/wsp/psr2.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/wsp/scom_wsp.c b/arch/powerpc/platforms/wsp/scom_wsp.c
index 8928507affea..6538b4de34fc 100644
--- a/arch/powerpc/platforms/wsp/scom_wsp.c
+++ b/arch/powerpc/platforms/wsp/scom_wsp.c
@@ -14,6 +14,7 @@
 #include <linux/of.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
 
 #include <asm/cputhreads.h>
 #include <asm/reg_a2.h>
diff --git a/arch/powerpc/platforms/wsp/wsp.c b/arch/powerpc/platforms/wsp/wsp.c
index ddb6efe88914..58cd1f00e1ef 100644
--- a/arch/powerpc/platforms/wsp/wsp.c
+++ b/arch/powerpc/platforms/wsp/wsp.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
 
 #include <asm/scom.h>
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH -V2 1/5] powerpc: Use HPTE constants when updating hpte bits
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1384766893-10189-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

Even though we have the same value for the Linux PTE bits and the hash
PTE bits, use the hash PTE bits when updating the hash PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/cell/beat_htab.c | 4 ++--
 arch/powerpc/platforms/pseries/lpar.c   | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index c34ee4e60873..d4d245c0d787 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -111,7 +111,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group,
 		DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
 
 	if (rflags & _PAGE_NO_CACHE)
-		hpte_r &= ~_PAGE_COHERENT;
+		hpte_r &= ~HPTE_R_M;
 
 	raw_spin_lock(&beat_htab_lock);
 	lpar_rc = beat_read_mask(hpte_group);
@@ -337,7 +337,7 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
 		DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
 
 	if (rflags & _PAGE_NO_CACHE)
-		hpte_r &= ~_PAGE_COHERENT;
+		hpte_r &= ~HPTE_R_M;
 
 	/* insert into not-volted entry */
 	lpar_rc = beat_insert_htab_entry3(0, hpte_group, hpte_v, hpte_r,
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 356bc75ca74f..c8fbef238d4b 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -153,7 +153,8 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 
 	/* Make pHyp happy */
 	if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
-		hpte_r &= ~_PAGE_COHERENT;
+		hpte_r &= ~HPTE_R_M;
+
 	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
 		flags |= H_COALESCE_CAND;
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH -V2 4/5] powerpc: mm: Only check for _PAGE_PRESENT in set_pte/pmd functions
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1384766893-10189-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We want to make sure we don't use these functions when updating a pte
or pmd entry that has a valid hpte entry, because these functions
don't invalidate it. So limit the check to the _PAGE_PRESENT bit.
Numafault core changes use these functions for updating _PAGE_NUMA bits.
That should be ok because when _PAGE_NUMA is set we can be sure that
hpte entries are not present.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable.c    | 2 +-
 arch/powerpc/mm/pgtable_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 841e0d00863c..ad90429bbd8b 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -174,7 +174,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 		pte_t pte)
 {
 #ifdef CONFIG_DEBUG_VM
-	WARN_ON(pte_present(*ptep));
+	WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
 #endif
 	/* Note: mm->context.id might not yet have been assigned as
 	 * this context might not have been activated yet when this
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9d95786aa80f..02e8681fb865 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		pmd_t *pmdp, pmd_t pmd)
 {
 #ifdef CONFIG_DEBUG_VM
-	WARN_ON(!pmd_none(*pmdp));
+	WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
 	assert_spin_locked(&mm->page_table_lock);
 	WARN_ON(!pmd_trans_huge(pmd));
 #endif
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH -V2 3/5] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1384766893-10189-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE.
On archs like ppc64 that don't use _PAGE_PROTNONE and also have
a separate page table outside the Linux page table, we just need to
make sure that when calling change_prot_numa we flush the
hardware page table entry so that the next page access results in a
NUMA fault.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 include/linux/mm.h | 3 ---
 mm/mempolicy.c     | 9 ---------
 2 files changed, 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0548eb201e05..51794c1a1d7e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1851,11 +1851,8 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 #endif
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
-#endif
-
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c4403cdf3433..cae10af4fdc4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,6 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 	return 0;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +626,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long addr, unsigned long end)
 {
 	int nr_updated;
-	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 	if (nr_updated)
@@ -635,13 +633,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 
 	return nr_updated;
 }
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end)
-{
-	return 0;
-}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 
 /*
  * Walk through page tables and collect pages to be migrated.
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH -V2 2/5] powerpc: Free up _PAGE_COHERENCE for numa fault use later
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1384766893-10189-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

Always set memory coherence on hash64 configs. If a platform
cannot always have memory coherence set, it can infer that
from _PAGE_NO_CACHE and _PAGE_WRITETHRU, as is done in lpar.
So we don't really need a separate bit for tracking
_PAGE_COHERENT.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pte-hash64.h |  2 +-
 arch/powerpc/mm/hash_low_64.S         | 15 ++++++++++++---
 arch/powerpc/mm/hash_utils_64.c       |  7 ++++---
 arch/powerpc/mm/hugepage-hash64.c     |  6 +++++-
 arch/powerpc/mm/hugetlbpage-hash64.c  |  4 ++++
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 0419eeb53274..55aea0caf95e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -19,7 +19,7 @@
 #define _PAGE_FILE		0x0002 /* (!present only) software: pte holds file offset */
 #define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED		0x0008
-#define _PAGE_COHERENT		0x0010 /* M: enforce memory coherence (SMP systems) */
+/* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_NO_CACHE		0x0020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU		0x0040 /* W: cache write-through */
 #define _PAGE_DIRTY		0x0080 /* C: page changed */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d3cbda62857b..1136d26a95ae 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r3,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
@@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6176b3cdf579..de6881259aef 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags)
 	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
 					 (pteflags & _PAGE_DIRTY)))
 		rflags |= 1;
-
-	/* Always add C */
-	return rflags | HPTE_R_C;
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	return rflags | HPTE_R_C | HPTE_R_M;
 }
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 34de9e0cdc34..826893fcb3a7 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -127,7 +127,11 @@ repeat:
 
 		/* Add in WIMG bits */
 		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
+				      _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0b7fb6761015..a5bcf9301196 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -99,6 +99,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		/* Add in WIMG bits */
 		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 				      _PAGE_COHERENT | _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
 
 		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
 					     mmu_psize, ssize);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev

Hi,

This patch series adds support for NUMA faults on the ppc64 architecture. We steal the
_PAGE_COHERENT bit and use it for indicating _PAGE_NUMA. We clear the _PAGE_PRESENT bit
and also invalidate the hpte entry when setting _PAGE_NUMA. The next fault on that
page will be considered a NUMA fault.

Changes from V1:
* Dropped a few patches related to pmd updates because batch handling of pmd pages got dropped from core code
   0f19c17929c952c6f0966d93ab05558e7bf814cc "mm: numa: Do not batch handle PMD pages"
   This also avoided the large lock contention on page_table_lock that we observed with the previous series.

 -aneesh
 

^ permalink raw reply

* [PATCH -V2 5/5] powerpc: mm: book3s: Enable _PAGE_NUMA for book3s
From: Aneesh Kumar K.V @ 2013-11-18  9:28 UTC (permalink / raw)
  To: benh, paulus, linux-mm; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1384766893-10189-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We steal the _PAGE_COHERENT bit and use that for indicating NUMA ptes.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable.h     | 66 +++++++++++++++++++++++++++++++++-
 arch/powerpc/include/asm/pte-hash64.h  |  6 ++++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7d6eacf249cf..b999ca318985 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #ifndef __ASSEMBLY__
+#include <linux/mmdebug.h>
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
-static inline int pte_present(pte_t pte)	{ return pte_val(pte) & _PAGE_PRESENT; }
 static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
+#ifdef CONFIG_NUMA_BALANCING
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+}
+
+#define pte_numa pte_numa
+static inline int pte_numa(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+
+#define pte_mknonnuma pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	pte_val(pte) &= ~_PAGE_NUMA;
+	pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
+	return pte;
+}
+
+#define pte_mknuma pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	/*
+	 * We should not set _PAGE_NUMA on non present ptes. Also clear the
+	 * present bit so that hash_page will return 1 and we collect this
+	 * as numa fault.
+	 */
+	if (pte_present(pte)) {
+		pte_val(pte) |= _PAGE_NUMA;
+		pte_val(pte) &= ~_PAGE_PRESENT;
+	} else
+		VM_BUG_ON(1);
+	return pte;
+}
+
+#define pmd_numa pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+	return pte_numa(pmd_pte(pmd));
+}
+
+#define pmd_mknonnuma pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+}
+
+#define pmd_mknuma pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+}
+
+# else
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 55aea0caf95e..2505d8eab15c 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,6 +27,12 @@
 #define _PAGE_RW		0x0200 /* software: user write access allowed */
 #define _PAGE_BUSY		0x0800 /* software: PTE & hash are busy */
 
+/*
+ * Used for tracking numa faults
+ */
+#define _PAGE_NUMA	0x00000010 /* Gather numa placement stats */
+
+
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO		 _PAGE_KERNEL_RW
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index c2a566fb8bb8..2048655d8ec4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,6 +72,7 @@ config PPC_BOOK3S_64
 	select PPC_HAVE_PMU_SUPPORT
 	select SYS_SUPPORTS_HUGETLBFS
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+	select ARCH_SUPPORTS_NUMA_BALANCING
 
 config PPC_BOOK3E_64
 	bool "Embedded processors"
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH v3] powerpc/powernv: infrastructure to read opal messages in generic format.
From: Mahesh J Salgaonkar @ 2013-11-18 10:05 UTC (permalink / raw)
  To: linuxppc-dev, Benjamin Herrenschmidt

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

OPAL now has a new messaging infrastructure to push messages to
Linux in a generic format for different types of messages using only one
event bit. The format of the OPAL message is as below:

struct opal_msg {
        uint32_t msg_type;
	uint32_t reserved;
	uint64_t params[8];
};

This patch allows clients to subscribe for notification of a specific
message type. It is up to the subscriber that registered interest in a
specific message type to decipher its messages.

The interface to subscribe for notification is:

	int opal_message_notifier_register(enum OpalMessageType msg_type,
                                        struct notifier_block *nb)


The notifier will fetch the opal message when available and notify the
subscriber with the message type and the opal message. It is the
subscriber's responsibility to copy the message data before returning
from the notifier callback.

I will post a separate patch series for fsp memory handling, which uses
this new messaging channel to pull fsp memory errors.

Changes in v3:
- Fixed event numbering and naming issue.
- Per Ben's comment, renamed opal_message_handle_event() to
  opal_handle_message()

Changes in v2:
- Fixed opal token numbers to match with latest changes in opal.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/opal.h                |   23 ++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S |    2 +
 arch/powerpc/platforms/powernv/opal.c          |   90 ++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index c5cd728..eb0dfd4 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -129,6 +129,8 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_LPC_READ				67
 #define OPAL_LPC_WRITE				68
 #define OPAL_RETURN_CPU				69
+#define OPAL_GET_MSG				85
+#define OPAL_CHECK_ASYNC_COMPLETION		86
 
 #ifndef __ASSEMBLY__
 
@@ -208,7 +210,16 @@ enum OpalPendingState {
 	OPAL_EVENT_ERROR_LOG		= 0x40,
 	OPAL_EVENT_EPOW			= 0x80,
 	OPAL_EVENT_LED_STATUS		= 0x100,
-	OPAL_EVENT_PCI_ERROR		= 0x200
+	OPAL_EVENT_PCI_ERROR		= 0x200,
+	OPAL_EVENT_MSG_PENDING		= 0x800,
+};
+
+enum OpalMessageType {
+	OPAL_MSG_ASYNC_COMP		= 0,
+	OPAL_MSG_MEM_ERR,
+	OPAL_MSG_EPOW,
+	OPAL_MSG_SHUTDOWN,
+	OPAL_MSG_TYPE_MAX,
 };
 
 /* Machine check related definitions */
@@ -353,6 +364,12 @@ enum OpalLPCAddressType {
 	OPAL_LPC_FW	= 2,
 };
 
+struct opal_msg {
+	uint32_t msg_type;
+	uint32_t reserved;
+	uint64_t params[8];
+};
+
 struct opal_machine_check_event {
 	enum OpalMCE_Version	version:8;	/* 0x00 */
 	uint8_t			in_use;		/* 0x01 */
@@ -656,6 +673,8 @@ int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		       uint32_t addr, uint32_t data, uint32_t sz);
 int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		      uint32_t addr, uint32_t *data, uint32_t sz);
+int64_t opal_get_msg(uint64_t buffer, size_t size);
+int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
@@ -670,6 +689,8 @@ extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
 
 extern int opal_notifier_register(struct notifier_block *nb);
+extern int opal_message_notifier_register(enum OpalMessageType msg_type,
+						struct notifier_block *nb);
 extern void opal_notifier_enable(void);
 extern void opal_notifier_disable(void);
 extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8f38445..4c2e19a 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -116,3 +116,5 @@ OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
 OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
 OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
 OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
+OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2911abe..dd5a8b3 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -33,6 +33,7 @@ extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
 static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static DEFINE_SPINLOCK(opal_notifier_lock);
 static uint64_t last_notified_mask = 0x0ul;
 static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
@@ -162,6 +163,95 @@ void opal_notifier_disable(void)
 	atomic_set(&opal_notifier_hold, 1);
 }
 
+/*
+ * Register @nb for OPAL messages of type @msg_type. @msg_type indexes
+ * opal_msg_notifier_head[], so it must be below OPAL_MSG_TYPE_MAX.
+ */
+int opal_message_notifier_register(enum OpalMessageType msg_type,
+					struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+	if (msg_type >= OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Invalid message type argument (%d)\n",
+			   __func__, msg_type);
+		return -EINVAL;
+	}
+	return atomic_notifier_chain_register(
+				&opal_msg_notifier_head[msg_type], nb);
+}
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+	/* notify subscribers */
+	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+	s64 ret;
+	/*
+	 * Static buffer that OPAL fills with one pending message.
+	 * TODO: size it from opal-msg-size in /proc/device-tree.
+	 */
+	static struct opal_msg msg;
+
+	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	/* No opal message pending. */
+	if (ret == OPAL_RESOURCE)
+		return;
+
+	/* check for errors. */
+	if (ret) {
+		pr_warning("%s: Failed to retrive opal message, err=%lld\n",
+				__func__, ret);
+		return;
+	}
+
+	/* Sanity check: msg_type indexes opal_msg_notifier_head[] */
+	if (msg.msg_type >= OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Unknown message type: %u\n",
+				__func__, msg.msg_type);
+		return;
+	}
+	opal_message_do_notify(msg.msg_type, (void *)&msg);
+}
+
+static int opal_message_notify(struct notifier_block *nb,
+			  unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_MSG_PENDING)
+		opal_handle_message();
+	return 0;
+}
+
+static struct notifier_block opal_message_nb = {
+	.notifier_call	= opal_message_notify,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_message_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+	ret = opal_notifier_register(&opal_message_nb);
+	if (ret) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		       __func__, ret);
+		return ret;
+	}
+	return 0;
+}
+early_initcall(opal_message_init);
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 len, rc;

^ permalink raw reply related

* Re: [PATCH 16/51] DMA-API: ppc: vio.c: replace dma_set_mask()+dma_set_coherent_mask() with new helper
From: Cedric Le Goater @ 2013-11-18 10:54 UTC (permalink / raw)
  To: Russell King - ARM Linux; +Cc: Paul Mackerras, linuxppc-dev
In-Reply-To: <20131116153244.GD25039@n2100.arm.linux.org.uk>

On 11/16/2013 04:32 PM, Russell King - ARM Linux wrote:
> On Fri, Nov 15, 2013 at 05:16:55PM +0100, Cedric Le Goater wrote:
>> The new helper routine dma_set_mask_and_coherent() breaks the 
>> initialization of the pseries vio devices which do not have an 
>> initial dev->dma_mask. I think we need to use dma_coerce_mask_and_coherent()
>> instead.
> 
> Who wants to handle this patch?
>
> Also, is it possible to fix it so that dev->dma_mask is correctly setup
> by the code which creates the device, as it should be in the first place?

The vio_dev should probably be improved to setup devices as the other 
drivers do but we might be short on time to do that for 3.13 and pseries 
is really broken ... For now, I will just resend the patch with some 
context in the changelog. 

Thanks,

C.

^ permalink raw reply

* [PATCH] DMA-API: ppc: vio: use dma_coerce_mask_and_coherent()
From: Cédric Le Goater @ 2013-11-18 10:57 UTC (permalink / raw)
  To: benh; +Cc: Russell King, Cédric Le Goater, linuxppc-dev, Paul Mackerras
In-Reply-To: <20131116153244.GD25039@n2100.arm.linux.org.uk>

Commit 4886c399da70d5f8a4016c2213850dce6cac88c5 (DMA-API: ppc: vio.c: 
replace dma_set_mask()+dma_set_coherent_mask() with new helper) 
introduced the usage of the new helper routine dma_set_mask_and_coherent(). 
This breaks the initialization of the pseries vio devices which do not 
setup an initial dev->dma_mask for the device.

Signed-off-by: Cédric Le Goater <clg@fr.ibm.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Paul Mackerras <paulus@samba.org>
---

 arch/powerpc/kernel/vio.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index e7d0c88f..76a6482 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1419,7 +1419,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 
 		/* needed to ensure proper operation of coherent allocations
 		 * later, in case driver doesn't set it explicitly */
-		dma_set_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
+		dma_coerce_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
 	}
 
 	/* register with generic device framework */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] powerpc/powernv: Move SG list structure to header file
From: Vasant Hegde @ 2013-11-18 11:09 UTC (permalink / raw)
  To: linuxppc-dev

Move SG list and entry structure to header file so that
it can be used in other places as well.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/opal.h             |   22 +++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-flash.c |   31 +++++----------------------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 033c06b..d1af862 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -33,6 +33,28 @@ struct opal_takeover_args {
 	u64	rd_loc;			/* r11 */
 };
 
+/*
+ * SG entry
+ *
+ * WARNING: The current implementation requires each entry
+ * to represent a block that is 4k aligned *and* each block
+ * size except the last one in the list to be as well.
+ */
+struct opal_sg_entry {
+	void    *data;
+	long    length;
+};
+
+/* sg list */
+struct opal_sg_list {
+	unsigned long num_entries;
+	struct opal_sg_list *next;
+	struct opal_sg_entry entry[];
+};
+
+/* sg entries per PAGE_SIZE node; 16 is presumably the list header size */
+#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
+
 extern long opal_query_takeover(u64 *hal_size, u64 *hal_align);
 
 extern long opal_do_takeover(struct opal_takeover_args *args);
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index 6ffa6b1..4aeae4f 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -103,27 +103,6 @@ struct image_header_t {
 	uint32_t	size;
 };
 
-/* Scatter/gather entry */
-struct opal_sg_entry {
-	void	*data;
-	long	length;
-};
-
-/* We calculate number of entries based on PAGE_SIZE */
-#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
-
-/*
- * This struct is very similar but not identical to that
- * needed by the opal flash update. All we need to do for
- * opal is rewrite num_entries into a version/length and
- * translate the pointers to absolute.
- */
-struct opal_sg_list {
-	unsigned long num_entries;
-	struct opal_sg_list *next;
-	struct opal_sg_entry entry[SG_ENTRIES_PER_NODE];
-};
-
 struct validate_flash_t {
 	int		status;		/* Return status */
 	void		*buf;		/* Candiate image buffer */
@@ -333,7 +312,7 @@ static struct opal_sg_list *image_data_to_sglist(void)
 	addr = image_data.data;
 	size = image_data.size;
 
-	sg1 = kzalloc((sizeof(struct opal_sg_list)), GFP_KERNEL);
+	sg1 = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!sg1)
 		return NULL;
 
@@ -351,8 +330,7 @@ static struct opal_sg_list *image_data_to_sglist(void)
 
 		sg1->num_entries++;
 		if (sg1->num_entries >= SG_ENTRIES_PER_NODE) {
-			sg1->next = kzalloc((sizeof(struct opal_sg_list)),
-					    GFP_KERNEL);
+			sg1->next = kzalloc(PAGE_SIZE, GFP_KERNEL);
 			if (!sg1->next) {
 				pr_err("%s : Failed to allocate memory\n",
 				       __func__);
@@ -402,7 +380,10 @@ static int opal_flash_update(int op)
 		else
 			sg->next = NULL;
 
-		/* Make num_entries into the version/length field */
+		/*
+		 * Convert num_entries to version/length format
+		 * to satisfy OPAL.
+		 */
 		sg->num_entries = (SG_LIST_VERSION << 56) |
 			(sg->num_entries * sizeof(struct opal_sg_entry) + 16);
 	}

^ permalink raw reply related

* [PATCH] powerpc/powernv: Platform dump interface
From: Vasant Hegde @ 2013-11-18 11:09 UTC (permalink / raw)
  To: linuxppc-dev

This patch adds Platform dump retrieval interface.

Flow:
  - We register to OPAL notification event.
  - OPAL sends new dump available notification.
  - We retrieve the dump and send it to debugfs.
  - User copies the dump data and then sends an ACK via debugfs.
  - We send ACK to OPAL.

debugfs files:
  We create below dump related files under "fsp" directory.
  - dump		: Dump data
  - dump_available	: New dump available notification to userspace
  - dump_control	: ACK/initiate new dump
  - README		: README

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/opal.h                |   12 +
 arch/powerpc/platforms/powernv/Makefile        |    2 
 arch/powerpc/platforms/powernv/opal-dump.c     |  420 ++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S |    4 
 arch/powerpc/platforms/powernv/opal.c          |    2 
 5 files changed, 438 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-dump.c

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index d1af862..a1c5237 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -154,6 +154,10 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE			76
 #define OPAL_FLASH_MANAGE			77
 #define OPAL_FLASH_UPDATE			78
+#define OPAL_DUMP_INIT				81
+#define OPAL_DUMP_INFO				82
+#define OPAL_DUMP_READ				83
+#define OPAL_DUMP_ACK				84
 
 #ifndef __ASSEMBLY__
 
@@ -233,7 +237,8 @@ enum OpalPendingState {
 	OPAL_EVENT_ERROR_LOG		= 0x40,
 	OPAL_EVENT_EPOW			= 0x80,
 	OPAL_EVENT_LED_STATUS		= 0x100,
-	OPAL_EVENT_PCI_ERROR		= 0x200
+	OPAL_EVENT_PCI_ERROR		= 0x200,
+	OPAL_EVENT_DUMP_AVAIL		= 0x400,
 };
 
 /* Machine check related definitions */
@@ -752,6 +757,10 @@ int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 int64_t opal_validate_flash(uint64_t buffer, uint32_t *size, uint32_t *result);
 int64_t opal_manage_flash(uint8_t op);
 int64_t opal_update_flash(uint64_t blk_list);
+int64_t opal_dump_init(uint8_t dump_type);
+int64_t opal_dump_info(uint32_t *dump_id, uint32_t *dump_size);
+int64_t opal_dump_read(uint32_t dump_id, uint64_t buffer);
+int64_t opal_dump_ack(uint32_t dump_id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
@@ -781,6 +790,7 @@ extern void opal_get_rtc_time(struct rtc_time *tm);
 extern unsigned long opal_get_boot_time(void);
 extern void opal_nvram_init(void);
 extern void opal_flash_init(void);
+extern void opal_platform_dump_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 873fa13..379b215 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
 obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
-obj-y			+= rng.o
+obj-y			+= rng.o opal-dump.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 0000000..e102a80
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,420 @@
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kobject.h>
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+/* Dump type */
+#define DUMP_TYPE_FSP	0x01
+
+/* Extract failed */
+#define DUMP_NACK_ID	0x00
+
+/* Dump record */
+struct dump_record {
+	uint8_t		type;
+	uint32_t	id;
+	uint32_t	size;
+	char		*buffer;
+};
+static struct dump_record dump_record;
+
+/* Dump available status */
+static u32 dump_avail;
+
+/* Binary blobs */
+static struct debugfs_blob_wrapper dump_blob;
+static struct debugfs_blob_wrapper readme_blob;
+
+/* Ignore dump notification, if we fail to create debugfs files */
+static bool dump_disarmed = false;
+
+
+static void free_dump_sg_list(struct opal_sg_list *list)
+{
+	struct opal_sg_list *sg1;
+	while (list) {
+		sg1 = list->next;
+		kfree(list);
+		list = sg1;
+	}
+	list = NULL;
+}
+
+/*
+ * Build dump buffer scatter gather list
+ */
+static struct opal_sg_list *dump_data_to_sglist(void)
+{
+	struct opal_sg_list *sg1, *list = NULL;
+	void *addr;
+	int64_t size;
+
+	addr = dump_record.buffer;
+	size = dump_record.size;
+
+	sg1 = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!sg1)
+		goto nomem;
+
+	list = sg1;
+	sg1->num_entries = 0;
+	while (size > 0) {
+		/* Translate virtual address to physical address */
+		sg1->entry[sg1->num_entries].data =
+			(void *)(vmalloc_to_pfn(addr) << PAGE_SHIFT);
+
+		if (size > PAGE_SIZE)
+			sg1->entry[sg1->num_entries].length = PAGE_SIZE;
+		else
+			sg1->entry[sg1->num_entries].length = size;
+
+		sg1->num_entries++;
+		if (sg1->num_entries >= SG_ENTRIES_PER_NODE) {
+			sg1->next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+			if (!sg1->next)
+				goto nomem;
+
+			sg1 = sg1->next;
+			sg1->num_entries = 0;
+		}
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+	return list;
+
+nomem:
+	pr_err("%s : Failed to allocate memory\n", __func__);
+	free_dump_sg_list(list);
+	return NULL;
+}
+
+/*
+ * Rewrite each node in place: ->next becomes physical, num_entries a length
+ */
+static void sglist_to_phy_addr(struct opal_sg_list *list)
+{
+	struct opal_sg_list *sg, *next;
+
+	for (sg = list; sg; sg = next) {
+		next = sg->next;
+		/* Don't translate NULL pointer for last entry */
+		if (sg->next)
+			sg->next = (struct opal_sg_list *)__pa(sg->next);
+		else
+			sg->next = NULL;
+
+		/* Convert num_entries to length */
+		sg->num_entries =
+			sg->num_entries * sizeof(struct opal_sg_entry) + 16;
+	}
+}
+
+static void free_dump_data_buf(void)
+{
+	vfree(dump_record.buffer);
+	dump_record.size = 0;
+}
+
+/*
+ * Allocate dump data buffer.
+ */
+static int alloc_dump_data_buf(void)
+{
+	dump_record.buffer = vzalloc(PAGE_ALIGN(dump_record.size));
+	if (!dump_record.buffer) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Initiate FipS dump
+ */
+static int64_t dump_fips_init(uint8_t type)
+{
+	int rc;
+
+	rc = opal_dump_init(type);
+	if (rc)
+		pr_warn("%s: Failed to initiate FipS dump (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+/*
+ * Get dump ID and size.
+ */
+static int64_t dump_read_info(void)
+{
+	int rc;
+
+	rc = opal_dump_info(&dump_record.id, &dump_record.size);
+	if (rc)
+		pr_warn("%s: Failed to get dump info (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+/*
+ * Send acknowledgement for dump_id to OPAL; returns the OPAL call's result
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+	int rc;
+
+	rc = opal_dump_ack(dump_id);
+	if (rc)
+		pr_warn("%s: Failed to send ack message to ID 0x%x (%d)\n",
+			__func__, dump_id, rc);
+	return rc;
+}
+
+/*
+ * Allocate the dump buffer and have OPAL fill it through an sg list
+ */
+static int64_t dump_read_data(void)
+{
+	struct opal_sg_list *list, *next;
+	uint64_t addr;
+	int64_t rc;
+
+	rc = alloc_dump_data_buf();
+	if (rc)
+		goto out;
+
+	/* Generate SG list */
+	list = dump_data_to_sglist();
+	if (!list) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Translate sg list addr to real address; OPAL gets the first node */
+	sglist_to_phy_addr(list);
+	addr = __pa(list);
+
+	/* Fetch data; both busy codes mean "retry", so poll and back off */
+	rc = OPAL_BUSY;
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_dump_read(dump_record.id, addr);
+		if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			mdelay(10);
+		}
+	}
+
+	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+		pr_warn("%s: Extract dump failed for ID 0x%x\n",
+			__func__, dump_record.id);
+
+	/* ->next now holds physical addresses: map back before freeing */
+	for (; list; list = next) {
+		next = list->next ? __va((unsigned long)list->next) : NULL;
+		kfree(list);
+	}
+
+out:
+	return rc;
+}
+
+static int extract_dump(void)
+{
+	int rc;
+
+	/* Get dump ID, size */
+	rc = dump_read_info();
+	if (rc != OPAL_SUCCESS)
+		return rc;
+
+	/* Read dump data */
+	rc = dump_read_data();
+	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+		/*
+		 * Could not retrieve the dump (allocation or read failed).
+		 * Send negative ack so that we get notification again.
+		 */
+		dump_send_ack(DUMP_NACK_ID);
+
+		/* Free dump buffer */
+		free_dump_data_buf();
+
+		return rc;
+	}
+	if (rc == OPAL_PARTIAL)
+		pr_info("%s: Partially read dump ID 0x%x\n",
+			__func__, dump_record.id);
+
+	pr_info("%s: New platform dump available. ID = 0x%x\n",
+		__func__, dump_record.id);
+
+	/* Update dump blob */
+	dump_blob.data = (void *)dump_record.buffer;
+	dump_blob.size = dump_record.size;
+
+	/* Update dump available status */
+	dump_avail = 1;
+
+	return rc;
+}
+
+static void dump_extract_fn(struct work_struct *work)
+{
+	extract_dump();
+}
+
+static DECLARE_WORK(dump_work, dump_extract_fn);
+
+/* Workqueue to extract dump */
+static void schedule_extract_dump(void)
+{
+	schedule_work(&dump_work);
+}
+
+/*
+ * New dump available notification
+ *
+ * Once we get notification, we extract dump via OPAL call
+ * and then write dump to file.
+ */
+static int dump_event(struct notifier_block *nb,
+		      unsigned long events, void *change)
+{
+	/*
+	 * Don't retrieve dump, if we don't have debugfs
+	 * interface to pass data to userspace.
+	 */
+	if (dump_disarmed)
+		return 0;
+
+	/* Check for dump available notification */
+	if (events & OPAL_EVENT_DUMP_AVAIL)
+		schedule_extract_dump();
+
+	return 0;
+}
+
+static struct notifier_block dump_nb = {
+	.notifier_call  = dump_event,
+	.next           = NULL,
+	.priority       = 0
+};
+
+
+/* FIXME: debugfs README message */
+static const char readme_msg[] =
+	"This file will be populated shortly..";
+
+/* debugfs dump_control file operations */
+static ssize_t dump_control_write(struct file *file,
+				  const char __user *user_buf,
+				  size_t count, loff_t *ppos)
+{
+	char buf[4] = { 0 };	/* zeroed: an empty write selects 'default' */
+
+	if (copy_from_user(buf, user_buf, min(count, sizeof(buf) - 1)))
+		return -EFAULT;
+
+	switch (buf[0]) {
+	case '1':	/* Dump send ack */
+		if (dump_avail) {
+			dump_avail = 0;
+			dump_blob.data = NULL;	/* don't expose freed memory */
+			dump_blob.size = 0;
+			free_dump_data_buf();
+			dump_send_ack(dump_record.id);
+		}
+		break;
+	case '2':	/* Initiate FipS dump */
+		dump_fips_init(DUMP_TYPE_FSP);
+		break;
+	default:
+		break;
+	}
+	return count;
+}
+
+static const struct file_operations dump_control_fops = {
+	.open	= simple_open,
+	.write	= dump_control_write,
+	.llseek	= default_llseek,
+};
+
+/*
+ * Create "fsp" debugfs dir and dump files; disarm notifications on failure
+ */
+static int debugfs_dump_init(void)
+{
+	struct dentry *dir, *file;
+
+	/* FSP dump directory */
+	dir = debugfs_create_dir("fsp", NULL);
+	if (!dir)
+		goto out;
+
+	/* README */
+	readme_blob.data = (void *)readme_msg;
+	readme_blob.size = strlen(readme_msg);
+	file = debugfs_create_blob("README", 0400, dir, &readme_blob);
+	if (!file)
+		goto remove_dir;
+
+	/* Dump available notification (documented as dump_available) */
+	file = debugfs_create_u32("dump_available", 0400, dir, &dump_avail);
+	if (!file)
+		goto remove_dir;
+
+	/* data file */
+	dump_blob.data = (void *)dump_record.buffer;
+	dump_blob.size = dump_record.size;
+	file = debugfs_create_blob("dump", 0400, dir, &dump_blob);
+	if (!file)
+		goto remove_dir;
+
+	/* Control file */
+	file = debugfs_create_file("dump_control", 0200, dir,
+				   NULL, &dump_control_fops);
+	if (!file)
+		goto remove_dir;
+
+	return 0;
+
+remove_dir:
+	debugfs_remove_recursive(dir);
+
+out:
+	dump_disarmed = true;
+	return -1;
+}
+
+void __init opal_platform_dump_init(void)
+{
+	int ret;
+
+	/* Register for opal notifier */
+	ret = opal_notifier_register(&dump_nb);
+	if (ret) {
+		pr_warn("%s: Can't register OPAL event notifier (%d)\n",
+			__func__, ret);
+		return;
+	}
+
+	/* debugfs interface */
+	ret = debugfs_dump_init();
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e780650..1485a09 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -126,3 +126,7 @@ OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
 OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_read,			OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack,			OPAL_DUMP_ACK);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 1c798cd..7c7524c 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -442,6 +442,8 @@ static int __init opal_init(void)
 	if (rc == 0) {
 		/* Setup code update interface */
 		opal_flash_init();
+		/* Setup platform dump extract interface */
+		opal_platform_dump_init();
 	}
 
 	return 0;

^ permalink raw reply related

* [PATCH] powerpc/powernv: Update dump README file
From: Vasant Hegde @ 2013-11-18 11:09 UTC (permalink / raw)
  To: linuxppc-dev

Update dump README file content.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/opal-dump.c |   12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index e102a80..9bc8ad3 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -317,9 +317,17 @@ static struct notifier_block dump_nb = {
 };
 
 
-/* FIXME: debugfs README message */
+/* debugfs README message */
 static const char readme_msg[] =
-	"This file will be populated shortly..";
+	"Platform dump HOWTO:\n\n"
+	"files:\n"
+	"  dump                  - Binary file, contains actual dump data\n"
+	"  dump_available (r--)  - New dump available notification\n"
+	"                          0 : No dump present\n"
+	"                          1 : Dump present\n"
+	"  dump_control(-w-)     - Dump control file\n"
+	"                          1 : Send acknowledgement (dump copied)\n"
+	"                          2 : Initiate FipS dump\n";
 
 /* debugfs dump_control file operations */
 static ssize_t dump_control_write(struct file *file,

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox