LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4] xilinx_spi: Splitted into generic, of and platform driver, added support for DS570
From: Richard Röjfors @ 2009-09-28 14:22 UTC (permalink / raw)
  To: spi-devel-general; +Cc: linuxppc-dev, Andrew Morton, dbrownell, John Linn

This patch splits xilinx_spi into three parts, an OF and a platform
driver and generic part.

The generic part now also works on X86, it supports accessing the IP
booth big and little endian. There is also support for 16 and 32 bit
SPI for the Xilinx SPI IP DS570

Signed-off-by: Richard Röjfors <richard.rojfors@mocean-labs.com>
---
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 2c733c2..ecabc12 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -218,8 +218,8 @@ config SPI_TXX9
 	  SPI driver for Toshiba TXx9 MIPS SoCs

 config SPI_XILINX
-	tristate "Xilinx SPI controller"
-	depends on (XILINX_VIRTEX || MICROBLAZE) && EXPERIMENTAL
+	tristate "Xilinx SPI controller common module"
+	depends on HAS_IOMEM && EXPERIMENTAL
 	select SPI_BITBANG
 	help
 	  This exposes the SPI controller IP from the Xilinx EDK.
@@ -227,6 +227,21 @@ config SPI_XILINX
 	  See the "OPB Serial Peripheral Interface (SPI) (v1.00e)"
 	  Product Specification document (DS464) for hardware details.

+	  Or for the DS570, see "XPS Serial Peripheral Interface (SPI) (v2.00b)"
+
+config SPI_XILINX_OF
+	tristate "Xilinx SPI controller OF device"
+	depends on SPI_XILINX && XILINX_VIRTEX
+	help
+	  This is the OF driver for the SPI controller IP from the Xilinx EDK.
+
+config SPI_XILINX_PLTFM
+	tristate "Xilinx SPI controller platform device"
+	depends on SPI_XILINX
+	help
+	  This is the platform driver for the SPI controller IP
+	  from the Xilinx EDK.
+
 #
 # Add new SPI master controllers in alphabetical order above this line
 #
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index 3de408d..5a91cf5 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -30,6 +30,8 @@ obj-$(CONFIG_SPI_S3C24XX_GPIO)		+= spi_s3c24xx_gpio.o
 obj-$(CONFIG_SPI_S3C24XX)		+= spi_s3c24xx.o
 obj-$(CONFIG_SPI_TXX9)			+= spi_txx9.o
 obj-$(CONFIG_SPI_XILINX)		+= xilinx_spi.o
+obj-$(CONFIG_SPI_XILINX_OF)		+= xilinx_spi_of.o
+obj-$(CONFIG_SPI_XILINX_PLTFM)		+= xilinx_spi_pltfm.o
 obj-$(CONFIG_SPI_SH_SCI)		+= spi_sh_sci.o
 # 	... add above this line ...

diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 46b8c5c..b1fcc00 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -14,22 +14,35 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <linux/platform_device.h>
-
-#include <linux/of_platform.h>
-#include <linux/of_device.h>
-#include <linux/of_spi.h>

 #include <linux/spi/spi.h>
 #include <linux/spi/spi_bitbang.h>
 #include <linux/io.h>

-#define XILINX_SPI_NAME "xilinx_spi"
+#include "xilinx_spi.h"
+
+struct xilinx_spi {
+	/* bitbang has to be first */
+	struct spi_bitbang bitbang;
+	struct completion done;
+	struct resource mem; /* phys mem */
+	void __iomem	*regs;	/* virt. address of the control registers */
+	u32 irq;
+	u8 *rx_ptr;		/* pointer in the Tx buffer */
+	const u8 *tx_ptr;	/* pointer in the Rx buffer */
+	int remaining_bytes;	/* the number of bytes left to transfer */
+	/* offset to the XSPI regs, these might vary... */
+	u8 bits_per_word;
+	bool big_endian;	/* The device could be accessed big or little
+				 * endian
+				 */
+};
+

 /* Register definitions as per "OPB Serial Peripheral Interface (SPI) (v1.00e)
  * Product Specification", DS464
  */
-#define XSPI_CR_OFFSET		0x62	/* 16-bit Control Register */
+#define XSPI_CR_OFFSET		0x60	/* Control Register */

 #define XSPI_CR_ENABLE		0x02
 #define XSPI_CR_MASTER_MODE	0x04
@@ -40,8 +53,9 @@
 #define XSPI_CR_RXFIFO_RESET	0x40
 #define XSPI_CR_MANUAL_SSELECT	0x80
 #define XSPI_CR_TRANS_INHIBIT	0x100
+#define XSPI_CR_LSB_FIRST	0x200

-#define XSPI_SR_OFFSET		0x67	/* 8-bit Status Register */
+#define XSPI_SR_OFFSET		0x64	/* Status Register */

 #define XSPI_SR_RX_EMPTY_MASK	0x01	/* Receive FIFO is empty */
 #define XSPI_SR_RX_FULL_MASK	0x02	/* Receive FIFO is full */
@@ -49,8 +63,8 @@
 #define XSPI_SR_TX_FULL_MASK	0x08	/* Transmit FIFO is full */
 #define XSPI_SR_MODE_FAULT_MASK	0x10	/* Mode fault error */

-#define XSPI_TXD_OFFSET		0x6b	/* 8-bit Data Transmit Register */
-#define XSPI_RXD_OFFSET		0x6f	/* 8-bit Data Receive Register */
+#define XSPI_TXD_OFFSET		0x68	/* Data Transmit Register */
+#define XSPI_RXD_OFFSET		0x6C	/* Data Receive Register */

 #define XSPI_SSR_OFFSET		0x70	/* 32-bit Slave Select Register */

@@ -70,43 +84,72 @@
 #define XSPI_INTR_TX_UNDERRUN		0x08	/* TxFIFO was underrun */
 #define XSPI_INTR_RX_FULL		0x10	/* RxFIFO is full */
 #define XSPI_INTR_RX_OVERRUN		0x20	/* RxFIFO was overrun */
+#define XSPI_INTR_TX_HALF_EMPTY		0x40	/* TxFIFO is half empty */

 #define XIPIF_V123B_RESETR_OFFSET	0x40	/* IPIF reset register */
 #define XIPIF_V123B_RESET_MASK		0x0a	/* the value to write */

-struct xilinx_spi {
-	/* bitbang has to be first */
-	struct spi_bitbang bitbang;
-	struct completion done;
+/* to follow are some functions that does little of big endian read and
+ * write depending on the config of the device.
+ */
+static inline void xspi_write8(struct xilinx_spi *xspi, u32 offs, u8 val)
+{
+	iowrite8(val, xspi->regs + offs + ((xspi->big_endian) ? 3 : 0));
+}

-	void __iomem	*regs;	/* virt. address of the control registers */
+static inline void xspi_write16(struct xilinx_spi *xspi, u32 offs, u16 val)
+{
+	if (xspi->big_endian)
+		iowrite16be(val, xspi->regs + offs + 2);
+	else
+		iowrite16(val, xspi->regs + offs);
+}

-	u32		irq;
+static inline void xspi_write32(struct xilinx_spi *xspi, u32 offs, u32 val)
+{
+	if (xspi->big_endian)
+		iowrite32be(val, xspi->regs + offs);
+	else
+		iowrite32(val, xspi->regs + offs);
+}

-	u32		speed_hz; /* SCK has a fixed frequency of speed_hz Hz */
+static inline u8 xspi_read8(struct xilinx_spi *xspi, u32 offs)
+{
+	return ioread8(xspi->regs + offs + ((xspi->big_endian) ? 3 : 0));
+}

-	u8 *rx_ptr;		/* pointer in the Tx buffer */
-	const u8 *tx_ptr;	/* pointer in the Rx buffer */
-	int remaining_bytes;	/* the number of bytes left to transfer */
-};
+static inline u16 xspi_read16(struct xilinx_spi *xspi, u32 offs)
+{
+	if (xspi->big_endian)
+		return ioread16be(xspi->regs + offs + 2);
+	else
+		return ioread16(xspi->regs + offs);
+}
+
+static inline u32 xspi_read32(struct xilinx_spi *xspi, u32 offs)
+{
+	if (xspi->big_endian)
+		return ioread32be(xspi->regs + offs);
+	else
+		return ioread32(xspi->regs + offs);
+}

-static void xspi_init_hw(void __iomem *regs_base)
+static void xspi_init_hw(struct xilinx_spi *xspi)
 {
 	/* Reset the SPI device */
-	out_be32(regs_base + XIPIF_V123B_RESETR_OFFSET,
-		 XIPIF_V123B_RESET_MASK);
+	xspi_write32(xspi, XIPIF_V123B_RESETR_OFFSET, XIPIF_V123B_RESET_MASK);
 	/* Disable all the interrupts just in case */
-	out_be32(regs_base + XIPIF_V123B_IIER_OFFSET, 0);
+	xspi_write32(xspi, XIPIF_V123B_IIER_OFFSET, 0);
 	/* Enable the global IPIF interrupt */
-	out_be32(regs_base + XIPIF_V123B_DGIER_OFFSET,
-		 XIPIF_V123B_GINTR_ENABLE);
+	xspi_write32(xspi, XIPIF_V123B_DGIER_OFFSET, XIPIF_V123B_GINTR_ENABLE);
 	/* Deselect the slave on the SPI bus */
-	out_be32(regs_base + XSPI_SSR_OFFSET, 0xffff);
+	xspi_write32(xspi, XSPI_SSR_OFFSET, 0xffff);
 	/* Disable the transmitter, enable Manual Slave Select Assertion,
 	 * put SPI controller into master mode, and enable it */
-	out_be16(regs_base + XSPI_CR_OFFSET,
-		 XSPI_CR_TRANS_INHIBIT | XSPI_CR_MANUAL_SSELECT
-		 | XSPI_CR_MASTER_MODE | XSPI_CR_ENABLE);
+	xspi_write16(xspi, XSPI_CR_OFFSET,
+		 XSPI_CR_TRANS_INHIBIT | XSPI_CR_MANUAL_SSELECT |
+		 XSPI_CR_MASTER_MODE | XSPI_CR_ENABLE | XSPI_CR_TXFIFO_RESET |
+		 XSPI_CR_RXFIFO_RESET);
 }

 static void xilinx_spi_chipselect(struct spi_device *spi, int is_on)
@@ -115,16 +158,16 @@ static void xilinx_spi_chipselect(struct spi_device *spi, int is_on)

 	if (is_on == BITBANG_CS_INACTIVE) {
 		/* Deselect the slave on the SPI bus */
-		out_be32(xspi->regs + XSPI_SSR_OFFSET, 0xffff);
+		xspi_write32(xspi, XSPI_SSR_OFFSET, 0xffff);
 	} else if (is_on == BITBANG_CS_ACTIVE) {
 		/* Set the SPI clock phase and polarity */
-		u16 cr = in_be16(xspi->regs + XSPI_CR_OFFSET)
+		u16 cr = xspi_read16(xspi, XSPI_CR_OFFSET)
 			 & ~XSPI_CR_MODE_MASK;
 		if (spi->mode & SPI_CPHA)
 			cr |= XSPI_CR_CPHA;
 		if (spi->mode & SPI_CPOL)
 			cr |= XSPI_CR_CPOL;
-		out_be16(xspi->regs + XSPI_CR_OFFSET, cr);
+		xspi_write16(xspi, XSPI_CR_OFFSET, cr);

 		/* We do not check spi->max_speed_hz here as the SPI clock
 		 * frequency is not software programmable (the IP block design
@@ -132,24 +175,27 @@ static void xilinx_spi_chipselect(struct spi_device *spi, int is_on)
 		 */

 		/* Activate the chip select */
-		out_be32(xspi->regs + XSPI_SSR_OFFSET,
+		xspi_write32(xspi, XSPI_SSR_OFFSET,
 			 ~(0x0001 << spi->chip_select));
 	}
 }

 /* spi_bitbang requires custom setup_transfer() to be defined if there is a
  * custom txrx_bufs(). We have nothing to setup here as the SPI IP block
- * supports just 8 bits per word, and SPI clock can't be changed in software.
- * Check for 8 bits per word. Chip select delay calculations could be
+ * supports 8 or 16 bits per word, which can not be changed in software.
+ * SPI clock can't be changed in software.
+ * Check for correct bits per word. Chip select delay calculations could be
  * added here as soon as bitbang_work() can be made aware of the delay value.
  */
 static int xilinx_spi_setup_transfer(struct spi_device *spi,
-		struct spi_transfer *t)
+	struct spi_transfer *t)
 {
 	u8 bits_per_word;
+	struct xilinx_spi *xspi = spi_master_get_devdata(spi->master);

-	bits_per_word = (t) ? t->bits_per_word : spi->bits_per_word;
-	if (bits_per_word != 8) {
+	bits_per_word = (t->bits_per_word) ? t->bits_per_word :
+		spi->bits_per_word;
+	if (bits_per_word != xspi->bits_per_word) {
 		dev_err(&spi->dev, "%s, unsupported bits_per_word=%d\n",
 			__func__, bits_per_word);
 		return -EINVAL;
@@ -160,34 +206,50 @@ static int xilinx_spi_setup_transfer(struct spi_device *spi,

 static int xilinx_spi_setup(struct spi_device *spi)
 {
-	struct spi_bitbang *bitbang;
-	struct xilinx_spi *xspi;
-	int retval;
-
-	xspi = spi_master_get_devdata(spi->master);
-	bitbang = &xspi->bitbang;
-
-	retval = xilinx_spi_setup_transfer(spi, NULL);
-	if (retval < 0)
-		return retval;
-
+	/* always return 0, we can not check the number of bits.
+	 * There are cases when SPI setup is called before any driver is
+	 * there, in that case the SPI core defaults to 8 bits, which we
+	 * do not support in some cases. But if we return an error, the
+	 * SPI device would not be registered and no driver can get hold of it
+	 * When the driver is there, it will call SPI setup again with the
+	 * correct number of bits per transfer.
+	 * If a driver setups with the wrong bit number, it will fail when
+	 * it tries to do a transfer
+	 */
 	return 0;
 }

 static void xilinx_spi_fill_tx_fifo(struct xilinx_spi *xspi)
 {
 	u8 sr;
+	u8 wsize;
+	if (xspi->bits_per_word == 8)
+		wsize = 1;
+	else if (xspi->bits_per_word == 16)
+		wsize = 2;
+	else
+		wsize = 4;

 	/* Fill the Tx FIFO with as many bytes as possible */
-	sr = in_8(xspi->regs + XSPI_SR_OFFSET);
-	while ((sr & XSPI_SR_TX_FULL_MASK) == 0 && xspi->remaining_bytes > 0) {
+	sr = xspi_read8(xspi, XSPI_SR_OFFSET);
+	while ((sr & XSPI_SR_TX_FULL_MASK) == 0 &&
+		xspi->remaining_bytes > 0) {
 		if (xspi->tx_ptr) {
-			out_8(xspi->regs + XSPI_TXD_OFFSET, *xspi->tx_ptr++);
-		} else {
-			out_8(xspi->regs + XSPI_TXD_OFFSET, 0);
-		}
-		xspi->remaining_bytes--;
-		sr = in_8(xspi->regs + XSPI_SR_OFFSET);
+			if (wsize == 1)
+				xspi_write8(xspi, XSPI_TXD_OFFSET,
+					*xspi->tx_ptr);
+			else if (wsize == 2)
+				xspi_write16(xspi, XSPI_TXD_OFFSET,
+					*(u16 *)(xspi->tx_ptr));
+			else if (wsize == 4)
+				xspi_write32(xspi, XSPI_TXD_OFFSET,
+					*(u32 *)(xspi->tx_ptr));
+
+			xspi->tx_ptr += wsize;
+		} else
+			xspi_write8(xspi, XSPI_TXD_OFFSET, 0);
+		xspi->remaining_bytes -= wsize;
+		sr = xspi_read8(xspi, XSPI_SR_OFFSET);
 	}
 }

@@ -209,23 +271,22 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
 	/* Enable the transmit empty interrupt, which we use to determine
 	 * progress on the transmission.
 	 */
-	ipif_ier = in_be32(xspi->regs + XIPIF_V123B_IIER_OFFSET);
-	out_be32(xspi->regs + XIPIF_V123B_IIER_OFFSET,
+	ipif_ier = xspi_read32(xspi, XIPIF_V123B_IIER_OFFSET);
+	xspi_write32(xspi, XIPIF_V123B_IIER_OFFSET,
 		 ipif_ier | XSPI_INTR_TX_EMPTY);

 	/* Start the transfer by not inhibiting the transmitter any longer */
-	cr = in_be16(xspi->regs + XSPI_CR_OFFSET) & ~XSPI_CR_TRANS_INHIBIT;
-	out_be16(xspi->regs + XSPI_CR_OFFSET, cr);
+	cr = xspi_read16(xspi, XSPI_CR_OFFSET) & ~XSPI_CR_TRANS_INHIBIT;
+	xspi_write16(xspi, XSPI_CR_OFFSET, cr);

 	wait_for_completion(&xspi->done);

 	/* Disable the transmit empty interrupt */
-	out_be32(xspi->regs + XIPIF_V123B_IIER_OFFSET, ipif_ier);
+	xspi_write32(xspi, XIPIF_V123B_IIER_OFFSET, ipif_ier);

 	return t->len - xspi->remaining_bytes;
 }

-
 /* This driver supports single master mode only. Hence Tx FIFO Empty
  * is the only interrupt we care about.
  * Receive FIFO Overrun, Transmit FIFO Underrun, Mode Fault, and Slave Mode
@@ -237,32 +298,50 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id)
 	u32 ipif_isr;

 	/* Get the IPIF interrupts, and clear them immediately */
-	ipif_isr = in_be32(xspi->regs + XIPIF_V123B_IISR_OFFSET);
-	out_be32(xspi->regs + XIPIF_V123B_IISR_OFFSET, ipif_isr);
+	ipif_isr = xspi_read32(xspi, XIPIF_V123B_IISR_OFFSET);
+	xspi_write32(xspi, XIPIF_V123B_IISR_OFFSET, ipif_isr);

 	if (ipif_isr & XSPI_INTR_TX_EMPTY) {	/* Transmission completed */
 		u16 cr;
 		u8 sr;
+		u8 rsize;
+		if (xspi->bits_per_word == 8)
+			rsize = 1;
+		else if (xspi->bits_per_word == 16)
+			rsize = 2;
+		else
+			rsize = 4;

 		/* A transmit has just completed. Process received data and
 		 * check for more data to transmit. Always inhibit the
 		 * transmitter while the Isr refills the transmit register/FIFO,
 		 * or make sure it is stopped if we're done.
 		 */
-		cr = in_be16(xspi->regs + XSPI_CR_OFFSET);
-		out_be16(xspi->regs + XSPI_CR_OFFSET,
-			 cr | XSPI_CR_TRANS_INHIBIT);
+		cr = xspi_read16(xspi, XSPI_CR_OFFSET);
+		xspi_write16(xspi, XSPI_CR_OFFSET, cr | XSPI_CR_TRANS_INHIBIT);

 		/* Read out all the data from the Rx FIFO */
-		sr = in_8(xspi->regs + XSPI_SR_OFFSET);
+		sr = xspi_read8(xspi, XSPI_SR_OFFSET);
 		while ((sr & XSPI_SR_RX_EMPTY_MASK) == 0) {
-			u8 data;
+			u32 data;
+			if (rsize == 1)
+				data = xspi_read8(xspi, XSPI_RXD_OFFSET);
+			else if (rsize == 2)
+				data = xspi_read16(xspi, XSPI_RXD_OFFSET);
+			else
+				data = xspi_read32(xspi, XSPI_RXD_OFFSET);

-			data = in_8(xspi->regs + XSPI_RXD_OFFSET);
 			if (xspi->rx_ptr) {
-				*xspi->rx_ptr++ = data;
+				if (rsize == 1)
+					*xspi->rx_ptr = data & 0xff;
+				else if (rsize == 2)
+					*(u16 *)(xspi->rx_ptr) = data & 0xffff;
+				else
+					*((u32 *)(xspi->rx_ptr)) = data;
+				xspi->rx_ptr += rsize;
 			}
-			sr = in_8(xspi->regs + XSPI_SR_OFFSET);
+
+			sr = xspi_read8(xspi, XSPI_SR_OFFSET);
 		}

 		/* See if there is more data to send */
@@ -271,7 +350,7 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id)
 			/* Start the transfer by not inhibiting the
 			 * transmitter any longer
 			 */
-			out_be16(xspi->regs + XSPI_CR_OFFSET, cr);
+			xspi_write16(xspi, XSPI_CR_OFFSET, cr);
 		} else {
 			/* No more data to send.
 			 * Indicate the transfer is completed.
@@ -279,44 +358,21 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id)
 			complete(&xspi->done);
 		}
 	}
-
 	return IRQ_HANDLED;
 }

-static int __init xilinx_spi_of_probe(struct of_device *ofdev,
-					const struct of_device_id *match)
+struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
+	u32 irq, s16 bus_num, u16 num_chipselect, u8 bits_per_word,
+	bool big_endian)
 {
 	struct spi_master *master;
 	struct xilinx_spi *xspi;
-	struct resource r_irq_struct;
-	struct resource r_mem_struct;
-
-	struct resource *r_irq = &r_irq_struct;
-	struct resource *r_mem = &r_mem_struct;
-	int rc = 0;
-	const u32 *prop;
-	int len;
-
-	/* Get resources(memory, IRQ) associated with the device */
-	master = spi_alloc_master(&ofdev->dev, sizeof(struct xilinx_spi));
-
-	if (master == NULL) {
-		return -ENOMEM;
-	}
+	int ret = 0;

-	dev_set_drvdata(&ofdev->dev, master);
+	master = spi_alloc_master(dev, sizeof(struct xilinx_spi));

-	rc = of_address_to_resource(ofdev->node, 0, r_mem);
-	if (rc) {
-		dev_warn(&ofdev->dev, "invalid address\n");
-		goto put_master;
-	}
-
-	rc = of_irq_to_resource(ofdev->node, 0, r_irq);
-	if (rc == NO_IRQ) {
-		dev_warn(&ofdev->dev, "no IRQ found\n");
-		goto put_master;
-	}
+	if (master == NULL)
+		return ERR_PTR(-ENOMEM);

 	/* the spi->mode bits understood by this driver: */
 	master->mode_bits = SPI_CPOL | SPI_CPHA;
@@ -329,128 +385,73 @@ static int __init xilinx_spi_of_probe(struct of_device *ofdev,
 	xspi->bitbang.master->setup = xilinx_spi_setup;
 	init_completion(&xspi->done);

-	xspi->irq = r_irq->start;
-
-	if (!request_mem_region(r_mem->start,
-			r_mem->end - r_mem->start + 1, XILINX_SPI_NAME)) {
-		rc = -ENXIO;
-		dev_warn(&ofdev->dev, "memory request failure\n");
+	if (!request_mem_region(mem->start, resource_size(mem),
+		XILINX_SPI_NAME)) {
+		ret = -ENXIO;
 		goto put_master;
 	}

-	xspi->regs = ioremap(r_mem->start, r_mem->end - r_mem->start + 1);
+	xspi->regs = ioremap(mem->start, resource_size(mem));
 	if (xspi->regs == NULL) {
-		rc = -ENOMEM;
-		dev_warn(&ofdev->dev, "ioremap failure\n");
-		goto release_mem;
+		ret = -ENOMEM;
+		dev_warn(dev, "ioremap failure\n");
+		goto map_failed;
 	}
-	xspi->irq = r_irq->start;

-	/* dynamic bus assignment */
-	master->bus_num = -1;
+	master->bus_num = bus_num;
+	master->num_chipselect = num_chipselect;

-	/* number of slave select bits is required */
-	prop = of_get_property(ofdev->node, "xlnx,num-ss-bits", &len);
-	if (!prop || len < sizeof(*prop)) {
-		dev_warn(&ofdev->dev, "no 'xlnx,num-ss-bits' property\n");
-		goto unmap_io;
-	}
-	master->num_chipselect = *prop;
+	xspi->mem = *mem;
+	xspi->irq = irq;
+	xspi->bits_per_word = bits_per_word;
+	xspi->big_endian = big_endian;

 	/* SPI controller initializations */
-	xspi_init_hw(xspi->regs);
+	xspi_init_hw(xspi);

 	/* Register for SPI Interrupt */
-	rc = request_irq(xspi->irq, xilinx_spi_irq, 0, XILINX_SPI_NAME, xspi);
-	if (rc != 0) {
-		dev_warn(&ofdev->dev, "irq request failure: %d\n", xspi->irq);
+	ret = request_irq(xspi->irq, xilinx_spi_irq, 0, XILINX_SPI_NAME, xspi);
+	if (ret != 0)
 		goto unmap_io;
-	}

-	rc = spi_bitbang_start(&xspi->bitbang);
-	if (rc != 0) {
-		dev_err(&ofdev->dev, "spi_bitbang_start FAILED\n");
+	ret = spi_bitbang_start(&xspi->bitbang);
+	if (ret != 0) {
+		dev_err(dev, "spi_bitbang_start FAILED\n");
 		goto free_irq;
 	}

-	dev_info(&ofdev->dev, "at 0x%08X mapped to 0x%08X, irq=%d\n",
-			(unsigned int)r_mem->start, (u32)xspi->regs, xspi->irq);
-
-	/* Add any subnodes on the SPI bus */
-	of_register_spi_devices(master, ofdev->node);
-
-	return rc;
+	dev_info(dev, "at 0x%08X mapped to 0x%08X, irq=%d\n",
+		(u32)mem->start, (u32)xspi->regs, xspi->irq);
+	return master;

 free_irq:
 	free_irq(xspi->irq, xspi);
 unmap_io:
 	iounmap(xspi->regs);
-release_mem:
-	release_mem_region(r_mem->start, resource_size(r_mem));
+map_failed:
+	release_mem_region(mem->start, resource_size(mem));
 put_master:
 	spi_master_put(master);
-	return rc;
+	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(xilinx_spi_init);

-static int __devexit xilinx_spi_remove(struct of_device *ofdev)
+void xilinx_spi_deinit(struct spi_master *master)
 {
 	struct xilinx_spi *xspi;
-	struct spi_master *master;
-	struct resource r_mem;

-	master = platform_get_drvdata(ofdev);
 	xspi = spi_master_get_devdata(master);

 	spi_bitbang_stop(&xspi->bitbang);
 	free_irq(xspi->irq, xspi);
 	iounmap(xspi->regs);
-	if (!of_address_to_resource(ofdev->node, 0, &r_mem))
-		release_mem_region(r_mem.start, resource_size(&r_mem));
-	dev_set_drvdata(&ofdev->dev, 0);
-	spi_master_put(xspi->bitbang.master);
-
-	return 0;
-}
-
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:" XILINX_SPI_NAME);
-
-static int __exit xilinx_spi_of_remove(struct of_device *op)
-{
-	return xilinx_spi_remove(op);
-}
-
-static struct of_device_id xilinx_spi_of_match[] = {
-	{ .compatible = "xlnx,xps-spi-2.00.a", },
-	{ .compatible = "xlnx,xps-spi-2.00.b", },
-	{}
-};
-
-MODULE_DEVICE_TABLE(of, xilinx_spi_of_match);
-
-static struct of_platform_driver xilinx_spi_of_driver = {
-	.owner = THIS_MODULE,
-	.name = "xilinx-xps-spi",
-	.match_table = xilinx_spi_of_match,
-	.probe = xilinx_spi_of_probe,
-	.remove = __exit_p(xilinx_spi_of_remove),
-	.driver = {
-		.name = "xilinx-xps-spi",
-		.owner = THIS_MODULE,
-	},
-};

-static int __init xilinx_spi_init(void)
-{
-	return of_register_platform_driver(&xilinx_spi_of_driver);
+	release_mem_region(xspi->mem.start, resource_size(&xspi->mem));
+	spi_master_put(xspi->bitbang.master);
 }
-module_init(xilinx_spi_init);
+EXPORT_SYMBOL(xilinx_spi_deinit);

-static void __exit xilinx_spi_exit(void)
-{
-	of_unregister_platform_driver(&xilinx_spi_of_driver);
-}
-module_exit(xilinx_spi_exit);
 MODULE_AUTHOR("MontaVista Software, Inc. <source@mvista.com>");
 MODULE_DESCRIPTION("Xilinx SPI driver");
 MODULE_LICENSE("GPL");
+
diff --git a/drivers/spi/xilinx_spi.h b/drivers/spi/xilinx_spi.h
new file mode 100644
index 0000000..e0ad67e
--- /dev/null
+++ b/drivers/spi/xilinx_spi.h
@@ -0,0 +1,33 @@
+/*
+ * xilinx_spi.h
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _XILINX_SPI_H_
+#define _XILINX_SPI_H_ 1
+
+#include <linux/spi/spi.h>
+#include <linux/spi/spi_bitbang.h>
+#include <linux/spi/xilinx_spi.h>
+
+#define XILINX_SPI_NAME "xilinx_spi"
+
+struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
+	u32 irq, s16 bus_num, u16 num_chipselect, u8 bits_per_word,
+	bool big_endian);
+
+void xilinx_spi_deinit(struct spi_master *master);
+#endif
diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c
new file mode 100644
index 0000000..f101a9f
--- /dev/null
+++ b/drivers/spi/xilinx_spi_of.c
@@ -0,0 +1,120 @@
+/*
+ * xilinx_spi_of.c
+ *
+ * Xilinx SPI controller driver (master mode only)
+ *
+ * Author: MontaVista Software, Inc.
+ *	source@mvista.com
+ *
+ * 2002-2007 (c) MontaVista Software, Inc.  This file is licensed under the
+ * terms of the GNU General Public License version 2.  This program is licensed
+ * "as is" without any warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/platform_device.h>
+
+#include <linux/of_platform.h>
+#include <linux/of_device.h>
+#include <linux/of_spi.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/spi_bitbang.h>
+
+#include "xilinx_spi.h"
+
+
+static int __init xilinx_spi_of_probe(struct of_device *ofdev,
+					const struct of_device_id *match)
+{
+	struct resource r_irq_struct;
+	struct resource r_mem_struct;
+	struct spi_master *master;
+
+	struct resource *r_irq = &r_irq_struct;
+	struct resource *r_mem = &r_mem_struct;
+	int rc = 0;
+	const u32 *prop;
+	int len;
+
+	rc = of_address_to_resource(ofdev->node, 0, r_mem);
+	if (rc) {
+		dev_warn(&ofdev->dev, "invalid address\n");
+		return rc;
+	}
+
+	rc = of_irq_to_resource(ofdev->node, 0, r_irq);
+	if (rc == NO_IRQ) {
+		dev_warn(&ofdev->dev, "no IRQ found\n");
+		return -ENODEV;
+	}
+
+	/* number of slave select bits is required */
+	prop = of_get_property(ofdev->node, "xlnx,num-ss-bits", &len);
+	if (!prop || len < sizeof(*prop)) {
+		dev_warn(&ofdev->dev, "no 'xlnx,num-ss-bits' property\n");
+		return -EINVAL;
+	}
+	master = xilinx_spi_init(&ofdev->dev, r_mem, r_irq->start, -1, *prop, 8,
+		true);
+	if (IS_ERR(master))
+		return PTR_ERR(master);
+
+	dev_set_drvdata(&ofdev->dev, master);
+
+	/* Add any subnodes on the SPI bus */
+	of_register_spi_devices(master, ofdev->node);
+
+	return 0;
+}
+
+static int __devexit xilinx_spi_remove(struct of_device *ofdev)
+{
+	xilinx_spi_deinit(dev_get_drvdata(&ofdev->dev));
+	dev_set_drvdata(&ofdev->dev, 0);
+	return 0;
+}
+
+static int __exit xilinx_spi_of_remove(struct of_device *op)
+{
+	return xilinx_spi_remove(op);
+}
+
+static struct of_device_id xilinx_spi_of_match[] = {
+	{ .compatible = "xlnx,xps-spi-2.00.a", },
+	{ .compatible = "xlnx,xps-spi-2.00.b", },
+	{}
+};
+
+MODULE_DEVICE_TABLE(of, xilinx_spi_of_match);
+
+static struct of_platform_driver xilinx_spi_of_driver = {
+	.owner = THIS_MODULE,
+	.name = "xilinx-xps-spi",
+	.match_table = xilinx_spi_of_match,
+	.probe = xilinx_spi_of_probe,
+	.remove = __exit_p(xilinx_spi_of_remove),
+	.driver = {
+		.name = "xilinx-xps-spi",
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __init xilinx_spi_of_init(void)
+{
+	return of_register_platform_driver(&xilinx_spi_of_driver);
+}
+module_init(xilinx_spi_of_init);
+
+static void __exit xilinx_spi_of_exit(void)
+{
+	of_unregister_platform_driver(&xilinx_spi_of_driver);
+}
+module_exit(xilinx_spi_of_exit);
+MODULE_AUTHOR("MontaVista Software, Inc. <source@mvista.com>");
+MODULE_DESCRIPTION("Xilinx SPI driver");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/spi/xilinx_spi_pltfm.c b/drivers/spi/xilinx_spi_pltfm.c
new file mode 100644
index 0000000..81ac297
--- /dev/null
+++ b/drivers/spi/xilinx_spi_pltfm.c
@@ -0,0 +1,104 @@
+/*
+ * xilinx_spi_pltfm.c Support for Xilinx SPI platform devices
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Xilinx SPI devices as platform devices
+ *
+ * Inspired by xilinx_spi.c, 2002-2007 (c) MontaVista Software, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/platform_device.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/spi_bitbang.h>
+#include <linux/spi/xilinx_spi.h>
+
+#include "xilinx_spi.h"
+
+static int __devinit xilinx_spi_probe(struct platform_device *dev)
+{
+	struct xspi_platform_data *pdata;
+	struct resource *r;
+	int irq;
+	struct spi_master *master;
+	u8 i;
+
+	pdata = dev->dev.platform_data;
+	if (pdata == NULL)
+		return -ENODEV;
+
+	r = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (r == NULL)
+		return -ENODEV;
+
+	irq = platform_get_irq(dev, 0);
+	if (irq < 0)
+		return -ENXIO;
+
+	master = xilinx_spi_init(&dev->dev, r, irq, dev->id,
+		pdata->num_chipselect, pdata->bits_per_word, false);
+	if (IS_ERR(master))
+		return PTR_ERR(master);
+
+	for (i = 0; i < pdata->num_devices; i++)
+		spi_new_device(master, pdata->devices + i);
+
+	platform_set_drvdata(dev, master);
+	return 0;
+}
+
+static int __devexit xilinx_spi_remove(struct platform_device *dev)
+{
+	xilinx_spi_deinit(platform_get_drvdata(dev));
+	platform_set_drvdata(dev, 0);
+
+	return 0;
+}
+
+/* work with hotplug and coldplug */
+MODULE_ALIAS("platform:" XILINX_SPI_NAME);
+
+static struct platform_driver xilinx_spi_driver = {
+	.probe	= xilinx_spi_probe,
+	.remove	= __devexit_p(xilinx_spi_remove),
+	.driver = {
+		.name = XILINX_SPI_NAME,
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __init xilinx_spi_pltfm_init(void)
+{
+	return platform_driver_register(&xilinx_spi_driver);
+}
+module_init(xilinx_spi_pltfm_init);
+
+static void __exit xilinx_spi_pltfm_exit(void)
+{
+	platform_driver_unregister(&xilinx_spi_driver);
+}
+module_exit(xilinx_spi_pltfm_exit);
+
+MODULE_AUTHOR("Mocean Laboratories <info@mocean-labs.com>");
+MODULE_DESCRIPTION("Xilinx SPI platform driver");
+MODULE_LICENSE("GPL v2");
+
diff --git a/include/linux/spi/xilinx_spi.h b/include/linux/spi/xilinx_spi.h
new file mode 100644
index 0000000..03e1301
--- /dev/null
+++ b/include/linux/spi/xilinx_spi.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_SPI_XILINX_SPI_H
+#define __LINUX_SPI_XILINX_SPI_H
+
+/**
+ * struct xspi_platform_data - Platform data of the Xilinx SPI driver
+ * @num_chipselect:	Number of chip select by the IP
+ * @bits_per_word:	Number of bits per word. 8/16/32, Note that the DS464
+ *			only support 8bit SPI.
+ * @devices:		Devices to add when the driver is probed.
+ * @num_devices:	Number of devices in the devices array.
+ */
+struct xspi_platform_data {
+	u16 num_chipselect;
+	u8 bits_per_word;
+	struct spi_board_info *devices;
+	u8 num_devices;
+};
+
+#endif /* __LINUX_SPI_XILINX_SPI_H */

^ permalink raw reply related

* Re: [PATCH v3 0/3] cpu: pseries: Cpu offline states framework
From: Vaidyanathan Srinivasan @ 2009-09-28 13:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Gautham R Shenoy, linux-kernel,
	Venkatesh Pallipadi, Arun R Bharadwaj, linuxppc-dev,
	Darrick J. Wong
In-Reply-To: <1253913169.7103.529.camel@pasglop>

* Benjamin Herrenschmidt <benh@kernel.crashing.org> [2009-09-26 07:12:48]:

> On Fri, 2009-09-25 at 16:48 +0200, Peter Zijlstra wrote:
> > On Thu, 2009-09-24 at 10:51 +1000, Benjamin Herrenschmidt wrote:
> > > On Tue, 2009-09-15 at 14:11 +0200, Peter Zijlstra wrote:
> > > > I still think its a layering violation... its the hypervisor manager
> > > > that should be bothered in what state an off-lined cpu is in. 
> > > > 
> > > That's not how our hypervisor works.
> > 
> > Then fix it?
> 
> Are you serious ? :-)
> 
> > CPU hotplug is terribly invasive and expensive to the kernel, doing
> > hotplug on a minute basis is just plain crazy.
> > 
> > If you want a CPU in a keep it near and don't hand it back to the HV
> > state, why not use cpusets to isolate it and simply not run tasks on it?
> > 
> > cpusets don't use stopmachine and are much nicer to the rest of the
> > kernel over-all.
> 
> Gautham, what is the different in term of power saving between having
> it idle for long periods of time (which could do H_CEDE and with NO_HZ,
> probably wouln't need to wake up that often) and having it unplugged in
> a H_CEDE loop ?

Hi Ben,

A cede latency specifier value indicating latency expectation of the
guest OS can be established in the VPA to inform the hypervisor during
the H_CEDE call.  Currently, we do call H_CEDE during NO_HZ for
efficient idle.  However, higher cede latency values may not be
suitable for idle CPUs in the kernel and instead more energy savings
may result from exploiting this feature through CPU hotplug
interface.

--Vaidy

^ permalink raw reply

* Re: [PATCH v3 0/3] cpu: pseries: Cpu offline states framework
From: Vaidyanathan Srinivasan @ 2009-09-28 13:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Gautham R Shenoy, Venkatesh Pallipadi, linux-kernel,
	Arun R Bharadwaj, linuxppc-dev, Darrick J. Wong
In-Reply-To: <1253890120.18939.189.camel@laptop>

* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-09-25 16:48:40]:

> On Thu, 2009-09-24 at 10:51 +1000, Benjamin Herrenschmidt wrote:
> > On Tue, 2009-09-15 at 14:11 +0200, Peter Zijlstra wrote:
> > > I still think its a layering violation... its the hypervisor manager
> > > that should be bothered in what state an off-lined cpu is in. 
> > > 
> > That's not how our hypervisor works.
> 
> Then fix it?
> 
> > If you ask through the management interface, to remove a CPU from a
> > partition, the HV will communicate with a daemon inside the partition
> > that will then unplug the CPU via the right call.
> > 
> > I don't really understand your objections to be honest. And I fail to
> > see why it would be a layering violation to have the ability for the OS
> > to indicate in what state it wishes to relinguish a CPU to the
> > hypervisor, which more or less defines what is the expected latency for
> > getting it back later on.
> 
> OK, so the main objection is the abuse of CPU hotplug as resource
> management feature.
> 
> CPU hotplug is terribly invasive and expensive to the kernel, doing
> hotplug on a minute basis is just plain crazy.
> 
> If you want a CPU in a keep it near and don't hand it back to the HV
> state, why not use cpusets to isolate it and simply not run tasks on it?
> 
> cpusets don't use stopmachine and are much nicer to the rest of the
> kernel over-all.

Hi Peter,

This interface is not expected to be used every minute or so to impact
the operation of the rest of the system.  Cpuhotplug is currently used
as a resource management feature in virtualised system using dlpar
operations.  I do understand that cpu hotplug is invasive at run time
and that amount of complexity is required to carefully isolate the cpu
from any involvement in the running kernel.

Building another interface to isolate the cpus to the same extent as
cpu hotplug does today would be redundant and is going to be equally
invasive. Alternatives like cpuset for isolation migrates only tasks.

--Vaidy

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Joakim Tjernlund @ 2009-09-28 10:02 UTC (permalink / raw)
  Cc: Rex Feany, linuxppc-dev@ozlabs.org
In-Reply-To: <OF0EEAC985.DECE2C31-ONC125763F.002527BE-C125763F.002882FD@transmode.se>

>
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 28/09/2009 05:21:00:
> >
> > On Sun, 2009-09-27 at 15:22 +0200, Joakim Tjernlund wrote:
> > > > However, adding tlbil_va() to ptep_set_access_flags() as you suggested
> > > > makes everything happy. I need to test it some more, but it looks good
> > > > so far. Below is what I am testing now.
> > >
> > > 8xx, is getting very hacky and I suspect that the only long term fix is
> > > add code to trap the cache instructions in TLB error/miss and fixup the
> > > exception in page fault handler. This will also have the added benefit on being able
> > > to use the cache instructions in both kernel and user space like any other
> > > ppc arch.
> >
> > First I'd like to understand exactly what's happening today, since
> > it makes little sense :-) I suppose I'll have to get myself some
> > 8xx doco and understand how the bloody MMU works.
>
> hmm, I recall that the reason for the currect "force a TLB error"
> procedure was to fix a problem Montavista had, possibly Tom Rini.
> He fixed the problem differently at first but later it
> was changed to what is in use today.
>
> If you have access to the old linuxppc-embedded you will find a lot
> of info in a thread with subject "[PATCH 2.6.14] mm: 8xx MM fix for"
> Also I made a ref that I cannot access anymore:
> http://ozlabs.org/pipermail/linuxppc-embedded/2005-January/016382.html

Found a link to the original problem
 http://www.mail-archive.com/linuxppc-embedded@ozlabs.org/msg03376.html
seems like I had something to do with the solution that is haunting
8xx now :(
Basically 8xx should revert back to the fix suggested in the above link.

^ permalink raw reply

* Re: 64bit kernel is huge
From: Michael Ellerman @ 2009-09-28  9:13 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Anton Blanchard
In-Reply-To: <1254125220.8421.21.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 1612 bytes --]

On Mon, 2009-09-28 at 18:07 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2009-09-28 at 17:45 +1000, Anton Blanchard wrote:
> > Hi,
> > 
> > I've found at least one machine that wont boot 2.6.31-rc* with a 
> > pseries_defconfig. If I move real-base from 0xc00000 to 0xd00000 it
> > boots fine.
> > 
> > # size vmlinux
> >    text	   data	    bss	    dec	    hex	filename
> > 9812942	1982496	1105228	12900666	 c4d93a	vmlinux
> > 
> > Looks like we blow right through the 12MB mark. It desperately needs to eat
> > less and lose weight.

> > 262144  kstat_irqs_all
> > 131072  irq_desc
> > 16384   irq_stat
> > 
> > Could we dynamically allocate our irq structures?
> 
> We still want one big array, unless we go to sparse IRQ numbering like
> x86 but we'd have to also adapt the remapping stuff. Definitely to put
> on a list somewhere for people who want to pick up something to do :-)

Actually I've already been looking at it. The easy step is to enable
SPARSE_IRQ, the harder step is to get rid of our irq_map. But that's not
that big.

I'll try and get them polished for the next merge window.

> It's hard to properly dynamically size it. I'd rather have a "capacity"
> of _lots_ and sparsely populate the array (a tree ?) because we never
> know with MSIs etc... how many we'll really need.
> 
> At the -very-least- we could make NR_IRQS a CONFIG option.

Yeah I have a patch for that too, I'll post it. It's no good for distros
but for custom kernels it could be quite handy, small partitions with
virtual most things don't really need many interrupts.

cheers

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply

* Re: 64bit kernel is huge
From: Benjamin Herrenschmidt @ 2009-09-28  8:07 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: linuxppc-dev
In-Reply-To: <20090928074503.GB16073@kryten>

On Mon, 2009-09-28 at 17:45 +1000, Anton Blanchard wrote:
> Hi,
> 
> I've found at least one machine that wont boot 2.6.31-rc* with a 
> pseries_defconfig. If I move real-base from 0xc00000 to 0xd00000 it
> boots fine.
> 
> # size vmlinux
>    text	   data	    bss	    dec	    hex	filename
> 9812942	1982496	1105228	12900666	 c4d93a	vmlinux
> 
> Looks like we blow right through the 12MB mark. It desperately needs to eat
> less and lose weight.

Depends what you have enabled tho. Known killers are CONFIG_RELOCATABLE,
the FTRACE stuff.

> Here are some of the problem areas:
> 
> 788224  kallsyms_names
> 537600  kallsyms_addresses
> 
> I guess mostly CONFIG_KALLSYMS_ALL.

Yeah, those are full on. Maybe we could compress them ? It should
compress well... but then, zImage does that already.

> 262144  kstat_irqs_all
> 131072  irq_desc
> 16384   irq_stat
> 
> Could we dynamically allocate our irq structures?

We still want one big array, unless we go to sparse IRQ numbering like
x86 but we'd have to also adapt the remapping stuff. Definitely to put
on a list somewhere for people who want to pick up something to do :-)

It's hard to properly dynamically size it. I'd rather have a "capacity"
of _lots_ and sparsely populate the array (a tree ?) because we never
know with MSIs etc... how many we'll really need.

At the -very-least- we could make NR_IRQS a CONFIG option.

> 151912  __start_mcount_loc
> 131076  map_pid_to_cmdline
> 
> ftrace stuff. With a name like map_pid_to_cmdline I'm pretty sure I'm not
> going to like what it does.

:-) No idea in fact

> 131072  lppaca
> 65536   paca
> 
> I think we've attacked these before, not sure if there is anything left
> we can trim.

Doubt it.

> 131072  __log_buf
> 
> Looks like we can dynamically allocate a large log buf at runtime. Perhaps
> we should default to a small log_buf and grow it at boot based on machine size
> (eg max cpus).

Ah, it's a new feature I have missed.

> 101160  powerpc_opcodes
> 
> xmon instruction disassembly. I'd probably prefer to cut off my right hand and
> debug one handed before losing this though.

It's already a CONFIG_OPTION for those who prefer coding with their
feet :-)

> 87600   __start___bug_table
> 
> Can't do much about this.

Appart from having no bugs :-)

> 77452   _fw_acenic_tg2_bin_bin
> 
> We could probably change acenic to be a module, 

Right.

> 46464   kmalloc_caches
> 32768   read_buffers
> 32768   mem_section
> 21816   hstates
> 20480   node_devices
> 14336   bootmem_node_data
> 
> mm stuff.

Also if ftrace is enabled, -pg is going to bloat the shit our of
everything (and slow everything down, gcc becomes really silly)

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Joakim Tjernlund @ 2009-09-28  7:43 UTC (permalink / raw)
  To: Jean Delvare; +Cc: linuxppc-dev, Esben Haabendal, linux-i2c
In-Reply-To: <20090928093434.57157435@hyperion.delvare>

Jean Delvare <khali@linux-fr.org> wrote on 28/09/2009 09:34:34:
>
> On Mon, 28 Sep 2009 09:30:32 +0200, Joakim Tjernlund wrote:
> > Jean Delvare <khali@linux-fr.org> wrote on 28/09/2009 09:28:09:
> > >
> > > On Mon, 28 Sep 2009 00:26:54 +0200, Joakim Tjernlund wrote:
> > > > Jean, I just noticed you pull request for i2c on LKML but I didn't see this
> > > > patch nor have I got any feedback from you. What is your view?
> > >
> > > My view is that i2c-mpc is nor under my jurisdiction, so I did not, and
> > > will not, pay any attention to it.
> >
> > Ah, that explains it. Who then will look after i2c-mpc? Kumar?
>
> What does MAINTAINERS say?

I did a grep for i2c-mpc and nothing turned up. Now that someone mentioned
Ben Dooks I found it.

^ permalink raw reply

* 64bit kernel is huge
From: Anton Blanchard @ 2009-09-28  7:45 UTC (permalink / raw)
  To: linuxppc-dev


Hi,

I've found at least one machine that wont boot 2.6.31-rc* with a 
pseries_defconfig. If I move real-base from 0xc00000 to 0xd00000 it
boots fine.

# size vmlinux
   text	   data	    bss	    dec	    hex	filename
9812942	1982496	1105228	12900666	 c4d93a	vmlinux

Looks like we blow right through the 12MB mark. It desperately needs to eat
less and lose weight.

Here are some of the problem areas:

788224  kallsyms_names
537600  kallsyms_addresses

I guess mostly CONFIG_KALLSYMS_ALL.

262144  kstat_irqs_all
131072  irq_desc
16384   irq_stat

Could we dynamically allocate our irq structures?

151912  __start_mcount_loc
131076  map_pid_to_cmdline

ftrace stuff. With a name like map_pid_to_cmdline I'm pretty sure I'm not
going to like what it does.

131072  lppaca
65536   paca

I think we've attacked these before, not sure if there is anything left
we can trim.

131072  __log_buf

Looks like we can dynamically allocate a large log buf at runtime. Perhaps
we should default to a small log_buf and grow it at boot based on machine size
(eg max cpus).

101160  powerpc_opcodes

xmon instruction disassembly. I'd probably prefer to cut off my right hand and
debug one handed before losing this though.

87600   __start___bug_table

Can't do much about this.

77452   _fw_acenic_tg2_bin_bin

We could probably change acenic to be a module, 

46464   kmalloc_caches
32768   read_buffers
32768   mem_section
21816   hstates
20480   node_devices
14336   bootmem_node_data

mm stuff.

32096   _fw_tigon_tg3_tso5_bin_name
12184   .tg3_reset_hw
10124   .tg3_get_invariants

tg3

20804   .radeonfb_pm_init
18412   .radeon_pm_disable_dynamic_mode
15280   .radeon_write_mode

Radeon

26288   __start___ex_table
22632   PPC64_CACHES
20564   alg_test_descs
17784   ioctl_start
16384   xics_ipi_message
16384   vdso64_end
16384   stateid_hashtbl
16384   raw_devices
16384   lockstateid_hashtbl
16384   kexec_stack
16384   init_thread_union
16384   boot_pageset
15360   latency_record
14880   .do_balance
14008   .hidinput_connect
11920   kernel_config_data
10240   futex_queues
10044   .e1000_init_hw

Anton

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Joakim Tjernlund @ 2009-09-28  7:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <1254123286.8421.14.camel@pasglop>

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 28/09/2009 09:34:46:
>
> On Mon, 2009-09-28 at 09:22 +0200, Joakim Tjernlund wrote:
> >
> > > happy to stick with it until somebody comes up with a real good
> > reason
> > > to do more :-) 8xx is on life support and has been around for long
> >
> > yeah, I figured that too but Freescale seems to be cranking out new
> > variants still.
>
> Ugh ? And still not fixing the major core bugs ? Grmbl...

yes, I even had them chase down the problem so they know exactly where
it is :( I don't think it is in any errata though, but I haven't looked
for a few years.

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Jean Delvare @ 2009-09-28  7:28 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev, linux-i2c, Esben Haabendal
In-Reply-To: <OFDA26E787.32612566-ONC125763E.007AD2FD-C125763E.007B4FFC@transmode.se>

On Mon, 28 Sep 2009 00:26:54 +0200, Joakim Tjernlund wrote:
> Jean, I just noticed you pull request for i2c on LKML but I didn't see this
> patch nor have I got any feedback from you. What is your view?

My view is that i2c-mpc is nor under my jurisdiction, so I did not, and
will not, pay any attention to it.

-- 
Jean Delvare

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Benjamin Herrenschmidt @ 2009-09-28  7:34 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <OF0EEAC985.DECE2C31-ONC125763F.002527BE-C125763F.002882FD@transmode.se>

On Mon, 2009-09-28 at 09:22 +0200, Joakim Tjernlund wrote:
> 
> > happy to stick with it until somebody comes up with a real good
> reason
> > to do more :-) 8xx is on life support and has been around for long
> 
> yeah, I figured that too but Freescale seems to be cranking out new
> variants still.

Ugh ? And still not fixing the major core bugs ? Grmbl...

Ben.

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Jean Delvare @ 2009-09-28  7:34 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev, Esben Haabendal, linux-i2c
In-Reply-To: <OFC822309E.F9FB4F3C-ONC125763F.0029159C-C125763F.00293F85@transmode.se>

On Mon, 28 Sep 2009 09:30:32 +0200, Joakim Tjernlund wrote:
> Jean Delvare <khali@linux-fr.org> wrote on 28/09/2009 09:28:09:
> >
> > On Mon, 28 Sep 2009 00:26:54 +0200, Joakim Tjernlund wrote:
> > > Jean, I just noticed you pull request for i2c on LKML but I didn't see this
> > > patch nor have I got any feedback from you. What is your view?
> >
> > My view is that i2c-mpc is nor under my jurisdiction, so I did not, and
> > will not, pay any attention to it.
> 
> Ah, that explains it. Who then will look after i2c-mpc? Kumar?

What does MAINTAINERS say?

-- 
Jean Delvare

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Peter Korsgaard @ 2009-09-28  7:33 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: Esben Haabendal, linuxppc-dev, linux-i2c, Jean Delvare
In-Reply-To: <OFC822309E.F9FB4F3C-ONC125763F.0029159C-C125763F.00293F85@transmode.se>

>>>>> "Joakim" == Joakim Tjernlund <joakim.tjernlund@transmode.se> writes:

Hi,

 Joakim> Ah, that explains it. Who then will look after i2c-mpc? Kumar?

Ben Dooks (embedded i2c maintainer). He's afaik coming home today, so
give him a few days to catch up on mails.

-- 
Bye, Peter Korsgaard

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Joakim Tjernlund @ 2009-09-28  7:30 UTC (permalink / raw)
  To: Jean Delvare, galak; +Cc: linuxppc-dev, linux-i2c, Esben Haabendal
In-Reply-To: <20090928092809.7272b6e6@hyperion.delvare>

Jean Delvare <khali@linux-fr.org> wrote on 28/09/2009 09:28:09:
>
> On Mon, 28 Sep 2009 00:26:54 +0200, Joakim Tjernlund wrote:
> > Jean, I just noticed you pull request for i2c on LKML but I didn't see this
> > patch nor have I got any feedback from you. What is your view?
>
> My view is that i2c-mpc is nor under my jurisdiction, so I did not, and
> will not, pay any attention to it.

Ah, that explains it. Who then will look after i2c-mpc? Kumar?

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Joakim Tjernlund @ 2009-09-28  7:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <1254108060.8421.1.camel@pasglop>

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 28/09/2009 05:21:00:
>
> On Sun, 2009-09-27 at 15:22 +0200, Joakim Tjernlund wrote:
> > > However, adding tlbil_va() to ptep_set_access_flags() as you suggested
> > > makes everything happy. I need to test it some more, but it looks good
> > > so far. Below is what I am testing now.
> >
> > 8xx, is getting very hacky and I suspect that the only long term fix is
> > add code to trap the cache instructions in TLB error/miss and fixup the
> > exception in page fault handler. This will also have the added benefit on being able
> > to use the cache instructions in both kernel and user space like any other
> > ppc arch.
>
> First I'd like to understand exactly what's happening today, since
> it makes little sense :-) I suppose I'll have to get myself some
> 8xx doco and understand how the bloody MMU works.

hmm, I recall that the reason for the currect "force a TLB error"
procedure was to fix a problem Montavista had, possibly Tom Rini.
He fixed the problem differently at first but later it
was changed to what is in use today.

If you have access to the old linuxppc-embedded you will find a lot
of info in a thread with subject "[PATCH 2.6.14] mm: 8xx MM fix for"
Also I made a ref that I cannot access anymore:
http://ozlabs.org/pipermail/linuxppc-embedded/2005-January/016382.html

>
> Then, I saw your old patch and it's -very- invasive. If we can get away
> with a one liner just adding tlbil_va in the right place, I think I'm

Yes, my patch is invasive but that is because I did it in asm. If you do
it in the page fault handler, it will be much easier.
In principle you would only have to tag DAR and test for
the tag value in asm then branch to the page fault handler. Once there
it is easy to decode the insns.

> happy to stick with it until somebody comes up with a real good reason
> to do more :-) 8xx is on life support and has been around for long

yeah, I figured that too but Freescale seems to be cranking out new variants
still.

^ permalink raw reply

* Re: warning: allocated section `.data_nosave'  not in segment
From: Sean MacLennan @ 2009-09-28  5:36 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20090925165411.6c584401@lappy.seanm.ca>

If I revert the commit show at the bottom of this email, the warnings
go away.

It took a while to find this, git bisect kept pointing me to a file
from arch/alpha :(

Cheers,
   Sean


powerpc: Cleanup linker script using new linker script macros.



author	Tim Abbott <tabbott@ksplice.com>

	 Thu, 24 Sep 2009 14:36:24 +0000 (10:36 -0400)

committer	Linus Torvalds <torvalds@linux-foundation.org>

	 Fri, 25 Sep 2009 00:16:22 +0000 (17:16 -0700)

commit	62bef288588bee976b753f7168716621d7a984e2

tree	8f55ea7fc2c5d651b72d1addad5687d61b857910	tree |
snapshot

parent	4a5e35135d1ffcf14ebb2bb3c730b92c18ae9657	commit |
diff



powerpc: Cleanup linker script using new linker script macros.



Signed-off-by: Tim Abbott <tabbott@ksplice.com>

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Cc: Paul Mackerras <paulus@samba.org>

Cc: linuxppc-dev@ozlabs.org

Acked-by: Sam Ravnborg <sam@ravnborg.org>

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

^ permalink raw reply

* Re: GPIO driver for MPC8313.
From: Peter Korsgaard @ 2009-09-28  5:16 UTC (permalink / raw)
  To: Johnny Hung; +Cc: linuxppc-dev, linux-embedded
In-Reply-To: <cb9ecdfa0909271943o6133f5c8r9b25a25140e7842f@mail.gmail.com>

>>>>> "Johnny" == Johnny Hung <johnny.hacking@gmail.com> writes:

 Johnny> The kerne source I used now is 2.6.23 with MPC8313-erdb
 Johnny> pached. I think I should port 2.6.28 gpio function back to
 Johnny> 2.6.23. Is it a common way to implement it in the kernel I used
 Johnny> or I should port MPC8313-erdb pached to 2.6.28 opposite?

It shouldn't be that hard to back port, but I would certainly go for
2.6.28 (or rather 2.6.31) unless something keeps you at 2.6.23.

-- 
Bye, Peter Korsgaard

^ permalink raw reply

* [5/5] Split hash MMU specific hugepage code into a new file
From: David Gibson @ 2009-09-28  4:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <20090928043902.GA6302@yookeroo.seuss>

This patch separates the parts of hugetlbpage.c which are inherently
specific to the hash MMU into a new hugelbpage-hash64.c file.

Signed-off-by: David Gibson <dwg@au1.ibm.com>

---
 arch/powerpc/include/asm/hugetlb.h   |    3 
 arch/powerpc/mm/Makefile             |    5 -
 arch/powerpc/mm/hugetlbpage-hash64.c |  167 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/hugetlbpage.c        |  168 -----------------------------------
 4 files changed, 176 insertions(+), 167 deletions(-)

Index: working-2.6/arch/powerpc/mm/Makefile
===================================================================
--- working-2.6.orig/arch/powerpc/mm/Makefile	2009-09-28 13:51:57.000000000 +1000
+++ working-2.6/arch/powerpc/mm/Makefile	2009-09-28 13:53:21.000000000 +1000
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-y				+= hugetlbpage.o
+obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
Index: working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c	2009-09-28 13:53:21.000000000 +1000
@@ -0,0 +1,167 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+					pte_t pte, int trap, unsigned long sz)
+{
+	struct page *page;
+	int i;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return rflags;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);
+		} else {
+			rflags |= HPTE_R_N;
+		}
+	}
+	return rflags;
+}
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize)
+{
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa, sz;
+	long slot;
+	int err = 1;
+
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+	/* Search the Linux page table for a match with va */
+	va = hpt_va(ea, vsid, ssize);
+
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+	if (unlikely(access & ~pte_val(*ptep)))
+		goto out;
+	/*
+	 * At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY.
+	 */
+
+
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
+ 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	sz = ((1UL) << shift);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+						       trap, sz);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long hash, slot;
+
+		hash = hpt_hash(va, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
+					 ssize, local) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, shift, ssize);
+		unsigned long hpte_group;
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear HPTE slot informations in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+		/* Add in WIMG bits */
+		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_COHERENT | _PAGE_GUARDED));
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_psize, ssize);
+
+		/* Primary is full, try the secondary */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
+						  HPTE_V_SECONDARY,
+						  mmu_psize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP)&~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+                        }
+		}
+
+		if (unlikely(slot == -2))
+			panic("hash_huge_page: pte_insert failed\n");
+
+		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
+	err = 0;
+
+ out:
+	return err;
+}
Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c	2009-09-28 13:53:16.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c	2009-09-28 13:53:21.000000000 +1000
@@ -7,29 +7,17 @@
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
  */
 
-#include <linux/init.h>
-#include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 #include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 #define MAX_NUMBER_GPAGES	1024
 
 /* Tracks the 16G pages after the device tree is scanned and before the
@@ -507,158 +495,6 @@ unsigned long vma_mmu_pagesize(struct vm
 	return 1UL << mmu_psize_to_shift(psize);
 }
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-					pte_t pte, int trap, unsigned long sz)
-{
-	struct page *page;
-	int i;
-
-	if (!pfn_valid(pte_pfn(pte)))
-		return rflags;
-
-	page = pte_page(pte);
-
-	/* page is dirty */
-	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-		if (trap == 0x400) {
-			for (i = 0; i < (sz / PAGE_SIZE); i++)
-				__flush_dcache_icache(page_address(page+i));
-			set_bit(PG_arch_1, &page->flags);
-		} else {
-			rflags |= HPTE_R_N;
-		}
-	}
-	return rflags;
-}
-
-int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
-		     pte_t *ptep, unsigned long trap, int local, int ssize,
-		     unsigned int shift, unsigned int mmu_psize)
-{
-	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa, sz;
-	long slot;
-	int err = 1;
-
-	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
-
-	/* Search the Linux page table for a match with va */
-	va = hpt_va(ea, vsid, ssize);
-
-	/* 
-	 * Check the user's access rights to the page.  If access should be
-	 * prevented then send the problem up to do_page_fault.
-	 */
-	if (unlikely(access & ~pte_val(*ptep)))
-		goto out;
-	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
-	 * or update an HPTE. There are 2 cases:
-	 *
-	 * 1. There is a valid (present) pte with no associated HPTE (this is 
-	 *	the most common case)
-	 * 2. There is a valid (present) pte with an associated HPTE. The
-	 *	current values of the pp bits in the HPTE prevent access
-	 *	because we are doing software DIRTY bit management and the
-	 *	page is currently not DIRTY. 
-	 */
-
-
-	do {
-		old_pte = pte_val(*ptep);
-		if (old_pte & _PAGE_BUSY)
-			goto out;
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					 old_pte, new_pte));
-
-	rflags = 0x2 | (!(new_pte & _PAGE_RW));
- 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	sz = ((1UL) << shift);
-	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-		/* No CPU has hugepages but lacks no execute, so we
-		 * don't need to worry about that case */
-		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap, sz);
-
-	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
-		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot;
-
-		hash = hpt_hash(va, shift, ssize);
-		if (old_pte & _PAGE_F_SECOND)
-			hash = ~hash;
-		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> 12;
-
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
-					 ssize, local) == -1)
-			old_pte &= ~_PAGE_HPTEFLAGS;
-	}
-
-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, shift, ssize);
-		unsigned long hpte_group;
-
-		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
-		/* Add in WIMG bits */
-		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
-
-		/* Insert into the hash table, primary slot */
-		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_psize, ssize);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL; 
-			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
-						  HPTE_V_SECONDARY,
-						  mmu_psize, ssize);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP)&~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-                        }
-		}
-
-		if (unlikely(slot == -2))
-			panic("hash_huge_page: pte_insert failed\n");
-
-		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
-	}
-
-	/*
-	 * No need to use ldarx/stdcx here
-	 */
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
-
-	err = 0;
-
- out:
-	return err;
-}
-
 static int __init add_huge_page_size(unsigned long long size)
 {
 	int shift = __ffs(size);
Index: working-2.6/arch/powerpc/include/asm/hugetlb.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/hugetlb.h	2009-09-28 13:53:00.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/hugetlb.h	2009-09-28 13:53:21.000000000 +1000
@@ -13,6 +13,9 @@ static inline int hugepd_ok(hugepd_t hpd
 #define is_hugepd(pdep)               (hugepd_ok(*((hugepd_t *)(pdep))))
 #define HUGEPD_SHIFT_MASK     0x3f
 
+pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
+				 unsigned long addr, unsigned *shift);
+
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
 

^ permalink raw reply

* [4/5] Cleanup initialization of hugepages on powerpc
From: David Gibson @ 2009-09-28  4:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <20090928043902.GA6302@yookeroo.seuss>

This patch simplifies the logic used to initialize hugepages on
powerpc.  The somewhat oddly named set_huge_psize() is renamed to
add_huge_page_size() and now does all necessary verification of
whether it's given a valid hugepage sizes (instead of just some) and
instantiates the generic hstate structure (but no more).  

hugetlbpage_init() now steps through the available pagesizes, checks
if they're valid for hugepages by calling add_huge_page_size() and
initializes the kmem_caches for the hugepage pagetables.  This means
we can now eliminate the mmu_huge_psizes array, since we no longer
need to pass the sizing information for the pagetable caches from
set_huge_psize() into hugetlbpage_init()

Signed-off-by: David Gibson <dwg@au1.ibm.com>

---
 arch/powerpc/mm/hugetlbpage.c |  106 +++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c	2009-09-28 13:53:42.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c	2009-09-28 14:02:34.000000000 +1000
@@ -37,11 +37,6 @@
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
- * stored for the huge page sizes that are valid.
- */
-static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
@@ -502,8 +497,6 @@ unsigned long hugetlb_get_unmapped_area(
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
-	if (!mmu_huge_psizes[mmu_psize])
-		return -EINVAL;
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
@@ -666,47 +659,46 @@ repeat:
 	return err;
 }
 
-static void __init set_huge_psize(int psize)
+static int __init add_huge_page_size(unsigned long long size)
 {
-	unsigned pdshift;
+	int shift = __ffs(size);
+	int mmu_psize;
 
 	/* Check that it is a page size supported by the hardware and
-	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift &&
-		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
-		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
-		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-		/* Return if huge page size has already been setup or is the
-		 * same as the base page size. */
-		if (mmu_huge_psizes[psize] ||
-		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
-			return;
-		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+	 * that it fits within pagetable and slice limits. */
+	if (!is_power_of_2(size)
+	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+		return -EINVAL;
 
-		if (mmu_psize_defs[psize].shift < PMD_SHIFT)
-			pdshift = PMD_SHIFT;
-		else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
-			pdshift = PUD_SHIFT;
-		else
-			pdshift = PGDIR_SHIFT;
-		mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
-	}
+	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+		return -EINVAL;
+
+#ifdef CONFIG_SPU_FS_64K_LS
+	/* Disable support for 64K huge pages when 64K SPU local store
+	 * support is enabled as the current implementation conflicts.
+	 */
+	if (shift == PAGE_SHIFT_64K)
+		return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
+
+	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
+
+	/* Return if huge page size has already been setup */
+	if (size_to_hstate(size))
+		return 0;
+
+	hugetlb_add_hstate(shift - PAGE_SHIFT);
+
+	return 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
 	unsigned long long size;
-	int mmu_psize;
-	int shift;
 
 	size = memparse(str, &str);
 
-	shift = __ffs(size);
-	mmu_psize = shift_to_mmu_psize(shift);
-	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
-		set_huge_psize(mmu_psize);
-	else
+	if (add_huge_page_size(size) != 0)
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
 	return 1;
@@ -720,31 +712,31 @@ static int __init hugetlbpage_init(void)
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
 
-	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-	 * sizes changes.
-	 */
-	set_huge_psize(MMU_PAGE_16M);
-	set_huge_psize(MMU_PAGE_16G);
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+		unsigned pdshift;
 
-	/* Temporarily disable support for 64K huge pages when 64K SPU local
-	 * store support is enabled as the current implementation conflicts.
-	 */
-#ifndef CONFIG_SPU_FS_64K_LS
-	set_huge_psize(MMU_PAGE_64K);
-#endif
+		if (!mmu_psize_defs[psize].shift)
+			continue;
 
-	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-		if (mmu_huge_psizes[psize]) {
-			pgtable_cache_add(mmu_huge_psizes[psize], NULL);
-			if (!PGT_CACHE(mmu_huge_psizes[psize]))
-				panic("hugetlbpage_init(): could not create "
-				      "pgtable cache for %d bit pagesize\n",
-				      mmu_psize_to_shift(psize));
-		}
+		shift = mmu_psize_to_shift(psize);
+
+		if (add_huge_page_size(1ULL << shift) < 0)
+			continue;
+
+		if (shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
+		else
+			pdshift = PGDIR_SHIFT;
+
+		pgtable_cache_add(pdshift - shift, NULL);
+		if (!PGT_CACHE(pdshift - shift))
+			panic("hugetlbpage_init(): could not create "
+			      "pgtable cache for %d bit pagesize\n", shift);
 	}
 
 	return 0;
 }
-
 module_init(hugetlbpage_init);

^ permalink raw reply

* [3/5] Allow more flexible layouts for hugepage pagetables
From: David Gibson @ 2009-09-28  4:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <20090928043902.GA6302@yookeroo.seuss>

Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level.  Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly.  Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.

This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity.  In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask.  This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.

This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out.  All other (hugepage)
pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug.  We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping).  This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson <dwg@au1.ibm.com>

---
 arch/powerpc/include/asm/hugetlb.h       |   12 
 arch/powerpc/include/asm/mmu-hash64.h    |   14 
 arch/powerpc/include/asm/pgtable-ppc64.h |   13 
 arch/powerpc/kernel/perf_callchain.c     |   20 -
 arch/powerpc/mm/gup.c                    |  149 +--------
 arch/powerpc/mm/hash_utils_64.c          |   17 -
 arch/powerpc/mm/hugetlbpage.c            |  473 ++++++++++++++-----------------
 arch/powerpc/mm/init_64.c                |   10 
 8 files changed, 302 insertions(+), 406 deletions(-)

Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c	2009-09-28 14:19:08.000000000 +1000
@@ -40,25 +40,11 @@ static unsigned nr_gpages;
 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  * stored for the huge page sizes that are valid.
  */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift			mmu_huge_psizes
-#define HUGEPTE_INDEX_SIZE(psize)	(mmu_huge_psizes[(psize)])
-#define PTRS_PER_HUGEPTE(psize)		(1 << mmu_huge_psizes[psize])
-
-#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-					 + HUGEPTE_INDEX_SIZE(psize))
-#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
+static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK	0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd)	((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_
 	BUG();
 }
 
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-	BUG_ON(!(hpd.pd & HUGEPD_OK));
-	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+	BUG_ON(!hugepd_ok(hpd));
+	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-				    struct hstate *hstate)
+static inline unsigned int hugepd_shift(hugepd_t hpd)
 {
-	unsigned int shift = huge_page_shift(hstate);
-	int psize = shift_to_mmu_psize(shift);
-	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+	return hpd.pd & HUGEPD_SHIFT_MASK;
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
+{
+	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
 	pte_t *dir = hugepd_page(*hpdp);
 
 	return dir + idx;
 }
 
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pdshift = PGDIR_SHIFT;
+
+	if (shift)
+		*shift = 0;
+
+	pg = pgdir + pgd_index(ea);
+	if (is_hugepd(pg)) {
+		hpdp = (hugepd_t *)pg;
+	} else if (!pgd_none(*pg)) {
+		pdshift = PUD_SHIFT;
+		pu = pud_offset(pg, ea);
+		if (is_hugepd(pu))
+			hpdp = (hugepd_t *)pu;
+		else if (!pud_none(*pu)) {
+			pdshift = PMD_SHIFT;
+			pm = pmd_offset(pu, ea);
+			if (is_hugepd(pm))
+				hpdp = (hugepd_t *)pm;
+			else if (!pmd_none(*pm)) {
+				return pte_offset_map(pm, ea);
+			}
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	if (shift)
+		*shift = hugepd_shift(*hpdp);
+	return hugepte_offset(hpdp, ea, pdshift);
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+}
+
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address, unsigned int psize)
+			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
 				       GFP_KERNEL|__GFP_REPEAT);
 
+	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
+
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
+		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
 	else
-		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
-
-static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_offset(pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
-			 struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PUD_SHIFT)
-		return pud_alloc(mm, pgd, addr);
-	else
-		return (pud_t *) pgd;
-}
-static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_offset(pud, addr);
-	else
-		return (pmd_t *) pud;
-}
-static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
-			 struct hstate *hstate)
-{
-	if (huge_page_shift(hstate) < PMD_SHIFT)
-		return pmd_alloc(mm, pud, addr);
-	else
-		return (pmd_t *) pud;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	unsigned pshift = __ffs(sz);
+	unsigned pdshift = PGDIR_SHIFT;
+
+	addr &= ~(sz-1);
+
+	pg = pgd_offset(mm, addr);
+	if (pshift >= PUD_SHIFT) {
+		hpdp = (hugepd_t *)pg;
+	} else {
+		pdshift = PUD_SHIFT;
+		pu = pud_alloc(mm, pg, addr);
+		if (pshift >= PMD_SHIFT) {
+			hpdp = (hugepd_t *)pu;
+		} else {
+			pdshift = PMD_SHIFT;
+			pm = pmd_alloc(mm, pu, addr);
+			hpdp = (hugepd_t *)pm;
+		}
+	}
+
+	if (!hpdp)
+		return NULL;
+
+	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr, pdshift);
 }
 
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -180,92 +221,38 @@ int alloc_bootmem_huge_page(struct hstat
 	return 1;
 }
 
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-
-	unsigned int psize;
-	unsigned int shift;
-	unsigned long sz;
-	struct hstate *hstate;
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_to_shift(psize);
-	sz = ((1UL) << shift);
-	hstate = size_to_hstate(sz);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	if (!pgd_none(*pg)) {
-		pu = hpud_offset(pg, addr, hstate);
-		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr, hstate);
-			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr,
-						      hstate);
-		}
-	}
-
-	return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
-			unsigned long addr, unsigned long sz)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	hugepd_t *hpdp = NULL;
-	struct hstate *hstate;
-	unsigned int psize;
-	hstate = size_to_hstate(sz);
-
-	psize = get_slice_psize(mm, addr);
-	BUG_ON(!mmu_huge_psizes[psize]);
-
-	addr &= hstate->mask;
-
-	pg = pgd_offset(mm, addr);
-	pu = hpud_alloc(mm, pg, addr, hstate);
-
-	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr, hstate);
-		if (pm)
-			hpdp = (hugepd_t *)pm;
-	}
-
-	if (! hpdp)
-		return NULL;
-
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
-		return NULL;
-
-	return hugepte_offset(hpdp, addr, hstate);
-}
-
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
-			       unsigned int psize)
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+			      unsigned long start, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
+	unsigned shift = hugepd_shift(*hpdp);
+	unsigned long pdmask = ~((1UL << pdshift) - 1);
+
+	start &= pdmask;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= pdmask;
+		if (! ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
+	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
-				   unsigned long floor, unsigned long ceiling,
-				   unsigned int psize)
+				   unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -277,7 +264,8 @@ static void hugetlb_free_pmd_range(struc
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd))
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+				  addr, next, floor, ceiling);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -303,23 +291,19 @@ static void hugetlb_free_pud_range(struc
 	pud_t *pud;
 	unsigned long next;
 	unsigned long start;
-	unsigned int shift;
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
-	shift = mmu_psize_to_shift(psize);
 
 	start = addr;
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (shift < PMD_SHIFT) {
+		if (!is_hugepd(pud)) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-					       ceiling, psize);
+					       ceiling);
 		} else {
-			if (pud_none(*pud))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pud++, addr = next, addr != end);
 
@@ -350,74 +334,34 @@ void hugetlb_free_pgd_range(struct mmu_g
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start;
 
 	/*
-	 * Comments below take from the normal free_pgd_range().  They
-	 * apply here too.  The tests against HUGEPD_MASK below are
-	 * essential, because we *don't* test for this at the bottom
-	 * level.  Without them we'll attempt to free a hugepte table
-	 * when we unmap just part of it, even if there are other
-	 * active mappings using it.
+	 * Because there are a number of different possible pagetable
+	 * layouts for hugepage ranges, we limit knowledge of how
+	 * things should be laid out to the allocation path
+	 * (huge_pte_alloc(), above).  Everything else works out the
+	 * structure as it goes from information in the hugepd
+	 * pointers.  That means that we can't here use the
+	 * optimization used in the normal page free_pgd_range(), of
+	 * checking whether we're actually covering a large enough
+	 * range to have to do anything at the top level of the walk
+	 * instead of at the bottom.
 	 *
-	 * The next few lines have given us lots of grief...
-	 *
-	 * Why are we testing HUGEPD* at this top level?  Because
-	 * often there will be no work to do at all, and we'd prefer
-	 * not to go all the way down to the bottom just to discover
-	 * that.
-	 *
-	 * Why all these "- 1"s?  Because 0 represents both the bottom
-	 * of the address space and the top of it (using -1 for the
-	 * top wouldn't help much: the masks would do the wrong thing).
-	 * The rule is that addr 0 and floor 0 refer to the bottom of
-	 * the address space, but end 0 and ceiling 0 refer to the top
-	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
-	 * that end 0 case should be mythical).
-	 *
-	 * Wherever addr is brought up or ceiling brought down, we
-	 * must be careful to reject "the opposite 0" before it
-	 * confuses the subsequent tests.  But what about where end is
-	 * brought down by HUGEPD_SIZE below? no, end can't go down to
-	 * 0 there.
-	 *
-	 * Whereas we round start (addr) and ceiling down, by different
-	 * masks at different levels, in order to test whether a table
-	 * now has no other vmas using it, so can be freed, we don't
-	 * bother to round floor or end up - the tests don't need that.
+	 * To make sense of this, you should probably go read the big
+	 * block comment at the top of the normal free_pgd_range(),
+	 * too.
 	 */
-	unsigned int psize = get_slice_psize(tlb->mm, addr);
 
-	addr &= HUGEPD_MASK(psize);
-	if (addr < floor) {
-		addr += HUGEPD_SIZE(psize);
-		if (!addr)
-			return;
-	}
-	if (ceiling) {
-		ceiling &= HUGEPD_MASK(psize);
-		if (!ceiling)
-			return;
-	}
-	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE(psize);
-	if (addr > end - 1)
-		return;
-
-	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		psize = get_slice_psize(tlb->mm, addr);
-		BUG_ON(!mmu_huge_psizes[psize]);
 		next = pgd_addr_end(addr, end);
-		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
-			if (pgd_none(*pgd))
-				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+					  addr, next, floor, ceiling);
 		}
 	} while (pgd++, addr = next, addr != end);
 }
@@ -448,19 +392,19 @@ follow_huge_addr(struct mm_struct *mm, u
 {
 	pte_t *ptep;
 	struct page *page;
-	unsigned int mmu_psize = get_slice_psize(mm, address);
+	unsigned shift;
+	unsigned long mask;
+
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!mmu_huge_psizes[mmu_psize])
+	if (!ptep || !shift)
 		return ERR_PTR(-EINVAL);
 
-	ptep = huge_pte_offset(mm, address);
+	mask = (1UL << shift) - 1;
 	page = pte_page(*ptep);
-	if (page) {
-		unsigned int shift = mmu_psize_to_shift(mmu_psize);
-		unsigned long sz = ((1UL) << shift);
-		page += (address % sz) / PAGE_SIZE;
-	}
+	if (page)
+		page += (address & mask) / PAGE_SIZE;
 
 	return page;
 }
@@ -483,6 +427,73 @@ follow_huge_pmd(struct mm_struct *mm, un
 	return NULL;
 }
 
+static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT | _PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		while (*nr) {
+			put_page(page);
+			(*nr)--;
+		}
+	}
+
+	return 1;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+	       unsigned long addr, unsigned long end,
+	       int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(*hugepd);
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+			return 0;
+	} while (ptep++, addr += sz, addr != end);
+
+	return 1;
+}
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
@@ -530,34 +541,20 @@ static unsigned int hash_huge_page_do_la
 	return rflags;
 }
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-		   unsigned long ea, unsigned long vsid, int local,
-		   unsigned long trap)
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize)
 {
-	pte_t *ptep;
 	unsigned long old_pte, new_pte;
 	unsigned long va, rflags, pa, sz;
 	long slot;
 	int err = 1;
-	int ssize = user_segment_size(ea);
-	unsigned int mmu_psize;
-	int shift;
-	mmu_psize = get_slice_psize(mm, ea);
 
-	if (!mmu_huge_psizes[mmu_psize])
-		goto out;
-	ptep = huge_pte_offset(mm, ea);
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
 
 	/* Search the Linux page table for a match with va */
 	va = hpt_va(ea, vsid, ssize);
 
-	/*
-	 * If no pte found or not present, send the problem up to
-	 * do_page_fault
-	 */
-	if (unlikely(!ptep || pte_none(*ptep)))
-		goto out;
-
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
@@ -588,7 +585,6 @@ int hash_huge_page(struct mm_struct *mm,
 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-	shift = mmu_psize_to_shift(mmu_psize);
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
@@ -672,6 +668,8 @@ repeat:
 
 static void __init set_huge_psize(int psize)
 {
+	unsigned pdshift;
+
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable limits. */
 	if (mmu_psize_defs[psize].shift &&
@@ -686,29 +684,14 @@ static void __init set_huge_psize(int ps
 			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
-		switch (mmu_psize_defs[psize].shift) {
-		case PAGE_SHIFT_64K:
-		    /* We only allow 64k hpages with 4k base page,
-		     * which was checked above, and always put them
-		     * at the PMD */
-		    hugepte_shift[psize] = PMD_SHIFT;
-		    break;
-		case PAGE_SHIFT_16M:
-		    /* 16M pages can be at two different levels
-		     * of pagestables based on base page size */
-		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
-			    hugepte_shift[psize] = PMD_SHIFT;
-		    else /* 4k base page */
-			    hugepte_shift[psize] = PUD_SHIFT;
-		    break;
-		case PAGE_SHIFT_16G:
-		    /* 16G pages are always at PGD level */
-		    hugepte_shift[psize] = PGDIR_SHIFT;
-		    break;
-		}
-		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
-	} else
-		hugepte_shift[psize] = 0;
+		if (mmu_psize_defs[psize].shift < PMD_SHIFT)
+			pdshift = PMD_SHIFT;
+		else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
+			pdshift = PUD_SHIFT;
+		else
+			pdshift = PGDIR_SHIFT;
+		mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
+	}
 }
 
 static int __init hugepage_setup_sz(char *str)
@@ -732,7 +715,7 @@ __setup("hugepagesz=", hugepage_setup_sz
 
 static int __init hugetlbpage_init(void)
 {
-	unsigned int psize;
+	int psize;
 
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
@@ -753,8 +736,8 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			pgtable_cache_add(hugepte_shift[psize], NULL);
-			if (!PGT_CACHE(hugepte_shift[psize]))
+			pgtable_cache_add(mmu_huge_psizes[psize], NULL);
+			if (!PGT_CACHE(mmu_huge_psizes[psize]))
 				panic("hugetlbpage_init(): could not create "
 				      "pgtable cache for %d bit pagesize\n",
 				      mmu_psize_to_shift(psize));
Index: working-2.6/arch/powerpc/include/asm/hugetlb.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/hugetlb.h	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/hugetlb.h	2009-09-28 14:19:08.000000000 +1000
@@ -3,6 +3,15 @@
 
 #include <asm/page.h>
 
+typedef struct { signed long pd; } hugepd_t;
+
+static inline int hugepd_ok(hugepd_t hpd)
+{
+	return (hpd.pd > 0);
+}
+
+#define is_hugepd(pdep)               (hugepd_ok(*((hugepd_t *)(pdep))))
+#define HUGEPD_SHIFT_MASK     0x3f
 
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
@@ -17,6 +26,9 @@ void set_huge_pte_at(struct mm_struct *m
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep);
 
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
+	       unsigned long end, int write, struct page **pages, int *nr);
+
 /*
  * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
  * to override the version in mm/hugetlb.c
Index: working-2.6/arch/powerpc/mm/init_64.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/init_64.c	2009-09-28 14:19:03.000000000 +1000
+++ working-2.6/arch/powerpc/mm/init_64.c	2009-09-28 14:19:08.000000000 +1000
@@ -41,6 +41,7 @@
 #include <linux/module.h>
 #include <linux/poison.h>
 #include <linux/lmb.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -128,8 +129,13 @@ void pgtable_cache_add(unsigned shift, v
 	unsigned long align = table_size;
 	/* When batching pgtable pointers for RCU freeing, we store
 	 * the index size in the low bits.  Table alignment must be
-	 * big enough to fit it */
-	unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+	 * big enough to fit it.
+	 *
+	 * Likewise, hugeapge pagetable pointers contain a (different)
+	 * shift value in the low bits.  All tables must be aligned so
+	 * as to leave enough 0 bits in the address to contain it. */
+	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+				     HUGEPD_SHIFT_MASK + 1);
 	struct kmem_cache *new;
 
 	/* It would be nice if this was a BUILD_BUG_ON(), but at the
Index: working-2.6/arch/powerpc/include/asm/pgtable-ppc64.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/pgtable-ppc64.h	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/pgtable-ppc64.h	2009-09-28 14:19:08.000000000 +1000
@@ -379,7 +379,18 @@ void pgtable_cache_init(void);
 	return pt;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+					       unsigned *shift)
+{
+	if (shift)
+		*shift = 0;
+	return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */
 
 #endif /* __ASSEMBLY__ */
 
Index: working-2.6/arch/powerpc/mm/gup.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/gup.c	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/mm/gup.c	2009-09-28 14:19:08.000000000 +1000
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t 
 	return 1;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
-				 unsigned long *addr, unsigned long end,
-				 int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	unsigned long pte_end;
-	struct page *head, *page;
-	pte_t pte;
-	int refs;
-
-	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = *ptep;
-	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
-		mask |= _PAGE_RW;
-	if ((pte_val(pte) & mask) != mask)
-		return 0;
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (*addr += PAGE_SIZE, *addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-		/* Could be optimized better */
-		while (*nr) {
-			put_page(page);
-			(*nr)--;
-		}
-	}
-
-	return 1;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		int write, struct page **pages, int *nr)
 {
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsi
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(pmd))
 			return 0;
-		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+		if (is_hugepd(pmdp)) {
+			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);
 
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsi
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
 			return 0;
-		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+		if (is_hugepd(pudp)) {
+			if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+					addr, next, write, pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
 			return 0;
 	} while (pudp++, addr = next, addr != end);
 
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long st
 	unsigned long next;
 	pgd_t *pgdp;
 	int nr = 0;
-#ifdef CONFIG_PPC64
-	unsigned int shift;
-	int psize;
-#endif
 
 	pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
 
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long st
 
 	pr_devel("  aligned: %lx .. %lx\n", start, end);
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We bail out on slice boundary crossing when hugetlb is
-	 * enabled in order to not have to deal with two different
-	 * page table formats
-	 */
-	if (addr < SLICE_LOW_TOP) {
-		if (end > SLICE_LOW_TOP)
-			goto slow_irqon;
-
-		if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
-			     GET_LOW_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	} else {
-		if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
-			     GET_HIGH_SLICE_INDEX(end - 1)))
-			goto slow_irqon;
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 	/*
 	 * XXX: batch / limit 'nr', to avoid large irq off latency
 	 * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long st
 	 */
 	local_irq_disable();
 
-#ifdef CONFIG_PPC64
-	/* Those bits are related to hugetlbfs implementation and only exist
-	 * on 64-bit for now
-	 */
-	psize = get_slice_psize(mm, addr);
-	shift = mmu_psize_defs[psize].shift;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (unlikely(mmu_huge_psizes[psize])) {
-		pte_t *ptep;
-		unsigned long a = addr;
-		unsigned long sz = ((1UL) << shift);
-		struct hstate *hstate = size_to_hstate(sz);
-
-		BUG_ON(!hstate);
-		/*
-		 * XXX: could be optimized to avoid hstate
-		 * lookup entirely (just use shift)
-		 */
-
-		do {
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
-			ptep = huge_pte_offset(mm, a);
-			pr_devel(" %016lx: huge ptep %p\n", a, ptep);
-			if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
-						   &nr))
-				goto slow;
-		} while (a != end);
-	} else
-#endif /* CONFIG_HUGETLB_PAGE */
-	{
-		pgdp = pgd_offset(mm, addr);
-		do {
-			pgd_t pgd = *pgdp;
-
-#ifdef CONFIG_PPC64
-			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
-#endif
-			pr_devel("  %016lx: normal pgd %p\n", addr,
-				 (void *)pgd_val(pgd));
-			next = pgd_addr_end(addr, end);
-			if (pgd_none(pgd))
-				goto slow;
-			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		pr_devel("  %016lx: normal pgd %p\n", addr,
+			 (void *)pgd_val(pgd));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (is_hugepd(pgdp)) {
+			if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+					addr, next, write, pages, &nr))
 				goto slow;
-		} while (pgdp++, addr = next, addr != end);
-	}
+		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+
 	local_irq_enable();
 
 	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
Index: working-2.6/arch/powerpc/kernel/perf_callchain.c
===================================================================
--- working-2.6.orig/arch/powerpc/kernel/perf_callchain.c	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/kernel/perf_callchain.c	2009-09-28 14:19:08.000000000 +1000
@@ -119,13 +119,6 @@ static void perf_callchain_kernel(struct
 }
 
 #ifdef CONFIG_PPC64
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define is_huge_psize(pagesize)	(HPAGE_SHIFT && mmu_huge_psizes[pagesize])
-#else
-#define is_huge_psize(pagesize)	0
-#endif
-
 /*
  * On 64-bit we don't want to invoke hash_page on user addresses from
  * interrupt context, so if the access faults, we read the page tables
@@ -135,7 +128,7 @@ static int read_user_stack_slow(void __u
 {
 	pgd_t *pgdir;
 	pte_t *ptep, pte;
-	int pagesize;
+	unsigned shift;
 	unsigned long addr = (unsigned long) ptr;
 	unsigned long offset;
 	unsigned long pfn;
@@ -145,17 +138,14 @@ static int read_user_stack_slow(void __u
 	if (!pgdir)
 		return -EFAULT;
 
-	pagesize = get_slice_psize(current->mm, addr);
+	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
 
 	/* align address to page boundary */
-	offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+	offset = addr & ((1UL << shift) - 1);
 	addr -= offset;
 
-	if (is_huge_psize(pagesize))
-		ptep = huge_pte_offset(current->mm, addr);
-	else
-		ptep = find_linux_pte(pgdir, addr);
-
 	if (ptep == NULL)
 		return -EFAULT;
 	pte = *ptep;
Index: working-2.6/arch/powerpc/mm/hash_utils_64.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hash_utils_64.c	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hash_utils_64.c	2009-09-28 14:19:08.000000000 +1000
@@ -891,6 +891,7 @@ int hash_page(unsigned long ea, unsigned
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
+	unsigned hugeshift;
 	const struct cpumask *tmp;
 	int rc, user_region = 0, local = 0;
 	int psize, ssize;
@@ -943,14 +944,6 @@ int hash_page(unsigned long ea, unsigned
 	if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
 		local = 1;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* Handle hugepage regions */
-	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
-		DBG_LOW(" -> huge page !\n");
-		return hash_huge_page(mm, access, ea, vsid, local, trap);
-	}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 #ifndef CONFIG_PPC_64K_PAGES
 	/* If we use 4K pages and our psize is not 4K, then we are hitting
 	 * a special driver mapping, we need to align the address before
@@ -961,12 +954,18 @@ int hash_page(unsigned long ea, unsigned
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = find_linux_pte(pgdir, ea);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		return 1;
 	}
 
+#ifdef CONFIG_HUGETLB_PAGE
+	if (hugeshift)
+		return __hash_page_huge(ea, access, vsid, ptep, trap, local,
+					ssize, hugeshift, psize);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
 #else
Index: working-2.6/arch/powerpc/include/asm/mmu-hash64.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/mmu-hash64.h	2009-09-28 14:17:38.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/mmu-hash64.h	2009-09-28 14:19:08.000000000 +1000
@@ -173,14 +173,6 @@ extern unsigned long tce_alloc_start, tc
  */
 extern int mmu_ci_restrictions;
 
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * The page size indexes of the huge pages for use by hugetlbfs
- */
-extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];
-
-#endif /* CONFIG_HUGETLB_PAGE */
-
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * for the page size
@@ -254,9 +246,9 @@ extern int __hash_page_64K(unsigned long
 			   unsigned int local, int ssize);
 struct mm_struct;
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
-extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
-			  unsigned long ea, unsigned long vsid, int local,
-			  unsigned long trap);
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, int local, int ssize,
+		     unsigned int shift, unsigned int mmu_psize);
 
 extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 			     unsigned long pstart, unsigned long prot,

^ permalink raw reply

* [1/5] Make hpte_need_flush() correctly mask for multiple page sizes
From: David Gibson @ 2009-09-28  4:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <20090928043902.GA6302@yookeroo.seuss>

Currently, hpte_need_flush() only correctly flushes the given address
for normal pages.  Callers for hugepages are required to mask the
address themselves.

But hpte_need_flush() already looks up the page sizes for its own
reasons, so this is a rather silly imposition on the callers.  This
patch alters it to mask based on the pagesize it has looked up itself,
and removes the awkward masking code in the hugepage caller.

Signed-off-by: David Gibson <dwg@au1.ibm.com>

---
 arch/powerpc/mm/hugetlbpage.c |    6 +-----
 arch/powerpc/mm/tlb_hash64.c  |    8 +++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

Index: working-2.6/arch/powerpc/mm/tlb_hash64.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/tlb_hash64.c	2009-09-04 14:35:30.000000000 +1000
+++ working-2.6/arch/powerpc/mm/tlb_hash64.c	2009-09-04 14:36:12.000000000 +1000
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *m
 
 	i = batch->index;
 
-	/* We mask the address for the base page size. Huge pages will
-	 * have applied their own masking already
-	 */
-	addr &= PAGE_MASK;
-
 	/* Get page size (maybe move back to caller).
 	 *
 	 * NOTE: when using special 64K mappings in 4K environment like
@@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *m
 	} else
 		psize = pte_pagesize_index(mm, addr, pte);
 
+	/* Mask the address for the correct page size */
+	addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+
 	/* Build full vaddr */
 	if (!is_kernel_addr(addr)) {
 		ssize = user_segment_size(addr);
Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c	2009-09-04 14:35:30.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c	2009-09-04 14:36:12.000000000 +1000
@@ -445,11 +445,7 @@ void set_huge_pte_at(struct mm_struct *m
 		 * necessary anymore if we make hpte_need_flush() get the
 		 * page size from the slices
 		 */
-		unsigned int psize = get_slice_psize(mm, addr);
-		unsigned int shift = mmu_psize_to_shift(psize);
-		unsigned long sz = ((1UL) << shift);
-		struct hstate *hstate = size_to_hstate(sz);
-		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
+		pte_update(mm, addr, ptep, ~0UL, 1);
 	}
 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }

^ permalink raw reply

* [2/5] Cleanup management of kmem_caches for pagetables
From: David Gibson @ 2009-09-28  4:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, linuxppc-dev
In-Reply-To: <20090928043902.GA6302@yookeroo.seuss>

Currently we have a fair bit of rather fiddly code to manage the
various kmem_caches used to store page tables of various levels.  We
generally have two caches holding some combination of PGD, PUD and PMD
tables, plus several more for the special hugepage pagetables.

This patch cleans this all up by taking a different approach.  Rather
than the caches being designated as for PUDs or for hugeptes for 16M
pages, the caches are simply allocated to be a specific size.  Thus
sharing of caches between different types/levels of pagetables happens
naturally.  The pagetable size, where needed, is passed around encoded
in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is n where the
pagetable contains 2^n pointers.

Signed-off-by: David Gibson <dwg@au1.ibm.com>

---
 arch/powerpc/include/asm/pgalloc-64.h    |   51 ++++++++++++++---------------
 arch/powerpc/include/asm/pgalloc.h       |   30 ++---------------
 arch/powerpc/include/asm/pgtable-ppc64.h |    1 
 arch/powerpc/mm/hugetlbpage.c            |   45 ++++++-------------------
 arch/powerpc/mm/init_64.c                |   54 ++++++++++++++++++-------------
 arch/powerpc/mm/pgtable.c                |   25 +++++++++-----
 6 files changed, 92 insertions(+), 114 deletions(-)

Index: working-2.6/arch/powerpc/mm/init_64.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/init_64.c	2009-09-28 12:50:46.000000000 +1000
+++ working-2.6/arch/powerpc/mm/init_64.c	2009-09-28 14:19:03.000000000 +1000
@@ -119,30 +119,42 @@ static void pmd_ctor(void *addr)
 	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-	PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-	"pgd_cache", "pmd_cache",
-#else
-	"pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+	char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+	unsigned long align = table_size;
+	/* When batching pgtable pointers for RCU freeing, we store
+	 * the index size in the low bits.  Table alignment must be
+	 * big enough to fit it */
+	unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+	struct kmem_cache *new;
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+	if (PGT_CACHE(shift))
+		return; /* Already have a cache of this size */
+	align = max_t(unsigned long, align, minalign);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+	new = kmem_cache_create(name, table_size, table_size, 0, ctor);
+	PGT_CACHE(shift) = new;
+	pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
 
 void pgtable_cache_init(void)
 {
-	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+		panic("Couldn't allocate pgtable caches");
+	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h	2009-08-03 16:00:45.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/pgalloc-64.h	2009-09-28 13:53:42.000000000 +1000
@@ -11,27 +11,30 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
+/*
+ * This needs to be big enough to allow any pagetable sizes we need,
+ * but small enough to fit in the low bits of any page table pointer.
+ * In other words all pagetables, even tiny ones, must be aligned to
+ * allow at least enough low 0 bits to contain this value.
+ */
+#define MAX_PGTABLE_INDEX_SIZE	0xf
+
 #ifndef CONFIG_PPC_SUBPAGE_PROT
 static inline void subpage_prot_free(pgd_t *pgd) {}
 #endif
 
 extern struct kmem_cache *pgtable_cache[];
-
-#define PGD_CACHE_NUM		0
-#define PUD_CACHE_NUM		1
-#define PMD_CACHE_NUM		1
-#define HUGEPTE_CACHE_NUM	2
-#define PTE_NONCACHE_NUM	7  /* from GFP rather than kmem_cache */
+#define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
+	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	subpage_prot_free(pgd);
-	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -40,13 +43,13 @@ static inline void pgd_free(struct mm_st
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
+	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -78,13 +81,13 @@ static inline void pmd_populate_kernel(s
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+	return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+	kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
@@ -107,24 +110,22 @@ static inline pgtable_t pte_alloc_one(st
 	return page;
 }
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-	int cachenum = pgf.val & PGF_CACHENUM_MASK;
-
-	if (cachenum == PTE_NONCACHE_NUM)
-		free_page((unsigned long)p);
-	else
-		kmem_cache_free(pgtable_cache[cachenum], p);
+	if (!index_size)
+		free_page((unsigned long)table);
+	else {
+		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+		kmem_cache_free(PGT_CACHE(index_size), table);
+	}
 }
 
-#define __pmd_free_tlb(tlb, pmd,addr)		      \
-	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
-		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd, addr)		      \
+	pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)		      \
-	pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
-		PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
+	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
+
 #endif /* CONFIG_PPC_64K_PAGES */
 
 #define check_pgt_cache()	do { } while (0)
Index: working-2.6/arch/powerpc/include/asm/pgalloc.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/pgalloc.h	2009-08-14 16:07:54.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/pgalloc.h	2009-09-28 13:53:42.000000000 +1000
@@ -24,25 +24,6 @@ static inline void pte_free(struct mm_st
 	__free_page(ptepage);
 }
 
-typedef struct pgtable_free {
-	unsigned long val;
-} pgtable_free_t;
-
-/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored
- * and small enough to fit in the low bits of any naturally aligned page
- * table cache entry. Arbitrarily set to 0x1f, that should give us some
- * room to grow
- */
-#define PGF_CACHENUM_MASK	0x1f
-
-static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
-						unsigned long mask)
-{
-	BUG_ON(cachenum > PGF_CACHENUM_MASK);
-
-	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
-}
-
 #ifdef CONFIG_PPC64
 #include <asm/pgalloc-64.h>
 #else
@@ -50,12 +31,12 @@ static inline pgtable_free_t pgtable_fre
 #endif
 
 #ifdef CONFIG_SMP
-extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift);
 extern void pte_free_finish(void);
 #else /* CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
-	pgtable_free(pgf);
+	pgtable_free(table, shift);
 }
 static inline void pte_free_finish(void) { }
 #endif /* !CONFIG_SMP */
@@ -63,12 +44,9 @@ static inline void pte_free_finish(void)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage,
 				  unsigned long address)
 {
-	pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage),
-						PTE_NONCACHE_NUM,
-						PTE_TABLE_SIZE-1);
 	tlb_flush_pgtable(tlb, address);
 	pgtable_page_dtor(ptepage);
-	pgtable_free_tlb(tlb, pgf);
+	pgtable_free_tlb(tlb, page_address(ptepage), 0);
 }
 
 #endif /* __KERNEL__ */
Index: working-2.6/arch/powerpc/mm/pgtable.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/pgtable.c	2009-08-28 13:46:31.000000000 +1000
+++ working-2.6/arch/powerpc/mm/pgtable.c	2009-09-28 13:53:42.000000000 +1000
@@ -47,12 +47,12 @@ struct pte_freelist_batch
 {
 	struct rcu_head	rcu;
 	unsigned int	index;
-	pgtable_free_t	tables[0];
+	unsigned long	tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
 	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
+	  / sizeof(unsigned long))
 
 static void pte_free_smp_sync(void *arg)
 {
@@ -62,13 +62,13 @@ static void pte_free_smp_sync(void *arg)
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
 	pte_freelist_forced_free++;
 
 	smp_call_function(pte_free_smp_sync, NULL, 1);
 
-	pgtable_free(pgf);
+	pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
@@ -77,8 +77,12 @@ static void pte_free_rcu_callback(struct
 		container_of(head, struct pte_freelist_batch, rcu);
 	unsigned int i;
 
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
+	for (i = 0; i < batch->index; i++) {
+		void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+		unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+		pgtable_free(table, shift);
+	}
 
 	free_page((unsigned long)batch);
 }
@@ -89,25 +93,28 @@ static void pte_free_submit(struct pte_f
 	call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
 	/* This is safe since tlb_gather_mmu has disabled preemption */
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+	unsigned long pgf;
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-		pgtable_free(pgf);
+		pgtable_free(table, shift);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
+			pgtable_free_now(table, shift);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
+	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+	pgf = (unsigned long)table | (shift - 1);
 	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c	2009-09-28 13:53:42.000000000 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c	2009-09-28 14:17:38.000000000 +1000
@@ -43,26 +43,14 @@ static unsigned nr_gpages;
 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 #define hugepte_shift			mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+#define HUGEPTE_INDEX_SIZE(psize)	(mmu_huge_psizes[(psize)])
+#define PTRS_PER_HUGEPTE(psize)		(1 << mmu_huge_psizes[psize])
 
 #define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-						+ hugepte_shift[psize])
+					 + HUGEPTE_INDEX_SIZE(psize))
 #define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
 #define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
 
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_64K]	= "hugepte_cache_64K",
-	[MMU_PAGE_1M]	= "hugepte_cache_1M",
-	[MMU_PAGE_16M]	= "hugepte_cache_16M",
-	[MMU_PAGE_16G]	= "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
@@ -114,15 +102,15 @@ static inline pte_t *hugepte_offset(huge
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
-				      GFP_KERNEL|__GFP_REPEAT);
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+				       GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+		kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -271,9 +259,7 @@ static void free_hugepte_range(struct mm
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-						 HUGEPTE_CACHE_NUM+psize-1,
-						 PGF_CACHENUM_MASK));
+	pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -698,8 +684,6 @@ static void __init set_huge_psize(int ps
 		if (mmu_huge_psizes[psize] ||
 		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 			return;
-		if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
 		switch (mmu_psize_defs[psize].shift) {
@@ -769,16 +753,11 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-				kmem_cache_create(
-					HUGEPTE_CACHE_NAME(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					0,
-					NULL);
-			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-				panic("hugetlbpage_init(): could not create %s"\
-				      "\n", HUGEPTE_CACHE_NAME(psize));
+			pgtable_cache_add(hugepte_shift[psize], NULL);
+			if (!PGT_CACHE(hugepte_shift[psize]))
+				panic("hugetlbpage_init(): could not create "
+				      "pgtable cache for %d bit pagesize\n",
+				      mmu_psize_to_shift(psize));
 		}
 	}
 
Index: working-2.6/arch/powerpc/include/asm/pgtable-ppc64.h
===================================================================
--- working-2.6.orig/arch/powerpc/include/asm/pgtable-ppc64.h	2009-08-28 13:46:31.000000000 +1000
+++ working-2.6/arch/powerpc/include/asm/pgtable-ppc64.h	2009-09-28 14:17:38.000000000 +1000
@@ -354,6 +354,7 @@ static inline void __ptep_set_access_fla
 #define pgoff_to_pte(off)	((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_RPN_SHIFT)
 
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
 /*

^ permalink raw reply

* [0/5] Assorted hugepage cleanups (v3)
From: David Gibson @ 2009-09-28  4:39 UTC (permalink / raw)
  To: linuxppc-dev, Benjamin Herrenschmidt

Currently, ordinary pages use one pagetable layout, and each different
hugepage size uses a slightly different variant layout.  A number of
places which need to walk the pagetable must first check the slice map
to see what the pagetable layout then handle the various different
forms.  New hardware, like Book3E is liable to introduce more possible
variants.

This patch series, therefore, is designed to simplify the matter by
limiting knowledge of the pagetable layout to only the allocation
path.  With this patch, ordinary pages are handled as ever, with a
fixed 4 (or 3) level tree.  All other variants branch off from some
layer of that with a specially marked PGD/PUD/PMD pointer which also
contains enough information to interpret the directories below that
point.  This means that things walking the pagetables (without
allocating) don't need to look up the slice map, they can just step
down the tree in the usual way, branching off to the "non-standard
layout" path for hugepages, which uses the embdded information to
interpret the tree from that point on.

This reduces the source size in a number of places, and means that
newer variants on the pagetable layout to handle new hardware and new
features will need to alter the existing code in less places.

In addition we split out the hash / classic MMU specific code into a
separate hugetlbpage-hash64.c file.  This will make adding support for
other MMUs (like 440 and/or Book3E) easier.

I've used the libhugetlbfs testsuite to test these patches on a
Power5+ machine, but they could certainly do with more testing. In
particular, I don't have any suitable hardware to test 16G pages.

V2: Made the tweaks that BenH suggested to patch 2 of the original
series.  Some corresponding tweaks in patch 3 to match.

V3: Fix several small bugs.
	* We had a BUILD_BUG_ON() which is broken by the recent
problems with BUILD_BUG_ON().  Since it's not a hot path, use a
runtime BUG_ON() instead.

	* The ifdef logic was inverted for the test against
CONFIG_SPU_FS_64K_LS.

	* That in turn masked a compile bug (using a non-existent
constant) in the CONFIG_SPU_FS_64K_LS path.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Benjamin Herrenschmidt @ 2009-09-28  3:21 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <OFEC38184A.A376E71D-ONC125763E.0048F581-C125763E.00497FD3@transmode.se>

On Sun, 2009-09-27 at 15:22 +0200, Joakim Tjernlund wrote:
> > However, adding tlbil_va() to ptep_set_access_flags() as you suggested
> > makes everything happy. I need to test it some more, but it looks good
> > so far. Below is what I am testing now.
> 
> 8xx, is getting very hacky and I suspect that the only long term fix is
> add code to trap the cache instructions in TLB error/miss and fixup the
> exception in page fault handler. This will also have the added benefit on being able
> to use the cache instructions in both kernel and user space like any other
> ppc arch.

First I'd like to understand exactly what's happening today, since
it makes little sense :-) I suppose I'll have to get myself some
8xx doco and understand how the bloody MMU works.

Then, I saw your old patch and it's -very- invasive. If we can get away
with a one liner just adding tlbil_va in the right place, I think I'm
happy to stick with it until somebody comes up with a real good reason
to do more :-) 8xx is on life support and has been around for long
enough without people feeling the need overall to work around that
problem so I'm tempted to keep the status-quo here.

Cheers,
Ben

^ permalink raw reply

* Re: GPIO driver for MPC8313.
From: Johnny Hung @ 2009-09-28  2:43 UTC (permalink / raw)
  To: Peter Korsgaard; +Cc: linuxppc-dev, linux-embedded
In-Reply-To: <cb9ecdfa0909230820k6ebbd589o409e58e2d9b6740c@mail.gmail.com>

The kerne source I used now is 2.6.23 with MPC8313-erdb pached. I
think I should port 2.6.28 gpio function back to 2.6.23. Is it a
common way to implement it in the kernel I used or I should port
MPC8313-erdb pached to 2.6.28 opposite?

BRs, H. Johnny

2009/9/23 Johnny Hung <johnny.hacking@gmail.com>:
> Many thanks for your help. I will try it.
>
> 2009/9/23 Peter Korsgaard <jacmet@sunsite.dk>:
>>>>>>> "Johnny" =3D=3D Johnny Hung <johnny.hacking@gmail.com> writes:
>>
>> =A0Johnny> Thanks, got it. BTW, how to trigger GPIO level in user space
>> =A0Johnny> application? I also found
>> =A0Johnny> arch/powerpc/platforms/52xx/mpc52xx_gpio.c is a good
>> =A0Johnny> example. Any reply is appreciate.
>>
>> Through sysfs. See 'Sysfs Interface for Userspace' section of
>> Documentation/gpio.txt
>>
>> --
>> Bye, Peter Korsgaard
>>
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox