LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 03/12][v3] pci: fsl: add PCI indirect access support
From: Minghuan Lian @ 2013-10-23 10:41 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Minghuan Lian, linux-pci, Zang Roy-R61911, Bjorn Helgaas,
	Scott Wood
In-Reply-To: <1382524894-15164-1-git-send-email-Minghuan.Lian@freescale.com>

The patch adds PCI indirect read/write functions. The main code
is ported from arch/powerpc/sysdev/indirect_pci.c. We use general
IO API iowrite32be/ioread32be instead of out_be32/in_be32, and
use structure fsl_Pci instead of PowerPC's pci_controller.
The patch also provides fsl_pcie_check_link() to check PCI link.
The weak function fsl_arch_pci_exclude_device() is provided to
call ppc_md.pci_exclude_device() for PowerPC architecture.

Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
---
change log:
v1-v3:
Derived from http://patchwork.ozlabs.org/patch/278965/

Based on upstream master.
Based on the discussion of RFC version here
http://patchwork.ozlabs.org/patch/274487/

 drivers/pci/host/pci-fsl-common.c | 169 ++++++++++++++++++++++++++++++++------
 include/linux/fsl/pci-common.h    |   6 ++
 2 files changed, 151 insertions(+), 24 deletions(-)

diff --git a/drivers/pci/host/pci-fsl-common.c b/drivers/pci/host/pci-fsl-common.c
index 69d338b..8bc9a64 100644
--- a/drivers/pci/host/pci-fsl-common.c
+++ b/drivers/pci/host/pci-fsl-common.c
@@ -35,52 +35,173 @@
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
-static int fsl_pcie_check_link(struct pci_controller *hose)
+/* Indirect type */
+#define INDIRECT_TYPE_EXT_REG			0x00000002
+#define INDIRECT_TYPE_SURPRESS_PRIMARY_BUS	0x00000004
+#define INDIRECT_TYPE_NO_PCIE_LINK		0x00000008
+#define INDIRECT_TYPE_BIG_ENDIAN		0x00000010
+#define INDIRECT_TYPE_FSL_CFG_REG_LINK		0x00000040
+
+int __weak fsl_arch_pci_exclude_device(struct fsl_pci *pci, u8 bus, u8 devfn)
+{
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int fsl_pci_read_config(struct fsl_pci *pci, int bus, int devfn,
+				int offset, int len, u32 *val)
+{
+	u32 bus_no, reg, data;
+
+	if (pci->indirect_type & INDIRECT_TYPE_NO_PCIE_LINK) {
+		if (bus != pci->first_busno)
+			return PCIBIOS_DEVICE_NOT_FOUND;
+		if (devfn != 0)
+			return PCIBIOS_DEVICE_NOT_FOUND;
+	}
+
+	if (fsl_arch_pci_exclude_device(pci, bus, devfn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	bus_no = (bus == pci->first_busno) ? pci->self_busno : bus;
+
+	if (pci->indirect_type & INDIRECT_TYPE_EXT_REG)
+		reg = ((offset & 0xf00) << 16) | (offset & 0xfc);
+	else
+		reg = offset & 0xfc;
+
+	if (pci->indirect_type & INDIRECT_TYPE_BIG_ENDIAN)
+		iowrite32be(0x80000000 | (bus_no << 16) | (devfn << 8) | reg,
+			    &pci->regs->config_addr);
+	else
+		iowrite32(0x80000000 | (bus_no << 16) | (devfn << 8) | reg,
+			  &pci->regs->config_addr);
+
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	data = ioread32(&pci->regs->config_data);
+	switch (len) {
+	case 1:
+		*val = (data >> (8 * (offset & 3))) & 0xff;
+		break;
+	case 2:
+		*val = (data >> (8 * (offset & 3))) & 0xffff;
+		break;
+	default:
+		*val = data;
+		break;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int fsl_pci_write_config(struct fsl_pci *pci, int bus, int devfn,
+				 int offset, int len, u32 val)
+{
+	void __iomem *cfg_data;
+	u32 bus_no, reg;
+
+	if (pci->indirect_type & INDIRECT_TYPE_NO_PCIE_LINK) {
+		if (bus != pci->first_busno)
+			return PCIBIOS_DEVICE_NOT_FOUND;
+		if (devfn != 0)
+			return PCIBIOS_DEVICE_NOT_FOUND;
+	}
+
+	if (fsl_arch_pci_exclude_device(pci, bus, devfn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	bus_no = (bus == pci->first_busno) ?
+			pci->self_busno : bus;
+
+	if (pci->indirect_type & INDIRECT_TYPE_EXT_REG)
+		reg = ((offset & 0xf00) << 16) | (offset & 0xfc);
+	else
+		reg = offset & 0xfc;
+
+	if (pci->indirect_type & INDIRECT_TYPE_BIG_ENDIAN)
+		iowrite32be(0x80000000 | (bus_no << 16) | (devfn << 8) | reg,
+			    &pci->regs->config_addr);
+	else
+		iowrite32(0x80000000 | (bus_no << 16) | (devfn << 8) | reg,
+			  &pci->regs->config_addr);
+
+	/* suppress setting of PCI_PRIMARY_BUS */
+	if (pci->indirect_type & INDIRECT_TYPE_SURPRESS_PRIMARY_BUS)
+		if ((offset == PCI_PRIMARY_BUS) &&
+		    (bus == pci->first_busno))
+			val &= 0xffffff00;
+
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	cfg_data = ((void *) &(pci->regs->config_data)) + (offset & 3);
+	switch (len) {
+	case 1:
+		iowrite8(val, cfg_data);
+		break;
+	case 2:
+		iowrite16(val, cfg_data);
+		break;
+	default:
+		iowrite32(val, cfg_data);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+bool fsl_pci_check_link(struct fsl_pci *pci)
 {
 	u32 val = 0;
 
-	if (hose->indirect_type & PPC_INDIRECT_TYPE_FSL_CFG_REG_LINK) {
-		if (hose->ops->read == fsl_indirect_read_config) {
-			struct pci_bus bus;
-			bus.number = hose->first_busno;
-			bus.sysdata = hose;
-			bus.ops = hose->ops;
-			indirect_read_config(&bus, 0, PCIE_LTSSM, 4, &val);
-		} else
-			early_read_config_dword(hose, 0, 0, PCIE_LTSSM, &val);
+	if (pci->indirect_type & INDIRECT_TYPE_FSL_CFG_REG_LINK) {
+		fsl_pci_read_config(pci, 0, 0, PCIE_LTSSM, 4, &val);
 		if (val < PCIE_LTSSM_L0)
-			return 1;
+			return false;
 	} else {
-		struct ccsr_pci __iomem *pci = hose->private_data;
 		/* for PCIe IP rev 3.0 or greater use CSR0 for link state */
-		val = (in_be32(&pci->pex_csr0) & PEX_CSR0_LTSSM_MASK)
+		val = (ioread32be(&pci->regs->pex_csr0) & PEX_CSR0_LTSSM_MASK)
 				>> PEX_CSR0_LTSSM_SHIFT;
 		if (val != PEX_CSR0_LTSSM_L0)
-			return 1;
+			return false;
 	}
 
-	return 0;
+	return true;
 }
 
 static int fsl_indirect_read_config(struct pci_bus *bus, unsigned int devfn,
 				    int offset, int len, u32 *val)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct fsl_pci *pci = fsl_arch_sys_to_pci(bus->sysdata);
+
+	if (!pci)
+		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	if (fsl_pcie_check_link(hose))
-		hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
+	if (fsl_pci_check_link(pci))
+		pci->indirect_type &= ~INDIRECT_TYPE_NO_PCIE_LINK;
 	else
-		hose->indirect_type &= ~PPC_INDIRECT_TYPE_NO_PCIE_LINK;
+		pci->indirect_type |= INDIRECT_TYPE_NO_PCIE_LINK;
 
-	return indirect_read_config(bus, devfn, offset, len, val);
+	return fsl_pci_read_config(pci, bus->number, devfn, offset, len, val);
 }
 
-#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
-
-static struct pci_ops fsl_indirect_pcie_ops =
+static int fsl_indirect_write_config(struct pci_bus *bus, unsigned int devfn,
+				     int offset, int len, u32 val)
 {
+	struct fsl_pci *pci = fsl_arch_sys_to_pci(bus->sysdata);
+
+	if (!pci)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return fsl_pci_write_config(pci, bus->number, devfn,
+				     offset, len, val);
+}
+
+static struct pci_ops fsl_indirect_pci_ops = {
 	.read = fsl_indirect_read_config,
-	.write = indirect_write_config,
+	.write = fsl_indirect_write_config,
 };
 
 static int setup_one_atmu(struct ccsr_pci __iomem *pci,
diff --git a/include/linux/fsl/pci-common.h b/include/linux/fsl/pci-common.h
index e56a040..7df4355 100644
--- a/include/linux/fsl/pci-common.h
+++ b/include/linux/fsl/pci-common.h
@@ -143,5 +143,11 @@ struct fsl_pci {
  */
 extern struct fsl_pci *fsl_arch_sys_to_pci(void *sys);
 
+/* Return link status true -> link, false -> no link */
+bool fsl_pci_check_link(struct fsl_pci *pci);
+
+/* To avoid touching specified devices */
+int fsl_arch_pci_exclude_device(struct fsl_pci *pci, u8 bus, u8 devfn);
+
 #endif /* __PCI_COMMON_H */
 #endif /* __KERNEL__ */
-- 
1.8.1.2

^ permalink raw reply related

* [PATCH 04/12][v3] pci: fsl: add early PCI indirect access support
From: Minghuan Lian @ 2013-10-23 10:41 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Minghuan Lian, linux-pci, Zang Roy-R61911, Bjorn Helgaas,
	Scott Wood
In-Reply-To: <1382524894-15164-1-git-send-email-Minghuan.Lian@freescale.com>

The driver needs to read/write PCI configuration very early, at
that time architecture-specific PCI controller structure and
PCI bus have not been created. The patch provides an interface
fsl_arch_fake_pci_bus which should be implemented in
architecture-specific PCI driver to fake a PCI controller structure
and PCI bus. Using the fake PCI controller and PCI bus, the patch
provides the early indirect read/write functions.

Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
---
change log:
v1-v3:
Derived from http://patchwork.ozlabs.org/patch/278965/

Based on upstream master.
Based on the discussion of RFC version here
http://patchwork.ozlabs.org/patch/274487/

 drivers/pci/host/pci-fsl-common.c | 26 ++++++++++++++++++++++++++
 include/linux/fsl/pci-common.h    |  7 +++++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/pci/host/pci-fsl-common.c b/drivers/pci/host/pci-fsl-common.c
index 8bc9a64..505a6a1 100644
--- a/drivers/pci/host/pci-fsl-common.c
+++ b/drivers/pci/host/pci-fsl-common.c
@@ -204,6 +204,32 @@ static struct pci_ops fsl_indirect_pci_ops = {
 	.write = fsl_indirect_write_config,
 };
 
+#define EARLY_FSL_PCI_OP(rw, size, type)				\
+int early_fsl_##rw##_config_##size(struct fsl_pci *pci, int bus,	\
+				   int devfn, int offset, type value)	\
+{									\
+	return pci_bus_##rw##_config_##size(fsl_arch_fake_pci_bus(pci, bus),\
+					    devfn, offset, value);	\
+}
+
+EARLY_FSL_PCI_OP(read, byte, u8 *)
+EARLY_FSL_PCI_OP(read, word, u16 *)
+EARLY_FSL_PCI_OP(read, dword, u32 *)
+EARLY_FSL_PCI_OP(write, byte, u8)
+EARLY_FSL_PCI_OP(write, word, u16)
+EARLY_FSL_PCI_OP(write, dword, u32)
+
+static int early_fsl_find_capability(struct fsl_pci *pci,
+				     int busnr, int devfn, int cap)
+{
+	struct pci_bus *bus = fsl_arch_fake_pci_bus(pci, busnr);
+
+	if (!bus)
+		return 0;
+
+	return pci_bus_find_capability(bus, devfn, cap);
+}
+
 static int setup_one_atmu(struct ccsr_pci __iomem *pci,
 	unsigned int index, const struct resource *res,
 	resource_size_t offset)
diff --git a/include/linux/fsl/pci-common.h b/include/linux/fsl/pci-common.h
index 7df4355..a3aca29 100644
--- a/include/linux/fsl/pci-common.h
+++ b/include/linux/fsl/pci-common.h
@@ -149,5 +149,12 @@ bool fsl_pci_check_link(struct fsl_pci *pci);
 /* To avoid touching specified devices */
 int fsl_arch_pci_exclude_device(struct fsl_pci *pci, u8 bus, u8 devfn);
 
+/*
+ * To fake a PCI bus
+ * it is called by early_fsl_*(), at that time the architecture-dependent
+ * pci controller and pci bus have not been created.
+ */
+extern struct pci_bus *fsl_arch_fake_pci_bus(struct fsl_pci *pci, int busnr);
+
 #endif /* __PCI_COMMON_H */
 #endif /* __KERNEL__ */
-- 
1.8.1.2

^ permalink raw reply related

* [PATCH 02/12][v3] pci: fsl: add structure fsl_pci
From: Minghuan Lian @ 2013-10-23 10:41 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Minghuan Lian, linux-pci, Zang Roy-R61911, Bjorn Helgaas,
	Scott Wood
In-Reply-To: <1382524894-15164-1-git-send-email-Minghuan.Lian@freescale.com>

PowerPC uses structure pci_controller to describe PCI controller,
but ARM uses structure pci_sys_data. In order to support PowerPC
and ARM simultaneously, the patch adds a structure fsl_pci that
contains most of the members of the pci_controller and pci_sys_data.
Meanwhile, it defines a interface fsl_arch_sys_to_pci() which should
be implemented in architecture-specific PCI controller driver to
convert pci_controller or pci_sys_data to fsl_pci.

Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
---
change log:
v1-v3:
Derived from http://patchwork.ozlabs.org/patch/278965/

Based on upstream master.
Based on the discussion of RFC version here
http://patchwork.ozlabs.org/patch/274487/

 include/linux/fsl/pci-common.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/include/linux/fsl/pci-common.h b/include/linux/fsl/pci-common.h
index 5e4f683..e56a040 100644
--- a/include/linux/fsl/pci-common.h
+++ b/include/linux/fsl/pci-common.h
@@ -102,5 +102,46 @@ struct ccsr_pci {
 
 };
 
+/*
+ * Structure of a PCI controller (host bridge)
+ */
+struct fsl_pci {
+	struct list_head node;
+	bool is_pcie;
+	struct device_node *dn;
+	struct device *dev;
+
+	int first_busno;
+	int last_busno;
+	int self_busno;
+	struct resource busn;
+
+	struct pci_ops *ops;
+	struct ccsr_pci __iomem *regs;
+
+	u32 indirect_type;
+
+	struct resource io_resource;
+	resource_size_t io_base_phys;
+	resource_size_t pci_io_size;
+
+	struct resource mem_resources[3];
+	resource_size_t mem_offset[3];
+
+	int global_number;	/* PCI domain number */
+
+	resource_size_t dma_window_base_cur;
+	resource_size_t dma_window_size;
+
+	void *sys;
+};
+
+/*
+ * Convert architecture specific pci controller structure to fsl_pci
+ * PowerPC uses structure pci_controller and ARM uses structure pci_sys_data
+ * to describe pci controller.
+ */
+extern struct fsl_pci *fsl_arch_sys_to_pci(void *sys);
+
 #endif /* __PCI_COMMON_H */
 #endif /* __KERNEL__ */
-- 
1.8.1.2

^ permalink raw reply related

* [PATCH 01/12][v3] pci: fsl: derive the common PCI driver to drivers/pci/host
From: Minghuan Lian @ 2013-10-23 10:41 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Minghuan Lian, linux-pci, Zang Roy-R61911, Bjorn Helgaas,
	Scott Wood

The Freescale's Layerscape series processors will use ARM cores.
The LS1's PCIe controllers is the same as T4240's. So it's better
the PCIe controller driver can support PowerPC and ARM
simultaneously. This patch is for this purpose. It derives
the common functions from arch/powerpc/sysdev/fsl_pci.c to
drivers/pci/host/pci-fsl-common.c and leaves the architecture
specific functions which should be implemented in arch related files.

Signed-off-by: Minghuan Lian <Minghuan.Lian@freescale.com>
---
change log:
v2-v3:
no change
v1-v2:
1. rename pci.h to pci-common.h 
2. rename pci-fsl.c to pci-fsl-common.c

Based on upstream master.
Based on the discussion of RFC version here
http://patchwork.ozlabs.org/patch/274487/

 arch/powerpc/sysdev/fsl_pci.c                      | 521 +-----------------
 arch/powerpc/sysdev/fsl_pci.h                      |  89 ----
 .../fsl_pci.c => drivers/pci/host/pci-fsl-common.c | 591 +--------------------
 .../fsl_pci.h => include/linux/fsl/pci-common.h    |  45 +-
 4 files changed, 7 insertions(+), 1239 deletions(-)
 copy arch/powerpc/sysdev/fsl_pci.c => drivers/pci/host/pci-fsl-common.c (54%)
 copy arch/powerpc/sysdev/fsl_pci.h => include/linux/fsl/pci-common.h (79%)

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index ccfb50d..26039e3 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -27,6 +27,7 @@
 #include <linux/log2.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/fsl/pci-common.h>
 
 #include <asm/io.h>
 #include <asm/prom.h>
@@ -58,57 +59,8 @@ static void quirk_fsl_pcie_header(struct pci_dev *dev)
 	return;
 }
 
-static int fsl_indirect_read_config(struct pci_bus *, unsigned int,
-				    int, int, u32 *);
-
-static int fsl_pcie_check_link(struct pci_controller *hose)
-{
-	u32 val = 0;
-
-	if (hose->indirect_type & PPC_INDIRECT_TYPE_FSL_CFG_REG_LINK) {
-		if (hose->ops->read == fsl_indirect_read_config) {
-			struct pci_bus bus;
-			bus.number = hose->first_busno;
-			bus.sysdata = hose;
-			bus.ops = hose->ops;
-			indirect_read_config(&bus, 0, PCIE_LTSSM, 4, &val);
-		} else
-			early_read_config_dword(hose, 0, 0, PCIE_LTSSM, &val);
-		if (val < PCIE_LTSSM_L0)
-			return 1;
-	} else {
-		struct ccsr_pci __iomem *pci = hose->private_data;
-		/* for PCIe IP rev 3.0 or greater use CSR0 for link state */
-		val = (in_be32(&pci->pex_csr0) & PEX_CSR0_LTSSM_MASK)
-				>> PEX_CSR0_LTSSM_SHIFT;
-		if (val != PEX_CSR0_LTSSM_L0)
-			return 1;
-	}
-
-	return 0;
-}
-
-static int fsl_indirect_read_config(struct pci_bus *bus, unsigned int devfn,
-				    int offset, int len, u32 *val)
-{
-	struct pci_controller *hose = pci_bus_to_host(bus);
-
-	if (fsl_pcie_check_link(hose))
-		hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
-	else
-		hose->indirect_type &= ~PPC_INDIRECT_TYPE_NO_PCIE_LINK;
-
-	return indirect_read_config(bus, devfn, offset, len, val);
-}
-
 #if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
 
-static struct pci_ops fsl_indirect_pcie_ops =
-{
-	.read = fsl_indirect_read_config,
-	.write = indirect_write_config,
-};
-
 #define MAX_PHYS_ADDR_BITS	40
 static u64 pci64_dma_offset = 1ull << MAX_PHYS_ADDR_BITS;
 
@@ -132,291 +84,6 @@ static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
 	return 0;
 }
 
-static int setup_one_atmu(struct ccsr_pci __iomem *pci,
-	unsigned int index, const struct resource *res,
-	resource_size_t offset)
-{
-	resource_size_t pci_addr = res->start - offset;
-	resource_size_t phys_addr = res->start;
-	resource_size_t size = resource_size(res);
-	u32 flags = 0x80044000; /* enable & mem R/W */
-	unsigned int i;
-
-	pr_debug("PCI MEM resource start 0x%016llx, size 0x%016llx.\n",
-		(u64)res->start, (u64)size);
-
-	if (res->flags & IORESOURCE_PREFETCH)
-		flags |= 0x10000000; /* enable relaxed ordering */
-
-	for (i = 0; size > 0; i++) {
-		unsigned int bits = min(ilog2(size),
-					__ffs(pci_addr | phys_addr));
-
-		if (index + i >= 5)
-			return -1;
-
-		out_be32(&pci->pow[index + i].potar, pci_addr >> 12);
-		out_be32(&pci->pow[index + i].potear, (u64)pci_addr >> 44);
-		out_be32(&pci->pow[index + i].powbar, phys_addr >> 12);
-		out_be32(&pci->pow[index + i].powar, flags | (bits - 1));
-
-		pci_addr += (resource_size_t)1U << bits;
-		phys_addr += (resource_size_t)1U << bits;
-		size -= (resource_size_t)1U << bits;
-	}
-
-	return i;
-}
-
-/* atmu setup for fsl pci/pcie controller */
-static void setup_pci_atmu(struct pci_controller *hose)
-{
-	struct ccsr_pci __iomem *pci = hose->private_data;
-	int i, j, n, mem_log, win_idx = 3, start_idx = 1, end_idx = 4;
-	u64 mem, sz, paddr_hi = 0;
-	u64 offset = 0, paddr_lo = ULLONG_MAX;
-	u32 pcicsrbar = 0, pcicsrbar_sz;
-	u32 piwar = PIWAR_EN | PIWAR_PF | PIWAR_TGI_LOCAL |
-			PIWAR_READ_SNOOP | PIWAR_WRITE_SNOOP;
-	const char *name = hose->dn->full_name;
-	const u64 *reg;
-	int len;
-
-	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
-		if (in_be32(&pci->block_rev1) >= PCIE_IP_REV_2_2) {
-			win_idx = 2;
-			start_idx = 0;
-			end_idx = 3;
-		}
-	}
-
-	/* Disable all windows (except powar0 since it's ignored) */
-	for(i = 1; i < 5; i++)
-		out_be32(&pci->pow[i].powar, 0);
-	for (i = start_idx; i < end_idx; i++)
-		out_be32(&pci->piw[i].piwar, 0);
-
-	/* Setup outbound MEM window */
-	for(i = 0, j = 1; i < 3; i++) {
-		if (!(hose->mem_resources[i].flags & IORESOURCE_MEM))
-			continue;
-
-		paddr_lo = min(paddr_lo, (u64)hose->mem_resources[i].start);
-		paddr_hi = max(paddr_hi, (u64)hose->mem_resources[i].end);
-
-		/* We assume all memory resources have the same offset */
-		offset = hose->mem_offset[i];
-		n = setup_one_atmu(pci, j, &hose->mem_resources[i], offset);
-
-		if (n < 0 || j >= 5) {
-			pr_err("Ran out of outbound PCI ATMUs for resource %d!\n", i);
-			hose->mem_resources[i].flags |= IORESOURCE_DISABLED;
-		} else
-			j += n;
-	}
-
-	/* Setup outbound IO window */
-	if (hose->io_resource.flags & IORESOURCE_IO) {
-		if (j >= 5) {
-			pr_err("Ran out of outbound PCI ATMUs for IO resource\n");
-		} else {
-			pr_debug("PCI IO resource start 0x%016llx, size 0x%016llx, "
-				 "phy base 0x%016llx.\n",
-				 (u64)hose->io_resource.start,
-				 (u64)resource_size(&hose->io_resource),
-				 (u64)hose->io_base_phys);
-			out_be32(&pci->pow[j].potar, (hose->io_resource.start >> 12));
-			out_be32(&pci->pow[j].potear, 0);
-			out_be32(&pci->pow[j].powbar, (hose->io_base_phys >> 12));
-			/* Enable, IO R/W */
-			out_be32(&pci->pow[j].powar, 0x80088000
-				| (ilog2(hose->io_resource.end
-				- hose->io_resource.start + 1) - 1));
-		}
-	}
-
-	/* convert to pci address space */
-	paddr_hi -= offset;
-	paddr_lo -= offset;
-
-	if (paddr_hi == paddr_lo) {
-		pr_err("%s: No outbound window space\n", name);
-		return;
-	}
-
-	if (paddr_lo == 0) {
-		pr_err("%s: No space for inbound window\n", name);
-		return;
-	}
-
-	/* setup PCSRBAR/PEXCSRBAR */
-	early_write_config_dword(hose, 0, 0, PCI_BASE_ADDRESS_0, 0xffffffff);
-	early_read_config_dword(hose, 0, 0, PCI_BASE_ADDRESS_0, &pcicsrbar_sz);
-	pcicsrbar_sz = ~pcicsrbar_sz + 1;
-
-	if (paddr_hi < (0x100000000ull - pcicsrbar_sz) ||
-		(paddr_lo > 0x100000000ull))
-		pcicsrbar = 0x100000000ull - pcicsrbar_sz;
-	else
-		pcicsrbar = (paddr_lo - pcicsrbar_sz) & -pcicsrbar_sz;
-	early_write_config_dword(hose, 0, 0, PCI_BASE_ADDRESS_0, pcicsrbar);
-
-	paddr_lo = min(paddr_lo, (u64)pcicsrbar);
-
-	pr_info("%s: PCICSRBAR @ 0x%x\n", name, pcicsrbar);
-
-	/* Setup inbound mem window */
-	mem = memblock_end_of_DRAM();
-
-	/*
-	 * The msi-address-64 property, if it exists, indicates the physical
-	 * address of the MSIIR register.  Normally, this register is located
-	 * inside CCSR, so the ATMU that covers all of CCSR is used. But if
-	 * this property exists, then we normally need to create a new ATMU
-	 * for it.  For now, however, we cheat.  The only entity that creates
-	 * this property is the Freescale hypervisor, and the address is
-	 * specified in the partition configuration.  Typically, the address
-	 * is located in the page immediately after the end of DDR.  If so, we
-	 * can avoid allocating a new ATMU by extending the DDR ATMU by one
-	 * page.
-	 */
-	reg = of_get_property(hose->dn, "msi-address-64", &len);
-	if (reg && (len == sizeof(u64))) {
-		u64 address = be64_to_cpup(reg);
-
-		if ((address >= mem) && (address < (mem + PAGE_SIZE))) {
-			pr_info("%s: extending DDR ATMU to cover MSIIR", name);
-			mem += PAGE_SIZE;
-		} else {
-			/* TODO: Create a new ATMU for MSIIR */
-			pr_warn("%s: msi-address-64 address of %llx is "
-				"unsupported\n", name, address);
-		}
-	}
-
-	sz = min(mem, paddr_lo);
-	mem_log = ilog2(sz);
-
-	/* PCIe can overmap inbound & outbound since RX & TX are separated */
-	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
-		/* Size window to exact size if power-of-two or one size up */
-		if ((1ull << mem_log) != mem) {
-			mem_log++;
-			if ((1ull << mem_log) > mem)
-				pr_info("%s: Setting PCI inbound window "
-					"greater than memory size\n", name);
-		}
-
-		piwar |= ((mem_log - 1) & PIWAR_SZ_MASK);
-
-		/* Setup inbound memory window */
-		out_be32(&pci->piw[win_idx].pitar,  0x00000000);
-		out_be32(&pci->piw[win_idx].piwbar, 0x00000000);
-		out_be32(&pci->piw[win_idx].piwar,  piwar);
-		win_idx--;
-
-		hose->dma_window_base_cur = 0x00000000;
-		hose->dma_window_size = (resource_size_t)sz;
-
-		/*
-		 * if we have >4G of memory setup second PCI inbound window to
-		 * let devices that are 64-bit address capable to work w/o
-		 * SWIOTLB and access the full range of memory
-		 */
-		if (sz != mem) {
-			mem_log = ilog2(mem);
-
-			/* Size window up if we dont fit in exact power-of-2 */
-			if ((1ull << mem_log) != mem)
-				mem_log++;
-
-			piwar = (piwar & ~PIWAR_SZ_MASK) | (mem_log - 1);
-
-			/* Setup inbound memory window */
-			out_be32(&pci->piw[win_idx].pitar,  0x00000000);
-			out_be32(&pci->piw[win_idx].piwbear,
-					pci64_dma_offset >> 44);
-			out_be32(&pci->piw[win_idx].piwbar,
-					pci64_dma_offset >> 12);
-			out_be32(&pci->piw[win_idx].piwar,  piwar);
-
-			/*
-			 * install our own dma_set_mask handler to fixup dma_ops
-			 * and dma_offset
-			 */
-			ppc_md.dma_set_mask = fsl_pci_dma_set_mask;
-
-			pr_info("%s: Setup 64-bit PCI DMA window\n", name);
-		}
-	} else {
-		u64 paddr = 0;
-
-		/* Setup inbound memory window */
-		out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
-		out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
-		out_be32(&pci->piw[win_idx].piwar,  (piwar | (mem_log - 1)));
-		win_idx--;
-
-		paddr += 1ull << mem_log;
-		sz -= 1ull << mem_log;
-
-		if (sz) {
-			mem_log = ilog2(sz);
-			piwar |= (mem_log - 1);
-
-			out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
-			out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
-			out_be32(&pci->piw[win_idx].piwar,  piwar);
-			win_idx--;
-
-			paddr += 1ull << mem_log;
-		}
-
-		hose->dma_window_base_cur = 0x00000000;
-		hose->dma_window_size = (resource_size_t)paddr;
-	}
-
-	if (hose->dma_window_size < mem) {
-#ifdef CONFIG_SWIOTLB
-		ppc_swiotlb_enable = 1;
-#else
-		pr_err("%s: ERROR: Memory size exceeds PCI ATMU ability to "
-			"map - enable CONFIG_SWIOTLB to avoid dma errors.\n",
-			 name);
-#endif
-		/* adjusting outbound windows could reclaim space in mem map */
-		if (paddr_hi < 0xffffffffull)
-			pr_warning("%s: WARNING: Outbound window cfg leaves "
-				"gaps in memory map. Adjusting the memory map "
-				"could reduce unnecessary bounce buffering.\n",
-				name);
-
-		pr_info("%s: DMA window size is 0x%llx\n", name,
-			(u64)hose->dma_window_size);
-	}
-}
-
-static void __init setup_pci_cmd(struct pci_controller *hose)
-{
-	u16 cmd;
-	int cap_x;
-
-	early_read_config_word(hose, 0, 0, PCI_COMMAND, &cmd);
-	cmd |= PCI_COMMAND_SERR | PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY
-		| PCI_COMMAND_IO;
-	early_write_config_word(hose, 0, 0, PCI_COMMAND, cmd);
-
-	cap_x = early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX);
-	if (cap_x) {
-		int pci_x_cmd = cap_x + PCI_X_CMD;
-		cmd = PCI_X_CMD_MAX_SPLIT | PCI_X_CMD_MAX_READ
-			| PCI_X_CMD_ERO | PCI_X_CMD_DPERR_E;
-		early_write_config_word(hose, 0, 0, pci_x_cmd, cmd);
-	} else {
-		early_write_config_byte(hose, 0, 0, PCI_LATENCY_TIMER, 0x80);
-	}
-}
-
 void fsl_pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_controller *hose = pci_bus_to_host(bus);
@@ -454,112 +121,6 @@ void fsl_pcibios_fixup_bus(struct pci_bus *bus)
 	}
 }
 
-int __init fsl_add_bridge(struct platform_device *pdev, int is_primary)
-{
-	int len;
-	struct pci_controller *hose;
-	struct resource rsrc;
-	const int *bus_range;
-	u8 hdr_type, progif;
-	struct device_node *dev;
-	struct ccsr_pci __iomem *pci;
-
-	dev = pdev->dev.of_node;
-
-	if (!of_device_is_available(dev)) {
-		pr_warning("%s: disabled\n", dev->full_name);
-		return -ENODEV;
-	}
-
-	pr_debug("Adding PCI host bridge %s\n", dev->full_name);
-
-	/* Fetch host bridge registers address */
-	if (of_address_to_resource(dev, 0, &rsrc)) {
-		printk(KERN_WARNING "Can't get pci register base!");
-		return -ENOMEM;
-	}
-
-	/* Get bus range if any */
-	bus_range = of_get_property(dev, "bus-range", &len);
-	if (bus_range == NULL || len < 2 * sizeof(int))
-		printk(KERN_WARNING "Can't get bus-range for %s, assume"
-			" bus 0\n", dev->full_name);
-
-	pci_add_flags(PCI_REASSIGN_ALL_BUS);
-	hose = pcibios_alloc_controller(dev);
-	if (!hose)
-		return -ENOMEM;
-
-	/* set platform device as the parent */
-	hose->parent = &pdev->dev;
-	hose->first_busno = bus_range ? bus_range[0] : 0x0;
-	hose->last_busno = bus_range ? bus_range[1] : 0xff;
-
-	pr_debug("PCI memory map start 0x%016llx, size 0x%016llx\n",
-		 (u64)rsrc.start, (u64)resource_size(&rsrc));
-
-	pci = hose->private_data = ioremap(rsrc.start, resource_size(&rsrc));
-	if (!hose->private_data)
-		goto no_bridge;
-
-	setup_indirect_pci(hose, rsrc.start, rsrc.start + 0x4,
-			   PPC_INDIRECT_TYPE_BIG_ENDIAN);
-
-	if (in_be32(&pci->block_rev1) < PCIE_IP_REV_3_0)
-		hose->indirect_type |= PPC_INDIRECT_TYPE_FSL_CFG_REG_LINK;
-
-	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
-		/* use fsl_indirect_read_config for PCIe */
-		hose->ops = &fsl_indirect_pcie_ops;
-		/* For PCIE read HEADER_TYPE to identify controler mode */
-		early_read_config_byte(hose, 0, 0, PCI_HEADER_TYPE, &hdr_type);
-		if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
-			goto no_bridge;
-
-	} else {
-		/* For PCI read PROG to identify controller mode */
-		early_read_config_byte(hose, 0, 0, PCI_CLASS_PROG, &progif);
-		if ((progif & 1) == 1)
-			goto no_bridge;
-	}
-
-	setup_pci_cmd(hose);
-
-	/* check PCI express link status */
-	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
-		hose->indirect_type |= PPC_INDIRECT_TYPE_EXT_REG |
-			PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS;
-		if (fsl_pcie_check_link(hose))
-			hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
-	}
-
-	printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "
-		"Firmware bus number: %d->%d\n",
-		(unsigned long long)rsrc.start, hose->first_busno,
-		hose->last_busno);
-
-	pr_debug(" ->Hose at 0x%p, cfg_addr=0x%p,cfg_data=0x%p\n",
-		hose, hose->cfg_addr, hose->cfg_data);
-
-	/* Interpret the "ranges" property */
-	/* This also maps the I/O region and sets isa_io/mem_base */
-	pci_process_bridge_OF_ranges(hose, dev, is_primary);
-
-	/* Setup PEX window registers */
-	setup_pci_atmu(hose);
-
-	return 0;
-
-no_bridge:
-	iounmap(hose->private_data);
-	/* unmap cfg_data & cfg_addr separately if not on same page */
-	if (((unsigned long)hose->cfg_data & PAGE_MASK) !=
-	    ((unsigned long)hose->cfg_addr & PAGE_MASK))
-		iounmap(hose->cfg_data);
-	iounmap(hose->cfg_addr);
-	pcibios_free_controller(hose);
-	return -ENODEV;
-}
 #endif /* CONFIG_FSL_SOC_BOOKE || CONFIG_PPC_86xx */
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, quirk_fsl_pcie_header);
@@ -1029,26 +590,6 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
 #endif
 
 #if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
-static const struct of_device_id pci_ids[] = {
-	{ .compatible = "fsl,mpc8540-pci", },
-	{ .compatible = "fsl,mpc8548-pcie", },
-	{ .compatible = "fsl,mpc8610-pci", },
-	{ .compatible = "fsl,mpc8641-pcie", },
-	{ .compatible = "fsl,qoriq-pcie-v2.1", },
-	{ .compatible = "fsl,qoriq-pcie-v2.2", },
-	{ .compatible = "fsl,qoriq-pcie-v2.3", },
-	{ .compatible = "fsl,qoriq-pcie-v2.4", },
-	{ .compatible = "fsl,qoriq-pcie-v3.0", },
-
-	/*
-	 * The following entries are for compatibility with older device
-	 * trees.
-	 */
-	{ .compatible = "fsl,p1022-pcie", },
-	{ .compatible = "fsl,p4080-pcie", },
-
-	{},
-};
 
 struct device_node *fsl_pci_primary;
 
@@ -1083,64 +624,4 @@ void fsl_pci_assign_primary(void)
 		}
 	}
 }
-
-static int fsl_pci_probe(struct platform_device *pdev)
-{
-	int ret;
-	struct device_node *node;
-
-	node = pdev->dev.of_node;
-	ret = fsl_add_bridge(pdev, fsl_pci_primary == node);
-
-	mpc85xx_pci_err_probe(pdev);
-
-	return 0;
-}
-
-#ifdef CONFIG_PM
-static int fsl_pci_resume(struct device *dev)
-{
-	struct pci_controller *hose;
-	struct resource pci_rsrc;
-
-	hose = pci_find_hose_for_OF_device(dev->of_node);
-	if (!hose)
-		return -ENODEV;
-
-	if (of_address_to_resource(dev->of_node, 0, &pci_rsrc)) {
-		dev_err(dev, "Get pci register base failed.");
-		return -ENODEV;
-	}
-
-	setup_pci_atmu(hose);
-
-	return 0;
-}
-
-static const struct dev_pm_ops pci_pm_ops = {
-	.resume = fsl_pci_resume,
-};
-
-#define PCI_PM_OPS (&pci_pm_ops)
-
-#else
-
-#define PCI_PM_OPS NULL
-
-#endif
-
-static struct platform_driver fsl_pci_driver = {
-	.driver = {
-		.name = "fsl-pci",
-		.pm = PCI_PM_OPS,
-		.of_match_table = pci_ids,
-	},
-	.probe = fsl_pci_probe,
-};
-
-static int __init fsl_pci_init(void)
-{
-	return platform_driver_register(&fsl_pci_driver);
-}
-arch_initcall(fsl_pci_init);
 #endif
diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h
index 8d455df..ce77aad 100644
--- a/arch/powerpc/sysdev/fsl_pci.h
+++ b/arch/powerpc/sysdev/fsl_pci.h
@@ -21,95 +21,6 @@ struct platform_device;
 #define PCI_FSL_BRR1      0xbf8
 #define PCI_FSL_BRR1_VER 0xffff
 
-#define PCIE_LTSSM	0x0404		/* PCIE Link Training and Status */
-#define PCIE_LTSSM_L0	0x16		/* L0 state */
-#define PCIE_IP_REV_2_2		0x02080202 /* PCIE IP block version Rev2.2 */
-#define PCIE_IP_REV_3_0		0x02080300 /* PCIE IP block version Rev3.0 */
-#define PIWAR_EN		0x80000000	/* Enable */
-#define PIWAR_PF		0x20000000	/* prefetch */
-#define PIWAR_TGI_LOCAL		0x00f00000	/* target - local memory */
-#define PIWAR_READ_SNOOP	0x00050000
-#define PIWAR_WRITE_SNOOP	0x00005000
-#define PIWAR_SZ_MASK          0x0000003f
-
-/* PCI/PCI Express outbound window reg */
-struct pci_outbound_window_regs {
-	__be32	potar;	/* 0x.0 - Outbound translation address register */
-	__be32	potear;	/* 0x.4 - Outbound translation extended address register */
-	__be32	powbar;	/* 0x.8 - Outbound window base address register */
-	u8	res1[4];
-	__be32	powar;	/* 0x.10 - Outbound window attributes register */
-	u8	res2[12];
-};
-
-/* PCI/PCI Express inbound window reg */
-struct pci_inbound_window_regs {
-	__be32	pitar;	/* 0x.0 - Inbound translation address register */
-	u8	res1[4];
-	__be32	piwbar;	/* 0x.8 - Inbound window base address register */
-	__be32	piwbear;	/* 0x.c - Inbound window base extended address register */
-	__be32	piwar;	/* 0x.10 - Inbound window attributes register */
-	u8	res2[12];
-};
-
-/* PCI/PCI Express IO block registers for 85xx/86xx */
-struct ccsr_pci {
-	__be32	config_addr;		/* 0x.000 - PCI/PCIE Configuration Address Register */
-	__be32	config_data;		/* 0x.004 - PCI/PCIE Configuration Data Register */
-	__be32	int_ack;		/* 0x.008 - PCI Interrupt Acknowledge Register */
-	__be32	pex_otb_cpl_tor;	/* 0x.00c - PCIE Outbound completion timeout register */
-	__be32	pex_conf_tor;		/* 0x.010 - PCIE configuration timeout register */
-	__be32	pex_config;		/* 0x.014 - PCIE CONFIG Register */
-	__be32	pex_int_status;		/* 0x.018 - PCIE interrupt status */
-	u8	res2[4];
-	__be32	pex_pme_mes_dr;		/* 0x.020 - PCIE PME and message detect register */
-	__be32	pex_pme_mes_disr;	/* 0x.024 - PCIE PME and message disable register */
-	__be32	pex_pme_mes_ier;	/* 0x.028 - PCIE PME and message interrupt enable register */
-	__be32	pex_pmcr;		/* 0x.02c - PCIE power management command register */
-	u8	res3[3016];
-	__be32	block_rev1;	/* 0x.bf8 - PCIE Block Revision register 1 */
-	__be32	block_rev2;	/* 0x.bfc - PCIE Block Revision register 2 */
-
-/* PCI/PCI Express outbound window 0-4
- * Window 0 is the default window and is the only window enabled upon reset.
- * The default outbound register set is used when a transaction misses
- * in all of the other outbound windows.
- */
-	struct pci_outbound_window_regs pow[5];
-	u8	res14[96];
-	struct pci_inbound_window_regs	pmit;	/* 0xd00 - 0xd9c Inbound MSI */
-	u8	res6[96];
-/* PCI/PCI Express inbound window 3-0
- * inbound window 1 supports only a 32-bit base address and does not
- * define an inbound window base extended address register.
- */
-	struct pci_inbound_window_regs piw[4];
-
-	__be32	pex_err_dr;		/* 0x.e00 - PCI/PCIE error detect register */
-	u8	res21[4];
-	__be32	pex_err_en;		/* 0x.e08 - PCI/PCIE error interrupt enable register */
-	u8	res22[4];
-	__be32	pex_err_disr;		/* 0x.e10 - PCI/PCIE error disable register */
-	u8	res23[12];
-	__be32	pex_err_cap_stat;	/* 0x.e20 - PCI/PCIE error capture status register */
-	u8	res24[4];
-	__be32	pex_err_cap_r0;		/* 0x.e28 - PCIE error capture register 0 */
-	__be32	pex_err_cap_r1;		/* 0x.e2c - PCIE error capture register 0 */
-	__be32	pex_err_cap_r2;		/* 0x.e30 - PCIE error capture register 0 */
-	__be32	pex_err_cap_r3;		/* 0x.e34 - PCIE error capture register 0 */
-	u8	res_e38[200];
-	__be32	pdb_stat;		/* 0x.f00 - PCIE Debug Status */
-	u8	res_f04[16];
-	__be32	pex_csr0;		/* 0x.f14 - PEX Control/Status register 0*/
-#define PEX_CSR0_LTSSM_MASK	0xFC
-#define PEX_CSR0_LTSSM_SHIFT	2
-#define PEX_CSR0_LTSSM_L0	0x11
-	__be32	pex_csr1;		/* 0x.f18 - PEX Control/Status register 1*/
-	u8	res_f1c[228];
-
-};
-
-extern int fsl_add_bridge(struct platform_device *pdev, int is_primary);
 extern void fsl_pcibios_fixup_bus(struct pci_bus *bus);
 extern int mpc83xx_add_bridge(struct device_node *dev);
 u64 fsl_pci_immrbar_base(struct pci_controller *hose);
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/drivers/pci/host/pci-fsl-common.c
similarity index 54%
copy from arch/powerpc/sysdev/fsl_pci.c
copy to drivers/pci/host/pci-fsl-common.c
index ccfb50d..69d338b 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/drivers/pci/host/pci-fsl-common.c
@@ -1,5 +1,5 @@
 /*
- * MPC83xx/85xx/86xx PCI/PCIE support routing.
+ * 85xx/86xx/LS PCI/PCIE support routing.
  *
  * Copyright 2007-2012 Freescale Semiconductor, Inc.
  * Copyright 2008-2009 MontaVista Software, Inc.
@@ -8,9 +8,6 @@
  * Recode: ZHANG WEI <wei.zhang@freescale.com>
  * Rewrite the routing for Frescale PCI and PCI Express
  * 	Roy Zang <tie-fei.zang@freescale.com>
- * MPC83xx PCI-Express support:
- * 	Tony Li <tony.li@freescale.com>
- * 	Anton Vorontsov <avorontsov@ru.mvista.com>
  *
  * This program is free software; you can redistribute  it and/or modify it
  * under  the terms of  the GNU General  Public License as published by the
@@ -38,29 +35,6 @@
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
 
-static int fsl_pcie_bus_fixup, is_mpc83xx_pci;
-
-static void quirk_fsl_pcie_header(struct pci_dev *dev)
-{
-	u8 hdr_type;
-
-	/* if we aren't a PCIe don't bother */
-	if (!pci_find_capability(dev, PCI_CAP_ID_EXP))
-		return;
-
-	/* if we aren't in host mode don't bother */
-	pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type);
-	if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
-		return;
-
-	dev->class = PCI_CLASS_BRIDGE_PCI << 8;
-	fsl_pcie_bus_fixup = 1;
-	return;
-}
-
-static int fsl_indirect_read_config(struct pci_bus *, unsigned int,
-				    int, int, u32 *);
-
 static int fsl_pcie_check_link(struct pci_controller *hose)
 {
 	u32 val = 0;
@@ -109,29 +83,6 @@ static struct pci_ops fsl_indirect_pcie_ops =
 	.write = indirect_write_config,
 };
 
-#define MAX_PHYS_ADDR_BITS	40
-static u64 pci64_dma_offset = 1ull << MAX_PHYS_ADDR_BITS;
-
-static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-		return -EIO;
-
-	/*
-	 * Fixup PCI devices that are able to DMA to above the physical
-	 * address width of the SoC such that we can address any internal
-	 * SoC address from across PCI if needed
-	 */
-	if ((dev->bus == &pci_bus_type) &&
-	    dma_mask >= DMA_BIT_MASK(MAX_PHYS_ADDR_BITS)) {
-		set_dma_ops(dev, &dma_direct_ops);
-		set_dma_offset(dev, pci64_dma_offset);
-	}
-
-	*dev->dma_mask = dma_mask;
-	return 0;
-}
-
 static int setup_one_atmu(struct ccsr_pci __iomem *pci,
 	unsigned int index, const struct resource *res,
 	resource_size_t offset)
@@ -417,43 +368,6 @@ static void __init setup_pci_cmd(struct pci_controller *hose)
 	}
 }
 
-void fsl_pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	int i, is_pcie = 0, no_link;
-
-	/* The root complex bridge comes up with bogus resources,
-	 * we copy the PHB ones in.
-	 *
-	 * With the current generic PCI code, the PHB bus no longer
-	 * has bus->resource[0..4] set, so things are a bit more
-	 * tricky.
-	 */
-
-	if (fsl_pcie_bus_fixup)
-		is_pcie = early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP);
-	no_link = !!(hose->indirect_type & PPC_INDIRECT_TYPE_NO_PCIE_LINK);
-
-	if (bus->parent == hose->bus && (is_pcie || no_link)) {
-		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; ++i) {
-			struct resource *res = bus->resource[i];
-			struct resource *par;
-
-			if (!res)
-				continue;
-			if (i == 0)
-				par = &hose->io_resource;
-			else if (i < 4)
-				par = &hose->mem_resources[i-1];
-			else par = NULL;
-
-			res->start = par ? par->start : 0;
-			res->end   = par ? par->end   : 0;
-			res->flags = par ? par->flags : 0;
-		}
-	}
-}
-
 int __init fsl_add_bridge(struct platform_device *pdev, int is_primary)
 {
 	int len;
@@ -560,475 +474,7 @@ no_bridge:
 	pcibios_free_controller(hose);
 	return -ENODEV;
 }
-#endif /* CONFIG_FSL_SOC_BOOKE || CONFIG_PPC_86xx */
-
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, quirk_fsl_pcie_header);
-
-#if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x)
-struct mpc83xx_pcie_priv {
-	void __iomem *cfg_type0;
-	void __iomem *cfg_type1;
-	u32 dev_base;
-};
-
-struct pex_inbound_window {
-	u32 ar;
-	u32 tar;
-	u32 barl;
-	u32 barh;
-};
-
-/*
- * With the convention of u-boot, the PCIE outbound window 0 serves
- * as configuration transactions outbound.
- */
-#define PEX_OUTWIN0_BAR		0xCA4
-#define PEX_OUTWIN0_TAL		0xCA8
-#define PEX_OUTWIN0_TAH		0xCAC
-#define PEX_RC_INWIN_BASE	0xE60
-#define PEX_RCIWARn_EN		0x1
-
-static int mpc83xx_pcie_exclude_device(struct pci_bus *bus, unsigned int devfn)
-{
-	struct pci_controller *hose = pci_bus_to_host(bus);
-
-	if (hose->indirect_type & PPC_INDIRECT_TYPE_NO_PCIE_LINK)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-	/*
-	 * Workaround for the HW bug: for Type 0 configure transactions the
-	 * PCI-E controller does not check the device number bits and just
-	 * assumes that the device number bits are 0.
-	 */
-	if (bus->number == hose->first_busno ||
-			bus->primary == hose->first_busno) {
-		if (devfn & 0xf8)
-			return PCIBIOS_DEVICE_NOT_FOUND;
-	}
-
-	if (ppc_md.pci_exclude_device) {
-		if (ppc_md.pci_exclude_device(hose, bus->number, devfn))
-			return PCIBIOS_DEVICE_NOT_FOUND;
-	}
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static void __iomem *mpc83xx_pcie_remap_cfg(struct pci_bus *bus,
-					    unsigned int devfn, int offset)
-{
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct mpc83xx_pcie_priv *pcie = hose->dn->data;
-	u32 dev_base = bus->number << 24 | devfn << 16;
-	int ret;
-
-	ret = mpc83xx_pcie_exclude_device(bus, devfn);
-	if (ret)
-		return NULL;
-
-	offset &= 0xfff;
-
-	/* Type 0 */
-	if (bus->number == hose->first_busno)
-		return pcie->cfg_type0 + offset;
-
-	if (pcie->dev_base == dev_base)
-		goto mapped;
-
-	out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAL, dev_base);
-
-	pcie->dev_base = dev_base;
-mapped:
-	return pcie->cfg_type1 + offset;
-}
-
-static int mpc83xx_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
-				    int offset, int len, u32 *val)
-{
-	void __iomem *cfg_addr;
-
-	cfg_addr = mpc83xx_pcie_remap_cfg(bus, devfn, offset);
-	if (!cfg_addr)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	switch (len) {
-	case 1:
-		*val = in_8(cfg_addr);
-		break;
-	case 2:
-		*val = in_le16(cfg_addr);
-		break;
-	default:
-		*val = in_le32(cfg_addr);
-		break;
-	}
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static int mpc83xx_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
-				     int offset, int len, u32 val)
-{
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	void __iomem *cfg_addr;
-
-	cfg_addr = mpc83xx_pcie_remap_cfg(bus, devfn, offset);
-	if (!cfg_addr)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	/* PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS */
-	if (offset == PCI_PRIMARY_BUS && bus->number == hose->first_busno)
-		val &= 0xffffff00;
-
-	switch (len) {
-	case 1:
-		out_8(cfg_addr, val);
-		break;
-	case 2:
-		out_le16(cfg_addr, val);
-		break;
-	default:
-		out_le32(cfg_addr, val);
-		break;
-	}
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static struct pci_ops mpc83xx_pcie_ops = {
-	.read = mpc83xx_pcie_read_config,
-	.write = mpc83xx_pcie_write_config,
-};
-
-static int __init mpc83xx_pcie_setup(struct pci_controller *hose,
-				     struct resource *reg)
-{
-	struct mpc83xx_pcie_priv *pcie;
-	u32 cfg_bar;
-	int ret = -ENOMEM;
-
-	pcie = zalloc_maybe_bootmem(sizeof(*pcie), GFP_KERNEL);
-	if (!pcie)
-		return ret;
-
-	pcie->cfg_type0 = ioremap(reg->start, resource_size(reg));
-	if (!pcie->cfg_type0)
-		goto err0;
-
-	cfg_bar = in_le32(pcie->cfg_type0 + PEX_OUTWIN0_BAR);
-	if (!cfg_bar) {
-		/* PCI-E isn't configured. */
-		ret = -ENODEV;
-		goto err1;
-	}
-
-	pcie->cfg_type1 = ioremap(cfg_bar, 0x1000);
-	if (!pcie->cfg_type1)
-		goto err1;
-
-	WARN_ON(hose->dn->data);
-	hose->dn->data = pcie;
-	hose->ops = &mpc83xx_pcie_ops;
-	hose->indirect_type |= PPC_INDIRECT_TYPE_FSL_CFG_REG_LINK;
-
-	out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAH, 0);
-	out_le32(pcie->cfg_type0 + PEX_OUTWIN0_TAL, 0);
-
-	if (fsl_pcie_check_link(hose))
-		hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
-
-	return 0;
-err1:
-	iounmap(pcie->cfg_type0);
-err0:
-	kfree(pcie);
-	return ret;
-
-}
-
-int __init mpc83xx_add_bridge(struct device_node *dev)
-{
-	int ret;
-	int len;
-	struct pci_controller *hose;
-	struct resource rsrc_reg;
-	struct resource rsrc_cfg;
-	const int *bus_range;
-	int primary;
-
-	is_mpc83xx_pci = 1;
-
-	if (!of_device_is_available(dev)) {
-		pr_warning("%s: disabled by the firmware.\n",
-			   dev->full_name);
-		return -ENODEV;
-	}
-	pr_debug("Adding PCI host bridge %s\n", dev->full_name);
-
-	/* Fetch host bridge registers address */
-	if (of_address_to_resource(dev, 0, &rsrc_reg)) {
-		printk(KERN_WARNING "Can't get pci register base!\n");
-		return -ENOMEM;
-	}
-
-	memset(&rsrc_cfg, 0, sizeof(rsrc_cfg));
-
-	if (of_address_to_resource(dev, 1, &rsrc_cfg)) {
-		printk(KERN_WARNING
-			"No pci config register base in dev tree, "
-			"using default\n");
-		/*
-		 * MPC83xx supports up to two host controllers
-		 * 	one at 0x8500 has config space registers at 0x8300
-		 * 	one at 0x8600 has config space registers at 0x8380
-		 */
-		if ((rsrc_reg.start & 0xfffff) == 0x8500)
-			rsrc_cfg.start = (rsrc_reg.start & 0xfff00000) + 0x8300;
-		else if ((rsrc_reg.start & 0xfffff) == 0x8600)
-			rsrc_cfg.start = (rsrc_reg.start & 0xfff00000) + 0x8380;
-	}
-	/*
-	 * Controller at offset 0x8500 is primary
-	 */
-	if ((rsrc_reg.start & 0xfffff) == 0x8500)
-		primary = 1;
-	else
-		primary = 0;
-
-	/* Get bus range if any */
-	bus_range = of_get_property(dev, "bus-range", &len);
-	if (bus_range == NULL || len < 2 * sizeof(int)) {
-		printk(KERN_WARNING "Can't get bus-range for %s, assume"
-		       " bus 0\n", dev->full_name);
-	}
-
-	pci_add_flags(PCI_REASSIGN_ALL_BUS);
-	hose = pcibios_alloc_controller(dev);
-	if (!hose)
-		return -ENOMEM;
-
-	hose->first_busno = bus_range ? bus_range[0] : 0;
-	hose->last_busno = bus_range ? bus_range[1] : 0xff;
-
-	if (of_device_is_compatible(dev, "fsl,mpc8314-pcie")) {
-		ret = mpc83xx_pcie_setup(hose, &rsrc_reg);
-		if (ret)
-			goto err0;
-	} else {
-		setup_indirect_pci(hose, rsrc_cfg.start,
-				   rsrc_cfg.start + 4, 0);
-	}
-
-	printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "
-	       "Firmware bus number: %d->%d\n",
-	       (unsigned long long)rsrc_reg.start, hose->first_busno,
-	       hose->last_busno);
-
-	pr_debug(" ->Hose at 0x%p, cfg_addr=0x%p,cfg_data=0x%p\n",
-	    hose, hose->cfg_addr, hose->cfg_data);
-
-	/* Interpret the "ranges" property */
-	/* This also maps the I/O region and sets isa_io/mem_base */
-	pci_process_bridge_OF_ranges(hose, dev, primary);
-
-	return 0;
-err0:
-	pcibios_free_controller(hose);
-	return ret;
-}
-#endif /* CONFIG_PPC_83xx */
-
-u64 fsl_pci_immrbar_base(struct pci_controller *hose)
-{
-#ifdef CONFIG_PPC_83xx
-	if (is_mpc83xx_pci) {
-		struct mpc83xx_pcie_priv *pcie = hose->dn->data;
-		struct pex_inbound_window *in;
-		int i;
-
-		/* Walk the Root Complex Inbound windows to match IMMR base */
-		in = pcie->cfg_type0 + PEX_RC_INWIN_BASE;
-		for (i = 0; i < 4; i++) {
-			/* not enabled, skip */
-			if (!in_le32(&in[i].ar) & PEX_RCIWARn_EN)
-				 continue;
-
-			if (get_immrbase() == in_le32(&in[i].tar))
-				return (u64)in_le32(&in[i].barh) << 32 |
-					    in_le32(&in[i].barl);
-		}
-
-		printk(KERN_WARNING "could not find PCI BAR matching IMMR\n");
-	}
-#endif
-
-#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
-	if (!is_mpc83xx_pci) {
-		u32 base;
-
-		pci_bus_read_config_dword(hose->bus,
-			PCI_DEVFN(0, 0), PCI_BASE_ADDRESS_0, &base);
-		return base;
-	}
-#endif
-
-	return 0;
-}
 
-#ifdef CONFIG_E500
-static int mcheck_handle_load(struct pt_regs *regs, u32 inst)
-{
-	unsigned int rd, ra, rb, d;
-
-	rd = get_rt(inst);
-	ra = get_ra(inst);
-	rb = get_rb(inst);
-	d = get_d(inst);
-
-	switch (get_op(inst)) {
-	case 31:
-		switch (get_xop(inst)) {
-		case OP_31_XOP_LWZX:
-		case OP_31_XOP_LWBRX:
-			regs->gpr[rd] = 0xffffffff;
-			break;
-
-		case OP_31_XOP_LWZUX:
-			regs->gpr[rd] = 0xffffffff;
-			regs->gpr[ra] += regs->gpr[rb];
-			break;
-
-		case OP_31_XOP_LBZX:
-			regs->gpr[rd] = 0xff;
-			break;
-
-		case OP_31_XOP_LBZUX:
-			regs->gpr[rd] = 0xff;
-			regs->gpr[ra] += regs->gpr[rb];
-			break;
-
-		case OP_31_XOP_LHZX:
-		case OP_31_XOP_LHBRX:
-			regs->gpr[rd] = 0xffff;
-			break;
-
-		case OP_31_XOP_LHZUX:
-			regs->gpr[rd] = 0xffff;
-			regs->gpr[ra] += regs->gpr[rb];
-			break;
-
-		case OP_31_XOP_LHAX:
-			regs->gpr[rd] = ~0UL;
-			break;
-
-		case OP_31_XOP_LHAUX:
-			regs->gpr[rd] = ~0UL;
-			regs->gpr[ra] += regs->gpr[rb];
-			break;
-
-		default:
-			return 0;
-		}
-		break;
-
-	case OP_LWZ:
-		regs->gpr[rd] = 0xffffffff;
-		break;
-
-	case OP_LWZU:
-		regs->gpr[rd] = 0xffffffff;
-		regs->gpr[ra] += (s16)d;
-		break;
-
-	case OP_LBZ:
-		regs->gpr[rd] = 0xff;
-		break;
-
-	case OP_LBZU:
-		regs->gpr[rd] = 0xff;
-		regs->gpr[ra] += (s16)d;
-		break;
-
-	case OP_LHZ:
-		regs->gpr[rd] = 0xffff;
-		break;
-
-	case OP_LHZU:
-		regs->gpr[rd] = 0xffff;
-		regs->gpr[ra] += (s16)d;
-		break;
-
-	case OP_LHA:
-		regs->gpr[rd] = ~0UL;
-		break;
-
-	case OP_LHAU:
-		regs->gpr[rd] = ~0UL;
-		regs->gpr[ra] += (s16)d;
-		break;
-
-	default:
-		return 0;
-	}
-
-	return 1;
-}
-
-static int is_in_pci_mem_space(phys_addr_t addr)
-{
-	struct pci_controller *hose;
-	struct resource *res;
-	int i;
-
-	list_for_each_entry(hose, &hose_list, list_node) {
-		if (!(hose->indirect_type & PPC_INDIRECT_TYPE_EXT_REG))
-			continue;
-
-		for (i = 0; i < 3; i++) {
-			res = &hose->mem_resources[i];
-			if ((res->flags & IORESOURCE_MEM) &&
-				addr >= res->start && addr <= res->end)
-				return 1;
-		}
-	}
-	return 0;
-}
-
-int fsl_pci_mcheck_exception(struct pt_regs *regs)
-{
-	u32 inst;
-	int ret;
-	phys_addr_t addr = 0;
-
-	/* Let KVM/QEMU deal with the exception */
-	if (regs->msr & MSR_GS)
-		return 0;
-
-#ifdef CONFIG_PHYS_64BIT
-	addr = mfspr(SPRN_MCARU);
-	addr <<= 32;
-#endif
-	addr += mfspr(SPRN_MCAR);
-
-	if (is_in_pci_mem_space(addr)) {
-		if (user_mode(regs)) {
-			pagefault_disable();
-			ret = get_user(regs->nip, &inst);
-			pagefault_enable();
-		} else {
-			ret = probe_kernel_address(regs->nip, inst);
-		}
-
-		if (mcheck_handle_load(regs, inst)) {
-			regs->nip += 4;
-			return 1;
-		}
-	}
-
-	return 0;
-}
-#endif
-
-#if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx)
 static const struct of_device_id pci_ids[] = {
 	{ .compatible = "fsl,mpc8540-pci", },
 	{ .compatible = "fsl,mpc8548-pcie", },
@@ -1050,40 +496,6 @@ static const struct of_device_id pci_ids[] = {
 	{},
 };
 
-struct device_node *fsl_pci_primary;
-
-void fsl_pci_assign_primary(void)
-{
-	struct device_node *np;
-
-	/* Callers can specify the primary bus using other means. */
-	if (fsl_pci_primary)
-		return;
-
-	/* If a PCI host bridge contains an ISA node, it's primary. */
-	np = of_find_node_by_type(NULL, "isa");
-	while ((fsl_pci_primary = of_get_parent(np))) {
-		of_node_put(np);
-		np = fsl_pci_primary;
-
-		if (of_match_node(pci_ids, np) && of_device_is_available(np))
-			return;
-	}
-
-	/*
-	 * If there's no PCI host bridge with ISA, arbitrarily
-	 * designate one as primary.  This can go away once
-	 * various bugs with primary-less systems are fixed.
-	 */
-	for_each_matching_node(np, pci_ids) {
-		if (of_device_is_available(np)) {
-			fsl_pci_primary = np;
-			of_node_put(np);
-			return;
-		}
-	}
-}
-
 static int fsl_pci_probe(struct platform_device *pdev)
 {
 	int ret;
@@ -1143,4 +555,3 @@ static int __init fsl_pci_init(void)
 	return platform_driver_register(&fsl_pci_driver);
 }
 arch_initcall(fsl_pci_init);
-#endif
diff --git a/arch/powerpc/sysdev/fsl_pci.h b/include/linux/fsl/pci-common.h
similarity index 79%
copy from arch/powerpc/sysdev/fsl_pci.h
copy to include/linux/fsl/pci-common.h
index 8d455df..5e4f683 100644
--- a/arch/powerpc/sysdev/fsl_pci.h
+++ b/include/linux/fsl/pci-common.h
@@ -1,5 +1,5 @@
 /*
- * MPC85xx/86xx PCI Express structure define
+ * MPC85xx/86xx/LS PCI Express structure define
  *
  * Copyright 2007,2011 Freescale Semiconductor, Inc
  *
@@ -11,15 +11,8 @@
  */
 
 #ifdef __KERNEL__
-#ifndef __POWERPC_FSL_PCI_H
-#define __POWERPC_FSL_PCI_H
-
-struct platform_device;
-
-
-/* FSL PCI controller BRR1 register */
-#define PCI_FSL_BRR1      0xbf8
-#define PCI_FSL_BRR1_VER 0xffff
+#ifndef __PCI_COMMON_H
+#define __PCI_COMMON_H
 
 #define PCIE_LTSSM	0x0404		/* PCIE Link Training and Status */
 #define PCIE_LTSSM_L0	0x16		/* L0 state */
@@ -52,7 +45,7 @@ struct pci_inbound_window_regs {
 	u8	res2[12];
 };
 
-/* PCI/PCI Express IO block registers for 85xx/86xx */
+/* PCI/PCI Express IO block registers for 85xx/86xx/LS */
 struct ccsr_pci {
 	__be32	config_addr;		/* 0x.000 - PCI/PCIE Configuration Address Register */
 	__be32	config_data;		/* 0x.004 - PCI/PCIE Configuration Data Register */
@@ -109,33 +102,5 @@ struct ccsr_pci {
 
 };
 
-extern int fsl_add_bridge(struct platform_device *pdev, int is_primary);
-extern void fsl_pcibios_fixup_bus(struct pci_bus *bus);
-extern int mpc83xx_add_bridge(struct device_node *dev);
-u64 fsl_pci_immrbar_base(struct pci_controller *hose);
-
-extern struct device_node *fsl_pci_primary;
-
-#ifdef CONFIG_PCI
-void fsl_pci_assign_primary(void);
-#else
-static inline void fsl_pci_assign_primary(void) {}
-#endif
-
-#ifdef CONFIG_EDAC_MPC85XX
-int mpc85xx_pci_err_probe(struct platform_device *op);
-#else
-static inline int mpc85xx_pci_err_probe(struct platform_device *op)
-{
-	return -ENOTSUPP;
-}
-#endif
-
-#ifdef CONFIG_FSL_PCI
-extern int fsl_pci_mcheck_exception(struct pt_regs *);
-#else
-static inline int fsl_pci_mcheck_exception(struct pt_regs *regs) {return 0; }
-#endif
-
-#endif /* __POWERPC_FSL_PCI_H */
+#endif /* __PCI_COMMON_H */
 #endif /* __KERNEL__ */
-- 
1.8.1.2

^ permalink raw reply related

* Re: powerpc: Don't corrupt user registers on 32-bit
From: Scott Wood @ 2013-10-23 10:20 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Alexander Graf, linuxppc-dev
In-Reply-To: <20131023084002.GA8325@iris.ozlabs.ibm.com>

On Wed, Oct 23, 2013 at 09:40:02AM +0100, Paul Mackerras wrote:
> Commit de79f7b9f6 ("powerpc: Put FP/VSX and VR state into structures")
> modified load_up_fpu() and load_up_altivec() in such a way that they
> now use r7 and r8.  Unfortunately, the callers of these functions on
> 32-bit machines then return to userspace via fast_exception_return,
> which doesn't restore all of the volatile GPRs, but only r1, r3 -- r6
> and r9 -- r12.  This was causing userspace segfaults and other
> userspace misbehaviour on 32-bit machines.
> 
> This fixes the problem by changing the register usage of load_up_fpu()
> and load_up_altivec() to avoid using r7 and r8 and instead use r6 and
> r10.  This also adds comments to those functions saying which registers
> may be used.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> 
> ---
> arch/powerpc/kernel/fpu.S    | 14 ++++++++------
>  arch/powerpc/kernel/vector.S | 15 +++++++++------
>  2 files changed, 17 insertions(+), 12 deletions(-)

Tested-by: Scott Wood <scottwood@freescale.com> (on e500mc, so no altivec)

-Scott

^ permalink raw reply

* Re: [PATCH] [RFC] Emulate "lwsync" to run standard user land on e500 cores
From: Scott Wood @ 2013-10-23 10:15 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev, Wolfgang Denk
In-Reply-To: <39CCEB38-1D9B-4918-B8F4-148D4E90FE21@kernel.crashing.org>

On Wed, 2013-10-23 at 00:07 -0500, Kumar Gala wrote:
> On Oct 18, 2013, at 2:38 AM, Wolfgang Denk wrote:
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> > index f783c93..f330374 100644
> > --- a/arch/powerpc/kernel/traps.c
> > +++ b/arch/powerpc/kernel/traps.c
> > @@ -986,6 +986,13 @@ static int emulate_instruction(struct pt_regs *regs)
> > 		return 0;
> > 	}
> > 
> > +	/* Emulating the lwsync insn as a sync insn */
> > +	if (instword == PPC_INST_LWSYNC) {
> > +		PPC_WARN_EMULATED(lwsync, regs);
> > +		asm volatile("sync" : : : "memory");
> 
> Do we really need the inline asm?  Doesn't the fact of just taking an exception and returning from it equate to a sync.

No, it doesn't equate to a sync.  See the discussion here:
http://patchwork.ozlabs.org/patch/256747/

-Scott

^ permalink raw reply

* Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group
From: Preeti U Murthy @ 2013-10-23  9:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Michael Neuling, Mike Galbraith, linuxppc-dev, linux-kernel,
	Anton Blanchard, Paul Turner, Ingo Molnar
In-Reply-To: <20131022221138.GJ2490@laptop.programming.kicks-ass.net>

Hi Peter

On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
> This nohz stuff really needs to be re-thought and made more scalable --
> its a royal pain :/

Why  not do something like the below instead? It does the following.

This patch introduces sd_busy just like your suggested patch, except that
it points to the parent of the highest level sched domain which has the
SD_SHARE_PKG_RESOURCES set and initializes it in update_top_cache_domain(). 
This is the sched domain that is relevant in nohz_kick_needed().

sd_set_sd_state_busy(), sd_set_sd_state_idle() and nohz_kick_needed() query
and update *only* this sched domain(sd_busy) for nr_busy_cpus. They are the
only users of this parameter. While we are at it, we might as well change
the nohz_idle parameter to be updated at the sd_busy domain level alone and
not the base domain level of a CPU. This will unify the concept of busy cpus
at just one level of sched domain.

There is no need to iterate through all levels of sched domains of a cpu to
update nr_busy_cpus since it is irrelevant at all other sched domains except
at sd_busy level.

De-couple asymmetric load balancing from the nr_busy parameter which the
PATCH 2/3 anyway does. sd_busy therefore is irrelevant for asymmetric load
balancing.

Regards
Preeti U Murthy
--------------------START_PATCH-------------------------------

sched: Fix nohz_kick_needed()

---
 kernel/sched/core.c  |    4 ++++
 kernel/sched/fair.c  |   40 ++++++++++++++++++++++------------------
 kernel/sched/sched.h |    1 +
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..c1dd11c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5290,6 +5291,9 @@ static void update_top_cache_domain(int cpu)
 
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES)->parent;
+	rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 813dd61..71e6f14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
 	struct sched_domain *sd;
+	int cpu = smp_processor_id();
 
 	rcu_read_lock();
-	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+	sd = per_cpu(sd_busy, cpu);
 
 	if (!sd || !sd->nohz_idle)
 		goto unlock;
 	sd->nohz_idle = 0;
 
-	for (; sd; sd = sd->parent)
-		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@ unlock:
 void set_cpu_sd_state_idle(void)
 {
 	struct sched_domain *sd;
+	int cpu = smp_processor_id();
 
 	rcu_read_lock();
-	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+	sd = per_cpu(sd_busy, cpu);
 
 	if (!sd || sd->nohz_idle)
 		goto unlock;
 	sd->nohz_idle = 1;
 
-	for (; sd; sd = sd->parent)
-		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -6748,6 +6748,9 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
+	struct sched_group *sg;
+	struct sched_group_power *sgp;
+	int nr_busy;
 
 	if (unlikely(idle_cpu(cpu)))
 		return 0;
@@ -6773,22 +6776,23 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 		goto need_kick;
 
 	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		struct sched_group *sg = sd->groups;
-		struct sched_group_power *sgp = sg->sgp;
-		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+	sd = per_cpu(sd_busy, cpu);
 
-		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-			goto need_kick_unlock;
+	if (sd) {
+		sg = sd->groups;
+		sgp = sg->sgp;
+		nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
-		    && (cpumask_first_and(nohz.idle_cpus_mask,
-					  sched_domain_span(sd)) < cpu))
+		if (nr_busy > 1)
 			goto need_kick_unlock;
-
-		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
-			break;
 	}
+
+	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+
+	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+				  sched_domain_span(sd)) < cpu))
+		goto need_kick_unlock;
+
 	rcu_read_unlock();
 	return 0;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffc7087..0f1253f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -623,6 +623,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 
 struct sched_group_power {
 	atomic_t ref;

^ permalink raw reply related

* [v6][PATCH 5/5] powerpc/book3e/kgdb: Fix a single stgep case of lazy IRQ
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382520685-11609-1-git-send-email-tiejun.chen@windriver.com>

In lazy EE magic, we may have a lazy interrupt occured while
entering kgdb, but we really don't want to replay that interrupt
for kgdb, so we have to clear the PACA_IRQ_HARD_DIS force to
make sure we can exit directly from this debug exception.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/kgdb.c |    8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 447c14b..9872f58 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -185,6 +185,14 @@ static int kgdb_singlestep(struct pt_regs *regs)
 		/* Restore current_thread_info lastly. */
 		memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
 
+#ifdef CONFIG_PPC64
+	/*
+	 * Clear the PACA_IRQ_HARD_DIS from the pending mask
+	 * since we are about to exit this directly from debug
+	 * exception without any replay interrupt in lazy EE case.
+	 */
+	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+#endif
 	return 1;
 }
 
-- 
1.7.9.5

^ permalink raw reply related

* [v6][PATCH 4/5] powerpc/kgdb: use DEFINE_PER_CPU to allocate kgdb's thread_info
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382520685-11609-1-git-send-email-tiejun.chen@windriver.com>

Use DEFINE_PER_CPU to allocate thread_info statically instead of kmalloc().
This can avoid introducing more memory check codes.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/kgdb.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index c1eef24..447c14b 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -151,15 +151,15 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
 	return 1;
 }
 
+static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
 	struct thread_info *thread_info, *exception_thread_info;
-	struct thread_info *backup_current_thread_info;
+	struct thread_info *backup_current_thread_info = &__get_cpu_var(kgdb_thread_info);
 
 	if (user_mode(regs))
 		return 0;
 
-	backup_current_thread_info = kmalloc(sizeof(struct thread_info), GFP_KERNEL);
 	/*
 	 * On Book E and perhaps other processors, singlestep is handled on
 	 * the critical exception stack.  This causes current_thread_info()
@@ -185,7 +185,6 @@ static int kgdb_singlestep(struct pt_regs *regs)
 		/* Restore current_thread_info lastly. */
 		memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
 
-	kfree(backup_current_thread_info);
 	return 1;
 }
 
-- 
1.7.9.5

^ permalink raw reply related

* [v6][PATCH 3/5] powerpc/book3e: support kgdb for kernel space
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382520685-11609-1-git-send-email-tiejun.chen@windriver.com>

Currently we need to skip this for supporting KGDB.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/exceptions-64e.S |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index a55cf62..0b750c6 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -597,11 +597,13 @@ kernel_dbg_exc:
 	rfdi
 
 	/* Normal debug exception */
+1:	andi.	r14,r11,MSR_PR;		/* check for userspace again */
+#ifndef CONFIG_KGDB
 	/* XXX We only handle coming from userspace for now since we can't
 	 *     quite save properly an interrupted kernel state yet
 	 */
-1:	andi.	r14,r11,MSR_PR;		/* check for userspace again */
 	beq	kernel_dbg_exc;		/* if from kernel mode */
+#endif
 
 	/* Now we mash up things to make it look like we are coming on a
 	 * normal exception
-- 
1.7.9.5

^ permalink raw reply related

* [v6][PATCH 2/5] powerpc/book3e: store crit/mc/dbg exception thread info
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382520685-11609-1-git-send-email-tiejun.chen@windriver.com>

We need to store thread info to these exception thread info like something
we already did for PPC32.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/exceptions-64e.S |   22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 68d74b4..a55cf62 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -36,6 +36,19 @@
  */
 #define	SPECIAL_EXC_FRAME_SIZE	INT_FRAME_SIZE
 
+/* Now we only store something to exception thread info */
+#define	EXC_LEVEL_EXCEPTION_PROLOG(type)				\
+	ld	r14,PACAKSAVE(r13);					\
+	CURRENT_THREAD_INFO(r14, r14);					\
+	CURRENT_THREAD_INFO(r15, r1);					\
+	ld	r10,TI_FLAGS(r14);		     			\
+	std	r10,TI_FLAGS(r15);			     		\
+	ld	r10,TI_PREEMPT(r14);		     			\
+	std	r10,TI_PREEMPT(r15);		     			\
+	ld	r10,TI_TASK(r14);			     		\
+	std	r10,TI_TASK(r15);
+
+
 /* Exception prolog code for all exceptions */
 #define EXCEPTION_PROLOG(n, intnum, type, addition)	    		    \
 	mtspr	SPRN_SPRG_##type##_SCRATCH,r13;	/* get spare registers */   \
@@ -69,19 +82,22 @@
 
 #define CRIT_SET_KSTACK						            \
 	ld	r1,PACA_CRIT_STACK(r13);				    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(CRIT);
 #define SPRN_CRIT_SRR0	SPRN_CSRR0
 #define SPRN_CRIT_SRR1	SPRN_CSRR1
 
 #define DBG_SET_KSTACK						            \
 	ld	r1,PACA_DBG_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(DBG);
 #define SPRN_DBG_SRR0	SPRN_DSRR0
 #define SPRN_DBG_SRR1	SPRN_DSRR1
 
 #define MC_SET_KSTACK						            \
 	ld	r1,PACA_MC_STACK(r13);					    \
-	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;				    \
+	EXC_LEVEL_EXCEPTION_PROLOG(MC);
 #define SPRN_MC_SRR0	SPRN_MCSRR0
 #define SPRN_MC_SRR1	SPRN_MCSRR1
 
-- 
1.7.9.5

^ permalink raw reply related

* [v6][PATCH 1/5] powerpc/book3e: initialize crit/mc/dbg kernel stack pointers
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382520685-11609-1-git-send-email-tiejun.chen@windriver.com>

We already allocated critical/machine/debug check exceptions, but
we also should initialize those associated kernel stack pointers
for use by special exceptions in the PACA.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/setup_64.c |   18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 278ca93..5c96d92 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -526,14 +526,20 @@ static void __init exc_lvl_early_init(void)
 	extern unsigned int exc_debug_debug_book3e;
 
 	unsigned int i;
+	unsigned long sp;
 
 	for_each_possible_cpu(i) {
-		critirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
-		dbgirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
-		mcheckirq_ctx[i] = (struct thread_info *)
-			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		critirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].crit_kstack = __va(sp + THREAD_SIZE);
+
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		dbgirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].dbg_kstack = __va(sp + THREAD_SIZE);
+
+		sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+		mcheckirq_ctx[i] = (struct thread_info *)__va(sp);
+		paca[i].mc_kstack = __va(sp + THREAD_SIZE);
 	}
 
 	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
-- 
1.7.9.5

^ permalink raw reply related

* [v6][PATCH 0/5] powerpc/book3e: powerpc/book3e: make kgdb to work well
From: Tiejun Chen @ 2013-10-23  9:31 UTC (permalink / raw)
  To: scottwood; +Cc: linuxppc-dev, linux-kernel

Scott,

Tested on fsl-p5040 DS.

v6:

* rebase
* change the C code to initialize the exception stack addresses in the PACA instead.
* Clear the PACA_IRQ_HARD_DIS force to exit directly from this debug exception
  without replaying interrupt.
* so drop "book3e/kgdb: update thread's dbcr0".

v5:

* rebase on merge branch.

Note the original patch, [ATCH 5/7] kgdb/kgdbts: support ppc64, is already merged
by Jason.

v4:

* use DEFINE_PER_CPU to allocate kgdb's thread_info
* add patch 7 to make usre copy thread_info only !__check_irq_replay
* leave "andi.   r14,r11,MSR_PR" out of "#ifndef CONFIG_KGDB"
  since cr0 is still used lately.
* retest

v3:

* make work when enable CONFIG_RELOCATABLE
* fix one typo in patch,
  "powerpc/book3e: store critical/machine/debug exception thread info": 
	ld	r1,PACAKSAVE(r13);
    ->  ld	r14,PACAKSAVE(r13);
* remove copying the thread_info since booke and book3e always copy
  the thead_info now when we enter the debug exception, and so drop
  the v2 patch, "book3e/kgdb: Fix a single stgep case of lazy IRQ"

v2:

* Make sure we cover CONFIG_PPC_BOOK3E_64 safely
* Use LOAD_REG_IMMEDIATE() to load properly
	the value of the constant expression in load debug exception stack 
* Copy thread infor form the kernel stack coming from usr
* Rebase latest powerpc git tree

v1:

* Copy thread info only when we are from !user mode since we'll get kernel stack
  coming from usr directly.
* remove save/restore EX_R14/EX_R15 since DBG_EXCEPTION_PROLOG already covered
  this.
* use CURRENT_THREAD_INFO() conveniently to get thread.
* fix some typos
* add a patch to make sure gdb can generate a single step properly to invoke a
  kgdb state.
* add a patch to if we need to replay an interrupt, we shouldn't restore that
  previous backup thread info to make sure we can replay an interrupt lately
  with a proper thread info.
* rebase latest powerpc git tree

v0:

This patchset is used to support kgdb for book3e.

----------------------------------------------------------------
Tiejun Chen (5):
      powerpc/book3e: initialize crit/mc/dbg kernel stack pointers
      powerpc/book3e: store crit/mc/dbg exception thread info
      powerpc/book3e: support kgdb for kernel space
      powerpc/kgdb: use DEFINE_PER_CPU to allocate kgdb's thread_info
      powerpc/book3e/kgdb: Fix a single stgep case of lazy IRQ

 arch/powerpc/kernel/exceptions-64e.S |   26 ++++++++++++++++++++++----
 arch/powerpc/kernel/kgdb.c           |   13 ++++++++++---
 arch/powerpc/kernel/setup_64.c       |   18 ++++++++++++------
 3 files changed, 44 insertions(+), 13 deletions(-)

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 1/6] powerpc/book3e: load critical/machine/debug exception stack
From: "“tiejun.chen”" @ 2013-10-23  9:28 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382140519.7979.936.camel@snotra.buserror.net>

On 10/19/2013 07:55 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> We always alloc critical/machine/debug check exceptions. This is
>> different from the normal exception. So we should load these exception
>> stack properly like we did for booke.
>>
>> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
>> ---
>>   arch/powerpc/kernel/exceptions-64e.S |   49 +++++++++++++++++++++++++++++++---
>>   1 file changed, 46 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
>> index 4b23119..4d8e57f 100644
>> --- a/arch/powerpc/kernel/exceptions-64e.S
>> +++ b/arch/powerpc/kernel/exceptions-64e.S
>> @@ -36,6 +36,37 @@
>>    */
>>   #define	SPECIAL_EXC_FRAME_SIZE	INT_FRAME_SIZE
>>
>> +/* only on book3e */
>> +#define DBG_STACK_BASE		dbgirq_ctx
>> +#define MC_STACK_BASE		mcheckirq_ctx
>> +#define CRIT_STACK_BASE		critirq_ctx
>> +
>> +#ifdef CONFIG_RELOCATABLE
>> +#define LOAD_STACK_BASE(reg, level)			\
>> +	tovirt(r2,r2);					\
>> +	LOAD_REG_ADDR(reg, level##_STACK_BASE);
>
> Where does r2 come from here, where does it get used, and why do we need
> tovirt() on book3e?
>

As I remember this should be covered when we boot that capture kernel in 
kexec/kdump case.

Now this is also gone away after move forward the c code.

Thanks,

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 6/6] book3e/kgdb: Fix a single stgep case of lazy IRQ
From: "“tiejun.chen”" @ 2013-10-23  9:28 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382139147.7979.934.camel@snotra.buserror.net>

On 10/19/2013 07:32 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> When we're in kgdb_singlestep(), we have to work around to get
>> thread_info by copying from the kernel stack before calling
>> kgdb_handle_exception(), then copying it back afterwards.
>>
>> But for PPC64, we have a lazy interrupt implementation. So after
>> copying thread info frome kernle stack, if we need to replay an
>> interrupt, we shouldn't restore that previous backup thread info
>> to make sure we can replay an interrupt lately with a proper
>> thread info.
>
> Explain why copying it would be a problem.
>

This would be gone away in next version as well :)

Thanks,

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 4/6] powerpc/book3e: support kgdb for kernel space
From: "“tiejun.chen”" @ 2013-10-23  9:27 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382137119.7979.916.camel@snotra.buserror.net>

On 10/19/2013 06:58 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> Currently we need to skip this for supporting KGDB.
>
> Does it need to depend on CONFIG_KGDB?  Either you've fixed the "can't
> quite save properly" part, or you haven't.

I'm not 100% sure if my change is fine to other scenarios so I have to use this 
guarantee other stuff are still safe. But if you think we can remove this 
dependent, I'm happy to do :)

Thanks,

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 3/6] book3e/kgdb: update thread's dbcr0
From: "“tiejun.chen”" @ 2013-10-23  9:27 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382137030.7979.914.camel@snotra.buserror.net>

On 10/19/2013 06:57 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> gdb always need to generate a single step properly to invoke
>> a kgdb state. But with lazy interrupt, book3e can't always
>> trigger a debug exception with a single step since the current
>> is blocked for handling those pending exception, then we miss
>> that expected dbcr configuration at last to generate a debug
>> exception.
>
> What do you mean by "the current is blocked"?  Could you explain more
> clearly what lazy EE has to do with MSR_DE and DBCR0?
>

I will go another path to make sure the lazy EE doesn't affect KGDB, so please 
see next version.

Thanks,

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 2/6] powerpc/book3e: store critical/machine/debug exception thread info
From: "“tiejun.chen”" @ 2013-10-23  9:27 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382136213.7979.908.camel@snotra.buserror.net>

On 10/19/2013 06:43 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> We need to store thread info to these exception thread info like something
>> we already did for PPC32.
>>
>> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
>> ---
>>   arch/powerpc/kernel/exceptions-64e.S |   15 +++++++++++++++
>>   1 file changed, 15 insertions(+)
>>
>> diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
>> index 4d8e57f..07cf657 100644
>> --- a/arch/powerpc/kernel/exceptions-64e.S
>> +++ b/arch/powerpc/kernel/exceptions-64e.S
>> @@ -67,6 +67,18 @@
>>   	std	r10,PACA_##level##_STACK(r13);
>>   #endif
>>
>> +/* Store something to exception thread info */
>> +#define	BOOK3E_STORE_EXC_LEVEL_THEAD_INFO(type)					\
>> +	ld	r14,PACAKSAVE(r13);						\
>> +	CURRENT_THREAD_INFO(r14, r14);						\
>> +	CURRENT_THREAD_INFO(r15, r1);						\
>> +	ld	r10,TI_FLAGS(r14);		     				\
>> +	std	r10,TI_FLAGS(r15);			     			\
>> +	ld	r10,TI_PREEMPT(r14);		     				\
>> +	std	r10,TI_PREEMPT(r15);		     				\
>> +	ld	r10,TI_TASK(r14);			     			\
>> +	std	r10,TI_TASK(r15);
>
> Where is "type" used?
>

Yes, its noting now but its worth leaving this to extend something in the future.

> BTW, no need for a BOOK3E prefix for things local to this file.
>

What about "EXC_LEVEL_EXCEPTION_PROLOG"? Please see next version.

Thanks,

Tiejun

^ permalink raw reply

* Re: [v5][PATCH 1/6] powerpc/book3e: load critical/machine/debug exception stack
From: "“tiejun.chen”" @ 2013-10-23  9:26 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1382135824.7979.906.camel@snotra.buserror.net>

On 10/19/2013 06:37 AM, Scott Wood wrote:
> On Thu, 2013-06-20 at 18:28 +0800, Tiejun Chen wrote:
>> We always alloc critical/machine/debug check exceptions. This is
>> different from the normal exception. So we should load these exception
>> stack properly like we did for booke.
>
> This is "booke".  Do you mean like "like we did for 32-bit"?

Yes.

>
> And the code is already trying to load the special stack; it just
> happens that it's loading from a different location than the C code
> placed the stack addresses.  The changelog should point out the specific
> thing that is being fixed.

Here I don't fix anything, and I just want to do the same thing as 32-bit.

>
>> Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
>> ---
>>   arch/powerpc/kernel/exceptions-64e.S |   49 +++++++++++++++++++++++++++++++---
>>   1 file changed, 46 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
>> index 4b23119..4d8e57f 100644
>> --- a/arch/powerpc/kernel/exceptions-64e.S
>> +++ b/arch/powerpc/kernel/exceptions-64e.S
>> @@ -36,6 +36,37 @@
>>    */
>>   #define	SPECIAL_EXC_FRAME_SIZE	INT_FRAME_SIZE
>>
>> +/* only on book3e */
>> +#define DBG_STACK_BASE		dbgirq_ctx
>> +#define MC_STACK_BASE		mcheckirq_ctx
>> +#define CRIT_STACK_BASE		critirq_ctx
>> +
>> +#ifdef CONFIG_RELOCATABLE
>> +#define LOAD_STACK_BASE(reg, level)			\
>> +	tovirt(r2,r2);					\
>> +	LOAD_REG_ADDR(reg, level##_STACK_BASE);
>> +#else
>> +#define LOAD_STACK_BASE(reg, level)			\
>> +	LOAD_REG_IMMEDIATE(reg, level##_STACK_BASE);
>> +#endif
>> +
>> +#ifdef CONFIG_SMP
>> +#define BOOK3E_LOAD_EXC_LEVEL_STACK(level)		\
>> +	mfspr	r14,SPRN_PIR;				\
>> +	slwi	r14,r14,3;				\
>> +	LOAD_STACK_BASE(r10, level);			\
>> +	add	r10,r10,r14;				\
>> +	ld	r10,0(r10);				\
>> +	addi	r10,r10,THREAD_SIZE;			\
>> +	std	r10,PACA_##level##_STACK(r13);
>> +#else
>> +#define BOOK3E_LOAD_EXC_LEVEL_STACK(level)		\
>> +	LOAD_STACK_BASE(r10, level);			\
>> +	ld	r10,0(r10);				\
>> +	addi	r10,r10,THREAD_SIZE;			\
>> +	std	r10,PACA_##level##_STACK(r13);
>> +#endif
>
> It looks like you're loading the stack from *irq_ctx, storing it in
> PACA_*_stack, and then (immediately after this in the caller) loading it
> back from PACA_*_STACK.  Why not just load it from *irq_ctx and get rid
> of PACA_*_STACK altogether -- or change the C code to initialize the
> addresses in the PACA instead, and get ird of *irq_ctx on 64-bit?

Okay, I'd like to move forward the c code, please see next version.

>
>>   /* Exception prolog code for all exceptions */
>>   #define EXCEPTION_PROLOG(n, intnum, type, addition)	    		    \
>>   	mtspr	SPRN_SPRG_##type##_SCRATCH,r13;	/* get spare registers */   \
>> @@ -68,20 +99,32 @@
>>   #define SPRN_GDBELL_SRR1	SPRN_GSRR1
>>
>>   #define CRIT_SET_KSTACK						            \
>> +	andi.	r10,r11,MSR_PR;							\
>> +	bne	1f;								\
>> +	BOOK3E_LOAD_EXC_LEVEL_STACK(CRIT);					\
>>   	ld	r1,PACA_CRIT_STACK(r13);				    \
>> -	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;
>> +	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE;					\
>
> The caller will already check MSR_PR and override this if coming from
> userspace; why do you need to check again here?

Looks this is redundant so this will be left out.

Thanks,

Tiejun

^ permalink raw reply

* [PATCH] powerpc: Don't corrupt user registers on 32-bit
From: Paul Mackerras @ 2013-10-23  8:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev, Alexander Graf

Commit de79f7b9f6 ("powerpc: Put FP/VSX and VR state into structures")
modified load_up_fpu() and load_up_altivec() in such a way that they
now use r7 and r8.  Unfortunately, the callers of these functions on
32-bit machines then return to userspace via fast_exception_return,
which doesn't restore all of the volatile GPRs, but only r1, r3 -- r6
and r9 -- r12.  This was causing userspace segfaults and other
userspace misbehaviour on 32-bit machines.

This fixes the problem by changing the register usage of load_up_fpu()
and load_up_altivec() to avoid using r7 and r8 and instead use r6 and
r10.  This also adds comments to those functions saying which registers
may be used.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/fpu.S    | 14 ++++++++------
 arch/powerpc/kernel/vector.S | 15 +++++++++------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 4dca05e..f7f5b8b 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -106,6 +106,8 @@ _GLOBAL(store_fp_state)
  * and save its floating-point registers in its thread_struct.
  * Load up this task's FP registers from its thread_struct,
  * enable the FPU for the current task and return to the task.
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
  */
 _GLOBAL(load_up_fpu)
 	mfmsr	r5
@@ -131,10 +133,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	beq	1f
 	toreal(r4)
 	addi	r4,r4,THREAD		/* want last_task_used_math->thread */
-	addi	r8,r4,THREAD_FPSTATE
-	SAVE_32FPVSRS(0, R5, R8)
+	addi	r10,r4,THREAD_FPSTATE
+	SAVE_32FPVSRS(0, R5, R10)
 	mffs	fr0
-	stfd	fr0,FPSTATE_FPSCR(r8)
+	stfd	fr0,FPSTATE_FPSCR(r10)
 	PPC_LL	r5,PT_REGS(r4)
 	toreal(r5)
 	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
@@ -157,10 +159,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	or	r12,r12,r4
 	std	r12,_MSR(r1)
 #endif
-	addi	r7,r5,THREAD_FPSTATE
-	lfd	fr0,FPSTATE_FPSCR(r7)
+	addi	r10,r5,THREAD_FPSTATE
+	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R7)
+	REST_32FPVSRS(0, R4, R10)
 #ifndef CONFIG_SMP
 	subi	r4,r5,THREAD
 	fromreal(r4)
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index eacda4e..0458a9a 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -64,6 +64,9 @@ _GLOBAL(store_vr_state)
  * Enables the VMX for use in the kernel on return.
  * On SMP we know the VMX is free, since we give it up every
  * switch (ie, no lazy save of the vector registers).
+ *
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
  */
 _GLOBAL(load_up_altivec)
 	mfmsr	r5			/* grab the current MSR */
@@ -89,11 +92,11 @@ _GLOBAL(load_up_altivec)
 	/* Save VMX state to last_task_used_altivec's THREAD struct */
 	toreal(r4)
 	addi	r4,r4,THREAD
-	addi	r7,r4,THREAD_VRSTATE
-	SAVE_32VRS(0,r5,r7)
+	addi	r6,r4,THREAD_VRSTATE
+	SAVE_32VRS(0,r5,r6)
 	mfvscr	vr0
 	li	r10,VRSTATE_VSCR
-	stvx	vr0,r10,r7
+	stvx	vr0,r10,r6
 	/* Disable VMX for last_task_used_altivec */
 	PPC_LL	r5,PT_REGS(r4)
 	toreal(r5)
@@ -125,13 +128,13 @@ _GLOBAL(load_up_altivec)
 	oris	r12,r12,MSR_VEC@h
 	std	r12,_MSR(r1)
 #endif
-	addi	r7,r5,THREAD_VRSTATE
+	addi	r6,r5,THREAD_VRSTATE
 	li	r4,1
 	li	r10,VRSTATE_VSCR
 	stw	r4,THREAD_USED_VR(r5)
-	lvx	vr0,r10,r7
+	lvx	vr0,r10,r6
 	mtvscr	vr0
-	REST_32VRS(0,r4,r7)
+	REST_32VRS(0,r4,r6)
 #ifndef CONFIG_SMP
 	/* Update last_task_used_altivec to 'current' */
 	subi	r4,r5,THREAD		/* Back to 'current' */
-- 
1.8.4.rc3

^ permalink raw reply related

* Re: perf events ring buffer memory barrier on powerpc
From: Victor Kaplansky @ 2013-10-23  7:39 UTC (permalink / raw)
  To: Michael Neuling
  Cc: Mathieu Desnoyers, linux-kernel, Linux PPC dev, anton,
	Frederic Weisbecker
In-Reply-To: <12083.1382486094@ale.ozlabs.ibm.com>

See below.

Michael Neuling <mikey@neuling.org> wrote on 10/23/2013 02:54:54 AM:

>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index cd55144..95768c6 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -87,10 +87,10 @@ again:
>        goto out;
>
>     /*
> -    * Publish the known good head. Rely on the full barrier implied
> -    * by atomic_dec_and_test() order the rb->head read and this
> -    * write.
> +    * Publish the known good head. We need a memory barrier to order the
> +    * order the rb->head read and this write.
>      */
> +   smp_mb ();
>     rb->user_page->data_head = head;
>
>     /*

1. As far as I understand, smp_mb() is superfluous in this case, smp_wmb()
should be enough.
   (same for the space between the name of function and open
parenthesis :-) )

2. Again, as far as I understand from ./Documentation/atomic_ops.txt, it is
mistake in architecture independent
   code to rely on memory barriers in atomic operations, all the more so in
"local" operations.

3. The solution above is sub-optimal on architectures where memory barrier
is part of "local", since we are going to execute
   two consecutive barriers. So, maybe, it would be better to use
smp_mb__after_atomic_dec().

4. I'm not sure, but I think there is another, unrelated potential problem
in function perf_output_put_handle()
   - the write to "data_head" -

kernel/events/ring_buffer.c:

 77         /*
 78          * Publish the known good head. Rely on the full barrier
implied
 79          * by atomic_dec_and_test() order the rb->head read and this
 80          * write.
 81          */
 82         rb->user_page->data_head = head;

As data_head is 64-bit wide, the update should be done by an atomic64_set
().

Regards,
-- Victor

^ permalink raw reply

* Re: Missing _restvr_20 and _savevr_20 subroutines for lib/raid6/altivec8.o
From: Kumar Gala @ 2013-10-23  5:28 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: debian-powerpc, linuxppc-dev@lists.ozlabs.org list,
	Anton Blanchard, Debian kernel maintainers
In-Reply-To: <1382221463.2794.87.camel@deadeye.wl.decadent.org.uk>


On Oct 19, 2013, at 5:24 PM, Ben Hutchings wrote:

> When building lib/raid6/altivec8.o with gcc 4.8 on Debian, the =
compiler
> is generating references to two new runtime subroutines which are
> apparently not included in the kernel:
>=20
> ERROR: "_restvr_20" [lib/raid6/raid6_pq.ko] undefined!
> ERROR: "_savevr_20" [lib/raid6/raid6_pq.ko] undefined!
>=20
> The save/restore subroutines are specified in
> =
http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.7.1.html#SAVE=
-RESTORE
> and we do have the _restgpr_* and _savegpr_* subroutines in
> arch/powerpc/boot/crtsavres.S.  I'm not sure whether these subroutines
> should be added or whether this indicates the compiler is doing
> something wrong.
>=20
> A configuration that triggers this is included below.
>=20
> Ben.

Try with CONFIG_CC_OPTIMIZE_FOR_SIZE=3Dn.  A feature was added to gcc =
for -Os to "outline" the save/restore routines.  I'm surprised this =
hasn't shown up sooner.

Well need to add _restvr_* / _savevr_* to the version in =
lib/crtsaveres.S.

=
http://gcc.gnu.org/git/?p=3Dgcc.git;a=3Dblob_plain;f=3Dlibgcc/config/rs600=
0/crtrestvr.S;hb=3DHEAD
=
http://gcc.gnu.org/git/?p=3Dgcc.git;a=3Dblob_plain;f=3Dlibgcc/config/rs600=
0/crtsavevr.S;hb=3DHEAD

- k=

^ permalink raw reply

* Re: [PATCH] [RFC] Emulate "lwsync" to run standard user land on e500 cores
From: Kumar Gala @ 2013-10-23  5:07 UTC (permalink / raw)
  To: Wolfgang Denk; +Cc: Scott Wood, linuxppc-dev
In-Reply-To: <1382081880-6666-1-git-send-email-wd@denx.de>


On Oct 18, 2013, at 2:38 AM, Wolfgang Denk wrote:

> Default Debian PowerPC doesn't work on e500 because the code contains
> "lwsync" instructions, which are unsupported on this core.  As a
> result, applications using this will crash with an "unhandled signal =
4"
> "Illegal instruction" error.
>=20
> As a work around we add code to emulate this insn.  This is expensive
> performance-wise, but allows to run standard user land code.
>=20
> Signed-off-by: Wolfgang Denk <wd@denx.de>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Scott Wood <scottwood@freescale.com>
> ---
> I am aware that the clean solution to the problem is to build user
> space with compiler options that match the target architecture.
> However, sometimes this is just too much effort.
>=20
> Also, of course the performance of such an emulation sucks. But the
> the occurrence of such instructions is so rare that no significant
> slowdown can be oserved.
>=20
> I'm not sure if this should / could go into mainline.  I'm posting it
> primarily so it can be found should anybody else need this.
> - wd
>=20
> arch/powerpc/kernel/traps.c | 7 +++++++
> 1 file changed, 7 insertions(+)
>=20
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index f783c93..f330374 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -986,6 +986,13 @@ static int emulate_instruction(struct pt_regs =
*regs)
> 		return 0;
> 	}
>=20
> +	/* Emulating the lwsync insn as a sync insn */
> +	if (instword =3D=3D PPC_INST_LWSYNC) {
> +		PPC_WARN_EMULATED(lwsync, regs);
> +		asm volatile("sync" : : : "memory");

Do we really need the inline asm?  Doesn't the fact of just taking an =
exception and returning from it equate to a sync.

> +		return 0;
> +	}
> +
> 	/* Emulate the mcrxr insn.  */
> 	if ((instword & PPC_INST_MCRXR_MASK) =3D=3D PPC_INST_MCRXR) {
> 		int shift =3D (instword >> 21) & 0x1c;
> --=20
> 1.8.3.1
>=20
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group
From: Preeti U Murthy @ 2013-10-23  4:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Michael Neuling, Mike Galbraith, linuxppc-dev, linux-kernel,
	Anton Blanchard, Paul Turner, Ingo Molnar
In-Reply-To: <526749EC.9030005@linux.vnet.ibm.com>

On 10/23/2013 09:30 AM, Preeti U Murthy wrote:
> Hi Peter,
> 
> On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
>> On Mon, Oct 21, 2013 at 05:14:42PM +0530, Vaidyanathan Srinivasan wrote:
>>>  kernel/sched/fair.c |   19 +++++++++++++------
>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 7c70201..12f0eab 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5807,12 +5807,19 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>>>  
>>>  	rcu_read_lock();
>>>  	for_each_domain(cpu, sd) {
>>> +		struct sched_domain *sd_parent = sd->parent;
>>> +		struct sched_group *sg;
>>> +		struct sched_group_power *sgp;
>>> +		int nr_busy;
>>> +
>>> +		if (sd_parent) {
>>> +			sg = sd_parent->groups;
>>> +			sgp = sg->sgp;
>>> +			nr_busy = atomic_read(&sgp->nr_busy_cpus);
>>> +
>>> +			if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>>> +				goto need_kick_unlock;
>>> +		}
>>>  
>>>  		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
>>>  		    && (cpumask_first_and(nohz.idle_cpus_mask,
>>>
>>
>> Almost I'd say; what happens on !sd_parent && SD_ASYM_PACKING ?
> 
> You are right, sorry about this. The idea was to correct the nr_busy
> computation before the patch that would remove its usage in the second
> patch. But that would mean the condition nr_busy != sg->group_weight
> would be invalid with this patch. The second patch needs to go first to
> avoid this confusion.
> 
>>
>> Also, this made me look at the nr_busy stuff again, and somehow that
>> entire thing makes me a little sad.
>>
>> Can't we do something like the below and cut that nr_busy sd iteration
>> short?
> 
> We can surely cut the nr_busy sd iteration but not like what is done
> with this patch. You stop the nr_busy computation at the sched domain
> that has the flag SD_SHARE_PKG_RESOURCES set. But nohz_kick_needed()
> would want to know the nr_busy for one level above this.
>    Consider a core. Assume it is the highest domain with this flag set.
> The nr_busy of its groups, which are logical threads are set to 1/0
> each. But nohz_kick_needed() would like to know the sum of the nr_busy
> parameter of all the groups, i.e. the threads in a core before it
> decides if it can kick nohz_idle balancing. The information about the
> individual group's nr_busy is of no relevance here.
> 
> Thats why the above patch tries to get the
> sd->parent->groups->sgp->nr_busy_cpus. This will translate rightly to
> the core's busy cpus in this example. But the below patch stops before
> updating this parameter at the sd->parent level, where sd is the highest
> level sched domain with the SD_SHARE_PKG_RESOURCES flag set.
> 
> But we can get around all this confusion if we can move the nr_busy
> parameter to be included in the sched_domain structure rather than the
> sched_groups_power structure. Anyway the only place where nr_busy is
> used, that is at nohz_kick_needed(), is done to know the total number of
> busy cpus at a sched domain level which has the SD_SHARE_PKG_RESOURCES
> set and not at a sched group level.
> 
> So why not move nr_busy to struct sched_domain  and having the below
> patch which just updates this parameter for the sched domain, sd_busy ?

Oh this can't be done :( Domain structures are per cpu!

Regards
Preeti U Murthy

^ permalink raw reply

* Re: [PATCH 1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group
From: Preeti U Murthy @ 2013-10-23  4:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Michael Neuling, Mike Galbraith, linuxppc-dev, linux-kernel,
	Anton Blanchard, Paul Turner, Ingo Molnar
In-Reply-To: <20131022221138.GJ2490@laptop.programming.kicks-ass.net>

Hi Peter,

On 10/23/2013 03:41 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:14:42PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   19 +++++++++++++------
>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7c70201..12f0eab 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5807,12 +5807,19 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>>  
>>  	rcu_read_lock();
>>  	for_each_domain(cpu, sd) {
>> +		struct sched_domain *sd_parent = sd->parent;
>> +		struct sched_group *sg;
>> +		struct sched_group_power *sgp;
>> +		int nr_busy;
>> +
>> +		if (sd_parent) {
>> +			sg = sd_parent->groups;
>> +			sgp = sg->sgp;
>> +			nr_busy = atomic_read(&sgp->nr_busy_cpus);
>> +
>> +			if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
>> +				goto need_kick_unlock;
>> +		}
>>  
>>  		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
>>  		    && (cpumask_first_and(nohz.idle_cpus_mask,
>>
> 
> Almost I'd say; what happens on !sd_parent && SD_ASYM_PACKING ?

You are right, sorry about this. The idea was to correct the nr_busy
computation before the patch that would remove its usage in the second
patch. But that would mean the condition nr_busy != sg->group_weight
would be invalid with this patch. The second patch needs to go first to
avoid this confusion.

> 
> Also, this made me look at the nr_busy stuff again, and somehow that
> entire thing makes me a little sad.
> 
> Can't we do something like the below and cut that nr_busy sd iteration
> short?

We can surely cut the nr_busy sd iteration but not like what is done
with this patch. You stop the nr_busy computation at the sched domain
that has the flag SD_SHARE_PKG_RESOURCES set. But nohz_kick_needed()
would want to know the nr_busy for one level above this.
   Consider a core. Assume it is the highest domain with this flag set.
The nr_busy of its groups, which are logical threads are set to 1/0
each. But nohz_kick_needed() would like to know the sum of the nr_busy
parameter of all the groups, i.e. the threads in a core before it
decides if it can kick nohz_idle balancing. The information about the
individual group's nr_busy is of no relevance here.

Thats why the above patch tries to get the
sd->parent->groups->sgp->nr_busy_cpus. This will translate rightly to
the core's busy cpus in this example. But the below patch stops before
updating this parameter at the sd->parent level, where sd is the highest
level sched domain with the SD_SHARE_PKG_RESOURCES flag set.

But we can get around all this confusion if we can move the nr_busy
parameter to be included in the sched_domain structure rather than the
sched_groups_power structure. Anyway the only place where nr_busy is
used, that is at nohz_kick_needed(), is done to know the total number of
busy cpus at a sched domain level which has the SD_SHARE_PKG_RESOURCES
set and not at a sched group level.

So why not move nr_busy to struct sched_domain  and having the below
patch which just updates this parameter for the sched domain, sd_busy ?
This will avoid iterating through all the levels of sched domains and
should resolve the scalability issue. We also don't need to get to
sd->parent to get the nr_busy parameter for the sake of nohz_kick_needed().

What do you think?

Regards
Preeti U Murthy
> 
> This nohz stuff really needs to be re-thought and made more scalable --
> its a royal pain :/
> 
> 
>  kernel/sched/core.c  |  4 ++++
>  kernel/sched/fair.c  | 21 +++++++++++++++------
>  kernel/sched/sched.h |  5 ++---
>  3 files changed, 21 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c06b8d3..89db8dc 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>  DEFINE_PER_CPU(int, sd_llc_size);
>  DEFINE_PER_CPU(int, sd_llc_id);
>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
> 
>  static void update_top_cache_domain(int cpu)
>  {
> @@ -5290,6 +5291,9 @@ static void update_top_cache_domain(int cpu)
> 
>  	sd = lowest_flag_domain(cpu, SD_NUMA);
>  	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
> +
> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING);
> +	rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
>  }
> 
>  /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 813dd61..3d5141e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6512,19 +6512,23 @@ static inline void nohz_balance_exit_idle(int cpu)
>  	}
>  }
> 
> -static inline void set_cpu_sd_state_busy(void)
> +static inline void set_cpu_sd_state_busy(int cpu)
>  {
>  	struct sched_domain *sd;
> +	struct rq *rq = cpu_rq(cpu);
> 
>  	rcu_read_lock();
> -	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
> +	sd = rcu_dereference_check_sched_domain(rq->sd);
> 
>  	if (!sd || !sd->nohz_idle)
>  		goto unlock;
>  	sd->nohz_idle = 0;
> 
> -	for (; sd; sd = sd->parent)
> +	for (; sd; sd = sd->parent) {
>  		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
> +		if (sd == per_cpu(sd_busy, cpu))
> +			break;
> +	}
>  unlock:
>  	rcu_read_unlock();
>  }
> @@ -6532,16 +6536,21 @@ static inline void set_cpu_sd_state_busy(void)
>  void set_cpu_sd_state_idle(void)
>  {
>  	struct sched_domain *sd;
> +	int cpu = smp_processor_id();
> +	struct rq *rq = cpu_rq(cpu);
> 
>  	rcu_read_lock();
> -	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
> +	sd = rcu_dereference_check_sched_domain(rq->sd);
> 
>  	if (!sd || sd->nohz_idle)
>  		goto unlock;
>  	sd->nohz_idle = 1;
> 
> -	for (; sd; sd = sd->parent)
> +	for (; sd; sd = sd->parent) {
>  		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
> +		if (sd == per_cpu(sd_busy, cpu))
> +			break;
> +	}
>  unlock:
>  	rcu_read_unlock();
>  }
> @@ -6756,7 +6765,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>  	* We may be recently in ticked or tickless idle mode. At the first
>  	* busy tick after returning from idle, we will update the busy stats.
>  	*/
> -	set_cpu_sd_state_busy();
> +	set_cpu_sd_state_busy(cpu);
>  	nohz_balance_exit_idle(cpu);
> 
>  	/*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ffc7087..80c5fd2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -599,9 +599,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
>  	struct sched_domain *sd, *hsd = NULL;
> 
>  	for_each_domain(cpu, sd) {
> -		if (!(sd->flags & flag))
> -			break;
> -		hsd = sd;
> +		if (sd->flags & flag)
> +			hsd = sd;
>  	}
> 
>  	return hsd;
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox