LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* PowerNV PCI & SR-IOV cleanups
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev

Finally bit the bullet and learned how all the MMIO->PE mapping setup
actually works. As a side effect I found a bunch of oddities in how
PowerNV SR-IOV support is implemented. This series mostly sorts that
out with a few more generic cleanups along the way.

This is largely prep work for supporting VFs in the 32bit MMIO window.
This is an unfortunate necessity due to how the Linux BAR allocator
handles BARs marked as non-prefetchable. The distinction
between prefetch and non-prefetchable BARs was made largely irrelevant
with the introduction of PCIe, but the BAR allocator is overly
conservative. It will always place non-pref bars in the prefetchable
window, which is 32bit only. This results in us being unable to use VFs
from NVMe drives and a few different RAID cards.

This series is based on top of these two:

https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187630
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187688

Rebases cleanly on top of the first, but I haven't tested that one plus
this extensively.

Oliver

^ permalink raw reply

* [PATCH 01/15] powernv/pci: Add pci_bus_to_pnvhb() helper
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Add a helper to go from a pci_bus structure to the pnv_phb that hosts that
bus. There's a lot of instances of the following pattern:

	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
	struct pnv_phb *phb = hose->private_data;

Without any other uses of the pci_controller inside the function. This is
hard to read since it requires you to memorise the contents of the
private data fields and kind of error prone since it involves blindly
assigning a void pointer. Add a helper to make it more concise and
explicit.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++++++----------------
 arch/powerpc/platforms/powernv/pci.c      | 14 ++--
 arch/powerpc/platforms/powernv/pci.h      | 10 +++
 3 files changed, 38 insertions(+), 74 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 31c3e6d58c41..687919db0347 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -252,8 +252,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
 					 unsigned long *pe_bitmap)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct resource *r;
 	resource_size_t base, sgsz, start, end;
 	int segno, i;
@@ -351,8 +350,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
 
 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	struct pnv_ioda_pe *master_pe, *pe;
 	unsigned long size, *pe_alloc;
 	int i;
@@ -673,8 +671,7 @@ struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
 
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
 	struct pci_dn *pdn = pci_get_pdn(dev);
 
 	if (!pdn)
@@ -1069,8 +1066,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
 	struct pci_dn *pdn = pci_get_pdn(dev);
 	struct pnv_ioda_pe *pe;
 
@@ -1129,8 +1125,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
  */
 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	struct pnv_ioda_pe *pe = NULL;
 	unsigned int pe_num;
 
@@ -1196,8 +1191,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
 	struct pnv_ioda_pe *pe;
 	struct pci_dev *gpu_pdev;
 	struct pci_dn *npu_pdn;
-	struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
 
 	/*
 	 * Intentionally leak a reference on the npu device (for
@@ -1300,16 +1294,12 @@ static void pnv_pci_ioda_setup_nvlink(void)
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	int                    i, j;
 	int                    m64_bars;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 
 	if (pdn->m64_single_mode)
@@ -1333,8 +1323,6 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 
 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	unsigned int           win;
@@ -1346,9 +1334,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	int                    pe_num;
 	int                    m64_bars;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 
@@ -1459,15 +1445,11 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe, *pe_n;
 	struct pci_dn         *pdn;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 
 	if (!pdev->is_physfn)
@@ -1492,16 +1474,12 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 
 static void pnv_pci_sriov_disable(struct pci_dev *pdev)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe;
 	struct pci_dn         *pdn;
 	u16                    num_vfs, i;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 	num_vfs = pdn->num_vfs;
 
@@ -1535,17 +1513,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe);
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe;
 	int                    pe_num;
 	u16                    vf_index;
 	struct pci_dn         *pdn;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 
 	if (!pdev->is_physfn)
@@ -1572,7 +1546,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->rid = (vf_bus << 8) | vf_devfn;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
-			hose->global_number, pdev->bus->number,
+			pci_domain_nr(pdev->bus), pdev->bus->number,
 			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
 
 		if (pnv_ioda_configure_pe(phb, pe)) {
@@ -1602,17 +1576,13 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe;
 	struct pci_dn         *pdn;
 	int                    ret;
 	u16                    i;
 
-	bus = pdev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
+	phb = pci_bus_to_pnvhb(pdev->bus);
 	pdn = pci_get_pdn(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
@@ -1735,8 +1705,7 @@ static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 
 static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 
@@ -1847,8 +1816,7 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
 static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
 		u64 dma_mask)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 
@@ -2766,8 +2734,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
 #ifdef CONFIG_PCI_IOV
 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
 	struct resource *res;
 	int i;
@@ -3101,10 +3068,9 @@ static void pnv_pci_ioda_fixup(void)
 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
 						unsigned long type)
 {
-	struct pci_dev *bridge;
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	int num_pci_bridges = 0;
+	struct pci_dev *bridge;
 
 	bridge = bus->self;
 	while (bridge) {
@@ -3190,8 +3156,7 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
 
 static void pnv_pci_configure_bus(struct pci_bus *bus)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	struct pci_dev *bridge = bus->self;
 	struct pnv_ioda_pe *pe;
 	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
@@ -3237,8 +3202,7 @@ static resource_size_t pnv_pci_default_alignment(void)
 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 						      int resno)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	resource_size_t align;
 
@@ -3274,8 +3238,7 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
  */
 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
 	struct pci_dn *pdn;
 
 	/* The function is probably called while the PEs have
@@ -3488,8 +3451,7 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
 
 static void pnv_pci_release_device(struct pci_dev *pdev)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 
@@ -3534,8 +3496,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 
 static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
 {
-	struct pci_controller *hose = bus->sysdata;
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	struct pnv_ioda_pe *pe;
 
 	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
@@ -3873,8 +3834,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
 
 static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
 
 	if (!machine_is(powernv))
 		return;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 091fe1cf386b..9b9bca169275 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -162,8 +162,7 @@ EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
 
 int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct msi_desc *entry;
 	struct msi_msg msg;
 	int hwirq;
@@ -211,8 +210,7 @@ int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 
 void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct msi_desc *entry;
 	irq_hw_number_t hwirq;
 
@@ -824,10 +822,9 @@ EXPORT_SYMBOL(pnv_pci_get_phb_node);
 
 int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
 {
-	__be64 val;
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
 	u64 tunnel_bar;
+	__be64 val;
 	int rc;
 
 	if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
@@ -835,9 +832,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
 	if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
 		return -ENXIO;
 
-	hose = pci_bus_to_host(dev->bus);
-	phb = hose->private_data;
-
 	mutex_lock(&tunnel_mutex);
 	rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
 	if (rc != OPAL_SUCCESS) {
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 51c254f2f3cb..0727dec9a0d1 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -260,4 +260,14 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 
 extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
 
+static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	if (hose)
+		return hose->private_data;
+
+	return NULL;
+}
+
 #endif /* __POWERNV_PCI_H */
-- 
2.26.2


^ permalink raw reply related

* [PATCH 02/15] powerpc/powernv/pci: Always tear down DMA windows on PE release
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Currently we have these two functions:

	pnv_pci_ioda2_release_dma_pe(), and
	pnv_pci_ioda2_release_pe_dma()

The first is used when tearing down VF PEs and the other is used for normal
devices. There's very little difference between the two though. The latter
(non-VF) will skip a call to pnv_pci_ioda2_unset_window() unless
CONFIG_IOMMU_API=y is set. There's no real point in doing this so fold the
two together.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++--------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 687919db0347..bfb40607aa0e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1422,26 +1422,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	return -EBUSY;
 }
 
-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
-		int num);
-
-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
-{
-	struct iommu_table    *tbl;
-	int64_t               rc;
-
-	tbl = pe->table_group.tables[0];
-	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
-	if (rc)
-		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
-
-	pnv_pci_ioda2_set_bypass(pe, false);
-	if (pe->table_group.group) {
-		iommu_group_put(pe->table_group.group);
-		BUG_ON(pe->table_group.group);
-	}
-	iommu_tce_table_put(tbl);
-}
+static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 {
@@ -1455,11 +1436,12 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 	if (!pdev->is_physfn)
 		return;
 
+	/* FIXME: Use pnv_ioda_release_pe()? */
 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
 		if (pe->parent_dev != pdev)
 			continue;
 
-		pnv_pci_ioda2_release_dma_pe(pdev, pe);
+		pnv_pci_ioda2_release_pe_dma(pe);
 
 		/* Remove from list */
 		mutex_lock(&phb->ioda.pe_list_mutex);
@@ -2429,7 +2411,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	return 0;
 }
 
-#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 		int num)
 {
@@ -2453,7 +2434,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 
 	return ret;
 }
-#endif
 
 #ifdef CONFIG_IOMMU_API
 unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
@@ -3334,18 +3314,14 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = pe->table_group.tables[0];
 	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
-#ifdef CONFIG_IOMMU_API
 	int64_t rc;
-#endif
 
 	if (!weight)
 		return;
 
-#ifdef CONFIG_IOMMU_API
 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (rc)
 		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
-#endif
 
 	pnv_pci_ioda2_set_bypass(pe, false);
 	if (pe->table_group.group) {
-- 
2.26.2


^ permalink raw reply related

* [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

There's an optimisation in the PE setup which skips performing DMA
setup for a PE if we only have bridges in a PE. The assumption being
that only "real" devices will DMA to system memory, which is probably
fair. However, if we start off with only bridge devices in a PE then
add a non-bridge device the new device won't be able to use DMA because
we never configured it.

Fix this (admittedly pretty weird) edge case by tracking whether we've done
the DMA setup for the PE or not. If a non-bridge device is added to the PE
(via rescan or hotplug, or whatever) we can set up DMA on demand.

This also means the only remaining user of the old "DMA Weight" code is
the IODA1 DMA setup code that it was originally added for, which is good.

Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
Alexey, do we need to have the IOMMU API stuff set/clear this flag?
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 48 ++++++++++++++---------
 arch/powerpc/platforms/powernv/pci.h      |  7 ++++
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bfb40607aa0e..bb9c1cc60c33 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -141,6 +141,7 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
 
 	phb->ioda.pe_array[pe_no].phb = phb;
 	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+	phb->ioda.pe_array[pe_no].dma_setup_done = false;
 
 	/*
 	 * Clear the PE frozen state as it might be put into frozen state
@@ -1685,6 +1686,12 @@ static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 }
 #endif /* CONFIG_PCI_IOV */
 
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+
 static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
 {
 	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
@@ -1713,6 +1720,24 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
 		pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
 	}
 
+	/*
+	 * We assume that bridges *probably* don't need to do any DMA so we can
+	 * skip allocating a TCE table, etc unless we get a non-bridge device.
+	 */
+	if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
+		switch (phb->type) {
+		case PNV_PHB_IODA1:
+			pnv_pci_ioda1_setup_dma_pe(phb, pe);
+			break;
+		case PNV_PHB_IODA2:
+			pnv_pci_ioda2_setup_dma_pe(phb, pe);
+			break;
+		default:
+			pr_warn("%s: No DMA for PHB#%x (type %d)\n",
+				__func__, phb->hose->global_number, phb->type);
+		}
+	}
+
 	if (pdn)
 		pdn->pe_number = pe->pe_number;
 	pe->device_count++;
@@ -2222,6 +2247,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
 	iommu_init_table(tbl, phb->hose->node, 0, 0);
 
+	pe->dma_setup_done = true;
 	return;
  fail:
 	/* XXX Failure: Try to fallback to 64-bit only ? */
@@ -2536,9 +2562,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 {
 	int64_t rc;
 
-	if (!pnv_pci_ioda_pe_dma_weight(pe))
-		return;
-
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
@@ -2563,6 +2586,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			     pe->pe_number);
 #endif
+	pe->dma_setup_done = true;
 }
 
 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
@@ -3136,7 +3160,6 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
 
 static void pnv_pci_configure_bus(struct pci_bus *bus)
 {
-	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
 	struct pci_dev *bridge = bus->self;
 	struct pnv_ioda_pe *pe;
 	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
@@ -3160,17 +3183,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus)
 		return;
 
 	pnv_ioda_setup_pe_seg(pe);
-	switch (phb->type) {
-	case PNV_PHB_IODA1:
-		pnv_pci_ioda1_setup_dma_pe(phb, pe);
-		break;
-	case PNV_PHB_IODA2:
-		pnv_pci_ioda2_setup_dma_pe(phb, pe);
-		break;
-	default:
-		pr_warn("%s: No DMA for PHB#%x (type %d)\n",
-			__func__, phb->hose->global_number, phb->type);
-	}
 }
 
 static resource_size_t pnv_pci_default_alignment(void)
@@ -3289,11 +3301,10 @@ static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
 
 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 {
-	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
 	struct iommu_table *tbl = pe->table_group.tables[0];
 	int64_t rc;
 
-	if (!weight)
+	if (!pe->dma_setup_done)
 		return;
 
 	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
@@ -3313,10 +3324,9 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = pe->table_group.tables[0];
-	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
 	int64_t rc;
 
-	if (!weight)
+	if (pe->dma_setup_done)
 		return;
 
 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 0727dec9a0d1..6aa6aefb637d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -87,6 +87,13 @@ struct pnv_ioda_pe {
 	bool			tce_bypass_enabled;
 	uint64_t		tce_bypass_base;
 
+	/*
+	 * Used to track whether we've done DMA setup for this PE or not. We
+	 * want to defer allocating TCE tables, etc until we've added a
+	 * non-bridge device to the PE.
+	 */
+	bool			dma_setup_done;
+
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
 	 * PE number)
-- 
2.26.2


^ permalink raw reply related

* [PATCH 04/15] powerpc/powernv/pci: Initialise M64 for IODA1 as a 1-1 window
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

We pre-configure the m64 window for IODA1 as a 1-1 segment-PE mapping,
similar to PHB3. Currently the actual mapping of segments occurs in
pnv_ioda_pick_m64_pe(), but we can move it into pnv_ioda1_init_m64() and
drop the IODA1 specific code paths in the PE setup / teardown.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 55 +++++++++++------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bb9c1cc60c33..8fb17676d914 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -311,6 +311,28 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb)
 		}
 	}
 
+	for (index = 0; index < phb->ioda.total_pe_num; index++) {
+		int64_t rc;
+
+		/*
+		 * P7IOC supports M64DT, which helps mapping M64 segment
+		 * to one particular PE#. However, PHB3 has fixed mapping
+		 * between M64 segment and PE#. In order to have same logic
+		 * for P7IOC and PHB3, we enforce fixed mapping between M64
+		 * segment and PE# on P7IOC.
+		 */
+		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				index, OPAL_M64_WINDOW_TYPE,
+				index / PNV_IODA1_M64_SEGS,
+				index % PNV_IODA1_M64_SEGS);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				index);
+			goto fail;
+		}
+	}
+
 	/*
 	 * Exclude the segments for reserved and root bus PE, which
 	 * are first or last two PEs.
@@ -402,26 +424,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 			pe->master = master_pe;
 			list_add_tail(&pe->list, &master_pe->slaves);
 		}
-
-		/*
-		 * P7IOC supports M64DT, which helps mapping M64 segment
-		 * to one particular PE#. However, PHB3 has fixed mapping
-		 * between M64 segment and PE#. In order to have same logic
-		 * for P7IOC and PHB3, we enforce fixed mapping between M64
-		 * segment and PE# on P7IOC.
-		 */
-		if (phb->type == PNV_PHB_IODA1) {
-			int64_t rc;
-
-			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-					pe->pe_number, OPAL_M64_WINDOW_TYPE,
-					pe->pe_number / PNV_IODA1_M64_SEGS,
-					pe->pe_number % PNV_IODA1_M64_SEGS);
-			if (rc != OPAL_SUCCESS)
-				pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
-					__func__, rc, phb->hose->global_number,
-					pe->pe_number);
-		}
 	}
 
 	kfree(pe_alloc);
@@ -3354,14 +3356,8 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
 		if (map[idx] != pe->pe_number)
 			continue;
 
-		if (win == OPAL_M64_WINDOW_TYPE)
-			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-					phb->ioda.reserved_pe_idx, win,
-					idx / PNV_IODA1_M64_SEGS,
-					idx % PNV_IODA1_M64_SEGS);
-		else
-			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-					phb->ioda.reserved_pe_idx, win, 0, idx);
+		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				phb->ioda.reserved_pe_idx, win, 0, idx);
 
 		if (rc != OPAL_SUCCESS)
 			pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
@@ -3380,8 +3376,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
 				     phb->ioda.io_segmap);
 		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
 				     phb->ioda.m32_segmap);
-		pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
-				     phb->ioda.m64_segmap);
+		/* M64 is pre-configured by pnv_ioda1_init_m64() */
 	} else if (phb->type == PNV_PHB_IODA2) {
 		pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
 				     phb->ioda.m32_segmap);
-- 
2.26.2


^ permalink raw reply related

* [PATCH 06/15] powerpc/powernv/sriov: Explain how SR-IOV works on PowerNV
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

SR-IOV support on PowerNV is a byzantine maze of hooks. I have no idea
how anyone is supposed to know how it works except through a lot of
stuffering. Write up some docs about the overall story to help out
the next sucker^Wperson who needs to tinker with it.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 130 +++++++++++++++++++++
 1 file changed, 130 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 080ea39f5a83..f4c74ab1284d 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -12,6 +12,136 @@
 /* for pci_dev_is_added() */
 #include "../../../../drivers/pci/pci.h"
 
+/*
+ * The majority of the complexity in supporting SR-IOV on PowerNV comes from
+ * the need to put the MMIO space for each VF into a separate PE. Internally
+ * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table".
+ * The MBT historically only applied to the 64bit MMIO window of the PHB
+ * so it's common to see it referred to as the "M64BT".
+ *
+ * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces
+ * the address range that we want to map to be power-of-two sized and aligned.
+ * For conventional PCI devices this isn't really an issue since PCI device BARs
+ * have the same requirement.
+ *
+ * For a SR-IOV BAR things are a little more awkward since size and alignment
+ * are not coupled. The alignment is set based on the the per-VF BAR size, but
+ * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs
+ * isn't necessarily a power of two, so neither is the total size. To fix that
+ * we need to finesse (read: hack) the Linux BAR allocator so that it will
+ * allocate the SR-IOV BARs in a way that lets us map them using the MBT.
+ *
+ * The changes to size and alignment that we need to do depend on the "mode"
+ * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above,
+ * so as a baseline we can assume that we have the following BAR modes
+ * available:
+ *
+ *   NB: $PE_COUNT is the number of PEs that the PHB supports.
+ *
+ * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized
+ *    segments. The n'th segment is mapped to the n'th PE.
+ * b) An un-segmented BAR that maps the whole address range to a specific PE.
+ *
+ *
+ * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR
+ * For comparison b) requires one entry per-VF per-BAR, or:
+ * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment
+ * to equal the size of the per-VF BAR area. So:
+ *
+ *	new_size = per-vf-size * number-of-PEs
+ *
+ * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size
+ * to "new_size", calculated above. Implementing this is a convoluted process
+ * which requires several hooks in the PCI core:
+ *
+ * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov().
+ *
+ *    At this point the device has been probed and the device's BARs are sized,
+ *    but no resource allocations have been done. The SR-IOV BARs are sized
+ *    based on the maximum number of VFs supported by the device and we need
+ *    to increase that to new_size.
+ *
+ * 2. Later, when Linux actually assigns resources it tries to make the resource
+ *    allocations for each PCI bus as compact as possible. As a part of that it
+ *    sorts the BARs on a bus by their required alignment, which is calculated
+ *    using pci_resource_alignment().
+ *
+ *    For IOV resources this goes:
+ *    pci_resource_alignment()
+ *        pci_sriov_resource_alignment()
+ *            pcibios_sriov_resource_alignment()
+ *                pnv_pci_iov_resource_alignment()
+ *
+ *    Our hook overrides the default alignment, equal to the per-vf-size, with
+ *    new_size computed above.
+ *
+ * 3. When userspace enables VFs for a device:
+ *
+ *    sriov_enable()
+ *       pcibios_sriov_enable()
+ *           pnv_pcibios_sriov_enable()
+ *
+ *    This is where we actually allocate PE numbers for each VF and setup the
+ *    MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena"
+ *    where each MBT segment is equal in size to the VF BAR so we can shift
+ *    around the actual SR-IOV BAR location within this arena. We need this
+ *    ability because the PE space is shared by all devices on the same PHB.
+ *    When using mode a) described above segment 0 in maps to PE#0 which might
+ *    be already being used by another device on the PHB.
+ *
+ *    As a result we need allocate a contigious range of PE numbers, then shift
+ *    the address programmed into the SR-IOV BAR of the PF so that the address
+ *    of VF0 matches up with the segment corresponding to the first allocated
+ *    PE number. This is handled in pnv_pci_vf_resource_shift().
+ *
+ *    Once all that is done we return to the PCI core which then enables VFs,
+ *    scans them and creates pci_devs for each. The init process for a VF is
+ *    largely the same as a normal device, but the VF is inserted into the IODA
+ *    PE that we allocated for it rather than the PE associated with the bus.
+ *
+ * 4. When userspace disables VFs we unwind the above in
+ *    pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since
+ *    we don't need to validate anything, just tear down the mappings and
+ *    move SR-IOV resource back to its "proper" location.
+ *
+ * That's how mode a) works. In theory mode b) (single PE mapping) is less work
+ * since we can map each individual VF with a separate BAR. However, there's a
+ * few limitations:
+ *
+ * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes
+ *    it only usable for devices with very large per-VF BARs. Such devices are
+ *    similar to Big Foot. They definitely exist, but I've never seen one.
+ *
+ * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only
+ *    16 total and some are needed for. Most SR-IOV capable network cards can support
+ *    more than 16 VFs on each port.
+ *
+ * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO
+ * window of the PHB.
+ *
+ *
+ *
+ * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It
+ * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows
+ * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since
+ * the Linux BAR allocation will place any BAR marked as non-prefetchable into
+ * the non-prefetchable bridge window, which is 32bit only. It also added two
+ * new modes:
+ *
+ * c) A segmented BAR similar to a), but each segment can be individually
+ *    mapped to any PE. This is matches how the 32bit MMIO window worked on
+ *    IODA1&2.
+ *
+ * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a),
+ *    but with fewer segments and configurable base PE.
+ *
+ *    i.e. The n'th segment maps to the (n + base)'th PE.
+ *
+ *    The base PE is also required to be a multiple of the window size.
+ *
+ * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us
+ * to exploit any of the IODA3 features.
+ */
 
 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 {
-- 
2.26.2


^ permalink raw reply related

* [PATCH 05/15] powerpc/powernv/sriov: Move SR-IOV into a seperate file
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

pci-ioda.c is getting a bit unwieldly due to the amount of stuff jammed in
there. The SR-IOV support can be extracted easily enough and is mostly
standalone, so move it into a seperate file.

This patch also moves the PowerNV SR-IOV specific fields from pci_dn and moves them
into a platform specific structure. I'm not sure how they ended up in there
in the first place, but leaking platform specifics into common code has
proven to be a terrible idea so far so lets stop doing that.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
The pci_dn change and the pci-sriov.c changes originally separate patches.
I accidently squashed them together while rebasing and fixing that seemed
like more pain that it was worth. I kind of like it this way though since
they did cause a lot of churn on the same set of functions.

I'll split them up again if you really want (please don't want this).
---
 arch/powerpc/include/asm/device.h          |   3 +
 arch/powerpc/platforms/powernv/Makefile    |   1 +
 arch/powerpc/platforms/powernv/pci-ioda.c  | 673 +--------------------
 arch/powerpc/platforms/powernv/pci-sriov.c | 642 ++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.h       |  74 +++
 5 files changed, 738 insertions(+), 655 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-sriov.c

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 266542769e4b..4d8934db7ef5 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -49,6 +49,9 @@ struct dev_archdata {
 #ifdef CONFIG_CXL_BASE
 	struct cxl_context	*cxl_ctx;
 #endif
+#ifdef CONFIG_PCI_IOV
+	void *iov_data;
+#endif
 };
 
 struct pdev_archdata {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index fe3f0fb5aeca..2eb6ae150d1f 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_FA_DUMP)	+= opal-fadump.o
 obj-$(CONFIG_PRESERVE_FA_DUMP)	+= opal-fadump.o
 obj-$(CONFIG_OPAL_CORE)	+= opal-core.o
 obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI_IOV)   += pci-sriov.o
 obj-$(CONFIG_CXL_BASE)	+= pci-cxl.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8fb17676d914..2d36a9ebf0e9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -115,26 +115,6 @@ static int __init pci_reset_phbs_setup(char *str)
 
 early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
 
-static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
-{
-	/*
-	 * WARNING: We cannot rely on the resource flags. The Linux PCI
-	 * allocation code sometimes decides to put a 64-bit prefetchable
-	 * BAR in the 32-bit window, so we have to compare the addresses.
-	 *
-	 * For simplicity we only test resource start.
-	 */
-	return (r->start >= phb->ioda.m64_base &&
-		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
-}
-
-static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
-{
-	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
-
-	return (resource_flags & flags) == flags;
-}
-
 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
 {
 	s64 rc;
@@ -172,7 +152,7 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
 	pnv_ioda_init_pe(phb, pe_no);
 }
 
-static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
 	long pe;
 
@@ -184,7 +164,7 @@ static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
 	return NULL;
 }
 
-static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 {
 	struct pnv_phb *phb = pe->phb;
 	unsigned int pe_num = pe->pe_number;
@@ -816,7 +796,7 @@ static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
 		pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
 }
 
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
 	uint8_t bcomp, dcomp, fcomp;
@@ -887,7 +867,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 	return 0;
 }
 
-static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
 	struct pci_dev *parent;
 	uint8_t bcomp, dcomp, fcomp;
@@ -982,91 +962,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 	return 0;
 }
 
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
-{
-	struct pci_dn *pdn = pci_get_pdn(dev);
-	int i;
-	struct resource *res, res2;
-	resource_size_t size;
-	u16 num_vfs;
-
-	if (!dev->is_physfn)
-		return -EINVAL;
-
-	/*
-	 * "offset" is in VFs.  The M64 windows are sized so that when they
-	 * are segmented, each segment is the same size as the IOV BAR.
-	 * Each segment is in a separate PE, and the high order bits of the
-	 * address are the PE number.  Therefore, each VF's BAR is in a
-	 * separate PE, and changing the IOV BAR start address changes the
-	 * range of PEs the VFs are in.
-	 */
-	num_vfs = pdn->num_vfs;
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &dev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || !res->parent)
-			continue;
-
-		/*
-		 * The actual IOV BAR range is determined by the start address
-		 * and the actual size for num_vfs VFs BAR.  This check is to
-		 * make sure that after shifting, the range will not overlap
-		 * with another device.
-		 */
-		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
-		res2.flags = res->flags;
-		res2.start = res->start + (size * offset);
-		res2.end = res2.start + (size * num_vfs) - 1;
-
-		if (res2.end > res->end) {
-			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
-				i, &res2, res, num_vfs, offset);
-			return -EBUSY;
-		}
-	}
-
-	/*
-	 * Since M64 BAR shares segments among all possible 256 PEs,
-	 * we have to shift the beginning of PF IOV BAR to make it start from
-	 * the segment which belongs to the PE number assigned to the first VF.
-	 * This creates a "hole" in the /proc/iomem which could be used for
-	 * allocating other resources so we reserve this area below and
-	 * release when IOV is released.
-	 */
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &dev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || !res->parent)
-			continue;
-
-		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
-		res2 = *res;
-		res->start += size * offset;
-
-		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
-			 i, &res2, res, (offset > 0) ? "En" : "Dis",
-			 num_vfs, offset);
-
-		if (offset < 0) {
-			devm_release_resource(&dev->dev, &pdn->holes[i]);
-			memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
-		}
-
-		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
-
-		if (offset > 0) {
-			pdn->holes[i].start = res2.start;
-			pdn->holes[i].end = res2.start + size * offset - 1;
-			pdn->holes[i].flags = IORESOURCE_BUS;
-			pdn->holes[i].name = "pnv_iov_reserved";
-			devm_request_resource(&dev->dev, res->parent,
-					&pdn->holes[i]);
-		}
-	}
-	return 0;
-}
-#endif /* CONFIG_PCI_IOV */
-
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
 	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
@@ -1294,406 +1189,9 @@ static void pnv_pci_ioda_setup_nvlink(void)
 #endif
 }
 
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
-{
-	struct pnv_phb        *phb;
-	struct pci_dn         *pdn;
-	int                    i, j;
-	int                    m64_bars;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-
-	if (pdn->m64_single_mode)
-		m64_bars = num_vfs;
-	else
-		m64_bars = 1;
-
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < m64_bars; j++) {
-			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
-				continue;
-			opal_pci_phb_mmio_enable(phb->opal_id,
-				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
-			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
-			pdn->m64_map[j][i] = IODA_INVALID_M64;
-		}
-
-	kfree(pdn->m64_map);
-	return 0;
-}
-
-static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
-{
-	struct pnv_phb        *phb;
-	struct pci_dn         *pdn;
-	unsigned int           win;
-	struct resource       *res;
-	int                    i, j;
-	int64_t                rc;
-	int                    total_vfs;
-	resource_size_t        size, start;
-	int                    pe_num;
-	int                    m64_bars;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-	total_vfs = pci_sriov_get_totalvfs(pdev);
-
-	if (pdn->m64_single_mode)
-		m64_bars = num_vfs;
-	else
-		m64_bars = 1;
-
-	pdn->m64_map = kmalloc_array(m64_bars,
-				     sizeof(*pdn->m64_map),
-				     GFP_KERNEL);
-	if (!pdn->m64_map)
-		return -ENOMEM;
-	/* Initialize the m64_map to IODA_INVALID_M64 */
-	for (i = 0; i < m64_bars ; i++)
-		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
-			pdn->m64_map[i][j] = IODA_INVALID_M64;
-
-
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &pdev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || !res->parent)
-			continue;
-
-		for (j = 0; j < m64_bars; j++) {
-			do {
-				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-						phb->ioda.m64_bar_idx + 1, 0);
-
-				if (win >= phb->ioda.m64_bar_idx + 1)
-					goto m64_failed;
-			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
-
-			pdn->m64_map[j][i] = win;
-
-			if (pdn->m64_single_mode) {
-				size = pci_iov_resource_size(pdev,
-							PCI_IOV_RESOURCES + i);
-				start = res->start + size * j;
-			} else {
-				size = resource_size(res);
-				start = res->start;
-			}
-
-			/* Map the M64 here */
-			if (pdn->m64_single_mode) {
-				pe_num = pdn->pe_num_map[j];
-				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-						pe_num, OPAL_M64_WINDOW_TYPE,
-						pdn->m64_map[j][i], 0);
-			}
-
-			rc = opal_pci_set_phb_mem_window(phb->opal_id,
-						 OPAL_M64_WINDOW_TYPE,
-						 pdn->m64_map[j][i],
-						 start,
-						 0, /* unused */
-						 size);
-
-
-			if (rc != OPAL_SUCCESS) {
-				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
-					win, rc);
-				goto m64_failed;
-			}
-
-			if (pdn->m64_single_mode)
-				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
-			else
-				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
-
-			if (rc != OPAL_SUCCESS) {
-				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
-					win, rc);
-				goto m64_failed;
-			}
-		}
-	}
-	return 0;
-
-m64_failed:
-	pnv_pci_vf_release_m64(pdev, num_vfs);
-	return -EBUSY;
-}
-
-static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
-
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
-{
-	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe, *pe_n;
-	struct pci_dn         *pdn;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-
-	if (!pdev->is_physfn)
-		return;
-
-	/* FIXME: Use pnv_ioda_release_pe()? */
-	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
-		if (pe->parent_dev != pdev)
-			continue;
-
-		pnv_pci_ioda2_release_pe_dma(pe);
-
-		/* Remove from list */
-		mutex_lock(&phb->ioda.pe_list_mutex);
-		list_del(&pe->list);
-		mutex_unlock(&phb->ioda.pe_list_mutex);
-
-		pnv_ioda_deconfigure_pe(phb, pe);
-
-		pnv_ioda_free_pe(pe);
-	}
-}
-
-static void pnv_pci_sriov_disable(struct pci_dev *pdev)
-{
-	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe;
-	struct pci_dn         *pdn;
-	u16                    num_vfs, i;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-	num_vfs = pdn->num_vfs;
-
-	/* Release VF PEs */
-	pnv_ioda_release_vf_PE(pdev);
-
-	if (phb->type == PNV_PHB_IODA2) {
-		if (!pdn->m64_single_mode)
-			pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
-
-		/* Release M64 windows */
-		pnv_pci_vf_release_m64(pdev, num_vfs);
-
-		/* Release PE numbers */
-		if (pdn->m64_single_mode) {
-			for (i = 0; i < num_vfs; i++) {
-				if (pdn->pe_num_map[i] == IODA_INVALID_PE)
-					continue;
-
-				pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
-				pnv_ioda_free_pe(pe);
-			}
-		} else
-			bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
-		/* Releasing pe_num_map */
-		kfree(pdn->pe_num_map);
-	}
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe);
-static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
-{
-	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe;
-	int                    pe_num;
-	u16                    vf_index;
-	struct pci_dn         *pdn;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-
-	if (!pdev->is_physfn)
-		return;
-
-	/* Reserve PE for each VF */
-	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
-		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
-		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
-		struct pci_dn *vf_pdn;
-
-		if (pdn->m64_single_mode)
-			pe_num = pdn->pe_num_map[vf_index];
-		else
-			pe_num = *pdn->pe_num_map + vf_index;
-
-		pe = &phb->ioda.pe_array[pe_num];
-		pe->pe_number = pe_num;
-		pe->phb = phb;
-		pe->flags = PNV_IODA_PE_VF;
-		pe->pbus = NULL;
-		pe->parent_dev = pdev;
-		pe->mve_number = -1;
-		pe->rid = (vf_bus << 8) | vf_devfn;
-
-		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
-			pci_domain_nr(pdev->bus), pdev->bus->number,
-			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
-
-		if (pnv_ioda_configure_pe(phb, pe)) {
-			/* XXX What do we do here ? */
-			pnv_ioda_free_pe(pe);
-			pe->pdev = NULL;
-			continue;
-		}
-
-		/* Put PE to the list */
-		mutex_lock(&phb->ioda.pe_list_mutex);
-		list_add_tail(&pe->list, &phb->ioda.pe_list);
-		mutex_unlock(&phb->ioda.pe_list_mutex);
-
-		/* associate this pe to it's pdn */
-		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
-			if (vf_pdn->busno == vf_bus &&
-			    vf_pdn->devfn == vf_devfn) {
-				vf_pdn->pe_number = pe_num;
-				break;
-			}
-		}
-
-		pnv_pci_ioda2_setup_dma_pe(phb, pe);
-	}
-}
-
-static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
-	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe;
-	struct pci_dn         *pdn;
-	int                    ret;
-	u16                    i;
-
-	phb = pci_bus_to_pnvhb(pdev->bus);
-	pdn = pci_get_pdn(pdev);
-
-	if (phb->type == PNV_PHB_IODA2) {
-		if (!pdn->vfs_expanded) {
-			dev_info(&pdev->dev, "don't support this SRIOV device"
-				" with non 64bit-prefetchable IOV BAR\n");
-			return -ENOSPC;
-		}
-
-		/*
-		 * When M64 BARs functions in Single PE mode, the number of VFs
-		 * could be enabled must be less than the number of M64 BARs.
-		 */
-		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
-			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
-			return -EBUSY;
-		}
-
-		/* Allocating pe_num_map */
-		if (pdn->m64_single_mode)
-			pdn->pe_num_map = kmalloc_array(num_vfs,
-							sizeof(*pdn->pe_num_map),
-							GFP_KERNEL);
-		else
-			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
-
-		if (!pdn->pe_num_map)
-			return -ENOMEM;
-
-		if (pdn->m64_single_mode)
-			for (i = 0; i < num_vfs; i++)
-				pdn->pe_num_map[i] = IODA_INVALID_PE;
-
-		/* Calculate available PE for required VFs */
-		if (pdn->m64_single_mode) {
-			for (i = 0; i < num_vfs; i++) {
-				pe = pnv_ioda_alloc_pe(phb);
-				if (!pe) {
-					ret = -EBUSY;
-					goto m64_failed;
-				}
-
-				pdn->pe_num_map[i] = pe->pe_number;
-			}
-		} else {
-			mutex_lock(&phb->ioda.pe_alloc_mutex);
-			*pdn->pe_num_map = bitmap_find_next_zero_area(
-				phb->ioda.pe_alloc, phb->ioda.total_pe_num,
-				0, num_vfs, 0);
-			if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
-				mutex_unlock(&phb->ioda.pe_alloc_mutex);
-				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
-				kfree(pdn->pe_num_map);
-				return -EBUSY;
-			}
-			bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
-			mutex_unlock(&phb->ioda.pe_alloc_mutex);
-		}
-		pdn->num_vfs = num_vfs;
-
-		/* Assign M64 window accordingly */
-		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
-		if (ret) {
-			dev_info(&pdev->dev, "Not enough M64 window resources\n");
-			goto m64_failed;
-		}
-
-		/*
-		 * When using one M64 BAR to map one IOV BAR, we need to shift
-		 * the IOV BAR according to the PE# allocated to the VFs.
-		 * Otherwise, the PE# for the VF will conflict with others.
-		 */
-		if (!pdn->m64_single_mode) {
-			ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
-			if (ret)
-				goto m64_failed;
-		}
-	}
-
-	/* Setup VF PEs */
-	pnv_ioda_setup_vf_PE(pdev, num_vfs);
-
-	return 0;
-
-m64_failed:
-	if (pdn->m64_single_mode) {
-		for (i = 0; i < num_vfs; i++) {
-			if (pdn->pe_num_map[i] == IODA_INVALID_PE)
-				continue;
-
-			pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
-			pnv_ioda_free_pe(pe);
-		}
-	} else
-		bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
-
-	/* Releasing pe_num_map */
-	kfree(pdn->pe_num_map);
-
-	return ret;
-}
-
-static int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
-{
-	pnv_pci_sriov_disable(pdev);
-
-	/* Release PCI data */
-	remove_sriov_vf_pdns(pdev);
-	return 0;
-}
-
-static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
-	/* Allocate PCI data */
-	add_sriov_vf_pdns(pdev);
-
-	return pnv_pci_sriov_enable(pdev, num_vfs);
-}
-#endif /* CONFIG_PCI_IOV */
-
 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe);
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe);
-
 static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
 {
 	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
@@ -2559,8 +2057,8 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 };
 #endif
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				struct pnv_ioda_pe *pe)
 {
 	int64_t rc;
 
@@ -2737,117 +2235,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
 		count, phb->msi_base);
 }
 
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
-{
-	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
-	struct resource *res;
-	int i;
-	resource_size_t size, total_vf_bar_sz;
-	struct pci_dn *pdn;
-	int mul, total_vfs;
-
-	pdn = pci_get_pdn(pdev);
-	pdn->vfs_expanded = 0;
-	pdn->m64_single_mode = false;
-
-	total_vfs = pci_sriov_get_totalvfs(pdev);
-	mul = phb->ioda.total_pe_num;
-	total_vf_bar_sz = 0;
-
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &pdev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || res->parent)
-			continue;
-		if (!pnv_pci_is_m64_flags(res->flags)) {
-			dev_warn(&pdev->dev, "Don't support SR-IOV with"
-					" non M64 VF BAR%d: %pR. \n",
-				 i, res);
-			goto truncate_iov;
-		}
-
-		total_vf_bar_sz += pci_iov_resource_size(pdev,
-				i + PCI_IOV_RESOURCES);
-
-		/*
-		 * If bigger than quarter of M64 segment size, just round up
-		 * power of two.
-		 *
-		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
-		 * with other devices, IOV BAR size is expanded to be
-		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
-		 * segment size , the expanded size would equal to half of the
-		 * whole M64 space size, which will exhaust the M64 Space and
-		 * limit the system flexibility.  This is a design decision to
-		 * set the boundary to quarter of the M64 segment size.
-		 */
-		if (total_vf_bar_sz > gate) {
-			mul = roundup_pow_of_two(total_vfs);
-			dev_info(&pdev->dev,
-				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
-				total_vf_bar_sz, gate, mul);
-			pdn->m64_single_mode = true;
-			break;
-		}
-	}
-
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &pdev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || res->parent)
-			continue;
-
-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
-		/*
-		 * On PHB3, the minimum size alignment of M64 BAR in single
-		 * mode is 32MB.
-		 */
-		if (pdn->m64_single_mode && (size < SZ_32M))
-			goto truncate_iov;
-		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
-		res->end = res->start + size * mul - 1;
-		dev_dbg(&pdev->dev, "                       %pR\n", res);
-		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
-			 i, res, mul);
-	}
-	pdn->vfs_expanded = mul;
-
-	return;
-
-truncate_iov:
-	/* To save MMIO space, IOV BAR is truncated. */
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &pdev->resource[i + PCI_IOV_RESOURCES];
-		res->flags = 0;
-		res->end = res->start - 1;
-	}
-}
-
-static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
-{
-	if (WARN_ON(pci_dev_is_added(pdev)))
-		return;
-
-	if (pdev->is_virtfn) {
-		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
-
-		/*
-		 * VF PEs are single-device PEs so their pdev pointer needs to
-		 * be set. The pdev doesn't exist when the PE is allocated (in
-		 * (pcibios_sriov_enable()) so we fix it up here.
-		 */
-		pe->pdev = pdev;
-		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
-	} else if (pdev->is_physfn) {
-		/*
-		 * For PFs adjust their allocated IOV resources to match what
-		 * the PHB can support using it's M64 BAR table.
-		 */
-		pnv_pci_ioda_fixup_iov_resources(pdev);
-	}
-}
-#endif /* CONFIG_PCI_IOV */
-
 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
 				  struct resource *res)
 {
@@ -3192,41 +2579,6 @@ static resource_size_t pnv_pci_default_alignment(void)
 	return PAGE_SIZE;
 }
 
-#ifdef CONFIG_PCI_IOV
-static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
-						      int resno)
-{
-	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-	struct pci_dn *pdn = pci_get_pdn(pdev);
-	resource_size_t align;
-
-	/*
-	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
-	 * SR-IOV. While from hardware perspective, the range mapped by M64
-	 * BAR should be size aligned.
-	 *
-	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
-	 * powernv-specific hardware restriction is gone. But if just use the
-	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
-	 * in one segment of M64 #15, which introduces the PE conflict between
-	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
-	 * m64_segsize.
-	 *
-	 * This function returns the total IOV BAR size if M64 BAR is in
-	 * Shared PE mode or just VF BAR size if not.
-	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
-	 * M64 segment size if IOV BAR size is less.
-	 */
-	align = pci_iov_resource_size(pdev, resno);
-	if (!pdn->vfs_expanded)
-		return align;
-	if (pdn->m64_single_mode)
-		return max(align, (resource_size_t)phb->ioda.m64_segsize);
-
-	return pdn->vfs_expanded * align;
-}
-#endif /* CONFIG_PCI_IOV */
-
 /* Prevent enabling devices for which we couldn't properly
  * assign a PE
  */
@@ -3323,7 +2675,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 	iommu_tce_table_put(tbl);
 }
 
-static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 {
 	struct iommu_table *tbl = pe->table_group.tables[0];
 	int64_t rc;
@@ -3436,12 +2788,23 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 
+	/* The VF PE state is torn down when sriov_disable() is called */
 	if (pdev->is_virtfn)
 		return;
 
 	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
 		return;
 
+#ifdef CONFIG_PCI_IOV
+	/*
+	 * FIXME: Try move this to sriov_disable(). It's here since we allocate
+	 * the iov state at probe time since we need to fiddle with the IOV
+	 * resources.
+	 */
+	if (pdev->is_physfn)
+		kfree(pdev->dev.archdata.iov_data);
+#endif
+
 	/*
 	 * PCI hotplug can happen as part of EEH error recovery. The @pdn
 	 * isn't removed and added afterwards in this scenario. We should
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
new file mode 100644
index 000000000000..080ea39f5a83
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/bitmap.h>
+#include <linux/pci.h>
+
+#include <asm/opal.h>
+
+#include "pci.h"
+
+/* for pci_dev_is_added() */
+#include "../../../../drivers/pci/pci.h"
+
+
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
+	struct resource *res;
+	int i;
+	resource_size_t size, total_vf_bar_sz;
+	struct pnv_iov_data *iov;
+	int mul, total_vfs;
+
+	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+	if (!iov)
+		goto truncate_iov;
+	pdev->dev.archdata.iov_data = iov;
+
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+	mul = phb->ioda.total_pe_num;
+	total_vf_bar_sz = 0;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+		if (!pnv_pci_is_m64_flags(res->flags)) {
+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
+					" non M64 VF BAR%d: %pR. \n",
+				 i, res);
+			goto truncate_iov;
+		}
+
+		total_vf_bar_sz += pci_iov_resource_size(pdev,
+				i + PCI_IOV_RESOURCES);
+
+		/*
+		 * If bigger than quarter of M64 segment size, just round up
+		 * power of two.
+		 *
+		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
+		 * with other devices, IOV BAR size is expanded to be
+		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
+		 * segment size , the expanded size would equal to half of the
+		 * whole M64 space size, which will exhaust the M64 Space and
+		 * limit the system flexibility.  This is a design decision to
+		 * set the boundary to quarter of the M64 segment size.
+		 */
+		if (total_vf_bar_sz > gate) {
+			mul = roundup_pow_of_two(total_vfs);
+			dev_info(&pdev->dev,
+				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
+				total_vf_bar_sz, gate, mul);
+			iov->m64_single_mode = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+
+		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		/*
+		 * On PHB3, the minimum size alignment of M64 BAR in single
+		 * mode is 32MB.
+		 */
+		if (iov->m64_single_mode && (size < SZ_32M))
+			goto truncate_iov;
+		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
+		res->end = res->start + size * mul - 1;
+		dev_dbg(&pdev->dev, "                       %pR\n", res);
+		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+			 i, res, mul);
+	}
+	iov->vfs_expanded = mul;
+
+	return;
+
+truncate_iov:
+	/* To save MMIO space, IOV BAR is truncated. */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		res->flags = 0;
+		res->end = res->start - 1;
+	}
+
+	pdev->dev.archdata.iov_data = NULL;
+	kfree(iov);
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+	if (WARN_ON(pci_dev_is_added(pdev)))
+		return;
+
+	if (pdev->is_virtfn) {
+		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+		/*
+		 * VF PEs are single-device PEs so their pdev pointer needs to
+		 * be set. The pdev doesn't exist when the PE is allocated (in
+		 * (pcibios_sriov_enable()) so we fix it up here.
+		 */
+		pe->pdev = pdev;
+		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+	} else if (pdev->is_physfn) {
+		/*
+		 * For PFs adjust their allocated IOV resources to match what
+		 * the PHB can support using it's M64 BAR table.
+		 */
+		pnv_pci_ioda_fixup_iov_resources(pdev);
+	}
+}
+
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+						      int resno)
+{
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+	struct pnv_iov_data *iov = pnv_iov_get(pdev);
+	resource_size_t align;
+
+	/*
+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+	 * SR-IOV. While from hardware perspective, the range mapped by M64
+	 * BAR should be size aligned.
+	 *
+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
+	 * powernv-specific hardware restriction is gone. But if just use the
+	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
+	 * in one segment of M64 #15, which introduces the PE conflict between
+	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
+	 * m64_segsize.
+	 *
+	 * This function returns the total IOV BAR size if M64 BAR is in
+	 * Shared PE mode or just VF BAR size if not.
+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
+	 * M64 segment size if IOV BAR size is less.
+	 */
+	align = pci_iov_resource_size(pdev, resno);
+
+	/*
+	 * iov can be null if we have an SR-IOV device with IOV BAR that can't
+	 * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
+	 * In that case we don't allow VFs to be enabled so just return the
+	 * default alignment.
+	 */
+	if (!iov)
+		return align;
+	if (!iov->vfs_expanded)
+		return align;
+	if (iov->m64_single_mode)
+		return max(align, (resource_size_t)phb->ioda.m64_segsize);
+
+	return iov->vfs_expanded * align;
+}
+
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	int                    i, j;
+	int                    m64_bars;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+
+	if (iov->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		for (j = 0; j < m64_bars; j++) {
+			if (iov->m64_map[j][i] == IODA_INVALID_M64)
+				continue;
+			opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 0);
+			clear_bit(iov->m64_map[j][i], &phb->ioda.m64_bar_alloc);
+			iov->m64_map[j][i] = IODA_INVALID_M64;
+		}
+
+	kfree(iov->m64_map);
+	return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	unsigned int           win;
+	struct resource       *res;
+	int                    i, j;
+	int64_t                rc;
+	int                    total_vfs;
+	resource_size_t        size, start;
+	int                    pe_num;
+	int                    m64_bars;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+
+	if (iov->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
+	iov->m64_map = kmalloc_array(m64_bars,
+				     sizeof(*iov->m64_map),
+				     GFP_KERNEL);
+	if (!iov->m64_map)
+		return -ENOMEM;
+	/* Initialize the m64_map to IODA_INVALID_M64 */
+	for (i = 0; i < m64_bars ; i++)
+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
+			iov->m64_map[i][j] = IODA_INVALID_M64;
+
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		for (j = 0; j < m64_bars; j++) {
+			do {
+				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+						phb->ioda.m64_bar_idx + 1, 0);
+
+				if (win >= phb->ioda.m64_bar_idx + 1)
+					goto m64_failed;
+			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+			iov->m64_map[j][i] = win;
+
+			if (iov->m64_single_mode) {
+				size = pci_iov_resource_size(pdev,
+							PCI_IOV_RESOURCES + i);
+				start = res->start + size * j;
+			} else {
+				size = resource_size(res);
+				start = res->start;
+			}
+
+			/* Map the M64 here */
+			if (iov->m64_single_mode) {
+				pe_num = iov->pe_num_map[j];
+				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+						pe_num, OPAL_M64_WINDOW_TYPE,
+						iov->m64_map[j][i], 0);
+			}
+
+			rc = opal_pci_set_phb_mem_window(phb->opal_id,
+						 OPAL_M64_WINDOW_TYPE,
+						 iov->m64_map[j][i],
+						 start,
+						 0, /* unused */
+						 size);
+
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+					win, rc);
+				goto m64_failed;
+			}
+
+			if (iov->m64_single_mode)
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 2);
+			else
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 1);
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
+					win, rc);
+				goto m64_failed;
+			}
+		}
+	}
+	return 0;
+
+m64_failed:
+	pnv_pci_vf_release_m64(pdev, num_vfs);
+	return -EBUSY;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe, *pe_n;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+
+	if (!pdev->is_physfn)
+		return;
+
+	/* FIXME: Use pnv_ioda_release_pe()? */
+	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+		if (pe->parent_dev != pdev)
+			continue;
+
+		pnv_pci_ioda2_release_pe_dma(pe);
+
+		/* Remove from list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_del(&pe->list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_ioda_deconfigure_pe(phb, pe);
+
+		pnv_ioda_free_pe(pe);
+	}
+}
+
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+	struct resource *res, res2;
+	struct pnv_iov_data *iov;
+	resource_size_t size;
+	u16 num_vfs;
+	int i;
+
+	if (!dev->is_physfn)
+		return -EINVAL;
+	iov = pnv_iov_get(dev);
+
+	/*
+	 * "offset" is in VFs.  The M64 windows are sized so that when they
+	 * are segmented, each segment is the same size as the IOV BAR.
+	 * Each segment is in a separate PE, and the high order bits of the
+	 * address are the PE number.  Therefore, each VF's BAR is in a
+	 * separate PE, and changing the IOV BAR start address changes the
+	 * range of PEs the VFs are in.
+	 */
+	num_vfs = iov->num_vfs;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the actual size for num_vfs VFs BAR.  This check is to
+		 * make sure that after shifting, the range will not overlap
+		 * with another device.
+		 */
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2.flags = res->flags;
+		res2.start = res->start + (size * offset);
+		res2.end = res2.start + (size * num_vfs) - 1;
+
+		if (res2.end > res->end) {
+			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+				i, &res2, res, num_vfs, offset);
+			return -EBUSY;
+		}
+	}
+
+	/*
+	 * Since M64 BAR shares segments among all possible 256 PEs,
+	 * we have to shift the beginning of PF IOV BAR to make it start from
+	 * the segment which belongs to the PE number assigned to the first VF.
+	 * This creates a "hole" in the /proc/iomem which could be used for
+	 * allocating other resources so we reserve this area below and
+	 * release when IOV is released.
+	 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2 = *res;
+		res->start += size * offset;
+
+		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
+			 i, &res2, res, (offset > 0) ? "En" : "Dis",
+			 num_vfs, offset);
+
+		if (offset < 0) {
+			devm_release_resource(&dev->dev, &iov->holes[i]);
+			memset(&iov->holes[i], 0, sizeof(iov->holes[i]));
+		}
+
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+		if (offset > 0) {
+			iov->holes[i].start = res2.start;
+			iov->holes[i].end = res2.start + size * offset - 1;
+			iov->holes[i].flags = IORESOURCE_BUS;
+			iov->holes[i].name = "pnv_iov_reserved";
+			devm_request_resource(&dev->dev, res->parent,
+					&iov->holes[i]);
+		}
+	}
+	return 0;
+}
+
+static void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	struct pnv_iov_data   *iov;
+	u16                    num_vfs, i;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+	num_vfs = iov->num_vfs;
+
+	/* Release VF PEs */
+	pnv_ioda_release_vf_PE(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		if (!iov->m64_single_mode)
+			pnv_pci_vf_resource_shift(pdev, -*iov->pe_num_map);
+
+		/* Release M64 windows */
+		pnv_pci_vf_release_m64(pdev, num_vfs);
+
+		/* Release PE numbers */
+		if (iov->m64_single_mode) {
+			for (i = 0; i < num_vfs; i++) {
+				if (iov->pe_num_map[i] == IODA_INVALID_PE)
+					continue;
+
+				pe = &phb->ioda.pe_array[iov->pe_num_map[i]];
+				pnv_ioda_free_pe(pe);
+			}
+		} else
+			bitmap_clear(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
+		/* Releasing pe_num_map */
+		kfree(iov->pe_num_map);
+	}
+}
+
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    pe_num;
+	u16                    vf_index;
+	struct pnv_iov_data   *iov;
+	struct pci_dn         *pdn;
+
+	if (!pdev->is_physfn)
+		return;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	pdn = pci_get_pdn(pdev);
+	iov = pnv_iov_get(pdev);
+
+	/* Reserve PE for each VF */
+	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+		struct pci_dn *vf_pdn;
+
+		if (iov->m64_single_mode)
+			pe_num = iov->pe_num_map[vf_index];
+		else
+			pe_num = *iov->pe_num_map + vf_index;
+
+		pe = &phb->ioda.pe_array[pe_num];
+		pe->pe_number = pe_num;
+		pe->phb = phb;
+		pe->flags = PNV_IODA_PE_VF;
+		pe->pbus = NULL;
+		pe->parent_dev = pdev;
+		pe->mve_number = -1;
+		pe->rid = (vf_bus << 8) | vf_devfn;
+
+		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
+			pci_domain_nr(pdev->bus), pdev->bus->number,
+			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
+
+		if (pnv_ioda_configure_pe(phb, pe)) {
+			/* XXX What do we do here ? */
+			pnv_ioda_free_pe(pe);
+			pe->pdev = NULL;
+			continue;
+		}
+
+		/* Put PE to the list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_add_tail(&pe->list, &phb->ioda.pe_list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		/* associate this pe to it's pdn */
+		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+			if (vf_pdn->busno == vf_bus &&
+			    vf_pdn->devfn == vf_devfn) {
+				vf_pdn->pe_number = pe_num;
+				break;
+			}
+		}
+
+		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+	}
+}
+
+static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pnv_iov_data   *iov;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    ret;
+	u16                    i;
+
+	phb = pci_bus_to_pnvhb(pdev->bus);
+	iov = pnv_iov_get(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		if (!iov->vfs_expanded) {
+			dev_info(&pdev->dev, "don't support this SRIOV device"
+				" with non 64bit-prefetchable IOV BAR\n");
+			return -ENOSPC;
+		}
+
+		/*
+		 * When M64 BARs functions in Single PE mode, the number of VFs
+		 * could be enabled must be less than the number of M64 BARs.
+		 */
+		if (iov->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
+			return -EBUSY;
+		}
+
+		/* Allocating pe_num_map */
+		if (iov->m64_single_mode)
+			iov->pe_num_map = kmalloc_array(num_vfs,
+							sizeof(*iov->pe_num_map),
+							GFP_KERNEL);
+		else
+			iov->pe_num_map = kmalloc(sizeof(*iov->pe_num_map), GFP_KERNEL);
+
+		if (!iov->pe_num_map)
+			return -ENOMEM;
+
+		if (iov->m64_single_mode)
+			for (i = 0; i < num_vfs; i++)
+				iov->pe_num_map[i] = IODA_INVALID_PE;
+
+		/* Calculate available PE for required VFs */
+		if (iov->m64_single_mode) {
+			for (i = 0; i < num_vfs; i++) {
+				pe = pnv_ioda_alloc_pe(phb);
+				if (!pe) {
+					ret = -EBUSY;
+					goto m64_failed;
+				}
+
+				iov->pe_num_map[i] = pe->pe_number;
+			}
+		} else {
+			mutex_lock(&phb->ioda.pe_alloc_mutex);
+			*iov->pe_num_map = bitmap_find_next_zero_area(
+				phb->ioda.pe_alloc, phb->ioda.total_pe_num,
+				0, num_vfs, 0);
+			if (*iov->pe_num_map >= phb->ioda.total_pe_num) {
+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+				kfree(iov->pe_num_map);
+				return -EBUSY;
+			}
+			bitmap_set(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
+			mutex_unlock(&phb->ioda.pe_alloc_mutex);
+		}
+		iov->num_vfs = num_vfs;
+
+		/* Assign M64 window accordingly */
+		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+		if (ret) {
+			dev_info(&pdev->dev, "Not enough M64 window resources\n");
+			goto m64_failed;
+		}
+
+		/*
+		 * When using one M64 BAR to map one IOV BAR, we need to shift
+		 * the IOV BAR according to the PE# allocated to the VFs.
+		 * Otherwise, the PE# for the VF will conflict with others.
+		 */
+		if (!iov->m64_single_mode) {
+			ret = pnv_pci_vf_resource_shift(pdev, *iov->pe_num_map);
+			if (ret)
+				goto m64_failed;
+		}
+	}
+
+	/* Setup VF PEs */
+	pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+	return 0;
+
+m64_failed:
+	if (iov->m64_single_mode) {
+		for (i = 0; i < num_vfs; i++) {
+			if (iov->pe_num_map[i] == IODA_INVALID_PE)
+				continue;
+
+			pe = &phb->ioda.pe_array[iov->pe_num_map[i]];
+			pnv_ioda_free_pe(pe);
+		}
+	} else
+		bitmap_clear(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
+
+	/* Releasing pe_num_map */
+	kfree(iov->pe_num_map);
+
+	return ret;
+}
+
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+	pnv_pci_sriov_disable(pdev);
+
+	/* Release PCI data */
+	remove_sriov_vf_pdns(pdev);
+	return 0;
+}
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	/* Allocate PCI data */
+	add_sriov_vf_pdns(pdev);
+
+	return pnv_pci_sriov_enable(pdev, num_vfs);
+}
+
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 6aa6aefb637d..0156d7d17f7d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -194,6 +194,80 @@ struct pnv_phb {
 	u8			*diag_data;
 };
 
+
+/* IODA PE management */
+
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+	/*
+	 * WARNING: We cannot rely on the resource flags. The Linux PCI
+	 * allocation code sometimes decides to put a 64-bit prefetchable
+	 * BAR in the 32-bit window, so we have to compare the addresses.
+	 *
+	 * For simplicity we only test resource start.
+	 */
+	return (r->start >= phb->ioda.m64_base &&
+		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
+{
+	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+
+	return (resource_flags & flags) == flags;
+}
+
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
+
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb);
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
+
+#ifdef CONFIG_PCI_IOV
+/*
+ * For SR-IOV we want to put each VF's MMIO resource in to a separate PE.
+ * This requires a bit of acrobatics with the MMIO -> PE configuration
+ * and this structure is used to keep track of it all.
+ */
+struct pnv_iov_data {
+	/* number of VFs IOV BAR expanded. FIXME: rename this to something less bad */
+	u16     vfs_expanded;
+
+	/* number of VFs enabled */
+	u16     num_vfs;
+	unsigned int *pe_num_map;	/* PE# for the first VF PE or array */
+
+	/* Did we map the VF BARs with single-PE IODA BARs? */
+	bool    m64_single_mode;
+
+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
+#define IODA_INVALID_M64        (-1)
+
+	/*
+	 * If we map the SR-IOV BARs with a segmented window then
+	 * parts of that window will be "claimed" by other PEs.
+	 *
+	 * "holes" here is used to reserve the leading portion
+	 * of the window that is used by other (non VF) PEs.
+	 */
+	struct resource holes[PCI_SRIOV_NUM_BARS];
+};
+
+static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev)
+{
+	return pdev->dev.archdata.iov_data;
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev);
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno);
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev);
+#endif /* CONFIG_PCI_IOV */
+
 extern struct pci_ops pnv_pci_ops;
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
-- 
2.26.2


^ permalink raw reply related

* [PATCH 07/15] powerpc/powernv/sriov: Rename truncate_iov
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

This prevents SR-IOV being used by making the SR-IOV BAR resources
unallocatable. Rename it to reflect what it actually does.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index f4c74ab1284d..216ceeff69b0 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -155,7 +155,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 
 	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
 	if (!iov)
-		goto truncate_iov;
+		goto disable_iov;
 	pdev->dev.archdata.iov_data = iov;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
@@ -170,7 +170,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 			dev_warn(&pdev->dev, "Don't support SR-IOV with"
 					" non M64 VF BAR%d: %pR. \n",
 				 i, res);
-			goto truncate_iov;
+			goto disable_iov;
 		}
 
 		total_vf_bar_sz += pci_iov_resource_size(pdev,
@@ -209,7 +209,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		 * mode is 32MB.
 		 */
 		if (iov->m64_single_mode && (size < SZ_32M))
-			goto truncate_iov;
+			goto disable_iov;
+
 		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
 		res->end = res->start + size * mul - 1;
 		dev_dbg(&pdev->dev, "                       %pR\n", res);
@@ -220,8 +221,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 
 	return;
 
-truncate_iov:
-	/* To save MMIO space, IOV BAR is truncated. */
+disable_iov:
+	/* Save ourselves some MMIO space by disabling the unusable BARs */
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		res->flags = 0;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 08/15] powerpc/powernv/sriov: Simplify used window tracking
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

No need for the multi-dimensional arrays, just use a bitmap.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 48 +++++++---------------
 arch/powerpc/platforms/powernv/pci.h       |  7 +++-
 2 files changed, 20 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 216ceeff69b0..e4c65cb49757 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -303,28 +303,20 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pnv_iov_data   *iov;
 	struct pnv_phb        *phb;
-	int                    i, j;
-	int                    m64_bars;
+	int window_id;
 
 	phb = pci_bus_to_pnvhb(pdev->bus);
 	iov = pnv_iov_get(pdev);
 
-	if (iov->m64_single_mode)
-		m64_bars = num_vfs;
-	else
-		m64_bars = 1;
+	for_each_set_bit(window_id, iov->used_m64_bar_mask, 64) {
+		opal_pci_phb_mmio_enable(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 0);
 
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < m64_bars; j++) {
-			if (iov->m64_map[j][i] == IODA_INVALID_M64)
-				continue;
-			opal_pci_phb_mmio_enable(phb->opal_id,
-				OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 0);
-			clear_bit(iov->m64_map[j][i], &phb->ioda.m64_bar_alloc);
-			iov->m64_map[j][i] = IODA_INVALID_M64;
-		}
+		clear_bit(window_id, &phb->ioda.m64_bar_alloc);
+	}
 
-	kfree(iov->m64_map);
 	return 0;
 }
 
@@ -350,23 +342,14 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	else
 		m64_bars = 1;
 
-	iov->m64_map = kmalloc_array(m64_bars,
-				     sizeof(*iov->m64_map),
-				     GFP_KERNEL);
-	if (!iov->m64_map)
-		return -ENOMEM;
-	/* Initialize the m64_map to IODA_INVALID_M64 */
-	for (i = 0; i < m64_bars ; i++)
-		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
-			iov->m64_map[i][j] = IODA_INVALID_M64;
-
-
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
 
 		for (j = 0; j < m64_bars; j++) {
+
+			/* allocate a window ID for this BAR */
 			do {
 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
 						phb->ioda.m64_bar_idx + 1, 0);
@@ -374,8 +357,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 				if (win >= phb->ioda.m64_bar_idx + 1)
 					goto m64_failed;
 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
-
-			iov->m64_map[j][i] = win;
+			set_bit(win, iov->used_m64_bar_mask);
 
 			if (iov->m64_single_mode) {
 				size = pci_iov_resource_size(pdev,
@@ -391,12 +373,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 				pe_num = iov->pe_num_map[j];
 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
 						pe_num, OPAL_M64_WINDOW_TYPE,
-						iov->m64_map[j][i], 0);
+						win, 0);
 			}
 
 			rc = opal_pci_set_phb_mem_window(phb->opal_id,
 						 OPAL_M64_WINDOW_TYPE,
-						 iov->m64_map[j][i],
+						 win,
 						 start,
 						 0, /* unused */
 						 size);
@@ -410,10 +392,10 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 
 			if (iov->m64_single_mode)
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 2);
+				     OPAL_M64_WINDOW_TYPE, win, 2);
 			else
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 1);
+				     OPAL_M64_WINDOW_TYPE, win, 1);
 
 			if (rc != OPAL_SUCCESS) {
 				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 0156d7d17f7d..58c97e60c3db 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -243,8 +243,11 @@ struct pnv_iov_data {
 	/* Did we map the VF BARs with single-PE IODA BARs? */
 	bool    m64_single_mode;
 
-	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
-#define IODA_INVALID_M64        (-1)
+	/*
+	 * Bit mask used to track which m64 windows that we used to map the
+	 * SR-IOV BARs for this device.
+	 */
+	DECLARE_BITMAP(used_m64_bar_mask, 64);
 
 	/*
 	 * If we map the SR-IOV BARs with a segmented window then
-- 
2.26.2


^ permalink raw reply related

* [PATCH 09/15] powerpc/powernv/sriov: Factor out M64 BAR setup
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

The sequence required to use the single PE BAR mode is kinda janky and
requires a little explanation. The API was designed with P7-IOC style
windows where the setup process is something like:

1. Configure the window start / end address
2. Enable the window
3. Map the segments of each window to the PE

For Single PE BARs the process is:

1. Set the PE for segment zero on a disabled window
2. Set the range
3. Enable the window

Move the OPAL calls into their own helper functions where the quirks can be
contained.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 132 ++++++++++++++++-----
 1 file changed, 103 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index e4c65cb49757..d53a85ccb538 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -320,6 +320,102 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 	return 0;
 }
 
+
+/*
+ * PHB3 and beyond support "accordion" windows. The window's address range
+ * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1
+ * mapping between PEs and segments.
+ *
+ * They're called that because as the window size changes the segment sizes
+ * change with it. Sort of like an accordion, sort of.
+ */
+static int64_t pnv_ioda_map_m64_accordion(struct pnv_phb *phb,
+					  int window_id,
+					  resource_size_t start,
+					  resource_size_t size)
+{
+	int64_t rc;
+
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 start,
+					 0, /* unused */
+					 size);
+	if (rc)
+		goto out;
+
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      window_id,
+				      OPAL_ENABLE_M64_SPLIT);
+out:
+	if (rc)
+		pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc);
+
+	return rc;
+}
+
+static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
+				       int pe_num,
+				       int window_id,
+				       resource_size_t start,
+				       resource_size_t size)
+{
+	int64_t rc;
+
+	/*
+	 * The API for setting up m64 mmio windows seems to have been designed
+	 * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed
+	 * split of 8 equally sized segments each of which could individually
+	 * assigned to a PE.
+	 *
+	 * The problem with this is that the API doesn't have any way to
+	 * communicate the number of segments we want on a BAR. This wasn't
+	 * a problem for p7-ioc since you didn't have a choice, but the
+	 * single PE windows added in PHB3 don't map cleanly to this API.
+	 *
+	 * As a result we've got this slightly awkward process where we
+	 * call opal_pci_map_pe_mmio_window() to put the single in single
+	 * PE mode, and set the PE for the window before setting the address
+	 * bounds. We need to do it this way because the single PE windows
+	 * for PHB3 have different alignment requirements on PHB3.
+	 */
+	rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+					 pe_num,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 0);
+	if (rc)
+		goto out;
+
+	/*
+	 * NB: In single PE mode the window needs to be aligned to 32MB
+	 */
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 window_id,
+					 start,
+					 0, /* ignored by FW, m64 is 1-1 */
+					 size);
+	if (rc)
+		goto out;
+
+	/*
+	 * Now actually enable it. We specified the BAR should be in "non-split"
+	 * mode so FW will validate that the BAR is in single PE mode.
+	 */
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      window_id,
+				      OPAL_ENABLE_M64_NON_SPLIT);
+out:
+	if (rc)
+		pr_err("Error mapping single PE BAR\n");
+
+	return rc;
+}
+
 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pnv_iov_data   *iov;
@@ -330,7 +426,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	int64_t                rc;
 	int                    total_vfs;
 	resource_size_t        size, start;
-	int                    pe_num;
 	int                    m64_bars;
 
 	phb = pci_bus_to_pnvhb(pdev->bus);
@@ -359,49 +454,28 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
 			set_bit(win, iov->used_m64_bar_mask);
 
+
 			if (iov->m64_single_mode) {
 				size = pci_iov_resource_size(pdev,
 							PCI_IOV_RESOURCES + i);
 				start = res->start + size * j;
+				rc = pnv_ioda_map_m64_single(phb, win,
+							     iov->pe_num_map[j],
+							     start,
+							     size);
 			} else {
 				size = resource_size(res);
 				start = res->start;
-			}
 
-			/* Map the M64 here */
-			if (iov->m64_single_mode) {
-				pe_num = iov->pe_num_map[j];
-				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-						pe_num, OPAL_M64_WINDOW_TYPE,
-						win, 0);
+				rc = pnv_ioda_map_m64_accordion(phb, win, start,
+								size);
 			}
 
-			rc = opal_pci_set_phb_mem_window(phb->opal_id,
-						 OPAL_M64_WINDOW_TYPE,
-						 win,
-						 start,
-						 0, /* unused */
-						 size);
-
-
 			if (rc != OPAL_SUCCESS) {
 				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
 					win, rc);
 				goto m64_failed;
 			}
-
-			if (iov->m64_single_mode)
-				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, win, 2);
-			else
-				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, win, 1);
-
-			if (rc != OPAL_SUCCESS) {
-				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
-					win, rc);
-				goto m64_failed;
-			}
 		}
 	}
 	return 0;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 10/15] powerpc/powernv/pci: Refactor pnv_ioda_alloc_pe()
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Rework the PE allocation logic to allow allocating blocks of PEs rather
than individually. We'll use this to allocate contigious blocks of PEs for
the SR-IOVs.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 41 ++++++++++++++++++-----
 arch/powerpc/platforms/powernv/pci.h      |  2 +-
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2d36a9ebf0e9..c9c25fb0783c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -145,23 +145,45 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
 		return;
 	}
 
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
 	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
 		pr_debug("%s: PE %x was reserved on PHB#%x\n",
 			 __func__, pe_no, phb->hose->global_number);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
 
 	pnv_ioda_init_pe(phb, pe_no);
 }
 
-struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
 {
-	long pe;
+	struct pnv_ioda_pe *ret = NULL;
+	int run = 0, pe, i;
 
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
+
+	/* scan backwards for a run of @count cleared bits */
 	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
-		if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
-			return pnv_ioda_init_pe(phb, pe);
+		if (test_bit(pe, phb->ioda.pe_alloc)) {
+			run = 0;
+			continue;
+		}
+
+		run++;
+		if (run == count)
+			break;
 	}
+	if (run != count)
+		goto out;
 
-	return NULL;
+	for (i = pe; i < pe + count; i++) {
+		set_bit(i, phb->ioda.pe_alloc);
+		pnv_ioda_init_pe(phb, i);
+	}
+	ret = &phb->ioda.pe_array[pe];
+
+out:
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
+	return ret;
 }
 
 void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
@@ -173,7 +195,10 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 	WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
 	kfree(pe->npucomp);
 	memset(pe, 0, sizeof(struct pnv_ioda_pe));
+
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
 	clear_bit(pe_num, phb->ioda.pe_alloc);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
 }
 
 /* The default M64 BAR is shared by all PEs */
@@ -976,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	if (pdn->pe_number != IODA_INVALID_PE)
 		return NULL;
 
-	pe = pnv_ioda_alloc_pe(phb);
+	pe = pnv_ioda_alloc_pe(phb, 1);
 	if (!pe) {
 		pr_warn("%s: Not enough PE# available, disabling device\n",
 			pci_name(dev));
@@ -1047,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 
 	/* The PE number isn't pinned by M64 */
 	if (!pe)
-		pe = pnv_ioda_alloc_pe(phb);
+		pe = pnv_ioda_alloc_pe(phb, 1);
 
 	if (!pe) {
 		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
@@ -3065,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
 	} else {
 		/* otherwise just allocate one */
-		root_pe = pnv_ioda_alloc_pe(phb);
+		root_pe = pnv_ioda_alloc_pe(phb, 1);
 		phb->ioda.root_pe_idx = root_pe->pe_number;
 	}
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 58c97e60c3db..b4c9bdba7217 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -223,7 +223,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
 void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
 void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
 
-struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb);
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
 void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
 
 #ifdef CONFIG_PCI_IOV
-- 
2.26.2


^ permalink raw reply related

* [PATCH 11/15] powerpc/powernv/sriov: Drop iov->pe_num_map[]
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Currently the iov->pe_num_map[] does one of two things depending on
whether single PE mode is being used or not. When it is, this contains an
array which maps a vf_index to the corresponding PE number. When single PE
mode is not being used this contains a scalar which is the base PE for the
set of enabled VFs (for for VFn is base + n).

The array was necessary because when calling pnv_ioda_alloc_pe() there is
no guarantee that the allocated PEs would be contigious. We can now
allocate contigious blocks of PEs so this is no longer an issue. This
allows us to drop the if (single_mode) {} .. else {} block scattered
through the SR-IOV code which is a nice clean up.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 109 +++++----------------
 arch/powerpc/platforms/powernv/pci.h       |   4 +-
 2 files changed, 25 insertions(+), 88 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index d53a85ccb538..08f88187d65a 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -456,11 +456,13 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 
 
 			if (iov->m64_single_mode) {
+				int pe_num = iov->vf_pe_arr[j].pe_number;
+
 				size = pci_iov_resource_size(pdev,
 							PCI_IOV_RESOURCES + i);
 				start = res->start + size * j;
 				rc = pnv_ioda_map_m64_single(phb, win,
-							     iov->pe_num_map[j],
+							     pe_num,
 							     start,
 							     size);
 			} else {
@@ -599,38 +601,24 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 
 static void pnv_pci_sriov_disable(struct pci_dev *pdev)
 {
+	u16                    num_vfs, base_pe;
 	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe;
 	struct pnv_iov_data   *iov;
-	u16                    num_vfs, i;
 
 	phb = pci_bus_to_pnvhb(pdev->bus);
 	iov = pnv_iov_get(pdev);
 	num_vfs = iov->num_vfs;
+	base_pe = iov->vf_pe_arr[0].pe_number;
 
 	/* Release VF PEs */
 	pnv_ioda_release_vf_PE(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
 		if (!iov->m64_single_mode)
-			pnv_pci_vf_resource_shift(pdev, -*iov->pe_num_map);
+			pnv_pci_vf_resource_shift(pdev, -base_pe);
 
 		/* Release M64 windows */
 		pnv_pci_vf_release_m64(pdev, num_vfs);
-
-		/* Release PE numbers */
-		if (iov->m64_single_mode) {
-			for (i = 0; i < num_vfs; i++) {
-				if (iov->pe_num_map[i] == IODA_INVALID_PE)
-					continue;
-
-				pe = &phb->ioda.pe_array[iov->pe_num_map[i]];
-				pnv_ioda_free_pe(pe);
-			}
-		} else
-			bitmap_clear(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
-		/* Releasing pe_num_map */
-		kfree(iov->pe_num_map);
 	}
 }
 
@@ -656,13 +644,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
 		struct pci_dn *vf_pdn;
 
-		if (iov->m64_single_mode)
-			pe_num = iov->pe_num_map[vf_index];
-		else
-			pe_num = *iov->pe_num_map + vf_index;
-
-		pe = &phb->ioda.pe_array[pe_num];
-		pe->pe_number = pe_num;
+		pe = &iov->vf_pe_arr[vf_index];
 		pe->phb = phb;
 		pe->flags = PNV_IODA_PE_VF;
 		pe->pbus = NULL;
@@ -670,6 +652,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		pe->mve_number = -1;
 		pe->rid = (vf_bus << 8) | vf_devfn;
 
+		pe_num = pe->pe_number;
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			pci_domain_nr(pdev->bus), pdev->bus->number,
 			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
@@ -701,9 +684,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
+	struct pnv_ioda_pe    *base_pe;
 	struct pnv_iov_data   *iov;
 	struct pnv_phb        *phb;
-	struct pnv_ioda_pe    *pe;
 	int                    ret;
 	u16                    i;
 
@@ -717,55 +700,14 @@ static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 			return -ENOSPC;
 		}
 
-		/*
-		 * When M64 BARs functions in Single PE mode, the number of VFs
-		 * could be enabled must be less than the number of M64 BARs.
-		 */
-		if (iov->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
-			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
+		/* allocate a contigious block of PEs for our VFs */
+		base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
+		if (!base_pe) {
+			pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
 			return -EBUSY;
 		}
 
-		/* Allocating pe_num_map */
-		if (iov->m64_single_mode)
-			iov->pe_num_map = kmalloc_array(num_vfs,
-							sizeof(*iov->pe_num_map),
-							GFP_KERNEL);
-		else
-			iov->pe_num_map = kmalloc(sizeof(*iov->pe_num_map), GFP_KERNEL);
-
-		if (!iov->pe_num_map)
-			return -ENOMEM;
-
-		if (iov->m64_single_mode)
-			for (i = 0; i < num_vfs; i++)
-				iov->pe_num_map[i] = IODA_INVALID_PE;
-
-		/* Calculate available PE for required VFs */
-		if (iov->m64_single_mode) {
-			for (i = 0; i < num_vfs; i++) {
-				pe = pnv_ioda_alloc_pe(phb);
-				if (!pe) {
-					ret = -EBUSY;
-					goto m64_failed;
-				}
-
-				iov->pe_num_map[i] = pe->pe_number;
-			}
-		} else {
-			mutex_lock(&phb->ioda.pe_alloc_mutex);
-			*iov->pe_num_map = bitmap_find_next_zero_area(
-				phb->ioda.pe_alloc, phb->ioda.total_pe_num,
-				0, num_vfs, 0);
-			if (*iov->pe_num_map >= phb->ioda.total_pe_num) {
-				mutex_unlock(&phb->ioda.pe_alloc_mutex);
-				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
-				kfree(iov->pe_num_map);
-				return -EBUSY;
-			}
-			bitmap_set(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
-			mutex_unlock(&phb->ioda.pe_alloc_mutex);
-		}
+		iov->vf_pe_arr = base_pe;
 		iov->num_vfs = num_vfs;
 
 		/* Assign M64 window accordingly */
@@ -781,9 +723,10 @@ static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		 * Otherwise, the PE# for the VF will conflict with others.
 		 */
 		if (!iov->m64_single_mode) {
-			ret = pnv_pci_vf_resource_shift(pdev, *iov->pe_num_map);
+			ret = pnv_pci_vf_resource_shift(pdev,
+							base_pe->pe_number);
 			if (ret)
-				goto m64_failed;
+				goto shift_failed;
 		}
 	}
 
@@ -792,20 +735,12 @@ static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 
 	return 0;
 
-m64_failed:
-	if (iov->m64_single_mode) {
-		for (i = 0; i < num_vfs; i++) {
-			if (iov->pe_num_map[i] == IODA_INVALID_PE)
-				continue;
-
-			pe = &phb->ioda.pe_array[iov->pe_num_map[i]];
-			pnv_ioda_free_pe(pe);
-		}
-	} else
-		bitmap_clear(phb->ioda.pe_alloc, *iov->pe_num_map, num_vfs);
+shift_failed:
+	pnv_pci_vf_release_m64(pdev, num_vfs);
 
-	/* Releasing pe_num_map */
-	kfree(iov->pe_num_map);
+m64_failed:
+	for (i = 0; i < num_vfs; i++)
+		pnv_ioda_free_pe(&iov->vf_pe_arr[i]);
 
 	return ret;
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index b4c9bdba7217..13555bc549f4 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -238,7 +238,9 @@ struct pnv_iov_data {
 
 	/* number of VFs enabled */
 	u16     num_vfs;
-	unsigned int *pe_num_map;	/* PE# for the first VF PE or array */
+
+	/* pointer to the array of VF PEs. num_vfs long*/
+	struct pnv_ioda_pe *vf_pe_arr;
 
 	/* Did we map the VF BARs with single-PE IODA BARs? */
 	bool    m64_single_mode;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 12/15] powerpc/powernv/sriov: De-indent setup and teardown
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Remove the IODA2 PHB checks. We already assume IODA2 in several places so
there's not much point in wrapping most of the setup and teardown process
in an if block.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 86 ++++++++++++----------
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 08f88187d65a..d5699cd2ab7a 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -610,16 +610,18 @@ static void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	num_vfs = iov->num_vfs;
 	base_pe = iov->vf_pe_arr[0].pe_number;
 
+	if (WARN_ON(!iov))
+		return;
+
 	/* Release VF PEs */
 	pnv_ioda_release_vf_PE(pdev);
 
-	if (phb->type == PNV_PHB_IODA2) {
-		if (!iov->m64_single_mode)
-			pnv_pci_vf_resource_shift(pdev, -base_pe);
+	/* Un-shift the IOV BAR resources */
+	if (!iov->m64_single_mode)
+		pnv_pci_vf_resource_shift(pdev, -base_pe);
 
-		/* Release M64 windows */
-		pnv_pci_vf_release_m64(pdev, num_vfs);
-	}
+	/* Release M64 windows */
+	pnv_pci_vf_release_m64(pdev, num_vfs);
 }
 
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -693,41 +695,51 @@ static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	phb = pci_bus_to_pnvhb(pdev->bus);
 	iov = pnv_iov_get(pdev);
 
-	if (phb->type == PNV_PHB_IODA2) {
-		if (!iov->vfs_expanded) {
-			dev_info(&pdev->dev, "don't support this SRIOV device"
-				" with non 64bit-prefetchable IOV BAR\n");
-			return -ENOSPC;
-		}
+	/*
+	 * There's a calls to IODA2 PE setup code littered throughout. We could
+	 * probably fix that, but we'd still have problems due to the
+	 * restriction inherent on IODA1 PHBs.
+	 *
+	 * NB: We class IODA3 as IODA2 since they're very similar.
+	 */
+	if (phb->type != PNV_PHB_IODA2) {
+		pci_err(pdev, "SR-IOV is not supported on this PHB\n");
+		return -ENXIO;
+	}
 
-		/* allocate a contigious block of PEs for our VFs */
-		base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
-		if (!base_pe) {
-			pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
-			return -EBUSY;
-		}
+	if (!iov->vfs_expanded) {
+		dev_info(&pdev->dev, "don't support this SRIOV device"
+			" with non 64bit-prefetchable IOV BAR\n");
+		return -ENOSPC;
+	}
 
-		iov->vf_pe_arr = base_pe;
-		iov->num_vfs = num_vfs;
+	/* allocate a contigious block of PEs for our VFs */
+	base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
+	if (!base_pe) {
+		pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
+		return -EBUSY;
+	}
 
-		/* Assign M64 window accordingly */
-		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
-		if (ret) {
-			dev_info(&pdev->dev, "Not enough M64 window resources\n");
-			goto m64_failed;
-		}
+	iov->vf_pe_arr = base_pe;
+	iov->num_vfs = num_vfs;
 
-		/*
-		 * When using one M64 BAR to map one IOV BAR, we need to shift
-		 * the IOV BAR according to the PE# allocated to the VFs.
-		 * Otherwise, the PE# for the VF will conflict with others.
-		 */
-		if (!iov->m64_single_mode) {
-			ret = pnv_pci_vf_resource_shift(pdev,
-							base_pe->pe_number);
-			if (ret)
-				goto shift_failed;
-		}
+	/* Assign M64 window accordingly */
+	ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+	if (ret) {
+		dev_info(&pdev->dev, "Not enough M64 window resources\n");
+		goto m64_failed;
+	}
+
+	/*
+	 * When using one M64 BAR to map one IOV BAR, we need to shift
+	 * the IOV BAR according to the PE# allocated to the VFs.
+	 * Otherwise, the PE# for the VF will conflict with others.
+	 */
+	if (!iov->m64_single_mode) {
+		ret = pnv_pci_vf_resource_shift(pdev,
+						base_pe->pe_number);
+		if (ret)
+			goto shift_failed;
 	}
 
 	/* Setup VF PEs */
-- 
2.26.2


^ permalink raw reply related

* [PATCH 13/15] powerpc/powernv/sriov: Move M64 BAR allocation into a helper
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

I want to refactor the loop this code is currently inside of. Hoist it on
out.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 31 ++++++++++++++--------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index d5699cd2ab7a..2f967aa4fbf5 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -416,6 +416,23 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
 	return rc;
 }
 
+static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
+{
+	int win;
+
+	do {
+		win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+				phb->ioda.m64_bar_idx + 1, 0);
+
+		if (win >= phb->ioda.m64_bar_idx + 1)
+			return -1;
+	} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+	set_bit(win, iov->used_m64_bar_mask);
+
+	return win;
+}
+
 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pnv_iov_data   *iov;
@@ -443,17 +460,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			continue;
 
 		for (j = 0; j < m64_bars; j++) {
-
-			/* allocate a window ID for this BAR */
-			do {
-				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-						phb->ioda.m64_bar_idx + 1, 0);
-
-				if (win >= phb->ioda.m64_bar_idx + 1)
-					goto m64_failed;
-			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
-			set_bit(win, iov->used_m64_bar_mask);
-
+			win = pnv_pci_alloc_m64_bar(phb, iov);
+			if (win < 0)
+				goto m64_failed;
 
 			if (iov->m64_single_mode) {
 				int pe_num = iov->vf_pe_arr[j].pe_number;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 14/15] powerpc/powernv/sriov: Refactor M64 BAR setup
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Split up the logic so that we have one branch that handles setting up a
segmented window and another that handles setting up single PE windows for
each VF.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
This patch could be folded into the previous one. I've kept it
seperate mainly because the diff is *horrific* when they're merged.
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 57 ++++++++++------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 2f967aa4fbf5..8de03636888a 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -441,52 +441,49 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	struct resource       *res;
 	int                    i, j;
 	int64_t                rc;
-	int                    total_vfs;
 	resource_size_t        size, start;
-	int                    m64_bars;
+	int                    base_pe_num;
 
 	phb = pci_bus_to_pnvhb(pdev->bus);
 	iov = pnv_iov_get(pdev);
-	total_vfs = pci_sriov_get_totalvfs(pdev);
-
-	if (iov->m64_single_mode)
-		m64_bars = num_vfs;
-	else
-		m64_bars = 1;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
 
-		for (j = 0; j < m64_bars; j++) {
+		/* don't need single mode? map everything in one go! */
+		if (!iov->m64_single_mode) {
 			win = pnv_pci_alloc_m64_bar(phb, iov);
 			if (win < 0)
 				goto m64_failed;
 
-			if (iov->m64_single_mode) {
-				int pe_num = iov->vf_pe_arr[j].pe_number;
-
-				size = pci_iov_resource_size(pdev,
-							PCI_IOV_RESOURCES + i);
-				start = res->start + size * j;
-				rc = pnv_ioda_map_m64_single(phb, win,
-							     pe_num,
-							     start,
-							     size);
-			} else {
-				size = resource_size(res);
-				start = res->start;
-
-				rc = pnv_ioda_map_m64_accordion(phb, win, start,
-								size);
-			}
+			size = resource_size(res);
+			start = res->start;
 
-			if (rc != OPAL_SUCCESS) {
-				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
-					win, rc);
+			rc = pnv_ioda_map_m64_accordion(phb, win, start, size);
+			if (rc)
+				goto m64_failed;
+
+			continue;
+		}
+
+		/* otherwise map each VF with single PE BARs */
+		size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
+		base_pe_num = iov->vf_pe_arr[0].pe_number;
+
+		for (j = 0; j < num_vfs; j++) {
+			win = pnv_pci_alloc_m64_bar(phb, iov);
+			if (win < 0)
+				goto m64_failed;
+
+			start = res->start + size * j;
+			rc = pnv_ioda_map_m64_single(phb, win,
+						     base_pe_num + j,
+						     start,
+						     size);
+			if (rc)
 				goto m64_failed;
-			}
 		}
 	}
 	return 0;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 15/15] powerpc/powernv/sriov: Make single PE mode a per-BAR setting
From: Oliver O'Halloran @ 2020-07-10  5:23 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

Using single PE BARs to map an SR-IOV BAR is really a choice about what
strategy to use when mapping a BAR. It doesn't make much sense for this to
be a global setting since a device might have one large BAR which needs to
be mapped with single PE windows and another smaller BAR that can be mapped
with a regular segmented window. Make the segmented vs single decision a
per-BAR setting and clean up the logic that decides which mode to use.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 131 +++++++++++----------
 arch/powerpc/platforms/powernv/pci.h       |  10 +-
 2 files changed, 75 insertions(+), 66 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 8de03636888a..87377d95d648 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -146,10 +146,9 @@
 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 {
 	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
 	struct resource *res;
 	int i;
-	resource_size_t size, total_vf_bar_sz;
+	resource_size_t vf_bar_sz;
 	struct pnv_iov_data *iov;
 	int mul, total_vfs;
 
@@ -158,9 +157,9 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		goto disable_iov;
 	pdev->dev.archdata.iov_data = iov;
 
+	/* FIXME: totalvfs > phb->ioda.total_pe_num is going to be a problem */
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 	mul = phb->ioda.total_pe_num;
-	total_vf_bar_sz = 0;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
@@ -173,50 +172,51 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 			goto disable_iov;
 		}
 
-		total_vf_bar_sz += pci_iov_resource_size(pdev,
-				i + PCI_IOV_RESOURCES);
+		vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 
 		/*
-		 * If bigger than quarter of M64 segment size, just round up
-		 * power of two.
+		 * Generally, one segmented M64 BAR maps one IOV BAR. However,
+		 * if a VF BAR is too large we end up wasting a lot of space.
+		 * If we've got a BAR that's bigger than greater than 1/4 of the
+		 * default window's segment size then switch to using single PE
+		 * windows. This limits the total number of VFs we can support.
 		 *
-		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
-		 * with other devices, IOV BAR size is expanded to be
-		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
-		 * segment size , the expanded size would equal to half of the
-		 * whole M64 space size, which will exhaust the M64 Space and
-		 * limit the system flexibility.  This is a design decision to
-		 * set the boundary to quarter of the M64 segment size.
+		 * The 1/4 limit is arbitrary and can be tweaked.
 		 */
-		if (total_vf_bar_sz > gate) {
-			mul = roundup_pow_of_two(total_vfs);
-			dev_info(&pdev->dev,
-				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
-				total_vf_bar_sz, gate, mul);
-			iov->m64_single_mode = true;
-			break;
-		}
-	}
+		if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) {
+			/*
+			 * On PHB3, the minimum size alignment of M64 BAR in
+			 * single mode is 32MB. If this VF BAR is smaller than
+			 * 32MB, but still too large for a segmented window
+			 * then we can't map it and need to disable SR-IOV for
+			 * this device.
+			 */
+			if (vf_bar_sz < SZ_32M) {
+				pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n",
+					i, res);
+				goto disable_iov;
+			}
 
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &pdev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || res->parent)
+			iov->m64_single_mode[i] = true;
 			continue;
+		}
+
 
-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 		/*
-		 * On PHB3, the minimum size alignment of M64 BAR in single
-		 * mode is 32MB.
+		 * This BAR can be mapped with one segmented window, so adjust
+		 * te resource size to accommodate.
 		 */
-		if (iov->m64_single_mode && (size < SZ_32M))
-			goto disable_iov;
+		pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res);
+		res->end = res->start + vf_bar_sz * mul - 1;
+		pci_dbg(pdev, "                       %pR\n", res);
 
-		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
-		res->end = res->start + size * mul - 1;
-		dev_dbg(&pdev->dev, "                       %pR\n", res);
-		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+		pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
 			 i, res, mul);
+
+		iov->need_shift = true;
 	}
+
+	// what should this be?
 	iov->vfs_expanded = mul;
 
 	return;
@@ -260,42 +260,42 @@ void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
 resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 						      int resno)
 {
-	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
 	struct pnv_iov_data *iov = pnv_iov_get(pdev);
 	resource_size_t align;
 
-	/*
-	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
-	 * SR-IOV. While from hardware perspective, the range mapped by M64
-	 * BAR should be size aligned.
-	 *
-	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
-	 * powernv-specific hardware restriction is gone. But if just use the
-	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
-	 * in one segment of M64 #15, which introduces the PE conflict between
-	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
-	 * m64_segsize.
-	 *
-	 * This function returns the total IOV BAR size if M64 BAR is in
-	 * Shared PE mode or just VF BAR size if not.
-	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
-	 * M64 segment size if IOV BAR size is less.
-	 */
-	align = pci_iov_resource_size(pdev, resno);
+	int bar_no = resno - PCI_IOV_RESOURCES;
 
 	/*
 	 * iov can be null if we have an SR-IOV device with IOV BAR that can't
 	 * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
-	 * In that case we don't allow VFs to be enabled so just return the
-	 * default alignment.
+	 * In that case we don't allow VFs to be enabled since one of their
+	 * BARs would not be placed in the correct PE.
 	 */
 	if (!iov)
 		return align;
 	if (!iov->vfs_expanded)
 		return align;
-	if (iov->m64_single_mode)
-		return max(align, (resource_size_t)phb->ioda.m64_segsize);
 
+	align = pci_iov_resource_size(pdev, resno);
+
+	/*
+	 * If we're using single mode then we can just use the native VF BAR
+	 * alignment. We validated that it's possible to use a single PE
+	 * window above when we did the fixup.
+	 */
+	if (iov->m64_single_mode[bar_no])
+		return align;
+
+	/*
+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+	 * SR-IOV. While from hardware perspective, the range mapped by M64
+	 * BAR should be size aligned.
+	 *
+	 * This function returns the total IOV BAR size if M64 BAR is in
+	 * Shared PE mode or just VF BAR size if not.
+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
+	 * M64 segment size if IOV BAR size is less.
+	 */
 	return iov->vfs_expanded * align;
 }
 
@@ -453,7 +453,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			continue;
 
 		/* don't need single mode? map everything in one go! */
-		if (!iov->m64_single_mode) {
+		if (!iov->m64_single_mode[i]) {
 			win = pnv_pci_alloc_m64_bar(phb, iov);
 			if (win < 0)
 				goto m64_failed;
@@ -546,6 +546,8 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		res = &dev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
+		if (iov->m64_single_mode[i])
+			continue;
 
 		/*
 		 * The actual IOV BAR range is determined by the start address
@@ -577,6 +579,8 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		res = &dev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
+		if (iov->m64_single_mode[i])
+			continue;
 
 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
 		res2 = *res;
@@ -622,8 +626,8 @@ static void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	/* Release VF PEs */
 	pnv_ioda_release_vf_PE(pdev);
 
-	/* Un-shift the IOV BAR resources */
-	if (!iov->m64_single_mode)
+	/* Un-shift the IOV BARs if we need to */
+	if (iov->need_shift)
 		pnv_pci_vf_resource_shift(pdev, -base_pe);
 
 	/* Release M64 windows */
@@ -741,9 +745,8 @@ static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	 * the IOV BAR according to the PE# allocated to the VFs.
 	 * Otherwise, the PE# for the VF will conflict with others.
 	 */
-	if (!iov->m64_single_mode) {
-		ret = pnv_pci_vf_resource_shift(pdev,
-						base_pe->pe_number);
+	if (iov->need_shift) {
+		ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number);
 		if (ret)
 			goto shift_failed;
 	}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 13555bc549f4..a78d1feb8fb8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -236,14 +236,20 @@ struct pnv_iov_data {
 	/* number of VFs IOV BAR expanded. FIXME: rename this to something less bad */
 	u16     vfs_expanded;
 
+	/*
+	 * indicates if we need to move our IOV BAR to account for our
+	 * allocated PE number when enabling VFs.
+	 */
+	bool    need_shift;
+
 	/* number of VFs enabled */
 	u16     num_vfs;
 
 	/* pointer to the array of VF PEs. num_vfs long*/
 	struct pnv_ioda_pe *vf_pe_arr;
 
-	/* Did we map the VF BARs with single-PE IODA BARs? */
-	bool    m64_single_mode;
+	/* Did we map the VF BAR with single-PE IODA BARs? */
+	bool    m64_single_mode[PCI_SRIOV_NUM_BARS];
 
 	/*
 	 * Bit mask used to track which m64 windows that we used to map the
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH v5 2/2] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask
From: Madhavan Srinivasan @ 2020-07-10  6:15 UTC (permalink / raw)
  To: Kajol Jain, linuxppc-dev, mpe; +Cc: nathanl, ego, suka, maddy, anju
In-Reply-To: <20200709051836.723765-3-kjain@linux.ibm.com>



On 7/9/20 10:48 AM, Kajol Jain wrote:
> Patch here adds a cpumask attr to hv_24x7 pmu along with ABI documentation.
>
> Primary use to expose the cpumask is for the perf tool which has the
> capability to parse the driver sysfs folder and understand the
> cpumask file. Having cpumask file will reduce the number of perf command
> line parameters (will avoid "-C" option in the perf tool
> command line). It can also notify the user which is
> the current cpu used to retrieve the counter data.
>
> command:# cat /sys/devices/hv_24x7/interface/cpumask
> 0


Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>

> Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
> ---
>   .../ABI/testing/sysfs-bus-event_source-devices-hv_24x7    | 7 +++++++
>   arch/powerpc/perf/hv-24x7.c                               | 8 ++++++++
>   2 files changed, 15 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
> index e8698afcd952..f7e32f218f73 100644
> --- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
> +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
> @@ -43,6 +43,13 @@ Description:	read only
>   		This sysfs interface exposes the number of cores per chip
>   		present in the system.
>
> +What:		/sys/devices/hv_24x7/interface/cpumask
> +Date:		July 2020
> +Contact:	Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
> +Description:	read only
> +		This sysfs file exposes the cpumask which is designated to make
> +		HCALLs to retrieve hv-24x7 pmu event counter data.
> +
>   What:		/sys/bus/event_source/devices/hv_24x7/event_descs/<event-name>
>   Date:		February 2014
>   Contact:	Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
> diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
> index 93b4700dcf8c..acc34148ad09 100644
> --- a/arch/powerpc/perf/hv-24x7.c
> +++ b/arch/powerpc/perf/hv-24x7.c
> @@ -448,6 +448,12 @@ static ssize_t device_show_string(struct device *dev,
>   	return sprintf(buf, "%s\n", (char *)d->var);
>   }
>
> +static ssize_t cpumask_show(struct device *dev,
> +			    struct device_attribute *attr, char *buf)
> +{
> +	return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
> +}
> +
>   static ssize_t sockets_show(struct device *dev,
>   			    struct device_attribute *attr, char *buf)
>   {
> @@ -1115,6 +1121,7 @@ static DEVICE_ATTR_RO(domains);
>   static DEVICE_ATTR_RO(sockets);
>   static DEVICE_ATTR_RO(chipspersocket);
>   static DEVICE_ATTR_RO(coresperchip);
> +static DEVICE_ATTR_RO(cpumask);
>
>   static struct bin_attribute *if_bin_attrs[] = {
>   	&bin_attr_catalog,
> @@ -1128,6 +1135,7 @@ static struct attribute *if_attrs[] = {
>   	&dev_attr_sockets.attr,
>   	&dev_attr_chipspersocket.attr,
>   	&dev_attr_coresperchip.attr,
> +	&dev_attr_cpumask.attr,
>   	NULL,
>   };
>


^ permalink raw reply

* Re: [PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.
From: Madhavan Srinivasan @ 2020-07-10  6:25 UTC (permalink / raw)
  To: Anju T Sudhakar, mpe; +Cc: maddy, linuxppc-dev
In-Reply-To: <20200703063626.1412544-1-anju@linux.vnet.ibm.com>



On 7/3/20 12:06 PM, Anju T Sudhakar wrote:
> IMC trace-mode record has MSR[HV PR] bits added in the third DW.
> These bits can be used to set the cpumode for the instruction pointer
> captured in each sample.
>                                                                             
> Add support in kernel to use these bits to set the cpumode for
> each sample.
>                   

Changes looks fine to me.
Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>

>                                                            
> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/imc-pmu.h |  5 +++++
>   arch/powerpc/perf/imc-pmu.c        | 29 ++++++++++++++++++++++++-----
>   2 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
> index 4da4fcba0684..4f897993b710 100644
> --- a/arch/powerpc/include/asm/imc-pmu.h
> +++ b/arch/powerpc/include/asm/imc-pmu.h
> @@ -99,6 +99,11 @@ struct trace_imc_data {
>    */
>   #define IMC_TRACE_RECORD_TB1_MASK      0x3ffffffffffULL
>
> +/*
> + * Bit 0:1 in third DW of IMC trace record
> + * specifies the MSR[HV PR] values.
> + */
> +#define IMC_TRACE_RECORD_VAL_HVPR(x)	((x) >> 62)
>
>   /*
>    * Device tree parser code detects IMC pmu support and
> diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
> index cb50a9e1fd2d..310922fed9eb 100644
> --- a/arch/powerpc/perf/imc-pmu.c
> +++ b/arch/powerpc/perf/imc-pmu.c
> @@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem,
>   	header->size = sizeof(*header) + event->header_size;
>   	header->misc = 0;
>
> -	if (is_kernel_addr(data->ip))
> -		header->misc |= PERF_RECORD_MISC_KERNEL;
> -	else
> -		header->misc |= PERF_RECORD_MISC_USER;
> -
> +	if (cpu_has_feature(CPU_FTRS_POWER9)) {
> +		if (is_kernel_addr(data->ip))
> +			header->misc |= PERF_RECORD_MISC_KERNEL;
> +		else
> +			header->misc |= PERF_RECORD_MISC_USER;
> +	} else {
> +		switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
> +		case 0:/* when MSR HV and PR not set in the trace-record */
> +			header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
> +			break;
> +		case 1: /* MSR HV is 0 and PR is 1 */
> +			header->misc |= PERF_RECORD_MISC_GUEST_USER;
> +			break;
> +		case 2: /* MSR Hv is 1 and PR is 0 */
> +			header->misc |= PERF_RECORD_MISC_HYPERVISOR;
> +			break;
> +		case 3: /* MSR HV is 1 and PR is 1 */
> +			header->misc |= PERF_RECORD_MISC_USER;
> +			break;
> +		default:
> +			pr_info("IMC: Unable to set the flag based on MSR bits\n");
> +			break;
> +		}
> +	}
>   	perf_event_header__init_id(header, data, event);
>
>   	return 0;


^ permalink raw reply

* Re: PowerNV PCI & SR-IOV cleanups
From: Christoph Hellwig @ 2020-07-10  6:45 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linux-pci, linuxppc-dev
In-Reply-To: <20200710052340.737567-1-oohall@gmail.com>

On Fri, Jul 10, 2020 at 03:23:25PM +1000, Oliver O'Halloran wrote:
> This is largely prep work for supporting VFs in the 32bit MMIO window.
> This is an unfortunate necessity due to how the Linux BAR allocator
> handles BARs marked as non-prefetchable. The distinction
> between prefetch and non-prefetchable BARs was made largely irrelevant
> with the introduction of PCIe, but the BAR allocator is overly
> conservative. It will always place non-pref bars in the prefetchable
> window, which is 32bit only. This results in us being unable to use VFs
> from NVMe drives and a few different RAID cards.

How about fixing that in the core PCI code?

(nothing against this series through, as it seems like a massive
cleanup)

^ permalink raw reply

* Re: [mm/debug_vm_pgtable] a97a171093: BUG:unable_to_handle_page_fault_for_address
From: Anshuman Khandual @ 2020-07-10  6:47 UTC (permalink / raw)
  To: kernel test robot
  Cc: Heiko Carstens, linux-mm, Paul Mackerras, H. Peter Anvin,
	linux-riscv, Will Deacon, linux-arch, linux-s390, x86,
	Mike Rapoport, Christian Borntraeger, Ingo Molnar,
	Catalin Marinas, linux-snps-arc, Vasily Gorbik, lkp,
	Borislav Petkov, Paul Walmsley, Kirill A . Shutemov,
	Thomas Gleixner, linux-arm-kernel, Vineet Gupta, linux-kernel,
	Palmer Dabbelt, Andrew Morton, linuxppc-dev
In-Reply-To: <20200709061122.GN3874@shao2-debian>



On 07/09/2020 11:41 AM, kernel test robot wrote:
> [   94.349598] BUG: unable to handle page fault for address: ffffed10a7ffddff
> [   94.351039] #PF: supervisor read access in kernel mode
> [   94.352172] #PF: error_code(0x0000) - not-present page
> [   94.353256] PGD 43ffed067 P4D 43ffed067 PUD 43fdee067 PMD 0 
> [   94.354484] Oops: 0000 [#1] SMP KASAN
> [   94.355238] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc4-00002-ga97a17109332c #1
> [   94.360456] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
> [   94.361950] RIP: 0010:hugetlb_advanced_tests+0x137/0x699
> [   94.363026] Code: 8b 13 4d 85 f6 75 0b 48 ff 05 2c e4 6a 01 31 ed eb 41 bf f8 ff ff ff ba ff ff 37 00 4c 01 f7 48 c1 e2 2a 48 89 f9 48 c1 e9 03 <80> 3c 11 00 74 05 e8 cd c0 67 fa ba f8 ff ff ff 49 8b 2c 16 48 85
> [   94.366592] RSP: 0000:ffffc90000047d30 EFLAGS: 00010a06
> [   94.367693] RAX: 1ffffffff1049b80 RBX: ffff888380525308 RCX: 1ffff110a7ffddff
> [   94.369215] RDX: dffffc0000000000 RSI: 1ffff11087ffdc00 RDI: ffff88853ffeeff8
> [   94.370693] RBP: 000000000018e510 R08: 0000000000000025 R09: 0000000000000001
> [   94.372165] R10: ffff888380523c07 R11: ffffed10700a4780 R12: ffff88843208e510
> [   94.373674] R13: 0000000000000025 R14: ffff88843ffef000 R15: 000031e01ae61000
> [   94.375147] FS:  0000000000000000(0000) GS:ffff8883a3800000(0000) knlGS:0000000000000000
> [   94.376883] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   94.378051] CR2: ffffed10a7ffddff CR3: 0000000004e15000 CR4: 00000000000406a0
> [   94.379522] Call Trace:
> [   94.380073]  debug_vm_pgtable+0xd81/0x2029
> [   94.380871]  ? pmd_advanced_tests+0x621/0x621
> [   94.381819]  do_one_initcall+0x1eb/0xbd0
> [   94.382551]  ? trace_event_raw_event_initcall_finish+0x240/0x240
> [   94.383634]  ? rcu_read_lock_sched_held+0xb9/0x110
> [   94.388727]  ? rcu_read_lock_held+0xd0/0xd0
> [   94.389604]  ? __kasan_check_read+0x1d/0x30
> [   94.390485]  kernel_init_freeable+0x430/0x4f8
> [   94.391416]  ? rest_init+0x3f8/0x3f8
> [   94.392185]  kernel_init+0x14/0x1e8
> [   94.392918]  ret_from_fork+0x22/0x30
> [   94.393662] Modules linked in:
> [   94.394289] CR2: ffffed10a7ffddff
> [   94.395000] ---[ end trace 8ca5a1655dfb8c39 ]---

This bug is caused from here.

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
#endif
        if (!mem_section[SECTION_NR_TO_ROOT(nr)]) <-------- BUG
                return NULL;
        return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        return __nr_to_section(pfn_to_section_nr(pfn));
}

#define __pfn_to_page(pfn)                              \
({      unsigned long __pfn = (pfn);                    \
        struct mem_section *__sec = __pfn_to_section(__pfn);    \
        __section_mem_map_addr(__sec) + __pfn;          \
})

which is called via hugetlb_advanced_tests().

paddr = (__pfn_to_phys(pfn) | RANDOM_ORVALUE) & PMD_MASK;
pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));

Primary reason being RANDOM_ORVALUE, which is added to the paddr before
being masked with PMD_MASK. This clobbers up the pfn value which cannot
be searched in relevant memory sections. This problem stays hidden on
other configs where pfn_to_page() does not go via memory section search.
Dropping off RANDOM_ORVALUE solves the problem. Probably, just wanted to
drop that off during V2 series (https://lkml.org/lkml/2020/4/8/997) but
dont remember why ended up keeping it again.

^ permalink raw reply

* Re: [PATCH v6 0/5] clean up redundant 'kvm_run' parameters
From: Tianjia Zhang @ 2020-07-10  7:32 UTC (permalink / raw)
  To: pbonzini, tsbogend, paulus, mpe, benh, borntraeger, frankja,
	david, cohuck, heiko.carstens, gor, sean.j.christopherson,
	vkuznets, wanpengli, jmattson, joro, tglx, mingo, bp, x86, hpa,
	maz, james.morse, julien.thierry.kdev, suzuki.poulose,
	christoffer.dall, peterx, thuth, chenhuacai
  Cc: linux-s390, kvm, linux-mips, kvm-ppc, linux-kernel, linuxppc-dev,
	kvmarm, linux-arm-kernel
In-Reply-To: <20200623131418.31473-1-tianjia.zhang@linux.alibaba.com>

Hi Paolo,

Any opinion on this series patches? Can I help with this patchset ?

Thanks and best,
Tianjia

On 2020/6/23 21:14, Tianjia Zhang wrote:
> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
> structure. For historical reasons, many kvm-related function parameters
> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
> patch does a unified cleanup of these remaining redundant parameters.
> 
> This series of patches has completely cleaned the architecture of
> arm64, mips, ppc, and s390 (no such redundant code on x86). Due to
> the large number of modified codes, a separate patch is made for each
> platform. On the ppc platform, there is also a redundant structure
> pointer of 'kvm_run' in 'vcpu_arch', which has also been cleaned
> separately.
> 
> ---
> v6 changes:
>    Rearrange patch sets, only keep the unmerged patch.
>    rebase on mainline.
> 
> v5 change:
>    ppc: fix for review.
> 
> v4 change:
>    mips: fixes two errors in entry.c.
> 
> v3 change:
>    Keep the existing `vcpu->run` in the function body unchanged.
> 
> v2 change:
>    s390 retains the original variable name and minimizes modification.
> 
> Tianjia Zhang (5):
>    KVM: s390: clean up redundant 'kvm_run' parameters
>    KVM: arm64: clean up redundant 'kvm_run' parameters
>    KVM: PPC: clean up redundant kvm_run parameters in assembly
>    KVM: MIPS: clean up redundant 'kvm_run' parameters
>    KVM: MIPS: clean up redundant kvm_run parameters in assembly
> 
>   arch/arm64/include/asm/kvm_coproc.h   |  12 +--
>   arch/arm64/include/asm/kvm_host.h     |  11 +--
>   arch/arm64/include/asm/kvm_mmu.h      |   2 +-
>   arch/arm64/kvm/arm.c                  |   6 +-
>   arch/arm64/kvm/handle_exit.c          |  36 ++++----
>   arch/arm64/kvm/mmio.c                 |  11 +--
>   arch/arm64/kvm/mmu.c                  |   5 +-
>   arch/arm64/kvm/sys_regs.c             |  13 ++-
>   arch/mips/include/asm/kvm_host.h      |  32 ++------
>   arch/mips/kvm/emulate.c               |  59 +++++--------
>   arch/mips/kvm/entry.c                 |  21 ++---
>   arch/mips/kvm/mips.c                  |  14 ++--
>   arch/mips/kvm/trap_emul.c             | 114 +++++++++++---------------
>   arch/mips/kvm/vz.c                    |  26 +++---
>   arch/powerpc/include/asm/kvm_ppc.h    |   2 +-
>   arch/powerpc/kvm/book3s_interrupts.S  |  22 +++--
>   arch/powerpc/kvm/book3s_pr.c          |   9 +-
>   arch/powerpc/kvm/booke.c              |   9 +-
>   arch/powerpc/kvm/booke_interrupts.S   |   9 +-
>   arch/powerpc/kvm/bookehv_interrupts.S |  10 +--
>   arch/s390/kvm/kvm-s390.c              |  23 ++++--
>   21 files changed, 188 insertions(+), 258 deletions(-)
> 

^ permalink raw reply

* Re: [PATCH v6 1/5] KVM: s390: clean up redundant 'kvm_run' parameters
From: Paolo Bonzini @ 2020-07-10  7:48 UTC (permalink / raw)
  To: Christian Borntraeger, Tianjia Zhang, tsbogend, paulus, mpe, benh,
	frankja, david, cohuck, heiko.carstens, gor,
	sean.j.christopherson, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, x86, hpa, maz, james.morse, julien.thierry.kdev,
	suzuki.poulose, christoffer.dall, peterx, thuth, chenhuacai
  Cc: linux-s390, kvm, linux-mips, kvm-ppc, linux-kernel, linuxppc-dev,
	kvmarm, linux-arm-kernel
In-Reply-To: <c49f8814-c7ea-6884-91c5-3dcd40c6509f@de.ibm.com>

On 23/06/20 17:31, Christian Borntraeger wrote:
> 
> I have trouble seeing value in this particular patch. We add LOCs
> without providing any noticable benefit. All other patches in this series at
> least reduce the amount of code. So I would defer this to Paolo if he prefers
> to have this way across all architectures. 

Yes, it adds lines of code but they're just

+	struct kvm_run *kvm_run = vcpu->run;

You could avoid the LoC increase by repeating vcpu->run over and over,
but I think the code overall is clearer.

Paolo


^ permalink raw reply

* Re: [PATCH v2 02/12] powerpc/kexec_file: mark PPC64 specific code
From: Laurent Dufour @ 2020-07-10  7:52 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <159371966340.21555.11937124937064648539.stgit@hbathini.in.ibm.com>

Le 02/07/2020 à 21:54, Hari Bathini a écrit :
> Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
> specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
> rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
> same spirit.

FWIW and despite a minor comment below,

Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>

> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> ---
> 
> Changes in v2:
> * No changes.
> 
> 
>   arch/powerpc/include/asm/kexec.h       |   11 +++
>   arch/powerpc/kexec/Makefile            |    2 -
>   arch/powerpc/kexec/elf_64.c            |    7 +-
>   arch/powerpc/kexec/file_load.c         |   37 ++--------
>   arch/powerpc/kexec/file_load_64.c      |  108 ++++++++++++++++++++++++++++++
>   arch/powerpc/purgatory/Makefile        |    4 +
>   arch/powerpc/purgatory/trampoline.S    |  117 --------------------------------
>   arch/powerpc/purgatory/trampoline_64.S |  117 ++++++++++++++++++++++++++++++++
>   8 files changed, 248 insertions(+), 155 deletions(-)
>   create mode 100644 arch/powerpc/kexec/file_load_64.c
>   delete mode 100644 arch/powerpc/purgatory/trampoline.S
>   create mode 100644 arch/powerpc/purgatory/trampoline_64.S
> 
> diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
> index c684768..7008ea1 100644
> --- a/arch/powerpc/include/asm/kexec.h
> +++ b/arch/powerpc/include/asm/kexec.h
> @@ -114,8 +114,17 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
>   		    unsigned long fdt_load_addr);
>   int setup_new_fdt(const struct kimage *image, void *fdt,
>   		  unsigned long initrd_load_addr, unsigned long initrd_len,
> -		  const char *cmdline);
> +		  const char *cmdline, int *node);
>   int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
> +
> +#ifdef CONFIG_PPC64
> +int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
> +			  const void *fdt, unsigned long kernel_load_addr,
> +			  unsigned long fdt_load_addr);
> +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
> +			unsigned long initrd_load_addr,
> +			unsigned long initrd_len, const char *cmdline);
> +#endif /* CONFIG_PPC64 */
>   #endif /* CONFIG_KEXEC_FILE */
>   
>   #else /* !CONFIG_KEXEC_CORE */
> diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
> index 86380c6..67c3553 100644
> --- a/arch/powerpc/kexec/Makefile
> +++ b/arch/powerpc/kexec/Makefile
> @@ -7,7 +7,7 @@ obj-y				+= core.o crash.o core_$(BITS).o
>   
>   obj-$(CONFIG_PPC32)		+= relocate_32.o
>   
> -obj-$(CONFIG_KEXEC_FILE)	+= file_load.o elf_$(BITS).o
> +obj-$(CONFIG_KEXEC_FILE)	+= file_load.o file_load_$(BITS).o elf_$(BITS).o
>   
>   ifdef CONFIG_HAVE_IMA_KEXEC
>   ifdef CONFIG_IMA
> diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
> index 3072fd6..23ad04c 100644
> --- a/arch/powerpc/kexec/elf_64.c
> +++ b/arch/powerpc/kexec/elf_64.c
> @@ -88,7 +88,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
>   		goto out;
>   	}
>   
> -	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
> +	ret = setup_new_fdt_ppc64(image, fdt, initrd_load_addr,
> +				  initrd_len, cmdline);
>   	if (ret)
>   		goto out;
>   
> @@ -107,8 +108,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
>   	pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
>   
>   	slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
> -	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
> -			      fdt_load_addr);
> +	ret = setup_purgatory_ppc64(image, slave_code, fdt, kernel_load_addr,
> +				    fdt_load_addr);
>   	if (ret)
>   		pr_err("Error setting up the purgatory.\n");
>   
> diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
> index 143c917..99a2c4d 100644
> --- a/arch/powerpc/kexec/file_load.c
> +++ b/arch/powerpc/kexec/file_load.c
> @@ -1,6 +1,6 @@
>   // SPDX-License-Identifier: GPL-2.0-only
>   /*
> - * ppc64 code to implement the kexec_file_load syscall
> + * powerpc code to implement the kexec_file_load syscall
>    *
>    * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
>    * Copyright (C) 2004  IBM Corp.
> @@ -16,26 +16,10 @@
>   
>   #include <linux/slab.h>
>   #include <linux/kexec.h>
> -#include <linux/of_fdt.h>
>   #include <linux/libfdt.h>
>   #include <asm/ima.h>
>   
> -#define SLAVE_CODE_SIZE		256
> -
> -const struct kexec_file_ops * const kexec_file_loaders[] = {
> -	&kexec_elf64_ops,
> -	NULL
> -};
> -
> -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
> -				  unsigned long buf_len)
> -{
> -	/* We don't support crash kernels yet. */
> -	if (image->type == KEXEC_TYPE_CRASH)
> -		return -EOPNOTSUPP;
> -
> -	return kexec_image_probe_default(image, buf, buf_len);
> -}
> +#define SLAVE_CODE_SIZE		256	/* First 0x100 bytes */
>   
>   /**
>    * setup_purgatory - initialize the purgatory's global variables
> @@ -127,24 +111,17 @@ int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
>    * @initrd_len:		Size of the next initrd, or 0 if there will be none.
>    * @cmdline:		Command line for the next kernel, or NULL if there will
>    *			be none.
> + * @chosen_node:        Set this output parameter to chosen_node.

minor:   ^ seems to be "node"

>    *
>    * Return: 0 on success, or negative errno on error.
>    */
>   int setup_new_fdt(const struct kimage *image, void *fdt,
>   		  unsigned long initrd_load_addr, unsigned long initrd_len,
> -		  const char *cmdline)
> +		  const char *cmdline, int *node)
>   {
>   	int ret, chosen_node;
>   	const void *prop;
>   
> -	/* Remove memory reservation for the current device tree. */
> -	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
> -				 fdt_totalsize(initial_boot_params));
> -	if (ret == 0)
> -		pr_debug("Removed old device tree reservation.\n");
> -	else if (ret != -ENOENT)
> -		return ret;
> -
>   	chosen_node = fdt_path_offset(fdt, "/chosen");
>   	if (chosen_node == -FDT_ERR_NOTFOUND) {
>   		chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
> @@ -157,6 +134,8 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
>   		pr_err("Malformed device tree: error reading /chosen.\n");
>   		return -EINVAL;
>   	}
> +	if (node)
> +		*node = chosen_node;
>   
>   	/* Did we boot using an initrd? */
>   	prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
> @@ -242,10 +221,6 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
>   		return ret;
>   	}
>   
> -	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
> -	if (ret)
> -		goto err;
> -
>   	return 0;
>   
>   err:
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> new file mode 100644
> index 0000000..e6bff960
> --- /dev/null
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -0,0 +1,108 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * ppc64 code to implement the kexec_file_load syscall
> + *
> + * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
> + * Copyright (C) 2004  IBM Corp.
> + * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
> + * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
> + * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
> + * Copyright (C) 2020  IBM Corporation
> + *
> + * Based on kexec-tools' kexec-ppc64.c, kexec-elf-rel-ppc64.c, fs2dt.c.
> + * Heavily modified for the kernel by
> + * Hari Bathini <hbathini@linux.ibm.com>.
> + */
> +
> +#include <linux/kexec.h>
> +#include <linux/of_fdt.h>
> +#include <linux/libfdt.h>
> +
> +const struct kexec_file_ops * const kexec_file_loaders[] = {
> +	&kexec_elf64_ops,
> +	NULL
> +};
> +
> +/**
> + * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
> + *                         variables and call setup_purgatory() to initialize
> + *                         common global variable.
> + * @image:                 kexec image.
> + * @slave_code:            Slave code for the purgatory.
> + * @fdt:                   Flattened device tree for the next kernel.
> + * @kernel_load_addr:      Address where the kernel is loaded.
> + * @fdt_load_addr:         Address where the flattened device tree is loaded.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
> +			  const void *fdt, unsigned long kernel_load_addr,
> +			  unsigned long fdt_load_addr)
> +{
> +	int ret;
> +
> +	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
> +			      fdt_load_addr);
> +	if (ret)
> +		pr_err("Failed to setup purgatory symbols");
> +	return ret;
> +}
> +
> +/**
> + * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
> + *                       being loaded.
> + * @image:               kexec image being loaded.
> + * @fdt:                 Flattened device tree for the next kernel.
> + * @initrd_load_addr:    Address where the next initrd will be loaded.
> + * @initrd_len:          Size of the next initrd, or 0 if there will be none.
> + * @cmdline:             Command line for the next kernel, or NULL if there will
> + *                       be none.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
> +			unsigned long initrd_load_addr,
> +			unsigned long initrd_len, const char *cmdline)
> +{
> +	int chosen_node, ret;
> +
> +	/* Remove memory reservation for the current device tree. */
> +	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
> +				 fdt_totalsize(initial_boot_params));
> +	if (ret == 0)
> +		pr_debug("Removed old device tree reservation.\n");
> +	else if (ret != -ENOENT) {
> +		pr_err("Failed to remove old device-tree reservation.\n");
> +		return ret;
> +	}
> +
> +	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
> +			    cmdline, &chosen_node);
> +	if (ret)
> +		return ret;
> +
> +	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
> +	if (ret)
> +		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
> +
> +	return ret;
> +}
> +
> +/**
> + * arch_kexec_kernel_image_probe - Does additional handling needed to setup
> + *                                 kexec segments.
> + * @image:                         kexec image being loaded.
> + * @buf:                           Buffer pointing to elf data.
> + * @buf_len:                       Length of the buffer.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
> +				  unsigned long buf_len)
> +{
> +	/* We don't support crash kernels yet. */
> +	if (image->type == KEXEC_TYPE_CRASH)
> +		return -EOPNOTSUPP;
> +
> +	return kexec_image_probe_default(image, buf, buf_len);
> +}
> diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
> index 7c6d8b1..348f5958 100644
> --- a/arch/powerpc/purgatory/Makefile
> +++ b/arch/powerpc/purgatory/Makefile
> @@ -2,11 +2,11 @@
>   
>   KASAN_SANITIZE := n
>   
> -targets += trampoline.o purgatory.ro kexec-purgatory.c
> +targets += trampoline_$(BITS).o purgatory.ro kexec-purgatory.c
>   
>   LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
>   
> -$(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
> +$(obj)/purgatory.ro: $(obj)/trampoline_$(BITS).o FORCE
>   		$(call if_changed,ld)
>   
>   quiet_cmd_bin2c = BIN2C   $@
> diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
> deleted file mode 100644
> index a5a83c3..0000000
> --- a/arch/powerpc/purgatory/trampoline.S
> +++ /dev/null
> @@ -1,117 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/*
> - * kexec trampoline
> - *
> - * Based on code taken from kexec-tools and kexec-lite.
> - *
> - * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
> - * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
> - * Copyright (C) 2013, Anton Blanchard, IBM Corporation
> - */
> -
> -#include <asm/asm-compat.h>
> -
> -	.machine ppc64
> -	.balign 256
> -	.globl purgatory_start
> -purgatory_start:
> -	b	master
> -
> -	/* ABI: possible run_at_load flag at 0x5c */
> -	.org purgatory_start + 0x5c
> -	.globl run_at_load
> -run_at_load:
> -	.long 0
> -	.size run_at_load, . - run_at_load
> -
> -	/* ABI: slaves start at 60 with r3=phys */
> -	.org purgatory_start + 0x60
> -slave:
> -	b .
> -	/* ABI: end of copied region */
> -	.org purgatory_start + 0x100
> -	.size purgatory_start, . - purgatory_start
> -
> -/*
> - * The above 0x100 bytes at purgatory_start are replaced with the
> - * code from the kernel (or next stage) by setup_purgatory().
> - */
> -
> -master:
> -	or	%r1,%r1,%r1	/* low priority to let other threads catchup */
> -	isync
> -	mr	%r17,%r3	/* save cpu id to r17 */
> -	mr	%r15,%r4	/* save physical address in reg15 */
> -
> -	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
> -	lis	%r6,0x1
> -	mtctr	%r6		/* delay a bit for slaves to catch up */
> -	bdnz	.		/* before we overwrite 0-100 again */
> -
> -	bl	0f		/* Work out where we're running */
> -0:	mflr	%r18
> -
> -	/* load device-tree address */
> -	ld	%r3, (dt_offset - 0b)(%r18)
> -	mr	%r16,%r3	/* save dt address in reg16 */
> -	li	%r4,20
> -	LWZX_BE	%r6,%r3,%r4	/* fetch __be32 version number at byte 20 */
> -	cmpwi	%cr0,%r6,2	/* v2 or later? */
> -	blt	1f
> -	li	%r4,28
> -	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
> -1:
> -	/* load the kernel address */
> -	ld	%r4,(kernel - 0b)(%r18)
> -
> -	/* load the run_at_load flag */
> -	/* possibly patched by kexec */
> -	ld	%r6,(run_at_load - 0b)(%r18)
> -	/* and patch it into the kernel */
> -	stw	%r6,(0x5c)(%r4)
> -
> -	mr	%r3,%r16	/* restore dt address */
> -
> -	li	%r5,0		/* r5 will be 0 for kernel */
> -
> -	mfmsr	%r11
> -	andi.	%r10,%r11,1	/* test MSR_LE */
> -	bne	.Little_endian
> -
> -	mtctr	%r4		/* prepare branch to */
> -	bctr			/* start kernel */
> -
> -.Little_endian:
> -	mtsrr0	%r4		/* prepare branch to */
> -
> -	clrrdi	%r11,%r11,1	/* clear MSR_LE */
> -	mtsrr1	%r11
> -
> -	rfid			/* update MSR and start kernel */
> -
> -
> -	.balign 8
> -	.globl kernel
> -kernel:
> -	.8byte  0x0
> -	.size kernel, . - kernel
> -
> -	.balign 8
> -	.globl dt_offset
> -dt_offset:
> -	.8byte  0x0
> -	.size dt_offset, . - dt_offset
> -
> -
> -	.data
> -	.balign 8
> -.globl purgatory_sha256_digest
> -purgatory_sha256_digest:
> -	.skip	32
> -	.size purgatory_sha256_digest, . - purgatory_sha256_digest
> -
> -	.balign 8
> -.globl purgatory_sha_regions
> -purgatory_sha_regions:
> -	.skip	8 * 2 * 16
> -	.size purgatory_sha_regions, . - purgatory_sha_regions
> diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
> new file mode 100644
> index 0000000..a5a83c3
> --- /dev/null
> +++ b/arch/powerpc/purgatory/trampoline_64.S
> @@ -0,0 +1,117 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * kexec trampoline
> + *
> + * Based on code taken from kexec-tools and kexec-lite.
> + *
> + * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
> + * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
> + * Copyright (C) 2013, Anton Blanchard, IBM Corporation
> + */
> +
> +#include <asm/asm-compat.h>
> +
> +	.machine ppc64
> +	.balign 256
> +	.globl purgatory_start
> +purgatory_start:
> +	b	master
> +
> +	/* ABI: possible run_at_load flag at 0x5c */
> +	.org purgatory_start + 0x5c
> +	.globl run_at_load
> +run_at_load:
> +	.long 0
> +	.size run_at_load, . - run_at_load
> +
> +	/* ABI: slaves start at 60 with r3=phys */
> +	.org purgatory_start + 0x60
> +slave:
> +	b .
> +	/* ABI: end of copied region */
> +	.org purgatory_start + 0x100
> +	.size purgatory_start, . - purgatory_start
> +
> +/*
> + * The above 0x100 bytes at purgatory_start are replaced with the
> + * code from the kernel (or next stage) by setup_purgatory().
> + */
> +
> +master:
> +	or	%r1,%r1,%r1	/* low priority to let other threads catchup */
> +	isync
> +	mr	%r17,%r3	/* save cpu id to r17 */
> +	mr	%r15,%r4	/* save physical address in reg15 */
> +
> +	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
> +	lis	%r6,0x1
> +	mtctr	%r6		/* delay a bit for slaves to catch up */
> +	bdnz	.		/* before we overwrite 0-100 again */
> +
> +	bl	0f		/* Work out where we're running */
> +0:	mflr	%r18
> +
> +	/* load device-tree address */
> +	ld	%r3, (dt_offset - 0b)(%r18)
> +	mr	%r16,%r3	/* save dt address in reg16 */
> +	li	%r4,20
> +	LWZX_BE	%r6,%r3,%r4	/* fetch __be32 version number at byte 20 */
> +	cmpwi	%cr0,%r6,2	/* v2 or later? */
> +	blt	1f
> +	li	%r4,28
> +	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
> +1:
> +	/* load the kernel address */
> +	ld	%r4,(kernel - 0b)(%r18)
> +
> +	/* load the run_at_load flag */
> +	/* possibly patched by kexec */
> +	ld	%r6,(run_at_load - 0b)(%r18)
> +	/* and patch it into the kernel */
> +	stw	%r6,(0x5c)(%r4)
> +
> +	mr	%r3,%r16	/* restore dt address */
> +
> +	li	%r5,0		/* r5 will be 0 for kernel */
> +
> +	mfmsr	%r11
> +	andi.	%r10,%r11,1	/* test MSR_LE */
> +	bne	.Little_endian
> +
> +	mtctr	%r4		/* prepare branch to */
> +	bctr			/* start kernel */
> +
> +.Little_endian:
> +	mtsrr0	%r4		/* prepare branch to */
> +
> +	clrrdi	%r11,%r11,1	/* clear MSR_LE */
> +	mtsrr1	%r11
> +
> +	rfid			/* update MSR and start kernel */
> +
> +
> +	.balign 8
> +	.globl kernel
> +kernel:
> +	.8byte  0x0
> +	.size kernel, . - kernel
> +
> +	.balign 8
> +	.globl dt_offset
> +dt_offset:
> +	.8byte  0x0
> +	.size dt_offset, . - dt_offset
> +
> +
> +	.data
> +	.balign 8
> +.globl purgatory_sha256_digest
> +purgatory_sha256_digest:
> +	.skip	32
> +	.size purgatory_sha256_digest, . - purgatory_sha256_digest
> +
> +	.balign 8
> +.globl purgatory_sha_regions
> +purgatory_sha_regions:
> +	.skip	8 * 2 * 16
> +	.size purgatory_sha_regions, . - purgatory_sha_regions
> 


^ permalink raw reply

* Re: [PATCH v6 3/5] KVM: PPC: clean up redundant kvm_run parameters in assembly
From: Paolo Bonzini @ 2020-07-10  7:57 UTC (permalink / raw)
  To: Tianjia Zhang, tsbogend, paulus, mpe, benh, borntraeger, frankja,
	david, cohuck, heiko.carstens, gor, sean.j.christopherson,
	vkuznets, wanpengli, jmattson, joro, tglx, mingo, bp, x86, hpa,
	maz, james.morse, julien.thierry.kdev, suzuki.poulose,
	christoffer.dall, peterx, thuth, chenhuacai
  Cc: linux-s390, kvm, linux-mips, kvm-ppc, linux-kernel, linuxppc-dev,
	kvmarm, linux-arm-kernel
In-Reply-To: <20200623131418.31473-4-tianjia.zhang@linux.alibaba.com>

On 23/06/20 15:14, Tianjia Zhang wrote:
>  
>  	/* Load non-volatile guest state from the vcpu */
> -	VCPU_LOAD_NVGPRS(r4)
> +	VCPU_LOAD_NVGPRS(r3)
>  
>  kvm_start_lightweight:
>  	/* Copy registers into shadow vcpu so we can access them in real mode */
> -	mr	r3, r4
>  	bl	FUNC(kvmppc_copy_to_svcpu)
>  	nop
> -	REST_GPR(4, r1)
> +	REST_GPR(3, r1)
>  
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	/* Get the dcbz32 flag */
> @@ -146,7 +144,7 @@ after_sprg3_load:

Below, there are a bunch of references to r3 and r4 left

        rldicl  r3, r3, 0, 63           /* r3 &= 1 */
        stb     r3, HSTATE_RESTORE_HID5(r13)

        /* Load up guest SPRG3 value, since it's user readable */
        lwz     r3, VCPU_SHAREDBE(r4)				  <<<
        cmpwi   r3, 0
        ld      r5, VCPU_SHARED(r4)				  <<<

where r3 is also destroyed.  So I'd rather have three patches:

- one that is as small as possible, changing the prototypes and adding

	mr r4, r3

- one that entirely swaps out r3 and r4.  This would be the hard one to
review!

- one that cleans up the prologue and epilogue

But overall I think it's simplest to just leave out this patch.

Paolo


^ permalink raw reply

* Re: [PATCH v6 0/5] clean up redundant 'kvm_run' parameters
From: Paolo Bonzini @ 2020-07-10  8:06 UTC (permalink / raw)
  To: Tianjia Zhang, tsbogend, paulus, mpe, benh, borntraeger, frankja,
	david, cohuck, heiko.carstens, gor, sean.j.christopherson,
	vkuznets, wanpengli, jmattson, joro, tglx, mingo, bp, x86, hpa,
	maz, james.morse, julien.thierry.kdev, suzuki.poulose,
	christoffer.dall, peterx, thuth, chenhuacai
  Cc: linux-s390, kvm, linux-mips, kvm-ppc, linux-kernel, linuxppc-dev,
	kvmarm, linux-arm-kernel
In-Reply-To: <6604e273-d7b1-5007-8721-75c4a4dec68e@linux.alibaba.com>

On 10/07/20 09:32, Tianjia Zhang wrote:
> Hi Paolo,
> 
> Any opinion on this series patches? Can I help with this patchset ?

I was hoping to have some Tested-by, for now I'm queuing patches 1 and
2.  Thanks,

Paolo

> Thanks and best,
> Tianjia
> 
> On 2020/6/23 21:14, Tianjia Zhang wrote:
>> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
>> structure. For historical reasons, many kvm-related function parameters
>> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
>> patch does a unified cleanup of these remaining redundant parameters.
>>
>> This series of patches has completely cleaned the architecture of
>> arm64, mips, ppc, and s390 (no such redundant code on x86). Due to
>> the large number of modified codes, a separate patch is made for each
>> platform. On the ppc platform, there is also a redundant structure
>> pointer of 'kvm_run' in 'vcpu_arch', which has also been cleaned
>> separately.
>>
>> ---
>> v6 changes:
>>    Rearrange patch sets, only keep the unmerged patch.
>>    rebase on mainline.
>>
>> v5 change:
>>    ppc: fix for review.
>>
>> v4 change:
>>    mips: fixes two errors in entry.c.
>>
>> v3 change:
>>    Keep the existing `vcpu->run` in the function body unchanged.
>>
>> v2 change:
>>    s390 retains the original variable name and minimizes modification.
>>
>> Tianjia Zhang (5):
>>    KVM: s390: clean up redundant 'kvm_run' parameters
>>    KVM: arm64: clean up redundant 'kvm_run' parameters
>>    KVM: PPC: clean up redundant kvm_run parameters in assembly
>>    KVM: MIPS: clean up redundant 'kvm_run' parameters
>>    KVM: MIPS: clean up redundant kvm_run parameters in assembly
>>
>>   arch/arm64/include/asm/kvm_coproc.h   |  12 +--
>>   arch/arm64/include/asm/kvm_host.h     |  11 +--
>>   arch/arm64/include/asm/kvm_mmu.h      |   2 +-
>>   arch/arm64/kvm/arm.c                  |   6 +-
>>   arch/arm64/kvm/handle_exit.c          |  36 ++++----
>>   arch/arm64/kvm/mmio.c                 |  11 +--
>>   arch/arm64/kvm/mmu.c                  |   5 +-
>>   arch/arm64/kvm/sys_regs.c             |  13 ++-
>>   arch/mips/include/asm/kvm_host.h      |  32 ++------
>>   arch/mips/kvm/emulate.c               |  59 +++++--------
>>   arch/mips/kvm/entry.c                 |  21 ++---
>>   arch/mips/kvm/mips.c                  |  14 ++--
>>   arch/mips/kvm/trap_emul.c             | 114 +++++++++++---------------
>>   arch/mips/kvm/vz.c                    |  26 +++---
>>   arch/powerpc/include/asm/kvm_ppc.h    |   2 +-
>>   arch/powerpc/kvm/book3s_interrupts.S  |  22 +++--
>>   arch/powerpc/kvm/book3s_pr.c          |   9 +-
>>   arch/powerpc/kvm/booke.c              |   9 +-
>>   arch/powerpc/kvm/booke_interrupts.S   |   9 +-
>>   arch/powerpc/kvm/bookehv_interrupts.S |  10 +--
>>   arch/s390/kvm/kvm-s390.c              |  23 ++++--
>>   21 files changed, 188 insertions(+), 258 deletions(-)
>>
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox