- * [PATCH kernel v10 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-12  1:51   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group Alexey Kardashevskiy
                   ` (32 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The dev_has_iommu_table() check relies on the fact that a PCI device always
has an IOMMU table, which may not be the case when we get dynamic DMA windows,
so let's use a more reliable check for the IOMMU group here.
As we do not rely on the table presence here, remove the workaround
from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
parameter from pnv_ioda_setup_bus_dma().
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/eeh.c                 |  4 +---
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++++----------------------
 2 files changed, 6 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9ee61d1..defd874 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1412,13 +1412,11 @@ static int dev_has_iommu_table(struct device *dev, void *data)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct pci_dev **ppdev = data;
-	struct iommu_table *tbl;
 
 	if (!dev)
 		return 0;
 
-	tbl = get_iommu_table_base(dev);
-	if (tbl && tbl->it_group) {
+	if (dev->iommu_group) {
 		*ppdev = pdev;
 		return 1;
 	}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f8bc950..2f092bb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1654,21 +1654,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-				   struct pci_bus *bus,
-				   bool add_to_iommu_group)
+				   struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		if (add_to_iommu_group)
-			set_iommu_table_base_and_group(&dev->dev,
-						       pe->tce32_table);
-		else
-			set_iommu_table_base(&dev->dev, pe->tce32_table);
+		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
 
 		if (dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-					       add_to_iommu_group);
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
 }
 
@@ -1845,7 +1839,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 	} else if (pe->flags & PNV_IODA_PE_VF) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
@@ -1882,17 +1876,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 						     window_id,
 						     pe->tce_bypass_base,
 						     0);
-
-		/*
-		 * EEH needs the mapping between IOMMU table and group
-		 * of those VFIO/KVM pass-through devices. We can postpone
-		 * resetting DMA ops until the DMA mask is configured in
-		 * host side.
-		 */
-		if (pe->pdev)
-			set_iommu_table_base(&pe->pdev->dev, tbl);
-		else
-			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
 	}
 	if (rc)
 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
@@ -1984,7 +1967,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 	} else if (pe->flags & PNV_IODA_PE_VF) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group
  2015-05-11 15:38 ` [PATCH kernel v10 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group Alexey Kardashevskiy
@ 2015-05-12  1:51   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-12  1:51 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:50AM +1000, Alexey Kardashevskiy wrote:
>This relies on the fact that a PCI device always has an IOMMU table
>which may not be the case when we get dynamic DMA windows so
>let's use more reliable check for IOMMU group here.
>
>As we do not rely on the table presence here, remove the workaround
>from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
>parameter from pnv_ioda_setup_bus_dma().
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> arch/powerpc/kernel/eeh.c                 |  4 +---
> arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++++----------------------
> 2 files changed, 6 insertions(+), 25 deletions(-)
>
>diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>index 9ee61d1..defd874 100644
>--- a/arch/powerpc/kernel/eeh.c
>+++ b/arch/powerpc/kernel/eeh.c
>@@ -1412,13 +1412,11 @@ static int dev_has_iommu_table(struct device *dev, void *data)
> {
> 	struct pci_dev *pdev = to_pci_dev(dev);
> 	struct pci_dev **ppdev = data;
>-	struct iommu_table *tbl;
>
> 	if (!dev)
> 		return 0;
>
>-	tbl = get_iommu_table_base(dev);
>-	if (tbl && tbl->it_group) {
>+	if (dev->iommu_group) {
> 		*ppdev = pdev;
> 		return 1;
> 	}
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index f8bc950..2f092bb 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1654,21 +1654,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
> }
>
> static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>-				   struct pci_bus *bus,
>-				   bool add_to_iommu_group)
>+				   struct pci_bus *bus)
> {
> 	struct pci_dev *dev;
>
> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>-		if (add_to_iommu_group)
>-			set_iommu_table_base_and_group(&dev->dev,
>-						       pe->tce32_table);
>-		else
>-			set_iommu_table_base(&dev->dev, pe->tce32_table);
>+		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
>
> 		if (dev->subordinate)
>-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
>-					       add_to_iommu_group);
>+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
> 	}
> }
>
>@@ -1845,7 +1839,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
>+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
> 	} else if (pe->flags & PNV_IODA_PE_VF) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>@@ -1882,17 +1876,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
> 						     window_id,
> 						     pe->tce_bypass_base,
> 						     0);
>-
>-		/*
>-		 * EEH needs the mapping between IOMMU table and group
>-		 * of those VFIO/KVM pass-through devices. We can postpone
>-		 * resetting DMA ops until the DMA mask is configured in
>-		 * host side.
>-		 */
>-		if (pe->pdev)
>-			set_iommu_table_base(&pe->pdev->dev, tbl);
>-		else
>-			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
> 	}
> 	if (rc)
> 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
>@@ -1984,7 +1967,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
>+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
> 	} else if (pe->flags & PNV_IODA_PE_VF) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
  2015-05-11 15:38 ` [PATCH kernel v10 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  5:18   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration Alexey Kardashevskiy
                   ` (31 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The set_iommu_table_base_and_group() name suggests that the function
sets the table base and adds a device to an IOMMU group. However, the actual
table base setting happens in pnv_pci_ioda_dma_dev_setup().
The actual purpose of table base setting is to put some reference
into a device so that later iommu_add_device() can get the IOMMU group
reference and add the device to the group.
At the moment a group cannot be explicitly passed to iommu_add_device()
as we want it to work from the bus notifier, we can fix it later and
remove confusing calls of set_iommu_table_base().
This replaces set_iommu_table_base_and_group() with a couple of
set_iommu_table_base() + iommu_add_device() which makes reading the code
easier.
This adds a few comments explaining why set_iommu_table_base() and
iommu_add_device() are called where they are called.
For IODA1/2, this essentially removes iommu_add_device() call from
the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
place:
- for physical PE, the device is already attached by iommu_add_device()
in pnv_pci_ioda_setup_dma_pe();
- for virtual PE, the sysfs entries are not ready to create all symlinks
so actual adding is happening in tce_iommu_bus_notifier.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* new to the series
---
 arch/powerpc/include/asm/iommu.h            |  7 -------
 arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++++++++++++++++++++++----
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c      | 15 ++++++++-------
 4 files changed, 33 insertions(+), 19 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 1e27d63..8353c86 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
-static inline void set_iommu_table_base_and_group(struct device *dev,
-						  void *base)
-{
-	set_iommu_table_base(dev, base);
-	iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    struct scatterlist *sglist, int nelems,
 			    unsigned long mask,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2f092bb..9a77f3c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, pe->tce32_table);
+	/*
+	 * Note: iommu_add_device() will fail here as
+	 * for physical PE: the device is already added by now;
+	 * for virtual PE: sysfs entries are not ready yet and
+	 * tce_iommu_bus_notifier will add the device to a group later.
+	 */
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1659,7 +1665,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
+		set_iommu_table_base(&dev->dev, pe->tce32_table);
+		iommu_add_device(&dev->dev);
 
 		if (dev->subordinate)
 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
@@ -1835,7 +1842,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+		/*
+		 * Setting table base here only for carrying iommu_group
+		 * further down to let iommu_add_device() do the job.
+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+		 */
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+		iommu_add_device(&pe->pdev->dev);
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
@@ -1963,7 +1976,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+		/*
+		 * Setting table base here only for carrying iommu_group
+		 * further down to let iommu_add_device() do the job.
+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+		 */
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+		iommu_add_device(&pe->pdev->dev);
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 4729ca7..b17d93615 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -92,7 +92,8 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
 	}
 
-	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+	iommu_add_device(&pdev->dev);
 }
 
 static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 61d5a17..05ab06d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -688,8 +688,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
+		set_iommu_table_base(&dev->dev, tbl);
+		iommu_add_device(&dev->dev);
 		return;
 	}
 
@@ -700,10 +700,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
 		dn = dn->parent;
 
-	if (dn && PCI_DN(dn))
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
-	else
+	if (dn && PCI_DN(dn)) {
+		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		iommu_add_device(&dev->dev);
+	} else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
 }
@@ -1115,7 +1115,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
 	}
 
-	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
+	set_iommu_table_base(&dev->dev, pci->iommu_table);
+	iommu_add_device(&dev->dev);
 }
 
 static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group
  2015-05-11 15:38 ` [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group Alexey Kardashevskiy
@ 2015-05-13  5:18   ` Gavin Shan
  2015-05-13  7:26     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  5:18 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:51AM +1000, Alexey Kardashevskiy wrote:
>The set_iommu_table_base_and_group() name suggests that the function
>sets table base and add a device to an IOMMU group. However actual
>table base setting happens in pnv_pci_ioda_dma_dev_setup().
>
On PHB3, the DMA32 IOMMU table is created during PHB fixup time
in ppc_md.pcibios_fixup(), which is invoked at the end of PCI enumeration.
The IOMMU tables of PCI devices are initialized at the same time.
pnv_pci_ioda_dma_dev_setup() is called when adding a PCI device
or fixing up a PCI bus at PCI enumeration time. So the commit log
here isn't accurate enough.
Basically, set_iommu_table_base_and_group() does two things
in one function, which is nice. I guess you don't need this function
any more after DDW is supported, and that's the reason to remove it?
>The actual purpose for table base setting is to put some reference
>into a device so later iommu_add_device() can get the IOMMU group
>reference and the device to the group.
>
>At the moment a group cannot be explicitly passed to iommu_add_device()
>as we want it to work from the bus notifier, we can fix it later and
>remove confusing calls of set_iommu_table_base().
>
>This replaces set_iommu_table_base_and_group() with a couple of
>set_iommu_table_base() + iommu_add_device() which makes reading the code
>easier.
>
>This adds few comments why set_iommu_table_base() and iommu_add_device()
>are called where they are called.
>
>For IODA1/2, this essentially removes iommu_add_device() call from
>the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
>place:
>- for physical PE, the device is already attached by iommu_add_device()
>in pnv_pci_ioda_setup_dma_pe();
>- for virtual PE, the sysfs entries are not ready to create all symlinks
>so actual adding is happening in tce_iommu_bus_notifier.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v10:
>* new to the series
>---
> arch/powerpc/include/asm/iommu.h            |  7 -------
> arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++++++++++++++++++++++----
> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
> arch/powerpc/platforms/pseries/iommu.c      | 15 ++++++++-------
> 4 files changed, 33 insertions(+), 19 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index 1e27d63..8353c86 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
> }
> #endif /* !CONFIG_IOMMU_API */
>
>-static inline void set_iommu_table_base_and_group(struct device *dev,
>-						  void *base)
>-{
>-	set_iommu_table_base(dev, base);
>-	iommu_add_device(dev);
>-}
>-
> extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
> 			    struct scatterlist *sglist, int nelems,
> 			    unsigned long mask,
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 2f092bb..9a77f3c 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>
> 	pe = &phb->ioda.pe_array[pdn->pe_number];
> 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>-	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
>+	set_iommu_table_base(&pdev->dev, pe->tce32_table);
>+	/*
>+	 * Note: iommu_add_device() will fail here as
>+	 * for physical PE: the device is already added by now;
>+	 * for virtual PE: sysfs entries are not ready yet and
>+	 * tce_iommu_bus_notifier will add the device to a group later.
>+	 */
I didn't figure out how the IOMMU table is initialized for PCI device in this
function during bootup time. At system bootup time, the function is only called
when applying fixup to PCI bus in pcibios_fixup_bus(). At that time, we don't
have PE# yet, which is allocated at PHB fixup time (ppc_md.pcibios_fixup_phb).
> }
>
> static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
>@@ -1659,7 +1665,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
> 	struct pci_dev *dev;
>
> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>-		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
>+		set_iommu_table_base(&dev->dev, pe->tce32_table);
>+		iommu_add_device(&dev->dev);
>
> 		if (dev->subordinate)
> 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
>@@ -1835,7 +1842,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	if (pe->flags & PNV_IODA_PE_DEV) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
>+		/*
>+		 * Setting table base here only for carrying iommu_group
>+		 * further down to let iommu_add_device() do the job.
>+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
>+		 */
>+		set_iommu_table_base(&pe->pdev->dev, tbl);
>+		iommu_add_device(&pe->pdev->dev);
> 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>@@ -1963,7 +1976,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	if (pe->flags & PNV_IODA_PE_DEV) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
>+		/*
>+		 * Setting table base here only for carrying iommu_group
>+		 * further down to let iommu_add_device() do the job.
>+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
>+		 */
>+		set_iommu_table_base(&pe->pdev->dev, tbl);
>+		iommu_add_device(&pe->pdev->dev);
> 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
> 		iommu_register_group(tbl, phb->hose->global_number,
> 				     pe->pe_number);
>diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>index 4729ca7..b17d93615 100644
>--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>@@ -92,7 +92,8 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
> 				pci_domain_nr(phb->hose->bus), phb->opal_id);
> 	}
>
>-	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
>+	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
>+	iommu_add_device(&pdev->dev);
> }
>
> static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index 61d5a17..05ab06d 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -688,8 +688,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 		iommu_table_setparms(phb, dn, tbl);
> 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
> 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
>-		set_iommu_table_base_and_group(&dev->dev,
>-					       PCI_DN(dn)->iommu_table);
>+		set_iommu_table_base(&dev->dev, tbl);
>+		iommu_add_device(&dev->dev);
> 		return;
> 	}
>
>@@ -700,10 +700,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
> 		dn = dn->parent;
>
>-	if (dn && PCI_DN(dn))
>-		set_iommu_table_base_and_group(&dev->dev,
>-					       PCI_DN(dn)->iommu_table);
>-	else
>+	if (dn && PCI_DN(dn)) {
>+		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
>+		iommu_add_device(&dev->dev);
>+	} else
> 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
> 		       pci_name(dev));
> }
>@@ -1115,7 +1115,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
> 	}
>
>-	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
>+	set_iommu_table_base(&dev->dev, pci->iommu_table);
>+	iommu_add_device(&dev->dev);
> }
>
> static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
Thanks,
Gavin
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group
  2015-05-13  5:18   ` Gavin Shan
@ 2015-05-13  7:26     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-13  7:26 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/13/2015 03:18 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:38:51AM +1000, Alexey Kardashevskiy wrote:
>> The set_iommu_table_base_and_group() name suggests that the function
>> sets table base and add a device to an IOMMU group. However actual
>> table base setting happens in pnv_pci_ioda_dma_dev_setup().
>>
>
> On PHB3, the DMA32 IOMMU table is created during PHB fixup time
> in ppc_md.pcibios_fixup(), which is invoked at end of PCI enumeration.
> The IOMMU table of PCI devices are initialized at same time.
> pnv_pci_ioda_dma_dev_setup() is called when adding PCI device
> or fixing up PCI bus at PCI enumeration time. So the commit logs
> here isn't accurate enough.
Right. I'll remove the second sentence.
> Basically, set_iommu_table_base_and_group() which does two things
> in one function, which is nice. I guess you don't need this function
> any more after DDW is supported and it's the reason to remove it?
Ideally iommu_add_device() should not need any iommu_table pointer at all, 
it needs an iommu_table_group, which represents the IOMMU, one per PE. Not a 
table (we can have more than just one or even none).
set_iommu_table_base() sets the table for later use by the platform DMA 
code (first half of arch/powerpc/kernel/iommu.c, all these 
iommu_alloc/iommu_free). If no device driver was loaded for a device in a 
group - calling set_iommu_table_base() is not needed.
KVM/VFIO do not call get_iommu_table_base() and do not use the table_base 
(except iommu_add_device()) and there is no point for VFIO-related bits of 
IOMMU code to rely on whether set_iommu_table_base() was called or not. It 
could make sense when table's life time was exactly the same as PE's 
lifetime but with DDW it is not the case anymore.
What we have now is a workaround - device-to-group assignment does not have 
anything to do with the iommu_table itself, it just needs the it_table_group 
pointer. In general, I would like to use common iommu_group_get_for_dev(), 
it just won't work for us right now but we can try fixing that code. Or we 
can add root devices into groups when we configure PEs 
(pnv_pci_ioda2_setup_dma_pe()) and later when we add actual PCI functions 
(physical or SRIOV) from tce_iommu_bus_notifier - we can walk up PCI bus 
hierarchy till we find a first parent with a group set and add the child to 
this group. We can do this later.
So the function might be nice but it was confusing me (and I added it in 
the first place). It is easy to miss the iommu_add_device() part while reading 
the code because it was disguised by the "set_iommu_table_base" prefix.
>> The actual purpose for table base setting is to put some reference
>> into a device so later iommu_add_device() can get the IOMMU group
>> reference and the device to the group.
>>
>> At the moment a group cannot be explicitly passed to iommu_add_device()
>> as we want it to work from the bus notifier, we can fix it later and
>> remove confusing calls of set_iommu_table_base().
>>
>> This replaces set_iommu_table_base_and_group() with a couple of
>> set_iommu_table_base() + iommu_add_device() which makes reading the code
>> easier.
>>
>> This adds few comments why set_iommu_table_base() and iommu_add_device()
>> are called where they are called.
>>
>> For IODA1/2, this essentially removes iommu_add_device() call from
>> the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
>> place:
>> - for physical PE, the device is already attached by iommu_add_device()
>> in pnv_pci_ioda_setup_dma_pe();
>> - for virtual PE, the sysfs entries are not ready to create all symlinks
>> so actual adding is happening in tce_iommu_bus_notifier.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v10:
>> * new to the series
>> ---
>> arch/powerpc/include/asm/iommu.h            |  7 -------
>> arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++++++++++++++++++++++----
>> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
>> arch/powerpc/platforms/pseries/iommu.c      | 15 ++++++++-------
>> 4 files changed, 33 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index 1e27d63..8353c86 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
>> }
>> #endif /* !CONFIG_IOMMU_API */
>>
>> -static inline void set_iommu_table_base_and_group(struct device *dev,
>> -						  void *base)
>> -{
>> -	set_iommu_table_base(dev, base);
>> -	iommu_add_device(dev);
>> -}
>> -
>> extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
>> 			    struct scatterlist *sglist, int nelems,
>> 			    unsigned long mask,
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 2f092bb..9a77f3c 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>>
>> 	pe = &phb->ioda.pe_array[pdn->pe_number];
>> 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>> -	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
>> +	set_iommu_table_base(&pdev->dev, pe->tce32_table);
>> +	/*
>> +	 * Note: iommu_add_device() will fail here as
>> +	 * for physical PE: the device is already added by now;
>> +	 * for virtual PE: sysfs entries are not ready yet and
>> +	 * tce_iommu_bus_notifier will add the device to a group later.
>> +	 */
>
> I didn't figure out how the IOMMU table is initialized for PCI device in this
> function during bootup time. At system bootup time, the function is only called
> when applying fixup to PCI bus in pcibios_fixup_bus(). At that time, we don't
> have PE# yet, which is allocated at PHB fixup time (ppc_md.pcibios_fixup_phb).
My understanding is:
pnv_pci_ioda_dma_dev_setup() is called when
1. the device is just discovered
2. PE is configured
3. driver called set_dma_mask
The actual table gets initialized (or allocated + initialized after this 
patchset) at 2). So passing anything (NULL or a pointer to uninitialized 
iommu_table) to set_iommu_table_base() at 1) does not make any difference - 
iommu_alloc() won't be used and won't work by then anyway.
But since 1), 2), 3) all call pnv_pci_ioda_dma_dev_setup() and we do not 
know which case it is - we just call set_iommu_table_base() from there in the 
hope that iommu_alloc() happens after set_iommu_table_base() was lucky 
enough to be called with an initialized iommu_table pointer.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
  2015-05-11 15:38 ` [PATCH kernel v10 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group Alexey Kardashevskiy
  2015-05-11 15:38 ` [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  5:21   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly Alexey Kardashevskiy
                   ` (30 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The existing code has 3 calls to iommu_register_group() and
all 3 branches actually cover all possible cases.
This replaces 3 calls with one and moves the registration earlier;
the latter will make more sense when we add TCE table sharing.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9a77f3c..8ca7abd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1784,6 +1784,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	tbl = pe->tce32_table;
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
@@ -1818,7 +1821,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);
 
@@ -1840,8 +1842,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
 		/*
 		 * Setting table base here only for carrying iommu_group
 		 * further down to let iommu_add_device() do the job.
@@ -1849,14 +1849,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		 */
 		set_iommu_table_base(&pe->pdev->dev, tbl);
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
 
 	return;
  fail:
@@ -1923,6 +1917,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	tbl = pe->tce32_table;
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
 	end = (1 << ilog2(phb->ioda.m32_pci_base));
@@ -1954,7 +1951,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
 			IOMMU_PAGE_SHIFT_4K);
 
@@ -1974,8 +1970,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
 		/*
 		 * Setting table base here only for carrying iommu_group
 		 * further down to let iommu_add_device() do the job.
@@ -1983,14 +1977,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		 */
 		set_iommu_table_base(&pe->pdev->dev, tbl);
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
 
 	/* Also create a bypass window */
 	if (!pnv_iommu_bypass_disabled)
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration
  2015-05-11 15:38 ` [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration Alexey Kardashevskiy
@ 2015-05-13  5:21   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  5:21 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:52AM +1000, Alexey Kardashevskiy wrote:
>The existing code has 3 calls to iommu_register_group() and
>all 3 branches actually cover all possible cases.
>
>This replaces 3 calls with one and moves the registration earlier;
>the latter will make more sense when we add TCE table sharing.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 28 ++++++++--------------------
> 1 file changed, 8 insertions(+), 20 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 9a77f3c..8ca7abd 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1784,6 +1784,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>+	tbl = pe->tce32_table;
>+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>+
> 	/* Grab a 32-bit TCE table */
> 	pe->tce32_seg = base;
> 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
>@@ -1818,7 +1821,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	}
>
> 	/* Setup linux iommu table */
>-	tbl = pe->tce32_table;
> 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
> 				  base << 28, IOMMU_PAGE_SHIFT_4K);
>
>@@ -1840,8 +1842,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	iommu_init_table(tbl, phb->hose->node);
>
> 	if (pe->flags & PNV_IODA_PE_DEV) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
> 		/*
> 		 * Setting table base here only for carrying iommu_group
> 		 * further down to let iommu_add_device() do the job.
>@@ -1849,14 +1849,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 		 */
> 		set_iommu_table_base(&pe->pdev->dev, tbl);
> 		iommu_add_device(&pe->pdev->dev);
>-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
>+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
>-	} else if (pe->flags & PNV_IODA_PE_VF) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
>-	}
>
> 	return;
>  fail:
>@@ -1923,6 +1917,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>+	tbl = pe->tce32_table;
>+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>+
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
> 	end = (1 << ilog2(phb->ioda.m32_pci_base));
>@@ -1954,7 +1951,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	}
>
> 	/* Setup linux iommu table */
>-	tbl = pe->tce32_table;
> 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
> 			IOMMU_PAGE_SHIFT_4K);
>
>@@ -1974,8 +1970,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	iommu_init_table(tbl, phb->hose->node);
>
> 	if (pe->flags & PNV_IODA_PE_DEV) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
> 		/*
> 		 * Setting table base here only for carrying iommu_group
> 		 * further down to let iommu_add_device() do the job.
>@@ -1983,14 +1977,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 		 */
> 		set_iommu_table_base(&pe->pdev->dev, tbl);
> 		iommu_add_device(&pe->pdev->dev);
>-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
>+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
>-	} else if (pe->flags & PNV_IODA_PE_VF) {
>-		iommu_register_group(tbl, phb->hose->global_number,
>-				     pe->pe_number);
>-	}
>
> 	/* Also create a bypass window */
> 	if (!pnv_iommu_bypass_disabled)
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (2 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  5:27   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table() Alexey Kardashevskiy
                   ` (29 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
will change this and iommu_free_table() will not always require
the group to be released.
This moves iommu_group_put() out of iommu_free_table().
This adds an iommu_pseries_free_table() helper which does
iommu_group_put() and iommu_free_table(). Later it will be
changed to receive a table_group and we will have to change fewer
lines then.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/iommu.c               |  7 -------
 arch/powerpc/platforms/powernv/pci-ioda.c |  5 +++++
 arch/powerpc/platforms/pseries/iommu.c    | 14 +++++++++++++-
 3 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..3d47eb3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
-#ifdef CONFIG_IOMMU_API
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
-	}
-#endif
-
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8ca7abd..8c3c4bf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,7 @@
 #include <linux/io.h>
 #include <linux/msi.h>
 #include <linux/memblock.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
 	pe->tce32_table = NULL;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 05ab06d..89f557b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include <linux/crash_dump.h>
 #include <linux/memory.h>
 #include <linux/of.h>
+#include <linux/iommu.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -51,6 +52,16 @@
 
 #include "pseries.h"
 
+static void iommu_pseries_free_table(struct iommu_table *tbl,
+		const char *node_name)
+{
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
+	iommu_free_table(tbl, node_name);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
 				      __be64 *startp, __be64 *endp)
 {
@@ -1271,7 +1282,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 */
 		remove_ddw(np, false);
 		if (pci && pci->iommu_table)
-			iommu_free_table(pci->iommu_table, np->full_name);
+			iommu_pseries_free_table(pci->iommu_table,
+					np->full_name);
 
 		spin_lock(&direct_window_list_lock);
 		list_for_each_entry(window, &direct_window_list, list) {
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly
  2015-05-11 15:38 ` [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly Alexey Kardashevskiy
@ 2015-05-13  5:27   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  5:27 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:53AM +1000, Alexey Kardashevskiy wrote:
>So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
>will change this and iommu_free_table() will not always require
>the group to be released.
>
>This moves iommu_group_put() out of iommu_free_table().
>
>This adds a iommu_pseries_free_table() helper which does
>iommu_group_put() and iommu_free_table(). Later it will be
>changed to receive a table_group and we will have to change less
>lines then.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> arch/powerpc/kernel/iommu.c               |  7 -------
> arch/powerpc/platforms/powernv/pci-ioda.c |  5 +++++
> arch/powerpc/platforms/pseries/iommu.c    | 14 +++++++++++++-
> 3 files changed, 18 insertions(+), 8 deletions(-)
>
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index b054f33..3d47eb3 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
> 	if (tbl->it_offset == 0)
> 		clear_bit(0, tbl->it_map);
>
>-#ifdef CONFIG_IOMMU_API
>-	if (tbl->it_group) {
>-		iommu_group_put(tbl->it_group);
>-		BUG_ON(tbl->it_group);
>-	}
>-#endif
>-
> 	/* verify that table contains no entries */
> 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
> 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 8ca7abd..8c3c4bf 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -23,6 +23,7 @@
> #include <linux/io.h>
> #include <linux/msi.h>
> #include <linux/memblock.h>
>+#include <linux/iommu.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
>@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	if (rc)
> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>
>+	if (tbl->it_group) {
>+		iommu_group_put(tbl->it_group);
>+		BUG_ON(tbl->it_group);
>+	}
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
> 	pe->tce32_table = NULL;
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index 05ab06d..89f557b 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -36,6 +36,7 @@
> #include <linux/crash_dump.h>
> #include <linux/memory.h>
> #include <linux/of.h>
>+#include <linux/iommu.h>
> #include <asm/io.h>
> #include <asm/prom.h>
> #include <asm/rtas.h>
>@@ -51,6 +52,16 @@
>
> #include "pseries.h"
>
>+static void iommu_pseries_free_table(struct iommu_table *tbl,
>+		const char *node_name)
>+{
>+	if (tbl->it_group) {
>+		iommu_group_put(tbl->it_group);
>+		BUG_ON(tbl->it_group);
>+	}
>+	iommu_free_table(tbl, node_name);
>+}
>+
> static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
> 				      __be64 *startp, __be64 *endp)
> {
>@@ -1271,7 +1282,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
> 		 */
> 		remove_ddw(np, false);
> 		if (pci && pci->iommu_table)
>-			iommu_free_table(pci->iommu_table, np->full_name);
>+			iommu_pseries_free_table(pci->iommu_table,
>+					np->full_name);
>
> 		spin_lock(&direct_window_list_lock);
> 		list_for_each_entry(window, &direct_window_list, list) {
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (3 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  5:33   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
                   ` (28 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
At the moment iommu_free_table() only releases memory if
the table was initialized for the platform code use, i.e. it had
it_map initialized (whose purpose is to track DMA memory space use).
With dynamic DMA windows, we will need to be able to release
iommu_table even if it was used for VFIO, in which case it_map is NULL;
this patch makes iommu_free_table() do so.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3d47eb3..2c02d4c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	unsigned int order;
 
 	if (!tbl || !tbl->it_map) {
-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-				node_name);
+		kfree(tbl);
 		return;
 	}
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-11 15:38 ` [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table() Alexey Kardashevskiy
@ 2015-05-13  5:33   ` Gavin Shan
  2015-05-13  6:30     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  5:33 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
>At the moment iommu_free_table() only releases memory if
>the table was initialized for the platform code use, i.e. it had
>it_map initialized (which purpose is to track DMA memory space use).
>
>With dynamic DMA windows, we will need to be able to release
>iommu_table even if it was used for VFIO in which case it_map is NULL
>so does the patch.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
> arch/powerpc/kernel/iommu.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 3d47eb3..2c02d4c 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
> 	unsigned int order;
>
> 	if (!tbl || !tbl->it_map) {
>-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
>-				node_name);
>+		kfree(tbl);
I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
has the check. But it looks a bit strange to free NULL "tbl" from the code
itself.
Thanks,
Gavin
> 		return;
> 	}
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-13  5:33   ` Gavin Shan
@ 2015-05-13  6:30     ` Alexey Kardashevskiy
  2015-05-13 12:51       ` Thomas Huth
  0 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-13  6:30 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/13/2015 03:33 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
>> At the moment iommu_free_table() only releases memory if
>> the table was initialized for the platform code use, i.e. it had
>> it_map initialized (which purpose is to track DMA memory space use).
>>
>> With dynamic DMA windows, we will need to be able to release
>> iommu_table even if it was used for VFIO in which case it_map is NULL
>> so does the patch.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> arch/powerpc/kernel/iommu.c | 3 +--
>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 3d47eb3..2c02d4c 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>> 	unsigned int order;
>>
>> 	if (!tbl || !tbl->it_map) {
>> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
>> -				node_name);
>> +		kfree(tbl);
>
> I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
> has the check. But it looks a bit strange to free NULL "tbl" from the code
> itself.
Yeah, I agree it looks a bit weird; I'll change it. But in general
kfree/vfree/... all check the passed pointer for NULL.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-13  6:30     ` Alexey Kardashevskiy
@ 2015-05-13 12:51       ` Thomas Huth
  2015-05-13 23:27         ` Gavin Shan
  0 siblings, 1 reply; 82+ messages in thread
From: Thomas Huth @ 2015-05-13 12:51 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Wed, 13 May 2015 16:30:16 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> On 05/13/2015 03:33 PM, Gavin Shan wrote:
> > On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
> >> At the moment iommu_free_table() only releases memory if
> >> the table was initialized for the platform code use, i.e. it had
> >> it_map initialized (which purpose is to track DMA memory space use).
> >>
> >> With dynamic DMA windows, we will need to be able to release
> >> iommu_table even if it was used for VFIO in which case it_map is NULL
> >> so does the patch.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >
> > Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >
> >> ---
> >> arch/powerpc/kernel/iommu.c | 3 +--
> >> 1 file changed, 1 insertion(+), 2 deletions(-)
> >>
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index 3d47eb3..2c02d4c 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
> >> 	unsigned int order;
> >>
> >> 	if (!tbl || !tbl->it_map) {
> >> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
> >> -				node_name);
> >> +		kfree(tbl);
> >
> > I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
> > has the check. But it looks a bit strange to free NULL "tbl" from the code
> > itself.
> 
> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/... 
> - they all check the passed pointer for NULL.
But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
this case, I think you have to keep it.
 Thomas
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-13 12:51       ` Thomas Huth
@ 2015-05-13 23:27         ` Gavin Shan
  2015-05-14  2:34           ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-13 23:27 UTC (permalink / raw)
  To: Thomas Huth
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, linuxppc-dev, David Gibson
On Wed, May 13, 2015 at 02:51:36PM +0200, Thomas Huth wrote:
>On Wed, 13 May 2015 16:30:16 +1000
>Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
>> On 05/13/2015 03:33 PM, Gavin Shan wrote:
>> > On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
>> >> At the moment iommu_free_table() only releases memory if
>> >> the table was initialized for the platform code use, i.e. it had
>> >> it_map initialized (which purpose is to track DMA memory space use).
>> >>
>> >> With dynamic DMA windows, we will need to be able to release
>> >> iommu_table even if it was used for VFIO in which case it_map is NULL
>> >> so does the patch.
>> >>
>> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> >
>> > Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>> >
>> >> ---
>> >> arch/powerpc/kernel/iommu.c | 3 +--
>> >> 1 file changed, 1 insertion(+), 2 deletions(-)
>> >>
>> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> >> index 3d47eb3..2c02d4c 100644
>> >> --- a/arch/powerpc/kernel/iommu.c
>> >> +++ b/arch/powerpc/kernel/iommu.c
>> >> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>> >> 	unsigned int order;
>> >>
>> >> 	if (!tbl || !tbl->it_map) {
>> >> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
>> >> -				node_name);
>> >> +		kfree(tbl);
>> >
>> > I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
>> > has the check. But it looks a bit strange to free NULL "tbl" from the code
>> > itself.
>> 
>> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/... 
>> - they all check the passed pointer for NULL.
>
>But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
>this case, I think you have to keep it.
>
If I understood your question correctly, "tbl->it_map" won't be checked
when "tbl" is NULL because the two conditions are joined with "||", which
short-circuits. The code can be changed to something like below if Alexey wants:
	if (!tbl)
		return;
	if (!tbl->it_map)
		kfree(tbl);
Thanks,
Gavin
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-13 23:27         ` Gavin Shan
@ 2015-05-14  2:34           ` Alexey Kardashevskiy
  2015-05-14  2:53             ` Alex Williamson
  0 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  2:34 UTC (permalink / raw)
  To: Gavin Shan, Thomas Huth
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 09:27 AM, Gavin Shan wrote:
> On Wed, May 13, 2015 at 02:51:36PM +0200, Thomas Huth wrote:
>> On Wed, 13 May 2015 16:30:16 +1000
>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>
>>> On 05/13/2015 03:33 PM, Gavin Shan wrote:
>>>> On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
>>>>> At the moment iommu_free_table() only releases memory if
>>>>> the table was initialized for the platform code use, i.e. it had
>>>>> it_map initialized (which purpose is to track DMA memory space use).
>>>>>
>>>>> With dynamic DMA windows, we will need to be able to release
>>>>> iommu_table even if it was used for VFIO in which case it_map is NULL
>>>>> so does the patch.
>>>>>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>
>>>> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>
>>>>> ---
>>>>> arch/powerpc/kernel/iommu.c | 3 +--
>>>>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>>>> index 3d47eb3..2c02d4c 100644
>>>>> --- a/arch/powerpc/kernel/iommu.c
>>>>> +++ b/arch/powerpc/kernel/iommu.c
>>>>> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>>>>> 	unsigned int order;
>>>>>
>>>>> 	if (!tbl || !tbl->it_map) {
>>>>> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
>>>>> -				node_name);
>>>>> +		kfree(tbl);
>>>>
>>>> I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
>>>> has the check. But it looks a bit strange to free NULL "tbl" from the code
>>>> itself.
>>>
>>> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/...
>>> - they all check the passed pointer for NULL.
>>
>> But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
>> this case, I think you have to keep it.
>>
>
> If I understood your question correctly, "tbl->it_map" won't be checked
> when "tbl" is NULL because the connection ("||") for the two conditions.
> The code can be changed to something like below if Alexey want:
>
> 	if (!tbl)
> 		return;
> 	if (!tbl->itmap)
> 		kfree(tbl);
To be precise ;)
if (!tbl->it_map) {
	kfree(tbl);
	return;
}
>
> Thanks,
> Gavin
>
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-14  2:34           ` Alexey Kardashevskiy
@ 2015-05-14  2:53             ` Alex Williamson
  2015-05-14  6:29               ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Alex Williamson @ 2015-05-14  2:53 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Thomas Huth, Gavin Shan, linux-kernel, Paul Mackerras,
	linuxppc-dev, Wei Yang, David Gibson
On Thu, 2015-05-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
> On 05/14/2015 09:27 AM, Gavin Shan wrote:
> > On Wed, May 13, 2015 at 02:51:36PM +0200, Thomas Huth wrote:
> >> On Wed, 13 May 2015 16:30:16 +1000
> >> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>
> >>> On 05/13/2015 03:33 PM, Gavin Shan wrote:
> >>>> On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
> >>>>> At the moment iommu_free_table() only releases memory if
> >>>>> the table was initialized for the platform code use, i.e. it had
> >>>>> it_map initialized (which purpose is to track DMA memory space use).
> >>>>>
> >>>>> With dynamic DMA windows, we will need to be able to release
> >>>>> iommu_table even if it was used for VFIO in which case it_map is NULL
> >>>>> so does the patch.
> >>>>>
> >>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>
> >>>> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
> >>>>
> >>>>> ---
> >>>>> arch/powerpc/kernel/iommu.c | 3 +--
> >>>>> 1 file changed, 1 insertion(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>>>> index 3d47eb3..2c02d4c 100644
> >>>>> --- a/arch/powerpc/kernel/iommu.c
> >>>>> +++ b/arch/powerpc/kernel/iommu.c
> >>>>> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
> >>>>> 	unsigned int order;
> >>>>>
> >>>>> 	if (!tbl || !tbl->it_map) {
> >>>>> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
> >>>>> -				node_name);
> >>>>> +		kfree(tbl);
> >>>>
> >>>> I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
> >>>> has the check. But it looks a bit strange to free NULL "tbl" from the code
> >>>> itself.
> >>>
> >>> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/...
> >>> - they all check the passed pointer for NULL.
> >>
> >> But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
> >> this case, I think you have to keep it.
> >>
> >
> > If I understood your question correctly, "tbl->it_map" won't be checked
> > when "tbl" is NULL because the connection ("||") for the two conditions.
> > The code can be changed to something like below if Alexey want:
> >
> > 	if (!tbl)
> > 		return;
> > 	if (!tbl->itmap)
> > 		kfree(tbl);
> 
> To be precise ;)
> 
> if (!tbl->itmap) {
> 	kfree(tbl);
> 	return;
> }
I hope that's not your solution, it clearly segfaults with a null
pointer de-ref if !tbl, which is apparently a concern down this path.
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()
  2015-05-14  2:53             ` Alex Williamson
@ 2015-05-14  6:29               ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  6:29 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Thomas Huth, Gavin Shan, linux-kernel, Paul Mackerras,
	linuxppc-dev, Wei Yang, David Gibson
On 05/14/2015 12:53 PM, Alex Williamson wrote:
> On Thu, 2015-05-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
>> On 05/14/2015 09:27 AM, Gavin Shan wrote:
>>> On Wed, May 13, 2015 at 02:51:36PM +0200, Thomas Huth wrote:
>>>> On Wed, 13 May 2015 16:30:16 +1000
>>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>>
>>>>> On 05/13/2015 03:33 PM, Gavin Shan wrote:
>>>>>> On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
>>>>>>> At the moment iommu_free_table() only releases memory if
>>>>>>> the table was initialized for the platform code use, i.e. it had
>>>>>>> it_map initialized (which purpose is to track DMA memory space use).
>>>>>>>
>>>>>>> With dynamic DMA windows, we will need to be able to release
>>>>>>> iommu_table even if it was used for VFIO in which case it_map is NULL
>>>>>>> so does the patch.
>>>>>>>
>>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>>
>>>>>> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>>>>>>
>>>>>>> ---
>>>>>>> arch/powerpc/kernel/iommu.c | 3 +--
>>>>>>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>>>>>> index 3d47eb3..2c02d4c 100644
>>>>>>> --- a/arch/powerpc/kernel/iommu.c
>>>>>>> +++ b/arch/powerpc/kernel/iommu.c
>>>>>>> @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>>>>>>> 	unsigned int order;
>>>>>>>
>>>>>>> 	if (!tbl || !tbl->it_map) {
>>>>>>> -		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
>>>>>>> -				node_name);
>>>>>>> +		kfree(tbl);
>>>>>>
>>>>>> I'm not sure if the "tbl" needs to be checked against NULL as kfree() already
>>>>>> has the check. But it looks a bit strange to free NULL "tbl" from the code
>>>>>> itself.
>>>>>
>>>>> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/...
>>>>> - they all check the passed pointer for NULL.
>>>>
>>>> But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
>>>> this case, I think you have to keep it.
>>>>
>>>
>>> If I understood your question correctly, "tbl->it_map" won't be checked
>>> when "tbl" is NULL because the connection ("||") for the two conditions.
>>> The code can be changed to something like below if Alexey want:
>>>
>>> 	if (!tbl)
>>> 		return;
>>> 	if (!tbl->itmap)
>>> 		kfree(tbl);
>>
>> To be precise ;)
>>
>> if (!tbl->itmap) {
>> 	kfree(tbl);
>> 	return;
>> }
>
> I hope that's not your solution, it clearly segfaults with a null
> pointer de-ref if !tbl, which is apparently a concern down this path.
I meant the second "if" needs fixing. I need the first one - "if (!tbl)" - 
anyway. What did I miss?
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
 
 
 
 
 
- * [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (4 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table() Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  5:58   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page Alexey Kardashevskiy
                   ` (27 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This moves the page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and into the VFIO IOMMU driver, where it belongs,
as the platform code does not deal with page pinning.
This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.
This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.
Besides the last part, the rest of the patch is mechanical.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* added missing tce_iommu_clear call before iommu_release_ownership()
* brought @offset (a local variable) back to make patch even more
mechanical
v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h    |  4 --
 arch/powerpc/kernel/iommu.c         | 55 -------------------------
 drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++++++++++++++++++++++++++++++------
 3 files changed, 67 insertions(+), 72 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 8353c86..e94a5e3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 		unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
 		unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-		unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2c02d4c..8673c94 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -983,30 +983,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages)
-{
-	unsigned long oldtce;
-	struct page *page;
-
-	for ( ; pages; --pages, ++entry) {
-		oldtce = iommu_clear_tce(tbl, entry);
-		if (!oldtce)
-			continue;
-
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-				SetPageDirty(page);
-			put_page(page);
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1036,35 +1012,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-		unsigned long tce)
-{
-	int ret;
-	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
-
-	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-			direction != DMA_TO_DEVICE, &page);
-	if (unlikely(ret != 1)) {
-		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << tbl->it_page_shift, ret); */
-		return -EFAULT;
-	}
-	hwaddr = (unsigned long) page_address(page) + offset;
-
-	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-	if (ret)
-		put_page(page);
-
-	if (ret < 0)
-		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-			__func__, entry << tbl->it_page_shift, tce, ret);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
@@ -1078,7 +1025,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	}
 
 	memset(tbl->it_map, 0xff, sz);
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
 	/*
 	 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1096,7 +1042,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
 
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 730b4ef..b95fa2b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data)
 	kfree(container);
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	unsigned long oldtce;
+	struct page *page;
+
+	for ( ; pages; --pages, ++entry) {
+		oldtce = iommu_clear_tce(tbl, entry);
+		if (!oldtce)
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+		WARN_ON(!page);
+		if (page) {
+			if (oldtce & TCE_PCI_WRITE)
+				SetPageDirty(page);
+			put_page(page);
+		}
+	}
+
+	return 0;
+}
+
+static long tce_iommu_build(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages)
+{
+	long i, ret = 0;
+	struct page *page = NULL;
+	unsigned long hva;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	for (i = 0; i < pages; ++i) {
+		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+
+		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+				direction != DMA_TO_DEVICE, &page);
+		if (unlikely(ret != 1)) {
+			ret = -EFAULT;
+			break;
+		}
+		hva = (unsigned long) page_address(page) + offset;
+
+		ret = iommu_tce_build(tbl, entry + i, hva, direction);
+		if (ret) {
+			put_page(page);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+		tce += IOMMU_PAGE_SIZE_4K;
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+
+	return ret;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
@@ -195,7 +256,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
 		struct iommu_table *tbl = container->tbl;
-		unsigned long tce, i;
+		unsigned long tce;
 
 		if (!tbl)
 			return -ENXIO;
@@ -229,17 +290,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
-			ret = iommu_put_tce_user_mode(tbl,
-					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
-					tce);
-			if (ret)
-				break;
-			tce += IOMMU_PAGE_SIZE_4K;
-		}
-		if (ret)
-			iommu_clear_tces_and_put_pages(tbl,
-					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
+		ret = tce_iommu_build(container, tbl,
+				param.iova >> IOMMU_PAGE_SHIFT_4K,
+				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
 
 		iommu_flush_tce(tbl);
 
@@ -273,7 +326,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		ret = iommu_clear_tces_and_put_pages(tbl,
+		ret = tce_iommu_clear(container, tbl,
 				param.iova >> IOMMU_PAGE_SHIFT_4K,
 				param.size >> IOMMU_PAGE_SHIFT_4K);
 		iommu_flush_tce(tbl);
@@ -357,6 +410,7 @@ static void tce_iommu_detach_group(void *iommu_data,
 		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
 				iommu_group_id(iommu_group), iommu_group); */
 		container->tbl = NULL;
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 		iommu_release_ownership(tbl);
 	}
 	mutex_unlock(&container->lock);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver
  2015-05-11 15:38 ` [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
@ 2015-05-13  5:58   ` Gavin Shan
  2015-05-13  6:32     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  5:58 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:55AM +1000, Alexey Kardashevskiy wrote:
>This moves page pinning (get_user_pages_fast()/put_page()) code out of
>the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
>to as the platform code does not deal with page pinning.
>
>This makes iommu_take_ownership()/iommu_release_ownership() deal with
>the IOMMU table bitmap only.
>
>This removes page unpinning from iommu_take_ownership() as the actual
>TCE table might contain garbage and doing put_page() on it is undefined
>behaviour.
>
>Besides the last part, the rest of the patch is mechanical.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v9:
>* added missing tce_iommu_clear call after iommu_release_ownership()
>* brought @offset (a local variable) back to make patch even more
>mechanical
>
>v4:
>* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
>---
> arch/powerpc/include/asm/iommu.h    |  4 --
> arch/powerpc/kernel/iommu.c         | 55 -------------------------
> drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++++++++++++++++++++++++++++++------
> 3 files changed, 67 insertions(+), 72 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index 8353c86..e94a5e3 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
> 		unsigned long hwaddr, enum dma_data_direction direction);
> extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
> 		unsigned long entry);
>-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
>-		unsigned long entry, unsigned long pages);
>-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
>-		unsigned long entry, unsigned long tce);
>
> extern void iommu_flush_tce(struct iommu_table *tbl);
> extern int iommu_take_ownership(struct iommu_table *tbl);
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 2c02d4c..8673c94 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -983,30 +983,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
> }
> EXPORT_SYMBOL_GPL(iommu_clear_tce);
>
>-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
>-		unsigned long entry, unsigned long pages)
>-{
>-	unsigned long oldtce;
>-	struct page *page;
>-
>-	for ( ; pages; --pages, ++entry) {
>-		oldtce = iommu_clear_tce(tbl, entry);
>-		if (!oldtce)
>-			continue;
>-
>-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>-		WARN_ON(!page);
>-		if (page) {
>-			if (oldtce & TCE_PCI_WRITE)
>-				SetPageDirty(page);
>-			put_page(page);
>-		}
>-	}
>-
>-	return 0;
>-}
>-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
>-
> /*
>  * hwaddr is a kernel virtual address here (0xc... bazillion),
>  * tce_build converts it to a physical address.
>@@ -1036,35 +1012,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
> }
> EXPORT_SYMBOL_GPL(iommu_tce_build);
>
>-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
>-		unsigned long tce)
>-{
>-	int ret;
>-	struct page *page = NULL;
>-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
>-	enum dma_data_direction direction = iommu_tce_direction(tce);
>-
>-	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>-			direction != DMA_TO_DEVICE, &page);
>-	if (unlikely(ret != 1)) {
>-		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
>-				tce, entry << tbl->it_page_shift, ret); */
>-		return -EFAULT;
>-	}
>-	hwaddr = (unsigned long) page_address(page) + offset;
>-
>-	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
>-	if (ret)
>-		put_page(page);
>-
>-	if (ret < 0)
>-		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
>-			__func__, entry << tbl->it_page_shift, tce, ret);
>-
>-	return ret;
>-}
>-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
>-
> int iommu_take_ownership(struct iommu_table *tbl)
> {
> 	unsigned long sz = (tbl->it_size + 7) >> 3;
>@@ -1078,7 +1025,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
> 	}
>
> 	memset(tbl->it_map, 0xff, sz);
>-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
>
> 	/*
> 	 * Disable iommu bypass, otherwise the user can DMA to all of
>@@ -1096,7 +1042,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
> {
> 	unsigned long sz = (tbl->it_size + 7) >> 3;
>
>-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
> 	memset(tbl->it_map, 0, sz);
>
> 	/* Restore bit#0 set by iommu_init_table() */
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 730b4ef..b95fa2b 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data)
> 	kfree(container);
> }
>
>+static int tce_iommu_clear(struct tce_container *container,
>+		struct iommu_table *tbl,
>+		unsigned long entry, unsigned long pages)
>+{
>+	unsigned long oldtce;
>+	struct page *page;
>+
>+	for ( ; pages; --pages, ++entry) {
>+		oldtce = iommu_clear_tce(tbl, entry);
It might be nice to rename iommu_clear_tce() to iommu_tce_free() in a
separate patch, for two reasons that I can see: iommu_tce_{build, free} would
then be a pair of functions doing opposite things, and iommu_tce_free() is
implemented on top of ppc_md.tce_free() just as iommu_tce_build() depends on
ppc_md.tce_build().
>+		if (!oldtce)
>+			continue;
>+
>+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>+		WARN_ON(!page);
>+		if (page) {
>+			if (oldtce & TCE_PCI_WRITE)
>+				SetPageDirty(page);
>+			put_page(page);
>+		}
>+	}
>+
>+	return 0;
>+}
>+
>+static long tce_iommu_build(struct tce_container *container,
>+		struct iommu_table *tbl,
>+		unsigned long entry, unsigned long tce, unsigned long pages)
>+{
>+	long i, ret = 0;
>+	struct page *page = NULL;
>+	unsigned long hva;
>+	enum dma_data_direction direction = iommu_tce_direction(tce);
>+
>+	for (i = 0; i < pages; ++i) {
>+		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
>+
>+		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>+				direction != DMA_TO_DEVICE, &page);
>+		if (unlikely(ret != 1)) {
>+			ret = -EFAULT;
>+			break;
>+		}
>+		hva = (unsigned long) page_address(page) + offset;
>+
>+		ret = iommu_tce_build(tbl, entry + i, hva, direction);
>+		if (ret) {
>+			put_page(page);
>+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
>+					__func__, entry << tbl->it_page_shift,
>+					tce, ret);
>+			break;
>+		}
>+		tce += IOMMU_PAGE_SIZE_4K;
>+	}
>+
>+	if (ret)
>+		tce_iommu_clear(container, tbl, entry, i);
>+
>+	return ret;
>+}
>+
> static long tce_iommu_ioctl(void *iommu_data,
> 				 unsigned int cmd, unsigned long arg)
> {
>@@ -195,7 +256,7 @@ static long tce_iommu_ioctl(void *iommu_data,
> 	case VFIO_IOMMU_MAP_DMA: {
> 		struct vfio_iommu_type1_dma_map param;
> 		struct iommu_table *tbl = container->tbl;
>-		unsigned long tce, i;
>+		unsigned long tce;
>
> 		if (!tbl)
> 			return -ENXIO;
>@@ -229,17 +290,9 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (ret)
> 			return ret;
>
>-		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
>-			ret = iommu_put_tce_user_mode(tbl,
>-					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
>-					tce);
>-			if (ret)
>-				break;
>-			tce += IOMMU_PAGE_SIZE_4K;
>-		}
>-		if (ret)
>-			iommu_clear_tces_and_put_pages(tbl,
>-					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
>+		ret = tce_iommu_build(container, tbl,
>+				param.iova >> IOMMU_PAGE_SHIFT_4K,
>+				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
>
> 		iommu_flush_tce(tbl);
>
>@@ -273,7 +326,7 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (ret)
> 			return ret;
>
>-		ret = iommu_clear_tces_and_put_pages(tbl,
>+		ret = tce_iommu_clear(container, tbl,
> 				param.iova >> IOMMU_PAGE_SHIFT_4K,
> 				param.size >> IOMMU_PAGE_SHIFT_4K);
> 		iommu_flush_tce(tbl);
>@@ -357,6 +410,7 @@ static void tce_iommu_detach_group(void *iommu_data,
> 		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> 				iommu_group_id(iommu_group), iommu_group); */
> 		container->tbl = NULL;
>+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> 		iommu_release_ownership(tbl);
> 	}
> 	mutex_unlock(&container->lock);
Thanks,
Gavin
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver
  2015-05-13  5:58   ` Gavin Shan
@ 2015-05-13  6:32     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-13  6:32 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/13/2015 03:58 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:38:55AM +1000, Alexey Kardashevskiy wrote:
>> This moves page pinning (get_user_pages_fast()/put_page()) code out of
>> the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
>> to as the platform code does not deal with page pinning.
>>
>> This makes iommu_take_ownership()/iommu_release_ownership() deal with
>> the IOMMU table bitmap only.
>>
>> This removes page unpinning from iommu_take_ownership() as the actual
>> TCE table might contain garbage and doing put_page() on it is undefined
>> behaviour.
>>
>> Besides the last part, the rest of the patch is mechanical.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> [aw: for the vfio related changes]
>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v9:
>> * added missing tce_iommu_clear call after iommu_release_ownership()
>> * brought @offset (a local variable) back to make patch even more
>> mechanical
>>
>> v4:
>> * s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
>> ---
>> arch/powerpc/include/asm/iommu.h    |  4 --
>> arch/powerpc/kernel/iommu.c         | 55 -------------------------
>> drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++++++++++++++++++++++++++++++------
>> 3 files changed, 67 insertions(+), 72 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index 8353c86..e94a5e3 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
>> 		unsigned long hwaddr, enum dma_data_direction direction);
>> extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
>> 		unsigned long entry);
>> -extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
>> -		unsigned long entry, unsigned long pages);
>> -extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
>> -		unsigned long entry, unsigned long tce);
>>
>> extern void iommu_flush_tce(struct iommu_table *tbl);
>> extern int iommu_take_ownership(struct iommu_table *tbl);
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 2c02d4c..8673c94 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -983,30 +983,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
>> }
>> EXPORT_SYMBOL_GPL(iommu_clear_tce);
>>
>> -int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
>> -		unsigned long entry, unsigned long pages)
>> -{
>> -	unsigned long oldtce;
>> -	struct page *page;
>> -
>> -	for ( ; pages; --pages, ++entry) {
>> -		oldtce = iommu_clear_tce(tbl, entry);
>> -		if (!oldtce)
>> -			continue;
>> -
>> -		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>> -		WARN_ON(!page);
>> -		if (page) {
>> -			if (oldtce & TCE_PCI_WRITE)
>> -				SetPageDirty(page);
>> -			put_page(page);
>> -		}
>> -	}
>> -
>> -	return 0;
>> -}
>> -EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
>> -
>> /*
>>   * hwaddr is a kernel virtual address here (0xc... bazillion),
>>   * tce_build converts it to a physical address.
>> @@ -1036,35 +1012,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
>> }
>> EXPORT_SYMBOL_GPL(iommu_tce_build);
>>
>> -int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
>> -		unsigned long tce)
>> -{
>> -	int ret;
>> -	struct page *page = NULL;
>> -	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
>> -	enum dma_data_direction direction = iommu_tce_direction(tce);
>> -
>> -	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> -			direction != DMA_TO_DEVICE, &page);
>> -	if (unlikely(ret != 1)) {
>> -		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
>> -				tce, entry << tbl->it_page_shift, ret); */
>> -		return -EFAULT;
>> -	}
>> -	hwaddr = (unsigned long) page_address(page) + offset;
>> -
>> -	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
>> -	if (ret)
>> -		put_page(page);
>> -
>> -	if (ret < 0)
>> -		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
>> -			__func__, entry << tbl->it_page_shift, tce, ret);
>> -
>> -	return ret;
>> -}
>> -EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
>> -
>> int iommu_take_ownership(struct iommu_table *tbl)
>> {
>> 	unsigned long sz = (tbl->it_size + 7) >> 3;
>> @@ -1078,7 +1025,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
>> 	}
>>
>> 	memset(tbl->it_map, 0xff, sz);
>> -	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
>>
>> 	/*
>> 	 * Disable iommu bypass, otherwise the user can DMA to all of
>> @@ -1096,7 +1042,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
>> {
>> 	unsigned long sz = (tbl->it_size + 7) >> 3;
>>
>> -	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
>> 	memset(tbl->it_map, 0, sz);
>>
>> 	/* Restore bit#0 set by iommu_init_table() */
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index 730b4ef..b95fa2b 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data)
>> 	kfree(container);
>> }
>>
>> +static int tce_iommu_clear(struct tce_container *container,
>> +		struct iommu_table *tbl,
>> +		unsigned long entry, unsigned long pages)
>> +{
>> +	unsigned long oldtce;
>> +	struct page *page;
>> +
>> +	for ( ; pages; --pages, ++entry) {
>> +		oldtce = iommu_clear_tce(tbl, entry);
>
> It might be nice to rename iommu_clear_tce() to iommu_tce_free() with another
> separate patch for two reasons as I can see: iommu_tce_{build, free} is one
> pair of functions doing opposite things. iommu_tce_free() is implemented based
> on ppc_md.tce_free() as iommu_tce_build() depends on ppc_md.tce_build().
Later in this patchset (in "[PATCH kernel v10 23/34] powerpc/iommu/powernv:
Release replaced TCE") I am removing iommu_clear_tce() and
iommu_tce_build() (there will be iommu_tce_xchg() only), so there is no
point in renaming those.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (5 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  6:06   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 08/34] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
                   ` (26 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This checks that the TCE table page size is not bigger than the size of
a page we just pinned and whose physical address we are going to put into the table.
Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.
Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for additional check whether the page is huge.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v8: changed subject
v6:
* the helper is simplified to one line
v4:
* s/tce_check_page_size/tce_page_is_contained/
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index b95fa2b..735b308 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -47,6 +47,16 @@ struct tce_container {
 	bool enabled;
 };
 
+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+	/*
+	 * Check that the TCE table granularity is not bigger than the size of
+	 * a page we just found. Otherwise the hardware can get access to
+	 * a bigger memory chunk that it should.
+	 */
+	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
@@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container *container,
 			ret = -EFAULT;
 			break;
 		}
+
+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
 		hva = (unsigned long) page_address(page) + offset;
 
 		ret = iommu_tce_build(tbl, entry + i, hva, direction);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page
  2015-05-11 15:38 ` [PATCH kernel v10 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page Alexey Kardashevskiy
@ 2015-05-13  6:06   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  6:06 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:56AM +1000, Alexey Kardashevskiy wrote:
>This checks that the TCE table page size is not bigger that the size of
>a page we just pinned and going to put its physical address to the table.
>
>Otherwise the hardware gets unwanted access to physical memory between
>the end of the actual page and the end of the aligned up TCE page.
>
>Since compound_order() and compound_head() work correctly on non-huge
>pages, there is no need for additional check whether the page is huge.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v8: changed subject
>
>v6:
>* the helper is simplified to one line
>
>v4:
>* s/tce_check_page_size/tce_page_is_contained/
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index b95fa2b..735b308 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -47,6 +47,16 @@ struct tce_container {
> 	bool enabled;
> };
>
>+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>+{
>+	/*
>+	 * Check that the TCE table granularity is not bigger than the size of
>+	 * a page we just found. Otherwise the hardware can get access to
>+	 * a bigger memory chunk that it should.
>+	 */
>+	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
>+}
>+
> static int tce_iommu_enable(struct tce_container *container)
> {
> 	int ret = 0;
>@@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container *container,
> 			ret = -EFAULT;
> 			break;
> 		}
>+
>+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
>+			ret = -EPERM;
>+			break;
>+		}
>+
> 		hva = (unsigned long) page_address(page) + offset;
>
> 		ret = iommu_tce_build(tbl, entry + i, hva, direction);
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 08/34] vfio: powerpc/spapr: Use it_page_size
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (6 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  6:12   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
                   ` (25 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This makes use of the it_page_size from the iommu_table struct
as page size can differ.
This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
as recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 735b308..64300cc 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * enforcing the limit based on the max that the guest can map.
 	 */
 	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
 	locked = current->mm->locked_vm + npages;
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container *container)
 
 	down_write(&current->mm->mmap_sem);
 	current->mm->locked_vm -= (container->tbl->it_size <<
-			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+			container->tbl->it_page_shift) >> PAGE_SHIFT;
 	up_write(&current->mm->mmap_sem);
 }
 
@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
 					tce, ret);
 			break;
 		}
-		tce += IOMMU_PAGE_SIZE_4K;
+		tce += IOMMU_PAGE_SIZE(tbl);
 	}
 
 	if (ret)
@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (info.argsz < minsz)
 			return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
 		info.flags = 0;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
-		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 			return ret;
 
 		ret = tce_iommu_build(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				tce, param.size >> tbl->it_page_shift);
 
 		iommu_flush_tce(tbl);
 
@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
-		if (param.size & ~IOMMU_PAGE_MASK_4K)
+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 			return -EINVAL;
 
 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.size >> tbl->it_page_shift);
 		if (ret)
 			return ret;
 
 		ret = tce_iommu_clear(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				param.size >> tbl->it_page_shift);
 		iommu_flush_tce(tbl);
 
 		return ret;
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 08/34] vfio: powerpc/spapr: Use it_page_size
  2015-05-11 15:38 ` [PATCH kernel v10 08/34] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
@ 2015-05-13  6:12   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  6:12 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:57AM +1000, Alexey Kardashevskiy wrote:
>This makes use of the it_page_size from the iommu_table struct
>as page size can differ.
>
>This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
>as recently introduced IOMMU_PAGE_XXX macros do not include
>IOMMU_PAGE_SHIFT.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 26 +++++++++++++-------------
> 1 file changed, 13 insertions(+), 13 deletions(-)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 735b308..64300cc 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
> 	 * enforcing the limit based on the max that the guest can map.
> 	 */
> 	down_write(&current->mm->mmap_sem);
>-	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
>+	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> 	locked = current->mm->locked_vm + npages;
> 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> 	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container *container)
>
> 	down_write(&current->mm->mmap_sem);
> 	current->mm->locked_vm -= (container->tbl->it_size <<
>-			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
>+			container->tbl->it_page_shift) >> PAGE_SHIFT;
> 	up_write(&current->mm->mmap_sem);
> }
>
>@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
> 					tce, ret);
> 			break;
> 		}
>-		tce += IOMMU_PAGE_SIZE_4K;
>+		tce += IOMMU_PAGE_SIZE(tbl);
> 	}
>
> 	if (ret)
>@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (info.argsz < minsz)
> 			return -EINVAL;
>
>-		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
>-		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
>+		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
>+		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
> 		info.flags = 0;
>
> 		if (copy_to_user((void __user *)arg, &info, minsz))
>@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
> 				VFIO_DMA_MAP_FLAG_WRITE))
> 			return -EINVAL;
>
>-		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
>-				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
>+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
>+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
> 			return -EINVAL;
>
> 		/* iova is checked by the IOMMU API */
>@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
> 			return ret;
>
> 		ret = tce_iommu_build(container, tbl,
>-				param.iova >> IOMMU_PAGE_SHIFT_4K,
>-				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
>+				param.iova >> tbl->it_page_shift,
>+				tce, param.size >> tbl->it_page_shift);
>
> 		iommu_flush_tce(tbl);
>
>@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (param.flags)
> 			return -EINVAL;
>
>-		if (param.size & ~IOMMU_PAGE_MASK_4K)
>+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
> 			return -EINVAL;
>
> 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
>-				param.size >> IOMMU_PAGE_SHIFT_4K);
>+				param.size >> tbl->it_page_shift);
> 		if (ret)
> 			return ret;
>
> 		ret = tce_iommu_clear(container, tbl,
>-				param.iova >> IOMMU_PAGE_SHIFT_4K,
>-				param.size >> IOMMU_PAGE_SHIFT_4K);
>+				param.iova >> tbl->it_page_shift,
>+				param.size >> tbl->it_page_shift);
> 		iommu_flush_tce(tbl);
>
> 		return ret;
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (7 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 08/34] vfio: powerpc/spapr: Use it_page_size Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  6:18   ` Gavin Shan
  2015-05-11 15:38 ` [PATCH kernel v10 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container Alexey Kardashevskiy
                   ` (24 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).
This reworks debug messages to show the current value and the limit.
This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on the fly and we may end up having
a container with no group attached but with the counter incremented.
While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 19 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 64300cc..40583f9 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -29,6 +29,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	if (!npages)
+		return 0;
+
+	down_write(&current->mm->mmap_sem);
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		ret = -ENOMEM;
+	else
+		current->mm->locked_vm += npages;
+
+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK),
+			ret ? " - exceeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+	if (!current || !current->mm || !npages)
+		return; /* process exited */
+
+	down_write(&current->mm->mmap_sem);
+	if (npages > current->mm->locked_vm)
+		npages = current->mm->locked_vm;
+	current->mm->locked_vm -= npages;
+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -45,6 +90,7 @@ struct tce_container {
 	struct mutex lock;
 	struct iommu_table *tbl;
 	bool enabled;
+	unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
-	unsigned long locked, lock_limit, npages;
+	unsigned long locked;
 	struct iommu_table *tbl = container->tbl;
 
 	if (!container->tbl)
@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 	 * that would effectively kill the guest at random points, much better
 	 * enforcing the limit based on the max that the guest can map.
+	 *
+	 * Unfortunately at the moment it counts whole tables, no matter how
+	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+	 * each with 2GB DMA window, 8GB will be counted here. The reason for
+	 * this is that we cannot tell here the amount of RAM used by the guest
+	 * as this information is only available from KVM and VFIO is
+	 * KVM agnostic.
 	 */
-	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-	locked = current->mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-				rlimit(RLIMIT_MEMLOCK));
-		ret = -ENOMEM;
-	} else {
+	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+	ret = try_increment_locked_vm(locked);
+	if (ret)
+		return ret;
 
-		current->mm->locked_vm += npages;
-		container->enabled = true;
-	}
-	up_write(&current->mm->mmap_sem);
+	container->locked_pages = locked;
+
+	container->enabled = true;
 
 	return ret;
 }
@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
 
 	container->enabled = false;
 
-	if (!container->tbl || !current->mm)
+	if (!current->mm)
 		return;
 
-	down_write(&current->mm->mmap_sem);
-	current->mm->locked_vm -= (container->tbl->it_size <<
-			container->tbl->it_page_shift) >> PAGE_SHIFT;
-	up_write(&current->mm->mmap_sem);
+	decrement_locked_vm(container->locked_pages);
 }
 
 static void *tce_iommu_open(unsigned long arg)
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers
  2015-05-11 15:38 ` [PATCH kernel v10 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
@ 2015-05-13  6:18   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  6:18 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:58AM +1000, Alexey Kardashevskiy wrote:
>There moves locked pages accounting to helpers.
>Later they will be reused for Dynamic DMA windows (DDW).
>
>This reworks debug messages to show the current value and the limit.
>
>This stores the locked pages number in the container so when unlocking
>the iommu table pointer won't be needed. This does not have an effect
>now but it will with the multiple tables per container as then we will
>allow attaching/detaching groups on fly and we may end up having
>a container with no group attached but with the counter incremented.
>
>While we are here, update the comment explaining why RLIMIT_MEMLOCK
>might be required to be bigger than the guest RAM. This also prints
>pid of the current process in pr_warn/pr_debug.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v4:
>* new helpers do nothing if @npages == 0
>* tce_iommu_disable() now can decrement the counter if the group was
>detached (not possible now but will be in the future)
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
> 1 file changed, 63 insertions(+), 19 deletions(-)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 64300cc..40583f9 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -29,6 +29,51 @@
> static void tce_iommu_detach_group(void *iommu_data,
> 		struct iommu_group *iommu_group);
>
>+static long try_increment_locked_vm(long npages)
>+{
>+	long ret = 0, locked, lock_limit;
>+
>+	if (!current || !current->mm)
>+		return -ESRCH; /* process exited */
>+
>+	if (!npages)
>+		return 0;
>+
>+	down_write(&current->mm->mmap_sem);
>+	locked = current->mm->locked_vm + npages;
>+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>+	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
>+		ret = -ENOMEM;
>+	else
>+		current->mm->locked_vm += npages;
>+
>+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
>+			npages << PAGE_SHIFT,
>+			current->mm->locked_vm << PAGE_SHIFT,
>+			rlimit(RLIMIT_MEMLOCK),
>+			ret ? " - exceeded" : "");
>+
I'm not sure if current->pid + current->comm can give a bit more
readability or not.
Thanks,
Gavin
>+	up_write(&current->mm->mmap_sem);
>+
>+	return ret;
>+}
>+
>+static void decrement_locked_vm(long npages)
>+{
>+	if (!current || !current->mm || !npages)
>+		return; /* process exited */
>+
>+	down_write(&current->mm->mmap_sem);
>+	if (npages > current->mm->locked_vm)
>+		npages = current->mm->locked_vm;
>+	current->mm->locked_vm -= npages;
>+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
>+			npages << PAGE_SHIFT,
>+			current->mm->locked_vm << PAGE_SHIFT,
>+			rlimit(RLIMIT_MEMLOCK));
>+	up_write(&current->mm->mmap_sem);
>+}
>+
> /*
>  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
>  *
>@@ -45,6 +90,7 @@ struct tce_container {
> 	struct mutex lock;
> 	struct iommu_table *tbl;
> 	bool enabled;
>+	unsigned long locked_pages;
> };
>
> static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> static int tce_iommu_enable(struct tce_container *container)
> {
> 	int ret = 0;
>-	unsigned long locked, lock_limit, npages;
>+	unsigned long locked;
> 	struct iommu_table *tbl = container->tbl;
>
> 	if (!container->tbl)
>@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
> 	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
> 	 * that would effectively kill the guest at random points, much better
> 	 * enforcing the limit based on the max that the guest can map.
>+	 *
>+	 * Unfortunately at the moment it counts whole tables, no matter how
>+	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
>+	 * each with 2GB DMA window, 8GB will be counted here. The reason for
>+	 * this is that we cannot tell here the amount of RAM used by the guest
>+	 * as this information is only available from KVM and VFIO is
>+	 * KVM agnostic.
> 	 */
>-	down_write(&current->mm->mmap_sem);
>-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
>-	locked = current->mm->locked_vm + npages;
>-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
>-				rlimit(RLIMIT_MEMLOCK));
>-		ret = -ENOMEM;
>-	} else {
>+	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
>+	ret = try_increment_locked_vm(locked);
>+	if (ret)
>+		return ret;
>
>-		current->mm->locked_vm += npages;
>-		container->enabled = true;
>-	}
>-	up_write(&current->mm->mmap_sem);
>+	container->locked_pages = locked;
>+
>+	container->enabled = true;
>
> 	return ret;
> }
>@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
>
> 	container->enabled = false;
>
>-	if (!container->tbl || !current->mm)
>+	if (!current->mm)
> 		return;
>
>-	down_write(&current->mm->mmap_sem);
>-	current->mm->locked_vm -= (container->tbl->it_size <<
>-			container->tbl->it_page_shift) >> PAGE_SHIFT;
>-	up_write(&current->mm->mmap_sem);
>+	decrement_locked_vm(container->locked_pages);
> }
>
> static void *tce_iommu_open(unsigned long arg)
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (8 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers Alexey Kardashevskiy
@ 2015-05-11 15:38 ` Alexey Kardashevskiy
  2015-05-13  6:20   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers Alexey Kardashevskiy
                   ` (23 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:38 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
At the moment DMA map/unmap requests are handled irrespective of
the container's state. This allows the user space to pin memory which
it might not be allowed to pin.
This adds checks to MAP/UNMAP that the container is enabled, otherwise
-EPERM is returned.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 40583f9..e21479c 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		struct iommu_table *tbl = container->tbl;
 		unsigned long tce;
 
+		if (!container->enabled)
+			return -EPERM;
+
 		if (!tbl)
 			return -ENXIO;
 
@@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		struct vfio_iommu_type1_dma_unmap param;
 		struct iommu_table *tbl = container->tbl;
 
+		if (!container->enabled)
+			return -EPERM;
+
 		if (WARN_ON(!tbl))
 			return -ENXIO;
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container
  2015-05-11 15:38 ` [PATCH kernel v10 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container Alexey Kardashevskiy
@ 2015-05-13  6:20   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  6:20 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:38:59AM +1000, Alexey Kardashevskiy wrote:
>At the moment DMA map/unmap requests are handled irrespective to
>the container's state. This allows the user space to pin memory which
>it might not be allowed to pin.
>
>This adds checks to MAP/UNMAP that the container is enabled, otherwise
>-EPERM is returned.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 40583f9..e21479c 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		struct iommu_table *tbl = container->tbl;
> 		unsigned long tce;
>
>+		if (!container->enabled)
>+			return -EPERM;
>+
> 		if (!tbl)
> 			return -ENXIO;
>
>@@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		struct vfio_iommu_type1_dma_unmap param;
> 		struct iommu_table *tbl = container->tbl;
>
>+		if (!container->enabled)
>+			return -EPERM;
>+
> 		if (WARN_ON(!tbl))
> 			return -ENXIO;
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread 
 
- * [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (9 preceding siblings ...)
  2015-05-11 15:38 ` [PATCH kernel v10 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13  6:32   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 12/34] vfio: powerpc/spapr: Rework groups attaching Alexey Kardashevskiy
                   ` (22 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This is a pretty mechanical patch to make next patches simpler.
New tce_iommu_unuse_page() helper does put_page() now but it might skip
that after the memory registering patch is applied.
As we are here, this removes unnecessary checks for a value returned
by pfn_to_page() as it cannot possibly return NULL.
This moves tce_iommu_disable() later to let tce_iommu_clear() know if
the container has been enabled because if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not yet possible but it will be after the KVM acceleration patchset is
applied.
This changes code to work with physical addresses rather than linear
mapping addresses for better code readability. Following patches will
add an xchg() callback for an IOMMU table which will accept/return
physical addresses (unlike current tce_build()) which will eliminate
redundant conversions.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* changed helpers to work with physical addresses rather than linear
(for simplicity - later ::xchg() will receive physical and avoid
additional conversions)
v6:
* tce_get_hva() returns hva via a pointer
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 20 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index e21479c..115d5e6 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
 	struct tce_container *container = iommu_data;
 
 	WARN_ON(container->tbl && !container->tbl->it_group);
-	tce_iommu_disable(container);
 
 	if (container->tbl && container->tbl->it_group)
 		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
 
+	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
 
 	kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+		unsigned long oldtce)
+{
+	struct page *page;
+
+	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
+		return;
+
+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+	if (oldtce & TCE_PCI_WRITE)
+		SetPageDirty(page);
+
+	put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
 {
 	unsigned long oldtce;
-	struct page *page;
 
 	for ( ; pages; --pages, ++entry) {
 		oldtce = iommu_clear_tce(tbl, entry);
 		if (!oldtce)
 			continue;
 
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-				SetPageDirty(page);
-			put_page(page);
-		}
+		tce_iommu_unuse_page(container, oldtce);
 	}
 
 	return 0;
 }
 
+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
+{
+	struct page *page = NULL;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	if (get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page) != 1)
+		return -EFAULT;
+
+	*hpa = __pa((unsigned long) page_address(page));
+
+	return 0;
+}
+
 static long tce_iommu_build(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long tce, unsigned long pages)
 {
 	long i, ret = 0;
-	struct page *page = NULL;
-	unsigned long hva;
+	struct page *page;
+	unsigned long hpa;
 	enum dma_data_direction direction = iommu_tce_direction(tce);
 
 	for (i = 0; i < pages; ++i) {
 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 
-		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-				direction != DMA_TO_DEVICE, &page);
-		if (unlikely(ret != 1)) {
-			ret = -EFAULT;
+		ret = tce_iommu_use_page(tce, &hpa);
+		if (ret)
 			break;
-		}
 
+		page = pfn_to_page(hpa >> PAGE_SHIFT);
 		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
 
-		hva = (unsigned long) page_address(page) + offset;
-
-		ret = iommu_tce_build(tbl, entry + i, hva, direction);
+		hpa |= offset;
+		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
+				direction);
 		if (ret) {
-			put_page(page);
+			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 					__func__, entry << tbl->it_page_shift,
 					tce, ret);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers
  2015-05-11 15:39 ` [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers Alexey Kardashevskiy
@ 2015-05-13  6:32   ` Gavin Shan
  2015-05-13  7:30     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-13  6:32 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:00AM +1000, Alexey Kardashevskiy wrote:
>This is a pretty mechanical patch to make next patches simpler.
>
>New tce_iommu_unuse_page() helper does put_page() now but it might skip
>that after the memory registering patch applied.
>
>As we are here, this removes unnecessary checks for a value returned
>by pfn_to_page() as it cannot possibly return NULL.
>
>This moves tce_iommu_disable() later to let tce_iommu_clear() know if
>the container has been enabled because if it has not been, then
>put_page() must not be called on TCEs from the TCE table. This situation
>is not yet possible but it will after KVM acceleration patchset is
>applied.
>
>This changes code to work with physical addresses rather than linear
>mapping addresses for better code readability. Following patches will
>add an xchg() callback for an IOMMU table which will accept/return
>physical addresses (unlike current tce_build()) which will eliminate
>redundant conversions.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v9:
>* changed helpers to work with physical addresses rather than linear
>(for simplicity - later ::xchg() will receive physical and avoid
>additional convertions)
>
>v6:
>* tce_get_hva() returns hva via a pointer
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++------------
> 1 file changed, 41 insertions(+), 20 deletions(-)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index e21479c..115d5e6 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
> 	struct tce_container *container = iommu_data;
>
> 	WARN_ON(container->tbl && !container->tbl->it_group);
>-	tce_iommu_disable(container);
>
> 	if (container->tbl && container->tbl->it_group)
> 		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>
>+	tce_iommu_disable(container);
> 	mutex_destroy(&container->lock);
>
> 	kfree(container);
> }
>
>+static void tce_iommu_unuse_page(struct tce_container *container,
>+		unsigned long oldtce)
>+{
>+	struct page *page;
>+
>+	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
>+		return;
>+
It might be worthwhile to have a global helper function in iommu.h to check
whether the given TCE entry is empty or not, for better readability. I would
think the helper function would be used here and there :-)
Thanks,
Gavin
>+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
>+
>+	if (oldtce & TCE_PCI_WRITE)
>+		SetPageDirty(page);
>+
>+	put_page(page);
>+}
>+
> static int tce_iommu_clear(struct tce_container *container,
> 		struct iommu_table *tbl,
> 		unsigned long entry, unsigned long pages)
> {
> 	unsigned long oldtce;
>-	struct page *page;
>
> 	for ( ; pages; --pages, ++entry) {
> 		oldtce = iommu_clear_tce(tbl, entry);
> 		if (!oldtce)
> 			continue;
>
>-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>-		WARN_ON(!page);
>-		if (page) {
>-			if (oldtce & TCE_PCI_WRITE)
>-				SetPageDirty(page);
>-			put_page(page);
>-		}
>+		tce_iommu_unuse_page(container, oldtce);
> 	}
>
> 	return 0;
> }
>
>+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
>+{
>+	struct page *page = NULL;
>+	enum dma_data_direction direction = iommu_tce_direction(tce);
>+
>+	if (get_user_pages_fast(tce & PAGE_MASK, 1,
>+			direction != DMA_TO_DEVICE, &page) != 1)
>+		return -EFAULT;
>+
>+	*hpa = __pa((unsigned long) page_address(page));
>+
>+	return 0;
>+}
>+
> static long tce_iommu_build(struct tce_container *container,
> 		struct iommu_table *tbl,
> 		unsigned long entry, unsigned long tce, unsigned long pages)
> {
> 	long i, ret = 0;
>-	struct page *page = NULL;
>-	unsigned long hva;
>+	struct page *page;
>+	unsigned long hpa;
> 	enum dma_data_direction direction = iommu_tce_direction(tce);
>
> 	for (i = 0; i < pages; ++i) {
> 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
>
>-		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>-				direction != DMA_TO_DEVICE, &page);
>-		if (unlikely(ret != 1)) {
>-			ret = -EFAULT;
>+		ret = tce_iommu_use_page(tce, &hpa);
>+		if (ret)
> 			break;
>-		}
>
>+		page = pfn_to_page(hpa >> PAGE_SHIFT);
> 		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> 			ret = -EPERM;
> 			break;
> 		}
>
>-		hva = (unsigned long) page_address(page) + offset;
>-
>-		ret = iommu_tce_build(tbl, entry + i, hva, direction);
>+		hpa |= offset;
>+		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
>+				direction);
> 		if (ret) {
>-			put_page(page);
>+			tce_iommu_unuse_page(container, hpa);
> 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
> 					__func__, entry << tbl->it_page_shift,
> 					tce, ret);
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers
  2015-05-13  6:32   ` Gavin Shan
@ 2015-05-13  7:30     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-13  7:30 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/13/2015 04:32 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:00AM +1000, Alexey Kardashevskiy wrote:
>> This is a pretty mechanical patch to make next patches simpler.
>>
>> New tce_iommu_unuse_page() helper does put_page() now but it might skip
>> that after the memory registering patch applied.
>>
>> As we are here, this removes unnecessary checks for a value returned
>> by pfn_to_page() as it cannot possibly return NULL.
>>
>> This moves tce_iommu_disable() later to let tce_iommu_clear() know if
>> the container has been enabled because if it has not been, then
>> put_page() must not be called on TCEs from the TCE table. This situation
>> is not yet possible but it will after KVM acceleration patchset is
>> applied.
>>
>> This changes code to work with physical addresses rather than linear
>> mapping addresses for better code readability. Following patches will
>> add an xchg() callback for an IOMMU table which will accept/return
>> physical addresses (unlike current tce_build()) which will eliminate
>> redundant conversions.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> [aw: for the vfio related changes]
>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v9:
>> * changed helpers to work with physical addresses rather than linear
>> (for simplicity - later ::xchg() will receive physical and avoid
>> additional convertions)
>>
>> v6:
>> * tce_get_hva() returns hva via a pointer
>> ---
>> drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++------------
>> 1 file changed, 41 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index e21479c..115d5e6 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
>> 	struct tce_container *container = iommu_data;
>>
>> 	WARN_ON(container->tbl && !container->tbl->it_group);
>> -	tce_iommu_disable(container);
>>
>> 	if (container->tbl && container->tbl->it_group)
>> 		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>>
>> +	tce_iommu_disable(container);
>> 	mutex_destroy(&container->lock);
>>
>> 	kfree(container);
>> }
>>
>> +static void tce_iommu_unuse_page(struct tce_container *container,
>> +		unsigned long oldtce)
>> +{
>> +	struct page *page;
>> +
>> +	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
>> +		return;
>> +
>
> It might be worthy to have a global helper function in iommu.h to check
> if the given TCE entry is empty or not, for better readability. I would
> think the helper function is used here and there :-)
The patchset adds one later, called iommu_tce_direction() ;)
In general, I removed TCE_PCI_READ, TCE_PCI_WRITE from everywhere but
powernv code and used enum dma_data_direction instead, as these bits are
SPAPR TCE protocol specific and the VFIO IOMMU API and
arch/powerpc/kernel/iommu.c do not receive/handle these bits; they have
their own, platform-independent ones.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 12/34] vfio: powerpc/spapr: Rework groups attaching
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (10 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13 23:35   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
                   ` (21 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This is to make extended ownership and multiple groups support patches
simpler for review.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 40 ++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 16 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 115d5e6..0fbe03e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data,
 				iommu_group_id(container->tbl->it_group),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
-	} else if (container->enabled) {
+		goto unlock_exit;
+	}
+
+	if (container->enabled) {
 		pr_err("tce_vfio: attaching group #%u to enabled container\n",
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
-	} else {
-		ret = iommu_take_ownership(tbl);
-		if (!ret)
-			container->tbl = tbl;
+		goto unlock_exit;
 	}
 
+	ret = iommu_take_ownership(tbl);
+	if (!ret)
+		container->tbl = tbl;
+
+unlock_exit:
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data,
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
 				iommu_group_id(tbl->it_group));
-	} else {
-		if (container->enabled) {
-			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-					iommu_group_id(tbl->it_group));
-			tce_iommu_disable(container);
-		}
+		goto unlock_exit;
+	}
 
-		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-				iommu_group_id(iommu_group), iommu_group); */
-		container->tbl = NULL;
-		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-		iommu_release_ownership(tbl);
+	if (container->enabled) {
+		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+				iommu_group_id(tbl->it_group));
+		tce_iommu_disable(container);
 	}
+
+	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+	   iommu_group_id(iommu_group), iommu_group); */
+	container->tbl = NULL;
+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+	iommu_release_ownership(tbl);
+
+unlock_exit:
 	mutex_unlock(&container->lock);
 }
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 12/34] vfio: powerpc/spapr: Rework groups attaching
  2015-05-11 15:39 ` [PATCH kernel v10 12/34] vfio: powerpc/spapr: Rework groups attaching Alexey Kardashevskiy
@ 2015-05-13 23:35   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-13 23:35 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:01AM +1000, Alexey Kardashevskiy wrote:
>This is to make extended ownership and multiple groups support patches
>simpler for review.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
> drivers/vfio/vfio_iommu_spapr_tce.c | 40 ++++++++++++++++++++++---------------
> 1 file changed, 24 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 115d5e6..0fbe03e 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data,
> 				iommu_group_id(container->tbl->it_group),
> 				iommu_group_id(iommu_group));
> 		ret = -EBUSY;
>-	} else if (container->enabled) {
>+		goto unlock_exit;
>+	}
>+
>+	if (container->enabled) {
> 		pr_err("tce_vfio: attaching group #%u to enabled container\n",
> 				iommu_group_id(iommu_group));
> 		ret = -EBUSY;
>-	} else {
>-		ret = iommu_take_ownership(tbl);
>-		if (!ret)
>-			container->tbl = tbl;
>+		goto unlock_exit;
> 	}
>
>+	ret = iommu_take_ownership(tbl);
>+	if (!ret)
>+		container->tbl = tbl;
>+
>+unlock_exit:
> 	mutex_unlock(&container->lock);
>
> 	return ret;
>@@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data,
> 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> 				iommu_group_id(iommu_group),
> 				iommu_group_id(tbl->it_group));
>-	} else {
>-		if (container->enabled) {
>-			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
>-					iommu_group_id(tbl->it_group));
>-			tce_iommu_disable(container);
>-		}
>+		goto unlock_exit;
>+	}
>
>-		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>-				iommu_group_id(iommu_group), iommu_group); */
>-		container->tbl = NULL;
>-		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>-		iommu_release_ownership(tbl);
>+	if (container->enabled) {
>+		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
>+				iommu_group_id(tbl->it_group));
>+		tce_iommu_disable(container);
> 	}
>+
>+	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
>+	   iommu_group_id(iommu_group), iommu_group); */
>+	container->tbl = NULL;
>+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>+	iommu_release_ownership(tbl);
>+
>+unlock_exit:
> 	mutex_unlock(&container->lock);
> }
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (11 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 12/34] vfio: powerpc/spapr: Rework groups attaching Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  0:00   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
                   ` (20 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
Normally a bitmap from the iommu_table is used to track what TCE entry
is in use. Since we are going to use iommu_table without its locks and
do xchg() instead, it becomes essential not to put bits which are not
implied in the direction flag as the old TCE value (more precisely -
the permission bits) will be used to decide whether to put the page or not.
This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* added comment why we must put only valid permission bits
---
 arch/powerpc/include/asm/iommu.h     |  1 +
 arch/powerpc/kernel/iommu.c          | 15 +++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  7 +------
 3 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e94a5e3..d91bd69 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 8673c94..31319f8 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -863,6 +863,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 	}
 }
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return TCE_PCI_READ | TCE_PCI_WRITE;
+	case DMA_FROM_DEVICE:
+		return TCE_PCI_WRITE;
+	case DMA_TO_DEVICE:
+		return TCE_PCI_READ;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 #ifdef CONFIG_IOMMU_API
 /*
  * SPAPR TCE API
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index bca2aeb..b7ea245 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 			 unsigned long uaddr, enum dma_data_direction direction,
 			 struct dma_attrs *attrs, bool rm)
 {
-	u64 proto_tce;
+	u64 proto_tce = iommu_direction_to_tce_perm(direction);
 	__be64 *tcep, *tces;
 	u64 rpn;
 
-	proto_tce = TCE_PCI_READ; // Read allowed
-
-	if (direction != DMA_TO_DEVICE)
-		proto_tce |= TCE_PCI_WRITE;
-
 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
  2015-05-11 15:39 ` [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
@ 2015-05-14  0:00   ` Gavin Shan
  2015-05-14  2:51     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  0:00 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:02AM +1000, Alexey Kardashevskiy wrote:
>Normally a bitmap from the iommu_table is used to track what TCE entry
>is in use. Since we are going to use iommu_table without its locks and
>do xchg() instead, it becomes essential not to put bits which are not
>implied in the direction flag as the old TCE value (more precisely -
>the permission bits) will be used to decide whether to put the page or not.
>
>This adds iommu_direction_to_tce_perm() (its counterpart is there already)
>and uses it for powernv's pnv_tce_build().
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v9:
>* added comment why we must put only valid permission bits
>---
> arch/powerpc/include/asm/iommu.h     |  1 +
> arch/powerpc/kernel/iommu.c          | 15 +++++++++++++++
> arch/powerpc/platforms/powernv/pci.c |  7 +------
> 3 files changed, 17 insertions(+), 6 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index e94a5e3..d91bd69 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
> extern void iommu_release_ownership(struct iommu_table *tbl);
>
> extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
>+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
>
> #endif /* __KERNEL__ */
> #endif /* _ASM_IOMMU_H */
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 8673c94..31319f8 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -863,6 +863,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> 	}
> }
>
>+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
>+{
>+	switch (dir) {
>+	case DMA_BIDIRECTIONAL:
>+		return TCE_PCI_READ | TCE_PCI_WRITE;
>+	case DMA_FROM_DEVICE:
>+		return TCE_PCI_WRITE;
>+	case DMA_TO_DEVICE:
>+		return TCE_PCI_READ;
>+	default:
>+		return 0;
It might be nice to have a WARN_ON() or log for the default case. If the TCE
entry is going to be updated without permission bits by ppc_md.tce_build(),
the DMA operation covered by this TCE entry will cause an EEH error. More
logs would be helpful to locate the root cause of the EEH error :-)
>+	}
>+}
>+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>+
The function converts generic permission flags to PCI specific flags as
the names (TCE_PCI_{READ,WRITE}) indicate. I'm not sure if it's reasonable
to have function name iommu_direction_to_pci_tce_perm(). Platform devices
that have DMA capability might have different flags other than TCE_PCI_{READ,WRITE}
and possibly use iommu.c to manage a platform specific TCE table. We might
not have the use case for now, so I'm not sure it makes sense to have a
more specific function name.
Thanks,
Gavin
> #ifdef CONFIG_IOMMU_API
> /*
>  * SPAPR TCE API
>diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>index bca2aeb..b7ea245 100644
>--- a/arch/powerpc/platforms/powernv/pci.c
>+++ b/arch/powerpc/platforms/powernv/pci.c
>@@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> 			 unsigned long uaddr, enum dma_data_direction direction,
> 			 struct dma_attrs *attrs, bool rm)
> {
>-	u64 proto_tce;
>+	u64 proto_tce = iommu_direction_to_tce_perm(direction);
> 	__be64 *tcep, *tces;
> 	u64 rpn;
>
>-	proto_tce = TCE_PCI_READ; // Read allowed
>-
>-	if (direction != DMA_TO_DEVICE)
>-		proto_tce |= TCE_PCI_WRITE;
>-
> 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
> 	rpn = __pa(uaddr) >> tbl->it_page_shift;
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
  2015-05-14  0:00   ` Gavin Shan
@ 2015-05-14  2:51     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  2:51 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 10:00 AM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:02AM +1000, Alexey Kardashevskiy wrote:
>> Normally a bitmap from the iommu_table is used to track what TCE entry
>> is in use. Since we are going to use iommu_table without its locks and
>> do xchg() instead, it becomes essential not to put bits which are not
>> implied in the direction flag as the old TCE value (more precisely -
>> the permission bits) will be used to decide whether to put the page or not.
>>
>> This adds iommu_direction_to_tce_perm() (its counterpart is there already)
>> and uses it for powernv's pnv_tce_build().
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v9:
>> * added comment why we must put only valid permission bits
>> ---
>> arch/powerpc/include/asm/iommu.h     |  1 +
>> arch/powerpc/kernel/iommu.c          | 15 +++++++++++++++
>> arch/powerpc/platforms/powernv/pci.c |  7 +------
>> 3 files changed, 17 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index e94a5e3..d91bd69 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
>> extern void iommu_release_ownership(struct iommu_table *tbl);
>>
>> extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
>> +extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
>>
>> #endif /* __KERNEL__ */
>> #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 8673c94..31319f8 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -863,6 +863,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>> 	}
>> }
>>
>> +unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
>> +{
>> +	switch (dir) {
>> +	case DMA_BIDIRECTIONAL:
>> +		return TCE_PCI_READ | TCE_PCI_WRITE;
>> +	case DMA_FROM_DEVICE:
>> +		return TCE_PCI_WRITE;
>> +	case DMA_TO_DEVICE:
>> +		return TCE_PCI_READ;
>> +	default:
>> +		return 0;
>
> It might be nice to have a WARN_ON() or log for the default case. If the TCE
> entry is going to be updated without permission bits by ppc_md.tce_build().
If this is happening in pnv_tce_build() (which is for the host DMA only) -
it is quite late to trace anything, we are totally screwed by then.
If you are talking about VFIO (pnv_tce_xchg()), we calculate
enum dma_data_direction from the VFIO permission bits so a wrong value won't
be passed here at all.
> The DMA operation covered by this TCE entry will cause EEH error. More
> logs would be helpful to locate the root cause of the EEH error :-)
>
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>> +
>
> The function converts generic permission flags to PCI specific flags as
> the names (TCE_PCI_{READ,WRITE}) indicates. I'm not sure if it's reasonable
> to have function name iommu_direction_to_pci_tce_perm(). Platform devices
> who have DMA capability might have different flags other than TCE_PCI_{READ,WRITE}
> and possibly use iommu.c to manage platform specific TCE table. We might
> not have the use case for now, so I'm not sure it makes sense to have a
> more specific function name.
"tce" is for SPAPR TCE protocol so the function does a pretty certain thing.
It might not be the best place for this function (powernv/pci.c seems to be
better) but I use this function from POWERNV and KVM and I have to either
duplicate these helpers in POWERNV and KVM or put them in some common place,
and where it is now is that place.
And its counterpart - iommu_tce_direction - is there already. We may move 
these somewhere else later if we want.
> Thanks,
> Gavin
>
>> #ifdef CONFIG_IOMMU_API
>> /*
>>   * SPAPR TCE API
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index bca2aeb..b7ea245 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>> 			 unsigned long uaddr, enum dma_data_direction direction,
>> 			 struct dma_attrs *attrs, bool rm)
>> {
>> -	u64 proto_tce;
>> +	u64 proto_tce = iommu_direction_to_tce_perm(direction);
>> 	__be64 *tcep, *tces;
>> 	u64 rpn;
>>
>> -	proto_tce = TCE_PCI_READ; // Read allowed
>> -
>> -	if (direction != DMA_TO_DEVICE)
>> -		proto_tce |= TCE_PCI_WRITE;
>> -
>> 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>> 	rpn = __pa(uaddr) >> tbl->it_page_shift;
>>
>> --
>> 2.4.0.rc3.8.gfb3e7d5
>>
>
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (12 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 13/34] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  0:23   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() Alexey Kardashevskiy
                   ` (19 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This adds an iommu_table_ops struct and puts a pointer to it into
the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
callbacks from ppc_md to the new struct where they really belong.
This adds the requirement for @it_ops to be initialized before calling
iommu_init_table() to make sure that we do not leave any IOMMU table
with iommu_table_ops uninitialized. This is not a parameter of
iommu_init_table() though as there will be cases when iommu_init_table()
will not be called on TCE tables, for example - VFIO.
This does s/tce_build/set/, s/tce_free/clear/ and removes redundant
"tce_" prefixes.
This removes tce_xxx_rm handlers from ppc_md but does not add
them to iommu_table_ops as this will be done later if we decide to
support TCE hypercalls in real mode. This removes _vm callbacks as
only virtual mode is supported for now, so this also removes the @rm parameter.
For pSeries, this always uses tce_buildmulti_pSeriesLP/
tce_freemulti_pSeriesLP. This changes the multi callbacks to fall back to
tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
present. The reason for this is that we still have to support the
"multitce=off" boot parameter in disable_multitce() and we do not want to
walk through all IOMMU tables in the system and replace "multi" callbacks
with single ones.
For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
This makes the callbacks for them public. Later patches will extend
callbacks for IODA1/2.
No change in behaviour is expected.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* pnv_tce_build/pnv_tce_free/pnv_tce_get have been made public and lost
"rm" parameters to make following patches simpler (realmode is not
supported here anyway)
* got rid of _vm versions of callbacks
---
 arch/powerpc/include/asm/iommu.h            | 17 +++++++++++
 arch/powerpc/include/asm/machdep.h          | 25 ---------------
 arch/powerpc/kernel/iommu.c                 | 46 ++++++++++++++--------------
 arch/powerpc/kernel/vio.c                   |  5 +++
 arch/powerpc/platforms/cell/iommu.c         |  8 +++--
 arch/powerpc/platforms/pasemi/iommu.c       |  7 +++--
 arch/powerpc/platforms/powernv/pci-ioda.c   | 14 +++++++++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 +++++
 arch/powerpc/platforms/powernv/pci.c        | 47 +++++------------------------
 arch/powerpc/platforms/powernv/pci.h        |  5 +++
 arch/powerpc/platforms/pseries/iommu.c      | 34 ++++++++++++---------
 arch/powerpc/sysdev/dart_iommu.c            | 12 +++++---
 12 files changed, 116 insertions(+), 111 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d91bd69..e2a45c3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,6 +44,22 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+	int (*set)(struct iommu_table *tbl,
+			long index, long npages,
+			unsigned long uaddr,
+			enum dma_data_direction direction,
+			struct dma_attrs *attrs);
+	void (*clear)(struct iommu_table *tbl,
+			long index, long npages);
+	unsigned long (*get)(struct iommu_table *tbl, long index);
+	void (*flush)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -78,6 +94,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
 	struct iommu_group *it_group;
 #endif
+	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 #ifdef CONFIG_PPC_POWERNV
 	void           *data;
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index ef889943..ab721b4 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
 
-	int		(*tce_build)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	unsigned long	(*tce_get)(struct iommu_table *tbl,
-				    long index);
-	void		(*tce_flush)(struct iommu_table *tbl);
-
-	/* _rm versions are for real mode use only */
-	int		(*tce_build_rm)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free_rm)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	void		(*tce_flush_rm)(struct iommu_table *tbl);
-
 	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
 				   unsigned long flags, void *caller);
 	void		(*iounmap)(volatile void __iomem *token);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 31319f8..16be6aa 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	build_fail = tbl->it_ops->set(tbl, entry, npages,
 				      (unsigned long)page &
 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
-	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	/* tbl->it_ops->set() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
 	 * DMA_ERROR_CODE. For all other errors the functionality is
 	 * not altered.
@@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	if (!iommu_free_check(tbl, dma_addr, npages))
 		return;
 
-	ppc_md.tce_free(tbl, entry, npages);
+	tbl->it_ops->clear(tbl, entry, npages);
 
 	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
@@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 * not do an mb() here on purpose, it is not needed on any of
 	 * the current platforms.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		build_fail = tbl->it_ops->set(tbl, entry, npages,
 					      vaddr & IOMMU_PAGE_MASK(tbl),
 					      direction, attrs);
 		if(unlikely(build_fail))
@@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	DBG("mapped %d elements:\n", outcount);
 
@@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 * do not do an mb() here, the affected platforms do not need it
 	 * when freeing.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
@@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl)
 	 */
 	if (!is_kdump_kernel() || is_fadump_active()) {
 		/* Clear the table in case firmware left allocations in it */
-		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
+		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
 		return;
 	}
 
 #ifdef CONFIG_CRASH_DUMP
-	if (ppc_md.tce_get) {
+	if (tbl->it_ops->get) {
 		unsigned long index, tceval, tcecount = 0;
 
 		/* Reserve the existing mappings left by the first kernel. */
 		for (index = 0; index < tbl->it_size; index++) {
-			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
+			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
 			/*
 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
 			 */
@@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned int i;
 	struct iommu_pool *p;
 
+	BUG_ON(!tbl->it_ops);
+
 	/* number of bytes needed for the bitmap */
 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
@@ -926,8 +928,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction);
 void iommu_flush_tce(struct iommu_table *tbl)
 {
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -938,7 +940,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce_value,
 		unsigned long npages)
 {
-	/* ppc_md.tce_free() does not support any value but 0 */
+	/* tbl->it_ops->clear() does not support any value but 0 */
 	if (tce_value)
 		return -EINVAL;
 
@@ -986,9 +988,9 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 
 	spin_lock(&(pool->lock));
 
-	oldtce = ppc_md.tce_get(tbl, entry);
+	oldtce = tbl->it_ops->get(tbl, entry);
 	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		ppc_md.tce_free(tbl, entry, 1);
+		tbl->it_ops->clear(tbl, entry, 1);
 	else
 		oldtce = 0;
 
@@ -1011,10 +1013,10 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	spin_lock(&(pool->lock));
 
-	oldtce = ppc_md.tce_get(tbl, entry);
+	oldtce = tbl->it_ops->get(tbl, entry);
 	/* Add new entry if it is not busy */
 	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
 
 	spin_unlock(&(pool->lock));
 
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 5bfdab9..b41426c 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;
 
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
+	else
+		tbl->it_ops = &iommu_table_pseries_ops;
+
 	return iommu_init_table(tbl, -1);
 }
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 21b5023..14a582b 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np)
 	return *ioid;
 }
 
+static struct iommu_table_ops cell_iommu_ops = {
+	.set = tce_build_cell,
+	.clear = tce_free_cell
+};
+
 static struct iommu_window * __init
 cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 			unsigned long offset, unsigned long size,
@@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_offset =
 		(offset >> window->table.it_page_shift) + pte_offset;
 	window->table.it_size = size >> window->table.it_page_shift;
+	window->table.it_ops = &cell_iommu_ops;
 
 	iommu_init_table(&window->table, iommu->nid);
 
@@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void)
 	/* Setup various callbacks */
 	cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
 	ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
-	ppc_md.tce_build = tce_build_cell;
-	ppc_md.tce_free = tce_free_cell;
 
 	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
 		goto bail;
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index b8f567b..c929644 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index,
 	}
 }
 
+static struct iommu_table_ops iommu_table_iobmap_ops = {
+	.set = iobmap_build,
+	.clear  = iobmap_free
+};
 
 static void iommu_table_iobmap_setup(void)
 {
@@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void)
 	 * Should probably be 8 (64 bytes)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
+	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
 	iommu_init_table(&iommu_table_iobmap, 0);
 	pr_debug(" <- %s\n", __func__);
 }
@@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void)
 
 	pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi;
 	pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
-	ppc_md.tce_build = iobmap_build;
-	ppc_md.tce_free  = iobmap_free;
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8c3c4bf..2924abe 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1725,6 +1725,12 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 					 struct iommu_table *tbl,
 					 __be64 *startp, __be64 *endp, bool rm)
@@ -1769,6 +1775,12 @@ void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
 }
 
+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				      struct pnv_ioda_pe *pe, unsigned int base,
 				      unsigned int segs)
@@ -1844,6 +1856,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_FREE   |
 				 TCE_PCI_SWINV_PAIR);
 	}
+	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
@@ -1972,6 +1985,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				8);
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 	}
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index b17d93615..2722c1a 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -83,10 +83,17 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
 static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 #endif /* CONFIG_PCI_MSI */
 
+static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
 	if (phb->p5ioc2.iommu_table.it_map == NULL) {
+		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
 		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
 		iommu_register_group(&phb->p5ioc2.iommu_table,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index b7ea245..4c3bbb1 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,9 +572,9 @@ struct pci_ops pnv_pci_ops = {
 	.write = pnv_pci_write_config,
 };
 
-static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-			 unsigned long uaddr, enum dma_data_direction direction,
-			 struct dma_attrs *attrs, bool rm)
+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
 	__be64 *tcep, *tces;
@@ -592,22 +592,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	 * of flags if that becomes the case
 	 */
 	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 
 	return 0;
 }
 
-static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
-{
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
-			false);
-}
-
-static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
-		bool rm)
+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
 	__be64 *tcep, *tces;
 
@@ -617,32 +607,14 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
 		*(tcep++) = cpu_to_be64(0);
 
 	if (tbl->it_type & TCE_PCI_SWINV_FREE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 }
 
-static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
-{
-	pnv_tce_free(tbl, index, npages, false);
-}
-
-static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
 }
 
-static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
-{
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
-}
-
-static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
-{
-	pnv_tce_free(tbl, index, npages, true);
-}
-
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 			       void *tce_mem, u64 tce_size,
 			       u64 dma_offset, unsigned page_shift)
@@ -757,11 +729,6 @@ void __init pnv_pci_init(void)
 	pci_devs_phb_init();
 
 	/* Configure IOMMU DMA hooks */
-	ppc_md.tce_build = pnv_tce_build_vm;
-	ppc_md.tce_free = pnv_tce_free_vm;
-	ppc_md.tce_build_rm = pnv_tce_build_rm;
-	ppc_md.tce_free_rm = pnv_tce_free_rm;
-	ppc_md.tce_get = pnv_tce_get;
 	set_pci_dma_ops(&dma_iommu_ops);
 
 	/* Configure MSIs */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 070ee88..ec26afd 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -200,6 +200,11 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 				unsigned char *log_buff);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 89f557b..4f2ab90 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -204,7 +204,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	int ret = 0;
 	unsigned long flags;
 
-	if (npages == 1) {
+	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 		                           direction, attrs);
 	}
@@ -296,6 +296,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
 {
 	u64 rc;
 
+	if (!firmware_has_feature(FW_FEATURE_MULTITCE))
+		return tce_free_pSeriesLP(tbl, tcenum, npages);
+
 	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
 
 	if (rc && printk_ratelimit()) {
@@ -471,7 +474,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
 	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
-
 #ifdef CONFIG_PCI
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
@@ -557,6 +559,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_size = size >> tbl->it_page_shift;
 }
 
+struct iommu_table_ops iommu_table_pseries_ops = {
+	.set = tce_build_pSeries,
+	.clear = tce_free_pSeries,
+	.get = tce_get_pseries
+};
+
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 {
 	struct device_node *dn;
@@ -625,6 +633,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 			   pci->phb->node);
 
 	iommu_table_setparms(pci->phb, dn, tbl);
+	tbl->it_ops = &iommu_table_pseries_ops;
 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
 	iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
@@ -636,6 +645,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+	.set = tce_buildmulti_pSeriesLP,
+	.clear = tce_freemulti_pSeriesLP,
+	.get = tce_get_pSeriesLP
+};
 
 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 {
@@ -670,6 +684,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   ppci->phb->node);
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
 		iommu_register_group(tbl, pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->iommu_table);
@@ -697,6 +712,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   phb->node);
 		iommu_table_setparms(phb, dn, tbl);
+		tbl->it_ops = &iommu_table_pseries_ops;
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, tbl);
@@ -1119,6 +1135,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   pci->phb->node);
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
 		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->iommu_table);
@@ -1313,22 +1330,11 @@ void iommu_init_early_pSeries(void)
 		return;
 
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
-		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
-			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
-			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
-		} else {
-			ppc_md.tce_build = tce_build_pSeriesLP;
-			ppc_md.tce_free	 = tce_free_pSeriesLP;
-		}
-		ppc_md.tce_get   = tce_get_pSeriesLP;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
 		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
 		ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
 	} else {
-		ppc_md.tce_build = tce_build_pSeries;
-		ppc_md.tce_free  = tce_free_pSeries;
-		ppc_md.tce_get   = tce_get_pseries;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
 	}
@@ -1346,8 +1352,6 @@ static int __init disable_multitce(char *str)
 	    firmware_has_feature(FW_FEATURE_LPAR) &&
 	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
-		ppc_md.tce_build = tce_build_pSeriesLP;
-		ppc_md.tce_free	 = tce_free_pSeriesLP;
 		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
 	}
 	return 1;
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index d00a566..90bcdfe 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node)
 	return 0;
 }
 
+static struct iommu_table_ops iommu_dart_ops = {
+	.set = dart_build,
+	.clear = dart_free,
+	.flush = dart_flush,
+};
+
 static void iommu_table_dart_setup(void)
 {
 	iommu_table_dart.it_busno = 0;
@@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_base = (unsigned long)dart_vbase;
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
+	iommu_table_dart.it_ops = &iommu_dart_ops;
 	iommu_init_table(&iommu_table_dart, -1);
 
 	/* Reserve the last page of the DART to avoid possible prefetch
@@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
 	if (dart_init(dn) != 0)
 		goto bail;
 
-	/* Setup low level TCE operations for the core IOMMU code */
-	ppc_md.tce_build = dart_build;
-	ppc_md.tce_free  = dart_free;
-	ppc_md.tce_flush = dart_flush;
-
 	/* Setup bypass if supported */
 	if (dart_is_u4)
 		ppc_md.dma_set_mask = dart_dma_set_mask;
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
  2015-05-11 15:39 ` [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
@ 2015-05-14  0:23   ` Gavin Shan
  2015-05-14  3:07     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  0:23 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:03AM +1000, Alexey Kardashevskiy wrote:
>This adds an iommu_table_ops struct and puts a pointer to it into
>the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
>callbacks from ppc_md to the new struct where they really belong.
>
>This adds the requirement for @it_ops to be initialized before calling
>iommu_init_table() to make sure that we do not leave any IOMMU table
>with iommu_table_ops uninitialized. This is not a parameter of
>iommu_init_table() though as there will be cases when iommu_init_table()
>will not be called on TCE tables, for example - VFIO.
>
>This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_"
>redundand prefixes.
>
s/redundand/redundant  I might be wrong because of my bad English.
>This removes tce_xxx_rm handlers from ppc_md but does not add
>them to iommu_table_ops as this will be done later if we decide to
>support TCE hypercalls in real mode. This removes _vm callbacks as
>only virtual mode is supported by now so this also removes @rm parameter.
>
>For pSeries, this always uses tce_buildmulti_pSeriesLP/
>tce_freemulti_pSeriesLP. This changes the multi callbacks to fall back to
>tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
>present. The reason for this is we still have to support "multitce=off"
>boot parameter in disable_multitce() and we do not want to walk through
>all IOMMU tables in the system and replace "multi" callbacks with single
>ones.
>
>For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
>This makes the callbacks for them public. Later patches will extend
>callbacks for IODA1/2.
>
>No change in behaviour is expected.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v9:
>* pnv_tce_build/pnv_tce_free/pnv_tce_get have been made public and lost
>"rm" parameters to make following patches simpler (realmode is not
>supported here anyway)
>* got rid of _vm versions of callbacks
>---
> arch/powerpc/include/asm/iommu.h            | 17 +++++++++++
> arch/powerpc/include/asm/machdep.h          | 25 ---------------
> arch/powerpc/kernel/iommu.c                 | 46 ++++++++++++++--------------
> arch/powerpc/kernel/vio.c                   |  5 +++
> arch/powerpc/platforms/cell/iommu.c         |  8 +++--
> arch/powerpc/platforms/pasemi/iommu.c       |  7 +++--
> arch/powerpc/platforms/powernv/pci-ioda.c   | 14 +++++++++
> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 +++++
> arch/powerpc/platforms/powernv/pci.c        | 47 +++++------------------------
> arch/powerpc/platforms/powernv/pci.h        |  5 +++
> arch/powerpc/platforms/pseries/iommu.c      | 34 ++++++++++++---------
> arch/powerpc/sysdev/dart_iommu.c            | 12 +++++---
> 12 files changed, 116 insertions(+), 111 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index d91bd69..e2a45c3 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -44,6 +44,22 @@
> extern int iommu_is_off;
> extern int iommu_force_on;
>
>+struct iommu_table_ops {
>+	int (*set)(struct iommu_table *tbl,
>+			long index, long npages,
>+			unsigned long uaddr,
>+			enum dma_data_direction direction,
>+			struct dma_attrs *attrs);
>+	void (*clear)(struct iommu_table *tbl,
>+			long index, long npages);
>+	unsigned long (*get)(struct iommu_table *tbl, long index);
>+	void (*flush)(struct iommu_table *tbl);
Currently, there is no flush backend on the PowerNV platform. I'm not sure
whether we have to implement one for PowerNV, or whether we really need it.
Maybe you add it to support DDW in subsequent patches, which I haven't
looked into yet, but I will :-)
>+};
>+
>+/* These are used by VIO */
>+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
>+extern struct iommu_table_ops iommu_table_pseries_ops;
>+
It might be reasonable to add a "struct iommu_table_ops *ops" parameter to
vio_register_device_node() so that the specified "ops" can be hooked to
the newly created IOMMU table. That way, the platform (pSeries) specific
IOMMU table operations don't have to be exposed to the PowerPC subsystem.
Thanks,
Gavin
> /*
>  * IOMAP_MAX_ORDER defines the largest contiguous block
>  * of dma space we can get.  IOMAP_MAX_ORDER = 13
>@@ -78,6 +94,7 @@ struct iommu_table {
> #ifdef CONFIG_IOMMU_API
> 	struct iommu_group *it_group;
> #endif
>+	struct iommu_table_ops *it_ops;
> 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
> #ifdef CONFIG_PPC_POWERNV
> 	void           *data;
>diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>index ef889943..ab721b4 100644
>--- a/arch/powerpc/include/asm/machdep.h
>+++ b/arch/powerpc/include/asm/machdep.h
>@@ -65,31 +65,6 @@ struct machdep_calls {
> 	 * destroyed as well */
> 	void		(*hpte_clear_all)(void);
>
>-	int		(*tce_build)(struct iommu_table *tbl,
>-				     long index,
>-				     long npages,
>-				     unsigned long uaddr,
>-				     enum dma_data_direction direction,
>-				     struct dma_attrs *attrs);
>-	void		(*tce_free)(struct iommu_table *tbl,
>-				    long index,
>-				    long npages);
>-	unsigned long	(*tce_get)(struct iommu_table *tbl,
>-				    long index);
>-	void		(*tce_flush)(struct iommu_table *tbl);
>-
>-	/* _rm versions are for real mode use only */
>-	int		(*tce_build_rm)(struct iommu_table *tbl,
>-				     long index,
>-				     long npages,
>-				     unsigned long uaddr,
>-				     enum dma_data_direction direction,
>-				     struct dma_attrs *attrs);
>-	void		(*tce_free_rm)(struct iommu_table *tbl,
>-				    long index,
>-				    long npages);
>-	void		(*tce_flush_rm)(struct iommu_table *tbl);
>-
> 	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
> 				   unsigned long flags, void *caller);
> 	void		(*iounmap)(volatile void __iomem *token);
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 31319f8..16be6aa 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
> 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
>
> 	/* Put the TCEs in the HW table */
>-	build_fail = ppc_md.tce_build(tbl, entry, npages,
>+	build_fail = tbl->it_ops->set(tbl, entry, npages,
> 				      (unsigned long)page &
> 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
>
>-	/* ppc_md.tce_build() only returns non-zero for transient errors.
>+	/* tbl->it_ops->set() only returns non-zero for transient errors.
> 	 * Clean up the table bitmap in this case and return
> 	 * DMA_ERROR_CODE. For all other errors the functionality is
> 	 * not altered.
>@@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
> 	}
>
> 	/* Flush/invalidate TLB caches if necessary */
>-	if (ppc_md.tce_flush)
>-		ppc_md.tce_flush(tbl);
>+	if (tbl->it_ops->flush)
>+		tbl->it_ops->flush(tbl);
>
> 	/* Make sure updates are seen by hardware */
> 	mb();
>@@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
> 	if (!iommu_free_check(tbl, dma_addr, npages))
> 		return;
>
>-	ppc_md.tce_free(tbl, entry, npages);
>+	tbl->it_ops->clear(tbl, entry, npages);
>
> 	spin_lock_irqsave(&(pool->lock), flags);
> 	bitmap_clear(tbl->it_map, free_entry, npages);
>@@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
> 	 * not do an mb() here on purpose, it is not needed on any of
> 	 * the current platforms.
> 	 */
>-	if (ppc_md.tce_flush)
>-		ppc_md.tce_flush(tbl);
>+	if (tbl->it_ops->flush)
>+		tbl->it_ops->flush(tbl);
> }
>
> int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
>@@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
> 			    npages, entry, dma_addr);
>
> 		/* Insert into HW table */
>-		build_fail = ppc_md.tce_build(tbl, entry, npages,
>+		build_fail = tbl->it_ops->set(tbl, entry, npages,
> 					      vaddr & IOMMU_PAGE_MASK(tbl),
> 					      direction, attrs);
> 		if(unlikely(build_fail))
>@@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
> 	}
>
> 	/* Flush/invalidate TLB caches if necessary */
>-	if (ppc_md.tce_flush)
>-		ppc_md.tce_flush(tbl);
>+	if (tbl->it_ops->flush)
>+		tbl->it_ops->flush(tbl);
>
> 	DBG("mapped %d elements:\n", outcount);
>
>@@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
> 	 * do not do an mb() here, the affected platforms do not need it
> 	 * when freeing.
> 	 */
>-	if (ppc_md.tce_flush)
>-		ppc_md.tce_flush(tbl);
>+	if (tbl->it_ops->flush)
>+		tbl->it_ops->flush(tbl);
> }
>
> static void iommu_table_clear(struct iommu_table *tbl)
>@@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl)
> 	 */
> 	if (!is_kdump_kernel() || is_fadump_active()) {
> 		/* Clear the table in case firmware left allocations in it */
>-		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
>+		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
> 		return;
> 	}
>
> #ifdef CONFIG_CRASH_DUMP
>-	if (ppc_md.tce_get) {
>+	if (tbl->it_ops->get) {
> 		unsigned long index, tceval, tcecount = 0;
>
> 		/* Reserve the existing mappings left by the first kernel. */
> 		for (index = 0; index < tbl->it_size; index++) {
>-			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
>+			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
> 			/*
> 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
> 			 */
>@@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
> 	unsigned int i;
> 	struct iommu_pool *p;
>
>+	BUG_ON(!tbl->it_ops);
>+
> 	/* number of bytes needed for the bitmap */
> 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>
>@@ -926,8 +928,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction);
> void iommu_flush_tce(struct iommu_table *tbl)
> {
> 	/* Flush/invalidate TLB caches if necessary */
>-	if (ppc_md.tce_flush)
>-		ppc_md.tce_flush(tbl);
>+	if (tbl->it_ops->flush)
>+		tbl->it_ops->flush(tbl);
>
> 	/* Make sure updates are seen by hardware */
> 	mb();
>@@ -938,7 +940,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
> 		unsigned long ioba, unsigned long tce_value,
> 		unsigned long npages)
> {
>-	/* ppc_md.tce_free() does not support any value but 0 */
>+	/* tbl->it_ops->clear() does not support any value but 0 */
> 	if (tce_value)
> 		return -EINVAL;
>
>@@ -986,9 +988,9 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
>
> 	spin_lock(&(pool->lock));
>
>-	oldtce = ppc_md.tce_get(tbl, entry);
>+	oldtce = tbl->it_ops->get(tbl, entry);
> 	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
>-		ppc_md.tce_free(tbl, entry, 1);
>+		tbl->it_ops->clear(tbl, entry, 1);
> 	else
> 		oldtce = 0;
>
>@@ -1011,10 +1013,10 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
>
> 	spin_lock(&(pool->lock));
>
>-	oldtce = ppc_md.tce_get(tbl, entry);
>+	oldtce = tbl->it_ops->get(tbl, entry);
> 	/* Add new entry if it is not busy */
> 	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>-		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
>+		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
>
> 	spin_unlock(&(pool->lock));
>
>diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
>index 5bfdab9..b41426c 100644
>--- a/arch/powerpc/kernel/vio.c
>+++ b/arch/powerpc/kernel/vio.c
>@@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
> 	tbl->it_type = TCE_VB;
> 	tbl->it_blocksize = 16;
>
>+	if (firmware_has_feature(FW_FEATURE_LPAR))
>+		tbl->it_ops = &iommu_table_lpar_multi_ops;
>+	else
>+		tbl->it_ops = &iommu_table_pseries_ops;
>+
> 	return iommu_init_table(tbl, -1);
> }
>
>diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
>index 21b5023..14a582b 100644
>--- a/arch/powerpc/platforms/cell/iommu.c
>+++ b/arch/powerpc/platforms/cell/iommu.c
>@@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np)
> 	return *ioid;
> }
>
>+static struct iommu_table_ops cell_iommu_ops = {
>+	.set = tce_build_cell,
>+	.clear = tce_free_cell
>+};
>+
> static struct iommu_window * __init
> cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
> 			unsigned long offset, unsigned long size,
>@@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
> 	window->table.it_offset =
> 		(offset >> window->table.it_page_shift) + pte_offset;
> 	window->table.it_size = size >> window->table.it_page_shift;
>+	window->table.it_ops = &cell_iommu_ops;
>
> 	iommu_init_table(&window->table, iommu->nid);
>
>@@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void)
> 	/* Setup various callbacks */
> 	cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
> 	ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
>-	ppc_md.tce_build = tce_build_cell;
>-	ppc_md.tce_free = tce_free_cell;
>
> 	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
> 		goto bail;
>diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
>index b8f567b..c929644 100644
>--- a/arch/powerpc/platforms/pasemi/iommu.c
>+++ b/arch/powerpc/platforms/pasemi/iommu.c
>@@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index,
> 	}
> }
>
>+static struct iommu_table_ops iommu_table_iobmap_ops = {
>+	.set = iobmap_build,
>+	.clear  = iobmap_free
>+};
>
> static void iommu_table_iobmap_setup(void)
> {
>@@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void)
> 	 * Should probably be 8 (64 bytes)
> 	 */
> 	iommu_table_iobmap.it_blocksize = 4;
>+	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
> 	iommu_init_table(&iommu_table_iobmap, 0);
> 	pr_debug(" <- %s\n", __func__);
> }
>@@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void)
>
> 	pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi;
> 	pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
>-	ppc_md.tce_build = iobmap_build;
>-	ppc_md.tce_free  = iobmap_free;
> 	set_pci_dma_ops(&dma_iommu_ops);
> }
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 8c3c4bf..2924abe 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1725,6 +1725,12 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
> 	 */
> }
>
>+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>+	.set = pnv_tce_build,
>+	.clear = pnv_tce_free,
>+	.get = pnv_tce_get,
>+};
>+
> static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
> 					 struct iommu_table *tbl,
> 					 __be64 *startp, __be64 *endp, bool rm)
>@@ -1769,6 +1775,12 @@ void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
> 		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
> }
>
>+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>+	.set = pnv_tce_build,
>+	.clear = pnv_tce_free,
>+	.get = pnv_tce_get,
>+};
>+
> static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 				      struct pnv_ioda_pe *pe, unsigned int base,
> 				      unsigned int segs)
>@@ -1844,6 +1856,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 				 TCE_PCI_SWINV_FREE   |
> 				 TCE_PCI_SWINV_PAIR);
> 	}
>+	tbl->it_ops = &pnv_ioda1_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
>
> 	if (pe->flags & PNV_IODA_PE_DEV) {
>@@ -1972,6 +1985,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 				8);
> 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
> 	}
>+	tbl->it_ops = &pnv_ioda2_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
>
> 	if (pe->flags & PNV_IODA_PE_DEV) {
>diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>index b17d93615..2722c1a 100644
>--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>@@ -83,10 +83,17 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
> static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
> #endif /* CONFIG_PCI_MSI */
>
>+static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
>+	.set = pnv_tce_build,
>+	.clear = pnv_tce_free,
>+	.get = pnv_tce_get,
>+};
>+
> static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
> 					 struct pci_dev *pdev)
> {
> 	if (phb->p5ioc2.iommu_table.it_map == NULL) {
>+		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
> 		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
> 		iommu_register_group(&phb->p5ioc2.iommu_table,
> 				pci_domain_nr(phb->hose->bus), phb->opal_id);
>diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>index b7ea245..4c3bbb1 100644
>--- a/arch/powerpc/platforms/powernv/pci.c
>+++ b/arch/powerpc/platforms/powernv/pci.c
>@@ -572,9 +572,9 @@ struct pci_ops pnv_pci_ops = {
> 	.write = pnv_pci_write_config,
> };
>
>-static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>-			 unsigned long uaddr, enum dma_data_direction direction,
>-			 struct dma_attrs *attrs, bool rm)
>+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>+		unsigned long uaddr, enum dma_data_direction direction,
>+		struct dma_attrs *attrs)
> {
> 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
> 	__be64 *tcep, *tces;
>@@ -592,22 +592,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> 	 * of flags if that becomes the case
> 	 */
> 	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
>-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
>+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
>
> 	return 0;
> }
>
>-static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
>-			    unsigned long uaddr,
>-			    enum dma_data_direction direction,
>-			    struct dma_attrs *attrs)
>-{
>-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
>-			false);
>-}
>-
>-static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
>-		bool rm)
>+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
> {
> 	__be64 *tcep, *tces;
>
>@@ -617,32 +607,14 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
> 		*(tcep++) = cpu_to_be64(0);
>
> 	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
>+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
> }
>
>-static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
>-{
>-	pnv_tce_free(tbl, index, npages, false);
>-}
>-
>-static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
> {
> 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
> }
>
>-static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
>-			    unsigned long uaddr,
>-			    enum dma_data_direction direction,
>-			    struct dma_attrs *attrs)
>-{
>-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
>-}
>-
>-static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
>-{
>-	pnv_tce_free(tbl, index, npages, true);
>-}
>-
> void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
> 			       void *tce_mem, u64 tce_size,
> 			       u64 dma_offset, unsigned page_shift)
>@@ -757,11 +729,6 @@ void __init pnv_pci_init(void)
> 	pci_devs_phb_init();
>
> 	/* Configure IOMMU DMA hooks */
>-	ppc_md.tce_build = pnv_tce_build_vm;
>-	ppc_md.tce_free = pnv_tce_free_vm;
>-	ppc_md.tce_build_rm = pnv_tce_build_rm;
>-	ppc_md.tce_free_rm = pnv_tce_free_rm;
>-	ppc_md.tce_get = pnv_tce_get;
> 	set_pci_dma_ops(&dma_iommu_ops);
>
> 	/* Configure MSIs */
>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>index 070ee88..ec26afd 100644
>--- a/arch/powerpc/platforms/powernv/pci.h
>+++ b/arch/powerpc/platforms/powernv/pci.h
>@@ -200,6 +200,11 @@ struct pnv_phb {
> };
>
> extern struct pci_ops pnv_pci_ops;
>+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>+		unsigned long uaddr, enum dma_data_direction direction,
>+		struct dma_attrs *attrs);
>+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
>+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
>
> void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
> 				unsigned char *log_buff);
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index 89f557b..4f2ab90 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -204,7 +204,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
> 	int ret = 0;
> 	unsigned long flags;
>
>-	if (npages == 1) {
>+	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
> 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
> 		                           direction, attrs);
> 	}
>@@ -296,6 +296,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
> {
> 	u64 rc;
>
>+	if (!firmware_has_feature(FW_FEATURE_MULTITCE))
>+		return tce_free_pSeriesLP(tbl, tcenum, npages);
>+
> 	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
>
> 	if (rc && printk_ratelimit()) {
>@@ -471,7 +474,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
> 	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
> }
>
>-
> #ifdef CONFIG_PCI
> static void iommu_table_setparms(struct pci_controller *phb,
> 				 struct device_node *dn,
>@@ -557,6 +559,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
> 	tbl->it_size = size >> tbl->it_page_shift;
> }
>
>+struct iommu_table_ops iommu_table_pseries_ops = {
>+	.set = tce_build_pSeries,
>+	.clear = tce_free_pSeries,
>+	.get = tce_get_pseries
>+};
>+
> static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
> {
> 	struct device_node *dn;
>@@ -625,6 +633,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
> 			   pci->phb->node);
>
> 	iommu_table_setparms(pci->phb, dn, tbl);
>+	tbl->it_ops = &iommu_table_pseries_ops;
> 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> 	iommu_register_group(tbl, pci_domain_nr(bus), 0);
>
>@@ -636,6 +645,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
> 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
> }
>
>+struct iommu_table_ops iommu_table_lpar_multi_ops = {
>+	.set = tce_buildmulti_pSeriesLP,
>+	.clear = tce_freemulti_pSeriesLP,
>+	.get = tce_get_pSeriesLP
>+};
>
> static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> {
>@@ -670,6 +684,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> 				   ppci->phb->node);
> 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
>+		tbl->it_ops = &iommu_table_lpar_multi_ops;
> 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
> 		iommu_register_group(tbl, pci_domain_nr(bus), 0);
> 		pr_debug("  created table: %p\n", ppci->iommu_table);
>@@ -697,6 +712,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> 				   phb->node);
> 		iommu_table_setparms(phb, dn, tbl);
>+		tbl->it_ops = &iommu_table_pseries_ops;
> 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
> 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
> 		set_iommu_table_base(&dev->dev, tbl);
>@@ -1119,6 +1135,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> 				   pci->phb->node);
> 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
>+		tbl->it_ops = &iommu_table_lpar_multi_ops;
> 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> 		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
> 		pr_debug("  created table: %p\n", pci->iommu_table);
>@@ -1313,22 +1330,11 @@ void iommu_init_early_pSeries(void)
> 		return;
>
> 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>-		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
>-			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
>-			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
>-		} else {
>-			ppc_md.tce_build = tce_build_pSeriesLP;
>-			ppc_md.tce_free	 = tce_free_pSeriesLP;
>-		}
>-		ppc_md.tce_get   = tce_get_pSeriesLP;
> 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
> 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
> 		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
> 		ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
> 	} else {
>-		ppc_md.tce_build = tce_build_pSeries;
>-		ppc_md.tce_free  = tce_free_pSeries;
>-		ppc_md.tce_get   = tce_get_pseries;
> 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
> 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
> 	}
>@@ -1346,8 +1352,6 @@ static int __init disable_multitce(char *str)
> 	    firmware_has_feature(FW_FEATURE_LPAR) &&
> 	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
> 		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
>-		ppc_md.tce_build = tce_build_pSeriesLP;
>-		ppc_md.tce_free	 = tce_free_pSeriesLP;
> 		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
> 	}
> 	return 1;
>diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
>index d00a566..90bcdfe 100644
>--- a/arch/powerpc/sysdev/dart_iommu.c
>+++ b/arch/powerpc/sysdev/dart_iommu.c
>@@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node)
> 	return 0;
> }
>
>+static struct iommu_table_ops iommu_dart_ops = {
>+	.set = dart_build,
>+	.clear = dart_free,
>+	.flush = dart_flush,
>+};
>+
> static void iommu_table_dart_setup(void)
> {
> 	iommu_table_dart.it_busno = 0;
>@@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void)
> 	iommu_table_dart.it_base = (unsigned long)dart_vbase;
> 	iommu_table_dart.it_index = 0;
> 	iommu_table_dart.it_blocksize = 1;
>+	iommu_table_dart.it_ops = &iommu_dart_ops;
> 	iommu_init_table(&iommu_table_dart, -1);
>
> 	/* Reserve the last page of the DART to avoid possible prefetch
>@@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
> 	if (dart_init(dn) != 0)
> 		goto bail;
>
>-	/* Setup low level TCE operations for the core IOMMU code */
>-	ppc_md.tce_build = dart_build;
>-	ppc_md.tce_free  = dart_free;
>-	ppc_md.tce_flush = dart_flush;
>-
> 	/* Setup bypass if supported */
> 	if (dart_is_u4)
> 		ppc_md.dma_set_mask = dart_dma_set_mask;
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
  2015-05-14  0:23   ` Gavin Shan
@ 2015-05-14  3:07     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:07 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 10:23 AM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:03AM +1000, Alexey Kardashevskiy wrote:
>> This adds a iommu_table_ops struct and puts pointer to it into
>> the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
>> callbacks from ppc_md to the new struct where they really belong to.
>>
>> This adds the requirement for @it_ops to be initialized before calling
>> iommu_init_table() to make sure that we do not leave any IOMMU table
>> with iommu_table_ops uninitialized. This is not a parameter of
>> iommu_init_table() though as there will be cases when iommu_init_table()
>> will not be called on TCE tables, for example - VFIO.
>>
>> This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_"
>> redundand prefixes.
>>
>
> s/redundand/redundant  I might be wrong because of my bad English.
No, you're right ;)
>
>> This removes tce_xxx_rm handlers from ppc_md but does not add
>> them to iommu_table_ops as this will be done later if we decide to
>> support TCE hypercalls in real mode. This removes _vm callbacks as
>> only virtual mode is supported by now so this also removes @rm parameter.
>>
>> For pSeries, this always uses tce_buildmulti_pSeriesLP/
>> tce_freemulti_pSeriesLP. This changes the multi callbacks to fall back to
>> tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
>> present. The reason for this is we still have to support "multitce=off"
>> boot parameter in disable_multitce() and we do not want to walk through
>> all IOMMU tables in the system and replace "multi" callbacks with single
>> ones.
>>
>> For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
>> This makes the callbacks for them public. Later patches will extend
>> callbacks for IODA1/2.
>>
>> No change in behaviour is expected.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v9:
>> * pnv_tce_build/pnv_tce_free/pnv_tce_get have been made public and lost
>> "rm" parameters to make following patches simpler (realmode is not
>> supported here anyway)
>> * got rid of _vm versions of callbacks
>> ---
>> arch/powerpc/include/asm/iommu.h            | 17 +++++++++++
>> arch/powerpc/include/asm/machdep.h          | 25 ---------------
>> arch/powerpc/kernel/iommu.c                 | 46 ++++++++++++++--------------
>> arch/powerpc/kernel/vio.c                   |  5 +++
>> arch/powerpc/platforms/cell/iommu.c         |  8 +++--
>> arch/powerpc/platforms/pasemi/iommu.c       |  7 +++--
>> arch/powerpc/platforms/powernv/pci-ioda.c   | 14 +++++++++
>> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 +++++
>> arch/powerpc/platforms/powernv/pci.c        | 47 +++++------------------------
>> arch/powerpc/platforms/powernv/pci.h        |  5 +++
>> arch/powerpc/platforms/pseries/iommu.c      | 34 ++++++++++++---------
>> arch/powerpc/sysdev/dart_iommu.c            | 12 +++++---
>> 12 files changed, 116 insertions(+), 111 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index d91bd69..e2a45c3 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -44,6 +44,22 @@
>> extern int iommu_is_off;
>> extern int iommu_force_on;
>>
>> +struct iommu_table_ops {
>> +	int (*set)(struct iommu_table *tbl,
>> +			long index, long npages,
>> +			unsigned long uaddr,
>> +			enum dma_data_direction direction,
>> +			struct dma_attrs *attrs);
>> +	void (*clear)(struct iommu_table *tbl,
>> +			long index, long npages);
>> +	unsigned long (*get)(struct iommu_table *tbl, long index);
>> +	void (*flush)(struct iommu_table *tbl);
>
> Currently, there isn't flush backend on PowerNV platform. I'm not sure
> if we have to implement it for PowerNV if we really need it. Maybe you
> will have it to support DDW in subsequent patches which I didn't look
> into it, but I will :-)
I am not adding new callbacks here, I am moving them. DART uses flush() so 
it has to be here.
>
>> +};
>> +
>> +/* These are used by VIO */
>> +extern struct iommu_table_ops iommu_table_lpar_multi_ops;
>> +extern struct iommu_table_ops iommu_table_pseries_ops;
>> +
>
> It might be reasonable to add "struct iommu_table_ops *ops" to function
> vio_register_device_node() where the specified "ops" can be hooked to
> the newly created IOMMU table. In that way, the platform (pSeries) specific
> IOMMU table operations doesn't have to be exposed to PowerPC subsystem.
I am not exposing anything new here - I am just replacing references to 
ppc_md with references to iommu_table_ops.
vio_register_device_node() is called from VIO code. So I'll have to have a 
copy of iommu_table_pseries_ops and iommu_table_lpar_multi_ops in VIO code. 
If I do this, I'll have to export 
tce_build_pSeries()/tce_free_pSeries()/tce_get_pseries() to initialize 
those structs. Either way I need to expose some symbols and do deeper 
rework but not in this patchset.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free()
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (13 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  0:48   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Alexey Kardashevskiy
                   ` (18 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of TCE table.
IODA2 actually needs PCI addresses to invalidate the cache. Those
can be calculated from host addresses but since we are going
to implement multi-level TCE tables, calculating PCI address from
a host address might get either tricky or ugly as TCE table remains flat
on PCI bus but not in RAM.
This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
pnv_tce_free and defines IODA1/2-specific callbacks which call generic
ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
using generic callbacks as before.
This changes pnv_pci_ioda2_tce_invalidate() to receive TCE index and
number of pages which are PCI addresses shifted by IOMMU page shift.
No change in behaviour is expected.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v10:
* moved before "Switch from iommu_table to new iommu_table_group" as it adds
list of groups to iommu_table and tce invalidation depends on it
v9:
* removed confusing comment from commit log about unintentional calling of
pnv_pci_ioda_tce_invalidate()
* moved mechanical changes away to "powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table"
* fixed bug with broken invalidation in pnv_pci_ioda2_tce_invalidate -
@index includes @tbl->it_offset but old code added it anyway which later broke
DDW
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++++++++++++++++++++++---------
 arch/powerpc/platforms/powernv/pci.c      | 17 ++-----
 2 files changed, 61 insertions(+), 37 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2924abe..1b43e25 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1678,18 +1678,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct pnv_ioda_pe *pe = tbl->data;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
 		(__be64 __iomem *)tbl->it_index;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
-	start = __pa(startp);
-	end = __pa(endp);
+	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+			npages - 1);
 
 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
 	if (tbl->it_busno) {
@@ -1725,16 +1726,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
-	.set = pnv_tce_build,
-	.clear = pnv_tce_free,
+	.set = pnv_ioda1_tce_build,
+	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
 };
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct pnv_ioda_pe *pe = tbl->data;
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1747,10 +1771,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	end = start;
 
 	/* Figure out the start, end and step */
-	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
-	start |= (inc << shift);
-	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
-	end |= (inc << shift);
+	start |= (index << shift);
+	end |= ((index + npages - 1) << shift);
 	inc = (0x1ull << shift);
 	mb();
 
@@ -1763,21 +1785,32 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	}
 }
 
-void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
-				 __be64 *startp, __be64 *endp, bool rm)
+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
-	struct pnv_phb *phb = pe->phb;
+	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
 
-	if (phb->type == PNV_PHB_IODA1)
-		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
-	else
-		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
-	.set = pnv_tce_build,
-	.clear = pnv_tce_free,
+	.set = pnv_ioda2_tce_build,
+	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
 };
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 4c3bbb1..84b4ea4 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -577,37 +577,28 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
-	__be64 *tcep, *tces;
+	__be64 *tcep;
 	u64 rpn;
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
 	while (npages--)
 		*(tcep++) = cpu_to_be64(proto_tce |
 				(rpn++ << tbl->it_page_shift));
 
-	/* Some implementations won't cache invalid TCEs and thus may not
-	 * need that flush. We'll probably turn it_type into a bit mask
-	 * of flags if that becomes the case
-	 */
-	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 
 	return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-	__be64 *tcep, *tces;
+	__be64 *tcep;
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 
 	while (npages--)
 		*(tcep++) = cpu_to_be64(0);
-
-	if (tbl->it_type & TCE_PCI_SWINV_FREE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free()
  2015-05-11 15:39 ` [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() Alexey Kardashevskiy
@ 2015-05-14  0:48   ` Gavin Shan
  2015-05-14  3:19     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  0:48 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:04AM +1000, Alexey Kardashevskiy wrote:
>The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
>supposed to be called on IODA1/2 and not called on p5ioc2. It receives
>start and end host addresses of TCE table.
>
>IODA2 actually needs PCI addresses to invalidate the cache. Those
>can be calculated from host addresses but since we are going
>to implement multi-level TCE tables, calculating PCI address from
>a host address might get either tricky or ugly as TCE table remains flat
>on PCI bus but not in RAM.
>
>This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
>pnv_tce_free and defines IODA1/2-specific callbacks which call generic
>ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
>using generic callbacks as before.
>
>This changes pnv_pci_ioda2_tce_invalidate() to receive TCE index and
>number of pages which are PCI addresses shifted by IOMMU page shift.
>
>No change in behaviour is expected.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v10:
>* moved before "Switch from iommu_table to new iommu_table_group" as it adds
>list of groups to iommu_table and tce invalidation depends on it
>
>v9:
>* removed confusing comment from commit log about unintentional calling of
>pnv_pci_ioda_tce_invalidate()
>* moved mechanical changes away to "powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table"
>* fixed bug with broken invalidation in pnv_pci_ioda2_tce_invalidate -
>@index includes @tbl->it_offset but old code added it anyway which later broke
>DDW
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++++++++++++++++++++++---------
> arch/powerpc/platforms/powernv/pci.c      | 17 ++-----
> 2 files changed, 61 insertions(+), 37 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 2924abe..1b43e25 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1678,18 +1678,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
> 	}
> }
>
>-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
>-					 struct iommu_table *tbl,
>-					 __be64 *startp, __be64 *endp, bool rm)
>+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
>+		unsigned long index, unsigned long npages, bool rm)
> {
>+	struct pnv_ioda_pe *pe = tbl->data;
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
> 		(__be64 __iomem *)tbl->it_index;
> 	unsigned long start, end, inc;
> 	const unsigned shift = tbl->it_page_shift;
>
>-	start = __pa(startp);
>-	end = __pa(endp);
>+	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
>+	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
>+			npages - 1);
The platform is the only one that knows the TCE table layout, and
iommu_table_ops->get() helps to retrieve the TCE entry for a given index. If
iommu_table_ops->get() returned the address of the TCE entry rather than its
content, it could be reused here and the platform-specific TCE table layout
would stay hidden in the iommu_table_ops->get() backend. However, it's not a big
deal and it would probably introduce more changes than expected. You can judge
whether it's worth doing now or improving later :-)
>
> 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
> 	if (tbl->it_busno) {
>@@ -1725,16 +1726,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
> 	 */
> }
>
>+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
>+		long npages, unsigned long uaddr,
>+		enum dma_data_direction direction,
>+		struct dma_attrs *attrs)
>+{
>+	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
>+			attrs);
The return value from pnv_tce_build() is "int" :-)
>+
>+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
>+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
>+
>+	return ret;
>+}
>+
>+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
>+		long npages)
>+{
>+	pnv_tce_free(tbl, index, npages);
>+
>+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
>+}
>+
> static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>-	.set = pnv_tce_build,
>-	.clear = pnv_tce_free,
>+	.set = pnv_ioda1_tce_build,
>+	.clear = pnv_ioda1_tce_free,
> 	.get = pnv_tce_get,
> };
>
>-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
>-					 struct iommu_table *tbl,
>-					 __be64 *startp, __be64 *endp, bool rm)
>+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>+		unsigned long index, unsigned long npages, bool rm)
> {
>+	struct pnv_ioda_pe *pe = tbl->data;
> 	unsigned long start, end, inc;
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>@@ -1747,10 +1771,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
> 	end = start;
>
> 	/* Figure out the start, end and step */
>-	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
>-	start |= (inc << shift);
>-	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
>-	end |= (inc << shift);
>+	start |= (index << shift);
>+	end |= ((index + npages - 1) << shift);
> 	inc = (0x1ull << shift);
> 	mb();
>
>@@ -1763,21 +1785,32 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
> 	}
> }
>
>-void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
>-				 __be64 *startp, __be64 *endp, bool rm)
>+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
>+		long npages, unsigned long uaddr,
>+		enum dma_data_direction direction,
>+		struct dma_attrs *attrs)
> {
>-	struct pnv_ioda_pe *pe = tbl->data;
>-	struct pnv_phb *phb = pe->phb;
>+	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
>+			attrs);
s/long/int
>
>-	if (phb->type == PNV_PHB_IODA1)
>-		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
>-	else
>-		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
>+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
>+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
>+
>+	return ret;
>+}
>+
>+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
>+		long npages)
>+{
>+	pnv_tce_free(tbl, index, npages);
>+
>+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
> }
>
> static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>-	.set = pnv_tce_build,
>-	.clear = pnv_tce_free,
>+	.set = pnv_ioda2_tce_build,
>+	.clear = pnv_ioda2_tce_free,
> 	.get = pnv_tce_get,
> };
>
>diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>index 4c3bbb1..84b4ea4 100644
>--- a/arch/powerpc/platforms/powernv/pci.c
>+++ b/arch/powerpc/platforms/powernv/pci.c
>@@ -577,37 +577,28 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> 		struct dma_attrs *attrs)
> {
> 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
>-	__be64 *tcep, *tces;
>+	__be64 *tcep;
> 	u64 rpn;
>
>-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
> 	rpn = __pa(uaddr) >> tbl->it_page_shift;
>
> 	while (npages--)
> 		*(tcep++) = cpu_to_be64(proto_tce |
> 				(rpn++ << tbl->it_page_shift));
>
>-	/* Some implementations won't cache invalid TCEs and thus may not
>-	 * need that flush. We'll probably turn it_type into a bit mask
>-	 * of flags if that becomes the case
>-	 */
>-	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
>-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
>
> 	return 0;
> }
>
> void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
> {
>-	__be64 *tcep, *tces;
>+	__be64 *tcep;
>
>-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>
> 	while (npages--)
> 		*(tcep++) = cpu_to_be64(0);
>-
>-	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
> }
>
> unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free()
  2015-05-14  0:48   ` Gavin Shan
@ 2015-05-14  3:19     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:19 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 10:48 AM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:04AM +1000, Alexey Kardashevskiy wrote:
>> The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
>> supposed to be called on IODA1/2 and not called on p5ioc2. It receives
>> start and end host addresses of TCE table.
>>
>> IODA2 actually needs PCI addresses to invalidate the cache. Those
>> can be calculated from host addresses but since we are going
>> to implement multi-level TCE tables, calculating PCI address from
>> a host address might get either tricky or ugly as TCE table remains flat
>> on PCI bus but not in RAM.
>>
>> This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
>> pnv_tce_free and defines IODA1/2-specific callbacks which call generic
>> ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
>> using generic callbacks as before.
>>
>> This changes pnv_pci_ioda2_tce_invalidate() to receive TCE index and
>> number of pages which are PCI addresses shifted by IOMMU page shift.
>>
>> No change in behaviour is expected.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v10:
>> * moved before "Switch from iommu_table to new iommu_table_group" as it adds
>> list of groups to iommu_table and tce invalidation depends on it
>>
>> v9:
>> * removed confusing comment from commit log about unintentional calling of
>> pnv_pci_ioda_tce_invalidate()
>> * moved mechanical changes away to "powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table"
>> * fixed bug with broken invalidation in pnv_pci_ioda2_tce_invalidate -
>> @index includes @tbl->it_offset but old code added it anyway which later broke
>> DDW
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++++++++++++++++++++++---------
>> arch/powerpc/platforms/powernv/pci.c      | 17 ++-----
>> 2 files changed, 61 insertions(+), 37 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 2924abe..1b43e25 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1678,18 +1678,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>> 	}
>> }
>>
>> -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
>> -					 struct iommu_table *tbl,
>> -					 __be64 *startp, __be64 *endp, bool rm)
>> +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
>> +		unsigned long index, unsigned long npages, bool rm)
>> {
>> +	struct pnv_ioda_pe *pe = tbl->data;
>> 	__be64 __iomem *invalidate = rm ?
>> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> 		(__be64 __iomem *)tbl->it_index;
>> 	unsigned long start, end, inc;
>> 	const unsigned shift = tbl->it_page_shift;
>>
>> -	start = __pa(startp);
>> -	end = __pa(endp);
>> +	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
>> +	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
>> +			npages - 1);
>
> The platform is the only one that knows the TCE table layout, and
> iommu_table_ops->get() helps to retrieve the TCE entry for a given index. If
> iommu_table_ops->get() returned the address of the TCE entry rather than its
> content, it could be reused here and the platform-specific TCE table layout
> would stay hidden in the iommu_table_ops->get() backend. However, it's not a big
> deal and it would probably introduce more changes than expected. You can judge
> whether it's worth doing now or improving later :-)
This will require a separate patch to convert tce_get() from returning
a value to returning an address. I could do that, yes. Furthermore, there is
even a pnv_tce() helper added later in this patchset which could be used for
this in this patch (if I moved that patch earlier). But this would be a bigger
change which is not very much related to what the patch does -
cut-n-pasting invalidate() bits. Maybe later. The patchset is way too big
already :(
>>
>> 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
>> 	if (tbl->it_busno) {
>> @@ -1725,16 +1726,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
>> 	 */
>> }
>>
>> +static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
>> +		long npages, unsigned long uaddr,
>> +		enum dma_data_direction direction,
>> +		struct dma_attrs *attrs)
>> +{
>> +	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
>> +			attrs);
>
> The return value from pnv_tce_build() is "int" :-)
Oops.
>> +
>> +	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
>> +		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
>> +
>> +	return ret;
>> +}
>> +
>> +static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
>> +		long npages)
>> +{
>> +	pnv_tce_free(tbl, index, npages);
>> +
>> +	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>> +		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
>> +}
>> +
>> static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>> -	.set = pnv_tce_build,
>> -	.clear = pnv_tce_free,
>> +	.set = pnv_ioda1_tce_build,
>> +	.clear = pnv_ioda1_tce_free,
>> 	.get = pnv_tce_get,
>> };
>>
>> -static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
>> -					 struct iommu_table *tbl,
>> -					 __be64 *startp, __be64 *endp, bool rm)
>> +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>> +		unsigned long index, unsigned long npages, bool rm)
>> {
>> +	struct pnv_ioda_pe *pe = tbl->data;
>> 	unsigned long start, end, inc;
>> 	__be64 __iomem *invalidate = rm ?
>> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> @@ -1747,10 +1771,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
>> 	end = start;
>>
>> 	/* Figure out the start, end and step */
>> -	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
>> -	start |= (inc << shift);
>> -	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
>> -	end |= (inc << shift);
>> +	start |= (index << shift);
>> +	end |= ((index + npages - 1) << shift);
>> 	inc = (0x1ull << shift);
>> 	mb();
>>
>> @@ -1763,21 +1785,32 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
>> 	}
>> }
>>
>> -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
>> -				 __be64 *startp, __be64 *endp, bool rm)
>> +static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
>> +		long npages, unsigned long uaddr,
>> +		enum dma_data_direction direction,
>> +		struct dma_attrs *attrs)
>> {
>> -	struct pnv_ioda_pe *pe = tbl->data;
>> -	struct pnv_phb *phb = pe->phb;
>> +	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
>> +			attrs);
>
> s/long/int
I better make them all long.
>
>>
>> -	if (phb->type == PNV_PHB_IODA1)
>> -		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
>> -	else
>> -		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
>> +	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
>> +		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
>> +
>> +	return ret;
>> +}
>> +
>> +static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
>> +		long npages)
>> +{
>> +	pnv_tce_free(tbl, index, npages);
>> +
>> +	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>> +		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
>> }
>>
>> static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>> -	.set = pnv_tce_build,
>> -	.clear = pnv_tce_free,
>> +	.set = pnv_ioda2_tce_build,
>> +	.clear = pnv_ioda2_tce_free,
>> 	.get = pnv_tce_get,
>> };
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 4c3bbb1..84b4ea4 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -577,37 +577,28 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
>> 		struct dma_attrs *attrs)
>> {
>> 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
>> -	__be64 *tcep, *tces;
>> +	__be64 *tcep;
>> 	u64 rpn;
>>
>> -	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>> +	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>> 	rpn = __pa(uaddr) >> tbl->it_page_shift;
>>
>> 	while (npages--)
>> 		*(tcep++) = cpu_to_be64(proto_tce |
>> 				(rpn++ << tbl->it_page_shift));
>>
>> -	/* Some implementations won't cache invalid TCEs and thus may not
>> -	 * need that flush. We'll probably turn it_type into a bit mask
>> -	 * of flags if that becomes the case
>> -	 */
>> -	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
>> -		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
>>
>> 	return 0;
>> }
>>
>> void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>> {
>> -	__be64 *tcep, *tces;
>> +	__be64 *tcep;
>>
>> -	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>> +	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>>
>> 	while (npages--)
>> 		*(tcep++) = cpu_to_be64(0);
>> -
>> -	if (tbl->it_type & TCE_PCI_SWINV_FREE)
>> -		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
>> }
>>
>> unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
>> --
>> 2.4.0.rc3.8.gfb3e7d5
>>
>
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (14 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13 21:30   ` Alex Williamson
  2015-05-14  1:21   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Alexey Kardashevskiy
                   ` (17 subsequent siblings)
  33 siblings, 2 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.
This defines iommu_table_group struct which stores pointers to
iommu_group and iommu_table(s). This replaces iommu_table with
iommu_table_group where iommu_table was used to identify a group:
- iommu_register_group();
- iommudata of generic iommu_group;
This removes @data from iommu_table as it_table_group provides
the same access to pnv_ioda_pe.
For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically.
For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.
For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_alloc_group() helper and stores the table group struct
pointer into the pci_dn struct. For release, an iommu_table_free_group()
helper is added.
This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
This change is here because those lines had to be changed anyway.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* new to the series, separated from
"powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group"
* iommu_table is not embedded into iommu_table_group but allocated
dynamically in most cases
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler
---
 arch/powerpc/include/asm/iommu.h            |  17 +++--
 arch/powerpc/include/asm/pci-bridge.h       |   2 +-
 arch/powerpc/kernel/iommu.c                 |  17 ++---
 arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++++++-------
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
 arch/powerpc/platforms/powernv/pci.h        |   3 +-
 arch/powerpc/platforms/pseries/iommu.c      | 107 +++++++++++++++++++---------
 drivers/vfio/vfio_iommu_spapr_tce.c         |  23 +++---
 8 files changed, 152 insertions(+), 90 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e2a45c3..61bde1a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -92,13 +92,10 @@ struct iommu_table {
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
 #ifdef CONFIG_IOMMU_API
-	struct iommu_group *it_group;
+	struct iommu_table_group *it_table_group;
 #endif
 	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
-#ifdef CONFIG_PPC_POWERNV
-	void           *data;
-#endif
 };
 
 /* Pure 2^n version of get_order */
@@ -130,13 +127,21 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+
+#define IOMMU_TABLE_GROUP_MAX_TABLES	1
+
+struct iommu_table_group {
+	struct iommu_group *group;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
+extern void iommu_register_group(struct iommu_table_group *table_group,
 				 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
 					unsigned long pe_num)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..e2d7479 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -185,7 +185,7 @@ struct pci_dn {
 
 	struct  pci_dn *parent;
 	struct  pci_controller *phb;	/* for pci devices */
-	struct	iommu_table *iommu_table;	/* for phb's or bridges */
+	struct	iommu_table_group *table_group;	/* for phb's or bridges */
 	struct	device_node *node;	/* back-pointer to the device_node */
 
 	int	pci_ext_config_space;	/* for pci devices */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 16be6aa..79e8b43 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -886,11 +886,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
  */
 static void group_release(void *iommu_data)
 {
-	struct iommu_table *tbl = iommu_data;
-	tbl->it_group = NULL;
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
 }
 
-void iommu_register_group(struct iommu_table *tbl,
+void iommu_register_group(struct iommu_table_group *table_group,
 		int pci_domain_number, unsigned long pe_num)
 {
 	struct iommu_group *grp;
@@ -902,8 +903,8 @@ void iommu_register_group(struct iommu_table *tbl,
 				PTR_ERR(grp));
 		return;
 	}
-	tbl->it_group = grp;
-	iommu_group_set_iommudata(grp, tbl, group_release);
+	table_group->group = grp;
+	iommu_group_set_iommudata(grp, table_group, group_release);
 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
 			pci_domain_number, pe_num);
 	if (!name)
@@ -1091,7 +1092,7 @@ int iommu_add_device(struct device *dev)
 	}
 
 	tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_group) {
+	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
 		pr_debug("%s: Skipping device %s with no tbl\n",
 			 __func__, dev_name(dev));
 		return 0;
@@ -1099,7 +1100,7 @@ int iommu_add_device(struct device *dev)
 
 	pr_debug("%s: Adding %s to iommu group %d\n",
 		 __func__, dev_name(dev),
-		 iommu_group_id(tbl->it_group));
+		 iommu_group_id(tbl->it_table_group->group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
@@ -1108,7 +1109,7 @@ int iommu_add_device(struct device *dev)
 		return -EINVAL;
 	}
 
-	return iommu_group_add_device(tbl->it_group, dev);
+	return iommu_group_add_device(tbl->it_table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1b43e25..02ed448 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 		return;
 	}
 
-	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-			GFP_KERNEL, hose->node);
-	pe->tce32_table->data = pe;
-
 	/* Associate it with all child devices */
 	pnv_ioda_setup_same_PE(bus, pe);
 
@@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	struct iommu_table    *tbl;
 	unsigned long         addr;
 	int64_t               rc;
+	struct iommu_table_group *table_group;
 
 	bus = dev->bus;
 	hose = pci_bus_to_host(bus);
 	phb = hose->private_data;
-	tbl = pe->tce32_table;
+	tbl = pe->table_group.tables[0];
 	addr = tbl->it_base;
 
 	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
@@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
+	table_group = tbl->it_table_group;
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
 	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
-	pe->tce32_table = NULL;
+	pe->table_group.tables[0] = NULL;
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 			continue;
 		}
 
-		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-				GFP_KERNEL, hose->node);
-		pe->tce32_table->data = pe;
-
 		/* Put PE to the list */
 		mutex_lock(&phb->ioda.pe_list_mutex);
 		list_add_tail(&pe->list, &phb->ioda.pe_list);
@@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base(&pdev->dev, pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
 	/*
 	 * Note: iommu_add_device() will fail here as
 	 * for physical PE: the device is already added by now;
@@ -1636,7 +1630,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
 	} else {
 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
-		set_iommu_table_base(&pdev->dev, pe->tce32_table);
+		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
 	}
 	*pdev->dev.dma_mask = dma_mask;
 	return 0;
@@ -1670,7 +1664,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base(&dev->dev, pe->tce32_table);
+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
 		iommu_add_device(&dev->dev);
 
 		if (dev->subordinate)
@@ -1681,7 +1675,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
 		(__be64 __iomem *)tbl->it_index;
@@ -1758,7 +1753,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1834,8 +1830,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = pe->tce32_table;
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+			phb->hose->node);
+	tbl->it_table_group = &pe->table_group;
+	pe->table_group.tables[0] = tbl;
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
 
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
@@ -1914,7 +1914,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 
 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
 
@@ -1948,10 +1949,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
 	pe->tce_bypass_base = 1ull << 59;
 
 	/* Install set_bypass callback for VFIO */
-	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
+	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
 
 	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
+	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
 }
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -1968,8 +1969,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = pe->tce32_table;
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+			phb->hose->node);
+	tbl->it_table_group = &pe->table_group;
+	pe->table_group.tables[0] = tbl;
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 2722c1a..4ea9def 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
-	if (phb->p5ioc2.iommu_table.it_map == NULL) {
-		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
-		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
-		iommu_register_group(&phb->p5ioc2.iommu_table,
+	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
+
+	if (!tbl->it_map) {
+		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
+		iommu_init_table(tbl, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.table_group,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
 	}
 
-	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base(&pdev->dev, tbl);
 	iommu_add_device(&pdev->dev);
 }
 
@@ -180,6 +182,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
 				  tce_mem, tce_size, 0,
 				  IOMMU_PAGE_SHIFT_4K);
+	/*
+	 * We do not allocate iommu_table as we do not support
+	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
+	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
+	 */
+	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
 }
 
 void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index ec26afd..720cc99 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -57,7 +57,7 @@ struct pnv_ioda_pe {
 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
 	int			tce32_seg;
 	int			tce32_segcount;
-	struct iommu_table	*tce32_table;
+	struct iommu_table_group table_group;
 	phys_addr_t		tce_inval_reg_phys;
 
 	/* 64-bit TCE bypass region */
@@ -123,6 +123,7 @@ struct pnv_phb {
 	union {
 		struct {
 			struct iommu_table iommu_table;
+			struct iommu_table_group table_group;
 		} p5ioc2;
 
 		struct {
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4f2ab90..ad5ac6d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -52,14 +52,49 @@
 
 #include "pseries.h"
 
-static void iommu_pseries_free_table(struct iommu_table *tbl,
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl = NULL;
+
+	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+			   node);
+	if (!table_group)
+		goto fail_exit;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+	if (!tbl)
+		goto fail_exit;
+
+	tbl->it_table_group = table_group;
+	table_group->tables[0] = tbl;
+
+	return table_group;
+
+fail_exit:
+	kfree(table_group);
+	kfree(tbl);
+
+	return NULL;
+}
+
+static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		const char *node_name)
 {
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
+	struct iommu_table *tbl;
+
+	if (!table_group)
+		return;
+
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
 	}
+
+	tbl = table_group->tables[0];
 	iommu_free_table(tbl, node_name);
+
+	kfree(table_group);
 }
 
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
@@ -629,13 +664,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pci->phb->dma_window_size = 0x8000000ul;
 	pci->phb->dma_window_base_cur = 0x8000000ul;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			   pci->phb->node);
+	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+	tbl = pci->table_group->tables[0];
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
-	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-	iommu_register_group(tbl, pci_domain_nr(bus), 0);
+	iommu_init_table(tbl, pci->phb->node);
+	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -678,16 +713,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	ppci = PCI_DN(pdn);
 
 	pr_debug("  parent is %s, iommu_table: 0x%p\n",
-		 pdn->full_name, ppci->iommu_table);
+		 pdn->full_name, ppci->table_group);
 
-	if (!ppci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   ppci->phb->node);
+	if (!ppci->table_group) {
+		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
+		tbl = ppci->table_group->tables[0];
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(bus), 0);
-		pr_debug("  created table: %p\n", ppci->iommu_table);
+		iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(ppci->table_group,
+				pci_domain_nr(bus), 0);
+		pr_debug("  created table: %p\n", ppci->table_group);
 	}
 }
 
@@ -709,12 +745,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		struct pci_controller *phb = PCI_DN(dn)->phb;
 
 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   phb->node);
+		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
+		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
-		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
+		iommu_init_table(tbl, phb->node);
+		iommu_register_group(PCI_DN(dn)->table_group,
+				pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, tbl);
 		iommu_add_device(&dev->dev);
 		return;
@@ -724,11 +761,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	 * an already allocated iommu table is found and use that.
 	 */
 
-	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
+	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
 		dn = dn->parent;
 
 	if (dn && PCI_DN(dn)) {
-		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		set_iommu_table_base(&dev->dev,
+				PCI_DN(dn)->table_group->tables[0]);
 		iommu_add_device(&dev->dev);
 	} else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
@@ -1115,7 +1153,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	dn = pci_device_to_OF_node(dev);
 	pr_debug("  node is %s\n", dn->full_name);
 
-	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 	     pdn = pdn->parent) {
 		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 		if (dma_window)
@@ -1131,19 +1169,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	pr_debug("  parent is %s\n", pdn->full_name);
 
 	pci = PCI_DN(pdn);
-	if (!pci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   pci->phb->node);
+	if (!pci->table_group) {
+		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+		tbl = pci->table_group->tables[0];
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
-		pr_debug("  created table: %p\n", pci->iommu_table);
+		iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(pci->table_group,
+				pci_domain_nr(pci->phb->bus), 0);
+		pr_debug("  created table: %p\n", pci->table_group);
 	} else {
-		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
+		pr_debug("  found DMA window, table: %p\n", pci->table_group);
 	}
 
-	set_iommu_table_base(&dev->dev, pci->iommu_table);
+	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
 	iommu_add_device(&dev->dev);
 }
 
@@ -1174,7 +1213,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
 		 * search upwards in the tree until we either hit a dma-window
 		 * property, OR find a parent with a table already allocated.
 		 */
-		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 				pdn = pdn->parent) {
 			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 			if (dma_window)
@@ -1218,7 +1257,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
 		dn = pci_device_to_OF_node(pdev);
 
 		/* search upwards for ibm,dma-window */
-		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
+		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
 				dn = dn->parent)
 			if (of_get_property(dn, "ibm,dma-window", NULL))
 				break;
@@ -1298,8 +1337,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 * the device node.
 		 */
 		remove_ddw(np, false);
-		if (pci && pci->iommu_table)
-			iommu_pseries_free_table(pci->iommu_table,
+		if (pci && pci->table_group)
+			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
 
 		spin_lock(&direct_window_list_lock);
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 0fbe03e..bd87e46 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 
-	WARN_ON(container->tbl && !container->tbl->it_group);
+	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
 
-	if (container->tbl && container->tbl->it_group)
-		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+	if (container->tbl && container->tbl->it_table_group->group)
+		tce_iommu_detach_group(iommu_data,
+				container->tbl->it_table_group->group);
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (!tbl)
 			return -ENXIO;
 
-		BUG_ON(!tbl->it_group);
+		BUG_ON(!tbl->it_table_group->group);
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
@@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data,
 		mutex_unlock(&container->lock);
 		return 0;
 	case VFIO_EEH_PE_OP:
-		if (!container->tbl || !container->tbl->it_group)
+		if (!container->tbl || !container->tbl->it_table_group->group)
 			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
-						  cmd, arg);
+		return vfio_spapr_iommu_eeh_ioctl(
+				container->tbl->it_table_group->group,
+				cmd, arg);
 	}
 
 	return -ENOTTY;
@@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data,
 			iommu_group_id(iommu_group), iommu_group); */
 	if (container->tbl) {
 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(container->tbl->
+						it_table_group->group),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
 		goto unlock_exit;
@@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data,
 	if (tbl != container->tbl) {
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
-				iommu_group_id(tbl->it_group));
+				iommu_group_id(tbl->it_table_group->group));
 		goto unlock_exit;
 	}
 
 	if (container->enabled) {
 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(tbl->it_group));
+				iommu_group_id(tbl->it_table_group->group));
 		tce_iommu_disable(container);
 	}
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group
  2015-05-11 15:39 ` [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Alexey Kardashevskiy
@ 2015-05-13 21:30   ` Alex Williamson
  2015-05-14  1:21   ` Gavin Shan
  1 sibling, 0 replies; 82+ messages in thread
From: Alex Williamson @ 2015-05-13 21:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> Modern IBM POWERPC systems support multiple (currently two) TCE tables
> per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
> for TCE tables. Right now just one table is supported.
> 
> This defines iommu_table_group struct which stores pointers to
> iommu_group and iommu_table(s). This replaces iommu_table with
> iommu_table_group where iommu_table was used to identify a group:
> - iommu_register_group();
> - iommudata of generic iommu_group;
> 
> This removes @data from iommu_table as it_table_group provides
> same access to pnv_ioda_pe.
> 
> For IODA, instead of embedding iommu_table, the new iommu_table_group
> keeps pointers to those. The iommu_table structs are allocated
> dynamically.
> 
> For P5IOC2, both iommu_table_group and iommu_table are embedded into
> the pnv_phb struct. As there is no EEH and SRIOV support for P5IOC2,
> iommu_free_table() should not be called on iommu_table struct pointers
> so we can keep it embedded in pnv_phb::p5ioc2.
> 
> For pSeries, this replaces multiple calls of kzalloc_node() with a new
> iommu_pseries_alloc_group() helper and stores the table group struct
> pointer into the pci_dn struct. For release, an iommu_pseries_free_group()
> helper is added.
> 
> This moves iommu_table struct allocation from SR-IOV code to
> the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
> pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
> This change is here because those lines had to be changed anyway.
> 
> This should cause no behavioural change.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v10:
> * new to the series, separated from
> "powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group"
> * iommu_table is not embedded into iommu_table_group but allocated
> dynamically in most cases
> * iommu_table allocation is moved to a single place for IODA2's
> pnv_pci_ioda_setup_dma_pe where it belongs
> * added list of groups into iommu_table; most of the code just looks at
> the first item to keep the patch simpler
> ---
>  arch/powerpc/include/asm/iommu.h            |  17 +++--
>  arch/powerpc/include/asm/pci-bridge.h       |   2 +-
>  arch/powerpc/kernel/iommu.c                 |  17 ++---
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++++++-------
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
>  arch/powerpc/platforms/powernv/pci.h        |   3 +-
>  arch/powerpc/platforms/pseries/iommu.c      | 107 +++++++++++++++++++---------
>  drivers/vfio/vfio_iommu_spapr_tce.c         |  23 +++---
For vfio:
Acked-by: Alex Williamson <alex.williamson@redhat.com>
>  8 files changed, 152 insertions(+), 90 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index e2a45c3..61bde1a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -92,13 +92,10 @@ struct iommu_table {
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
>  	unsigned long  it_page_shift;/* table iommu page size */
>  #ifdef CONFIG_IOMMU_API
> -	struct iommu_group *it_group;
> +	struct iommu_table_group *it_table_group;
>  #endif
>  	struct iommu_table_ops *it_ops;
>  	void (*set_bypass)(struct iommu_table *tbl, bool enable);
> -#ifdef CONFIG_PPC_POWERNV
> -	void           *data;
> -#endif
>  };
>  
>  /* Pure 2^n version of get_order */
> @@ -130,13 +127,21 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>  					    int nid);
>  #ifdef CONFIG_IOMMU_API
> -extern void iommu_register_group(struct iommu_table *tbl,
> +
> +#define IOMMU_TABLE_GROUP_MAX_TABLES	1
> +
> +struct iommu_table_group {
> +	struct iommu_group *group;
> +	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> +};
> +
> +extern void iommu_register_group(struct iommu_table_group *table_group,
>  				 int pci_domain_number, unsigned long pe_num);
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  #else
> -static inline void iommu_register_group(struct iommu_table *tbl,
> +static inline void iommu_register_group(struct iommu_table_group *table_group,
>  					int pci_domain_number,
>  					unsigned long pe_num)
>  {
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index 1811c44..e2d7479 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -185,7 +185,7 @@ struct pci_dn {
>  
>  	struct  pci_dn *parent;
>  	struct  pci_controller *phb;	/* for pci devices */
> -	struct	iommu_table *iommu_table;	/* for phb's or bridges */
> +	struct	iommu_table_group *table_group;	/* for phb's or bridges */
>  	struct	device_node *node;	/* back-pointer to the device_node */
>  
>  	int	pci_ext_config_space;	/* for pci devices */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 16be6aa..79e8b43 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -886,11 +886,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>   */
>  static void group_release(void *iommu_data)
>  {
> -	struct iommu_table *tbl = iommu_data;
> -	tbl->it_group = NULL;
> +	struct iommu_table_group *table_group = iommu_data;
> +
> +	table_group->group = NULL;
>  }
>  
> -void iommu_register_group(struct iommu_table *tbl,
> +void iommu_register_group(struct iommu_table_group *table_group,
>  		int pci_domain_number, unsigned long pe_num)
>  {
>  	struct iommu_group *grp;
> @@ -902,8 +903,8 @@ void iommu_register_group(struct iommu_table *tbl,
>  				PTR_ERR(grp));
>  		return;
>  	}
> -	tbl->it_group = grp;
> -	iommu_group_set_iommudata(grp, tbl, group_release);
> +	table_group->group = grp;
> +	iommu_group_set_iommudata(grp, table_group, group_release);
>  	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
>  			pci_domain_number, pe_num);
>  	if (!name)
> @@ -1091,7 +1092,7 @@ int iommu_add_device(struct device *dev)
>  	}
>  
>  	tbl = get_iommu_table_base(dev);
> -	if (!tbl || !tbl->it_group) {
> +	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
>  		pr_debug("%s: Skipping device %s with no tbl\n",
>  			 __func__, dev_name(dev));
>  		return 0;
> @@ -1099,7 +1100,7 @@ int iommu_add_device(struct device *dev)
>  
>  	pr_debug("%s: Adding %s to iommu group %d\n",
>  		 __func__, dev_name(dev),
> -		 iommu_group_id(tbl->it_group));
> +		 iommu_group_id(tbl->it_table_group->group));
>  
>  	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
>  		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
> @@ -1108,7 +1109,7 @@ int iommu_add_device(struct device *dev)
>  		return -EINVAL;
>  	}
>  
> -	return iommu_group_add_device(tbl->it_group, dev);
> +	return iommu_group_add_device(tbl->it_table_group->group, dev);
>  }
>  EXPORT_SYMBOL_GPL(iommu_add_device);
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 1b43e25..02ed448 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>  		return;
>  	}
>  
> -	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> -			GFP_KERNEL, hose->node);
> -	pe->tce32_table->data = pe;
> -
>  	/* Associate it with all child devices */
>  	pnv_ioda_setup_same_PE(bus, pe);
>  
> @@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>  	struct iommu_table    *tbl;
>  	unsigned long         addr;
>  	int64_t               rc;
> +	struct iommu_table_group *table_group;
>  
>  	bus = dev->bus;
>  	hose = pci_bus_to_host(bus);
>  	phb = hose->private_data;
> -	tbl = pe->tce32_table;
> +	tbl = pe->table_group.tables[0];
>  	addr = tbl->it_base;
>  
>  	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> @@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>  	if (rc)
>  		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>  
> -	if (tbl->it_group) {
> -		iommu_group_put(tbl->it_group);
> -		BUG_ON(tbl->it_group);
> +	table_group = tbl->it_table_group;
> +	if (table_group->group) {
> +		iommu_group_put(table_group->group);
> +		BUG_ON(table_group->group);
>  	}
>  	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>  	free_pages(addr, get_order(TCE32_TABLE_SIZE));
> -	pe->tce32_table = NULL;
> +	pe->table_group.tables[0] = NULL;
>  }
>  
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> @@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  			continue;
>  		}
>  
> -		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> -				GFP_KERNEL, hose->node);
> -		pe->tce32_table->data = pe;
> -
>  		/* Put PE to the list */
>  		mutex_lock(&phb->ioda.pe_list_mutex);
>  		list_add_tail(&pe->list, &phb->ioda.pe_list);
> @@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>  
>  	pe = &phb->ioda.pe_array[pdn->pe_number];
>  	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
> -	set_iommu_table_base(&pdev->dev, pe->tce32_table);
> +	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
>  	/*
>  	 * Note: iommu_add_device() will fail here as
>  	 * for physical PE: the device is already added by now;
> @@ -1636,7 +1630,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
>  	} else {
>  		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
>  		set_dma_ops(&pdev->dev, &dma_iommu_ops);
> -		set_iommu_table_base(&pdev->dev, pe->tce32_table);
> +		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
>  	}
>  	*pdev->dev.dma_mask = dma_mask;
>  	return 0;
> @@ -1670,7 +1664,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>  	struct pci_dev *dev;
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
> -		set_iommu_table_base(&dev->dev, pe->tce32_table);
> +		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
>  		iommu_add_device(&dev->dev);
>  
>  		if (dev->subordinate)
> @@ -1681,7 +1675,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>  static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
>  		unsigned long index, unsigned long npages, bool rm)
>  {
> -	struct pnv_ioda_pe *pe = tbl->data;
> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
> +			struct pnv_ioda_pe, table_group);
>  	__be64 __iomem *invalidate = rm ?
>  		(__be64 __iomem *)pe->tce_inval_reg_phys :
>  		(__be64 __iomem *)tbl->it_index;
> @@ -1758,7 +1753,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>  static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>  		unsigned long index, unsigned long npages, bool rm)
>  {
> -	struct pnv_ioda_pe *pe = tbl->data;
> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
> +			struct pnv_ioda_pe, table_group);
>  	unsigned long start, end, inc;
>  	__be64 __iomem *invalidate = rm ?
>  		(__be64 __iomem *)pe->tce_inval_reg_phys :
> @@ -1834,8 +1830,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  	if (WARN_ON(pe->tce32_seg >= 0))
>  		return;
>  
> -	tbl = pe->tce32_table;
> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> +			phb->hose->node);
> +	tbl->it_table_group = &pe->table_group;
> +	pe->table_group.tables[0] = tbl;
> +	iommu_register_group(&pe->table_group, phb->hose->global_number,
> +			pe->pe_number);
>  
>  	/* Grab a 32-bit TCE table */
>  	pe->tce32_seg = base;
> @@ -1914,7 +1914,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  
>  static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
>  {
> -	struct pnv_ioda_pe *pe = tbl->data;
> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
> +			struct pnv_ioda_pe, table_group);
>  	uint16_t window_id = (pe->pe_number << 1 ) + 1;
>  	int64_t rc;
>  
> @@ -1948,10 +1949,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
>  	pe->tce_bypass_base = 1ull << 59;
>  
>  	/* Install set_bypass callback for VFIO */
> -	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
> +	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
>  
>  	/* Enable bypass by default */
> -	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
> +	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
>  }
>  
>  static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> @@ -1968,8 +1969,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	if (WARN_ON(pe->tce32_seg >= 0))
>  		return;
>  
> -	tbl = pe->tce32_table;
> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> +			phb->hose->node);
> +	tbl->it_table_group = &pe->table_group;
> +	pe->table_group.tables[0] = tbl;
> +	iommu_register_group(&pe->table_group, phb->hose->global_number,
> +			pe->pe_number);
>  
>  	/* The PE will reserve all possible 32-bits space */
>  	pe->tce32_seg = 0;
> diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> index 2722c1a..4ea9def 100644
> --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> @@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
>  static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
>  					 struct pci_dev *pdev)
>  {
> -	if (phb->p5ioc2.iommu_table.it_map == NULL) {
> -		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
> -		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
> -		iommu_register_group(&phb->p5ioc2.iommu_table,
> +	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
> +
> +	if (!tbl->it_map) {
> +		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
> +		iommu_init_table(tbl, phb->hose->node);
> +		iommu_register_group(&phb->p5ioc2.table_group,
>  				pci_domain_nr(phb->hose->bus), phb->opal_id);
>  	}
>  
> -	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
> +	set_iommu_table_base(&pdev->dev, tbl);
>  	iommu_add_device(&pdev->dev);
>  }
>  
> @@ -180,6 +182,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
>  	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
>  				  tce_mem, tce_size, 0,
>  				  IOMMU_PAGE_SHIFT_4K);
> +	/*
> +	 * We do not allocate iommu_table as we do not support
> +	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
> +	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
> +	 */
> +	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
>  }
>  
>  void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index ec26afd..720cc99 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -57,7 +57,7 @@ struct pnv_ioda_pe {
>  	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>  	int			tce32_seg;
>  	int			tce32_segcount;
> -	struct iommu_table	*tce32_table;
> +	struct iommu_table_group table_group;
>  	phys_addr_t		tce_inval_reg_phys;
>  
>  	/* 64-bit TCE bypass region */
> @@ -123,6 +123,7 @@ struct pnv_phb {
>  	union {
>  		struct {
>  			struct iommu_table iommu_table;
> +			struct iommu_table_group table_group;
>  		} p5ioc2;
>  
>  		struct {
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 4f2ab90..ad5ac6d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -52,14 +52,49 @@
>  
>  #include "pseries.h"
>  
> -static void iommu_pseries_free_table(struct iommu_table *tbl,
> +static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> +{
> +	struct iommu_table_group *table_group = NULL;
> +	struct iommu_table *tbl = NULL;
> +
> +	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
> +			   node);
> +	if (!table_group)
> +		goto fail_exit;
> +
> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
> +	if (!tbl)
> +		goto fail_exit;
> +
> +	tbl->it_table_group = table_group;
> +	table_group->tables[0] = tbl;
> +
> +	return table_group;
> +
> +fail_exit:
> +	kfree(table_group);
> +	kfree(tbl);
> +
> +	return NULL;
> +}
> +
> +static void iommu_pseries_free_group(struct iommu_table_group *table_group,
>  		const char *node_name)
>  {
> -	if (tbl->it_group) {
> -		iommu_group_put(tbl->it_group);
> -		BUG_ON(tbl->it_group);
> +	struct iommu_table *tbl;
> +
> +	if (!table_group)
> +		return;
> +
> +	if (table_group->group) {
> +		iommu_group_put(table_group->group);
> +		BUG_ON(table_group->group);
>  	}
> +
> +	tbl = table_group->tables[0];
>  	iommu_free_table(tbl, node_name);
> +
> +	kfree(table_group);
>  }
>  
>  static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
> @@ -629,13 +664,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>  	pci->phb->dma_window_size = 0x8000000ul;
>  	pci->phb->dma_window_base_cur = 0x8000000ul;
>  
> -	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> -			   pci->phb->node);
> +	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
> +	tbl = pci->table_group->tables[0];
>  
>  	iommu_table_setparms(pci->phb, dn, tbl);
>  	tbl->it_ops = &iommu_table_pseries_ops;
> -	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> -	iommu_register_group(tbl, pci_domain_nr(bus), 0);
> +	iommu_init_table(tbl, pci->phb->node);
> +	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
>  
>  	/* Divide the rest (1.75GB) among the children */
>  	pci->phb->dma_window_size = 0x80000000ul;
> @@ -678,16 +713,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  	ppci = PCI_DN(pdn);
>  
>  	pr_debug("  parent is %s, iommu_table: 0x%p\n",
> -		 pdn->full_name, ppci->iommu_table);
> +		 pdn->full_name, ppci->table_group);
>  
> -	if (!ppci->iommu_table) {
> -		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> -				   ppci->phb->node);
> +	if (!ppci->table_group) {
> +		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
> +		tbl = ppci->table_group->tables[0];
>  		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
>  		tbl->it_ops = &iommu_table_lpar_multi_ops;
> -		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
> -		iommu_register_group(tbl, pci_domain_nr(bus), 0);
> -		pr_debug("  created table: %p\n", ppci->iommu_table);
> +		iommu_init_table(tbl, ppci->phb->node);
> +		iommu_register_group(ppci->table_group,
> +				pci_domain_nr(bus), 0);
> +		pr_debug("  created table: %p\n", ppci->table_group);
>  	}
>  }
>  
> @@ -709,12 +745,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>  		struct pci_controller *phb = PCI_DN(dn)->phb;
>  
>  		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
> -		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> -				   phb->node);
> +		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
> +		tbl = PCI_DN(dn)->table_group->tables[0];
>  		iommu_table_setparms(phb, dn, tbl);
>  		tbl->it_ops = &iommu_table_pseries_ops;
> -		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
> -		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
> +		iommu_init_table(tbl, phb->node);
> +		iommu_register_group(PCI_DN(dn)->table_group,
> +				pci_domain_nr(phb->bus), 0);
>  		set_iommu_table_base(&dev->dev, tbl);
>  		iommu_add_device(&dev->dev);
>  		return;
> @@ -724,11 +761,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>  	 * an already allocated iommu table is found and use that.
>  	 */
>  
> -	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
> +	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
>  		dn = dn->parent;
>  
>  	if (dn && PCI_DN(dn)) {
> -		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
> +		set_iommu_table_base(&dev->dev,
> +				PCI_DN(dn)->table_group->tables[0]);
>  		iommu_add_device(&dev->dev);
>  	} else
>  		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
> @@ -1115,7 +1153,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  	dn = pci_device_to_OF_node(dev);
>  	pr_debug("  node is %s\n", dn->full_name);
>  
> -	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
> +	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
>  	     pdn = pdn->parent) {
>  		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
>  		if (dma_window)
> @@ -1131,19 +1169,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  	pr_debug("  parent is %s\n", pdn->full_name);
>  
>  	pci = PCI_DN(pdn);
> -	if (!pci->iommu_table) {
> -		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
> -				   pci->phb->node);
> +	if (!pci->table_group) {
> +		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
> +		tbl = pci->table_group->tables[0];
>  		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
>  		tbl->it_ops = &iommu_table_lpar_multi_ops;
> -		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> -		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
> -		pr_debug("  created table: %p\n", pci->iommu_table);
> +		iommu_init_table(tbl, pci->phb->node);
> +		iommu_register_group(pci->table_group,
> +				pci_domain_nr(pci->phb->bus), 0);
> +		pr_debug("  created table: %p\n", pci->table_group);
>  	} else {
> -		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
> +		pr_debug("  found DMA window, table: %p\n", pci->table_group);
>  	}
>  
> -	set_iommu_table_base(&dev->dev, pci->iommu_table);
> +	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
>  	iommu_add_device(&dev->dev);
>  }
>  
> @@ -1174,7 +1213,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
>  		 * search upwards in the tree until we either hit a dma-window
>  		 * property, OR find a parent with a table already allocated.
>  		 */
> -		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
> +		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
>  				pdn = pdn->parent) {
>  			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
>  			if (dma_window)
> @@ -1218,7 +1257,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
>  		dn = pci_device_to_OF_node(pdev);
>  
>  		/* search upwards for ibm,dma-window */
> -		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
> +		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
>  				dn = dn->parent)
>  			if (of_get_property(dn, "ibm,dma-window", NULL))
>  				break;
> @@ -1298,8 +1337,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
>  		 * the device node.
>  		 */
>  		remove_ddw(np, false);
> -		if (pci && pci->iommu_table)
> -			iommu_pseries_free_table(pci->iommu_table,
> +		if (pci && pci->table_group)
> +			iommu_pseries_free_group(pci->table_group,
>  					np->full_name);
>  
>  		spin_lock(&direct_window_list_lock);
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 0fbe03e..bd87e46 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data)
>  {
>  	struct tce_container *container = iommu_data;
>  
> -	WARN_ON(container->tbl && !container->tbl->it_group);
> +	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
>  
> -	if (container->tbl && container->tbl->it_group)
> -		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +	if (container->tbl && container->tbl->it_table_group->group)
> +		tce_iommu_detach_group(iommu_data,
> +				container->tbl->it_table_group->group);
>  
>  	tce_iommu_disable(container);
>  	mutex_destroy(&container->lock);
> @@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data,
>  		if (!tbl)
>  			return -ENXIO;
>  
> -		BUG_ON(!tbl->it_group);
> +		BUG_ON(!tbl->it_table_group->group);
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
>  
> @@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data,
>  		mutex_unlock(&container->lock);
>  		return 0;
>  	case VFIO_EEH_PE_OP:
> -		if (!container->tbl || !container->tbl->it_group)
> +		if (!container->tbl || !container->tbl->it_table_group->group)
>  			return -ENODEV;
>  
> -		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
> -						  cmd, arg);
> +		return vfio_spapr_iommu_eeh_ioctl(
> +				container->tbl->it_table_group->group,
> +				cmd, arg);
>  	}
>  
>  	return -ENOTTY;
> @@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data,
>  			iommu_group_id(iommu_group), iommu_group); */
>  	if (container->tbl) {
>  		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> -				iommu_group_id(container->tbl->it_group),
> +				iommu_group_id(container->tbl->
> +						it_table_group->group),
>  				iommu_group_id(iommu_group));
>  		ret = -EBUSY;
>  		goto unlock_exit;
> @@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data,
>  	if (tbl != container->tbl) {
>  		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
>  				iommu_group_id(iommu_group),
> -				iommu_group_id(tbl->it_group));
> +				iommu_group_id(tbl->it_table_group->group));
>  		goto unlock_exit;
>  	}
>  
>  	if (container->enabled) {
>  		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
> -				iommu_group_id(tbl->it_group));
> +				iommu_group_id(tbl->it_table_group->group));
>  		tce_iommu_disable(container);
>  	}
>  
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group
  2015-05-11 15:39 ` [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Alexey Kardashevskiy
  2015-05-13 21:30   ` Alex Williamson
@ 2015-05-14  1:21   ` Gavin Shan
  2015-05-14  3:31     ` Alexey Kardashevskiy
  1 sibling, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  1:21 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:05AM +1000, Alexey Kardashevskiy wrote:
>Modern IBM POWERPC systems support multiple (currently two) TCE tables
>per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
>for TCE tables. Right now just one table is supported.
>
>This defines iommu_table_group struct which stores pointers to
>iommu_group and iommu_table(s). This replaces iommu_table with
>iommu_table_group where iommu_table was used to identify a group:
>- iommu_register_group();
>- iommudata of generic iommu_group;
>
>This removes @data from iommu_table as it_table_group provides
>same access to pnv_ioda_pe.
>
>For IODA, instead of embedding iommu_table, the new iommu_table_group
>keeps pointers to those. The iommu_table structs are allocated
>dynamically.
>
>For P5IOC2, both iommu_table_group and iommu_table are embedded into
>the pnv_phb struct. As there is no EEH and SRIOV support for P5IOC2,
>iommu_free_table() should not be called on iommu_table struct pointers
>so we can keep it embedded in pnv_phb::p5ioc2.
>
>For pSeries, this replaces multiple calls of kzalloc_node() with a new
>iommu_pseries_alloc_group() helper and stores the table group struct
>pointer into the pci_dn struct. For release, an iommu_pseries_free_group()
>helper is added.
>
>This moves iommu_table struct allocation from SR-IOV code to
>the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
>pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
>This change is here because those lines had to be changed anyway.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v10:
>* new to the series, separated from
>"powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group"
>* iommu_table is not embedded into iommu_table_group but allocated
>dynamically in most cases
>* iommu_table allocation is moved to a single place for IODA2's
>pnv_pci_ioda_setup_dma_pe where it belongs to
>* added list of groups into iommu_table; most of the code just looks at
>the first item to keep the patch simpler
>---
> arch/powerpc/include/asm/iommu.h            |  17 +++--
> arch/powerpc/include/asm/pci-bridge.h       |   2 +-
> arch/powerpc/kernel/iommu.c                 |  17 ++---
> arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++++++-------
> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
> arch/powerpc/platforms/powernv/pci.h        |   3 +-
> arch/powerpc/platforms/pseries/iommu.c      | 107 +++++++++++++++++++---------
> drivers/vfio/vfio_iommu_spapr_tce.c         |  23 +++---
> 8 files changed, 152 insertions(+), 90 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index e2a45c3..61bde1a 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -92,13 +92,10 @@ struct iommu_table {
> 	unsigned long *it_map;       /* A simple allocation bitmap for now */
> 	unsigned long  it_page_shift;/* table iommu page size */
> #ifdef CONFIG_IOMMU_API
>-	struct iommu_group *it_group;
>+	struct iommu_table_group *it_table_group;
> #endif
> 	struct iommu_table_ops *it_ops;
> 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
>-#ifdef CONFIG_PPC_POWERNV
>-	void           *data;
>-#endif
> };
>
> /* Pure 2^n version of get_order */
>@@ -130,13 +127,21 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
> extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
> 					    int nid);
> #ifdef CONFIG_IOMMU_API
>-extern void iommu_register_group(struct iommu_table *tbl,
>+
>+#define IOMMU_TABLE_GROUP_MAX_TABLES	1
>+
>+struct iommu_table_group {
>+	struct iommu_group *group;
>+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>+};
>+
>+extern void iommu_register_group(struct iommu_table_group *table_group,
> 				 int pci_domain_number, unsigned long pe_num);
> extern int iommu_add_device(struct device *dev);
> extern void iommu_del_device(struct device *dev);
> extern int __init tce_iommu_bus_notifier_init(void);
> #else
>-static inline void iommu_register_group(struct iommu_table *tbl,
>+static inline void iommu_register_group(struct iommu_table_group *table_group,
> 					int pci_domain_number,
> 					unsigned long pe_num)
> {
>diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>index 1811c44..e2d7479 100644
>--- a/arch/powerpc/include/asm/pci-bridge.h
>+++ b/arch/powerpc/include/asm/pci-bridge.h
>@@ -185,7 +185,7 @@ struct pci_dn {
>
> 	struct  pci_dn *parent;
> 	struct  pci_controller *phb;	/* for pci devices */
>-	struct	iommu_table *iommu_table;	/* for phb's or bridges */
>+	struct	iommu_table_group *table_group;	/* for phb's or bridges */
> 	struct	device_node *node;	/* back-pointer to the device_node */
>
> 	int	pci_ext_config_space;	/* for pci devices */
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 16be6aa..79e8b43 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -886,11 +886,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>  */
> static void group_release(void *iommu_data)
> {
>-	struct iommu_table *tbl = iommu_data;
>-	tbl->it_group = NULL;
>+	struct iommu_table_group *table_group = iommu_data;
>+
>+	table_group->group = NULL;
> }
>
>-void iommu_register_group(struct iommu_table *tbl,
>+void iommu_register_group(struct iommu_table_group *table_group,
> 		int pci_domain_number, unsigned long pe_num)
> {
> 	struct iommu_group *grp;
>@@ -902,8 +903,8 @@ void iommu_register_group(struct iommu_table *tbl,
> 				PTR_ERR(grp));
> 		return;
> 	}
>-	tbl->it_group = grp;
>-	iommu_group_set_iommudata(grp, tbl, group_release);
>+	table_group->group = grp;
>+	iommu_group_set_iommudata(grp, table_group, group_release);
> 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
> 			pci_domain_number, pe_num);
> 	if (!name)
>@@ -1091,7 +1092,7 @@ int iommu_add_device(struct device *dev)
> 	}
>
> 	tbl = get_iommu_table_base(dev);
>-	if (!tbl || !tbl->it_group) {
>+	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
> 		pr_debug("%s: Skipping device %s with no tbl\n",
> 			 __func__, dev_name(dev));
> 		return 0;
>@@ -1099,7 +1100,7 @@ int iommu_add_device(struct device *dev)
>
> 	pr_debug("%s: Adding %s to iommu group %d\n",
> 		 __func__, dev_name(dev),
>-		 iommu_group_id(tbl->it_group));
>+		 iommu_group_id(tbl->it_table_group->group));
>
> 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
> 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
>@@ -1108,7 +1109,7 @@ int iommu_add_device(struct device *dev)
> 		return -EINVAL;
> 	}
>
>-	return iommu_group_add_device(tbl->it_group, dev);
>+	return iommu_group_add_device(tbl->it_table_group->group, dev);
> }
> EXPORT_SYMBOL_GPL(iommu_add_device);
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 1b43e25..02ed448 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
> 		return;
> 	}
>
>-	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>-			GFP_KERNEL, hose->node);
>-	pe->tce32_table->data = pe;
>-
> 	/* Associate it with all child devices */
> 	pnv_ioda_setup_same_PE(bus, pe);
>
>@@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	struct iommu_table    *tbl;
> 	unsigned long         addr;
> 	int64_t               rc;
>+	struct iommu_table_group *table_group;
>
> 	bus = dev->bus;
> 	hose = pci_bus_to_host(bus);
> 	phb = hose->private_data;
>-	tbl = pe->tce32_table;
>+	tbl = pe->table_group.tables[0];
> 	addr = tbl->it_base;
>
> 	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
>@@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	if (rc)
> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>
>-	if (tbl->it_group) {
>-		iommu_group_put(tbl->it_group);
>-		BUG_ON(tbl->it_group);
>+	table_group = tbl->it_table_group;
>+	if (table_group->group) {
>+		iommu_group_put(table_group->group);
>+		BUG_ON(table_group->group);
> 	}
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
>-	pe->tce32_table = NULL;
>+	pe->table_group.tables[0] = NULL;
> }
>
> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>@@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> 			continue;
> 		}
>
>-		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>-				GFP_KERNEL, hose->node);
>-		pe->tce32_table->data = pe;
>-
> 		/* Put PE to the list */
> 		mutex_lock(&phb->ioda.pe_list_mutex);
> 		list_add_tail(&pe->list, &phb->ioda.pe_list);
>@@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>
> 	pe = &phb->ioda.pe_array[pdn->pe_number];
> 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>-	set_iommu_table_base(&pdev->dev, pe->tce32_table);
>+	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
> 	/*
> 	 * Note: iommu_add_device() will fail here as
> 	 * for physical PE: the device is already added by now;
>@@ -1636,7 +1630,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
> 	} else {
> 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
> 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
>-		set_iommu_table_base(&pdev->dev, pe->tce32_table);
>+		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
> 	}
> 	*pdev->dev.dma_mask = dma_mask;
> 	return 0;
>@@ -1670,7 +1664,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
> 	struct pci_dev *dev;
>
> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>-		set_iommu_table_base(&dev->dev, pe->tce32_table);
>+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
> 		iommu_add_device(&dev->dev);
>
> 		if (dev->subordinate)
>@@ -1681,7 +1675,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
> static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
> 		unsigned long index, unsigned long npages, bool rm)
> {
>-	struct pnv_ioda_pe *pe = tbl->data;
>+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+			struct pnv_ioda_pe, table_group);
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
> 		(__be64 __iomem *)tbl->it_index;
>@@ -1758,7 +1753,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
> static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> 		unsigned long index, unsigned long npages, bool rm)
> {
>-	struct pnv_ioda_pe *pe = tbl->data;
>+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+			struct pnv_ioda_pe, table_group);
> 	unsigned long start, end, inc;
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>@@ -1834,8 +1830,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>-	tbl = pe->tce32_table;
>-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>+			phb->hose->node);
>+	tbl->it_table_group = &pe->table_group;
>+	pe->table_group.tables[0] = tbl;
>+	iommu_register_group(&pe->table_group, phb->hose->global_number,
>+			pe->pe_number);
>
> 	/* Grab a 32-bit TCE table */
> 	pe->tce32_seg = base;
>@@ -1914,7 +1914,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>
> static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
> {
>-	struct pnv_ioda_pe *pe = tbl->data;
>+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+			struct pnv_ioda_pe, table_group);
> 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
> 	int64_t rc;
>
>@@ -1948,10 +1949,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
> 	pe->tce_bypass_base = 1ull << 59;
>
> 	/* Install set_bypass callback for VFIO */
>-	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
>+	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
It could be simplified as:
	tbl->set_bypass = pnv_pci_ioda2_set_bypass;
>
> 	/* Enable bypass by default */
>-	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
>+	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
Similar to above:
	tbl->set_bypass(tbl, true);
> }
>
> static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>@@ -1968,8 +1969,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>-	tbl = pe->tce32_table;
>-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>+			phb->hose->node);
>+	tbl->it_table_group = &pe->table_group;
>+	pe->table_group.tables[0] = tbl;
>+	iommu_register_group(&pe->table_group, phb->hose->global_number,
>+			pe->pe_number);
>
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
>diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>index 2722c1a..4ea9def 100644
>--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>@@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
> static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
> 					 struct pci_dev *pdev)
> {
>-	if (phb->p5ioc2.iommu_table.it_map == NULL) {
>-		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
>-		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
>-		iommu_register_group(&phb->p5ioc2.iommu_table,
>+	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
>+
>+	if (!tbl->it_map) {
>+		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
>+		iommu_init_table(tbl, phb->hose->node);
>+		iommu_register_group(&phb->p5ioc2.table_group,
> 				pci_domain_nr(phb->hose->bus), phb->opal_id);
> 	}
>
>-	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
>+	set_iommu_table_base(&pdev->dev, tbl);
> 	iommu_add_device(&pdev->dev);
> }
>
>@@ -180,6 +182,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
> 	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
> 				  tce_mem, tce_size, 0,
> 				  IOMMU_PAGE_SHIFT_4K);
>+	/*
>+	 * We do not allocate iommu_table as we do not support
>+	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
>+	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
>+	 */
>+	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
> }
>
> void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>index ec26afd..720cc99 100644
>--- a/arch/powerpc/platforms/powernv/pci.h
>+++ b/arch/powerpc/platforms/powernv/pci.h
>@@ -57,7 +57,7 @@ struct pnv_ioda_pe {
> 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
> 	int			tce32_seg;
> 	int			tce32_segcount;
>-	struct iommu_table	*tce32_table;
>+	struct iommu_table_group table_group;
> 	phys_addr_t		tce_inval_reg_phys;
>
> 	/* 64-bit TCE bypass region */
>@@ -123,6 +123,7 @@ struct pnv_phb {
> 	union {
> 		struct {
> 			struct iommu_table iommu_table;
>+			struct iommu_table_group table_group;
> 		} p5ioc2;
>
> 		struct {
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index 4f2ab90..ad5ac6d 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -52,14 +52,49 @@
>
> #include "pseries.h"
>
>-static void iommu_pseries_free_table(struct iommu_table *tbl,
>+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
Since it's a static function, the name could be simplified to
iommu_group_alloc(), or alloc_iommu_group(). But it might
not be the style you like :-)
>+{
>+	struct iommu_table_group *table_group = NULL;
>+	struct iommu_table *tbl = NULL;
>+
>+	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
>+			   node);
>+	if (!table_group)
>+		goto fail_exit;
>+
>+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
>+	if (!tbl)
>+		goto fail_exit;
>+
>+	tbl->it_table_group = table_group;
>+	table_group->tables[0] = tbl;
>+
>+	return table_group;
>+
>+fail_exit:
>+	kfree(table_group);
>+	kfree(tbl);
>+
>+	return NULL;
>+}
>+
>+static void iommu_pseries_free_group(struct iommu_table_group *table_group,
> 		const char *node_name)
Same suggestion as above.
> {
>-	if (tbl->it_group) {
>-		iommu_group_put(tbl->it_group);
>-		BUG_ON(tbl->it_group);
>+	struct iommu_table *tbl;
>+
>+	if (!table_group)
>+		return;
>+
>+	if (table_group->group) {
>+		iommu_group_put(table_group->group);
>+		BUG_ON(table_group->group);
> 	}
>+
>+	tbl = table_group->tables[0];
> 	iommu_free_table(tbl, node_name);
It might be worth having one check:
	if (table_group->tables[0])
		iommu_free_table(table_group->tables[0], node_name);
>+
>+	kfree(table_group);
> }
>
> static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
>@@ -629,13 +664,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
> 	pci->phb->dma_window_size = 0x8000000ul;
> 	pci->phb->dma_window_base_cur = 0x8000000ul;
>
>-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-			   pci->phb->node);
>+	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
>+	tbl = pci->table_group->tables[0];
The original code isn't checking "!pci->table_group". If this function is
called only at bootup time, it would be nice to see the kernel crash. Otherwise,
I guess it's still worth having the check :-)
Thanks,
Gavin
>
> 	iommu_table_setparms(pci->phb, dn, tbl);
> 	tbl->it_ops = &iommu_table_pseries_ops;
>-	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
>-	iommu_register_group(tbl, pci_domain_nr(bus), 0);
>+	iommu_init_table(tbl, pci->phb->node);
>+	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
>
> 	/* Divide the rest (1.75GB) among the children */
> 	pci->phb->dma_window_size = 0x80000000ul;
>@@ -678,16 +713,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
> 	ppci = PCI_DN(pdn);
>
> 	pr_debug("  parent is %s, iommu_table: 0x%p\n",
>-		 pdn->full_name, ppci->iommu_table);
>+		 pdn->full_name, ppci->table_group);
>
>-	if (!ppci->iommu_table) {
>-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-				   ppci->phb->node);
>+	if (!ppci->table_group) {
>+		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
>+		tbl = ppci->table_group->tables[0];
> 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
>-		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
>-		iommu_register_group(tbl, pci_domain_nr(bus), 0);
>-		pr_debug("  created table: %p\n", ppci->iommu_table);
>+		iommu_init_table(tbl, ppci->phb->node);
>+		iommu_register_group(ppci->table_group,
>+				pci_domain_nr(bus), 0);
>+		pr_debug("  created table: %p\n", ppci->table_group);
> 	}
> }
>
>@@ -709,12 +745,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 		struct pci_controller *phb = PCI_DN(dn)->phb;
>
> 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
>-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-				   phb->node);
>+		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
>+		tbl = PCI_DN(dn)->table_group->tables[0];
> 		iommu_table_setparms(phb, dn, tbl);
> 		tbl->it_ops = &iommu_table_pseries_ops;
>-		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
>-		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
>+		iommu_init_table(tbl, phb->node);
>+		iommu_register_group(PCI_DN(dn)->table_group,
>+				pci_domain_nr(phb->bus), 0);
> 		set_iommu_table_base(&dev->dev, tbl);
> 		iommu_add_device(&dev->dev);
> 		return;
>@@ -724,11 +761,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
> 	 * an already allocated iommu table is found and use that.
> 	 */
>
>-	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
>+	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
> 		dn = dn->parent;
>
> 	if (dn && PCI_DN(dn)) {
>-		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
>+		set_iommu_table_base(&dev->dev,
>+				PCI_DN(dn)->table_group->tables[0]);
> 		iommu_add_device(&dev->dev);
> 	} else
> 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
>@@ -1115,7 +1153,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> 	dn = pci_device_to_OF_node(dev);
> 	pr_debug("  node is %s\n", dn->full_name);
>
>-	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
>+	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
> 	     pdn = pdn->parent) {
> 		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
> 		if (dma_window)
>@@ -1131,19 +1169,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> 	pr_debug("  parent is %s\n", pdn->full_name);
>
> 	pci = PCI_DN(pdn);
>-	if (!pci->iommu_table) {
>-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-				   pci->phb->node);
>+	if (!pci->table_group) {
>+		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
>+		tbl = pci->table_group->tables[0];
> 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
> 		tbl->it_ops = &iommu_table_lpar_multi_ops;
>-		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
>-		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
>-		pr_debug("  created table: %p\n", pci->iommu_table);
>+		iommu_init_table(tbl, pci->phb->node);
>+		iommu_register_group(pci->table_group,
>+				pci_domain_nr(pci->phb->bus), 0);
>+		pr_debug("  created table: %p\n", pci->table_group);
> 	} else {
>-		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
>+		pr_debug("  found DMA window, table: %p\n", pci->table_group);
> 	}
>
>-	set_iommu_table_base(&dev->dev, pci->iommu_table);
>+	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
> 	iommu_add_device(&dev->dev);
> }
>
>@@ -1174,7 +1213,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
> 		 * search upwards in the tree until we either hit a dma-window
> 		 * property, OR find a parent with a table already allocated.
> 		 */
>-		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
>+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
> 				pdn = pdn->parent) {
> 			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
> 			if (dma_window)
>@@ -1218,7 +1257,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
> 		dn = pci_device_to_OF_node(pdev);
>
> 		/* search upwards for ibm,dma-window */
>-		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
>+		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
> 				dn = dn->parent)
> 			if (of_get_property(dn, "ibm,dma-window", NULL))
> 				break;
>@@ -1298,8 +1337,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
> 		 * the device node.
> 		 */
> 		remove_ddw(np, false);
>-		if (pci && pci->iommu_table)
>-			iommu_pseries_free_table(pci->iommu_table,
>+		if (pci && pci->table_group)
>+			iommu_pseries_free_group(pci->table_group,
> 					np->full_name);
>
> 		spin_lock(&direct_window_list_lock);
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index 0fbe03e..bd87e46 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data)
> {
> 	struct tce_container *container = iommu_data;
>
>-	WARN_ON(container->tbl && !container->tbl->it_group);
>+	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
>
>-	if (container->tbl && container->tbl->it_group)
>-		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
>+	if (container->tbl && container->tbl->it_table_group->group)
>+		tce_iommu_detach_group(iommu_data,
>+				container->tbl->it_table_group->group);
>
> 	tce_iommu_disable(container);
> 	mutex_destroy(&container->lock);
>@@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (!tbl)
> 			return -ENXIO;
>
>-		BUG_ON(!tbl->it_group);
>+		BUG_ON(!tbl->it_table_group->group);
>
> 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
>
>@@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		mutex_unlock(&container->lock);
> 		return 0;
> 	case VFIO_EEH_PE_OP:
>-		if (!container->tbl || !container->tbl->it_group)
>+		if (!container->tbl || !container->tbl->it_table_group->group)
> 			return -ENODEV;
>
>-		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
>-						  cmd, arg);
>+		return vfio_spapr_iommu_eeh_ioctl(
>+				container->tbl->it_table_group->group,
>+				cmd, arg);
> 	}
>
> 	return -ENOTTY;
>@@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data,
> 			iommu_group_id(iommu_group), iommu_group); */
> 	if (container->tbl) {
> 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>-				iommu_group_id(container->tbl->it_group),
>+				iommu_group_id(container->tbl->
>+						it_table_group->group),
> 				iommu_group_id(iommu_group));
> 		ret = -EBUSY;
> 		goto unlock_exit;
>@@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data,
> 	if (tbl != container->tbl) {
> 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> 				iommu_group_id(iommu_group),
>-				iommu_group_id(tbl->it_group));
>+				iommu_group_id(tbl->it_table_group->group));
> 		goto unlock_exit;
> 	}
>
> 	if (container->enabled) {
> 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
>-				iommu_group_id(tbl->it_group));
>+				iommu_group_id(tbl->it_table_group->group));
> 		tce_iommu_disable(container);
> 	}
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group
  2015-05-14  1:21   ` Gavin Shan
@ 2015-05-14  3:31     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:31 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 11:21 AM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:05AM +1000, Alexey Kardashevskiy wrote:
>> Modern IBM POWERPC systems support multiple (currently two) TCE tables
>> per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
>> for TCE tables. Right now just one table is supported.
>>
>> This defines iommu_table_group struct which stores pointers to
>> iommu_group and iommu_table(s). This replaces iommu_table with
>> iommu_table_group where iommu_table was used to identify a group:
>> - iommu_register_group();
>> - iommudata of generic iommu_group;
>>
>> This removes @data from iommu_table as it_table_group provides
>> same access to pnv_ioda_pe.
>>
>> For IODA, instead of embedding iommu_table, the new iommu_table_group
>> keeps pointers to those. The iommu_table structs are allocated
>> dynamically.
>>
>> For P5IOC2, both iommu_table_group and iommu_table are embedded into
>> PE struct. As there is no EEH and SRIOV support for P5IOC2,
>> iommu_free_table() should not be called on iommu_table struct pointers
>> so we can keep it embedded in pnv_phb::p5ioc2.
>>
>> For pSeries, this replaces multiple calls of kzalloc_node() with a new
>> iommu_pseries_alloc_group() helper and stores the table group struct
>> pointer into the pci_dn struct. For release, a iommu_table_free_group()
>> helper is added.
>>
>> This moves iommu_table struct allocation from SR-IOV code to
>> the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
>> pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
>> This change is here because those lines had to be changed anyway.
>>
>> This should cause no behavioural change.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
>> ---
>> Changes:
>> v10:
>> * new to the series, separated from
>> "powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group"
>> * iommu_table is not embedded into iommu_table_group but allocated
>> dynamically in most cases
>> * iommu_table allocation is moved to a single place for IODA2's
>> pnv_pci_ioda_setup_dma_pe where it belongs to
>> * added list of groups into iommu_table; most of the code just looks at
>> the first item to keep the patch simpler
>> ---
>> arch/powerpc/include/asm/iommu.h            |  17 +++--
>> arch/powerpc/include/asm/pci-bridge.h       |   2 +-
>> arch/powerpc/kernel/iommu.c                 |  17 ++---
>> arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++++++-------
>> arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
>> arch/powerpc/platforms/powernv/pci.h        |   3 +-
>> arch/powerpc/platforms/pseries/iommu.c      | 107 +++++++++++++++++++---------
>> drivers/vfio/vfio_iommu_spapr_tce.c         |  23 +++---
>> 8 files changed, 152 insertions(+), 90 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index e2a45c3..61bde1a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -92,13 +92,10 @@ struct iommu_table {
>> 	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> 	unsigned long  it_page_shift;/* table iommu page size */
>> #ifdef CONFIG_IOMMU_API
>> -	struct iommu_group *it_group;
>> +	struct iommu_table_group *it_table_group;
>> #endif
>> 	struct iommu_table_ops *it_ops;
>> 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
>> -#ifdef CONFIG_PPC_POWERNV
>> -	void           *data;
>> -#endif
>> };
>>
>> /* Pure 2^n version of get_order */
>> @@ -130,13 +127,21 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
>> extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>> 					    int nid);
>> #ifdef CONFIG_IOMMU_API
>> -extern void iommu_register_group(struct iommu_table *tbl,
>> +
>> +#define IOMMU_TABLE_GROUP_MAX_TABLES	1
>> +
>> +struct iommu_table_group {
>> +	struct iommu_group *group;
>> +	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>> +};
>> +
>> +extern void iommu_register_group(struct iommu_table_group *table_group,
>> 				 int pci_domain_number, unsigned long pe_num);
>> extern int iommu_add_device(struct device *dev);
>> extern void iommu_del_device(struct device *dev);
>> extern int __init tce_iommu_bus_notifier_init(void);
>> #else
>> -static inline void iommu_register_group(struct iommu_table *tbl,
>> +static inline void iommu_register_group(struct iommu_table_group *table_group,
>> 					int pci_domain_number,
>> 					unsigned long pe_num)
>> {
>> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
>> index 1811c44..e2d7479 100644
>> --- a/arch/powerpc/include/asm/pci-bridge.h
>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>> @@ -185,7 +185,7 @@ struct pci_dn {
>>
>> 	struct  pci_dn *parent;
>> 	struct  pci_controller *phb;	/* for pci devices */
>> -	struct	iommu_table *iommu_table;	/* for phb's or bridges */
>> +	struct	iommu_table_group *table_group;	/* for phb's or bridges */
>> 	struct	device_node *node;	/* back-pointer to the device_node */
>>
>> 	int	pci_ext_config_space;	/* for pci devices */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 16be6aa..79e8b43 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -886,11 +886,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
>>   */
>> static void group_release(void *iommu_data)
>> {
>> -	struct iommu_table *tbl = iommu_data;
>> -	tbl->it_group = NULL;
>> +	struct iommu_table_group *table_group = iommu_data;
>> +
>> +	table_group->group = NULL;
>> }
>>
>> -void iommu_register_group(struct iommu_table *tbl,
>> +void iommu_register_group(struct iommu_table_group *table_group,
>> 		int pci_domain_number, unsigned long pe_num)
>> {
>> 	struct iommu_group *grp;
>> @@ -902,8 +903,8 @@ void iommu_register_group(struct iommu_table *tbl,
>> 				PTR_ERR(grp));
>> 		return;
>> 	}
>> -	tbl->it_group = grp;
>> -	iommu_group_set_iommudata(grp, tbl, group_release);
>> +	table_group->group = grp;
>> +	iommu_group_set_iommudata(grp, table_group, group_release);
>> 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
>> 			pci_domain_number, pe_num);
>> 	if (!name)
>> @@ -1091,7 +1092,7 @@ int iommu_add_device(struct device *dev)
>> 	}
>>
>> 	tbl = get_iommu_table_base(dev);
>> -	if (!tbl || !tbl->it_group) {
>> +	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
>> 		pr_debug("%s: Skipping device %s with no tbl\n",
>> 			 __func__, dev_name(dev));
>> 		return 0;
>> @@ -1099,7 +1100,7 @@ int iommu_add_device(struct device *dev)
>>
>> 	pr_debug("%s: Adding %s to iommu group %d\n",
>> 		 __func__, dev_name(dev),
>> -		 iommu_group_id(tbl->it_group));
>> +		 iommu_group_id(tbl->it_table_group->group));
>>
>> 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
>> 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
>> @@ -1108,7 +1109,7 @@ int iommu_add_device(struct device *dev)
>> 		return -EINVAL;
>> 	}
>>
>> -	return iommu_group_add_device(tbl->it_group, dev);
>> +	return iommu_group_add_device(tbl->it_table_group->group, dev);
>> }
>> EXPORT_SYMBOL_GPL(iommu_add_device);
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 1b43e25..02ed448 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>> 		return;
>> 	}
>>
>> -	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>> -			GFP_KERNEL, hose->node);
>> -	pe->tce32_table->data = pe;
>> -
>> 	/* Associate it with all child devices */
>> 	pnv_ioda_setup_same_PE(bus, pe);
>>
>> @@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>> 	struct iommu_table    *tbl;
>> 	unsigned long         addr;
>> 	int64_t               rc;
>> +	struct iommu_table_group *table_group;
>>
>> 	bus = dev->bus;
>> 	hose = pci_bus_to_host(bus);
>> 	phb = hose->private_data;
>> -	tbl = pe->tce32_table;
>> +	tbl = pe->table_group.tables[0];
>> 	addr = tbl->it_base;
>>
>> 	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
>> @@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>> 	if (rc)
>> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>>
>> -	if (tbl->it_group) {
>> -		iommu_group_put(tbl->it_group);
>> -		BUG_ON(tbl->it_group);
>> +	table_group = tbl->it_table_group;
>> +	if (table_group->group) {
>> +		iommu_group_put(table_group->group);
>> +		BUG_ON(table_group->group);
>> 	}
>> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>> 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
>> -	pe->tce32_table = NULL;
>> +	pe->table_group.tables[0] = NULL;
>> }
>>
>> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> @@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>> 			continue;
>> 		}
>>
>> -		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>> -				GFP_KERNEL, hose->node);
>> -		pe->tce32_table->data = pe;
>> -
>> 		/* Put PE to the list */
>> 		mutex_lock(&phb->ioda.pe_list_mutex);
>> 		list_add_tail(&pe->list, &phb->ioda.pe_list);
>> @@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
>>
>> 	pe = &phb->ioda.pe_array[pdn->pe_number];
>> 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>> -	set_iommu_table_base(&pdev->dev, pe->tce32_table);
>> +	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
>> 	/*
>> 	 * Note: iommu_add_device() will fail here as
>> 	 * for physical PE: the device is already added by now;
>> @@ -1636,7 +1630,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
>> 	} else {
>> 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
>> 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
>> -		set_iommu_table_base(&pdev->dev, pe->tce32_table);
>> +		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
>> 	}
>> 	*pdev->dev.dma_mask = dma_mask;
>> 	return 0;
>> @@ -1670,7 +1664,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>> 	struct pci_dev *dev;
>>
>> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>> -		set_iommu_table_base(&dev->dev, pe->tce32_table);
>> +		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
>> 		iommu_add_device(&dev->dev);
>>
>> 		if (dev->subordinate)
>> @@ -1681,7 +1675,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
>> static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
>> 		unsigned long index, unsigned long npages, bool rm)
>> {
>> -	struct pnv_ioda_pe *pe = tbl->data;
>> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>> +			struct pnv_ioda_pe, table_group);
>> 	__be64 __iomem *invalidate = rm ?
>> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> 		(__be64 __iomem *)tbl->it_index;
>> @@ -1758,7 +1753,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>> static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>> 		unsigned long index, unsigned long npages, bool rm)
>> {
>> -	struct pnv_ioda_pe *pe = tbl->data;
>> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>> +			struct pnv_ioda_pe, table_group);
>> 	unsigned long start, end, inc;
>> 	__be64 __iomem *invalidate = rm ?
>> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> @@ -1834,8 +1830,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>> 	if (WARN_ON(pe->tce32_seg >= 0))
>> 		return;
>>
>> -	tbl = pe->tce32_table;
>> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>> +			phb->hose->node);
>> +	tbl->it_table_group = &pe->table_group;
>> +	pe->table_group.tables[0] = tbl;
>> +	iommu_register_group(&pe->table_group, phb->hose->global_number,
>> +			pe->pe_number);
>>
>> 	/* Grab a 32-bit TCE table */
>> 	pe->tce32_seg = base;
>> @@ -1914,7 +1914,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>>
>> static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
>> {
>> -	struct pnv_ioda_pe *pe = tbl->data;
>> +	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>> +			struct pnv_ioda_pe, table_group);
>> 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
>> 	int64_t rc;
>>
>> @@ -1948,10 +1949,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
>> 	pe->tce_bypass_base = 1ull << 59;
>>
>> 	/* Install set_bypass callback for VFIO */
>> -	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
>> +	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
>
> It could be simplied as:
>
> 	tbl->set_bypass = pnv_pci_ioda2_set_bypass;
No, this is worse in this case. The whole idea of the patch is to be as
mechanical as we can. That means:
s/tce32_table/table_group.tables[0]/
If I do what you suggest, the reviewers will have to look further to check
whether "tbl" has been initialized properly, etc.
And later in this patchset I am getting rid of
pnv_pci_ioda2_setup_bypass_pe so there is no point in trying to make it look
nice :)
>>
>> 	/* Enable bypass by default */
>> -	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
>> +	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
>
> Similar to above:
>
> 	tbl->set_bypass(tbl, true);
>
>> }
>>
>> static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>> @@ -1968,8 +1969,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>> 	if (WARN_ON(pe->tce32_seg >= 0))
>> 		return;
>>
>> -	tbl = pe->tce32_table;
>> -	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
>> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>> +			phb->hose->node);
>> +	tbl->it_table_group = &pe->table_group;
>> +	pe->table_group.tables[0] = tbl;
>> +	iommu_register_group(&pe->table_group, phb->hose->global_number,
>> +			pe->pe_number);
>>
>> 	/* The PE will reserve all possible 32-bits space */
>> 	pe->tce32_seg = 0;
>> diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>> index 2722c1a..4ea9def 100644
>> --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>> +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>> @@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
>> static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
>> 					 struct pci_dev *pdev)
>> {
>> -	if (phb->p5ioc2.iommu_table.it_map == NULL) {
>> -		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
>> -		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
>> -		iommu_register_group(&phb->p5ioc2.iommu_table,
>> +	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
>> +
>> +	if (!tbl->it_map) {
>> +		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
>> +		iommu_init_table(tbl, phb->hose->node);
>> +		iommu_register_group(&phb->p5ioc2.table_group,
>> 				pci_domain_nr(phb->hose->bus), phb->opal_id);
>> 	}
>>
>> -	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
>> +	set_iommu_table_base(&pdev->dev, tbl);
>> 	iommu_add_device(&pdev->dev);
>> }
>>
>> @@ -180,6 +182,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
>> 	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
>> 				  tce_mem, tce_size, 0,
>> 				  IOMMU_PAGE_SHIFT_4K);
>> +	/*
>> +	 * We do not allocate iommu_table as we do not support
>> +	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
>> +	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
>> +	 */
>> +	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
>> }
>>
>> void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index ec26afd..720cc99 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -57,7 +57,7 @@ struct pnv_ioda_pe {
>> 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>> 	int			tce32_seg;
>> 	int			tce32_segcount;
>> -	struct iommu_table	*tce32_table;
>> +	struct iommu_table_group table_group;
>> 	phys_addr_t		tce_inval_reg_phys;
>>
>> 	/* 64-bit TCE bypass region */
>> @@ -123,6 +123,7 @@ struct pnv_phb {
>> 	union {
>> 		struct {
>> 			struct iommu_table iommu_table;
>> +			struct iommu_table_group table_group;
>> 		} p5ioc2;
>>
>> 		struct {
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 4f2ab90..ad5ac6d 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -52,14 +52,49 @@
>>
>> #include "pseries.h"
>>
>> -static void iommu_pseries_free_table(struct iommu_table *tbl,
>> +static struct iommu_table_group *iommu_pseries_alloc_group(int node)
>
> Since it's a static function, the name could be simplied to
> iommu_group_alloc(), or alloc_iommu_group(). But it might
> not the style you like :-)
By giving it a name like this I am telling people not to try reusing it for
anything else; it is also easier to grep.
>
>> +{
>> +	struct iommu_table_group *table_group = NULL;
>> +	struct iommu_table *tbl = NULL;
>> +
>> +	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
>> +			   node);
>> +	if (!table_group)
>> +		goto fail_exit;
>> +
>> +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
>> +	if (!tbl)
>> +		goto fail_exit;
>> +
>> +	tbl->it_table_group = table_group;
>> +	table_group->tables[0] = tbl;
>> +
>> +	return table_group;
>> +
>> +fail_exit:
>> +	kfree(table_group);
>> +	kfree(tbl);
>> +
>> +	return NULL;
>> +}
>> +
>> +static void iommu_pseries_free_group(struct iommu_table_group *table_group,
>> 		const char *node_name)
>
> Same suggestion as above.
>
>> {
>> -	if (tbl->it_group) {
>> -		iommu_group_put(tbl->it_group);
>> -		BUG_ON(tbl->it_group);
>> +	struct iommu_table *tbl;
>> +
>> +	if (!table_group)
>> +		return;
>> +
>> +	if (table_group->group) {
>> +		iommu_group_put(table_group->group);
>> +		BUG_ON(table_group->group);
>> 	}
>> +
>> +	tbl = table_group->tables[0];
>> 	iommu_free_table(tbl, node_name);
>
> It might worthy to have one check:
>
> 	if (table_group->tables[0])
> 		iommu_free_table(table_group->tables[0], node_name);
BUG_ON() - maybe. (table_group->tables[0] == NULL) cannot happen normally
(i.e. without memory corruption, etc.) as iommu_pseries_alloc_group() allocates
either both the table_group and the table, or nothing.
>
>> +
>> +	kfree(table_group);
>> }
>>
>> static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
>> @@ -629,13 +664,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>> 	pci->phb->dma_window_size = 0x8000000ul;
>> 	pci->phb->dma_window_base_cur = 0x8000000ul;
>>
>> -	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>> -			   pci->phb->node);
>> +	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
>> +	tbl = pci->table_group->tables[0];
>
> The orginal code isn't checking "!pci->table_group". If this function is
> called only at bootup time, it would be nice to see kernel crash. Otherwise,
> I guess it's still worthy to have the check :-)
kzalloc_node() is supposed to print something when it fails.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (15 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  1:52   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
                   ` (16 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.
For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically now by a pnv_pci_table_alloc() helper as PCI hotplug
code (for EEH recovery) and SRIOV are supported there.
For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.
For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_group_alloc() helper and stores the table group struct
pointer into the pci_dn struct. For release, an iommu_table_group_free()
helper is added.
This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda2_setup_dma_pe.
This replaces a single pointer to iommu_group with a list of
iommu_table_group structs. For now it is just a single iommu_table_group
in this list but later with TCE table sharing enabled, the list will
keep all the IOMMU groups which use the particular table. The list
uses iommu_table_group_link structs rather than iommu_table_group::next
because a VFIO container may have 2 IOMMU tables, and each table needs
its own list head pointer: the list is mainly for the TCE invalidation
code, which should walk through all attached groups and invalidate the
TCE cache, so the table has to keep the list head pointer. The other
option would be storing the list head in a VFIO container, but that would
not work as the platform code (which does TCE table update and
invalidation) has no idea about VFIO.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v10:
* iommu_table is not embedded into iommu_table_group but allocated
dynamically
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler
v9:
* s/it_group/it_table_group/
* added and used iommu_table_group_free(), from now iommu_free_table()
is only used for VIO
* added iommu_pseries_group_alloc()
* squashed "powerpc/iommu: Introduce iommu_table_alloc() helper" into this
---
 arch/powerpc/include/asm/iommu.h            |   8 +-
 arch/powerpc/kernel/iommu.c                 |   9 +-
 arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++++++----
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
 arch/powerpc/platforms/powernv/pci.c        |  73 +++++++++++++++++
 arch/powerpc/platforms/powernv/pci.h        |   7 ++
 arch/powerpc/platforms/pseries/iommu.c      |  36 ++++++--
 drivers/vfio/vfio_iommu_spapr_tce.c         | 122 ++++++++++++++++++++--------
 8 files changed, 241 insertions(+), 62 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 61bde1a..664beeb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -92,7 +92,7 @@ struct iommu_table {
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
 #ifdef CONFIG_IOMMU_API
-	struct iommu_table_group *it_table_group;
+	struct list_head it_group_list;/* List of iommu_table_group_link */
 #endif
 	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
@@ -130,6 +130,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES	1
 
+struct iommu_table_group_link {
+	struct list_head next;
+	struct rcu_head rcu;
+	struct iommu_table_group *table_group;
+};
+
 struct iommu_table_group {
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 79e8b43..bdf19c6 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1075,6 +1075,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
+	struct iommu_table_group_link *tgl;
 
 	/*
 	 * The sysfs entries should be populated before
@@ -1092,15 +1093,17 @@ int iommu_add_device(struct device *dev)
 	}
 
 	tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
+	if (!tbl || list_empty(&tbl->it_group_list)) {
 		pr_debug("%s: Skipping device %s with no tbl\n",
 			 __func__, dev_name(dev));
 		return 0;
 	}
 
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
 	pr_debug("%s: Adding %s to iommu group %d\n",
 		 __func__, dev_name(dev),
-		 iommu_group_id(tbl->it_table_group->group));
+		 iommu_group_id(tgl->table_group->group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
@@ -1109,7 +1112,7 @@ int iommu_add_device(struct device *dev)
 		return -EINVAL;
 	}
 
-	return iommu_group_add_device(tbl->it_table_group->group, dev);
+	return iommu_group_add_device(tgl->table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 02ed448..53bf242b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1288,7 +1288,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	struct iommu_table    *tbl;
 	unsigned long         addr;
 	int64_t               rc;
-	struct iommu_table_group *table_group;
 
 	bus = dev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1308,14 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-	table_group = tbl->it_table_group;
-	if (table_group->group) {
-		iommu_group_put(table_group->group);
-		BUG_ON(table_group->group);
+	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+	if (pe->table_group.group) {
+		iommu_group_put(pe->table_group.group);
+		BUG_ON(pe->table_group.group);
 	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
-	pe->table_group.tables[0] = NULL;
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -1675,7 +1673,10 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1753,7 +1754,10 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
@@ -1830,12 +1834,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			phb->hose->node);
-	tbl->it_table_group = &pe->table_group;
-	pe->table_group.tables[0] = tbl;
+	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
@@ -1910,11 +1912,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		pe->tce32_seg = -1;
 	if (tce_mem)
 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_free_table(tbl, "pnv");
+	}
 }
 
 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
@@ -1969,12 +1978,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			phb->hose->node);
-	tbl->it_table_group = &pe->table_group;
-	pe->table_group.tables[0] = tbl;
+	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
@@ -2047,6 +2054,10 @@ fail:
 		pe->tce32_seg = -1;
 	if (tce_mem)
 		__free_pages(tce_mem, get_order(tce_table_size));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_free_table(tbl, "pnv");
+	}
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 4ea9def..b524b17 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -99,6 +99,9 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 		iommu_init_table(tbl, phb->hose->node);
 		iommu_register_group(&phb->p5ioc2.table_group,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
+		INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+		pnv_pci_link_table_and_group(phb->hose->node, 0,
+				tbl, &phb->p5ioc2.table_group);
 	}
 
 	set_iommu_table_base(&pdev->dev, tbl);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 84b4ea4..ed7de7b 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -606,6 +606,79 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
 }
 
+struct iommu_table *pnv_pci_table_alloc(int nid)
+{
+	struct iommu_table *tbl;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+
+	return tbl;
+}
+
+long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	struct iommu_table_group_link *tgl = NULL;
+
+	BUG_ON(!tbl);
+	BUG_ON(!table_group);
+	BUG_ON(!table_group->group);
+
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		return -ENOMEM;
+
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+	table_group->tables[num] = tbl;
+
+	return 0;
+}
+
+static void pnv_iommu_table_group_link_free(struct rcu_head *head)
+{
+	struct iommu_table_group_link *tgl = container_of(head,
+			struct iommu_table_group_link, rcu);
+
+	kfree(tgl);
+}
+
+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	long i;
+	bool found;
+	struct iommu_table_group_link *tgl;
+
+	/* Remove link to a group from table's list of attached groups */
+	found = false;
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		if (tgl->table_group == table_group) {
+			list_del_rcu(&tgl->next);
+			call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
+			found = true;
+			break;
+		}
+	}
+	if (WARN_ON(!found))
+		return;
+
+	/* Clean a pointer to iommu_table in iommu_table_group::tables[] */
+	found = false;
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (table_group->tables[i] == tbl) {
+			table_group->tables[i] = NULL;
+			found = true;
+			break;
+		}
+	}
+	WARN_ON(!found);
+}
+
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 			       void *tce_mem, u64 tce_size,
 			       u64 dma_offset, unsigned page_shift)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 720cc99..87bdd4f 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -213,6 +213,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
 		     int where, int size, u32 *val);
 int pnv_pci_cfg_write(struct pci_dn *pdn,
 		      int where, int size, u32 val);
+extern struct iommu_table *pnv_pci_table_alloc(int nid);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset, unsigned page_shift);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index ad5ac6d..040fd45 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -37,6 +37,7 @@
 #include <linux/memory.h>
 #include <linux/of.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -56,6 +57,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
 	struct iommu_table_group *table_group = NULL;
 	struct iommu_table *tbl = NULL;
+	struct iommu_table_group_link *tgl = NULL;
 
 	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
 			   node);
@@ -66,12 +68,21 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 	if (!tbl)
 		goto fail_exit;
 
-	tbl->it_table_group = table_group;
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		goto fail_exit;
+
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
 	table_group->tables[0] = tbl;
 
 	return table_group;
 
 fail_exit:
+	kfree(tgl);
 	kfree(table_group);
 	kfree(tbl);
 
@@ -82,18 +93,33 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		const char *node_name)
 {
 	struct iommu_table *tbl;
+	long i;
 
 	if (!table_group)
 		return;
 
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		tbl = table_group->tables[i];
+
+		if (tbl) {
+#ifdef CONFIG_IOMMU_API
+			struct iommu_table_group_link *tgl, *tmp;
+
+			list_for_each_entry_safe(tgl, tmp, &tbl->it_group_list,
+					next) {
+				list_del_rcu(&tgl->next);
+				kfree(tgl);
+			}
+#endif
+			iommu_free_table(tbl, node_name);
+		}
+	}
+#ifdef CONFIG_IOMMU_API
 	if (table_group->group) {
 		iommu_group_put(table_group->group);
 		BUG_ON(table_group->group);
 	}
-
-	tbl = table_group->tables[0];
-	iommu_free_table(tbl, node_name);
-
+#endif
 	kfree(table_group);
 }
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index bd87e46..ed3310b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -88,7 +88,7 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
 	struct mutex lock;
-	struct iommu_table *tbl;
+	struct iommu_group *grp;
 	bool enabled;
 	unsigned long locked_pages;
 };
@@ -103,13 +103,42 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static long tce_iommu_find_table(struct tce_container *container,
+		phys_addr_t ioba, struct iommu_table **ptbl)
+{
+	long i;
+	struct iommu_table_group *table_group;
+
+	table_group = iommu_group_get_iommudata(container->grp);
+	if (!table_group)
+		return -1;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (tbl) {
+			unsigned long entry = ioba >> tbl->it_page_shift;
+			unsigned long start = tbl->it_offset;
+			unsigned long end = start + tbl->it_size;
+
+			if ((start <= entry) && (entry < end)) {
+				*ptbl = tbl;
+				return i;
+			}
+		}
+	}
+
+	return -1;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
 	unsigned long locked;
-	struct iommu_table *tbl = container->tbl;
+	struct iommu_table *tbl;
+	struct iommu_table_group *table_group;
 
-	if (!container->tbl)
+	if (!container->grp)
 		return -ENXIO;
 
 	if (!current->mm)
@@ -143,6 +172,11 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * as this information is only available from KVM and VFIO is
 	 * KVM agnostic.
 	 */
+	table_group = iommu_group_get_iommudata(container->grp);
+	if (!table_group)
+		return -ENODEV;
+
+	tbl = table_group->tables[0];
 	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
 	ret = try_increment_locked_vm(locked);
 	if (ret)
@@ -190,11 +224,10 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 
-	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
+	WARN_ON(container->grp);
 
-	if (container->tbl && container->tbl->it_table_group->group)
-		tce_iommu_detach_group(iommu_data,
-				container->tbl->it_table_group->group);
+	if (container->grp)
+		tce_iommu_detach_group(iommu_data, container->grp);
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -312,9 +345,16 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl;
+		struct iommu_table_group *table_group;
 
-		if (WARN_ON(!tbl))
+		if (WARN_ON(!container->grp))
+			return -ENXIO;
+
+		table_group = iommu_group_get_iommudata(container->grp);
+
+		tbl = table_group->tables[0];
+		if (WARN_ON_ONCE(!tbl))
 			return -ENXIO;
 
 		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
@@ -337,17 +377,13 @@ static long tce_iommu_ioctl(void *iommu_data,
 	}
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl = NULL;
 		unsigned long tce;
+		long num;
 
 		if (!container->enabled)
 			return -EPERM;
 
-		if (!tbl)
-			return -ENXIO;
-
-		BUG_ON(!tbl->it_table_group->group);
-
 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
 		if (copy_from_user(&param, (void __user *)arg, minsz))
@@ -360,6 +396,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
 		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 			return -EINVAL;
@@ -385,14 +425,12 @@ static long tce_iommu_ioctl(void *iommu_data,
 	}
 	case VFIO_IOMMU_UNMAP_DMA: {
 		struct vfio_iommu_type1_dma_unmap param;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl = NULL;
+		long num;
 
 		if (!container->enabled)
 			return -EPERM;
 
-		if (WARN_ON(!tbl))
-			return -ENXIO;
-
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 				size);
 
@@ -406,6 +444,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
 		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 			return -EINVAL;
 
@@ -434,12 +476,11 @@ static long tce_iommu_ioctl(void *iommu_data,
 		mutex_unlock(&container->lock);
 		return 0;
 	case VFIO_EEH_PE_OP:
-		if (!container->tbl || !container->tbl->it_table_group->group)
+		if (!container->grp)
 			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(
-				container->tbl->it_table_group->group,
-				cmd, arg);
+		return vfio_spapr_iommu_eeh_ioctl(container->grp,
+						  cmd, arg);
 	}
 
 	return -ENOTTY;
@@ -450,17 +491,15 @@ static int tce_iommu_attach_group(void *iommu_data,
 {
 	int ret;
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->tbl) {
+	if (container->grp) {
 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->tbl->
-						it_table_group->group),
+				iommu_group_id(container->grp),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
 		goto unlock_exit;
@@ -473,9 +512,15 @@ static int tce_iommu_attach_group(void *iommu_data,
 		goto unlock_exit;
 	}
 
-	ret = iommu_take_ownership(tbl);
+	table_group = iommu_group_get_iommudata(iommu_group);
+	if (!table_group) {
+		ret = -ENXIO;
+		goto unlock_exit;
+	}
+
+	ret = iommu_take_ownership(table_group->tables[0]);
 	if (!ret)
-		container->tbl = tbl;
+		container->grp = iommu_group;
 
 unlock_exit:
 	mutex_unlock(&container->lock);
@@ -487,26 +532,31 @@ static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
-	if (tbl != container->tbl) {
+	if (iommu_group != container->grp) {
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
-				iommu_group_id(tbl->it_table_group->group));
+				iommu_group_id(container->grp));
 		goto unlock_exit;
 	}
 
 	if (container->enabled) {
 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(tbl->it_table_group->group));
+				iommu_group_id(container->grp));
 		tce_iommu_disable(container);
 	}
 
 	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
 	   iommu_group_id(iommu_group), iommu_group); */
-	container->tbl = NULL;
+	container->grp = NULL;
+
+	table_group = iommu_group_get_iommudata(iommu_group);
+	BUG_ON(!table_group);
+
+	tbl = table_group->tables[0];
 	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 	iommu_release_ownership(tbl);
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
  2015-05-11 15:39 ` [PATCH kernel v10 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Alexey Kardashevskiy
@ 2015-05-14  1:52   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  1:52 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:06AM +1000, Alexey Kardashevskiy wrote:
>Modern IBM POWERPC systems support multiple (currently two) TCE tables
>per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
>for TCE tables. Right now just one table is supported.
>
>For IODA, instead of embedding iommu_table, the new iommu_table_group
>keeps pointers to those. The iommu_table structs are allocated
>dynamically now by a pnv_pci_table_alloc() helper as PCI hotplug
>code (for EEH recovery) and SRIOV are supported there.
>
>For P5IOC2, both iommu_table_group and iommu_table are embedded into
>PE struct. As there is no EEH and SRIOV support for P5IOC2,
>iommu_free_table() should not be called on iommu_table struct pointers
>so we can keep it embedded in pnv_phb::p5ioc2.
>
>For pSeries, this replaces multiple calls of kzalloc_node() with a new
>iommu_pseries_group_alloc() helper and stores the table group struct
>pointer into the pci_dn struct. For release, a iommu_table_group_free()
>helper is added.
>
>This moves iommu_table struct allocation from SR-IOV code to
>the generic DMA initialization code in pnv_pci_ioda2_setup_dma_pe.
>
>This replaces a single pointer to iommu_group with a list of
>iommu_table_group structs. For now it is just a single iommu_table_group
>in this list but later with TCE table sharing enabled, the list will
>keep all the IOMMU groups which use the particular table. The list
>uses iommu_table_group_link structs rather than iommu_table_group::next
>as a VFIO container may have 2 IOMMU tables, each will have its own list
>head pointer as it is mainly for TCE invalidation code which should
>walk through all attached groups and invalidate TCE cache so
>the table has to keep the list head pointer. The other option would
>be storing list head in a VFIO container but it would not work as
>the platform code (which does TCE table update and invalidation) has
>no idea about VFIO.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v10:
>* iommu_table is not embedded into iommu_table_group but allocated
>dynamically
>* iommu_table allocation is moved to a single place for IODA2's
>pnv_pci_ioda_setup_dma_pe where it belongs to
>* added list of groups into iommu_table; most of the code just looks at
>the first item to keep the patch simpler
>
>v9:
>* s/it_group/it_table_group/
>* added and used iommu_table_group_free(), from now iommu_free_table()
>is only used for VIO
>* added iommu_pseries_group_alloc()
>* squashed "powerpc/iommu: Introduce iommu_table_alloc() helper" into this
>---
> arch/powerpc/include/asm/iommu.h            |   8 +-
> arch/powerpc/kernel/iommu.c                 |   9 +-
> arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++++++----
> arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
> arch/powerpc/platforms/powernv/pci.c        |  73 +++++++++++++++++
> arch/powerpc/platforms/powernv/pci.h        |   7 ++
> arch/powerpc/platforms/pseries/iommu.c      |  36 ++++++--
> drivers/vfio/vfio_iommu_spapr_tce.c         | 122 ++++++++++++++++++++--------
> 8 files changed, 241 insertions(+), 62 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index 61bde1a..664beeb 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -92,7 +92,7 @@ struct iommu_table {
> 	unsigned long *it_map;       /* A simple allocation bitmap for now */
> 	unsigned long  it_page_shift;/* table iommu page size */
> #ifdef CONFIG_IOMMU_API
>-	struct iommu_table_group *it_table_group;
>+	struct list_head it_group_list;/* List of iommu_table_group_link */
> #endif
> 	struct iommu_table_ops *it_ops;
> 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
>@@ -130,6 +130,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>
> #define IOMMU_TABLE_GROUP_MAX_TABLES	1
>
>+struct iommu_table_group_link {
>+	struct list_head next;
>+	struct rcu_head rcu;
>+	struct iommu_table_group *table_group;
>+};
>+
> struct iommu_table_group {
> 	struct iommu_group *group;
> 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 79e8b43..bdf19c6 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -1075,6 +1075,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
> int iommu_add_device(struct device *dev)
> {
> 	struct iommu_table *tbl;
>+	struct iommu_table_group_link *tgl;
>
> 	/*
> 	 * The sysfs entries should be populated before
>@@ -1092,15 +1093,17 @@ int iommu_add_device(struct device *dev)
> 	}
>
> 	tbl = get_iommu_table_base(dev);
>-	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
>+	if (!tbl || list_empty(&tbl->it_group_list)) {
> 		pr_debug("%s: Skipping device %s with no tbl\n",
> 			 __func__, dev_name(dev));
> 		return 0;
> 	}
>
>+	tgl = list_first_entry_or_null(&tbl->it_group_list,
>+			struct iommu_table_group_link, next);
> 	pr_debug("%s: Adding %s to iommu group %d\n",
> 		 __func__, dev_name(dev),
>-		 iommu_group_id(tbl->it_table_group->group));
>+		 iommu_group_id(tgl->table_group->group));
>
> 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
> 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
>@@ -1109,7 +1112,7 @@ int iommu_add_device(struct device *dev)
> 		return -EINVAL;
> 	}
>
>-	return iommu_group_add_device(tbl->it_table_group->group, dev);
>+	return iommu_group_add_device(tgl->table_group->group, dev);
> }
> EXPORT_SYMBOL_GPL(iommu_add_device);
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 02ed448..53bf242b 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1288,7 +1288,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	struct iommu_table    *tbl;
> 	unsigned long         addr;
> 	int64_t               rc;
>-	struct iommu_table_group *table_group;
>
> 	bus = dev->bus;
> 	hose = pci_bus_to_host(bus);
>@@ -1308,14 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 	if (rc)
> 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>
>-	table_group = tbl->it_table_group;
>-	if (table_group->group) {
>-		iommu_group_put(table_group->group);
>-		BUG_ON(table_group->group);
>+	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>+	if (pe->table_group.group) {
>+		iommu_group_put(pe->table_group.group);
>+		BUG_ON(pe->table_group.group);
> 	}
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
>-	pe->table_group.tables[0] = NULL;
> }
>
> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>@@ -1675,7 +1673,10 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
> static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
> 		unsigned long index, unsigned long npages, bool rm)
> {
>-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>+			&tbl->it_group_list, struct iommu_table_group_link,
>+			next);
>+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
> 			struct pnv_ioda_pe, table_group);
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>@@ -1753,7 +1754,10 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
> static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> 		unsigned long index, unsigned long npages, bool rm)
> {
>-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>+			&tbl->it_group_list, struct iommu_table_group_link,
>+			next);
>+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
> 			struct pnv_ioda_pe, table_group);
> 	unsigned long start, end, inc;
> 	__be64 __iomem *invalidate = rm ?
>@@ -1830,12 +1834,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-			phb->hose->node);
>-	tbl->it_table_group = &pe->table_group;
>-	pe->table_group.tables[0] = tbl;
>+	tbl = pnv_pci_table_alloc(phb->hose->node);
> 	iommu_register_group(&pe->table_group, phb->hose->global_number,
> 			pe->pe_number);
>+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>
> 	/* Grab a 32-bit TCE table */
> 	pe->tce32_seg = base;
>@@ -1910,11 +1912,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 		pe->tce32_seg = -1;
> 	if (tce_mem)
> 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
>+	if (tbl) {
>+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>+		iommu_free_table(tbl, "pnv");
>+	}
> }
>
> static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
> {
>-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
>+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>+			&tbl->it_group_list, struct iommu_table_group_link,
>+			next);
>+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
> 			struct pnv_ioda_pe, table_group);
> 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
> 	int64_t rc;
>@@ -1969,12 +1978,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	if (WARN_ON(pe->tce32_seg >= 0))
> 		return;
>
>-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
>-			phb->hose->node);
>-	tbl->it_table_group = &pe->table_group;
>-	pe->table_group.tables[0] = tbl;
>+	tbl = pnv_pci_table_alloc(phb->hose->node);
> 	iommu_register_group(&pe->table_group, phb->hose->global_number,
> 			pe->pe_number);
>+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
>@@ -2047,6 +2054,10 @@ fail:
> 		pe->tce32_seg = -1;
> 	if (tce_mem)
> 		__free_pages(tce_mem, get_order(tce_table_size));
>+	if (tbl) {
>+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
>+		iommu_free_table(tbl, "pnv");
>+	}
> }
>
> static void pnv_ioda_setup_dma(struct pnv_phb *phb)
>diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>index 4ea9def..b524b17 100644
>--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
>@@ -99,6 +99,9 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
> 		iommu_init_table(tbl, phb->hose->node);
> 		iommu_register_group(&phb->p5ioc2.table_group,
> 				pci_domain_nr(phb->hose->bus), phb->opal_id);
>+		INIT_LIST_HEAD_RCU(&tbl->it_group_list);
>+		pnv_pci_link_table_and_group(phb->hose->node, 0,
>+				tbl, &phb->p5ioc2.table_group);
> 	}
>
> 	set_iommu_table_base(&pdev->dev, tbl);
>diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>index 84b4ea4..ed7de7b 100644
>--- a/arch/powerpc/platforms/powernv/pci.c
>+++ b/arch/powerpc/platforms/powernv/pci.c
>@@ -606,6 +606,79 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
> 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
> }
>
>+struct iommu_table *pnv_pci_table_alloc(int nid)
>+{
>+	struct iommu_table *tbl;
>+
>+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
>+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
>+
>+	return tbl;
>+}
>+
>+long pnv_pci_link_table_and_group(int node, int num,
>+		struct iommu_table *tbl,
>+		struct iommu_table_group *table_group)
>+{
>+	struct iommu_table_group_link *tgl = NULL;
>+
>+	BUG_ON(!tbl);
>+	BUG_ON(!table_group);
>+	BUG_ON(!table_group->group);
>+
>+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
>+			node);
>+	if (!tgl)
>+		return -ENOMEM;
>+
>+	tgl->table_group = table_group;
>+	list_add_rcu(&tgl->next, &tbl->it_group_list);
>+
>+	table_group->tables[num] = tbl;
>+
>+	return 0;
>+}
>+
>+static void pnv_iommu_table_group_link_free(struct rcu_head *head)
>+{
>+	struct iommu_table_group_link *tgl = container_of(head,
>+			struct iommu_table_group_link, rcu);
>+
>+	kfree(tgl);
>+}
>+
>+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
>+		struct iommu_table_group *table_group)
>+{
>+	long i;
>+	bool found;
>+	struct iommu_table_group_link *tgl;
>+
>+	/* Remove link to a group from table's list of attached groups */
>+	found = false;
>+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
>+		if (tgl->table_group == table_group) {
>+			list_del_rcu(&tgl->next);
>+			call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
>+			found = true;
>+			break;
>+		}
>+	}
>+	if (WARN_ON(!found))
>+		return;
>+
>+	/* Clean a pointer to iommu_table in iommu_table_group::tables[] */
>+	found = false;
>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>+		if (table_group->tables[i] == tbl) {
>+			table_group->tables[i] = NULL;
>+			found = true;
>+			break;
>+		}
>+	}
>+	WARN_ON(!found);
>+}
>+
> void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
> 			       void *tce_mem, u64 tce_size,
> 			       u64 dma_offset, unsigned page_shift)
>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>index 720cc99..87bdd4f 100644
>--- a/arch/powerpc/platforms/powernv/pci.h
>+++ b/arch/powerpc/platforms/powernv/pci.h
>@@ -213,6 +213,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
> 		     int where, int size, u32 *val);
> int pnv_pci_cfg_write(struct pci_dn *pdn,
> 		      int where, int size, u32 val);
>+extern struct iommu_table *pnv_pci_table_alloc(int nid);
>+
>+extern long pnv_pci_link_table_and_group(int node, int num,
>+		struct iommu_table *tbl,
>+		struct iommu_table_group *table_group);
>+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
>+		struct iommu_table_group *table_group);
> extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
> 				      void *tce_mem, u64 tce_size,
> 				      u64 dma_offset, unsigned page_shift);
>diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>index ad5ac6d..040fd45 100644
>--- a/arch/powerpc/platforms/pseries/iommu.c
>+++ b/arch/powerpc/platforms/pseries/iommu.c
>@@ -37,6 +37,7 @@
> #include <linux/memory.h>
> #include <linux/of.h>
> #include <linux/iommu.h>
>+#include <linux/rculist.h>
> #include <asm/io.h>
> #include <asm/prom.h>
> #include <asm/rtas.h>
>@@ -56,6 +57,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> {
> 	struct iommu_table_group *table_group = NULL;
> 	struct iommu_table *tbl = NULL;
>+	struct iommu_table_group_link *tgl = NULL;
>
> 	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
> 			   node);
>@@ -66,12 +68,21 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> 	if (!tbl)
> 		goto fail_exit;
>
>-	tbl->it_table_group = table_group;
>+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
>+			node);
>+	if (!tgl)
>+		goto fail_exit;
>+
>+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
>+	tgl->table_group = table_group;
>+	list_add_rcu(&tgl->next, &tbl->it_group_list);
>+
> 	table_group->tables[0] = tbl;
>
> 	return table_group;
>
> fail_exit:
>+	kfree(tgl);
> 	kfree(table_group);
> 	kfree(tbl);
>
>@@ -82,18 +93,33 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
> 		const char *node_name)
> {
> 	struct iommu_table *tbl;
>+	long i;
>
> 	if (!table_group)
> 		return;
>
>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>+		tbl = table_group->tables[i];
>+
>+		if (tbl) {
>+#ifdef CONFIG_IOMMU_API
>+			struct iommu_table_group_link *tgl, *tmp;
>+
>+			list_for_each_entry_safe(tgl, tmp, &tbl->it_group_list,
>+					next) {
>+				list_del_rcu(&tgl->next);
>+				kfree(tgl);
>+			}
>+#endif
>+			iommu_free_table(tbl, node_name);
>+		}
>+	}
>+#ifdef CONFIG_IOMMU_API
> 	if (table_group->group) {
> 		iommu_group_put(table_group->group);
> 		BUG_ON(table_group->group);
> 	}
>-
>-	tbl = table_group->tables[0];
>-	iommu_free_table(tbl, node_name);
>-
>+#endif
> 	kfree(table_group);
> }
>
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index bd87e46..ed3310b 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -88,7 +88,7 @@ static void decrement_locked_vm(long npages)
>  */
> struct tce_container {
> 	struct mutex lock;
>-	struct iommu_table *tbl;
>+	struct iommu_group *grp;
> 	bool enabled;
> 	unsigned long locked_pages;
> };
>@@ -103,13 +103,42 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
> }
>
>+static long tce_iommu_find_table(struct tce_container *container,
>+		phys_addr_t ioba, struct iommu_table **ptbl)
>+{
>+	long i;
>+	struct iommu_table_group *table_group;
>+
>+	table_group = iommu_group_get_iommudata(container->grp);
>+	if (!table_group)
>+		return -1;
>+
>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>+		struct iommu_table *tbl = table_group->tables[i];
>+
>+		if (tbl) {
>+			unsigned long entry = ioba >> tbl->it_page_shift;
>+			unsigned long start = tbl->it_offset;
>+			unsigned long end = start + tbl->it_size;
>+
>+			if ((start <= entry) && (entry < end)) {
>+				*ptbl = tbl;
>+				return i;
>+			}
>+		}
>+	}
>+
>+	return -1;
>+}
>+
> static int tce_iommu_enable(struct tce_container *container)
> {
> 	int ret = 0;
> 	unsigned long locked;
>-	struct iommu_table *tbl = container->tbl;
>+	struct iommu_table *tbl;
>+	struct iommu_table_group *table_group;
>
>-	if (!container->tbl)
>+	if (!container->grp)
> 		return -ENXIO;
>
> 	if (!current->mm)
>@@ -143,6 +172,11 @@ static int tce_iommu_enable(struct tce_container *container)
> 	 * as this information is only available from KVM and VFIO is
> 	 * KVM agnostic.
> 	 */
>+	table_group = iommu_group_get_iommudata(container->grp);
>+	if (!table_group)
>+		return -ENODEV;
>+
>+	tbl = table_group->tables[0];
> 	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> 	ret = try_increment_locked_vm(locked);
> 	if (ret)
>@@ -190,11 +224,10 @@ static void tce_iommu_release(void *iommu_data)
> {
> 	struct tce_container *container = iommu_data;
>
>-	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
>+	WARN_ON(container->grp);
>
>-	if (container->tbl && container->tbl->it_table_group->group)
>-		tce_iommu_detach_group(iommu_data,
>-				container->tbl->it_table_group->group);
>+	if (container->grp)
>+		tce_iommu_detach_group(iommu_data, container->grp);
>
> 	tce_iommu_disable(container);
> 	mutex_destroy(&container->lock);
>@@ -312,9 +345,16 @@ static long tce_iommu_ioctl(void *iommu_data,
>
> 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> 		struct vfio_iommu_spapr_tce_info info;
>-		struct iommu_table *tbl = container->tbl;
>+		struct iommu_table *tbl;
>+		struct iommu_table_group *table_group;
>
>-		if (WARN_ON(!tbl))
>+		if (WARN_ON(!container->grp))
>+			return -ENXIO;
>+
>+		table_group = iommu_group_get_iommudata(container->grp);
>+
>+		tbl = table_group->tables[0];
>+		if (WARN_ON_ONCE(!tbl))
> 			return -ENXIO;
>
> 		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
>@@ -337,17 +377,13 @@ static long tce_iommu_ioctl(void *iommu_data,
> 	}
> 	case VFIO_IOMMU_MAP_DMA: {
> 		struct vfio_iommu_type1_dma_map param;
>-		struct iommu_table *tbl = container->tbl;
>+		struct iommu_table *tbl = NULL;
> 		unsigned long tce;
>+		long num;
>
> 		if (!container->enabled)
> 			return -EPERM;
>
>-		if (!tbl)
>-			return -ENXIO;
>-
>-		BUG_ON(!tbl->it_table_group->group);
>-
> 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
>
> 		if (copy_from_user(¶m, (void __user *)arg, minsz))
>@@ -360,6 +396,10 @@ static long tce_iommu_ioctl(void *iommu_data,
> 				VFIO_DMA_MAP_FLAG_WRITE))
> 			return -EINVAL;
>
>+		num = tce_iommu_find_table(container, param.iova, &tbl);
>+		if (num < 0)
>+			return -ENXIO;
>+
> 		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
> 				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
> 			return -EINVAL;
>@@ -385,14 +425,12 @@ static long tce_iommu_ioctl(void *iommu_data,
> 	}
> 	case VFIO_IOMMU_UNMAP_DMA: {
> 		struct vfio_iommu_type1_dma_unmap param;
>-		struct iommu_table *tbl = container->tbl;
>+		struct iommu_table *tbl = NULL;
>+		long num;
>
> 		if (!container->enabled)
> 			return -EPERM;
>
>-		if (WARN_ON(!tbl))
>-			return -ENXIO;
>-
> 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
> 				size);
>
>@@ -406,6 +444,10 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		if (param.flags)
> 			return -EINVAL;
>
>+		num = tce_iommu_find_table(container, param.iova, &tbl);
>+		if (num < 0)
>+			return -ENXIO;
>+
> 		if (param.size & ~IOMMU_PAGE_MASK(tbl))
> 			return -EINVAL;
>
>@@ -434,12 +476,11 @@ static long tce_iommu_ioctl(void *iommu_data,
> 		mutex_unlock(&container->lock);
> 		return 0;
> 	case VFIO_EEH_PE_OP:
>-		if (!container->tbl || !container->tbl->it_table_group->group)
>+		if (!container->grp)
> 			return -ENODEV;
>
>-		return vfio_spapr_iommu_eeh_ioctl(
>-				container->tbl->it_table_group->group,
>-				cmd, arg);
>+		return vfio_spapr_iommu_eeh_ioctl(container->grp,
>+						  cmd, arg);
> 	}
>
> 	return -ENOTTY;
>@@ -450,17 +491,15 @@ static int tce_iommu_attach_group(void *iommu_data,
> {
> 	int ret;
> 	struct tce_container *container = iommu_data;
>-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>+	struct iommu_table_group *table_group;
>
>-	BUG_ON(!tbl);
> 	mutex_lock(&container->lock);
>
> 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
> 			iommu_group_id(iommu_group), iommu_group); */
>-	if (container->tbl) {
>+	if (container->grp) {
> 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
>-				iommu_group_id(container->tbl->
>-						it_table_group->group),
>+				iommu_group_id(container->grp),
> 				iommu_group_id(iommu_group));
> 		ret = -EBUSY;
> 		goto unlock_exit;
>@@ -473,9 +512,15 @@ static int tce_iommu_attach_group(void *iommu_data,
> 		goto unlock_exit;
> 	}
>
>-	ret = iommu_take_ownership(tbl);
>+	table_group = iommu_group_get_iommudata(iommu_group);
>+	if (!table_group) {
>+		ret = -ENXIO;
>+		goto unlock_exit;
>+	}
>+
>+	ret = iommu_take_ownership(table_group->tables[0]);
> 	if (!ret)
>-		container->tbl = tbl;
>+		container->grp = iommu_group;
>
> unlock_exit:
> 	mutex_unlock(&container->lock);
>@@ -487,26 +532,31 @@ static void tce_iommu_detach_group(void *iommu_data,
> 		struct iommu_group *iommu_group)
> {
> 	struct tce_container *container = iommu_data;
>-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
>+	struct iommu_table_group *table_group;
>+	struct iommu_table *tbl;
>
>-	BUG_ON(!tbl);
> 	mutex_lock(&container->lock);
>-	if (tbl != container->tbl) {
>+	if (iommu_group != container->grp) {
> 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> 				iommu_group_id(iommu_group),
>-				iommu_group_id(tbl->it_table_group->group));
>+				iommu_group_id(container->grp));
> 		goto unlock_exit;
> 	}
>
> 	if (container->enabled) {
> 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
>-				iommu_group_id(tbl->it_table_group->group));
>+				iommu_group_id(container->grp));
> 		tce_iommu_disable(container);
> 	}
>
> 	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> 	   iommu_group_id(iommu_group), iommu_group); */
>-	container->tbl = NULL;
>+	container->grp = NULL;
>+
>+	table_group = iommu_group_get_iommudata(iommu_group);
>+	BUG_ON(!table_group);
>+
>+	tbl = table_group->tables[0];
> 	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> 	iommu_release_ownership(tbl);
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (16 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  2:01   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 19/34] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
                   ` (15 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This adds tce_iommu_take_ownership() and tce_iommu_release_ownership()
which call iommu_take_ownership()/iommu_release_ownership() in a loop
for every table in the group. As there is just one now, no change in
behaviour is expected.
At the moment the iommu_table struct has a set_bypass() callback which
enables/disables DMA bypass on an IODA2 PHB. This is exposed to POWERPC
IOMMU code which calls this callback when external IOMMU users such as
VFIO are about to take over a PHB.
The set_bypass() callback is not really an iommu_table function but
an IOMMU/PE function. This introduces an iommu_table_group_ops struct and
adds take_ownership()/release_ownership() callbacks to it which are
called when an external user takes/releases control over the IOMMU.
This replaces set_bypass() with ownership callbacks as it is not
necessarily just bypass enabling; it can be something else/more,
so let's give it a more generic name.
The callbacks are implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
The following patches will replace iommu_take_ownership/
iommu_release_ownership calls in IODA2 with full IOMMU table release/
create.
As we are here and touching bypass control, this removes
pnv_pci_ioda2_setup_bypass_pe() as it does not do much
more compared to pnv_pci_ioda2_set_bypass(). This moves tce_bypass_base
initialization to pnv_pci_ioda2_setup_dma_pe().
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
v10:
* fixed comments around take_ownership/release_ownership in iommu_table_group_ops
v9:
* squashed "vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control"
and "vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control"
into a single patch
* moved helpers with a loop through tables in a group
to vfio_iommu_spapr_tce.c to keep the platform code free of IOMMU table
groups as much as possible
* added missing tce_iommu_clear() to tce_iommu_release_ownership()
* replaced the set_ownership(enable) callback with take_ownership() and
release_ownership()
---
 arch/powerpc/include/asm/iommu.h          | 11 ++++-
 arch/powerpc/kernel/iommu.c               | 12 -----
 arch/powerpc/platforms/powernv/pci-ioda.c | 73 ++++++++++++++++++-------------
 drivers/vfio/vfio_iommu_spapr_tce.c       | 70 ++++++++++++++++++++++++++---
 4 files changed, 116 insertions(+), 50 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 664beeb..c5375c5 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -95,7 +95,6 @@ struct iommu_table {
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 #endif
 	struct iommu_table_ops *it_ops;
-	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -130,6 +129,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES	1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+	/* Switch ownership from platform code to external user (e.g. VFIO) */
+	void (*take_ownership)(struct iommu_table_group *table_group);
+	/* Switch ownership from external user (e.g. VFIO) back to core */
+	void (*release_ownership)(struct iommu_table_group *table_group);
+};
+
 struct iommu_table_group_link {
 	struct list_head next;
 	struct rcu_head rcu;
@@ -139,6 +147,7 @@ struct iommu_table_group_link {
 struct iommu_table_group {
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct iommu_table_group_ops *ops;
 };
 
 extern void iommu_register_group(struct iommu_table_group *table_group,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index bdf19c6..7e54714 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1044,14 +1044,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
 	memset(tbl->it_map, 0xff, sz);
 
-	/*
-	 * Disable iommu bypass, otherwise the user can DMA to all of
-	 * our physical memory via the bypass window instead of just
-	 * the pages that has been explicitly mapped into the iommu
-	 */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, false);
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1065,10 +1057,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
-
-	/* The kernel owns the device now, we can restore the iommu bypass */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 53bf242b..35ab19c8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1918,13 +1918,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 }
 
-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
-			&tbl->it_group_list, struct iommu_table_group_link,
-			next);
-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
-			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
 
@@ -1951,33 +1946,48 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 		pe->tce_bypass_enabled = enable;
 }
 
-static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
-					  struct pnv_ioda_pe *pe)
+#ifdef CONFIG_IOMMU_API
+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+
+	iommu_take_ownership(table_group->tables[0]);
+	pnv_pci_ioda2_set_bypass(pe, false);
+}
+
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+
+	iommu_release_ownership(table_group->tables[0]);
+	pnv_pci_ioda2_set_bypass(pe, true);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.take_ownership = pnv_ioda2_take_ownership,
+	.release_ownership = pnv_ioda2_release_ownership,
+};
+#endif
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
+	struct page *tce_mem = NULL;
+	void *addr;
+	const __be64 *swinvp;
+	struct iommu_table *tbl;
+	unsigned int tce_table_size, end;
+	int64_t rc;
+
+	/* We shouldn't already have a 32-bit DMA associated */
+	if (WARN_ON(pe->tce32_seg >= 0))
+		return;
+
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	/* Install set_bypass callback for VFIO */
-	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
-
-	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe)
-{
-	struct page *tce_mem = NULL;
-	void *addr;
-	const __be64 *swinvp;
-	struct iommu_table *tbl;
-	unsigned int tce_table_size, end;
-	int64_t rc;
-
-	/* We shouldn't already have a 32-bit DMA associated */
-	if (WARN_ON(pe->tce32_seg >= 0))
-		return;
-
 	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
@@ -2032,6 +2042,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		/*
@@ -2046,7 +2059,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* Also create a bypass window */
 	if (!pnv_iommu_bypass_disabled)
-		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
+		pnv_pci_ioda2_set_bypass(pe, true);
 
 	return;
 fail:
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index ed3310b..2ead291 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -486,6 +486,47 @@ static long tce_iommu_ioctl(void *iommu_data,
 	return -ENOTTY;
 }
 
+static void tce_iommu_release_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		if (tbl->it_map)
+			iommu_release_ownership(tbl);
+	}
+}
+
+static int tce_iommu_take_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i, j, rc = 0;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl || !tbl->it_map)
+			continue;
+
+		rc = iommu_take_ownership(tbl);
+		if (rc) {
+			for (j = 0; j < i; ++j)
+				iommu_release_ownership(
+						table_group->tables[j]);
+
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
 static int tce_iommu_attach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
@@ -518,9 +559,23 @@ static int tce_iommu_attach_group(void *iommu_data,
 		goto unlock_exit;
 	}
 
-	ret = iommu_take_ownership(table_group->tables[0]);
-	if (!ret)
-		container->grp = iommu_group;
+	if (!table_group->ops || !table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership) {
+		ret = tce_iommu_take_ownership(container, table_group);
+	} else {
+		/*
+		 * Disable iommu bypass, otherwise the user can DMA to all of
+		 * our physical memory via the bypass window instead of just
+		 * the pages that has been explicitly mapped into the iommu
+		 */
+		table_group->ops->take_ownership(table_group);
+		ret = 0;
+	}
+
+	if (ret)
+		goto unlock_exit;
+
+	container->grp = iommu_group;
 
 unlock_exit:
 	mutex_unlock(&container->lock);
@@ -533,7 +588,6 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
-	struct iommu_table *tbl;
 
 	mutex_lock(&container->lock);
 	if (iommu_group != container->grp) {
@@ -556,9 +610,11 @@ static void tce_iommu_detach_group(void *iommu_data,
 	table_group = iommu_group_get_iommudata(iommu_group);
 	BUG_ON(!table_group);
 
-	tbl = table_group->tables[0];
-	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-	iommu_release_ownership(tbl);
+	/* Kernel owns the device now, we can restore bypass */
+	if (!table_group->ops || !table_group->ops->release_ownership)
+		tce_iommu_release_ownership(container, table_group);
+	else
+		table_group->ops->release_ownership(table_group);
 
 unlock_exit:
 	mutex_unlock(&container->lock);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control
  2015-05-11 15:39 ` [PATCH kernel v10 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
@ 2015-05-14  2:01   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  2:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:07AM +1000, Alexey Kardashevskiy wrote:
>This adds tce_iommu_take_ownership() and tce_iommu_release_ownership()
>which call iommu_take_ownership()/iommu_release_ownership() in a loop
>for every table in the group. As there is just one now, no change in
>behaviour is expected.
>
>At the moment the iommu_table struct has a set_bypass() callback which
>enables/disables DMA bypass on an IODA2 PHB. This is exposed to POWERPC
>IOMMU code which calls this callback when external IOMMU users such as
>VFIO are about to take over a PHB.
>
>The set_bypass() callback is not really an iommu_table function but
>an IOMMU/PE function. This introduces an iommu_table_group_ops struct and
>adds take_ownership()/release_ownership() callbacks to it which are
>called when an external user takes/releases control over the IOMMU.
>
>This replaces set_bypass() with ownership callbacks as it is not
>necessarily just bypass enabling; it can be something else/more,
>so let's give it a more generic name.
>
>The callbacks are implemented for IODA2 only. Other platforms (P5IOC2,
>IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
>The following patches will replace iommu_take_ownership/
>iommu_release_ownership calls in IODA2 with full IOMMU table release/
>create.
>
>As we are here and touching bypass control, this removes
>pnv_pci_ioda2_setup_bypass_pe() as it does not do much
>more compared to pnv_pci_ioda2_set_bypass(). This moves tce_bypass_base
>initialization to pnv_pci_ioda2_setup_dma_pe().
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>[aw: for the vfio related changes]
>Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v10:
>* fixed comments around take_ownership/release_ownership in iommu_table_group_ops
>
>v9:
>* squashed "vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control"
>and "vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control"
>into a single patch
>* moved helpers with a loop through tables in a group
>to vfio_iommu_spapr_tce.c to keep the platform code free of IOMMU table
>groups as much as possible
>* added missing tce_iommu_clear() to tce_iommu_release_ownership()
>* replaced the set_ownership(enable) callback with take_ownership() and
>release_ownership()
>---
> arch/powerpc/include/asm/iommu.h          | 11 ++++-
> arch/powerpc/kernel/iommu.c               | 12 -----
> arch/powerpc/platforms/powernv/pci-ioda.c | 73 ++++++++++++++++++-------------
> drivers/vfio/vfio_iommu_spapr_tce.c       | 70 ++++++++++++++++++++++++++---
> 4 files changed, 116 insertions(+), 50 deletions(-)
>
>diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>index 664beeb..c5375c5 100644
>--- a/arch/powerpc/include/asm/iommu.h
>+++ b/arch/powerpc/include/asm/iommu.h
>@@ -95,7 +95,6 @@ struct iommu_table {
> 	struct list_head it_group_list;/* List of iommu_table_group_link */
> #endif
> 	struct iommu_table_ops *it_ops;
>-	void (*set_bypass)(struct iommu_table *tbl, bool enable);
> };
>
> /* Pure 2^n version of get_order */
>@@ -130,6 +129,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>
> #define IOMMU_TABLE_GROUP_MAX_TABLES	1
>
>+struct iommu_table_group;
>+
>+struct iommu_table_group_ops {
>+	/* Switch ownership from platform code to external user (e.g. VFIO) */
>+	void (*take_ownership)(struct iommu_table_group *table_group);
>+	/* Switch ownership from external user (e.g. VFIO) back to core */
>+	void (*release_ownership)(struct iommu_table_group *table_group);
>+};
>+
> struct iommu_table_group_link {
> 	struct list_head next;
> 	struct rcu_head rcu;
>@@ -139,6 +147,7 @@ struct iommu_table_group_link {
> struct iommu_table_group {
> 	struct iommu_group *group;
> 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>+	struct iommu_table_group_ops *ops;
> };
>
> extern void iommu_register_group(struct iommu_table_group *table_group,
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index bdf19c6..7e54714 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -1044,14 +1044,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
>
> 	memset(tbl->it_map, 0xff, sz);
>
>-	/*
>-	 * Disable iommu bypass, otherwise the user can DMA to all of
>-	 * our physical memory via the bypass window instead of just
>-	 * the pages that has been explicitly mapped into the iommu
>-	 */
>-	if (tbl->set_bypass)
>-		tbl->set_bypass(tbl, false);
>-
> 	return 0;
> }
> EXPORT_SYMBOL_GPL(iommu_take_ownership);
>@@ -1065,10 +1057,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
> 	/* Restore bit#0 set by iommu_init_table() */
> 	if (tbl->it_offset == 0)
> 		set_bit(0, tbl->it_map);
>-
>-	/* The kernel owns the device now, we can restore the iommu bypass */
>-	if (tbl->set_bypass)
>-		tbl->set_bypass(tbl, true);
> }
> EXPORT_SYMBOL_GPL(iommu_release_ownership);
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 53bf242b..35ab19c8 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1918,13 +1918,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	}
> }
>
>-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
>+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
> {
>-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>-			&tbl->it_group_list, struct iommu_table_group_link,
>-			next);
>-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>-			struct pnv_ioda_pe, table_group);
> 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
> 	int64_t rc;
>
>@@ -1951,33 +1946,48 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
> 		pe->tce_bypass_enabled = enable;
> }
>
>-static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
>-					  struct pnv_ioda_pe *pe)
>+#ifdef CONFIG_IOMMU_API
>+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
> {
>+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
>+						table_group);
>+
>+	iommu_take_ownership(table_group->tables[0]);
>+	pnv_pci_ioda2_set_bypass(pe, false);
>+}
>+
>+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
>+{
>+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
>+						table_group);
>+
>+	iommu_release_ownership(table_group->tables[0]);
>+	pnv_pci_ioda2_set_bypass(pe, true);
>+}
>+
>+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>+	.take_ownership = pnv_ioda2_take_ownership,
>+	.release_ownership = pnv_ioda2_release_ownership,
>+};
>+#endif
>+
>+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>+				       struct pnv_ioda_pe *pe)
>+{
>+	struct page *tce_mem = NULL;
>+	void *addr;
>+	const __be64 *swinvp;
>+	struct iommu_table *tbl;
>+	unsigned int tce_table_size, end;
>+	int64_t rc;
>+
>+	/* We shouldn't already have a 32-bit DMA associated */
>+	if (WARN_ON(pe->tce32_seg >= 0))
>+		return;
>+
> 	/* TVE #1 is selected by PCI address bit 59 */
> 	pe->tce_bypass_base = 1ull << 59;
>
>-	/* Install set_bypass callback for VFIO */
>-	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
>-
>-	/* Enable bypass by default */
>-	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
>-}
>-
>-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>-				       struct pnv_ioda_pe *pe)
>-{
>-	struct page *tce_mem = NULL;
>-	void *addr;
>-	const __be64 *swinvp;
>-	struct iommu_table *tbl;
>-	unsigned int tce_table_size, end;
>-	int64_t rc;
>-
>-	/* We shouldn't already have a 32-bit DMA associated */
>-	if (WARN_ON(pe->tce32_seg >= 0))
>-		return;
>-
> 	tbl = pnv_pci_table_alloc(phb->hose->node);
> 	iommu_register_group(&pe->table_group, phb->hose->global_number,
> 			pe->pe_number);
>@@ -2032,6 +2042,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	}
> 	tbl->it_ops = &pnv_ioda2_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
>+#ifdef CONFIG_IOMMU_API
>+	pe->table_group.ops = &pnv_pci_ioda2_ops;
>+#endif
>
> 	if (pe->flags & PNV_IODA_PE_DEV) {
> 		/*
>@@ -2046,7 +2059,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>
> 	/* Also create a bypass window */
> 	if (!pnv_iommu_bypass_disabled)
>-		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
>+		pnv_pci_ioda2_set_bypass(pe, true);
>
> 	return;
> fail:
>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>index ed3310b..2ead291 100644
>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>@@ -486,6 +486,47 @@ static long tce_iommu_ioctl(void *iommu_data,
> 	return -ENOTTY;
> }
>
>+static void tce_iommu_release_ownership(struct tce_container *container,
>+		struct iommu_table_group *table_group)
>+{
>+	int i;
>+
>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>+		struct iommu_table *tbl = table_group->tables[i];
>+
>+		if (!tbl)
>+			continue;
>+
>+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>+		if (tbl->it_map)
>+			iommu_release_ownership(tbl);
>+	}
>+}
>+
>+static int tce_iommu_take_ownership(struct tce_container *container,
>+		struct iommu_table_group *table_group)
>+{
>+	int i, j, rc = 0;
>+
>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>+		struct iommu_table *tbl = table_group->tables[i];
>+
>+		if (!tbl || !tbl->it_map)
>+			continue;
>+
>+		rc = iommu_take_ownership(tbl);
>+		if (rc) {
>+			for (j = 0; j < i; ++j)
>+				iommu_release_ownership(
>+						table_group->tables[j]);
>+
>+			return rc;
>+		}
>+	}
>+
>+	return 0;
>+}
>+
> static int tce_iommu_attach_group(void *iommu_data,
> 		struct iommu_group *iommu_group)
> {
>@@ -518,9 +559,23 @@ static int tce_iommu_attach_group(void *iommu_data,
> 		goto unlock_exit;
> 	}
>
>-	ret = iommu_take_ownership(table_group->tables[0]);
>-	if (!ret)
>-		container->grp = iommu_group;
>+	if (!table_group->ops || !table_group->ops->take_ownership ||
>+			!table_group->ops->release_ownership) {
>+		ret = tce_iommu_take_ownership(container, table_group);
>+	} else {
>+		/*
>+		 * Disable iommu bypass, otherwise the user can DMA to all of
>+		 * our physical memory via the bypass window instead of just
>+		 * the pages that has been explicitly mapped into the iommu
>+		 */
>+		table_group->ops->take_ownership(table_group);
>+		ret = 0;
>+	}
>+
>+	if (ret)
>+		goto unlock_exit;
>+
>+	container->grp = iommu_group;
>
> unlock_exit:
> 	mutex_unlock(&container->lock);
>@@ -533,7 +588,6 @@ static void tce_iommu_detach_group(void *iommu_data,
> {
> 	struct tce_container *container = iommu_data;
> 	struct iommu_table_group *table_group;
>-	struct iommu_table *tbl;
>
> 	mutex_lock(&container->lock);
> 	if (iommu_group != container->grp) {
>@@ -556,9 +610,11 @@ static void tce_iommu_detach_group(void *iommu_data,
> 	table_group = iommu_group_get_iommudata(iommu_group);
> 	BUG_ON(!table_group);
>
>-	tbl = table_group->tables[0];
>-	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>-	iommu_release_ownership(tbl);
>+	/* Kernel owns the device now, we can restore bypass */
>+	if (!table_group->ops || !table_group->ops->release_ownership)
>+		tce_iommu_release_ownership(container, table_group);
>+	else
>+		table_group->ops->release_ownership(table_group);
>
> unlock_exit:
> 	mutex_unlock(&container->lock);
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 19/34] powerpc/iommu: Fix IOMMU ownership control functions
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (17 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  3:36   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE Alexey Kardashevskiy
                   ` (14 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().
This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.
This only clears TCE content if there is no page marked busy in it_map.
Clearing must be done outside of the table locks as iommu_clear_tce()
called from iommu_clear_tces_and_put_pages() does this.
In order to use bitmap_empty(), the existing code clears bit#0 which
is set even in an empty table if it is bus-mapped at 0 as
iommu_init_table() reserves page#0 to prevent buggy drivers
from crashing when allocated page is bus-mapped at zero
(which is correct). This restores the bit in the case of failure
to bring the it_map to the state it was in when we called
iommu_take_ownership().
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* iommu_table_take_ownership() did not return @ret (and ignored EBUSY),
now it does return correct error.
* updated commit log about setting bit#0 in the case of failure
v5:
* do not store bit#0 value, it has to be set for zero-based table
anyway
* removed test_and_clear_bit
---
 arch/powerpc/kernel/iommu.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7e54714..6275164 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1032,31 +1032,51 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
-		return -EBUSY;
+		ret = -EBUSY;
+		/* Restore bit#0 set by iommu_init_table() */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
 	}
 
-	memset(tbl->it_map, 0xff, sz);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 void iommu_release_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
+
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 19/34] powerpc/iommu: Fix IOMMU ownership control functions
  2015-05-11 15:39 ` [PATCH kernel v10 19/34] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
@ 2015-05-14  3:36   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  3:36 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:08AM +1000, Alexey Kardashevskiy wrote:
>This adds missing locks in iommu_take_ownership()/
>iommu_release_ownership().
>
>This marks all pages busy in iommu_table::it_map in order to catch
>errors if there is an attempt to use this table while ownership over it
>is taken.
>
>This only clears TCE content if there is no page marked busy in it_map.
>Clearing must be done outside of the table locks as iommu_clear_tce()
>called from iommu_clear_tces_and_put_pages() does this.
>
>In order to use bitmap_empty(), the existing code clears bit#0 which
>is set even in an empty table if it is bus-mapped at 0 as
>iommu_init_table() reserves page#0 to prevent buggy drivers
>from crashing when allocated page is bus-mapped at zero
>(which is correct). This restores the bit in the case of failure
>to bring the it_map to the state it was in when we called
>iommu_take_ownership().
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v9:
>* iommu_table_take_ownership() did not return @ret (and ignored EBUSY),
>now it does return correct error.
>* updated commit log about setting bit#0 in the case of failure
>
>v5:
>* do not store bit#0 value, it has to be set for zero-based table
>anyway
>* removed test_and_clear_bit
>---
> arch/powerpc/kernel/iommu.c | 30 +++++++++++++++++++++++++-----
> 1 file changed, 25 insertions(+), 5 deletions(-)
>
>diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>index 7e54714..6275164 100644
>--- a/arch/powerpc/kernel/iommu.c
>+++ b/arch/powerpc/kernel/iommu.c
>@@ -1032,31 +1032,51 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
>
> int iommu_take_ownership(struct iommu_table *tbl)
> {
>-	unsigned long sz = (tbl->it_size + 7) >> 3;
>+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>+	int ret = 0;
>+
>+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>+	for (i = 0; i < tbl->nr_pools; i++)
>+		spin_lock(&tbl->pools[i].lock);
>
> 	if (tbl->it_offset == 0)
> 		clear_bit(0, tbl->it_map);
>
> 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
> 		pr_err("iommu_tce: it_map is not empty");
>-		return -EBUSY;
>+		ret = -EBUSY;
>+		/* Restore bit#0 set by iommu_init_table() */
>+		if (tbl->it_offset == 0)
>+			set_bit(0, tbl->it_map);
>+	} else {
>+		memset(tbl->it_map, 0xff, sz);
> 	}
>
>-	memset(tbl->it_map, 0xff, sz);
>+	for (i = 0; i < tbl->nr_pools; i++)
>+		spin_unlock(&tbl->pools[i].lock);
>+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>
>-	return 0;
>+	return ret;
> }
> EXPORT_SYMBOL_GPL(iommu_take_ownership);
>
> void iommu_release_ownership(struct iommu_table *tbl)
> {
>-	unsigned long sz = (tbl->it_size + 7) >> 3;
>+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>+
>+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>+	for (i = 0; i < tbl->nr_pools; i++)
>+		spin_lock(&tbl->pools[i].lock);
>
> 	memset(tbl->it_map, 0, sz);
>
> 	/* Restore bit#0 set by iommu_init_table() */
> 	if (tbl->it_offset == 0)
> 		set_bit(0, tbl->it_map);
>+
>+	for (i = 0; i < tbl->nr_pools; i++)
>+		spin_unlock(&tbl->pools[i].lock);
>+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
> }
> EXPORT_SYMBOL_GPL(iommu_release_ownership);
>
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (18 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 19/34] powerpc/iommu: Fix IOMMU ownership control functions Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  2:10   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups Alexey Kardashevskiy
                   ` (13 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
At the moment the DMA setup code looks for the "ibm,opal-tce-kill" property
which contains the TCE kill register address. Writes to this register
invalidate the TCE cache on the IODA/IODA2 hub.
This moves the register address from iommu_table to pnv_ioda_pe as:
1) When we get 2 tables per PE, this register will be used for both tables;
2) When we get TCE tables sharing, we will need to invalidate every
IOMMU group (i.e. PE) which is using this table and each PE has
its own invalidate register.
This moves the property reading/remapping code to a helper to reduce
code duplication. Although this change is not required for IODA1, this
changes it as well to reduce code duplication.
This adds a new pnv_pci_ioda2_tvt_invalidate() helper which invalidates
the entire table. It should be called after every call to
opal_pci_map_pe_dma_window(). It was not required before because
there is just a single TCE table and 64bit DMA is handled via bypass
window (which has no table so no cache is used) but this is going
to change with Dynamic DMA windows (DDW).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* fixed error from checkpatch.pl
* removed comment at "ibm,opal-tce-kill" parsing as irrelevant
* s/addr/val/ in pnv_pci_ioda2_tvt_invalidate() as it was not a kernel address
v9:
* new in the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 64 ++++++++++++++++++-------------
 arch/powerpc/platforms/powernv/pci.h      |  1 +
 2 files changed, 39 insertions(+), 26 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 35ab19c8..f972e40 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1680,7 +1680,7 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
+		pe->tce_inval_reg;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
@@ -1751,6 +1751,18 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.get = pnv_tce_get,
 };
 
+static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
+{
+	/* 01xb - invalidate TCEs that match the specified PE# */
+	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+
+	if (!pe->tce_inval_reg)
+		return;
+
+	mb(); /* Ensure above stores are visible */
+	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
+}
+
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
@@ -1762,7 +1774,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
+		pe->tce_inval_reg;
 	const unsigned shift = tbl->it_page_shift;
 
 	/* We'll invalidate DMA address in PE scope */
@@ -1814,13 +1826,26 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.get = pnv_tce_get,
 };
 
+static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb,
+		struct pnv_ioda_pe *pe)
+{
+	const __be64 *swinvp;
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (!swinvp)
+		return;
+
+	pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+	pe->tce_inval_reg = ioremap(pe->tce_inval_reg_phys, 8);
+}
+
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				      struct pnv_ioda_pe *pe, unsigned int base,
 				      unsigned int segs)
 {
 
 	struct page *tce_mem = NULL;
-	const __be64 *swinvp;
 	struct iommu_table *tbl;
 	unsigned int i;
 	int64_t rc;
@@ -1839,6 +1864,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 			pe->pe_number);
 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
+	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
+
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
@@ -1877,20 +1904,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);
 
 	/* OPAL variant of P7IOC SW invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
+	if (pe->tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
 				 TCE_PCI_SWINV_FREE   |
 				 TCE_PCI_SWINV_PAIR);
-	}
+
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
@@ -1976,7 +1994,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 {
 	struct page *tce_mem = NULL;
 	void *addr;
-	const __be64 *swinvp;
 	struct iommu_table *tbl;
 	unsigned int tce_table_size, end;
 	int64_t rc;
@@ -1993,6 +2010,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 			pe->pe_number);
 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
+	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
+
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
 	end = (1 << ilog2(phb->ioda.m32_pci_base));
@@ -2023,23 +2042,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		goto fail;
 	}
 
+	pnv_pci_ioda2_tvt_invalidate(pe);
+
 	/* Setup linux iommu table */
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
 			IOMMU_PAGE_SHIFT_4K);
 
 	/* OPAL variant of PHB3 invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
+	if (pe->tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-	}
+
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 87bdd4f..ea97de5 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -59,6 +59,7 @@ struct pnv_ioda_pe {
 	int			tce32_segcount;
 	struct iommu_table_group table_group;
 	phys_addr_t		tce_inval_reg_phys;
+	__be64 __iomem		*tce_inval_reg;
 
 	/* 64-bit TCE bypass region */
 	bool			tce_bypass_enabled;
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE
  2015-05-11 15:39 ` [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE Alexey Kardashevskiy
@ 2015-05-14  2:10   ` Gavin Shan
  2015-05-14  3:39     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  2:10 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:09AM +1000, Alexey Kardashevskiy wrote:
>At the moment the DMA setup code looks for the "ibm,opal-tce-kill" property
>which contains the TCE kill register address. Writes to this register
>invalidates TCE cache on IODA/IODA2 hub.
>
>This moves the register address from iommu_table to pnv_ioda_pe as:
>1) When we get 2 tables per PE, this register will be used for both tables;
>2) When we get TCE tables sharing, we will need to invalidate every
>IOMMU group (i.e. PE) which is using this table and each PE has
>its own invalidate register.
>
Actually, it's the virtual address of the I/O-remapped PHB hardware register,
so it would rather be a property of the PHB (struct pnv_phb). As the PE is
connected to the IOMMU table group, the virtual address can be retrieved via
the path: iommu_table -> iommu_table_group -> pnv_ioda_pe -> pnv_phb. However,
I don't insist; you are the best judge of it :-)
>This moves the property reading/remapping code to a helper to reduce
>code duplication. Although this change is not required for IODA1, this
>changes it as well to reduce code duplication.
>
>This adds a new pnv_pci_ioda2_tvt_invalidate() helper which invalidates
>the entire table. It should be called after every call to
>opal_pci_map_pe_dma_window(). It was not required before because
>there is just a single TCE table and 64bit DMA is handled via bypass
>window (which has no table so no chache is used) but this is going
>to change with Dynamic DMA windows (DDW).
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v10:
>* fixed error from checkpatch.pl
>* removed comment at "ibm,opal-tce-kill" parsing as irrelevant
>* s/addr/val/ in pnv_pci_ioda2_tvt_invalidate() as it was not a kernel address
>
>v9:
>* new in the series
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 64 ++++++++++++++++++-------------
> arch/powerpc/platforms/powernv/pci.h      |  1 +
> 2 files changed, 39 insertions(+), 26 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 35ab19c8..f972e40 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1680,7 +1680,7 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
> 			struct pnv_ioda_pe, table_group);
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>-		(__be64 __iomem *)tbl->it_index;
>+		pe->tce_inval_reg;
> 	unsigned long start, end, inc;
> 	const unsigned shift = tbl->it_page_shift;
>
>@@ -1751,6 +1751,18 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
> 	.get = pnv_tce_get,
> };
>
>+static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
>+{
>+	/* 01xb - invalidate TCEs that match the specified PE# */
>+	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
>+
>+	if (!pe->tce_inval_reg)
>+		return;
>+
>+	mb(); /* Ensure above stores are visible */
>+	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
>+}
>+
The function name sounds like it invalidates the TVE cache. Actually, it is
invalidating the TCE cache. So I guess the function name
pnv_pci_ioda2_tce_invalidate() would be more accurate.
> static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> 		unsigned long index, unsigned long npages, bool rm)
> {
>@@ -1762,7 +1774,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> 	unsigned long start, end, inc;
> 	__be64 __iomem *invalidate = rm ?
> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>-		(__be64 __iomem *)tbl->it_index;
>+		pe->tce_inval_reg;
> 	const unsigned shift = tbl->it_page_shift;
>
> 	/* We'll invalidate DMA address in PE scope */
>@@ -1814,13 +1826,26 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
> 	.get = pnv_tce_get,
> };
>
>+static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb,
>+		struct pnv_ioda_pe *pe)
>+{
>+	const __be64 *swinvp;
>+
>+	/* OPAL variant of PHB3 invalidated TCEs */
>+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
>+	if (!swinvp)
>+		return;
>+
>+	pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
>+	pe->tce_inval_reg = ioremap(pe->tce_inval_reg_phys, 8);
>+}
>+
Yeah, nice to have the helper function to initialize it :)
> static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 				      struct pnv_ioda_pe *pe, unsigned int base,
> 				      unsigned int segs)
> {
>
> 	struct page *tce_mem = NULL;
>-	const __be64 *swinvp;
> 	struct iommu_table *tbl;
> 	unsigned int i;
> 	int64_t rc;
>@@ -1839,6 +1864,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 			pe->pe_number);
> 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>
>+	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
>+
> 	/* Grab a 32-bit TCE table */
> 	pe->tce32_seg = base;
> 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
>@@ -1877,20 +1904,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 				  base << 28, IOMMU_PAGE_SHIFT_4K);
>
> 	/* OPAL variant of P7IOC SW invalidated TCEs */
>-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
>-	if (swinvp) {
>-		/* We need a couple more fields -- an address and a data
>-		 * to or.  Since the bus is only printed out on table free
>-		 * errors, and on the first pass the data will be a relative
>-		 * bus number, print that out instead.
>-		 */
>-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
>-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
>-				8);
>+	if (pe->tce_inval_reg)
> 		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
> 				 TCE_PCI_SWINV_FREE   |
> 				 TCE_PCI_SWINV_PAIR);
>-	}
>+
> 	tbl->it_ops = &pnv_ioda1_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
>
>@@ -1976,7 +1994,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> {
> 	struct page *tce_mem = NULL;
> 	void *addr;
>-	const __be64 *swinvp;
> 	struct iommu_table *tbl;
> 	unsigned int tce_table_size, end;
> 	int64_t rc;
>@@ -1993,6 +2010,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 			pe->pe_number);
> 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>
>+	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
>+
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
> 	end = (1 << ilog2(phb->ioda.m32_pci_base));
>@@ -2023,23 +2042,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 		goto fail;
> 	}
>
>+	pnv_pci_ioda2_tvt_invalidate(pe);
>+
> 	/* Setup linux iommu table */
> 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
> 			IOMMU_PAGE_SHIFT_4K);
>
> 	/* OPAL variant of PHB3 invalidated TCEs */
>-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
>-	if (swinvp) {
>-		/* We need a couple more fields -- an address and a data
>-		 * to or.  Since the bus is only printed out on table free
>-		 * errors, and on the first pass the data will be a relative
>-		 * bus number, print that out instead.
>-		 */
>-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
>-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
>-				8);
>+	if (pe->tce_inval_reg)
> 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
>-	}
>+
> 	tbl->it_ops = &pnv_ioda2_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
> #ifdef CONFIG_IOMMU_API
>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>index 87bdd4f..ea97de5 100644
>--- a/arch/powerpc/platforms/powernv/pci.h
>+++ b/arch/powerpc/platforms/powernv/pci.h
>@@ -59,6 +59,7 @@ struct pnv_ioda_pe {
> 	int			tce32_segcount;
> 	struct iommu_table_group table_group;
> 	phys_addr_t		tce_inval_reg_phys;
>+	__be64 __iomem		*tce_inval_reg;
>
> 	/* 64-bit TCE bypass region */
> 	bool			tce_bypass_enabled;
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE
  2015-05-14  2:10   ` Gavin Shan
@ 2015-05-14  3:39     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:39 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 12:10 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:09AM +1000, Alexey Kardashevskiy wrote:
>> At the moment the DMA setup code looks for the "ibm,opal-tce-kill" property
>> which contains the TCE kill register address. Writes to this register
>> invalidates TCE cache on IODA/IODA2 hub.
>>
>> This moves the register address from iommu_table to pnv_ioda_pe as:
>> 1) When we get 2 tables per PE, this register will be used for both tables;
>> 2) When we get TCE tables sharing, we will need to invalidate every
>> IOMMU group (i.e. PE) which is using this table and each PE has
>> its own invalidate register.
>>
>
> Actually, it's the virtual address of IO remapped PHB hardware register.
> So it would be a property of PHB (struct pnv_phb). As the PE is connecting
> with IOMMU table group. The virtual address can be retrieved by the path:
> iommu_table -> iommu_table_group -> pnv_ioda_pe -> pnv_phb. However, I
> don't insist and you have the best judge on it :-)
Are you suggesting moving pe->tce_inval_reg from pnv_ioda_pe to pnv_phb?
>
>> This moves the property reading/remapping code to a helper to reduce
>> code duplication. Although this change is not required for IODA1, this
>> changes it as well to reduce code duplication.
>>
>> This adds a new pnv_pci_ioda2_tvt_invalidate() helper which invalidates
>> the entire table. It should be called after every call to
>> opal_pci_map_pe_dma_window(). It was not required before because
>> there is just a single TCE table and 64bit DMA is handled via bypass
>> window (which has no table so no chache is used) but this is going
>> to change with Dynamic DMA windows (DDW).
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>
> Thanks,
> Gavin
>
>> ---
>> Changes:
>> v10:
>> * fixed error from checkpatch.pl
>> * removed comment at "ibm,opal-tce-kill" parsing as irrelevant
>> * s/addr/val/ in pnv_pci_ioda2_tvt_invalidate() as it was not a kernel address
>>
>> v9:
>> * new in the series
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 64 ++++++++++++++++++-------------
>> arch/powerpc/platforms/powernv/pci.h      |  1 +
>> 2 files changed, 39 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 35ab19c8..f972e40 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -1680,7 +1680,7 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
>> 			struct pnv_ioda_pe, table_group);
>> 	__be64 __iomem *invalidate = rm ?
>> 		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> -		(__be64 __iomem *)tbl->it_index;
>> +		pe->tce_inval_reg;
>> 	unsigned long start, end, inc;
>> 	const unsigned shift = tbl->it_page_shift;
>>
>> @@ -1751,6 +1751,18 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>> 	.get = pnv_tce_get,
>> };
>>
>> +static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
>> +{
>> +	/* 01xb - invalidate TCEs that match the specified PE# */
>> +	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
>> +
>> +	if (!pe->tce_inval_reg)
>> +		return;
>> +
>> +	mb(); /* Ensure above stores are visible */
>> +	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
>> +}
>> +
>
> The function name sounds it's to invalidate TVE cache. Actually, it's invalidting
> TCE cache. So I guess the function name pnv_pci_ioda2_tce_invalidate() would be
> more accurate.
The TVT vs. TVE distinction is not clear to me, sorry :)
There is already a function with exactly the name you proposed, below in
this mail; it invalidates the cache.
Maybe s/pnv_pci_ioda2_tvt_invalidate/pnv_pci_ioda2_invalidate_entire_cache/ ?
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (19 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  2:22   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 22/34] powerpc/powernv: Implement accessor to TCE entry Alexey Kardashevskiy
                   ` (12 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The iommu_table struct keeps a list of IOMMU groups it is used for.
At the moment there is just a single group attached but further
patches will add TCE table sharing. When sharing is enabled, the TCE cache
in each PE needs to be invalidated, which is what this patch does.
This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
to enable TCE table sharing on PHBs older than IODA2.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* new to the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 ++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f972e40..8e4987d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,6 +24,7 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1763,23 +1764,15 @@ static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
 	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-		unsigned long index, unsigned long npages, bool rm)
+static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
+		__be64 __iomem *invalidate, unsigned shift,
+		unsigned long index, unsigned long npages)
 {
-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
-			&tbl->it_group_list, struct iommu_table_group_link,
-			next);
-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
-			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
-	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		pe->tce_inval_reg;
-	const unsigned shift = tbl->it_page_shift;
 
 	/* We'll invalidate DMA address in PE scope */
 	start = 0x2ull << 60;
-	start |= (pe->pe_number & 0xFF);
+	start |= (pe_number & 0xFF);
 	end = start;
 
 	/* Figure out the start, end and step */
@@ -1797,6 +1790,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	}
 }
 
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
+{
+	struct iommu_table_group_link *tgl;
+
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+				struct pnv_ioda_pe, table_group);
+		__be64 __iomem *invalidate = rm ?
+			(__be64 __iomem *)pe->tce_inval_reg_phys :
+			pe->tce_inval_reg;
+
+		pnv_pci_ioda2_tce_do_invalidate(pe->pe_number, rm,
+			invalidate, tbl->it_page_shift,
+			index, npages);
+	}
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 		long npages, unsigned long uaddr,
 		enum dma_data_direction direction,
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups
  2015-05-11 15:39 ` [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups Alexey Kardashevskiy
@ 2015-05-14  2:22   ` Gavin Shan
  2015-05-14  3:50     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  2:22 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:10AM +1000, Alexey Kardashevskiy wrote:
>The iommu_table struct keeps a list of IOMMU groups it is used for.
>At the moment there is just a single group attached but further
>patches will add TCE table sharing. When sharing is enabled, TCE cache
>in each PE needs to be invalidated so does the patch.
>
>This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
>to enable TCE table sharing on PHBs older than IODA2.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>---
>Changes:
>v10:
>* new to the series
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 35 ++++++++++++++++++++-----------
> 1 file changed, 23 insertions(+), 12 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index f972e40..8e4987d 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -24,6 +24,7 @@
> #include <linux/msi.h>
> #include <linux/memblock.h>
> #include <linux/iommu.h>
>+#include <linux/rculist.h>
>
> #include <asm/sections.h>
> #include <asm/io.h>
>@@ -1763,23 +1764,15 @@ static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
> 	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
> }
>
>-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>-		unsigned long index, unsigned long npages, bool rm)
>+static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
>+		__be64 __iomem *invalidate, unsigned shift,
>+		unsigned long index, unsigned long npages)
> {
>-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>-			&tbl->it_group_list, struct iommu_table_group_link,
>-			next);
>-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>-			struct pnv_ioda_pe, table_group);
> 	unsigned long start, end, inc;
>-	__be64 __iomem *invalidate = rm ?
>-		(__be64 __iomem *)pe->tce_inval_reg_phys :
>-		pe->tce_inval_reg;
>-	const unsigned shift = tbl->it_page_shift;
>
> 	/* We'll invalidate DMA address in PE scope */
> 	start = 0x2ull << 60;
>-	start |= (pe->pe_number & 0xFF);
>+	start |= (pe_number & 0xFF);
> 	end = start;
>
> 	/* Figure out the start, end and step */
>@@ -1797,6 +1790,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
> 	}
> }
>
>+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>+		unsigned long index, unsigned long npages, bool rm)
>+{
>+	struct iommu_table_group_link *tgl;
>+
>+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
>+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>+				struct pnv_ioda_pe, table_group);
>+		__be64 __iomem *invalidate = rm ?
>+			(__be64 __iomem *)pe->tce_inval_reg_phys :
>+			pe->tce_inval_reg;
>+
>+		pnv_pci_ioda2_tce_do_invalidate(pe->pe_number, rm,
>+			invalidate, tbl->it_page_shift,
>+			index, npages);
>+	}
>+}
>+
I don't understand this well and need a teaching session: One IOMMU
table can be connected with multiple IOMMU table groups, each of them
can be regarded as being equal to one PE. It means one IOMMU table
can be shared by two PEs. There must be something I missed.
Could you give a teaching session with an example about the IOMMU
table sharing? :-)
Thanks,
Gavin
> static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
> 		long npages, unsigned long uaddr,
> 		enum dma_data_direction direction,
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups
  2015-05-14  2:22   ` Gavin Shan
@ 2015-05-14  3:50     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:50 UTC (permalink / raw)
  To: Gavin Shan
  Cc: Wei Yang, linux-kernel, Alex Williamson, Paul Mackerras,
	linuxppc-dev, David Gibson
On 05/14/2015 12:22 PM, Gavin Shan wrote:
> On Tue, May 12, 2015 at 01:39:10AM +1000, Alexey Kardashevskiy wrote:
>> The iommu_table struct keeps a list of IOMMU groups it is used for.
>> At the moment there is just a single group attached but further
>> patches will add TCE table sharing. When sharing is enabled, TCE cache
>> in each PE needs to be invalidated so does the patch.
>>
>> This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
>> to enable TCE table sharing on PHBs older than IODA2.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v10:
>> * new to the series
>> ---
>> arch/powerpc/platforms/powernv/pci-ioda.c | 35 ++++++++++++++++++++-----------
>> 1 file changed, 23 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index f972e40..8e4987d 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -24,6 +24,7 @@
>> #include <linux/msi.h>
>> #include <linux/memblock.h>
>> #include <linux/iommu.h>
>> +#include <linux/rculist.h>
>>
>> #include <asm/sections.h>
>> #include <asm/io.h>
>> @@ -1763,23 +1764,15 @@ static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
>> 	__raw_writeq(cpu_to_be64(val), pe->tce_inval_reg);
>> }
>>
>> -static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>> -		unsigned long index, unsigned long npages, bool rm)
>> +static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
>> +		__be64 __iomem *invalidate, unsigned shift,
>> +		unsigned long index, unsigned long npages)
>> {
>> -	struct iommu_table_group_link *tgl = list_first_entry_or_null(
>> -			&tbl->it_group_list, struct iommu_table_group_link,
>> -			next);
>> -	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>> -			struct pnv_ioda_pe, table_group);
>> 	unsigned long start, end, inc;
>> -	__be64 __iomem *invalidate = rm ?
>> -		(__be64 __iomem *)pe->tce_inval_reg_phys :
>> -		pe->tce_inval_reg;
>> -	const unsigned shift = tbl->it_page_shift;
>>
>> 	/* We'll invalidate DMA address in PE scope */
>> 	start = 0x2ull << 60;
>> -	start |= (pe->pe_number & 0xFF);
>> +	start |= (pe_number & 0xFF);
>> 	end = start;
>>
>> 	/* Figure out the start, end and step */
>> @@ -1797,6 +1790,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>> 	}
>> }
>>
>> +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>> +		unsigned long index, unsigned long npages, bool rm)
>> +{
>> +	struct iommu_table_group_link *tgl;
>> +
>> +	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
>> +		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
>> +				struct pnv_ioda_pe, table_group);
>> +		__be64 __iomem *invalidate = rm ?
>> +			(__be64 __iomem *)pe->tce_inval_reg_phys :
>> +			pe->tce_inval_reg;
>> +
>> +		pnv_pci_ioda2_tce_do_invalidate(pe->pe_number, rm,
>> +			invalidate, tbl->it_page_shift,
>> +			index, npages);
>> +	}
>> +}
>> +
>
> I don't understand this well and need a teaching session: One IOMMU
> table can be connected with multiple IOMMU table groups, each of them
> can be regarded as being equal to one PE. It means one IOMMU table
> can be shared by two PEs. There must be something I missed.
No, this is correct.
> Could you give a teaching session with an example about the IOMMU
> table sharing? :-)
If you do not share tables and you have multiple IOMMU groups passed to 
QEMU, and all actual devices are capable of 64bit DMA, and you have 
multiple PHBs in QEMU (each backed with a 64bit TCE table which is updated 
once at the boot time and never changes) - all these tables will have 
exactly the same content.
Another thing is if you do not want to have multiple PHBs in QEMU, and you 
do not have tables sharing, every H_PUT_TCE request would have to update 
each group's TCE table, not just one. Not very fast approach.
So it seems a useful thing. If you do not want sharing, just add another 
virtual PHB and put vfio-pci devices onto it.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 22/34] powerpc/powernv: Implement accessor to TCE entry
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (20 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  2:34   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
                   ` (11 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This replaces direct accesses to the TCE table with a helper which
returns a TCE entry address. This does not make a difference now but will
when multi-level TCE tables get introduced.
No change in behavior is expected.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* new patch in the series to separate this mechanical change from
functional changes; this is not right before
"powerpc/powernv: Implement multilevel TCE tables" but here in order
to let the next patch - "powerpc/iommu/powernv: Release replaced TCE" -
use pnv_tce() and avoid changing the same code twice
---
 arch/powerpc/platforms/powernv/pci.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index ed7de7b..cc82f05 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = {
 	.write = pnv_pci_write_config,
 };
 
+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+{
+	__be64 *tmp = ((__be64 *)tbl->it_base);
+
+	return tmp + idx;
+}
+
 int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
-	__be64 *tcep;
-	u64 rpn;
+	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+	long i;
 
-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
-	rpn = __pa(uaddr) >> tbl->it_page_shift;
-
-	while (npages--)
-		*(tcep++) = cpu_to_be64(proto_tce |
-				(rpn++ << tbl->it_page_shift));
+	for (i = 0; i < npages; i++) {
+		unsigned long newtce = proto_tce |
+			((rpn + i) << tbl->it_page_shift);
+		unsigned long idx = index - tbl->it_offset + i;
 
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
+	}
 
 	return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-	__be64 *tcep;
+	long i;
 
-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	for (i = 0; i < npages; i++) {
+		unsigned long idx = index - tbl->it_offset + i;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(0);
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(0);
+	}
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+	return *(pnv_tce(tbl, index - tbl->it_offset));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 22/34] powerpc/powernv: Implement accessor to TCE entry
  2015-05-11 15:39 ` [PATCH kernel v10 22/34] powerpc/powernv: Implement accessor to TCE entry Alexey Kardashevskiy
@ 2015-05-14  2:34   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  2:34 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:11AM +1000, Alexey Kardashevskiy wrote:
>This replaces direct accesses to the TCE table with a helper which
>returns a TCE entry address. This does not make a difference now but will
>when multi-level TCE tables get introduced.
>
>No change in behavior is expected.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v9:
>* new patch in the series to separate this mechanical change from
>functional changes; this is not right before
>"powerpc/powernv: Implement multilevel TCE tables" but here in order
>to let the next patch - "powerpc/iommu/powernv: Release replaced TCE" -
>use pnv_tce() and avoid changing the same code twice
>---
> arch/powerpc/platforms/powernv/pci.c | 34 +++++++++++++++++++++-------------
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>index ed7de7b..cc82f05 100644
>--- a/arch/powerpc/platforms/powernv/pci.c
>+++ b/arch/powerpc/platforms/powernv/pci.c
>@@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = {
> 	.write = pnv_pci_write_config,
> };
>
>+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
>+{
>+	__be64 *tmp = ((__be64 *)tbl->it_base);
>+
>+	return tmp + idx;
>+}
>+
> int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> 		unsigned long uaddr, enum dma_data_direction direction,
> 		struct dma_attrs *attrs)
> {
> 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
>-	__be64 *tcep;
>-	u64 rpn;
>+	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
>+	long i;
>
>-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>-	rpn = __pa(uaddr) >> tbl->it_page_shift;
>-
>-	while (npages--)
>-		*(tcep++) = cpu_to_be64(proto_tce |
>-				(rpn++ << tbl->it_page_shift));
>+	for (i = 0; i < npages; i++) {
>+		unsigned long newtce = proto_tce |
>+			((rpn + i) << tbl->it_page_shift);
>+		unsigned long idx = index - tbl->it_offset + i;
>
>+		*(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
>+	}
>
> 	return 0;
> }
>
> void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
> {
>-	__be64 *tcep;
>+	long i;
>
>-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
>+	for (i = 0; i < npages; i++) {
>+		unsigned long idx = index - tbl->it_offset + i;
>
>-	while (npages--)
>-		*(tcep++) = cpu_to_be64(0);
>+		*(pnv_tce(tbl, idx)) = cpu_to_be64(0);
>+	}
> }
>
> unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
> {
>-	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
>+	return *(pnv_tce(tbl, index - tbl->it_offset));
> }
>
> struct iommu_table *pnv_pci_table_alloc(int nid)
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (21 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 22/34] powerpc/powernv: Implement accessor to TCE entry Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13 15:00   ` Thomas Huth
  2015-05-11 15:39 ` [PATCH kernel v10 24/34] powerpc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
                   ` (10 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
At the moment writing a new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However, the PAPR specification allows
the guest to write a new TCE value without clearing the old one first.
Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.
This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards. The exchange() receives
a physical address unlike set() which receives linear mapping address;
and returns a physical address as the clear() does.
This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.
This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().
This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from
DMA direction.
This moves SetPageDirty() to the IOMMU code to make it work for both
the VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
available later).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
v10:
* did s/tce/hpa/ in iommu_table_ops::exchange and tce_iommu_unuse_page()
* removed permission bits check from iommu_tce_put_param_check as
permission bits are not allowed in the address
* added BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)) to pnv_tce_xchg()
v9:
* changed exchange() to work with physical addresses as these addresses
are never accessed by the code and physical addresses are actual values
we put into the IOMMU table
---
 arch/powerpc/include/asm/iommu.h            | 22 +++++++++--
 arch/powerpc/kernel/iommu.c                 | 59 +++++++++-------------------
 arch/powerpc/platforms/powernv/pci-ioda.c   | 34 ++++++++++++++++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++
 arch/powerpc/platforms/powernv/pci.c        | 18 +++++++++
 arch/powerpc/platforms/powernv/pci.h        |  2 +
 drivers/vfio/vfio_iommu_spapr_tce.c         | 60 +++++++++++++++++------------
 7 files changed, 130 insertions(+), 68 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index c5375c5..d4ad118 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -45,13 +45,29 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+	/*
+	 * When called with direction==DMA_NONE, it is equal to clear().
+	 * uaddr is a linear map address.
+	 */
 	int (*set)(struct iommu_table *tbl,
 			long index, long npages,
 			unsigned long uaddr,
 			enum dma_data_direction direction,
 			struct dma_attrs *attrs);
+#ifdef CONFIG_IOMMU_API
+	/*
+	 * Exchanges existing TCE with new TCE plus direction bits;
+	 * returns old TCE and DMA direction mask.
+	 * @tce is a physical address.
+	 */
+	int (*exchange)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
+#endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
+	/* get() returns a physical address */
 	unsigned long (*get)(struct iommu_table *tbl, long index);
 	void (*flush)(struct iommu_table *tbl);
 };
@@ -155,6 +171,8 @@ extern void iommu_register_group(struct iommu_table_group *table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
@@ -227,10 +245,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-		unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 6275164..1287d49 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -962,10 +962,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
 int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce)
 {
-	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		return -EINVAL;
-
-	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
 	if (ioba & ~IOMMU_PAGE_MASK(tbl))
@@ -982,44 +979,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 
-unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
+	long ret;
 
-	spin_lock(&(pool->lock));
+	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
-	oldtce = tbl->it_ops->get(tbl, entry);
-	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		tbl->it_ops->clear(tbl, entry, 1);
-	else
-		oldtce = 0;
-
-	spin_unlock(&(pool->lock));
-
-	return oldtce;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tce);
-
-/*
- * hwaddr is a kernel virtual address here (0xc... bazillion),
- * tce_build converts it to a physical address.
- */
-int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction)
-{
-	int ret = -EBUSY;
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
-
-	spin_lock(&(pool->lock));
-
-	oldtce = tbl->it_ops->get(tbl, entry);
-	/* Add new entry if it is not busy */
-	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
-
-	spin_unlock(&(pool->lock));
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL)))
+		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
@@ -1028,13 +997,23 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_build);
+EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
 	int ret = 0;
 
+	/*
+	 * VFIO does not control TCE entries allocation and the guest
+	 * can write new TCEs on top of existing ones so iommu_tce_build()
+	 * must be able to release old pages. This functionality
+	 * requires exchange() callback defined so if it is not
+	 * implemented, we disallow taking ownership over the table.
+	 */
+	if (!tbl->it_ops->exchange)
+		return -EINVAL;
+
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8e4987d..e3c784d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1737,6 +1737,20 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -1748,6 +1762,9 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda1_tce_xchg,
+#endif
 	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
 };
@@ -1822,6 +1839,20 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -1833,6 +1864,9 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda2_tce_xchg,
+#endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
 };
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index b524b17..94c880c 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -85,6 +85,9 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 
 static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
 	.set = pnv_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_tce_xchg,
+#endif
 	.clear = pnv_tce_free,
 	.get = pnv_tce_get,
 };
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index cc82f05..fd14e2c 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -598,6 +598,24 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	return 0;
 }
 
+#ifdef CONFIG_IOMMU_API
+int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *hpa | proto_tce, oldtce;
+	unsigned long idx = index - tbl->it_offset;
+
+	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
+
+	oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
+	*hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	*direction = iommu_tce_direction(oldtce);
+
+	return 0;
+}
+#endif
+
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
 	long i;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index ea97de5..3a72e45 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -206,6 +206,8 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs);
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 2ead291..0724ec8 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data)
 }
 
 static void tce_iommu_unuse_page(struct tce_container *container,
-		unsigned long oldtce)
+		unsigned long hpa)
 {
 	struct page *page;
 
-	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
-		return;
-
-	page = pfn_to_page(oldtce >> PAGE_SHIFT);
-
-	if (oldtce & TCE_PCI_WRITE)
-		SetPageDirty(page);
-
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
 	put_page(page);
 }
 
@@ -255,14 +248,21 @@ static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
 {
-	unsigned long oldtce;
+	unsigned long oldhpa;
+	long ret;
+	enum dma_data_direction direction;
 
 	for ( ; pages; --pages, ++entry) {
-		oldtce = iommu_clear_tce(tbl, entry);
-		if (!oldtce)
+		direction = DMA_NONE;
+		oldhpa = 0;
+		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
+		if (ret)
 			continue;
 
-		tce_iommu_unuse_page(container, oldtce);
+		if (direction == DMA_NONE)
+			continue;
+
+		tce_iommu_unuse_page(container, oldhpa);
 	}
 
 	return 0;
@@ -284,12 +284,13 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 
 static long tce_iommu_build(struct tce_container *container,
 		struct iommu_table *tbl,
-		unsigned long entry, unsigned long tce, unsigned long pages)
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
 {
 	long i, ret = 0;
 	struct page *page;
 	unsigned long hpa;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
+	enum dma_data_direction dirtmp;
 
 	for (i = 0; i < pages; ++i) {
 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
@@ -305,8 +306,8 @@ static long tce_iommu_build(struct tce_container *container,
 		}
 
 		hpa |= offset;
-		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
-				direction);
+		dirtmp = direction;
+		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
 		if (ret) {
 			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -314,6 +315,10 @@ static long tce_iommu_build(struct tce_container *container,
 					tce, ret);
 			break;
 		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page(container, hpa);
+
 		tce += IOMMU_PAGE_SIZE(tbl);
 	}
 
@@ -378,8 +383,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
 		struct iommu_table *tbl = NULL;
-		unsigned long tce;
 		long num;
+		enum dma_data_direction direction;
 
 		if (!container->enabled)
 			return -EPERM;
@@ -405,19 +410,26 @@ static long tce_iommu_ioctl(void *iommu_data,
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
-		tce = param.vaddr;
 		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
-			tce |= TCE_PCI_READ;
-		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
-			tce |= TCE_PCI_WRITE;
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_BIDIRECTIONAL;
+			else
+				direction = DMA_TO_DEVICE;
+		else
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_FROM_DEVICE;
+			else
+				return -EINVAL;
 
-		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 		if (ret)
 			return ret;
 
 		ret = tce_iommu_build(container, tbl,
 				param.iova >> tbl->it_page_shift,
-				tce, param.size >> tbl->it_page_shift);
+				param.vaddr,
+				param.size >> tbl->it_page_shift,
+				direction);
 
 		iommu_flush_tce(tbl);
 
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE
  2015-05-11 15:39 ` [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
@ 2015-05-13 15:00   ` Thomas Huth
  2015-05-14  3:53     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Thomas Huth @ 2015-05-13 15:00 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, 12 May 2015 01:39:12 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> At the moment writing new TCE value to the IOMMU table fails with EBUSY
> if there is a valid entry already. However PAPR specification allows
> the guest to write new TCE value without clearing it first.
> 
> Another problem this patch is addressing is the use of pool locks for
> external IOMMU users such as VFIO. The pool locks are to protect
> DMA page allocator rather than entries and since the host kernel does
> not control what pages are in use, there is no point in pool locks and
> exchange()+put_page(oldtce) is sufficient to avoid possible races.
> 
> This adds an exchange() callback to iommu_table_ops which does the same
> thing as set() plus it returns replaced TCE and DMA direction so
> the caller can release the pages afterwards. The exchange() receives
> a physical address unlike set() which receives linear mapping address;
> and returns a physical address as the clear() does.
> 
> This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
> for a platform to have exchange() implemented in order to support VFIO.
> 
> This replaces iommu_tce_build() and iommu_clear_tce() with
> a single iommu_tce_xchg().
> 
> This makes sure that TCE permission bits are not set in TCE passed to
> IOMMU API as those are to be calculated by platform code from
> DMA direction.
> 
> This moves SetPageDirty() to the IOMMU code to make it work for both
> VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
> available later).
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> [aw: for the vfio related changes]
> Acked-by: Alex Williamson <alex.williamson@redhat.com>
[...]
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 6275164..1287d49 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -962,10 +962,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
>  int iommu_tce_put_param_check(struct iommu_table *tbl,
>  		unsigned long ioba, unsigned long tce)
>  {
> -	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> -		return -EINVAL;
> -
> -	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
> +	if (tce & ~IOMMU_PAGE_MASK(tbl))
>  		return -EINVAL;
>  
>  	if (ioba & ~IOMMU_PAGE_MASK(tbl))
> @@ -982,44 +979,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
>  }
>  EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
>  
> -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
> +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long *hpa, enum dma_data_direction *direction)
>  {
> -	unsigned long oldtce;
> -	struct iommu_pool *pool = get_pool(tbl, entry);
> +	long ret;
>  
> -	spin_lock(&(pool->lock));
> +	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
>  
> -	oldtce = tbl->it_ops->get(tbl, entry);
> -	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
> -		tbl->it_ops->clear(tbl, entry, 1);
> -	else
> -		oldtce = 0;
> -
> -	spin_unlock(&(pool->lock));
> -
> -	return oldtce;
> -}
> -EXPORT_SYMBOL_GPL(iommu_clear_tce);
> -
> -/*
> - * hwaddr is a kernel virtual address here (0xc... bazillion),
> - * tce_build converts it to a physical address.
> - */
> -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long hwaddr, enum dma_data_direction direction)
> -{
> -	int ret = -EBUSY;
> -	unsigned long oldtce;
> -	struct iommu_pool *pool = get_pool(tbl, entry);
> -
> -	spin_lock(&(pool->lock));
> -
> -	oldtce = tbl->it_ops->get(tbl, entry);
> -	/* Add new entry if it is not busy */
> -	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> -		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
> -
> -	spin_unlock(&(pool->lock));
> +	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> +			(*direction == DMA_BIDIRECTIONAL)))
You could drop some of the parentheses:
	if (!ret && (*direction == DMA_FROM_DEVICE ||
			*direction == DMA_BIDIRECTIONAL))
> +		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
>  
>  	/* if (unlikely(ret))
>  		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
[...]
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 2ead291..0724ec8 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data)
[...]
> @@ -405,19 +410,26 @@ static long tce_iommu_ioctl(void *iommu_data,
>  			return -EINVAL;
>  
>  		/* iova is checked by the IOMMU API */
> -		tce = param.vaddr;
>  		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> -			tce |= TCE_PCI_READ;
> -		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> -			tce |= TCE_PCI_WRITE;
> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> +				direction = DMA_BIDIRECTIONAL;
> +			else
> +				direction = DMA_TO_DEVICE;
> +		else
> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> +				direction = DMA_FROM_DEVICE;
> +			else
> +				return -EINVAL;
IMHO some curly braces for the outer if-statement would be really fine
here.
> -		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
> +		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
>  		if (ret)
>  			return ret;
>  
>  		ret = tce_iommu_build(container, tbl,
>  				param.iova >> tbl->it_page_shift,
> -				tce, param.size >> tbl->it_page_shift);
> +				param.vaddr,
> +				param.size >> tbl->it_page_shift,
> +				direction);
>  
>  		iommu_flush_tce(tbl);
>  
 Thomas
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE
  2015-05-13 15:00   ` Thomas Huth
@ 2015-05-14  3:53     ` Alexey Kardashevskiy
  2015-05-15  8:09       ` Thomas Huth
  0 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  3:53 UTC (permalink / raw)
  To: Thomas Huth
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On 05/14/2015 01:00 AM, Thomas Huth wrote:
> On Tue, 12 May 2015 01:39:12 +1000
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
>> At the moment writing new TCE value to the IOMMU table fails with EBUSY
>> if there is a valid entry already. However PAPR specification allows
>> the guest to write new TCE value without clearing it first.
>>
>> Another problem this patch is addressing is the use of pool locks for
>> external IOMMU users such as VFIO. The pool locks are to protect
>> DMA page allocator rather than entries and since the host kernel does
>> not control what pages are in use, there is no point in pool locks and
>> exchange()+put_page(oldtce) is sufficient to avoid possible races.
>>
>> This adds an exchange() callback to iommu_table_ops which does the same
>> thing as set() plus it returns replaced TCE and DMA direction so
>> the caller can release the pages afterwards. The exchange() receives
>> a physical address unlike set() which receives linear mapping address;
>> and returns a physical address as the clear() does.
>>
>> This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
>> for a platform to have exchange() implemented in order to support VFIO.
>>
>> This replaces iommu_tce_build() and iommu_clear_tce() with
>> a single iommu_tce_xchg().
>>
>> This makes sure that TCE permission bits are not set in TCE passed to
>> IOMMU API as those are to be calculated by platform code from
>> DMA direction.
>>
>> This moves SetPageDirty() to the IOMMU code to make it work for both
>> VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
>> available later).
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> [aw: for the vfio related changes]
>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
> [...]
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 6275164..1287d49 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -962,10 +962,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
>>   int iommu_tce_put_param_check(struct iommu_table *tbl,
>>   		unsigned long ioba, unsigned long tce)
>>   {
>> -	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>> -		return -EINVAL;
>> -
>> -	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
>> +	if (tce & ~IOMMU_PAGE_MASK(tbl))
>>   		return -EINVAL;
>>
>>   	if (ioba & ~IOMMU_PAGE_MASK(tbl))
>> @@ -982,44 +979,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
>>   }
>>   EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
>>
>> -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
>> +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long *hpa, enum dma_data_direction *direction)
>>   {
>> -	unsigned long oldtce;
>> -	struct iommu_pool *pool = get_pool(tbl, entry);
>> +	long ret;
>>
>> -	spin_lock(&(pool->lock));
>> +	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
>>
>> -	oldtce = tbl->it_ops->get(tbl, entry);
>> -	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
>> -		tbl->it_ops->clear(tbl, entry, 1);
>> -	else
>> -		oldtce = 0;
>> -
>> -	spin_unlock(&(pool->lock));
>> -
>> -	return oldtce;
>> -}
>> -EXPORT_SYMBOL_GPL(iommu_clear_tce);
>> -
>> -/*
>> - * hwaddr is a kernel virtual address here (0xc... bazillion),
>> - * tce_build converts it to a physical address.
>> - */
>> -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
>> -		unsigned long hwaddr, enum dma_data_direction direction)
>> -{
>> -	int ret = -EBUSY;
>> -	unsigned long oldtce;
>> -	struct iommu_pool *pool = get_pool(tbl, entry);
>> -
>> -	spin_lock(&(pool->lock));
>> -
>> -	oldtce = tbl->it_ops->get(tbl, entry);
>> -	/* Add new entry if it is not busy */
>> -	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>> -		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
>> -
>> -	spin_unlock(&(pool->lock));
>> +	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
>> +			(*direction == DMA_BIDIRECTIONAL)))
>
> You could drop some of the parentheses:
>
> 	if (!ret && (*direction == DMA_FROM_DEVICE ||
> 			*direction == DMA_BIDIRECTIONAL))
I really (really) like braces. Is there any kernel code design rule against it?
>
>> +		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
>>
>>   	/* if (unlikely(ret))
>>   		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
> [...]
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index 2ead291..0724ec8 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data)
> [...]
>> @@ -405,19 +410,26 @@ static long tce_iommu_ioctl(void *iommu_data,
>>   			return -EINVAL;
>>
>>   		/* iova is checked by the IOMMU API */
>> -		tce = param.vaddr;
>>   		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> -			tce |= TCE_PCI_READ;
>> -		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> -			tce |= TCE_PCI_WRITE;
>> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> +				direction = DMA_BIDIRECTIONAL;
>> +			else
>> +				direction = DMA_TO_DEVICE;
>> +		else
>> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> +				direction = DMA_FROM_DEVICE;
>> +			else
>> +				return -EINVAL;
>
> IMHO some curly braces for the outer if-statement would be really fine
> here.
I believe checkpatch.pl won't like it. There is a check against single 
lines having braces after "if" statements.
>
>> -		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
>> +		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
>>   		if (ret)
>>   			return ret;
>>
>>   		ret = tce_iommu_build(container, tbl,
>>   				param.iova >> tbl->it_page_shift,
>> -				tce, param.size >> tbl->it_page_shift);
>> +				param.vaddr,
>> +				param.size >> tbl->it_page_shift,
>> +				direction);
>>
>>   		iommu_flush_tce(tbl);
>>
>
>   Thomas
>
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE
  2015-05-14  3:53     ` Alexey Kardashevskiy
@ 2015-05-15  8:09       ` Thomas Huth
  0 siblings, 0 replies; 82+ messages in thread
From: Thomas Huth @ 2015-05-15  8:09 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Thu, 14 May 2015 13:53:57 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> On 05/14/2015 01:00 AM, Thomas Huth wrote:
> > On Tue, 12 May 2015 01:39:12 +1000
> > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
...
> >> -/*
> >> - * hwaddr is a kernel virtual address here (0xc... bazillion),
> >> - * tce_build converts it to a physical address.
> >> - */
> >> -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
> >> -		unsigned long hwaddr, enum dma_data_direction direction)
> >> -{
> >> -	int ret = -EBUSY;
> >> -	unsigned long oldtce;
> >> -	struct iommu_pool *pool = get_pool(tbl, entry);
> >> -
> >> -	spin_lock(&(pool->lock));
> >> -
> >> -	oldtce = tbl->it_ops->get(tbl, entry);
> >> -	/* Add new entry if it is not busy */
> >> -	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >> -		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
> >> -
> >> -	spin_unlock(&(pool->lock));
> >> +	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> >> +			(*direction == DMA_BIDIRECTIONAL)))
> >
> > You could drop some of the parentheses:
> >
> > 	if (!ret && (*direction == DMA_FROM_DEVICE ||
> > 			*direction == DMA_BIDIRECTIONAL))
> 
> I really (really) like braces. Is there any kernel code design rule against it?
I don't think so ... but for me it's rather the other way round: If I
see too many braces, I always wonder whether there is a reason for it in
the sense that I did not understand the statement right at the first
glance. Additionally, this is something that Pascal programmers like to
do, so IMHO this just looks ugly in C.
> >> @@ -405,19 +410,26 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>   			return -EINVAL;
> >>
> >>   		/* iova is checked by the IOMMU API */
> >> -		tce = param.vaddr;
> >>   		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> >> -			tce |= TCE_PCI_READ;
> >> -		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> -			tce |= TCE_PCI_WRITE;
> >> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> +				direction = DMA_BIDIRECTIONAL;
> >> +			else
> >> +				direction = DMA_TO_DEVICE;
> >> +		else
> >> +			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> +				direction = DMA_FROM_DEVICE;
> >> +			else
> >> +				return -EINVAL;
> >
> > IMHO some curly braces for the outer if-statement would be really fine
> > here.
> 
> I believe checkpatch.pl won't like it. There is a check against single 
> lines having braces after "if" statements.
If you write your code like this (I was only talking about the outer
braces!):
	if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
			direction = DMA_BIDIRECTIONAL;
		else
			direction = DMA_TO_DEVICE;
	} else {
		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
			direction = DMA_FROM_DEVICE;
		else
			return -EINVAL;
	}
... then checkpatch should not complain, as far as I know - in this
case, the braces include three lines, don't they?
 Thomas
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
 
- * [PATCH kernel v10 24/34] powerpc/powernv/ioda2: Rework iommu_table creation
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (22 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 23/34] powerpc/iommu/powernv: Release replaced TCE Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  4:14   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages Alexey Kardashevskiy
                   ` (9 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This moves iommu_table creation to the beginning to make following changes
easier to review. This starts using table parameters from the iommu_table
struct.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* updated commit log and did minor cleanup
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e3c784d..9b80b74 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2074,13 +2074,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	addr = page_address(tce_mem);
 	memset(addr, 0, tce_table_size);
 
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+			IOMMU_PAGE_SHIFT_4K);
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	iommu_init_table(tbl, phb->hose->node);
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
+
 	/*
 	 * Map TCE table through TVT. The TVE index is the PE number
 	 * shifted by 1 bit for 32-bits DMA space.
 	 */
 	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-					pe->pe_number << 1, 1, __pa(addr),
-					tce_table_size, 0x1000);
+			pe->pe_number << 1, 1, __pa(tbl->it_base),
+			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table,"
 		       " err %ld\n", rc);
@@ -2089,20 +2099,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	pnv_pci_ioda2_tvt_invalidate(pe);
 
-	/* Setup linux iommu table */
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
-
 	/* OPAL variant of PHB3 invalidated TCEs */
 	if (pe->tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
-#ifdef CONFIG_IOMMU_API
-	pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
-
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		/*
 		 * Setting table base here only for carrying iommu_group
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 24/34] powerpc/powernv/ioda2: Rework iommu_table creation
  2015-05-11 15:39 ` [PATCH kernel v10 24/34] powerpc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
@ 2015-05-14  4:14   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  4:14 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:13AM +1000, Alexey Kardashevskiy wrote:
>This moves iommu_table creation to the beginning to make following changes
>easier to review. This starts using table parameters from the iommu_table
>struct.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Thanks,
Gavin
>---
>Changes:
>v9:
>* updated commit log and did minor cleanup
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 24 ++++++++++++------------
> 1 file changed, 12 insertions(+), 12 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index e3c784d..9b80b74 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -2074,13 +2074,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	addr = page_address(tce_mem);
> 	memset(addr, 0, tce_table_size);
>
>+	/* Setup linux iommu table */
>+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
>+			IOMMU_PAGE_SHIFT_4K);
>+
>+	tbl->it_ops = &pnv_ioda2_iommu_ops;
>+	iommu_init_table(tbl, phb->hose->node);
>+#ifdef CONFIG_IOMMU_API
>+	pe->table_group.ops = &pnv_pci_ioda2_ops;
>+#endif
>+
> 	/*
> 	 * Map TCE table through TVT. The TVE index is the PE number
> 	 * shifted by 1 bit for 32-bits DMA space.
> 	 */
> 	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
>-					pe->pe_number << 1, 1, __pa(addr),
>-					tce_table_size, 0x1000);
>+			pe->pe_number << 1, 1, __pa(tbl->it_base),
>+			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
> 	if (rc) {
> 		pe_err(pe, "Failed to configure 32-bit TCE table,"
> 		       " err %ld\n", rc);
>@@ -2089,20 +2099,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>
> 	pnv_pci_ioda2_tvt_invalidate(pe);
>
>-	/* Setup linux iommu table */
>-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
>-			IOMMU_PAGE_SHIFT_4K);
>-
> 	/* OPAL variant of PHB3 invalidated TCEs */
> 	if (pe->tce_inval_reg)
> 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
>
>-	tbl->it_ops = &pnv_ioda2_iommu_ops;
>-	iommu_init_table(tbl, phb->hose->node);
>-#ifdef CONFIG_IOMMU_API
>-	pe->table_group.ops = &pnv_pci_ioda2_ops;
>-#endif
>-
> 	if (pe->flags & PNV_IODA_PE_DEV) {
> 		/*
> 		 * Setting table base here only for carrying iommu_group
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (23 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 24/34] powerpc/powernv/ioda2: Rework iommu_table creation Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  4:31   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
                   ` (8 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This is a part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per one VFIO container.
This moves the code which allocates the actual TCE tables to helpers:
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages().
These do not allocate/free the iommu_table struct.
This enforces window size to be a power of two.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* removed @table_group parameter from pnv_pci_create_table as it was not used
* removed *tce_table_allocated from pnv_alloc_tce_table_pages()
* pnv_pci_create_table/pnv_pci_free_table renamed to
pnv_pci_ioda2_table_alloc_pages/pnv_pci_ioda2_table_free_pages and moved
back to pci-ioda.c as these only allocate pages for IODA2 and there is
no chance they will be reused for IODA1/P5IOC2
* shortened subject line
v9:
* moved helpers to the common powernv pci.c file from pci-ioda.c
* moved bits from pnv_pci_create_table() to pnv_alloc_tce_table_pages()
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++++++++++++++++++++++--------
 1 file changed, 62 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9b80b74..7d98d83 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,8 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -1313,8 +1315,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
+	pnv_pci_ioda2_table_free_pages(tbl);
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
-	free_pages(addr, get_order(TCE32_TABLE_SIZE));
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -2034,13 +2036,62 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 };
 #endif
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 {
 	struct page *tce_mem = NULL;
+	__be64 *addr;
+	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+
+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+	if (!tce_mem) {
+		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+		return NULL;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+
+	return addr;
+}
+
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+{
 	void *addr;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+	const unsigned long tce_table_size = 1UL << table_shift;
+
+	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+		return -EINVAL;
+
+	/* Allocate TCE table */
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+	if (!addr)
+		return -ENOMEM;
+
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+			page_shift);
+
+	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
+			window_size, tce_table_size, bus_offset);
+
+	return 0;
+}
+
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+	if (!tbl->it_size)
+		return;
+
+	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
 	struct iommu_table *tbl;
-	unsigned int tce_table_size, end;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2059,24 +2110,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
-	end = (1 << ilog2(phb->ioda.m32_pci_base));
-	tce_table_size = (end / 0x1000) * 8;
 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
-		end);
+		phb->ioda.m32_pci_base);
 
-	/* Allocate TCE table */
-	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-				   get_order(tce_table_size));
-	if (!tce_mem) {
-		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
+	/* Setup linux iommu table */
+	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
 	}
-	addr = page_address(tce_mem);
-	memset(addr, 0, tce_table_size);
-
-	/* Setup linux iommu table */
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
 
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
@@ -2122,9 +2165,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 fail:
 	if (pe->tce32_seg >= 0)
 		pe->tce32_seg = -1;
-	if (tce_mem)
-		__free_pages(tce_mem, get_order(tce_table_size));
 	if (tbl) {
+		pnv_pci_ioda2_table_free_pages(tbl);
 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
 		iommu_free_table(tbl, "pnv");
 	}
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages
  2015-05-11 15:39 ` [PATCH kernel v10 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages Alexey Kardashevskiy
@ 2015-05-14  4:31   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  4:31 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:14AM +1000, Alexey Kardashevskiy wrote:
>This is a part of moving TCE table allocation into an iommu_ops
>callback to support multiple IOMMU groups per one VFIO container.
>
>This moves the code which allocates the actual TCE tables to helpers:
>pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages().
>These do not allocate/free the iommu_table struct.
>
>This enforces window size to be a power of two.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v10:
>* removed @table_group parameter from pnv_pci_create_table as it was not used
>* removed *tce_table_allocated from pnv_alloc_tce_table_pages()
>* pnv_pci_create_table/pnv_pci_free_table renamed to
>pnv_pci_ioda2_table_alloc_pages/pnv_pci_ioda2_table_free_pages and moved
>back to pci-ioda.c as these only allocate pages for IODA2 and there is
>no chance they will be reused for IODA1/P5IOC2
>* shortened subject line
>
>v9:
>* moved helpers to the common powernv pci.c file from pci-ioda.c
>* moved bits from pnv_pci_create_table() to pnv_alloc_tce_table_pages()
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++++++++++++++++++++++--------
> 1 file changed, 62 insertions(+), 20 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 9b80b74..7d98d83 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -49,6 +49,8 @@
> /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
> #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
>
>+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
>+
> static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> 			    const char *fmt, ...)
> {
>@@ -1313,8 +1315,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
> 		iommu_group_put(pe->table_group.group);
> 		BUG_ON(pe->table_group.group);
> 	}
>+	pnv_pci_ioda2_table_free_pages(tbl);
> 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
>-	free_pages(addr, get_order(TCE32_TABLE_SIZE));
> }
>
> static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>@@ -2034,13 +2036,62 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
> };
> #endif
>
>-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>-				       struct pnv_ioda_pe *pe)
>+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
> {
> 	struct page *tce_mem = NULL;
>+	__be64 *addr;
>+	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
>+
>+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
>+	if (!tce_mem) {
>+		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
>+		return NULL;
>+	}
>+	addr = page_address(tce_mem);
>+	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
>+
>+	return addr;
>+}
>+
>+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>+		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
>+{
It seems that pci-ioda.c has the convention to have u32/u64, instead of
__u32/__u64.
Thanks,
Gavin
> 	void *addr;
>+	const unsigned window_shift = ilog2(window_size);
>+	unsigned entries_shift = window_shift - page_shift;
>+	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
>+	const unsigned long tce_table_size = 1UL << table_shift;
>+
>+	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
>+		return -EINVAL;
>+
>+	/* Allocate TCE table */
>+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
>+	if (!addr)
>+		return -ENOMEM;
>+
>+	/* Setup linux iommu table */
>+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
>+			page_shift);
>+
>+	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
>+			window_size, tce_table_size, bus_offset);
>+
>+	return 0;
>+}
>+
>+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
>+{
>+	if (!tbl->it_size)
>+		return;
>+
>+	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
>+}
>+
>+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>+				       struct pnv_ioda_pe *pe)
>+{
> 	struct iommu_table *tbl;
>-	unsigned int tce_table_size, end;
> 	int64_t rc;
>
> 	/* We shouldn't already have a 32-bit DMA associated */
>@@ -2059,24 +2110,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>
> 	/* The PE will reserve all possible 32-bits space */
> 	pe->tce32_seg = 0;
>-	end = (1 << ilog2(phb->ioda.m32_pci_base));
>-	tce_table_size = (end / 0x1000) * 8;
> 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
>-		end);
>+		phb->ioda.m32_pci_base);
>
>-	/* Allocate TCE table */
>-	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
>-				   get_order(tce_table_size));
>-	if (!tce_mem) {
>-		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
>+	/* Setup linux iommu table */
>+	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
>+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
>+	if (rc) {
>+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
> 		goto fail;
> 	}
>-	addr = page_address(tce_mem);
>-	memset(addr, 0, tce_table_size);
>-
>-	/* Setup linux iommu table */
>-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
>-			IOMMU_PAGE_SHIFT_4K);
>
> 	tbl->it_ops = &pnv_ioda2_iommu_ops;
> 	iommu_init_table(tbl, phb->hose->node);
>@@ -2122,9 +2165,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> fail:
> 	if (pe->tce32_seg >= 0)
> 		pe->tce32_seg = -1;
>-	if (tce_mem)
>-		__free_pages(tce_mem, get_order(tce_table_size));
> 	if (tbl) {
>+		pnv_pci_ioda2_table_free_pages(tbl);
> 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
> 		iommu_free_table(tbl, "pnv");
> 	}
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (24 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-14  5:01   ` Gavin Shan
  2015-05-11 15:39 ` [PATCH kernel v10 27/34] powerpc/powernv: Implement multilevel TCE tables Alexey Kardashevskiy
                   ` (7 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This is a part of moving DMA window programming to an iommu_ops
callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
a first parameter (not pnv_ioda_pe) as it is going to be used as
a callback for VFIO DDW code.
This adds pnv_pci_ioda2_tvt_invalidate() to invalidate TVT as it is
a good thing to do. It does not have immediate effect now as the table
is never recreated after reboot but it will in the following patches.
This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* initialize pe->table_group.tables[0] at the very end when
tbl is fully initialized
* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7d98d83..85f80b2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1983,6 +1983,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 }
 
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	int64_t rc;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
+			start_addr, start_addr + win_size - 1,
+			1UL << tbl->it_page_shift);
+
+	/*
+	 * Map TCE table through TVT. The TVE index is the PE number
+	 * shifted by 1 bit for 32-bits DMA space.
+	 */
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+			pe->pe_number,
+			pe->pe_number << 1,
+			1,
+			__pa(tbl->it_base),
+			tbl->it_size << 3,
+			1ULL << tbl->it_page_shift);
+	if (rc) {
+		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+		return rc;
+	}
+
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+			tbl, &pe->table_group);
+	pnv_pci_ioda2_tvt_invalidate(pe);
+
+	return 0;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -2127,21 +2164,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 #endif
 
-	/*
-	 * Map TCE table through TVT. The TVE index is the PE number
-	 * shifted by 1 bit for 32-bits DMA space.
-	 */
-	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-			pe->pe_number << 1, 1, __pa(tbl->it_base),
-			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table,"
 		       " err %ld\n", rc);
 		goto fail;
 	}
 
-	pnv_pci_ioda2_tvt_invalidate(pe);
-
 	/* OPAL variant of PHB3 invalidated TCEs */
 	if (pe->tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
  2015-05-11 15:39 ` [PATCH kernel v10 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
@ 2015-05-14  5:01   ` Gavin Shan
  0 siblings, 0 replies; 82+ messages in thread
From: Gavin Shan @ 2015-05-14  5:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Alex Williamson,
	Paul Mackerras, linuxppc-dev, David Gibson
On Tue, May 12, 2015 at 01:39:15AM +1000, Alexey Kardashevskiy wrote:
>This is a part of moving DMA window programming to an iommu_ops
>callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
>a first parameter (not pnv_ioda_pe) as it is going to be used as
>a callback for VFIO DDW code.
>
>This adds pnv_pci_ioda2_tvt_invalidate() to invalidate TVT as it is
>a good thing to do. It does not have immediate effect now as the table
>is never recreated after reboot but it will in the following patches.
>
>This should cause no behavioural change.
>
>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
>Changes:
>v9:
>* initialize pe->table_group.tables[0] at the very end when
>tbl is fully initialized
>* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
>---
> arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++++++++++++++++++++++++------
> 1 file changed, 38 insertions(+), 9 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index 7d98d83..85f80b2 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -1983,6 +1983,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> 	}
> }
>
>+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
>+		int num, struct iommu_table *tbl)
>+{
>+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
>+			table_group);
>+	struct pnv_phb *phb = pe->phb;
>+	int64_t rc;
>+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
>+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
s/__u64/u64 maybe :-)
>+
>+	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
>+			start_addr, start_addr + win_size - 1,
>+			1UL << tbl->it_page_shift);
>+
>+	/*
>+	 * Map TCE table through TVT. The TVE index is the PE number
>+	 * shifted by 1 bit for 32-bits DMA space.
>+	 */
>+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
>+			pe->pe_number,
>+			pe->pe_number << 1,
>+			1,
>+			__pa(tbl->it_base),
>+			tbl->it_size << 3,
>+			1ULL << tbl->it_page_shift);
There is one macro for IOMMU page size: IOMMU_PAGE_SIZE(), which is defined in
arch/powerpc/include/asm/iommu.h as below:
#define IOMMU_PAGE_SIZE(tblptr) (ASM_CONST(1) << (tblptr)->it_page_shift)
>+	if (rc) {
>+		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
>+		return rc;
>+	}
>+
>+	pnv_pci_link_table_and_group(phb->hose->node, num,
>+			tbl, &pe->table_group);
>+	pnv_pci_ioda2_tvt_invalidate(pe);
>+
>+	return 0;
>+}
>+
> static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
> {
> 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
>@@ -2127,21 +2164,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> 	pe->table_group.ops = &pnv_pci_ioda2_ops;
> #endif
>
>-	/*
>-	 * Map TCE table through TVT. The TVE index is the PE number
>-	 * shifted by 1 bit for 32-bits DMA space.
>-	 */
>-	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
>-			pe->pe_number << 1, 1, __pa(tbl->it_base),
>-			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
>+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> 	if (rc) {
> 		pe_err(pe, "Failed to configure 32-bit TCE table,"
> 		       " err %ld\n", rc);
> 		goto fail;
> 	}
>
>-	pnv_pci_ioda2_tvt_invalidate(pe);
>-
> 	/* OPAL variant of PHB3 invalidated TCEs */
> 	if (pe->tce_inval_reg)
> 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
Thanks,
Gavin
>-- 
>2.4.0.rc3.8.gfb3e7d5
>
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 27/34] powerpc/powernv: Implement multilevel TCE tables
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (25 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API Alexey Kardashevskiy
                   ` (6 subsequent siblings)
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM), so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.
To address this, the POWER8 CPU (actually, IODA2) supports multi-level
TCE tables of up to 5 levels, which split the table into a tree of
smaller subtables.
This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* fixed multiple comments received for v9
v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in the middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
 arch/powerpc/include/asm/iommu.h          |  2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 98 ++++++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/pci.c      | 13 ++++
 3 files changed, 104 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d4ad118..a902159 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
+	unsigned long  it_indirect_levels;
+	unsigned long  it_level_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 85f80b2..d2a1dcd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,9 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_MAX_LEVELS	5
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1990,6 +1993,8 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 			table_group);
 	struct pnv_phb *phb = pe->phb;
 	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
@@ -2004,9 +2009,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
 			pe->pe_number,
 			pe->pe_number << 1,
-			1,
+			tbl->it_indirect_levels + 1,
 			__pa(tbl->it_base),
-			tbl->it_size << 3,
+			size << 3,
 			1ULL << tbl->it_page_shift);
 	if (rc) {
 		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
@@ -2073,11 +2078,19 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 };
 #endif
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+		unsigned levels, unsigned long limit,
+		unsigned long *tce_table_allocated)
 {
 	struct page *tce_mem = NULL;
-	__be64 *addr;
+	__be64 *addr, *tmp;
 	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+	unsigned long local_allocated = 1UL << (order + PAGE_SHIFT);
+	unsigned entries = 1UL << (shift - 3);
+	long i;
+
+	if (*tce_table_allocated >= limit)
+		return NULL;
 
 	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
 	if (!tce_mem) {
@@ -2085,31 +2098,69 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 		return NULL;
 	}
 	addr = page_address(tce_mem);
-	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+	memset(addr, 0, local_allocated);
+
+	--levels;
+	if (!levels) {
+		*tce_table_allocated += local_allocated;
+		return addr;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+				levels, limit, tce_table_allocated);
+		if (!tmp)
+			break;
+
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+	}
 
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(unsigned long addr,
+		unsigned long size, unsigned level);
+
 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
-		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl)
 {
 	void *addr;
+	unsigned long tce_table_allocated = 0, level_shift;
 	const unsigned window_shift = ilog2(window_size);
 	unsigned entries_shift = window_shift - page_shift;
 	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
 
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+		return -EINVAL;
+
 	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
 		return -EINVAL;
 
+	/* Adjust direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	level_shift = entries_shift + 3;
+	level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
+
 	/* Allocate TCE table */
-	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+			levels, tce_table_size, &tce_table_allocated);
 	if (!addr)
 		return -ENOMEM;
 
+	if (tce_table_size > tce_table_allocated) {
+		pnv_pci_ioda2_table_do_free_pages((unsigned long) addr,
+				tbl->it_level_size, tbl->it_indirect_levels);
+		return -ENOMEM;
+	}
+
 	/* Setup linux iommu table */
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
 			page_shift);
+	tbl->it_level_size = 1ULL << (level_shift - 3);
+	tbl->it_indirect_levels = levels - 1;
 
 	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
 			window_size, tce_table_size, bus_offset);
@@ -2117,12 +2168,40 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	return 0;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(unsigned long addr,
+		unsigned long size, unsigned level)
+{
+	addr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (level) {
+		long i;
+		u64 *tmp = (u64 *) addr;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_pci_ioda2_table_do_free_pages(
+					(unsigned long) __va(hpa),
+					size, level - 1);
+		}
+	}
+
+	free_pages(addr, get_order(size << 3));
+}
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 {
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+
 	if (!tbl->it_size)
 		return;
 
-	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+	pnv_pci_ioda2_table_do_free_pages(tbl->it_base, size,
+			tbl->it_indirect_levels);
 }
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -2152,7 +2231,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* Setup linux iommu table */
 	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
-			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
+			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index fd14e2c..bca2c95 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -575,6 +575,19 @@ struct pci_ops pnv_pci_ops = {
 static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
 {
 	__be64 *tmp = ((__be64 *)tbl->it_base);
+	int  level = tbl->it_indirect_levels;
+	const long shift = ilog2(tbl->it_level_size);
+	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+	while (level) {
+		int n = (idx & mask) >> (level * shift);
+		unsigned long tce = be64_to_cpu(tmp[n]);
+
+		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+		idx &= ~mask;
+		mask >>= shift;
+		--level;
+	}
 
 	return tmp + idx;
 }
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (26 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 27/34] powerpc/powernv: Implement multilevel TCE tables Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13 21:30   ` Alex Williamson
  2015-05-11 15:39 ` [PATCH kernel v10 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release Alexey Kardashevskiy
                   ` (5 subsequent siblings)
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This extends iommu_table_group_ops by a set of callbacks to support
dynamic DMA windows management.
create_table() creates a TCE table with specific parameters.
It receives iommu_table_group to know the node id in order to allocate
TCE table memory closer to the PHB. The exact format of the allocated
multi-level table might also be specific to the PHB model (not
the case now though).
This callback calculates the DMA window offset on a PCI bus from @num
and stores it in the just-created table.
set_window() sets the window at specified TVT index + @num on PHB.
unset_window() unsets the window from specified TVT.
This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.
create_table() and free() are supposed to be called once per
VFIO container and set_window()/unset_window() are supposed to be
called for every group in a container.
This adds IOMMU capabilities to iommu_table_group such as default
32bit window parameters and others. This makes use of new values in
vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
advertise pagemasks to the userspace.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* squashed "vfio: powerpc/spapr: Use 32bit DMA window properties from table_group"
into this
* shortened the subject
v9:
* new in the series - to make the next patch simpler
---
 arch/powerpc/include/asm/iommu.h            | 19 ++++++
 arch/powerpc/platforms/powernv/pci-ioda.c   | 96 ++++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c         | 19 +++---
 4 files changed, 124 insertions(+), 17 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index a902159..2c41115 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,6 +70,7 @@ struct iommu_table_ops {
 	/* get() returns a physical address */
 	unsigned long (*get)(struct iommu_table *tbl, long index);
 	void (*flush)(struct iommu_table *tbl);
+	void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -150,6 +151,17 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+	long (*create_table)(struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl);
+	long (*set_window)(struct iommu_table_group *table_group,
+			int num,
+			struct iommu_table *tblnew);
+	long (*unset_window)(struct iommu_table_group *table_group,
+			int num);
 	/* Switch ownership from platform code to external user (e.g. VFIO) */
 	void (*take_ownership)(struct iommu_table_group *table_group);
 	/* Switch ownership from external user (e.g. VFIO) back to core */
@@ -163,6 +175,13 @@ struct iommu_table_group_link {
 };
 
 struct iommu_table_group {
+	/* IOMMU properties */
+	__u32 tce32_start;
+	__u32 tce32_size;
+	__u64 pgsizes; /* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 max_levels;
+
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 	struct iommu_table_group_ops *ops;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d2a1dcd..c1d1aef 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 #include <linux/rculist.h>
+#include <linux/sizes.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1867,6 +1868,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_ioda2_table_free(struct iommu_table *tbl)
+{
+	pnv_pci_ioda2_table_free_pages(tbl);
+	iommu_free_table(tbl, "pnv");
+}
+
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
@@ -1874,6 +1881,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
+	.free = pnv_ioda2_table_free,
 };
 
 static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb,
@@ -1960,6 +1968,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_PAIR);
 
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
+	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
@@ -1998,7 +2008,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
-	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
+	pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
 			start_addr, start_addr + win_size - 1,
 			1UL << tbl->it_page_shift);
 
@@ -2008,7 +2018,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	 */
 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
 			pe->pe_number,
-			pe->pe_number << 1,
+			(pe->pe_number << 1) + num,
 			tbl->it_indirect_levels + 1,
 			__pa(tbl->it_base),
 			size << 3,
@@ -2054,6 +2064,66 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 }
 
 #ifdef CONFIG_IOMMU_API
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl);
+
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	int nid = pe->phb->hose->node;
+	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
+	long ret;
+	struct iommu_table *tbl;
+
+	tbl = pnv_pci_table_alloc(nid);
+	if (!tbl)
+		return -ENOMEM;
+
+	ret = pnv_pci_ioda2_table_alloc_pages(nid,
+			bus_offset, page_shift, window_size,
+			levels, tbl);
+	if (ret) {
+		iommu_free_table(tbl, "pnv");
+		return ret;
+	}
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	if (pe->tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	*ptbl = tbl;
+
+	return 0;
+}
+
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	long ret;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+
+	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			(pe->pe_number << 1) + num,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (ret)
+		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+	else
+		pnv_pci_ioda2_tvt_invalidate(pe);
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+	return ret;
+}
+
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
@@ -2073,6 +2143,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.create_table = pnv_pci_ioda2_create_table,
+	.set_window = pnv_pci_ioda2_set_window,
+	.unset_window = pnv_pci_ioda2_unset_window,
 	.take_ownership = pnv_ioda2_take_ownership,
 	.release_ownership = pnv_ioda2_release_ownership,
 };
@@ -2207,7 +2280,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
-	struct iommu_table *tbl;
+	struct iommu_table *tbl = NULL;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2217,10 +2290,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
-	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
 
@@ -2230,13 +2301,22 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		phb->ioda.m32_pci_base);
 
 	/* Setup linux iommu table */
-	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
-			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
-			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+	pe->table_group.tce32_start = 0;
+	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
+	pe->table_group.max_dynamic_windows_supported =
+			IOMMU_TABLE_GROUP_MAX_TABLES;
+	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
+	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+			IOMMU_PAGE_SHIFT_4K,
+			pe->table_group.tce32_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
 	}
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 94c880c..a295660 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -119,6 +119,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	u64 phb_id;
 	int64_t rc;
 	static int primary = 1;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
 
 	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
 
@@ -193,7 +195,10 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
 	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
 	 */
-	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
+	tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
+	table_group = &phb->p5ioc2.table_group;
+	table_group->tce32_start = tbl->it_offset << tbl->it_page_shift;
+	table_group->tce32_size = tbl->it_size << tbl->it_page_shift;
 }
 
 void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 0724ec8..bc4956d 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -135,7 +135,6 @@ static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
 	unsigned long locked;
-	struct iommu_table *tbl;
 	struct iommu_table_group *table_group;
 
 	if (!container->grp)
@@ -171,13 +170,19 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * this is that we cannot tell here the amount of RAM used by the guest
 	 * as this information is only available from KVM and VFIO is
 	 * KVM agnostic.
+	 *
+	 * So we do not allow enabling a container without a group attached
+	 * as there is no way to know how much we should increment
+	 * the locked_vm counter.
 	 */
 	table_group = iommu_group_get_iommudata(container->grp);
 	if (!table_group)
 		return -ENODEV;
 
-	tbl = table_group->tables[0];
-	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+	if (!table_group->tce32_size)
+		return -EPERM;
+
+	locked = table_group->tce32_size >> PAGE_SHIFT;
 	ret = try_increment_locked_vm(locked);
 	if (ret)
 		return ret;
@@ -350,7 +355,6 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
-		struct iommu_table *tbl;
 		struct iommu_table_group *table_group;
 
 		if (WARN_ON(!container->grp))
@@ -358,8 +362,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 		table_group = iommu_group_get_iommudata(container->grp);
 
-		tbl = table_group->tables[0];
-		if (WARN_ON_ONCE(!tbl))
+		if (!table_group)
 			return -ENXIO;
 
 		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
@@ -371,8 +374,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (info.argsz < minsz)
 			return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
-		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
+		info.dma32_window_start = table_group->tce32_start;
+		info.dma32_window_size = table_group->tce32_size;
 		info.flags = 0;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API
  2015-05-11 15:39 ` [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API Alexey Kardashevskiy
@ 2015-05-13 21:30   ` Alex Williamson
  0 siblings, 0 replies; 82+ messages in thread
From: Alex Williamson @ 2015-05-13 21:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> This extends iommu_table_group_ops by a set of callbacks to support
> dynamic DMA windows management.
> 
> create_table() creates a TCE table with specific parameters.
> it receives iommu_table_group to know nodeid in order to allocate
> TCE table memory closer to the PHB. The exact format of allocated
> multi-level table might be also specific to the PHB model (not
> the case now though).
> This callback calculated the DMA window offset on a PCI bus from @num
> and stores it in a just created table.
> 
> set_window() sets the window at specified TVT index + @num on PHB.
> 
> unset_window() unsets the window from specified TVT.
> 
> This adds a free() callback to iommu_table_ops to free the memory
> (potentially a tree of tables) allocated for the TCE table.
> 
> create_table() and free() are supposed to be called once per
> VFIO container and set_window()/unset_window() are supposed to be
> called for every group in a container.
> 
> This adds IOMMU capabilities to iommu_table_group such as default
> 32bit window parameters and others. This makes use of new values in
> vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
> advertise pagemasks to the userspace.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v10:
> * squashed "vfio: powerpc/spapr: Use 32bit DMA window properties from table_group"
> into this
> * shortened the subject
> 
> v9:
> * new in the series - to make the next patch simpler
> ---
>  arch/powerpc/include/asm/iommu.h            | 19 ++++++
>  arch/powerpc/platforms/powernv/pci-ioda.c   | 96 ++++++++++++++++++++++++++---
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 ++-
>  drivers/vfio/vfio_iommu_spapr_tce.c         | 19 +++---
>  4 files changed, 124 insertions(+), 17 deletions(-)
For vfio:
Acked-by: Alex Williamson <alex.williamson@redhat.com>
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index a902159..2c41115 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -70,6 +70,7 @@ struct iommu_table_ops {
>  	/* get() returns a physical address */
>  	unsigned long (*get)(struct iommu_table *tbl, long index);
>  	void (*flush)(struct iommu_table *tbl);
> +	void (*free)(struct iommu_table *tbl);
>  };
>  
>  /* These are used by VIO */
> @@ -150,6 +151,17 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>  struct iommu_table_group;
>  
>  struct iommu_table_group_ops {
> +	long (*create_table)(struct iommu_table_group *table_group,
> +			int num,
> +			__u32 page_shift,
> +			__u64 window_size,
> +			__u32 levels,
> +			struct iommu_table **ptbl);
> +	long (*set_window)(struct iommu_table_group *table_group,
> +			int num,
> +			struct iommu_table *tblnew);
> +	long (*unset_window)(struct iommu_table_group *table_group,
> +			int num);
>  	/* Switch ownership from platform code to external user (e.g. VFIO) */
>  	void (*take_ownership)(struct iommu_table_group *table_group);
>  	/* Switch ownership from external user (e.g. VFIO) back to core */
> @@ -163,6 +175,13 @@ struct iommu_table_group_link {
>  };
>  
>  struct iommu_table_group {
> +	/* IOMMU properties */
> +	__u32 tce32_start;
> +	__u32 tce32_size;
> +	__u64 pgsizes; /* Bitmap of supported page sizes */
> +	__u32 max_dynamic_windows_supported;
> +	__u32 max_levels;
> +
>  	struct iommu_group *group;
>  	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>  	struct iommu_table_group_ops *ops;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index d2a1dcd..c1d1aef 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -25,6 +25,7 @@
>  #include <linux/memblock.h>
>  #include <linux/iommu.h>
>  #include <linux/rculist.h>
> +#include <linux/sizes.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -1867,6 +1868,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
>  		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
>  }
>  
> +static void pnv_ioda2_table_free(struct iommu_table *tbl)
> +{
> +	pnv_pci_ioda2_table_free_pages(tbl);
> +	iommu_free_table(tbl, "pnv");
> +}
> +
>  static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>  	.set = pnv_ioda2_tce_build,
>  #ifdef CONFIG_IOMMU_API
> @@ -1874,6 +1881,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>  #endif
>  	.clear = pnv_ioda2_tce_free,
>  	.get = pnv_tce_get,
> +	.free = pnv_ioda2_table_free,
>  };
>  
>  static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb,
> @@ -1960,6 +1968,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
>  				 TCE_PCI_SWINV_PAIR);
>  
>  	tbl->it_ops = &pnv_ioda1_iommu_ops;
> +	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
> +	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
>  	iommu_init_table(tbl, phb->hose->node);
>  
>  	if (pe->flags & PNV_IODA_PE_DEV) {
> @@ -1998,7 +2008,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
>  	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
>  	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
>  
> -	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
> +	pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
>  			start_addr, start_addr + win_size - 1,
>  			1UL << tbl->it_page_shift);
>  
> @@ -2008,7 +2018,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
>  	 */
>  	rc = opal_pci_map_pe_dma_window(phb->opal_id,
>  			pe->pe_number,
> -			pe->pe_number << 1,
> +			(pe->pe_number << 1) + num,
>  			tbl->it_indirect_levels + 1,
>  			__pa(tbl->it_base),
>  			size << 3,
> @@ -2054,6 +2064,66 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>  }
>  
>  #ifdef CONFIG_IOMMU_API
> +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> +		__u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table *tbl);
> +
> +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table **ptbl)
> +{
> +	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
> +			table_group);
> +	int nid = pe->phb->hose->node;
> +	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
> +	long ret;
> +	struct iommu_table *tbl;
> +
> +	tbl = pnv_pci_table_alloc(nid);
> +	if (!tbl)
> +		return -ENOMEM;
> +
> +	ret = pnv_pci_ioda2_table_alloc_pages(nid,
> +			bus_offset, page_shift, window_size,
> +			levels, tbl);
> +	if (ret) {
> +		iommu_free_table(tbl, "pnv");
> +		return ret;
> +	}
> +
> +	tbl->it_ops = &pnv_ioda2_iommu_ops;
> +	if (pe->tce_inval_reg)
> +		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
> +
> +	*ptbl = tbl;
> +
> +	return 0;
> +}
> +
> +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> +		int num)
> +{
> +	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
> +			table_group);
> +	struct pnv_phb *phb = pe->phb;
> +	long ret;
> +
> +	pe_info(pe, "Removing DMA window #%d\n", num);
> +
> +	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> +			(pe->pe_number << 1) + num,
> +			0/* levels */, 0/* table address */,
> +			0/* table size */, 0/* page size */);
> +	if (ret)
> +		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
> +	else
> +		pnv_pci_ioda2_tvt_invalidate(pe);
> +
> +	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
> +
> +	return ret;
> +}
> +
>  static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
>  {
>  	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
> @@ -2073,6 +2143,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
>  }
>  
>  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
> +	.create_table = pnv_pci_ioda2_create_table,
> +	.set_window = pnv_pci_ioda2_set_window,
> +	.unset_window = pnv_pci_ioda2_unset_window,
>  	.take_ownership = pnv_ioda2_take_ownership,
>  	.release_ownership = pnv_ioda2_release_ownership,
>  };
> @@ -2207,7 +2280,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
>  static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  				       struct pnv_ioda_pe *pe)
>  {
> -	struct iommu_table *tbl;
> +	struct iommu_table *tbl = NULL;
>  	int64_t rc;
>  
>  	/* We shouldn't already have a 32-bit DMA associated */
> @@ -2217,10 +2290,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	/* TVE #1 is selected by PCI address bit 59 */
>  	pe->tce_bypass_base = 1ull << 59;
>  
> -	tbl = pnv_pci_table_alloc(phb->hose->node);
>  	iommu_register_group(&pe->table_group, phb->hose->global_number,
>  			pe->pe_number);
> -	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>  
>  	pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
>  
> @@ -2230,13 +2301,22 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  		phb->ioda.m32_pci_base);
>  
>  	/* Setup linux iommu table */
> -	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
> -			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
> -			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
> +	pe->table_group.tce32_start = 0;
> +	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
> +	pe->table_group.max_dynamic_windows_supported =
> +			IOMMU_TABLE_GROUP_MAX_TABLES;
> +	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
> +	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
> +
> +	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
> +			IOMMU_PAGE_SHIFT_4K,
> +			pe->table_group.tce32_size,
> +			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
>  	if (rc) {
>  		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
>  		goto fail;
>  	}
> +	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
>  
>  	tbl->it_ops = &pnv_ioda2_iommu_ops;
>  	iommu_init_table(tbl, phb->hose->node);
> diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> index 94c880c..a295660 100644
> --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
> @@ -119,6 +119,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
>  	u64 phb_id;
>  	int64_t rc;
>  	static int primary = 1;
> +	struct iommu_table_group *table_group;
> +	struct iommu_table *tbl;
>  
>  	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
>  
> @@ -193,7 +195,10 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
>  	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
>  	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
>  	 */
> -	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
> +	tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
> +	table_group = &phb->p5ioc2.table_group;
> +	table_group->tce32_start = tbl->it_offset << tbl->it_page_shift;
> +	table_group->tce32_size = tbl->it_size << tbl->it_page_shift;
>  }
>  
>  void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 0724ec8..bc4956d 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -135,7 +135,6 @@ static int tce_iommu_enable(struct tce_container *container)
>  {
>  	int ret = 0;
>  	unsigned long locked;
> -	struct iommu_table *tbl;
>  	struct iommu_table_group *table_group;
>  
>  	if (!container->grp)
> @@ -171,13 +170,19 @@ static int tce_iommu_enable(struct tce_container *container)
>  	 * this is that we cannot tell here the amount of RAM used by the guest
>  	 * as this information is only available from KVM and VFIO is
>  	 * KVM agnostic.
> +	 *
> +	 * So we do not allow enabling a container without a group attached
> +	 * as there is no way to know how much we should increment
> +	 * the locked_vm counter.
>  	 */
>  	table_group = iommu_group_get_iommudata(container->grp);
>  	if (!table_group)
>  		return -ENODEV;
>  
> -	tbl = table_group->tables[0];
> -	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> +	if (!table_group->tce32_size)
> +		return -EPERM;
> +
> +	locked = table_group->tce32_size >> PAGE_SHIFT;
>  	ret = try_increment_locked_vm(locked);
>  	if (ret)
>  		return ret;
> @@ -350,7 +355,6 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>  	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>  		struct vfio_iommu_spapr_tce_info info;
> -		struct iommu_table *tbl;
>  		struct iommu_table_group *table_group;
>  
>  		if (WARN_ON(!container->grp))
> @@ -358,8 +362,7 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>  		table_group = iommu_group_get_iommudata(container->grp);
>  
> -		tbl = table_group->tables[0];
> -		if (WARN_ON_ONCE(!tbl))
> +		if (!table_group)
>  			return -ENXIO;
>  
>  		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> @@ -371,8 +374,8 @@ static long tce_iommu_ioctl(void *iommu_data,
>  		if (info.argsz < minsz)
>  			return -EINVAL;
>  
> -		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
> -		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
> +		info.dma32_window_start = table_group->tce32_start;
> +		info.dma32_window_size = table_group->tce32_size;
>  		info.flags = 0;
>  
>  		if (copy_to_user((void __user *)arg, &info, minsz))
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
- * [PATCH kernel v10 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (27 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table Alexey Kardashevskiy
                   ` (4 subsequent siblings)
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The existing code programmed TVT#0 with some address and then
immediately released that memory.
This makes use of pnv_pci_ioda2_unset_window() and
pnv_pci_ioda2_set_bypass() which do correct resource release and
TVT update.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c1d1aef..14d4f34 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1287,33 +1287,21 @@ m64_failed:
 	return -EBUSY;
 }
 
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num);
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
-	struct pnv_phb        *phb;
 	struct iommu_table    *tbl;
-	unsigned long         addr;
 	int64_t               rc;
 
-	bus = dev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
 	tbl = pe->table_group.tables[0];
-	addr = tbl->it_base;
-
-	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-				   pe->pe_number << 1, 1, __pa(addr),
-				   0, 0x1000);
-
-	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-				        pe->pe_number,
-				        (pe->pe_number << 1) + 1,
-				        pe->tce_bypass_base,
-				        0);
+	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
+	pnv_pci_ioda2_set_bypass(pe, false);
 	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
 	if (pe->table_group.group) {
 		iommu_group_put(pe->table_group.group);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * [PATCH kernel v10 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (28 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control Alexey Kardashevskiy
                   ` (3 subsequent siblings)
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This adds a way for the IOMMU user to know how much memory a new table
will use so it can be accounted against the locked_vm limit before
the allocation happens.
This stores the allocated table size in iommu_table::it_allocated_size
(set in pnv_pci_ioda2_table_alloc_pages()) so the locked_vm counter can
be updated correctly when a table is being disposed.
This defines an iommu_table_group_ops callback to let VFIO know
how much memory will be locked if a table is created.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* s/ROUND_UP/_ALIGN_UP/
* fixed rounding up for @entries_shift (used to use ROUND_UP)
v9:
* reimplemented the whole patch
---
 arch/powerpc/include/asm/iommu.h          |  5 +++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c41115..c8bad21 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -99,6 +99,7 @@ struct iommu_table {
 	unsigned long  it_size;      /* Size of iommu table in entries */
 	unsigned long  it_indirect_levels;
 	unsigned long  it_level_size;
+	unsigned long  it_allocated_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
@@ -151,6 +152,10 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+	unsigned long (*get_table_size)(
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels);
 	long (*create_table)(struct iommu_table_group *table_group,
 			int num,
 			__u32 page_shift,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 14d4f34..48aee99 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -41,6 +41,7 @@
 #include <asm/debug.h>
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
+#include <asm/mmzone.h>
 
 #include <misc/cxl.h>
 
@@ -2052,6 +2053,38 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 }
 
 #ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels)
+{
+	unsigned long bytes = 0;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = entries_shift + 3;
+	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+	unsigned long direct_table_size;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+			(window_size > memory_hotplug_max()) ||
+			!is_power_of_2(window_size))
+		return 0;
+
+	/* Calculate a direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	table_shift = entries_shift + 3;
+	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+	direct_table_size =  1UL << table_shift;
+
+	for ( ; levels; --levels) {
+		bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+		tce_table_size /= direct_table_size;
+		tce_table_size <<= 3;
+		tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+	}
+
+	return bytes;
+}
+
 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 		__u32 page_shift, __u64 window_size, __u32 levels,
 		struct iommu_table *tbl);
@@ -2131,6 +2164,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
 	.create_table = pnv_pci_ioda2_create_table,
 	.set_window = pnv_pci_ioda2_set_window,
 	.unset_window = pnv_pci_ioda2_unset_window,
@@ -2222,6 +2256,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 			page_shift);
 	tbl->it_level_size = 1ULL << (level_shift - 3);
 	tbl->it_indirect_levels = levels - 1;
+	tbl->it_allocated_size = tce_table_allocated;
 
 	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
 			window_size, tce_table_size, bus_offset);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * [PATCH kernel v10 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (29 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache Alexey Kardashevskiy
                   ` (2 subsequent siblings)
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
Previously, the IOMMU user (VFIO) would take control over the IOMMU table
belonging to a specific IOMMU group. This approach did not allow sharing
tables between IOMMU groups attached to the same container.
This introduces a new IOMMU ownership flavour where the user can not
only control the existing IOMMU table but also remove/create tables on demand.
If an IOMMU implements take/release_ownership() callbacks, this lets
the user have full control over the IOMMU group. When the ownership
is taken, the platform code removes all the windows so the caller must
create them.
Before returning the ownership back to the platform code, VFIO
unprograms and removes all the tables it created.
This changes IODA2's ownership handler to remove the existing table
rather than manipulating the existing one. From now on,
iommu_take_ownership() and iommu_release_ownership() are only called
from the vfio_iommu_spapr_tce driver.
Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.
No change in userspace-visible behaviour is expected. Since it recreates
TCE tables on each ownership change, related kernel traces will appear
more often.
This adds a pnv_pci_ioda2_setup_default_config() which is called
when PE is being configured at boot time and when the ownership is
passed from VFIO to the platform code.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
v10:
* created pnv_pci_ioda2_setup_default_config() helper
v9:
* fixed crash in tce_iommu_detach_group() on tbl->it_ops->free as
tce_iommu_attach_group() used to initialize the table from a descriptor
on stack (it does not matter for the series as this bit is changed later anyway
but it ruins bisectability)
v6:
* fixed commit log that VFIO removes tables before passing ownership
back to the platform code, not userspace
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 101 ++++++++++++++++--------------
 drivers/vfio/vfio_iommu_spapr_tce.c       |  78 ++++++++++++++++++++++-
 2 files changed, 130 insertions(+), 49 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 48aee99..1312190 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2145,13 +2145,59 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 	return ret;
 }
 
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = NULL;
+	long rc;
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+			IOMMU_PAGE_SHIFT_4K,
+			pe->table_group.tce32_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+				rc);
+		return rc;
+	}
+
+	iommu_init_table(tbl, pe->phb->hose->node);
+
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+				rc);
+		pnv_ioda2_table_free(tbl);
+		return rc;
+	}
+
+	if (!pnv_iommu_bypass_disabled)
+		pnv_pci_ioda2_set_bypass(pe, true);
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	if (pe->tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	/*
+	 * Setting table base here only for carrying iommu_group
+	 * further down to let iommu_add_device() do the job.
+	 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+	 */
+	if (pe->flags & PNV_IODA_PE_DEV)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+
+	return 0;
+}
+
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
 						table_group);
+	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+	struct iommu_table *tbl = pe->table_group.tables[0];
 
-	iommu_take_ownership(table_group->tables[0]);
 	pnv_pci_ioda2_set_bypass(pe, false);
+	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	pnv_ioda2_table_free(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2159,8 +2205,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
 						table_group);
 
-	iommu_release_ownership(table_group->tables[0]);
-	pnv_pci_ioda2_set_bypass(pe, true);
+	pnv_pci_ioda2_setup_default_config(pe);
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
@@ -2303,7 +2348,6 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
-	struct iommu_table *tbl = NULL;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2330,58 +2374,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 			IOMMU_TABLE_GROUP_MAX_TABLES;
 	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
 	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
-
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			pe->table_group.tce32_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
-	if (rc) {
-		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
-		goto fail;
-	}
-	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
-
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
 #ifdef CONFIG_IOMMU_API
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 #endif
 
-	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	rc = pnv_pci_ioda2_setup_default_config(pe);
 	if (rc) {
-		pe_err(pe, "Failed to configure 32-bit TCE table,"
-		       " err %ld\n", rc);
-		goto fail;
+		if (pe->tce32_seg >= 0)
+			pe->tce32_seg = -1;
+		return;
 	}
 
-	/* OPAL variant of PHB3 invalidated TCEs */
-	if (pe->tce_inval_reg)
-		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-
-	if (pe->flags & PNV_IODA_PE_DEV) {
-		/*
-		 * Setting table base here only for carrying iommu_group
-		 * further down to let iommu_add_device() do the job.
-		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
-		 */
-		set_iommu_table_base(&pe->pdev->dev, tbl);
+	if (pe->flags & PNV_IODA_PE_DEV)
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
-	/* Also create a bypass window */
-	if (!pnv_iommu_bypass_disabled)
-		pnv_pci_ioda2_set_bypass(pe, true);
-
-	return;
-fail:
-	if (pe->tce32_seg >= 0)
-		pe->tce32_seg = -1;
-	if (tbl) {
-		pnv_pci_ioda2_table_free_pages(tbl);
-		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
-		iommu_free_table(tbl, "pnv");
-	}
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index bc4956d..8943b29 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -333,6 +333,45 @@ static long tce_iommu_build(struct tce_container *container,
 	return ret;
 }
 
+static long tce_iommu_create_table(struct tce_container *container,
+			struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl)
+{
+	long ret, table_size;
+
+	table_size = table_group->ops->get_table_size(page_shift, window_size,
+			levels);
+	if (!table_size)
+		return -EINVAL;
+
+	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	ret = table_group->ops->create_table(table_group, num,
+			page_shift, window_size, levels, ptbl);
+
+	WARN_ON(!ret && !(*ptbl)->it_ops->free);
+	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
+
+	if (ret)
+		decrement_locked_vm(table_size >> PAGE_SHIFT);
+
+	return ret;
+}
+
+static void tce_iommu_free_table(struct iommu_table *tbl)
+{
+	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
+
+	tbl->it_ops->free(tbl);
+	decrement_locked_vm(pages);
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
@@ -577,14 +616,32 @@ static int tce_iommu_attach_group(void *iommu_data,
 	if (!table_group->ops || !table_group->ops->take_ownership ||
 			!table_group->ops->release_ownership) {
 		ret = tce_iommu_take_ownership(container, table_group);
+	} else if (!table_group->ops->create_table ||
+			!table_group->ops->set_window) {
+		WARN_ON_ONCE(1);
+		ret = -EFAULT;
 	} else {
+		struct iommu_table *tbl = NULL;
 		/*
 		 * Disable iommu bypass, otherwise the user can DMA to all of
 		 * our physical memory via the bypass window instead of just
 		 * the pages that has been explicitly mapped into the iommu
 		 */
 		table_group->ops->take_ownership(table_group);
-		ret = 0;
+		ret = tce_iommu_create_table(container,
+				table_group,
+				0, /* window number */
+				IOMMU_PAGE_SHIFT_4K,
+				table_group->tce32_size,
+				1, /* default levels */
+				&tbl);
+		if (!ret) {
+			ret = table_group->ops->set_window(table_group, 0, tbl);
+			if (ret)
+				tce_iommu_free_table(tbl);
+			else
+				table_group->tables[0] = tbl;
+		}
 	}
 
 	if (ret)
@@ -603,6 +660,7 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	long i;
 
 	mutex_lock(&container->lock);
 	if (iommu_group != container->grp) {
@@ -628,8 +686,24 @@ static void tce_iommu_detach_group(void *iommu_data,
 	/* Kernel owns the device now, we can restore bypass */
 	if (!table_group->ops || !table_group->ops->release_ownership)
 		tce_iommu_release_ownership(container, table_group);
-	else
+	else if (!table_group->ops->unset_window)
+		WARN_ON_ONCE(1);
+	else {
+		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+			/* Store table pointer as unset_window resets it */
+			struct iommu_table *tbl = table_group->tables[i];
+
+			if (!tbl)
+				continue;
+
+			table_group->ops->unset_window(table_group, i);
+			tce_iommu_clear(container, tbl,
+					tbl->it_offset, tbl->it_size);
+			tce_iommu_free_table(tbl);
+		}
+
 		table_group->ops->release_ownership(table_group);
+	}
 
 unlock_exit:
 	mutex_unlock(&container->lock);
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * [PATCH kernel v10 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (30 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2 Alexey Kardashevskiy
  2015-05-11 15:39 ` [PATCH kernel v10 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows Alexey Kardashevskiy
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory.
Another use of it is in-kernel real mode (mmu off) acceleration of
DMA requests where real time translation of guest physical to host
physical addresses is non-trivial and may fail as linux ptes may be
temporarily invalid. Also, having cached host physical addresses
(compared to just pinning at the start and then walking the page table
again on every H_PUT_TCE), we can be sure that the addresses which we put
into TCE table are the ones we already pinned.
This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage a mapped pages counter; when it is zero, it is safe to
unregister the region.
Multiple registration of the same region is allowed, kref is used to
track the number of registrations. atomic_inc_not_zero() and
atomic_dec_if_positive() are used to decide whether to allow or deny
the mapped counter increments.
Each registered region keeps a counter of mapped TCEs, plus one extra
reference held by the registered area itself.
Host physical addresses are stored in vmalloc'ed array. In order to
access these in the real mode (mmu off), there is a real_vmalloc_addr()
helper. In-kernel acceleration patchset will move it from KVM to MMU code.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* split mm_iommu_mapped_update into mm_iommu_mapped_dec + mm_iommu_mapped_inc
* mapped counter now keeps one reference for itself and mm_iommu_mapped_inc()
can tell if the region is being released
* updated commit log
v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback loop (s/[i]/[j]/)
---
 arch/powerpc/include/asm/mmu-hash64.h      |   3 +
 arch/powerpc/include/asm/mmu_context.h     |  17 +++
 arch/powerpc/mm/Makefile                   |   1 +
 arch/powerpc/mm/mmu_context_hash64.c       |   6 +
 arch/powerpc/mm/mmu_context_hash64_iommu.c | 221 +++++++++++++++++++++++++++++
 5 files changed, 248 insertions(+)
 create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
 	/* for 4K PTE fragment support */
 	void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..138bb53 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,23 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+		unsigned long entries);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
+extern long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b..e216704 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_hash64_iommu.o
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876ae..eb3080c 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_PPC_64K_PAGES
 	mm->context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+#endif
 	return 0;
 }
 
@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_cleanup(&mm->context);
+#endif
 
 #ifdef CONFIG_PPC_ICSWX
 	drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c
new file mode 100644
index 0000000..002c6c9
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
@@ -0,0 +1,221 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/kref.h>
+#include <asm/mmu_context.h>
+
+struct mm_iommu_table_group_mem_t {
+	struct list_head next;
+	struct rcu_head rcu;
+	struct kref kref;	/* one reference per VFIO container */
+	atomic_t mapped;	/* number of currently mapped pages */
+	u64 ua;			/* userspace address */
+	u64 entries;		/* number of entries in hpas[] */
+	u64 *hpas;		/* vmalloc'ed */
+};
+
+bool mm_iommu_preregistered(void)
+{
+	if (!current || !current->mm)
+		return false;
+
+	return !list_empty(¤t->mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	long i, j;
+	struct page *page = NULL;
+
+	list_for_each_entry_rcu(mem, ¤t->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries))
+			return -EBUSY;
+
+		/* Overlap? */
+		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+				(ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
+			return -EINVAL;
+	}
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+	if (!mem->hpas) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+					1/* pages */, 1/* iswrite */, &page)) {
+			for (j = 0; j < i; ++j)
+				put_page(pfn_to_page(
+						mem->hpas[j] >> PAGE_SHIFT));
+			vfree(mem->hpas);
+			kfree(mem);
+			return -EFAULT;
+		}
+
+		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+	}
+
+	kref_init(&mem->kref);
+	atomic_set(&mem->mapped, 1);
+	mem->ua = ua;
+	mem->entries = entries;
+	*pmem = mem;
+
+	list_add_rcu(&mem->next, ¤t->mm->context.iommu_group_mem_list);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_alloc);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+	long i;
+	struct page *page = NULL;
+
+	for (i = 0; i < mem->entries; ++i) {
+		if (!mem->hpas[i])
+			continue;
+
+		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+		if (!page)
+			continue;
+
+		put_page(page);
+		mem->hpas[i] = 0;
+	}
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+	struct mm_iommu_table_group_mem_t *mem = container_of(head,
+			struct mm_iommu_table_group_mem_t, rcu);
+
+	mm_iommu_unpin(mem);
+	vfree(mem->hpas);
+	kfree(mem);
+}
+
+static void mm_iommu_release(struct kref *kref)
+{
+	struct mm_iommu_table_group_mem_t *mem = container_of(kref,
+			struct mm_iommu_table_group_mem_t, kref);
+
+	list_del_rcu(&mem->next);
+	call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+		unsigned long entries)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+
+	list_for_each_entry_rcu(mem, ¤t->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			kref_get(&mem->kref);
+			return mem;
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (1 != atomic_dec_if_positive(&mem->mapped)) {
+		/* There are mappings, exit */
+		atomic_inc(&mem->mapped);
+		return -EBUSY;
+	}
+
+	kref_put(&mem->kref, mm_iommu_release);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_rcu(mem,
+			¤t->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	u64 *va = &mem->hpas[entry];
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	*hpa = *va | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (atomic_inc_not_zero(&mem->mapped))
+		return 0;
+
+	/* Last mm_iommu_put() has been called, no more mappings allowed() */
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	return atomic_dec_if_positive(&mem->mapped);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_cleanup(mm_context_t *ctx)
+{
+	while (!list_empty(&ctx->iommu_group_mem_list)) {
+		struct mm_iommu_table_group_mem_t *mem;
+
+		mem = list_first_entry(&ctx->iommu_group_mem_list,
+				struct mm_iommu_table_group_mem_t, next);
+		mm_iommu_release(&mem->kref);
+	}
+}
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (31 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  2015-05-13 21:30   ` Alex Williamson
  2015-05-11 15:39 ` [PATCH kernel v10 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows Alexey Kardashevskiy
  33 siblings, 1 reply; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
The existing implementation accounts the whole DMA window in
the locked_vm counter. This is going to be worse with multiple
containers and huge DMA windows. Also, real-time accounting would require
additional tracking of accounted pages due to the page size difference -
IOMMU uses 4K pages and system uses 4K or 64K pages.
Another issue is that actual pages pinning/unpinning happens on every
DMA map/unmap request. This does not affect the performance much now as
we spend way too much time now on switching context between
guest/userspace/host but this will start to matter when we add in-kernel
DMA map/unmap acceleration.
This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
2 new ioctls to register/unregister DMA memory -
VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
which receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
New IOMMU splits physical pages pinning and TCE table update
into 2 different operations. It requires:
1) guest pages to be registered first
2) subsequent map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
When a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.
The new memory registration ioctls are not supported by
VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
will require memory to be preregistered in order to work.
The accounting is done per the user process.
This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.
In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell which
region the just-cleared TCE came from.
This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace address, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Alex, should I remove your "acked-by" in the cases like this and
get another one?
---
Changes:
v10:
* moved it_userspace allocation to vfio_iommu_spapr_tce as it is a
VFIO-specific thing
* squashed "powerpc/iommu: Add userspace view of TCE table" into this as
it is
a part of IOMMU v2
* s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
* fixed some function names to have "tce_iommu_" in the beginning rather
than just "tce_"
* as mm_iommu_mapped_inc() can now fail, check for the return code
v9:
* s/tce_get_hva_cached/tce_iommu_use_page_v2/
v7:
* now memory is registered per mm (i.e. process)
* moved memory registration code to powerpc/mmu
* merged "vfio: powerpc/spapr: Define v2 IOMMU" into this
* limited new ioctls to v2 IOMMU
* updated doc
* unsupported ioctls return -ENOTTY instead of -EPERM
v6:
* tce_get_hva_cached() returns hva via a pointer
v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt              |  31 ++-
 arch/powerpc/include/asm/iommu.h    |   6 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 516 ++++++++++++++++++++++++++++++------
 include/uapi/linux/vfio.h           |  27 ++
 4 files changed, 494 insertions(+), 86 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..7dcf2b5 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at the boot time,
+one table per IOMMU group, which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have improved hardware design which allows
+to remove this limitation and have multiple IOMMU groups per a VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
@@ -427,6 +429,29 @@ The code flow from the example above should be slightly changed:
 
 	....
 
+5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
+VFIO_IOMMU_DISABLE and implements 2 new ioctls:
+VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
+(which are unsupported in v1 IOMMU).
+
+PPC64 paravirtualized guests generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+mm::locked_vm counter to make sure we do not exceed the rlimit.
+The v2 IOMMU splits accounting and pinning into separate operations:
+
+- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
+receive a user space address and size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is expected to
+be called with the exact address and size used for registering
+the memory block. The userspace is not expected to call these often.
+The ranges are stored in a linked list in a VFIO container.
+
+- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
+IOMMU table and do not do pinning; instead these check that the userspace
+address is from pre-registered range.
+
+This separation helps in optimizing DMA for guests.
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index c8bad21..763c041 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -113,10 +113,16 @@ struct iommu_table {
 	unsigned long  it_page_shift;/* table iommu page size */
 #ifdef CONFIG_IOMMU_API
 	struct list_head it_group_list;/* List of iommu_table_group_link */
+	unsigned long *it_userspace; /* userspace view of the table */
 #endif
 	struct iommu_table_ops *it_ops;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
+		((tbl)->it_userspace ? \
+			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
+			NULL)
+
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
 int get_iommu_order(unsigned long size, struct iommu_table *tbl)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 8943b29..e7e8db3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,8 +19,10 @@
 #include <linux/uaccess.h>
 #include <linux/err.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 #define DRIVER_VERSION  "0.1"
 #define DRIVER_AUTHOR   "aik@ozlabs.ru"
@@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages)
  * into DMA'ble space using the IOMMU
  */
 
+struct tce_iommu_group {
+	struct list_head next;
+	struct iommu_group *grp;
+};
+
 /*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
@@ -88,11 +95,98 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
 	struct mutex lock;
-	struct iommu_group *grp;
 	bool enabled;
 	unsigned long locked_pages;
+	bool v2;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct list_head group_list;
 };
 
+static long tce_iommu_unregister_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	long ret;
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+		return -EINVAL;
+
+	mem = mm_iommu_get(vaddr, size >> PAGE_SHIFT);
+	if (!mem)
+		return -EINVAL;
+
+	ret = mm_iommu_put(mem); /* undo kref_get() from mm_iommu_get() */
+	if (!ret)
+		ret = mm_iommu_put(mem);
+
+	return ret;
+}
+
+static long tce_iommu_register_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem;
+	unsigned long entries = size >> PAGE_SHIFT;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+			((vaddr + size) < vaddr))
+		return -EINVAL;
+
+	mem = mm_iommu_get(vaddr, entries);
+	if (!mem) {
+		ret = try_increment_locked_vm(entries);
+		if (ret)
+			return ret;
+
+		ret = mm_iommu_alloc(vaddr, entries, &mem);
+		if (ret) {
+			decrement_locked_vm(entries);
+			return ret;
+		}
+	}
+
+	container->enabled = true;
+
+	return 0;
+}
+
+static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
+{
+	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+			tbl->it_size, PAGE_SIZE);
+	unsigned long *uas;
+	long ret;
+
+	BUG_ON(tbl->it_userspace);
+
+	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	uas = vzalloc(cb);
+	if (!uas) {
+		decrement_locked_vm(cb >> PAGE_SHIFT);
+		return -ENOMEM;
+	}
+	tbl->it_userspace = uas;
+
+	return 0;
+}
+
+static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
+{
+	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+			tbl->it_size, PAGE_SIZE);
+
+	if (!tbl->it_userspace)
+		return;
+
+	vfree(tbl->it_userspace);
+	tbl->it_userspace = NULL;
+	decrement_locked_vm(cb >> PAGE_SHIFT);
+}
+
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 {
 	/*
@@ -103,18 +197,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+	return !list_empty(&container->group_list);
+}
+
 static long tce_iommu_find_table(struct tce_container *container,
 		phys_addr_t ioba, struct iommu_table **ptbl)
 {
 	long i;
-	struct iommu_table_group *table_group;
-
-	table_group = iommu_group_get_iommudata(container->grp);
-	if (!table_group)
-		return -1;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = table_group->tables[i];
+		struct iommu_table *tbl = container->tables[i];
 
 		if (tbl) {
 			unsigned long entry = ioba >> tbl->it_page_shift;
@@ -136,9 +230,7 @@ static int tce_iommu_enable(struct tce_container *container)
 	int ret = 0;
 	unsigned long locked;
 	struct iommu_table_group *table_group;
-
-	if (!container->grp)
-		return -ENXIO;
+	struct tce_iommu_group *tcegrp;
 
 	if (!current->mm)
 		return -ESRCH; /* process exited */
@@ -175,7 +267,12 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * as there is no way to know how much we should increment
 	 * the locked_vm counter.
 	 */
-	table_group = iommu_group_get_iommudata(container->grp);
+	if (!tce_groups_attached(container))
+		return -ENODEV;
+
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
 	if (!table_group)
 		return -ENODEV;
 
@@ -211,7 +308,7 @@ static void *tce_iommu_open(unsigned long arg)
 {
 	struct tce_container *container;
 
-	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 		pr_err("tce_vfio: Wrong IOMMU type\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -221,18 +318,45 @@ static void *tce_iommu_open(unsigned long arg)
 		return ERR_PTR(-ENOMEM);
 
 	mutex_init(&container->lock);
+	INIT_LIST_HEAD_RCU(&container->group_list);
+
+	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
 	return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
+static void tce_iommu_free_table(struct iommu_table *tbl);
+
 static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
+	long i;
 
-	WARN_ON(container->grp);
+	while (tce_groups_attached(container)) {
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
+	}
 
-	if (container->grp)
-		tce_iommu_detach_group(iommu_data, container->grp);
+	/*
+	 * If VFIO created a table, it was not disposed
+	 * by tce_iommu_detach_group() so do it now.
+	 */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_free_table(tbl);
+	}
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -249,6 +373,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
 	put_page(page);
 }
 
+static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
+		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem;
+
+	mem = mm_iommu_lookup(tce, size);
+	if (!mem)
+		return -EINVAL;
+
+	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
+	if (ret)
+		return -EINVAL;
+
+	*pmem = mem;
+
+	return 0;
+}
+
+static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
+		unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	int ret;
+	unsigned long hpa = 0;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua || !current || !current->mm)
+		return;
+
+	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
+			&hpa, &mem);
+	if (ret)
+		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
+				__func__, *pua, entry, ret);
+	if (mem)
+		mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+}
+
 static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
@@ -267,6 +432,11 @@ static int tce_iommu_clear(struct tce_container *container,
 		if (direction == DMA_NONE)
 			continue;
 
+		if (container->v2) {
+			tce_iommu_unuse_page_v2(tbl, entry);
+			continue;
+		}
+
 		tce_iommu_unuse_page(container, oldhpa);
 	}
 
@@ -333,6 +503,64 @@ static long tce_iommu_build(struct tce_container *container,
 	return ret;
 }
 
+static long tce_iommu_build_v2(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
+{
+	long i, ret = 0;
+	struct page *page;
+	unsigned long hpa;
+	enum dma_data_direction dirtmp;
+
+	for (i = 0; i < pages; ++i) {
+		struct mm_iommu_table_group_mem_t *mem = NULL;
+		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
+				entry + i);
+
+		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
+				&hpa, &mem);
+		if (ret)
+			break;
+
+		page = pfn_to_page(hpa >> PAGE_SHIFT);
+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
+		/* Preserve offset within IOMMU page */
+		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+		dirtmp = direction;
+
+		/* The registered region is being unregistered */
+		if (mm_iommu_mapped_inc(mem))
+			break;
+
+		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		if (ret) {
+			/* dirtmp cannot be DMA_NONE here */
+			tce_iommu_unuse_page_v2(tbl, entry + i);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page_v2(tbl, entry + i);
+
+		*pua = tce;
+
+		tce += IOMMU_PAGE_SIZE(tbl);
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+
+	return ret;
+}
+
 static long tce_iommu_create_table(struct tce_container *container,
 			struct iommu_table_group *table_group,
 			int num,
@@ -358,6 +586,12 @@ static long tce_iommu_create_table(struct tce_container *container,
 	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 
+	if (!ret && container->v2) {
+		ret = tce_iommu_userspace_view_alloc(*ptbl);
+		if (ret)
+			(*ptbl)->it_ops->free(*ptbl);
+	}
+
 	if (ret)
 		decrement_locked_vm(table_size >> PAGE_SHIFT);
 
@@ -368,6 +602,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
 {
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
+	tce_iommu_userspace_view_free(tbl);
 	tbl->it_ops->free(tbl);
 	decrement_locked_vm(pages);
 }
@@ -383,6 +618,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_CHECK_EXTENSION:
 		switch (arg) {
 		case VFIO_SPAPR_TCE_IOMMU:
+		case VFIO_SPAPR_TCE_v2_IOMMU:
 			ret = 1;
 			break;
 		default:
@@ -394,12 +630,15 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
+		struct tce_iommu_group *tcegrp;
 		struct iommu_table_group *table_group;
 
-		if (WARN_ON(!container->grp))
+		if (!tce_groups_attached(container))
 			return -ENXIO;
 
-		table_group = iommu_group_get_iommudata(container->grp);
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
 
 		if (!table_group)
 			return -ENXIO;
@@ -467,11 +706,18 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		ret = tce_iommu_build(container, tbl,
-				param.iova >> tbl->it_page_shift,
-				param.vaddr,
-				param.size >> tbl->it_page_shift,
-				direction);
+		if (container->v2)
+			ret = tce_iommu_build_v2(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
+		else
+			ret = tce_iommu_build(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
 
 		iommu_flush_tce(tbl);
 
@@ -517,7 +763,61 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 		return ret;
 	}
+	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_register_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		tce_iommu_unregister_pages(container, param.vaddr, param.size);
+		mutex_unlock(&container->lock);
+
+		return 0;
+	}
 	case VFIO_IOMMU_ENABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		ret = tce_iommu_enable(container);
 		mutex_unlock(&container->lock);
@@ -525,16 +825,27 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 
 	case VFIO_IOMMU_DISABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		tce_iommu_disable(container);
 		mutex_unlock(&container->lock);
 		return 0;
-	case VFIO_EEH_PE_OP:
-		if (!container->grp)
-			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->grp,
-						  cmd, arg);
+	case VFIO_EEH_PE_OP: {
+		struct tce_iommu_group *tcegrp;
+
+		ret = 0;
+		list_for_each_entry(tcegrp, &container->group_list, next) {
+			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
+					cmd, arg);
+			if (ret)
+				return ret;
+		}
+		return ret;
+	}
+
 	}
 
 	return -ENOTTY;
@@ -546,14 +857,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
 	int i;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = table_group->tables[i];
+		struct iommu_table *tbl = container->tables[i];
 
 		if (!tbl)
 			continue;
 
 		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_userspace_view_free(tbl);
 		if (tbl->it_map)
 			iommu_release_ownership(tbl);
+
+		container->tables[i] = NULL;
 	}
 }
 
@@ -568,7 +882,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
 		if (!tbl || !tbl->it_map)
 			continue;
 
-		rc = iommu_take_ownership(tbl);
+		rc = tce_iommu_userspace_view_alloc(tbl);
+		if (!rc)
+			rc = iommu_take_ownership(tbl);
+
 		if (rc) {
 			for (j = 0; j < i; ++j)
 				iommu_release_ownership(
@@ -578,38 +895,57 @@ static int tce_iommu_take_ownership(struct tce_container *container,
 		}
 	}
 
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		container->tables[i] = table_group->tables[i];
+
 	return 0;
 }
 
 static int tce_iommu_attach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
-	int ret;
+	int ret, i;
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp = NULL;
+	bool first_group = !tce_groups_attached(container);
 
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->grp) {
-		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->grp),
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-		goto unlock_exit;
-	}
-
-	if (container->enabled) {
-		pr_err("tce_vfio: attaching group #%u to enabled container\n",
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-		goto unlock_exit;
-	}
-
 	table_group = iommu_group_get_iommudata(iommu_group);
-	if (!table_group) {
-		ret = -ENXIO;
+
+	if (!first_group && (!table_group->ops ||
+			!table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)) {
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/* Check if new group has the same iommu_ops (i.e. compatible) */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		struct iommu_table_group *table_group_tmp;
+
+		if (tcegrp->grp == iommu_group) {
+			pr_warn("tce_vfio: Group %d is already attached\n",
+					iommu_group_id(iommu_group));
+			ret = -EBUSY;
+			goto unlock_exit;
+		}
+		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+		if (table_group_tmp->ops != table_group->ops) {
+			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+					iommu_group_id(iommu_group),
+					iommu_group_id(tcegrp->grp));
+			ret = -EPERM;
+			goto unlock_exit;
+		}
+	}
+
+	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+	if (!tcegrp) {
+		ret = -ENOMEM;
 		goto unlock_exit;
 	}
 
@@ -628,28 +964,50 @@ static int tce_iommu_attach_group(void *iommu_data,
 		 * the pages that has been explicitly mapped into the iommu
 		 */
 		table_group->ops->take_ownership(table_group);
-		ret = tce_iommu_create_table(container,
-				table_group,
-				0, /* window number */
-				IOMMU_PAGE_SHIFT_4K,
-				table_group->tce32_size,
-				1, /* default levels */
-				&tbl);
-		if (!ret) {
-			ret = table_group->ops->set_window(table_group, 0, tbl);
+
+		/*
+		 * If it the first group attached, check if there is
+		 * a default DMA window and create one if none as
+		 * the userspace expects it to exist.
+		 */
+		if (first_group && !container->tables[0]) {
+			ret = tce_iommu_create_table(container,
+					table_group,
+					0, /* window number */
+					IOMMU_PAGE_SHIFT_4K,
+					table_group->tce32_size,
+					1, /* default levels */
+					&tbl);
 			if (ret)
-				tce_iommu_free_table(tbl);
+				goto unlock_exit;
 			else
-				table_group->tables[0] = tbl;
+				container->tables[0] = tbl;
+		}
+
+		/* Set all windows to the new group */
+		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+			tbl = container->tables[i];
+
+			if (!tbl)
+				continue;
+
+			/* Set the default window to a new group */
+			ret = table_group->ops->set_window(table_group, i, tbl);
+			if (ret)
+				break;
 		}
 	}
 
 	if (ret)
 		goto unlock_exit;
 
-	container->grp = iommu_group;
+	tcegrp->grp = iommu_group;
+	list_add(&tcegrp->next, &container->group_list);
 
 unlock_exit:
+	if (ret && tcegrp)
+		kfree(tcegrp);
+
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -660,25 +1018,27 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
 	long i;
+	bool found = false;
 
 	mutex_lock(&container->lock);
-	if (iommu_group != container->grp) {
-		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-				iommu_group_id(iommu_group),
-				iommu_group_id(container->grp));
+
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		if (tcegrp->grp == iommu_group) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		pr_warn("tce_vfio: detaching unattached group #%u\n",
+				iommu_group_id(iommu_group));
 		goto unlock_exit;
 	}
 
-	if (container->enabled) {
-		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(container->grp));
-		tce_iommu_disable(container);
-	}
-
-	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-	   iommu_group_id(iommu_group), iommu_group); */
-	container->grp = NULL;
+	list_del(&tcegrp->next);
+	kfree(tcegrp);
 
 	table_group = iommu_group_get_iommudata(iommu_group);
 	BUG_ON(!table_group);
@@ -689,18 +1049,8 @@ static void tce_iommu_detach_group(void *iommu_data,
 	else if (!table_group->ops->unset_window)
 		WARN_ON_ONCE(1);
 	else {
-		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-			/* Store table pointer as unset_window resets it */
-			struct iommu_table *tbl = table_group->tables[i];
-
-			if (!tbl)
-				continue;
-
+		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
 			table_group->ops->unset_window(table_group, i);
-			tce_iommu_clear(container, tbl,
-					tbl->it_offset, tbl->it_size);
-			tce_iommu_free_table(tbl);
-		}
 
 		table_group->ops->release_ownership(table_group);
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b57b750..8fdcfb9 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -495,6 +497,31 @@ struct vfio_eeh_pe_op {
 
 #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
 
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+	__u32	argsz;
+	__u32	flags;
+	__u64	vaddr;				/* Process virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2
  2015-05-11 15:39 ` [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2 Alexey Kardashevskiy
@ 2015-05-13 21:30   ` Alex Williamson
  2015-05-14  6:08     ` Alexey Kardashevskiy
  0 siblings, 1 reply; 82+ messages in thread
From: Alex Williamson @ 2015-05-13 21:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Wei Yang, Gavin Shan, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> The existing implementation accounts the whole DMA window in
> the locked_vm counter. This is going to be worse with multiple
> containers and huge DMA windows. Also, real-time accounting would require
> additional tracking of accounted pages due to the page size difference -
> IOMMU uses 4K pages and system uses 4K or 64K pages.
> 
> Another issue is that actual pages pinning/unpinning happens on every
> DMA map/unmap request. This does not affect the performance much now as
> we spend way too much time now on switching context between
> guest/userspace/host but this will start to matter when we add in-kernel
> DMA map/unmap acceleration.
> 
> This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
> New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
> 2 new ioctls to register/unregister DMA memory -
> VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
> which receive user space address and size of a memory region which
> needs to be pinned/unpinned and counted in locked_vm.
> New IOMMU splits physical pages pinning and TCE table update
> into 2 different operations. It requires:
> 1) guest pages to be registered first
> 2) consequent map/unmap requests to work only with pre-registered memory.
> For the default single window case this means that the entire guest
> (instead of 2GB) needs to be pinned before using VFIO.
> When a huge DMA window is added, no additional pinning will be
> required, otherwise it would be guest RAM + 2GB.
> 
> The new memory registration ioctls are not supported by
> VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
> will require memory to be preregistered in order to work.
> 
> The accounting is done per the user process.
> 
> This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
> can do with v1 or v2 IOMMUs.
> 
> In order to support memory pre-registration, we need a way to track
> the use of every registered memory region and only allow unregistration
> if a region is not in use anymore. So we need a way to tell from what
> region the just cleared TCE was from.
> 
> This adds a userspace view of the TCE table into iommu_table struct.
> It contains userspace address, one per TCE entry. The table is only
> allocated when the ownership over an IOMMU group is taken which means
> it is only used from outside of the powernv code (such as VFIO).
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> [aw: for the vfio related changes]
> Acked-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> Alex, should I remove your "acked-by" in the cases like this and
> get another one?
Generally if it's more than a trivial change, you'll want fresh acks.
> ---
> Changes:
> v10:
> * moved it_userspace allocation to vfio_iommu_spapr_tce as it is a VFIO
> specific thing
> * squashed "powerpc/iommu: Add userspace view of TCE table" into this as
> it is
> a part of IOMMU v2
> * s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
> * fixed some function names to have "tce_iommu_" in the beginning rather
> just "tce_"
> * as mm_iommu_mapped_inc() can now fail, check for the return code
> 
> v9:
> * s/tce_get_hva_cached/tce_iommu_use_page_v2/
> 
> v7:
> * now memory is registered per mm (i.e. process)
> * moved memory registration code to powerpc/mmu
> * merged "vfio: powerpc/spapr: Define v2 IOMMU" into this
> * limited new ioctls to v2 IOMMU
> * updated doc
> * unsupported ioctls return -ENOTTY instead of -EPERM
> 
> v6:
> * tce_get_hva_cached() returns hva via a pointer
> 
> v4:
> * updated docs
> * s/kzmalloc/vzalloc/
> * in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
> replaced offset with index
> * renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
> and removed duplicating vfio_iommu_spapr_register_memory
> ---
>  Documentation/vfio.txt              |  31 ++-
>  arch/powerpc/include/asm/iommu.h    |   6 +
>  drivers/vfio/vfio_iommu_spapr_tce.c | 516 ++++++++++++++++++++++++++++++------
>  include/uapi/linux/vfio.h           |  27 ++
>  4 files changed, 494 insertions(+), 86 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 96978ec..7dcf2b5 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>  
>  This implementation has some specifics:
>  
> -1) Only one IOMMU group per container is supported as an IOMMU group
> -represents the minimal entity which isolation can be guaranteed for and
> -groups are allocated statically, one per a Partitionable Endpoint (PE)
> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> +container is supported as an IOMMU table is allocated at the boot time,
> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>  (PE is often a PCI domain but not always).
> +Newer systems (POWER8 with IODA2) have improved hardware design which allows
> +to remove this limitation and have multiple IOMMU groups per a VFIO container.
>  
>  2) The hardware supports so called DMA windows - the PCI address range
>  within which DMA transfer is allowed, any attempt to access address space
> @@ -427,6 +429,29 @@ The code flow from the example above should be slightly changed:
>  
>  	....
>  
> +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
> +VFIO_IOMMU_DISABLE and implements 2 new ioctls:
> +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
> +(which are unsupported in v1 IOMMU).
> +
> +PPC64 paravirtualized guests generate a lot of map/unmap requests,
> +and the handling of those includes pinning/unpinning pages and updating
> +mm::locked_vm counter to make sure we do not exceed the rlimit.
> +The v2 IOMMU splits accounting and pinning into separate operations:
> +
> +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
> +receive a user space address and size of the block to be pinned.
> +Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
> +be called with the exact address and size used for registering
> +the memory block. The userspace is not expected to call these often.
> +The ranges are stored in a linked list in a VFIO container.
> +
> +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
> +IOMMU table and do not do pinning; instead these check that the userspace
> +address is from pre-registered range.
> +
> +This separation helps in optimizing DMA for guests.
> +
>  -------------------------------------------------------------------------------
>  
>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index c8bad21..763c041 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -113,10 +113,16 @@ struct iommu_table {
>  	unsigned long  it_page_shift;/* table iommu page size */
>  #ifdef CONFIG_IOMMU_API
>  	struct list_head it_group_list;/* List of iommu_table_group_link */
> +	unsigned long *it_userspace; /* userspace view of the table */
>  #endif
>  	struct iommu_table_ops *it_ops;
>  };
>  
> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> +		((tbl)->it_userspace ? \
> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> +			NULL)
> +
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
>  int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 8943b29..e7e8db3 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -19,8 +19,10 @@
>  #include <linux/uaccess.h>
>  #include <linux/err.h>
>  #include <linux/vfio.h>
> +#include <linux/vmalloc.h>
>  #include <asm/iommu.h>
>  #include <asm/tce.h>
> +#include <asm/mmu_context.h>
>  
>  #define DRIVER_VERSION  "0.1"
>  #define DRIVER_AUTHOR   "aik@ozlabs.ru"
> @@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages)
>   * into DMA'ble space using the IOMMU
>   */
>  
> +struct tce_iommu_group {
> +	struct list_head next;
> +	struct iommu_group *grp;
> +};
> +
>  /*
>   * The container descriptor supports only a single group per container.
>   * Required by the API as the container is not supplied with the IOMMU group
> @@ -88,11 +95,98 @@ static void decrement_locked_vm(long npages)
>   */
>  struct tce_container {
>  	struct mutex lock;
> -	struct iommu_group *grp;
>  	bool enabled;
>  	unsigned long locked_pages;
> +	bool v2;
> +	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> +	struct list_head group_list;
You're wasting space by not packing your bools next to each other.
>  };
>  
> +static long tce_iommu_unregister_pages(struct tce_container *container,
> +		__u64 vaddr, __u64 size)
> +{
> +	long ret;
> +	struct mm_iommu_table_group_mem_t *mem;
> +
> +	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
> +		return -EINVAL;
> +
> +	mem = mm_iommu_get(vaddr, size >> PAGE_SHIFT);
> +	if (!mem)
> +		return -EINVAL;
> +
> +	ret = mm_iommu_put(mem); /* undo kref_get() from mm_iommu_get() */
> +	if (!ret)
> +		ret = mm_iommu_put(mem);
Should "put" really be able to fail?  I think you really need to examine
your reference model, mm_iommu_put() looks pretty suspicious.  If
there's an implicit reference by being mapped, it should be handled that
way, not via an atomic that gets decremented then corrected.  That's not
only not atomic, but causes lots of fallout with references that don't
get released.  Notice how you don't even check the return value at the
call location of this function?  How many references does that
potentially leave and where do they get resolved?
> +
> +	return ret;
> +}
> +
> +static long tce_iommu_register_pages(struct tce_container *container,
> +		__u64 vaddr, __u64 size)
> +{
> +	long ret = 0;
> +	struct mm_iommu_table_group_mem_t *mem;
> +	unsigned long entries = size >> PAGE_SHIFT;
> +
> +	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
> +			((vaddr + size) < vaddr))
> +		return -EINVAL;
> +
> +	mem = mm_iommu_get(vaddr, entries);
> +	if (!mem) {
> +		ret = try_increment_locked_vm(entries);
> +		if (ret)
> +			return ret;
> +
> +		ret = mm_iommu_alloc(vaddr, entries, &mem);
> +		if (ret) {
> +			decrement_locked_vm(entries);
> +			return ret;
> +		}
> +	}
> +
> +	container->enabled = true;
> +
> +	return 0;
> +}
> +
> +static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
> +{
> +	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
> +			tbl->it_size, PAGE_SIZE);
> +	unsigned long *uas;
> +	long ret;
> +
> +	BUG_ON(tbl->it_userspace);
> +
> +	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
> +	if (ret)
> +		return ret;
> +
> +	uas = vzalloc(cb);
> +	if (!uas) {
> +		decrement_locked_vm(cb >> PAGE_SHIFT);
> +		return -ENOMEM;
> +	}
> +	tbl->it_userspace = uas;
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
> +{
> +	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
> +			tbl->it_size, PAGE_SIZE);
> +
> +	if (!tbl->it_userspace)
> +		return;
> +
> +	vfree(tbl->it_userspace);
> +	tbl->it_userspace = NULL;
> +	decrement_locked_vm(cb >> PAGE_SHIFT);
> +}
> +
>  static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>  {
>  	/*
> @@ -103,18 +197,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>  	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
>  }
>  
> +static inline bool tce_groups_attached(struct tce_container *container)
> +{
> +	return !list_empty(&container->group_list);
> +}
> +
>  static long tce_iommu_find_table(struct tce_container *container,
>  		phys_addr_t ioba, struct iommu_table **ptbl)
>  {
>  	long i;
> -	struct iommu_table_group *table_group;
> -
> -	table_group = iommu_group_get_iommudata(container->grp);
> -	if (!table_group)
> -		return -1;
>  
>  	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -		struct iommu_table *tbl = table_group->tables[i];
> +		struct iommu_table *tbl = container->tables[i];
>  
>  		if (tbl) {
>  			unsigned long entry = ioba >> tbl->it_page_shift;
> @@ -136,9 +230,7 @@ static int tce_iommu_enable(struct tce_container *container)
>  	int ret = 0;
>  	unsigned long locked;
>  	struct iommu_table_group *table_group;
> -
> -	if (!container->grp)
> -		return -ENXIO;
> +	struct tce_iommu_group *tcegrp;
>  
>  	if (!current->mm)
>  		return -ESRCH; /* process exited */
> @@ -175,7 +267,12 @@ static int tce_iommu_enable(struct tce_container *container)
>  	 * as there is no way to know how much we should increment
>  	 * the locked_vm counter.
>  	 */
> -	table_group = iommu_group_get_iommudata(container->grp);
> +	if (!tce_groups_attached(container))
> +		return -ENODEV;
> +
> +	tcegrp = list_first_entry(&container->group_list,
> +			struct tce_iommu_group, next);
> +	table_group = iommu_group_get_iommudata(tcegrp->grp);
>  	if (!table_group)
>  		return -ENODEV;
>  
> @@ -211,7 +308,7 @@ static void *tce_iommu_open(unsigned long arg)
>  {
>  	struct tce_container *container;
>  
> -	if (arg != VFIO_SPAPR_TCE_IOMMU) {
> +	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
>  		pr_err("tce_vfio: Wrong IOMMU type\n");
>  		return ERR_PTR(-EINVAL);
>  	}
> @@ -221,18 +318,45 @@ static void *tce_iommu_open(unsigned long arg)
>  		return ERR_PTR(-ENOMEM);
>  
>  	mutex_init(&container->lock);
> +	INIT_LIST_HEAD_RCU(&container->group_list);
> +
> +	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
>  
>  	return container;
>  }
>  
> +static int tce_iommu_clear(struct tce_container *container,
> +		struct iommu_table *tbl,
> +		unsigned long entry, unsigned long pages);
> +static void tce_iommu_free_table(struct iommu_table *tbl);
> +
>  static void tce_iommu_release(void *iommu_data)
>  {
>  	struct tce_container *container = iommu_data;
> +	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp;
> +	long i;
>  
> -	WARN_ON(container->grp);
> +	while (tce_groups_attached(container)) {
> +		tcegrp = list_first_entry(&container->group_list,
> +				struct tce_iommu_group, next);
> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
> +		tce_iommu_detach_group(iommu_data, tcegrp->grp);
> +	}
>  
> -	if (container->grp)
> -		tce_iommu_detach_group(iommu_data, container->grp);
> +	/*
> +	 * If VFIO created a table, it was not disposed
> +	 * by tce_iommu_detach_group() so do it now.
> +	 */
> +	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> +		struct iommu_table *tbl = container->tables[i];
> +
> +		if (!tbl)
> +			continue;
> +
> +		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> +		tce_iommu_free_table(tbl);
> +	}
>  
>  	tce_iommu_disable(container);
>  	mutex_destroy(&container->lock);
> @@ -249,6 +373,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
>  	put_page(page);
>  }
>  
> +static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
> +		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
> +{
> +	long ret = 0;
> +	struct mm_iommu_table_group_mem_t *mem;
> +
> +	mem = mm_iommu_lookup(tce, size);
> +	if (!mem)
> +		return -EINVAL;
> +
> +	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
> +	if (ret)
> +		return -EINVAL;
> +
> +	*pmem = mem;
> +
> +	return 0;
> +}
> +
> +static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
> +		unsigned long entry)
> +{
> +	struct mm_iommu_table_group_mem_t *mem = NULL;
> +	int ret;
> +	unsigned long hpa = 0;
> +	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
> +
> +	if (!pua || !current || !current->mm)
> +		return;
> +
> +	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
> +			&hpa, &mem);
> +	if (ret)
> +		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
> +				__func__, *pua, entry, ret);
> +	if (mem)
> +		mm_iommu_mapped_dec(mem);
> +
> +	*pua = 0;
> +}
> +
>  static int tce_iommu_clear(struct tce_container *container,
>  		struct iommu_table *tbl,
>  		unsigned long entry, unsigned long pages)
> @@ -267,6 +432,11 @@ static int tce_iommu_clear(struct tce_container *container,
>  		if (direction == DMA_NONE)
>  			continue;
>  
> +		if (container->v2) {
> +			tce_iommu_unuse_page_v2(tbl, entry);
> +			continue;
> +		}
> +
>  		tce_iommu_unuse_page(container, oldhpa);
>  	}
>  
> @@ -333,6 +503,64 @@ static long tce_iommu_build(struct tce_container *container,
>  	return ret;
>  }
>  
> +static long tce_iommu_build_v2(struct tce_container *container,
> +		struct iommu_table *tbl,
> +		unsigned long entry, unsigned long tce, unsigned long pages,
> +		enum dma_data_direction direction)
> +{
> +	long i, ret = 0;
> +	struct page *page;
> +	unsigned long hpa;
> +	enum dma_data_direction dirtmp;
> +
> +	for (i = 0; i < pages; ++i) {
> +		struct mm_iommu_table_group_mem_t *mem = NULL;
> +		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
> +				entry + i);
> +
> +		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
> +				&hpa, &mem);
> +		if (ret)
> +			break;
> +
> +		page = pfn_to_page(hpa >> PAGE_SHIFT);
> +		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> +			ret = -EPERM;
> +			break;
> +		}
> +
> +		/* Preserve offset within IOMMU page */
> +		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
> +		dirtmp = direction;
> +
> +		/* The registered region is being unregistered */
> +		if (mm_iommu_mapped_inc(mem))
> +			break;
> +
> +		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
> +		if (ret) {
> +			/* dirtmp cannot be DMA_NONE here */
> +			tce_iommu_unuse_page_v2(tbl, entry + i);
> +			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
> +					__func__, entry << tbl->it_page_shift,
> +					tce, ret);
> +			break;
> +		}
> +
> +		if (dirtmp != DMA_NONE)
> +			tce_iommu_unuse_page_v2(tbl, entry + i);
> +
> +		*pua = tce;
> +
> +		tce += IOMMU_PAGE_SIZE(tbl);
> +	}
> +
> +	if (ret)
> +		tce_iommu_clear(container, tbl, entry, i);
> +
> +	return ret;
> +}
> +
>  static long tce_iommu_create_table(struct tce_container *container,
>  			struct iommu_table_group *table_group,
>  			int num,
> @@ -358,6 +586,12 @@ static long tce_iommu_create_table(struct tce_container *container,
>  	WARN_ON(!ret && !(*ptbl)->it_ops->free);
>  	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
>  
> +	if (!ret && container->v2) {
> +		ret = tce_iommu_userspace_view_alloc(*ptbl);
> +		if (ret)
> +			(*ptbl)->it_ops->free(*ptbl);
> +	}
> +
>  	if (ret)
>  		decrement_locked_vm(table_size >> PAGE_SHIFT);
>  
> @@ -368,6 +602,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
>  {
>  	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
>  
> +	tce_iommu_userspace_view_free(tbl);
>  	tbl->it_ops->free(tbl);
>  	decrement_locked_vm(pages);
>  }
> @@ -383,6 +618,7 @@ static long tce_iommu_ioctl(void *iommu_data,
>  	case VFIO_CHECK_EXTENSION:
>  		switch (arg) {
>  		case VFIO_SPAPR_TCE_IOMMU:
> +		case VFIO_SPAPR_TCE_v2_IOMMU:
>  			ret = 1;
>  			break;
>  		default:
> @@ -394,12 +630,15 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>  	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>  		struct vfio_iommu_spapr_tce_info info;
> +		struct tce_iommu_group *tcegrp;
>  		struct iommu_table_group *table_group;
>  
> -		if (WARN_ON(!container->grp))
> +		if (!tce_groups_attached(container))
>  			return -ENXIO;
>  
> -		table_group = iommu_group_get_iommudata(container->grp);
> +		tcegrp = list_first_entry(&container->group_list,
> +				struct tce_iommu_group, next);
> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
>  
>  		if (!table_group)
>  			return -ENXIO;
> @@ -467,11 +706,18 @@ static long tce_iommu_ioctl(void *iommu_data,
>  		if (ret)
>  			return ret;
>  
> -		ret = tce_iommu_build(container, tbl,
> -				param.iova >> tbl->it_page_shift,
> -				param.vaddr,
> -				param.size >> tbl->it_page_shift,
> -				direction);
> +		if (container->v2)
> +			ret = tce_iommu_build_v2(container, tbl,
> +					param.iova >> tbl->it_page_shift,
> +					param.vaddr,
> +					param.size >> tbl->it_page_shift,
> +					direction);
> +		else
> +			ret = tce_iommu_build(container, tbl,
> +					param.iova >> tbl->it_page_shift,
> +					param.vaddr,
> +					param.size >> tbl->it_page_shift,
> +					direction);
>  
>  		iommu_flush_tce(tbl);
>  
> @@ -517,7 +763,61 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>  		return ret;
>  	}
> +	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
> +		struct vfio_iommu_spapr_register_memory param;
> +
> +		if (!container->v2)
> +			break;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
> +				size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		/* No flag is supported now */
> +		if (param.flags)
> +			return -EINVAL;
> +
> +		mutex_lock(&container->lock);
> +		ret = tce_iommu_register_pages(container, param.vaddr,
> +				param.size);
> +		mutex_unlock(&container->lock);
> +
> +		return ret;
> +	}
> +	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
> +		struct vfio_iommu_spapr_register_memory param;
> +
> +		if (!container->v2)
> +			break;
> +
> +		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
> +				size);
> +
> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (param.argsz < minsz)
> +			return -EINVAL;
> +
> +		/* No flag is supported now */
> +		if (param.flags)
> +			return -EINVAL;
> +
> +		mutex_lock(&container->lock);
> +		tce_iommu_unregister_pages(container, param.vaddr, param.size);
> +		mutex_unlock(&container->lock);
> +
> +		return 0;
> +	}
>  	case VFIO_IOMMU_ENABLE:
> +		if (container->v2)
> +			break;
> +
>  		mutex_lock(&container->lock);
>  		ret = tce_iommu_enable(container);
>  		mutex_unlock(&container->lock);
> @@ -525,16 +825,27 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
> 
>  	case VFIO_IOMMU_DISABLE:
> +		if (container->v2)
> +			break;
> +
>  		mutex_lock(&container->lock);
>  		tce_iommu_disable(container);
>  		mutex_unlock(&container->lock);
>  		return 0;
> -	case VFIO_EEH_PE_OP:
> -		if (!container->grp)
> -			return -ENODEV;
>  
> -		return vfio_spapr_iommu_eeh_ioctl(container->grp,
> -						  cmd, arg);
> +	case VFIO_EEH_PE_OP: {
> +		struct tce_iommu_group *tcegrp;
> +
> +		ret = 0;
> +		list_for_each_entry(tcegrp, &container->group_list, next) {
> +			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
> +					cmd, arg);
> +			if (ret)
> +				return ret;
> +		}
> +		return ret;
> +	}
> +
>  	}
>  
>  	return -ENOTTY;
> @@ -546,14 +857,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
>  	int i;
>  
>  	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -		struct iommu_table *tbl = table_group->tables[i];
> +		struct iommu_table *tbl = container->tables[i];
>  
>  		if (!tbl)
>  			continue;
>  
>  		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> +		tce_iommu_userspace_view_free(tbl);
>  		if (tbl->it_map)
>  			iommu_release_ownership(tbl);
> +
> +		container->tables[i] = NULL;
>  	}
>  }
>  
> @@ -568,7 +882,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
>  		if (!tbl || !tbl->it_map)
>  			continue;
>  
> -		rc = iommu_take_ownership(tbl);
> +		rc = tce_iommu_userspace_view_alloc(tbl);
> +		if (!rc)
> +			rc = iommu_take_ownership(tbl);
> +
>  		if (rc) {
>  			for (j = 0; j < i; ++j)
>  				iommu_release_ownership(
> @@ -578,38 +895,57 @@ static int tce_iommu_take_ownership(struct tce_container *container,
>  		}
>  	}
>  
> +	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
> +		container->tables[i] = table_group->tables[i];
> +
>  	return 0;
>  }
>  
>  static int tce_iommu_attach_group(void *iommu_data,
>  		struct iommu_group *iommu_group)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tce_container *container = iommu_data;
>  	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp = NULL;
> +	bool first_group = !tce_groups_attached(container);
>  
>  	mutex_lock(&container->lock);
>  
>  	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>  			iommu_group_id(iommu_group), iommu_group); */
> -	if (container->grp) {
> -		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> -				iommu_group_id(container->grp),
> -				iommu_group_id(iommu_group));
> -		ret = -EBUSY;
> -		goto unlock_exit;
> -	}
> -
> -	if (container->enabled) {
> -		pr_err("tce_vfio: attaching group #%u to enabled container\n",
> -				iommu_group_id(iommu_group));
> -		ret = -EBUSY;
> -		goto unlock_exit;
> -	}
> -
>  	table_group = iommu_group_get_iommudata(iommu_group);
> -	if (!table_group) {
> -		ret = -ENXIO;
> +
> +	if (!first_group && (!table_group->ops ||
> +			!table_group->ops->take_ownership ||
> +			!table_group->ops->release_ownership)) {
> +		ret = -EBUSY;
> +		goto unlock_exit;
> +	}
> +
> +	/* Check if new group has the same iommu_ops (i.e. compatible) */
> +	list_for_each_entry(tcegrp, &container->group_list, next) {
> +		struct iommu_table_group *table_group_tmp;
> +
> +		if (tcegrp->grp == iommu_group) {
> +			pr_warn("tce_vfio: Group %d is already attached\n",
> +					iommu_group_id(iommu_group));
> +			ret = -EBUSY;
> +			goto unlock_exit;
> +		}
> +		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
> +		if (table_group_tmp->ops != table_group->ops) {
> +			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
> +					iommu_group_id(iommu_group),
> +					iommu_group_id(tcegrp->grp));
> +			ret = -EPERM;
> +			goto unlock_exit;
> +		}
> +	}
> +
> +	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
> +	if (!tcegrp) {
> +		ret = -ENOMEM;
>  		goto unlock_exit;
>  	}
>  
> @@ -628,28 +964,50 @@ static int tce_iommu_attach_group(void *iommu_data,
>  		 * the pages that has been explicitly mapped into the iommu
>  		 */
>  		table_group->ops->take_ownership(table_group);
> -		ret = tce_iommu_create_table(container,
> -				table_group,
> -				0, /* window number */
> -				IOMMU_PAGE_SHIFT_4K,
> -				table_group->tce32_size,
> -				1, /* default levels */
> -				&tbl);
> -		if (!ret) {
> -			ret = table_group->ops->set_window(table_group, 0, tbl);
> +
> +		/*
> +		 * If it the first group attached, check if there is
> +		 * a default DMA window and create one if none as
> +		 * the userspace expects it to exist.
> +		 */
> +		if (first_group && !container->tables[0]) {
> +			ret = tce_iommu_create_table(container,
> +					table_group,
> +					0, /* window number */
> +					IOMMU_PAGE_SHIFT_4K,
> +					table_group->tce32_size,
> +					1, /* default levels */
> +					&tbl);
>  			if (ret)
> -				tce_iommu_free_table(tbl);
> +				goto unlock_exit;
>  			else
> -				table_group->tables[0] = tbl;
> +				container->tables[0] = tbl;
> +		}
> +
> +		/* Set all windows to the new group */
> +		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> +			tbl = container->tables[i];
> +
> +			if (!tbl)
> +				continue;
> +
> +			/* Set the default window to a new group */
> +			ret = table_group->ops->set_window(table_group, i, tbl);
> +			if (ret)
> +				break;
>  		}
>  	}
>  
>  	if (ret)
>  		goto unlock_exit;
>  
> -	container->grp = iommu_group;
> +	tcegrp->grp = iommu_group;
> +	list_add(&tcegrp->next, &container->group_list);
>  
>  unlock_exit:
> +	if (ret && tcegrp)
> +		kfree(tcegrp);
> +
>  	mutex_unlock(&container->lock);
>  
>  	return ret;
> @@ -660,25 +1018,27 @@ static void tce_iommu_detach_group(void *iommu_data,
>  {
>  	struct tce_container *container = iommu_data;
>  	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp;
>  	long i;
> +	bool found = false;
>  
>  	mutex_lock(&container->lock);
> -	if (iommu_group != container->grp) {
> -		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> -				iommu_group_id(iommu_group),
> -				iommu_group_id(container->grp));
> +
> +	list_for_each_entry(tcegrp, &container->group_list, next) {
> +		if (tcegrp->grp == iommu_group) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		pr_warn("tce_vfio: detaching unattached group #%u\n",
> +				iommu_group_id(iommu_group));
>  		goto unlock_exit;
>  	}
>  
> -	if (container->enabled) {
> -		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
> -				iommu_group_id(container->grp));
> -		tce_iommu_disable(container);
> -	}
> -
> -	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> -	   iommu_group_id(iommu_group), iommu_group); */
> -	container->grp = NULL;
> +	list_del(&tcegrp->next);
> +	kfree(tcegrp);
>  
>  	table_group = iommu_group_get_iommudata(iommu_group);
>  	BUG_ON(!table_group);
> @@ -689,18 +1049,8 @@ static void tce_iommu_detach_group(void *iommu_data,
>  	else if (!table_group->ops->unset_window)
>  		WARN_ON_ONCE(1);
>  	else {
> -		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -			/* Store table pointer as unset_window resets it */
> -			struct iommu_table *tbl = table_group->tables[i];
> -
> -			if (!tbl)
> -				continue;
> -
> +		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
>  			table_group->ops->unset_window(table_group, i);
> -			tce_iommu_clear(container, tbl,
> -					tbl->it_offset, tbl->it_size);
> -			tce_iommu_free_table(tbl);
> -		}
>  
>  		table_group->ops->release_ownership(table_group);
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index b57b750..8fdcfb9 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -36,6 +36,8 @@
>  /* Two-stage IOMMU */
>  #define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
>  
> +#define VFIO_SPAPR_TCE_v2_IOMMU		7
> +
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
>   * structure length (argsz) and flags into structures passed between
> @@ -495,6 +497,31 @@ struct vfio_eeh_pe_op {
>  
>  #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
>  
> +/**
> + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
> + *
> + * Registers user space memory where DMA is allowed. It pins
> + * user pages and does the locked memory accounting so
> + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
> + * get faster.
> + */
> +struct vfio_iommu_spapr_register_memory {
> +	__u32	argsz;
> +	__u32	flags;
> +	__u64	vaddr;				/* Process virtual address */
> +	__u64	size;				/* Size of mapping (bytes) */
> +};
> +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
> +
> +/**
> + * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
> + *
> + * Unregisters user space memory registered with
> + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
> + * Uses vfio_iommu_spapr_register_memory for parameters.
> + */
> +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
> +
>  /* ***************************************************************** */
>  
>  #endif /* _UAPIVFIO_H */
^ permalink raw reply	[flat|nested] 82+ messages in thread
- * Re: [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2
  2015-05-13 21:30   ` Alex Williamson
@ 2015-05-14  6:08     ` Alexey Kardashevskiy
  0 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-14  6:08 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Wei Yang, Gavin Shan, linux-kernel, Paul Mackerras, linuxppc-dev,
	David Gibson
On 05/14/2015 07:30 AM, Alex Williamson wrote:
> On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
>> The existing implementation accounts the whole DMA window in
>> the locked_vm counter. This is going to be worse with multiple
>> containers and huge DMA windows. Also, real-time accounting would require
>> additional tracking of accounted pages due to the page size difference -
>> IOMMU uses 4K pages and system uses 4K or 64K pages.
>>
>> Another issue is that actual pages pinning/unpinning happens on every
>> DMA map/unmap request. This does not affect the performance much now as
>> we spend way too much time now on switching context between
>> guest/userspace/host but this will start to matter when we add in-kernel
>> DMA map/unmap acceleration.
>>
>> This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
>> New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
>> 2 new ioctls to register/unregister DMA memory -
>> VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
>> which receive user space address and size of a memory region which
>> needs to be pinned/unpinned and counted in locked_vm.
>> New IOMMU splits physical pages pinning and TCE table update
>> into 2 different operations. It requires:
>> 1) guest pages to be registered first
>> 2) subsequent map/unmap requests to work only with pre-registered memory.
>> For the default single window case this means that the entire guest
>> (instead of 2GB) needs to be pinned before using VFIO.
>> When a huge DMA window is added, no additional pinning will be
>> required, otherwise it would be guest RAM + 2GB.
>>
>> The new memory registration ioctls are not supported by
>> VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
>> will require memory to be preregistered in order to work.
>>
>> The accounting is done per the user process.
>>
>> This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
>> can do with v1 or v2 IOMMUs.
>>
>> In order to support memory pre-registration, we need a way to track
>> the use of every registered memory region and only allow unregistration
>> if a region is not in use anymore. So we need a way to tell from what
>> region the just cleared TCE was from.
>>
>> This adds a userspace view of the TCE table into iommu_table struct.
>> It contains userspace address, one per TCE entry. The table is only
>> allocated when the ownership over an IOMMU group is taken which means
>> it is only used from outside of the powernv code (such as VFIO).
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> [aw: for the vfio related changes]
>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>> ---
>>
>> Alex, should I remove your "acked-by" in the cases like this and
>> get another one?
>
>
> Generally if it's more than a trivial change, you'll want fresh acks.
>
>> ---
>> Changes:
>> v10:
>> * moved it_userspace allocation to vfio_iommu_spapr_tce as it is a VFIO
>> specific thing
>> * squashed "powerpc/iommu: Add userspace view of TCE table" into this as
>> it is
>> a part of IOMMU v2
>> * s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
>> * fixed some function names to have "tce_iommu_" in the beginning rather
>> just "tce_"
>> * as mm_iommu_mapped_inc() can now fail, check for the return code
>>
>> v9:
>> * s/tce_get_hva_cached/tce_iommu_use_page_v2/
>>
>> v7:
>> * now memory is registered per mm (i.e. process)
>> * moved memory registration code to powerpc/mmu
>> * merged "vfio: powerpc/spapr: Define v2 IOMMU" into this
>> * limited new ioctls to v2 IOMMU
>> * updated doc
>> * unsupported ioctls return -ENOTTY instead of -EPERM
>>
>> v6:
>> * tce_get_hva_cached() returns hva via a pointer
>>
>> v4:
>> * updated docs
>> * s/kzmalloc/vzalloc/
>> * in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
>> replaced offset with index
>> * renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
>> and removed duplicating vfio_iommu_spapr_register_memory
>> ---
>>   Documentation/vfio.txt              |  31 ++-
>>   arch/powerpc/include/asm/iommu.h    |   6 +
>>   drivers/vfio/vfio_iommu_spapr_tce.c | 516 ++++++++++++++++++++++++++++++------
>>   include/uapi/linux/vfio.h           |  27 ++
>>   4 files changed, 494 insertions(+), 86 deletions(-)
>>
>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>> index 96978ec..7dcf2b5 100644
>> --- a/Documentation/vfio.txt
>> +++ b/Documentation/vfio.txt
>> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>>
>>   This implementation has some specifics:
>>
>> -1) Only one IOMMU group per container is supported as an IOMMU group
>> -represents the minimal entity which isolation can be guaranteed for and
>> -groups are allocated statically, one per a Partitionable Endpoint (PE)
>> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
>> +container is supported as an IOMMU table is allocated at the boot time,
>> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>>   (PE is often a PCI domain but not always).
>> +Newer systems (POWER8 with IODA2) have improved hardware design which allows
>> +to remove this limitation and have multiple IOMMU groups per a VFIO container.
>>
>>   2) The hardware supports so called DMA windows - the PCI address range
>>   within which DMA transfer is allowed, any attempt to access address space
>> @@ -427,6 +429,29 @@ The code flow from the example above should be slightly changed:
>>
>>   	....
>>
>> +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
>> +VFIO_IOMMU_DISABLE and implements 2 new ioctls:
>> +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
>> +(which are unsupported in v1 IOMMU).
>> +
>> +PPC64 paravirtualized guests generate a lot of map/unmap requests,
>> +and the handling of those includes pinning/unpinning pages and updating
>> +mm::locked_vm counter to make sure we do not exceed the rlimit.
>> +The v2 IOMMU splits accounting and pinning into separate operations:
>> +
>> +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
>> +receive a user space address and size of the block to be pinned.
>> +Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is expected to
>> +be called with the exact address and size used for registering
>> +the memory block. The userspace is not expected to call these often.
>> +The ranges are stored in a linked list in a VFIO container.
>> +
>> +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
>> +IOMMU table and do not do pinning; instead these check that the userspace
>> +address is from pre-registered range.
>> +
>> +This separation helps in optimizing DMA for guests.
>> +
>>   -------------------------------------------------------------------------------
>>
>>   [1] VFIO was originally an acronym for "Virtual Function I/O" in its
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index c8bad21..763c041 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -113,10 +113,16 @@ struct iommu_table {
>>   	unsigned long  it_page_shift;/* table iommu page size */
>>   #ifdef CONFIG_IOMMU_API
>>   	struct list_head it_group_list;/* List of iommu_table_group_link */
>> +	unsigned long *it_userspace; /* userspace view of the table */
>>   #endif
>>   	struct iommu_table_ops *it_ops;
>>   };
>>
>> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
>> +		((tbl)->it_userspace ? \
>> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
>> +			NULL)
>> +
>>   /* Pure 2^n version of get_order */
>>   static inline __attribute_const__
>>   int get_iommu_order(unsigned long size, struct iommu_table *tbl)
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index 8943b29..e7e8db3 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -19,8 +19,10 @@
>>   #include <linux/uaccess.h>
>>   #include <linux/err.h>
>>   #include <linux/vfio.h>
>> +#include <linux/vmalloc.h>
>>   #include <asm/iommu.h>
>>   #include <asm/tce.h>
>> +#include <asm/mmu_context.h>
>>
>>   #define DRIVER_VERSION  "0.1"
>>   #define DRIVER_AUTHOR   "aik@ozlabs.ru"
>> @@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages)
>>    * into DMA'ble space using the IOMMU
>>    */
>>
>> +struct tce_iommu_group {
>> +	struct list_head next;
>> +	struct iommu_group *grp;
>> +};
>> +
>>   /*
>>    * The container descriptor supports only a single group per container.
>>    * Required by the API as the container is not supplied with the IOMMU group
>> @@ -88,11 +95,98 @@ static void decrement_locked_vm(long npages)
>>    */
>>   struct tce_container {
>>   	struct mutex lock;
>> -	struct iommu_group *grp;
>>   	bool enabled;
>>   	unsigned long locked_pages;
>> +	bool v2;
>> +	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>> +	struct list_head group_list;
>
> You're wasting space by not packing your bools next to each other.
I'll fix it :)
>>   };
>>
>> +static long tce_iommu_unregister_pages(struct tce_container *container,
>> +		__u64 vaddr, __u64 size)
>> +{
>> +	long ret;
>> +	struct mm_iommu_table_group_mem_t *mem;
>> +
>> +	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
>> +		return -EINVAL;
>> +
>> +	mem = mm_iommu_get(vaddr, size >> PAGE_SHIFT);
>> +	if (!mem)
>> +		return -EINVAL;
>> +
>> +	ret = mm_iommu_put(mem); /* undo kref_get() from mm_iommu_get() */
>> +	if (!ret)
>> +		ret = mm_iommu_put(mem);
>
> Should "put" really be able to fail?
tce_iommu_unregister_pages() is called from ioctl so yes, the userspace 
deserves to know that the memory will remain pinned.
> I think you really need to examine
> your reference model, mm_iommu_put() looks pretty suspicious.  If
> there's an implicit reference by being mapped, it should be handled that
> way, not via an atomic that gets decremented then corrected.
One implicit reference (*) in @mapped (from atomic_set(&mem->mapped, 1)) is 
only to protect against the race between checking for active mappings and 
putting the reference to a registered memory descriptor.
If tce_iommu_unregister_pages() is called when @mapped > 1, then EBUSY is 
returned.
If tce_iommu_unregister_pages() is called when @mapped == 1 or 0, then 
there is no active mapping, @mapped becomes zero (if it is not already) and 
we can safely put the descriptor. All subsequent mm_iommu_mapped_inc() 
calls will fail to increment @mapped and return error.
After looking there more, there are 2 bugs though:
--- a/arch/powerpc/mm/mmu_context_hash64_iommu.c
+++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
@@ -178,9 +178,9 @@ EXPORT_SYMBOL_GPL(mm_iommu_get);
  long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
  {
-       if (1 != atomic_dec_if_positive(&mem->mapped)) {
+       if (atomic_dec_if_positive(&mem->mapped) > 1) {
                 /* There are mappings, exit */
-               atomic_inc(&mem->mapped);
+               atomic_inc_not_zero(&mem->mapped);
                 return -EBUSY;
         }
s/1!=/1</ is to allow putting second/third/... reference of mem->kref and 
atomic_inc_not_zero() is to not elevate the counter if another thread 
managed to release the very last mapping and  decrement my implicit 
reference (*).
Am I still missing something here?
> That's not only not atomic, but causes lots of fallout with references that don't
> get released.
> Notice how you don't even check the return value at the
> call location of this function?
Ouch. This is a bug. @ret needs to be returned to the userspace.
> How many references does that
> potentially leave and where do they get resolved?
Every successful "register" should be coupled with successful "unregister" 
(if it failed - just repeat). If this did not happen, memory remains pinned 
till the process exit, and then it is unpinned unconditionally.
-- 
Alexey
^ permalink raw reply	[flat|nested] 82+ messages in thread
 
 
- * [PATCH kernel v10 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows
  2015-05-11 15:38 [PATCH kernel v10 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows Alexey Kardashevskiy
                   ` (32 preceding siblings ...)
  2015-05-11 15:39 ` [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2 Alexey Kardashevskiy
@ 2015-05-11 15:39 ` Alexey Kardashevskiy
  33 siblings, 0 replies; 82+ messages in thread
From: Alexey Kardashevskiy @ 2015-05-11 15:39 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Wei Yang, Alexey Kardashevskiy, Gavin Shan, linux-kernel,
	Alex Williamson, Paul Mackerras, David Gibson
This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
The existing linux kernels use this new window to map the entire guest
memory and switch to the direct DMA operations saving time on map/unmap
requests which would normally happen in large numbers.
This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.
This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as a number of supported windows and maximum number
of levels of TCE tables.
DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
v7:
* s/VFIO_IOMMU_INFO_DDW/VFIO_IOMMU_SPAPR_INFO_DDW/
* fixed typos in and updated vfio.txt
* fixed VFIO_IOMMU_SPAPR_TCE_GET_INFO handler
* moved ddw properties to vfio_iommu_spapr_tce_ddw_info
v6:
* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
it used to be page mask flags from platform code
* added explicit pgsizes field
* added cleanup if tce_iommu_create_window() failed in a middle
* added checks for callbacks in tce_iommu_create_window and remove those
from tce_iommu_remove_window when it is too late to test anyway
* spapr_tce_find_free_table returns sensible error code now
* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
VFIO_IOMMU_SPAPR_TCE_REMOVE
v4:
* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
helpers
* added docs
---
 Documentation/vfio.txt              |  19 ++++
 arch/powerpc/include/asm/iommu.h    |   2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 196 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vfio.h           |  61 ++++++++++-
 4 files changed, 273 insertions(+), 5 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 7dcf2b5..8b1ec51 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -452,6 +452,25 @@ address is from pre-registered range.
 
 This separation helps in optimizing DMA for guests.
 
+6) sPAPR specification allows guests to have an additional DMA window(s) on
+a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows, one is
+2GB long, uses 4K pages and called "default 32bit window"; the other can
+be as big as entire RAM, use different page size, it is optional - guests
+create those in run-time if the guest driver supports 64bit DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (if a TCE table is going to be big enough and
+the kernel may not be able to allocate enough of physically contiguous memory).
+It creates a new window in the available slot and returns the bus address where
+the new window starts. Due to hardware limitation, the user space cannot choose
+the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 763c041..dd777d6 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -153,7 +153,7 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
 #ifdef CONFIG_IOMMU_API
 
-#define IOMMU_TABLE_GROUP_MAX_TABLES	1
+#define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
 struct iommu_table_group;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index e7e8db3..6f68901 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -225,6 +225,18 @@ static long tce_iommu_find_table(struct tce_container *container,
 	return -1;
 }
 
+static int tce_iommu_find_free_table(struct tce_container *container)
+{
+	int slot = 0;
+
+	/*
+	 * Skip occupied slots; the first empty one (if any) is the answer.
+	 */
+	while (slot < IOMMU_TABLE_GROUP_MAX_TABLES && container->tables[slot])
+		slot++;
+	return slot < IOMMU_TABLE_GROUP_MAX_TABLES ? slot : -ENOSPC;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
@@ -607,11 +619,115 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
 	decrement_locked_vm(pages);
 }
 
+static long tce_iommu_create_window(struct tce_container *container,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		__u64 *start_addr)
+{
+	struct tce_iommu_group *tcegrp;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl = NULL;
+	long ret, num;
+
+	num = tce_iommu_find_free_table(container);
+	if (num < 0)
+		return num;
+
+	/* Get the first group for ops::create_table */
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
+	if (!table_group)
+		return -EFAULT;
+
+	/* page_shift is user-supplied; a shift by >= 64 bits is undefined */
+	if (page_shift >= 64 || !(table_group->pgsizes & (1ULL << page_shift)))
+		return -EINVAL;
+
+	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
+			!table_group->ops->get_table_size ||
+			!table_group->ops->create_table)
+		return -EPERM;
+
+	ret = tce_iommu_create_table(container, table_group, num,
+			page_shift, window_size, levels, &tbl);
+	if (ret)
+		return ret;
+
+	BUG_ON(!tbl->it_ops->free);
+
+	/*
+	 * Program the table to every group.
+	 * Groups have been tested for compatibility at the attach time.
+	 */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		ret = table_group->ops->set_window(table_group, num, tbl);
+		if (ret)
+			goto unset_exit;
+	}
+
+	container->tables[num] = tbl;
+
+	/* Return start address assigned by platform in create_table() */
+	*start_addr = tbl->it_offset << tbl->it_page_shift;
+
+	return 0;
+
+unset_exit:
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		table_group->ops->unset_window(table_group, num);
+	}
+	tce_iommu_free_table(tbl);
+
+	return ret;
+}
+
+static long tce_iommu_remove_window(struct tce_container *container,
+		__u64 start_addr)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl;
+	struct tce_iommu_group *tcegrp;
+	int num;
+
+	num = tce_iommu_find_table(container, start_addr, &tbl);
+	if (num < 0)
+		return -EINVAL;
+
+	BUG_ON(!tbl->it_size);
+
+	/* Unprogram window @num from every group in the container */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		/*
+		 * SPAPR TCE IOMMU exposes the default DMA window to
+		 * the guest via dma32_window_start/size of
+		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
+		 * userspace to remove this window and some do not, so
+		 * check for the platform capability here.
+		 */
+		if (!table_group->ops || !table_group->ops->unset_window)
+			return -EPERM;
+
+		table_group->ops->unset_window(table_group, num);
+	}
+
+	/* Clear the window's whole range, then free the table and its slot */
+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+	tce_iommu_free_table(tbl);
+	container->tables[num] = NULL;
+
+	return 0;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
 	struct tce_container *container = iommu_data;
-	unsigned long minsz;
+	unsigned long minsz, ddwsz;
 	long ret;
 
 	switch (cmd) {
@@ -655,6 +771,21 @@ static long tce_iommu_ioctl(void *iommu_data,
 		info.dma32_window_start = table_group->tce32_start;
 		info.dma32_window_size = table_group->tce32_size;
 		info.flags = 0;
+		memset(&info.ddw, 0, sizeof(info.ddw));
+
+		if (table_group->max_dynamic_windows_supported &&
+				container->v2) {
+			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
+			info.ddw.pgsizes = table_group->pgsizes;
+			info.ddw.max_dynamic_windows_supported =
+				table_group->max_dynamic_windows_supported;
+			info.ddw.levels = table_group->max_levels;
+		}
+
+		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
+
+		if (info.argsz >= ddwsz)
+			minsz = ddwsz;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
 			return -EFAULT;
@@ -846,6 +977,69 @@ static long tce_iommu_ioctl(void *iommu_data,
 		return ret;
 	}
 
+	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
+		struct vfio_iommu_spapr_tce_create create;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
+				start_addr);
+
+		if (copy_from_user(&create, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (create.argsz < minsz)
+			return -EINVAL;
+
+		if (create.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_create_window(container, create.page_shift,
+				create.window_size, create.levels,
+				&create.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
+			ret = -EFAULT;
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
+		struct vfio_iommu_spapr_tce_remove remove;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
+				start_addr);
+
+		if (copy_from_user(&remove, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (remove.argsz < minsz)
+			return -EINVAL;
+
+		if (remove.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_remove_window(container, remove.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8fdcfb9..dde0fe5 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -445,6 +445,23 @@ struct vfio_iommu_type1_dma_unmap {
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
+ * The SPAPR TCE DDW info struct provides the information about
+ * the details of Dynamic DMA window capability.
+ *
+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
+ * @max_dynamic_windows_supported tells the maximum number of windows
+ * which the platform can create.
+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
+ * this allows splitting a table into smaller chunks which reduces
+ * the amount of physically contiguous memory required for the table.
+ */
+struct vfio_iommu_spapr_tce_ddw_info {
+	__u64 pgsizes;			/* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;	/* Max number of windows */
+	__u32 levels;			/* Max number of TCE table levels */
+};
+
+/*
  * The SPAPR TCE info struct provides the information about the PCI bus
  * address ranges available for DMA, these values are programmed into
  * the hardware so the guest has to know that information.
@@ -454,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap {
  * addresses too so the window works as a filter rather than an offset
  * for IOVA addresses.
  *
- * A flag will need to be added if other page sizes are supported,
- * so as defined here, it is always 4k.
+ * Flags supported:
+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
+ *   (DDW) support is present. @ddw is only supported when DDW is present.
  */
 struct vfio_iommu_spapr_tce_info {
 	__u32 argsz;
-	__u32 flags;			/* reserved for future use */
+	__u32 flags;
+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
 	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
 	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	struct vfio_iommu_spapr_tce_ddw_info ddw;
 };
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
@@ -522,6 +542,41 @@ struct vfio_iommu_spapr_register_memory {
  */
 #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
 
+/**
+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
+ *
+ * Creates an additional TCE table and programs it (sets a new DMA window)
+ * to every IOMMU group in the container. It receives page shift, window
+ * size and number of levels in the TCE table being created.
+ *
+ * It allocates the new DMA window and returns its offset on the PCI bus.
+ */
+struct vfio_iommu_spapr_tce_create {
+	__u32 argsz;			/* Must cover start_addr */
+	__u32 flags;			/* Must be 0 */
+	/* in */
+	__u32 page_shift;		/* Bit number in the pgsizes mask */
+	__u64 window_size;		/* DMA window size */
+	__u32 levels;			/* Number of TCE table levels */
+	/* out */
+	__u64 start_addr;		/* Bus address of the new window */
+};
+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
+ *
+ * Unprograms a TCE table from all groups in the container and destroys it.
+ * It receives a PCI bus offset as a window id.
+ */
+struct vfio_iommu_spapr_tce_remove {
+	__u32 argsz;			/* Must cover start_addr */
+	__u32 flags;			/* Must be 0 */
+	/* in */
+	__u64 start_addr;		/* Bus address of the window */
+};
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related	[flat|nested] 82+ messages in thread