LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] powerpc/pseries: Fix SMP=n build of rng.c
From: Michael Ellerman @ 2013-11-20  0:05 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1384905902-18042-1-git-send-email-mpe@ellerman.id.au>

In commit a489043 "Implement arch_get_random_long() based on H_RANDOM" I
broke the SMP=n build. We were getting plpar_wrappers.h via spinlock.h
which breaks when SMP=n.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/rng.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c
index a702f1c..72a1027 100644
--- a/arch/powerpc/platforms/pseries/rng.c
+++ b/arch/powerpc/platforms/pseries/rng.c
@@ -13,6 +13,7 @@
 #include <linux/of.h>
 #include <asm/archrandom.h>
 #include <asm/machdep.h>
+#include <asm/plpar_wrappers.h>
 
 
 static int pseries_get_random_long(unsigned long *v)
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 1/2] powerpc: Make cpu_to_chip_id() available when SMP=n
From: Michael Ellerman @ 2013-11-20  0:05 UTC (permalink / raw)
  To: linuxppc-dev

Up until now we have only used cpu_to_chip_id() in the topology code,
which is only used on SMP builds. However my recent commit a4da0d5
"Implement arch_get_random_long/int() for powernv" added a usage when
SMP=n, breaking the build.

Move cpu_to_chip_id() into prom.c so it is available for SMP=n builds.

We would move the extern to prom.h, but that breaks the include in
topology.h. Instead we leave it in smp.h, but move it out of the
CONFIG_SMP #ifdef. We also need to include asm/smp.h in rng.c, because
the linux version skips asm/smp.h on UP. What a mess.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/smp.h       |  2 +-
 arch/powerpc/kernel/prom.c           | 20 ++++++++++++++++++++
 arch/powerpc/kernel/smp.c            | 16 ----------------
 arch/powerpc/platforms/powernv/rng.c |  1 +
 4 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 98da78e..084e080 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -33,6 +33,7 @@ extern int boot_cpuid;
 extern int spinning_secondaries;
 
 extern void cpu_die(void);
+extern int cpu_to_chip_id(int cpu);
 
 #ifdef CONFIG_SMP
 
@@ -112,7 +113,6 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 }
 
 extern int cpu_to_core_id(int cpu);
-extern int cpu_to_chip_id(int cpu);
 
 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
  *
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 4432fd8..67a7b3b 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -786,6 +786,26 @@ int of_get_ibm_chip_id(struct device_node *np)
 	return -1;
 }
 
+/**
+ * cpu_to_chip_id - Return the cpus chip-id
+ * @cpu: The logical cpu number.
+ *
+ * Return the value of the ibm,chip-id property corresponding to the given
+ * logical cpu number. If the chip-id can not be found, returns -1.
+ */
+int cpu_to_chip_id(int cpu)
+{
+	struct device_node *np;
+
+	np = of_get_cpu_node(cpu, NULL);
+	if (!np)
+		return -1;
+
+	of_node_put(np);
+	return of_get_ibm_chip_id(np);
+}
+EXPORT_SYMBOL(cpu_to_chip_id);
+
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Fix up the uninitialized fields in a new device node:
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8e59abc..e5174d3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -597,22 +597,6 @@ out:
 	return id;
 }
 
-/* Return the value of the chip-id property corresponding
- * to the given logical cpu.
- */
-int cpu_to_chip_id(int cpu)
-{
-	struct device_node *np;
-
-	np = of_get_cpu_node(cpu, NULL);
-	if (!np)
-		return -1;
-
-	of_node_put(np);
-	return of_get_ibm_chip_id(np);
-}
-EXPORT_SYMBOL(cpu_to_chip_id);
-
 /* Helper routines for cpu to core mapping */
 int cpu_core_index_of_thread(int cpu)
 {
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 8844628..1cb160d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -19,6 +19,7 @@
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
+#include <asm/smp.h>
 
 
 struct powernv_rng {
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH] lib/crc32: slice by 4 is more efficient than the default slice by 8 on Powerpc 8xx.
From: Scott Wood @ 2013-11-19 23:43 UTC (permalink / raw)
  To: Joakim Tjernlund
  Cc: Christophe Leroy, Marcelo Tosatti, Bob Pearson, linuxppc-dev,
	linux-kernel
In-Reply-To: <OFE596957E.9CEC5C97-ONC1257C28.00814EE8-C1257C28.0081FB80@transmode.se>

On Wed, 2013-11-20 at 00:39 +0100, Joakim Tjernlund wrote:
> Scott Wood <scottwood@freescale.com> wrote on 2013/11/19 19:29:26:
> > 
> > I don't think we should go littering the Kconfig with defaults for
> > various bits of hardware -- especially since you've already pointed out
> > non-8xx hardware that would also want this.  Put it in defconfig
> > instead, unless you can identify very broad classes of machines for
> > which SLICEBY4 is faster.
> 
> hmm, when 64bits went in there was not much proof that it was faster for
> a wide range of CPU, just 2 or 3 if I recall correctly. I suspect there
> are quite a few CPUs where 32 bits a equal or faster.

That may be the case, but I don't think we want a big list of them in
lib/Kconfig.  Whether the default should change (for all targets that
don't override it in defconfig, or at least for some broader category
such as "all 32-bit chips") is a different discussion.

-Scott

^ permalink raw reply

* Re: [PATCH] lib/crc32: slice by 4 is more efficient than the default slice by 8 on Powerpc 8xx.
From: Joakim Tjernlund @ 2013-11-19 23:39 UTC (permalink / raw)
  To: Scott Wood
  Cc: Christophe Leroy, Marcelo Tosatti, Bob Pearson, linuxppc-dev,
	linux-kernel
In-Reply-To: <1384885766.1403.366.camel@snotra.buserror.net>

Scott Wood <scottwood@freescale.com> wrote on 2013/11/19 19:29:26:
>=20
> I don't think we should go littering the Kconfig with defaults for
> various bits of hardware -- especially since you've already pointed out
> non-8xx hardware that would also want this.  Put it in defconfig
> instead, unless you can identify very broad classes of machines for
> which SLICEBY4 is faster.

hmm, when 64bits went in there was not much proof that it was faster for
a wide range of CPU, just 2 or 3 if I recall correctly. I suspect there
are quite a few CPUs where 32 bits a equal or faster.

  Jocke

>=20
> -Scott
>=20
> On Tue, 2013-11-19 at 15:11 +0100, Joakim Tjernlund wrote:
> > I found the same on MPC8321 long time ago(when 64 bits change went=20
in),=20
> > the 32 bits were much faster. I guess the "smaller"
> > CPUs cannot handle the cache trashing these big tables impose, I=20
didn't=20
> > look into the details though.
> > So I think this is a good change for 8xx.
> >=20
> > Acked-by: Joakim Tjernlund <joakim.tjernlund@transmode.se>
> >=20
> > Christophe Leroy <christophe.leroy@c-s.fr> wrote on 2013/11/18=20
08:04:23:
> >=20
> > > From: Christophe Leroy <christophe.leroy@c-s.fr>
> > > To: Vitaly Bordug <vitb@kernel.crashing.org>, Marcelo Tosatti=20
> > <marcelo@kvack.org>, Joakim Tjernlund <joakim.tjernlund@transmode.se>, =

Bob=20
> > Pearson <rpearson@systemfabricworks.com>,=20
> > > Cc: linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org
> > > Date: 2013/11/19 13:05
> > > Subject: [PATCH] lib/crc32: slice by 4 is more efficient than the=20
> > default slice by 8 on Powerpc 8xx.
> > >=20
> > > On PPC=5F8xx, CRC32=5FSLICEBY4 is more efficient (almost twice) than =

> > CRC32=5FSLICEBY8,
> > > as shown below:
> > >=20
> > > With CRC32=5FSLICEBY8:
> > > [    1.109204] crc32: CRC=5FLE=5FBITS =3D 64, CRC=5FBE BITS =3D 64
> > > [    1.114401] crc32: self tests passed, processed 225944 bytes in=20
> > 15118910 nsec
> > > [    1.130655] crc32c: CRC=5FLE=5FBITS =3D 64
> > > [    1.134235] crc32c: self tests passed, processed 225944 bytes in=20
> > 4479879 nsec
> > >=20
> > > With CRC32=5FSLICEBY4:
> > > [    1.097129] crc32: CRC=5FLE=5FBITS =3D 32, CRC=5FBE BITS =3D 32
> > > [    1.101878] crc32: self tests passed, processed 225944 bytes in=20
> > 8616242 nsec
> > > [    1.116298] crc32c: CRC=5FLE=5FBITS =3D 32
> > > [    1.119607] crc32c: self tests passed, processed 225944 bytes in=20
> > 3289576 nsec
> > >=20
> > > Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> > >=20
> > > Index: a/lib/Kconfig
> > > =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> > > --- a/lib/Kconfig   (r=C3=A9vision 5325)
> > > +++ b/lib/Kconfig   (copie de travail)
> > > @@ -102,6 +102,7 @@
> > >  choice
> > >     prompt "CRC32 implementation"
> > >     depends on CRC32
> > > +   default CRC32=5FSLICEBY4 if PPC=5F8xx
> > >     default CRC32=5FSLICEBY8
> > >     help
> > >       This option allows a kernel builder to override the default=20
choice
> >=20
> > =5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=
=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F=5F
> > Linuxppc-dev mailing list
> > Linuxppc-dev@lists.ozlabs.org
> > https://lists.ozlabs.org/listinfo/linuxppc-dev
> >=20
> >=20
>=20
>=20
>=20

^ permalink raw reply

* Re: [PATCH] powerpc/gpio: Fix the wrong GPIO input data on MPC8572/MPC8536
From: Scott Wood @ 2013-11-19 22:51 UTC (permalink / raw)
  To: Liu Gang; +Cc: linux-gpio, linus.walleij, linuxppc-dev, r61911, b07421
In-Reply-To: <1384499789-3631-1-git-send-email-Gang.Liu@freescale.com>

On Fri, 2013-11-15 at 15:16 +0800, Liu Gang wrote:
> For MPC8572/MPC8536, the status of GPIOs defined as output
> cannot be determined by reading GPDAT register, so the code
> use shadow data register instead. But if the input pins are
> asserted high, they will always read high due to the shadow
> data, even if the pins are set to low.
> 
> So the input pins should be read directly from GPDAT, not
> the shadow data.
> 
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> ---
>  drivers/gpio/gpio-mpc8xxx.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c
> index 9ae29cc..1d4ac75 100644
> --- a/drivers/gpio/gpio-mpc8xxx.c
> +++ b/drivers/gpio/gpio-mpc8xxx.c
> @@ -71,6 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio)
>  	struct mpc8xxx_gpio_chip *mpc8xxx_gc = to_mpc8xxx_gpio_chip(mm);
>  
>  	val = in_be32(mm->regs + GPIO_DAT) & ~in_be32(mm->regs + GPIO_DIR);
> +	mpc8xxx_gc->data &= in_be32(mm->regs + GPIO_DIR);
>  
>  	return (val | mpc8xxx_gc->data) & mpc8xxx_gpio2mask(gpio);
>  }

It seems odd to update ->data in a function that's supposed to be
reading things...  Perhaps it would be better to keep ->data in a good
state from the beginning.

-Scott

^ permalink raw reply

* Re: [PATCH v5 01/17] powerpc/fsl-pci: improve clock API use
From: Scott Wood @ 2013-11-19 22:41 UTC (permalink / raw)
  To: Gerhard Sittig
  Cc: Mike Turquette, Detlev Zundel, Minghuan Lian, Paul Mackerras,
	Anatolij Gustschin, linuxppc-dev, linux-arm-kernel
In-Reply-To: <1384729577-7336-2-git-send-email-gsi@denx.de>

On Mon, 2013-11-18 at 00:06 +0100, Gerhard Sittig wrote:
> make the Freescale PCI driver get, prepare and enable the PCI clock
> during probe(); the clock gets put upon device shutdown by the devm
> approach
> 
> clock lookup is non-fatal as not all platforms may provide clock specs
> in their device tree or implement a device tree based clock provider,
> but failure to enable clocks after successful lookup is fatal
> 
> the driver appears to not have a remove() routine, so no reference to
> the clock is kept during use, and the clock isn't released (the devm
> approach will put the clock, but it won't get disabled or unprepared)
> 
> the 85xx/86xx platforms go through the probe() routine, where clock
> lookup occurs and the clock gets acquired if one was specified; the
> 512x/83xx platforms don't pass through probe() but instead directly call
> the add_bridge() routine at a point in time where the clock provider has
> not been setup yet even if the platform implements one -- add comments
> to the code paths as a reminder for the potential need of a workaround
> in the platform's clock driver, and to keep awareness if code should get
> re-arranged or moved
> 
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Kumar Gala <galak@kernel.crashing.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Gerhard Sittig <gsi@denx.de>
> ---
>  arch/powerpc/sysdev/fsl_pci.c |   52 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)

Please coordinate this change with Minghuan Lian's patchset (posted Oct
23) to move the bulk of this driver outside of arch/powerpc.


> diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
> index ccfb50ddfe38..efa0916f61b6 100644
> --- a/arch/powerpc/sysdev/fsl_pci.c
> +++ b/arch/powerpc/sysdev/fsl_pci.c
> @@ -17,6 +17,8 @@
>   * Free Software Foundation;  either version 2 of the  License, or (at your
>   * option) any later version.
>   */
> +
> +#include <linux/clk.h>
>  #include <linux/kernel.h>
>  #include <linux/pci.h>
>  #include <linux/delay.h>
> @@ -755,6 +757,32 @@ int __init mpc83xx_add_bridge(struct device_node *dev)
>  	const int *bus_range;
>  	int primary;
>  
> +	/*
> +	 * 85xx/86xx platforms take the path through the probe() routine
> +	 * as one would expect, PCI related clocks get acquired there if
> +	 * specified
> +	 *
> +	 * 83xx/512x _don't_ pass through probe(), this add_bridge()
> +	 * routine instead is called from within .setup_arch() at a
> +	 * point in time where clock providers haven't been setup yet;
> +	 * so clocks cannot get acquired here -- lookup would always
> +	 * fail even on those platforms which implement the provider
> +	 *
> +	 * there is no counterpart for add_bridge() just like there is
> +	 * no remove() counterpart for probe(), so in either case the
> +	 * PCI related clock won't get released, and all of the
> +	 * 512x/83xx/85xx/86xx platforms behave in identical ways

How is it identical if 85xx/86xx will acquire a clock in probe(), but
83xx/512x can't acquire it in add_bridge()?

Could you explain the relevance of releasing clocks here?

> +	 *
> +	 * this comment is here to "keep the balance" against the
> +	 * probe() routine, and as a reminder to acquire clocks if the
> +	 * add_bridge() call should move to some later point in time
> +	 *
> +	 * until then clock providers are expected to work around the
> +	 * peripheral driver's not acquiring the PCI clock on those
> +	 * platforms where clock providers exist, while nothing needs to
> +	 * be done for those platforms without a clock provider
> +	 */

What would be involved in moving 83xx/512x to use .probe() as well?

>  	is_mpc83xx_pci = 1;
>  
>  	if (!of_device_is_available(dev)) {
> @@ -1086,9 +1114,33 @@ void fsl_pci_assign_primary(void)
>  
>  static int fsl_pci_probe(struct platform_device *pdev)
>  {
> +	struct clk *clk;
>  	int ret;
>  	struct device_node *node;
>  
> +	/*
> +	 * clock lookup is non-fatal since the driver is shared among
> +	 * platforms and not all of them provide clocks specs in their
> +	 * device tree, but failure to enable a specified clock is
> +	 * considered fatal
> +	 *
> +	 * note that only the 85xx and 86xx platforms pass through this
> +	 * probe() routine, while 83xx and 512x directly invoke the
> +	 * mpc83xx_add_bridge() routine from within .setup_arch() code
> +	 */
> +	clk = devm_clk_get(&pdev->dev, "ipg");
> +	if (!IS_ERR(clk)) {
> +		ret = clk_prepare_enable(clk);
> +		if (ret) {
> +			dev_err(&pdev->dev, "Could not enable PCI clock\n");
> +			return ret;
> +		}
> +		/*
> +		 * TODO where to store the 'clk' reference?  there appears
> +		 * to be no remove() routine which undoes what probe() does
> +		 */
> +	}

There is a .remove(); this driver just doesn't support it.

As for where to store things, you could turn private_data into a struct
rather than a direct iomem pointer.  Or just replace the comment with a
non-TODO statement that says we'll never release the clock because the
PCI controller driver is non-removable.

-Scott

^ permalink raw reply

* Re: [PATCH v2] panic: Make panic_timeout configurable
From: Jason Baron @ 2013-11-19 22:04 UTC (permalink / raw)
  To: Ingo Molnar, benh@kernel.crashing.org, paulus@samba.org,
	Felipe Contreras
  Cc: Andrew Morton, linuxppc-dev, linux-kernel@vger.kernel.org,
	ralf@linux-mips.org
In-Reply-To: <20131119070905.GF32367@gmail.com>

On 11/19/2013 02:09 AM, Ingo Molnar wrote:
> 
> * Jason Baron <jbaron@akamai.com> wrote:
> 
>> On 11/18/2013 05:30 PM, Andrew Morton wrote:
>>> On Mon, 18 Nov 2013 21:04:36 +0000 (GMT) Jason Baron <jbaron@akamai.com> wrote:
>>>
>>>> The panic_timeout value can be set via the command line option 'panic=x', or via
>>>> /proc/sys/kernel/panic, however that is not sufficient when the panic occurs
>>>> before we are able to set up these values. Thus, add a CONFIG_PANIC_TIMEOUT
>>>> so that we can set the desired value from the .config.
>>>>
>>>> The default panic_timeout value continues to be 0 - wait forever, 
>>>> except for powerpc and mips, which have been defaulted to 180 and 
>>>> 5 respectively. This is in keeping with the fact that these 
>>>> arches already set panic_timeout in their arch init code. 
>>>> However, I found three exceptions- two in mips and one in powerpc 
>>>> where the settings didn't match these default values. In those 
>>>> cases, I left the arch code so it continues to override, in case 
>>>> the user has not changed from the default. It would nice if these 
>>>> arches had one default value, or if we could determine the 
>>>> correct setting at compile-time.
>>>
>>> Felipe is proposing a simpler patch ("panic: setup panic_timeout 
>>> early") which switches to early_param().  Is that sufficient for 
>>> the (undescribed!) failure which you are presumably observing?
>>>
>>
>> No - that patch doesn't change the 'panic_timeout' value until the 
>> call to 'parse_early_param()' is made. If there is a panic before 
>> that point, the param doesn't do anything. The idea of this patch is 
>> to allow it to be configured at build-time.
>>
>> I've tested the patch by simply inserting a panic() call at the 
>> beginning of 'start_kernel()'. So, no I do not have a specific panic 
>> in mind for this.
> 
> Would you be interested in picking up Felipe's patch/fix on top of 
> yours? I was unable to communicate with him efficiently, but I'd take 
> the patch if it's signed off by you.
> 
> Thanks,
> 
> 	Ingo
> 

Sure, I can round up all the related patches in this area that make
sense and re-submit as a series.

Felipe, would the CONFIG_PANIC_TIMEOUT=xx .config parameter work for your
needs, or would you still like to see the command-line processing moved
up?

I'd also like to hear from the PowerPC folks about the arch defaults
there. Now, that mips is ok with CONFIG_PANIC_TIMEOUT, PowerPC is the
only arch doing specific initialization of 'panic_timeout'.

Thanks,

-Jason

^ permalink raw reply

* Re: Problem reading and programming memory location...
From: Anatolij Gustschin @ 2013-11-19 21:45 UTC (permalink / raw)
  To: neorf3k; +Cc: Linux Ppc Dev List Dev List
In-Reply-To: <F24AEED4-A8F7-4DE8-A1BB-D0BED489A223@gmail.com>

Hi Lorenzo,

On Tue, 19 Nov 2013 11:20:24 +0100
neorf3k <neorf3k@gmail.com> wrote:

> Hello Anatolij, this is our code, used at University, but again it doesn=
=E2=80=99t work=E2=80=A6
>=20
> How i told, the only information we have about that reg are:
>=20
> Chip select 4 specification:
> Lp_cs4
> bus size: 8 bit
> bus control: 2 wait state R/W ACK disabled
> size allocated: 4 KByte
>=20
> Our Register 8 bit LP_cs4 (we want to write)
>=20
> cs4 offset: 0x001

is the byte in FPGA at offset 0x0 writable? In your code you
currently test read/write access to the byte at offset 0x0.

If the read/write access works under U-Boot, then maybe the
chip select parameters for CS4 are configured differently
in U-Boot. You can dump the Chip Select 4 configuration
registers under U-Boot and compare. Is address- and data-bus
to the FPGA multipexed? Another possible reason for non-working
access could be that the configured CS4 range 0x10020000 - 0x10030000
overlaps with configured range for CS0, CS1, CS2 or CS3. Can you
verify that no such overlapping exists.

Thanks,

Anatolij

^ permalink raw reply

* Re: [PATCH] lib/crc32: slice by 4 is more efficient than the default slice by 8 on Powerpc 8xx.
From: Scott Wood @ 2013-11-19 18:29 UTC (permalink / raw)
  To: Joakim Tjernlund
  Cc: Christophe Leroy, Marcelo Tosatti, Bob Pearson, linuxppc-dev,
	linux-kernel
In-Reply-To: <OF5004EA63.AA9B7989-ONC1257C28.004D6C13-C1257C28.004DF922@transmode.se>

I don't think we should go littering the Kconfig with defaults for
various bits of hardware -- especially since you've already pointed out
non-8xx hardware that would also want this.  Put it in defconfig
instead, unless you can identify very broad classes of machines for
which SLICEBY4 is faster.

-Scott

On Tue, 2013-11-19 at 15:11 +0100, Joakim Tjernlund wrote:
> I found the same on MPC8321 long time ago(when 64 bits change went in),=
=20
> the 32 bits were much faster. I guess the "smaller"
> CPUs cannot handle the cache trashing these big tables impose, I didn't=
=20
> look into the details though.
> So I think this is a good change for 8xx.
>=20
> Acked-by: Joakim Tjernlund <joakim.tjernlund@transmode.se>
>=20
> Christophe Leroy <christophe.leroy@c-s.fr> wrote on 2013/11/18 08:04:23=
:
>=20
> > From: Christophe Leroy <christophe.leroy@c-s.fr>
> > To: Vitaly Bordug <vitb@kernel.crashing.org>, Marcelo Tosatti=20
> <marcelo@kvack.org>, Joakim Tjernlund <joakim.tjernlund@transmode.se>, =
Bob=20
> Pearson <rpearson@systemfabricworks.com>,=20
> > Cc: linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org
> > Date: 2013/11/19 13:05
> > Subject: [PATCH] lib/crc32: slice by 4 is more efficient than the=20
> default slice by 8 on Powerpc 8xx.
> >=20
> > On PPC_8xx, CRC32_SLICEBY4 is more efficient (almost twice) than=20
> CRC32_SLICEBY8,
> > as shown below:
> >=20
> > With CRC32_SLICEBY8:
> > [    1.109204] crc32: CRC_LE_BITS =3D 64, CRC_BE BITS =3D 64
> > [    1.114401] crc32: self tests passed, processed 225944 bytes in=20
> 15118910 nsec
> > [    1.130655] crc32c: CRC_LE_BITS =3D 64
> > [    1.134235] crc32c: self tests passed, processed 225944 bytes in=20
> 4479879 nsec
> >=20
> > With CRC32_SLICEBY4:
> > [    1.097129] crc32: CRC_LE_BITS =3D 32, CRC_BE BITS =3D 32
> > [    1.101878] crc32: self tests passed, processed 225944 bytes in=20
> 8616242 nsec
> > [    1.116298] crc32c: CRC_LE_BITS =3D 32
> > [    1.119607] crc32c: self tests passed, processed 225944 bytes in=20
> 3289576 nsec
> >=20
> > Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> >=20
> > Index: a/lib/Kconfig
> > =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> > --- a/lib/Kconfig   (r=C3=83=C2=A9vision 5325)
> > +++ b/lib/Kconfig   (copie de travail)
> > @@ -102,6 +102,7 @@
> >  choice
> >     prompt "CRC32 implementation"
> >     depends on CRC32
> > +   default CRC32_SLICEBY4 if PPC_8xx
> >     default CRC32_SLICEBY8
> >     help
> >       This option allows a kernel builder to override the default cho=
ice
>=20
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>=20
>=20

^ permalink raw reply

* Re: [PATCH] powerpc/gpio: Fix the wrong GPIO input data on MPC8572/MPC8536
From: Anatolij Gustschin @ 2013-11-19 15:32 UTC (permalink / raw)
  To: Liu Gang; +Cc: linux-gpio, linus.walleij, linuxppc-dev, r61911, b07421
In-Reply-To: <1384499789-3631-1-git-send-email-Gang.Liu@freescale.com>

On Fri, 15 Nov 2013 15:16:29 +0800
Liu Gang <Gang.Liu@freescale.com> wrote:

> For MPC8572/MPC8536, the status of GPIOs defined as output
> cannot be determined by reading GPDAT register, so the code
> use shadow data register instead. But if the input pins are
> asserted high, they will always read high due to the shadow
> data, even if the pins are set to low.

Could you please add a better description of the problem?
I'm having some difficulties to understand the last sentence
above. Does the issue appear if some pins were configured as
inputs and were asserted high before booting the kernel, and
therefore the shadow data has been initialized with these pin
values?

Or does the issue appear if some pin has been configured as output
first and has been set to the high value, then reconfigured as
input? Now reading the pin state will always return high even
if the actual pin state is low?

It seems the issue will appear in both cases. If so, please add
this information to the commit message.

> So the input pins should be read directly from GPDAT, not
> the shadow data.
> 
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> ---
>  drivers/gpio/gpio-mpc8xxx.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c
> index 9ae29cc..1d4ac75 100644
> --- a/drivers/gpio/gpio-mpc8xxx.c
> +++ b/drivers/gpio/gpio-mpc8xxx.c
> @@ -71,6 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio)
>  	struct mpc8xxx_gpio_chip *mpc8xxx_gc = to_mpc8xxx_gpio_chip(mm);
>  
>  	val = in_be32(mm->regs + GPIO_DAT) & ~in_be32(mm->regs + GPIO_DIR);
> +	mpc8xxx_gc->data &= in_be32(mm->regs + GPIO_DIR);

we can reduce one in_be32() call here, i.e.

	u32 out_mask;
	...
	out_mask = in_be32(mm->regs + GPIO_DIR);
	val = in_be32(mm->regs + GPIO_DAT) & ~out_mask;
	mpc8xxx_gc->data &= out_mask;

>  	return (val | mpc8xxx_gc->data) & mpc8xxx_gpio2mask(gpio);
>  }

Thanks,

Anatolij

--
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: +49-8142-66989-0  Fax: +49-8142-66989-80 Email: office@denx.de

^ permalink raw reply

* Re: [PATCH] lib/crc32: slice by 4 is more efficient than the default slice by 8 on Powerpc 8xx.
From: Joakim Tjernlund @ 2013-11-19 14:11 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Marcelo Tosatti, Bob Pearson, linuxppc-dev, linux-kernel
In-Reply-To: <20131118070423.E47181A4D3D@localhost.localdomain>

I found the same on MPC8321 long time ago(when 64 bits change went in),=20
the 32 bits were much faster. I guess the "smaller"
CPUs cannot handle the cache trashing these big tables impose, I didn't=20
look into the details though.
So I think this is a good change for 8xx.

Acked-by: Joakim Tjernlund <joakim.tjernlund@transmode.se>

Christophe Leroy <christophe.leroy@c-s.fr> wrote on 2013/11/18 08:04:23:

> From: Christophe Leroy <christophe.leroy@c-s.fr>
> To: Vitaly Bordug <vitb@kernel.crashing.org>, Marcelo Tosatti=20
<marcelo@kvack.org>, Joakim Tjernlund <joakim.tjernlund@transmode.se>, Bob =

Pearson <rpearson@systemfabricworks.com>,=20
> Cc: linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org
> Date: 2013/11/19 13:05
> Subject: [PATCH] lib/crc32: slice by 4 is more efficient than the=20
default slice by 8 on Powerpc 8xx.
>=20
> On PPC=5F8xx, CRC32=5FSLICEBY4 is more efficient (almost twice) than=20
CRC32=5FSLICEBY8,
> as shown below:
>=20
> With CRC32=5FSLICEBY8:
> [    1.109204] crc32: CRC=5FLE=5FBITS =3D 64, CRC=5FBE BITS =3D 64
> [    1.114401] crc32: self tests passed, processed 225944 bytes in=20
15118910 nsec
> [    1.130655] crc32c: CRC=5FLE=5FBITS =3D 64
> [    1.134235] crc32c: self tests passed, processed 225944 bytes in=20
4479879 nsec
>=20
> With CRC32=5FSLICEBY4:
> [    1.097129] crc32: CRC=5FLE=5FBITS =3D 32, CRC=5FBE BITS =3D 32
> [    1.101878] crc32: self tests passed, processed 225944 bytes in=20
8616242 nsec
> [    1.116298] crc32c: CRC=5FLE=5FBITS =3D 32
> [    1.119607] crc32c: self tests passed, processed 225944 bytes in=20
3289576 nsec
>=20
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>=20
> Index: a/lib/Kconfig
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> --- a/lib/Kconfig   (r=C3=A9vision 5325)
> +++ b/lib/Kconfig   (copie de travail)
> @@ -102,6 +102,7 @@
>  choice
>     prompt "CRC32 implementation"
>     depends on CRC32
> +   default CRC32=5FSLICEBY4 if PPC=5F8xx
>     default CRC32=5FSLICEBY8
>     help
>       This option allows a kernel builder to override the default choice

^ permalink raw reply

* [PATCH] lib/crc32: slice by 4 is more efficient than the default slice by 8 on Powerpc 8xx.
From: Christophe Leroy @ 2013-11-18  7:04 UTC (permalink / raw)
  To: Vitaly Bordug, Marcelo Tosatti, Joakim Tjernlund, Bob Pearson
  Cc: linuxppc-dev, linux-kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain, Size: 1097 bytes --]

On PPC_8xx, CRC32_SLICEBY4 is more efficient (almost twice) than CRC32_SLICEBY8,
as shown below:

With CRC32_SLICEBY8:
[    1.109204] crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
[    1.114401] crc32: self tests passed, processed 225944 bytes in 15118910 nsec
[    1.130655] crc32c: CRC_LE_BITS = 64
[    1.134235] crc32c: self tests passed, processed 225944 bytes in 4479879 nsec

With CRC32_SLICEBY4:
[    1.097129] crc32: CRC_LE_BITS = 32, CRC_BE BITS = 32
[    1.101878] crc32: self tests passed, processed 225944 bytes in 8616242 nsec
[    1.116298] crc32c: CRC_LE_BITS = 32
[    1.119607] crc32c: self tests passed, processed 225944 bytes in 3289576 nsec

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Index: a/lib/Kconfig
===================================================================
--- a/lib/Kconfig	(révision 5325)
+++ b/lib/Kconfig	(copie de travail)
@@ -102,6 +102,7 @@
 choice
 	prompt "CRC32 implementation"
 	depends on CRC32
+	default CRC32_SLICEBY4 if PPC_8xx
 	default CRC32_SLICEBY8
 	help
 	  This option allows a kernel builder to override the default choice

^ permalink raw reply

* Re: [PATCH v3] powerpc: kvm: optimize "sc 1" as fast return
From: Alexander Graf @ 2013-11-19 10:40 UTC (permalink / raw)
  To: Liu Ping Fan
  Cc: Gleb Natapov, kvm@vger.kernel.org mailing list, kvm-ppc,
	Paul Mackerras, Paolo Bonzini, linuxppc-dev
In-Reply-To: <1384841568-20986-2-git-send-email-pingfank@linux.vnet.ibm.com>


On 19.11.2013, at 07:12, Liu Ping Fan <kernelfans@gmail.com> wrote:

> In some scene, e.g openstack CI, PR guest can trigger "sc 1" =
frequently,
> this patch optimizes the path by directly delivering =
BOOK3S_INTERRUPT_SYSCALL
> to HV guest, so powernv can return to HV guest without heavy exit, =
i.e,
> no need to swap TLB, HTAB,.. etc
>=20
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
> v3: add some document

This is v4 I think

> ---
> arch/powerpc/kvm/book3s_hv.c            | 10 ++++------
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++++++++++++++++++-
> 2 files changed, 22 insertions(+), 7 deletions(-)
>=20
> diff --git a/arch/powerpc/kvm/book3s_hv.c =
b/arch/powerpc/kvm/book3s_hv.c
> index 62a2b5a..1addb1a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -628,12 +628,10 @@ static int kvmppc_handle_exit(struct kvm_run =
*run, struct kvm_vcpu *vcpu,
> 		/* hcall - punt to userspace */
> 		int i;
>=20
> -		if (vcpu->arch.shregs.msr & MSR_PR) {
> -			/* sc 1 from userspace - reflect to guest =
syscall */
> -			kvmppc_book3s_queue_irqprio(vcpu, =
BOOK3S_INTERRUPT_SYSCALL);
> -			r =3D RESUME_GUEST;
> -			break;
> -		}
> +		/* hypercall with MSR_PR has already been handled in =
rmode,
> +		 * and never reaches here.
> +		 */

It would've been nice to also mention the real mode hypercall handling, =
but I can just post a follow-up patch for that one.


Thanks, applied to kvm-ppc-queue.

Alex

^ permalink raw reply

* Re: [PATCH RESEND v4] powerpc: kvm: fix rare but potential deadlock scene
From: Alexander Graf @ 2013-11-19 10:39 UTC (permalink / raw)
  To: Liu Ping Fan
  Cc: Gleb Natapov, kvm@vger.kernel.org mailing list, kvm-ppc,
	Paul Mackerras, Paolo Bonzini, linuxppc-dev
In-Reply-To: <1384841568-20986-1-git-send-email-pingfank@linux.vnet.ibm.com>


On 19.11.2013, at 07:12, Liu Ping Fan <kernelfans@gmail.com> wrote:

> Since kvmppc_hv_find_lock_hpte() is called from both virtmode and
> realmode, so it can trigger the deadlock.
> 
> Suppose the following scene:
> 
> Two physical cpuM, cpuN, two VM instances A, B, each VM has a group of
> vcpus.
> 
> If on cpuM, vcpu_A_1 holds bitlock X (HPTE_V_HVLOCK), then is switched
> out, and on cpuN, vcpu_A_2 try to lock X in realmode, then cpuN will be
> caught in realmode for a long time.
> 
> What makes things even worse if the following happens,
>  On cpuM, bitlockX is hold, on cpuN, Y is hold.
>  vcpu_B_2 try to lock Y on cpuM in realmode
>  vcpu_A_2 try to lock X on cpuN in realmode
> 
> Oops! deadlock happens
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Any particular reason for the resend? The patch is already applied, no?


Alex

^ permalink raw reply

* Re: Problem reading and programming memory location...
From: neorf3k @ 2013-11-19 10:20 UTC (permalink / raw)
  To: Anatolij Gustschin; +Cc: Linux Ppc Dev List Dev List
In-Reply-To: <20131116152933.276f9939@crub>

[-- Attachment #1: Type: text/plain, Size: 3499 bytes --]

Hello Anatolij, this is our code, used at University, but again it doesn’t work…

How i told, the only information we have about that reg are:

Chip select 4 specification:
Lp_cs4
bus size: 8 bit
bus control: 2 wait state R/W ACK disabled
size allocated: 4 KByte

Our Register 8 bit LP_cs4 (we want to write)

cs4 offset: 0x001

The code we have used:

——
#define MBAR_BASE       0xF0000000
#define MALab_MM_START	 0x10020000U
#define MALab_MM_END	 0x10030000U
#define MALab_MM_SIZE	 0x00010000U

#define MALab_DEVICE_NAME	"MALab"

int init_module(void) { ...
    u16 cs4_start_value;
    u16 cs4_stop_value;
    u32 cs4_enable_value;
    u32 cs0_reg_value;
    u32 cs3_reg_value;
    u32 ipbi_cr_value;
    u32 cs_ctrl_reg_value;
    u8 rvoice_ioaddr_value;
    
    
    // reserve a page of memory for our hardware /proc/iomem
    if ( check_region(MALab_MM_START,MALab_MM_SIZE) ) {
        printk (KERN_ALERT "LED init_module: memory already in use\n");
        return -EBUSY;
    }
    
    request_region(MALab_MM_START,MALab_MM_SIZE,MALab_DEVICE_NAME);
    
    void __iomem *reg_base = ioremap(MBAR_BASE, 0x400);
    void __iomem *cs0_reg   = reg_base + 0x0300;
    void __iomem *cs1_reg   = reg_base + 0x0304;
    void __iomem *cs2_reg   = reg_base + 0x0308;
    void __iomem *cs3_reg   = reg_base + 0x030C;
    void __iomem *ipbi_cr = reg_base + 0x0054;
    void __iomem *cs4_start  = reg_base + 0x0024;
    void __iomem *cs4_stop   = reg_base + 0x0028;
    void __iomem *cs4_enable   = reg_base + 0x0310;
    void __iomem *cs_ctrl_reg   = reg_base + 0x0318;
    void __iomem *rvoice_ioaddr   = ioremap ((volatile unsigned long)(MALab_MM_START), MALab_MM_SIZE);
    
    printk(KERN_ALERT “r_ioaddr: 0x%08x\n", (u32)rvoice_ioaddr);
    
    //Chip Select 0/Boot Configuration Register: CE=0
    cs0_reg_value =in_be32(cs0_reg);
    out_be32(cs0_reg, (cs0_reg_value &= ~0x00010000));
    
    //Chip Select 3 Configuration Register: CE=0
    cs3_reg_value =in_be32(cs3_reg);
    out_be32(cs3_reg, (cs3_reg_value &= ~0x00010000));
    
    //enable LocalBus chip select CS4 to hit on our address range
    ipbi_cr_value=in_be32(ipbi_cr);
    out_be32(ipbi_cr, (ipbi_cr_value &= 0x00100001));
    out_be32(ipbi_cr, (ipbi_cr_value |= 0x00100001));
    out_be16(cs4_start+2, MALab_MM_START >>16);
    out_be16(cs4_stop+2, MALab_MM_END >>16);

    // LocalBus Chip Select 4 Configuration Register
    out_be32(cs4_enable, 0x0002DC00);
    
    //Enable Chip Select Control Register
    cs_ctrl_reg_value=in_be32(cs_ctrl_reg);
    out_be32(cs_ctrl_reg, (cs_ctrl_reg_value |= 0x01000000));
    
    //start solution A
    rvoice_ioaddr_value=in_8(rvoice_ioaddr);
    rvoice_ioaddr_value=0xAA;
    printk("A r_ioaddr_value---before : %x \n",in_8(rvoice_ioaddr));
    
    out_8(rvoice_ioaddr, rvoice_ioaddr_value);
    
    printk("A r_ioaddr_value---after : %x \n",in_8(rvoice_ioaddr));
    //end solution A
    
    //start solution B
    *(volatile u8 *)rvoice_ioaddr = 0xAA;
    printk("\n B r_ioaddr_value %x\n",*(volatile u8 *)rvoice_ioaddr);
    //end solution B

    
    ... }
——

the result is:

rvoice_ioaddr: 0xc9080000
A rvoice_ioaddr_value---before : 10 
A rvoice_ioaddr_value---after : 10 
B rvoice_ioaddr_value 10
——

we found some information on this post: http://linuxppc.10917.n7.nabble.com/MPC5200b-kernel-module-memory-mapping-td59862.html

Thanks again…

Lorenzo


[-- Attachment #2: Type: text/html, Size: 14224 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/gpio: Fix the wrong GPIO input data on MPC8572/MPC8536
From: Linus Walleij @ 2013-11-19  9:49 UTC (permalink / raw)
  To: Liu Gang, Anatolij Gustschin, Benjamin Herrenschmidt
  Cc: linux-gpio@vger.kernel.org, linuxppc-dev@lists.ozlabs.org list,
	r61911, b07421
In-Reply-To: <1384499789-3631-1-git-send-email-Gang.Liu@freescale.com>

On Fri, Nov 15, 2013 at 8:16 AM, Liu Gang <Gang.Liu@freescale.com> wrote:

> For MPC8572/MPC8536, the status of GPIOs defined as output
> cannot be determined by reading GPDAT register, so the code
> use shadow data register instead. But if the input pins are
> asserted high, they will always read high due to the shadow
> data, even if the pins are set to low.
>
> So the input pins should be read directly from GPDAT, not
> the shadow data.
>
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> ---
>  drivers/gpio/gpio-mpc8xxx.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c
> index 9ae29cc..1d4ac75 100644
> --- a/drivers/gpio/gpio-mpc8xxx.c
> +++ b/drivers/gpio/gpio-mpc8xxx.c
> @@ -71,6 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio)
>         struct mpc8xxx_gpio_chip *mpc8xxx_gc = to_mpc8xxx_gpio_chip(mm);
>
>         val = in_be32(mm->regs + GPIO_DAT) & ~in_be32(mm->regs + GPIO_DIR);
> +       mpc8xxx_gc->data &= in_be32(mm->regs + GPIO_DIR);
>
>         return (val | mpc8xxx_gc->data) & mpc8xxx_gpio2mask(gpio);
>  }

Anatolij, Ben: can either of you take a look at this patch and ACK it
if OK?

Yours,
Linus Walleij

^ permalink raw reply

* Re: [RFC PATCH powerpc] Fix compiling error in powernv/rng.c
From: Li Zhong @ 2013-11-19  9:24 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: PowerPC email list, Paul Mackerras
In-Reply-To: <20131119040455.GB27740@concordia>

On Tue, 2013-11-19 at 15:04 +1100, Michael Ellerman wrote:
> On Fri, Nov 15, 2013 at 03:36:04PM +0800, Li Zhong wrote:
> > This is seen when CONFIG_SMP is not enabled:
> > 
> > arch/powerpc/platforms/powernv/rng.c: In function 'rng_init_per_cpu':
> > arch/powerpc/platforms/powernv/rng.c:74: error: implicit declaration of function 'cpu_to_chip_id'
> 
> Hi Li,
> 
> We try whenever possible to avoid adding #ifdefs in C code.
> 
> Also on a multi chip system where there are multiple RNGs, your code for
> UP will not necessarily choose the RNG on the same core as the cpu.

OK, thank you for the review, Michael.

Just try to make sure I understand it: 

So even in UP, we could have multiple rng sources, and we should try to
use the source which has the same chip_id as the logical cpu?

Thanks, Zhong

> I have a different fix that I will send.
> 
> cheers
> 

^ permalink raw reply

* [RFC PATCH powerpc] Fix a dma_mask issue of vio
From: Li Zhong @ 2013-11-19  8:11 UTC (permalink / raw)
  To: PowerPC email list; +Cc: Paul Mackerras, rmk+kernel

I encountered following issue:
[    0.283035] ibmvscsi 30000015: couldn't initialize event pool
[    5.688822] ibmvscsi: probe of 30000015 failed with error -1

which prevents the storage from being recognized, and the machine from
booting.

After some digging, it seems that it is caused by commit 4886c399da

as dma_mask pointer in viodev->dev is not set, so in
dma_set_mask_and_coherent(), dma_set_coherent_mask() is not called
because dma_set_mask(), which is dma_set_mask_pSeriesLP() returned EIO.
While before the commit, dma_set_coherent_mask() is always called. 

I tried to replace dma_set_mask_and_coherent() with
dma_coerce_mask_and_coherent(), and the machine could boot again. 

But I'm not sure whether this is the correct fix...

---
 arch/powerpc/kernel/vio.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index e7d0c88..76a6482 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1419,7 +1419,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 
 		/* needed to ensure proper operation of coherent allocations
 		 * later, in case driver doesn't set it explicitly */
-		dma_set_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
+		dma_coerce_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
 	}
 
 	/* register with generic device framework */

^ permalink raw reply related

* [PATCH v3] powerpc: kvm: optimize "sc 1" as fast return
From: Liu Ping Fan @ 2013-11-19  6:12 UTC (permalink / raw)
  To: linuxppc-dev, kvm-ppc
  Cc: kvm, Gleb Natapov, Alexander Graf, Paul Mackerras, Paolo Bonzini
In-Reply-To: <1384841568-20986-1-git-send-email-pingfank@linux.vnet.ibm.com>

In some scene, e.g openstack CI, PR guest can trigger "sc 1" frequently,
this patch optimizes the path by directly delivering BOOK3S_INTERRUPT_SYSCALL
to HV guest, so powernv can return to HV guest without heavy exit, i.e,
no need to swap TLB, HTAB,.. etc

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
v3: add some document
---
 arch/powerpc/kvm/book3s_hv.c            | 10 ++++------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 62a2b5a..1addb1a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -628,12 +628,10 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* hcall - punt to userspace */
 		int i;
 
-		if (vcpu->arch.shregs.msr & MSR_PR) {
-			/* sc 1 from userspace - reflect to guest syscall */
-			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
-			r = RESUME_GUEST;
-			break;
-		}
+		/* hypercall with MSR_PR has already been handled in rmode,
+		 * and never reaches here.
+		 */
+
 		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
 		for (i = 0; i < 9; ++i)
 			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c71103b..eea7ca7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -534,6 +534,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 5:	mtspr	SPRN_SRR0, r6
 	mtspr	SPRN_SRR1, r7
 
+/*
+ * Required state:
+ * R4 = vcpu
+ * R10: value for HSRR0
+ * R11: value for HSRR1
+ * R13 = PACA
+ */
 fast_guest_return:
 	li	r0,0
 	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
@@ -1388,7 +1395,8 @@ kvmppc_hisi:
 hcall_try_real_mode:
 	ld	r3,VCPU_GPR(R3)(r9)
 	andi.	r0,r11,MSR_PR
-	bne	guest_exit_cont
+	/* sc 1 from userspace - reflect to guest syscall */
+	bne	sc_1_fast_return
 	clrrdi	r3,r3,2
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
 	bge	guest_exit_cont
@@ -1409,6 +1417,15 @@ hcall_try_real_mode:
 	ld	r11,VCPU_MSR(r4)
 	b	fast_guest_return
 
+sc_1_fast_return:
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10, BOOK3S_INTERRUPT_SYSCALL
+	li	r11, (MSR_ME << 1) | 1  /* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+	mr	r4,r9
+	b	fast_guest_return
+
 	/* We've attempted a real mode hcall, but it's punted it back
 	 * to userspace.  We need to restore some clobbered volatiles
 	 * before resuming the pass-it-to-qemu path */
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH RESEND v4] powerpc: kvm: fix rare but potential deadlock scene
From: Liu Ping Fan @ 2013-11-19  6:12 UTC (permalink / raw)
  To: linuxppc-dev, kvm-ppc
  Cc: kvm, Gleb Natapov, Alexander Graf, Paul Mackerras, Paolo Bonzini

Since kvmppc_hv_find_lock_hpte() is called from both virtmode and
realmode, so it can trigger the deadlock.

Suppose the following scene:

Two physical cpuM, cpuN, two VM instances A, B, each VM has a group of
vcpus.

If on cpuM, vcpu_A_1 holds bitlock X (HPTE_V_HVLOCK), then is switched
out, and on cpuN, vcpu_A_2 try to lock X in realmode, then cpuN will be
caught in realmode for a long time.

What makes things even worse if the following happens,
  On cpuM, bitlockX is hold, on cpuN, Y is hold.
  vcpu_B_2 try to lock Y on cpuM in realmode
  vcpu_A_2 try to lock X on cpuN in realmode

Oops! deadlock happens

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 +++++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 842f081..abf81fe 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -473,11 +473,14 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		slb_v = vcpu->kvm->arch.vrma_slb_v;
 	}
 
+	preempt_disable();
 	/* Find the HPTE in the hash table */
 	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
 					 HPTE_V_VALID | HPTE_V_ABSENT);
-	if (index < 0)
+	if (index < 0) {
+		preempt_enable();
 		return -ENOENT;
+	}
 	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 	v = hptep[0] & ~HPTE_V_HVLOCK;
 	gr = kvm->arch.revmap[index].guest_rpte;
@@ -485,6 +488,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	/* Unlock the HPTE */
 	asm volatile("lwsync" : : : "memory");
 	hptep[0] = v;
+	preempt_enable();
 
 	gpte->eaddr = eaddr;
 	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 9c51544..ea17b30 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -749,6 +749,10 @@ static int slb_base_page_shift[4] = {
 	20,	/* 1M, unsupported */
 };
 
+/* When called from virtmode, this func should be protected by
+ * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK
+ * can trigger deadlock issue.
+ */
 long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 			      unsigned long valid)
 {
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH 9/9 v2] vfio pci: Add vfio iommu implementation for FSL_PAMU
From: Bharat Bhushan @ 2013-11-19  5:17 UTC (permalink / raw)
  To: alex.williamson, joro, bhelgaas, agraf, scottwood, stuart.yoder,
	iommu, linux-pci, linuxppc-dev, linux-kernel
  Cc: Bharat Bhushan
In-Reply-To: <1384838233-24847-1-git-send-email-Bharat.Bhushan@freescale.com>

This patch adds vfio iommu support for Freescale IOMMU (PAMU -
Peripheral Access Management Unit).

The Freescale PAMU is an aperture-based IOMMU with the following
characteristics.  Each device has an entry in a table in memory
describing the iova->phys mapping. The mapping has:
   -an overall aperture that is power of 2 sized, and has a start iova that
    is naturally aligned
   -has 1 or more windows within the aperture
   -number of windows must be power of 2, max is 256
   -size of each window is determined by aperture size / # of windows
   -iova of each window is determined by aperture start iova / # of windows
   -the mapped region in each window can be different than
    the window size...mapping must power of 2
   -physical address of the mapping must be naturally aligned
    with the mapping size

Some of the code is derived from TYPE1 iommu (driver/vfio/vfio_iommu_type1.c).

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v1->v2
 - Use lock around msi-dma list
 - check for overlap between dma and msi-dma pages
 - Some code cleanup as per various comments

 drivers/vfio/Kconfig               |    6 +
 drivers/vfio/Makefile              |    1 +
 drivers/vfio/vfio_iommu_fsl_pamu.c | 1003 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h          |  100 ++++
 4 files changed, 1110 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu_fsl_pamu.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 26b3d9d..7d1da26 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
 	depends on VFIO && SPAPR_TCE_IOMMU
 	default n
 
+config VFIO_IOMMU_FSL_PAMU
+	tristate
+	depends on VFIO
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
 	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+	select VFIO_IOMMU_FSL_PAMU if FSL_PAMU
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c5792ec..7461350 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o vfio_iommu_spapr_tce.o
+obj-$(CONFIG_VFIO_IOMMU_FSL_PAMU) += vfio_iommu_common.o vfio_iommu_fsl_pamu.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_fsl_pamu.c b/drivers/vfio/vfio_iommu_fsl_pamu.c
new file mode 100644
index 0000000..66efc84
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_fsl_pamu.c
@@ -0,0 +1,1003 @@
+/*
+ * VFIO: IOMMU DMA mapping support for FSL PAMU IOMMU
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ *     Author: Bharat Bhushan <bharat.bhushan@freescale.com>
+ *
+ * This file is derived from driver/vfio/vfio_iommu_type1.c
+ *
+ * The Freescale PAMU is an aperture-based IOMMU with the following
+ * characteristics.  Each device has an entry in a table in memory
+ * describing the iova->phys mapping. The mapping has:
+ *  -an overall aperture that is power of 2 sized, and has a start iova that
+ *   is naturally aligned
+ *  -has 1 or more windows within the aperture
+ *     -number of windows must be power of 2, max is 256
+ *     -size of each window is determined by aperture size / # of windows
+ *     -iova of each window is determined by aperture start iova / # of windows
+ *     -the mapped region in each window can be different than
+ *      the window size...mapping must power of 2
+ *     -physical address of the mapping must be naturally aligned
+ *      with the mapping size
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>		/* pci_bus_type */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/hugetlb.h>
+#include <linux/msi.h>
+#include <asm/fsl_pamu_stash.h>
+
+#include "vfio_iommu_common.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Bharat Bhushan <bharat.bhushan@freescale.com>"
+#define DRIVER_DESC     "FSL PAMU IOMMU driver for VFIO"
+
+struct vfio_iommu {
+	struct iommu_domain	*domain;
+	struct mutex		lock;
+	dma_addr_t		aperture_start;
+	dma_addr_t		aperture_end;
+	dma_addr_t		page_size;	/* Maximum mapped Page size */
+	int			nsubwindows;	/* Number of subwindows */
+	struct rb_root		dma_list;
+	struct list_head	msi_dma_list;
+	struct list_head	group_list;
+};
+
+struct vfio_dma {
+	struct rb_node		node;
+	dma_addr_t		iova;		/* Device address */
+	unsigned long		vaddr;		/* Process virtual addr */
+	size_t			size;		/* Map size (bytes) */
+	int			prot;		/* IOMMU_READ/WRITE */
+};
+
+struct vfio_msi_dma {
+	struct list_head	next;
+	dma_addr_t		iova;		/* Device address */
+	size_t			size;		/* MSI page size */
+	int			bank_id;
+	int			prot;		/* IOMMU_READ/WRITE */
+};
+
+struct vfio_group {
+	struct iommu_group	*iommu_group;
+	struct list_head	next;
+};
+
+static int iova_to_win(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+	u64 offset = iova - iommu->aperture_start;
+	do_div(offset, iommu->page_size);
+	return (int) offset;
+}
+
+static int vfio_disable_iommu_domain(struct vfio_iommu *iommu)
+{
+	int enable = 0;
+	return iommu_domain_set_attr(iommu->domain,
+				     DOMAIN_ATTR_FSL_PAMU_ENABLE, &enable);
+}
+
+static int vfio_enable_iommu_domain(struct vfio_iommu *iommu)
+{
+	int enable = 1;
+	return iommu_domain_set_attr(iommu->domain,
+				     DOMAIN_ATTR_FSL_PAMU_ENABLE, &enable);
+}
+
+/* Unmap DMA region */
+/* This function disable iommu if no dma mapping is set */
+static void vfio_check_and_disable_iommu(struct vfio_iommu *iommu)
+{
+	if (list_empty(&iommu->msi_dma_list) && !rb_first(&iommu->dma_list))
+		vfio_disable_iommu_domain(iommu);
+}
+
+static struct vfio_msi_dma *vfio_find_msi_dma(struct vfio_iommu *iommu,
+					      dma_addr_t start, size_t size)
+{
+	struct vfio_msi_dma *msi_dma;
+
+	/* Check MSI MAP entries */
+	list_for_each_entry(msi_dma, &iommu->msi_dma_list, next) {
+		if ((start + size) <= (msi_dma->iova))
+			continue;
+
+		if ((start >= (msi_dma->iova + msi_dma->size)))
+			continue;
+
+		return msi_dma;
+	}
+
+	return NULL;
+}
+
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+				      dma_addr_t start, size_t size)
+{
+	struct rb_node *node = iommu->dma_list.rb_node;
+
+	/* check DMA MAP entries */
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start + size <= dma->iova)
+			node = node->rb_left;
+		else if (start >= dma->iova + dma->size)
+			node = node->rb_right;
+		else
+			return dma;
+	}
+
+	return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+	struct vfio_dma *dma;
+
+	while (*link) {
+		parent = *link;
+		dma = rb_entry(parent, struct vfio_dma, node);
+
+		if (new->iova + new->size <= dma->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+	rb_erase(&old->node, &iommu->dma_list);
+	vfio_check_and_disable_iommu(iommu);
+}
+
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    dma_addr_t iova, size_t *size)
+{
+	dma_addr_t start = iova;
+	int win, win_start, win_end;
+	long unlocked = 0;
+	unsigned int nr_pages;
+
+	nr_pages = iommu->page_size / PAGE_SIZE;
+	win_start = iova_to_win(iommu, iova);
+	win_end = iova_to_win(iommu, iova + *size - 1);
+
+	/* Release the pinned pages */
+	for (win = win_start; win <= win_end; iova += iommu->page_size, win++) {
+		unsigned long pfn;
+
+		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
+		if (!pfn)
+			continue;
+
+		iommu_domain_window_disable(iommu->domain, win);
+
+		unlocked += vfio_unpin_pages(pfn, nr_pages, dma->prot, 1);
+	}
+
+	vfio_lock_acct(-unlocked);
+	*size = iova - start;
+	return 0;
+}
+
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+				   size_t *size, struct vfio_dma *dma)
+{
+	size_t offset, overlap, tmp;
+	struct vfio_dma *split;
+	int ret;
+
+	if (!*size)
+		return 0;
+
+	/*
+	 * Existing dma region is completely covered, unmap all.  This is
+	 * the likely case since userspace tends to map and unmap buffers
+	 * in one shot rather than multiple mappings within a buffer.
+	 */
+	if (likely(start <= dma->iova &&
+		   start + *size >= dma->iova + dma->size)) {
+		*size = dma->size;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+		if (ret)
+			return ret;
+
+		/*
+		 * Did we remove more than we have?  Should never happen
+		 * since a vfio_dma is contiguous in iova and vaddr.
+		 */
+		WARN_ON(*size != dma->size);
+
+		vfio_remove_dma(iommu, dma);
+		kfree(dma);
+		return 0;
+	}
+
+	/* Overlap low address of existing range */
+	if (start <= dma->iova) {
+		overlap = start + *size - dma->iova;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+		if (ret)
+			return ret;
+
+		vfio_remove_dma(iommu, dma);
+
+		/*
+		 * Check, we may have removed to whole vfio_dma.  If not
+		 * fixup and re-insert.
+		 */
+		if (overlap < dma->size) {
+			dma->iova += overlap;
+			dma->vaddr += overlap;
+			dma->size -= overlap;
+			vfio_insert_dma(iommu, dma);
+		} else
+			kfree(dma);
+
+		*size = overlap;
+		return 0;
+	}
+
+	/* Overlap high address of existing range */
+	if (start + *size >= dma->iova + dma->size) {
+		offset = start - dma->iova;
+		overlap = dma->size - offset;
+
+		ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+		if (ret)
+			return ret;
+
+		dma->size -= overlap;
+		*size = overlap;
+		return 0;
+	}
+
+	/* Split existing */
+
+	/*
+	 * Allocate our tracking structure early even though it may not
+	 * be used.  An Allocation failure later loses track of pages and
+	 * is more difficult to unwind.
+	 */
+	split = kzalloc(sizeof(*split), GFP_KERNEL);
+	if (!split)
+		return -ENOMEM;
+
+	offset = start - dma->iova;
+
+	ret = vfio_unmap_unpin(iommu, dma, start, size);
+	if (ret || !*size) {
+		kfree(split);
+		return ret;
+	}
+
+	tmp = dma->size;
+
+	/* Resize the lower vfio_dma in place, before the below insert */
+	dma->size = offset;
+
+	/* Insert new for remainder, assuming it didn't all get unmapped */
+	if (likely(offset + *size < tmp)) {
+		split->size = tmp - offset - *size;
+		split->iova = dma->iova + offset + *size;
+		split->vaddr = dma->vaddr + offset + *size;
+		split->prot = dma->prot;
+		vfio_insert_dma(iommu, split);
+	} else
+		kfree(split);
+
+	return 0;
+}
+
+/* Map DMA region */
+static int vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long vaddr, long npage, int prot)
+{
+	int ret = 0, i;
+	size_t size;
+	unsigned int win, nr_subwindows;
+	dma_addr_t iovamap;
+
+	win = iova_to_win(iommu, iova);
+	if (iova != iommu->aperture_start + iommu->page_size * win) {
+		pr_err("%s iova(%llx) unalligned to window size %llx\n",
+			__func__, iova, iommu->page_size);
+		return -EINVAL;
+	}
+
+	/* total size to be mapped */
+	size = npage << PAGE_SHIFT;
+	nr_subwindows = size >> ilog2(iommu->page_size);
+	iovamap = iova;
+
+	for (i = 0; i < nr_subwindows; i++, win++) {
+		unsigned long pfn;
+		unsigned long nr_pages;
+		dma_addr_t mapsize;
+		struct vfio_dma *dma = NULL;
+
+		mapsize = min(iova + size - iovamap, iommu->page_size);
+		nr_pages = mapsize >> PAGE_SHIFT;
+
+		/* Pin a contiguous chunk of memory */
+		ret = vfio_pin_pages(vaddr, nr_pages, prot, &pfn);
+		if (ret != nr_pages) {
+			pr_err("%s unable to pin pages = %lx, pinned(%lx/%lx)\n",
+				__func__, vaddr, npage, nr_pages);
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = iommu_domain_window_enable(iommu->domain, win,
+						 (phys_addr_t)pfn << PAGE_SHIFT,
+						 mapsize, prot);
+		if (ret) {
+			pr_err("%s unable to iommu_map()\n", __func__);
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Check if we abut a region below - nothing below 0.
+		 * This is the most likely case when mapping chunks of
+		 * physically contiguous regions within a virtual address
+		 * range.  Update the abutting entry in place since iova
+		 * doesn't change.
+		 */
+		if (likely(iovamap)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iovamap - 1, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr + tmp->size == vaddr) {
+				tmp->size += mapsize;
+				dma = tmp;
+			}
+		}
+
+		/*
+		 * Check if we abut a region above - nothing above ~0 + 1.
+		 * If we abut above and below, remove and free.  If only
+		 * abut above, remove, modify, reinsert.
+		 */
+		if (likely(iovamap + mapsize)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iovamap + mapsize, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr == vaddr + mapsize) {
+				vfio_remove_dma(iommu, tmp);
+				if (dma) {
+					dma->size += tmp->size;
+					kfree(tmp);
+				} else {
+					tmp->size += mapsize;
+					tmp->iova = iovamap;
+					tmp->vaddr = vaddr;
+					vfio_insert_dma(iommu, tmp);
+					dma = tmp;
+				}
+			}
+		}
+
+		if (!dma) {
+			dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+			if (!dma) {
+				iommu_unmap(iommu->domain, iovamap, mapsize);
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -ENOMEM;
+				break;
+			}
+
+			dma->size = mapsize;
+			dma->iova = iovamap;
+			dma->vaddr = vaddr;
+			dma->prot = prot;
+			vfio_insert_dma(iommu, dma);
+		}
+
+		iovamap += mapsize;
+		vaddr += mapsize;
+	}
+
+	if (ret) {
+		struct vfio_dma *tmp;
+		while ((tmp = vfio_find_dma(iommu, iova, size))) {
+			int r = vfio_remove_dma_overlap(iommu, iova,
+							&size, tmp);
+			if (WARN_ON(r || !size))
+				break;
+		}
+		return 0;
+	}
+
+	vfio_enable_iommu_domain(iommu);
+	return 0;
+}
+
+static int vfio_dma_do_map(struct vfio_iommu *iommu,
+			   struct vfio_iommu_type1_dma_map *map)
+{
+	dma_addr_t iova = map->iova;
+	size_t size = map->size;
+	unsigned long vaddr = map->vaddr;
+	int ret = 0, prot = 0;
+	long npage;
+
+	/* READ/WRITE from device perspective */
+	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+		prot |= IOMMU_WRITE;
+	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
+		prot |= IOMMU_READ;
+
+	if (!prot)
+		return -EINVAL; /* No READ/WRITE? */
+
+	/* Don't allow IOVA wrap */
+	if (iova + size && iova + size < iova)
+		return -EINVAL;
+
+	/* Don't allow virtual address wrap */
+	if (vaddr + size && vaddr + size < vaddr)
+		return -EINVAL;
+
+	/*
+	 * FIXME: Currently we only support mapping page-size
+	 * of subwindow-size.
+	 */
+	if (size < iommu->page_size)
+		return -EINVAL;
+
+	npage = size >> PAGE_SHIFT;
+	if (!npage)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	/* Check for dma maping and msi_dma mapping */
+	if (vfio_find_dma(iommu, iova, size) ||
+	    vfio_find_msi_dma(iommu, iova, size)) {
+		ret = -EEXIST;
+		goto out_lock;
+	}
+
+	ret = vfio_dma_map(iommu, iova, vaddr, npage, prot);
+
+out_lock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
+			     struct vfio_iommu_type1_dma_unmap *unmap)
+{
+	struct vfio_dma *dma;
+	size_t unmapped = 0, size;
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+
+	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+		size = unmap->size;
+		ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+		if (ret || !size)
+			break;
+		unmapped += size;
+	}
+
+	mutex_unlock(&iommu->lock);
+
+	/*
+	 * We may unmap more than requested, update the unmap struct so
+	 * userspace can know.
+	 */
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+static int vfio_handle_get_attr(struct vfio_iommu *iommu,
+			 struct vfio_pamu_attr *pamu_attr)
+{
+	int ret = 0;
+
+	switch (pamu_attr->attribute) {
+	case VFIO_ATTR_GEOMETRY: {
+		struct iommu_domain_geometry geom;
+		ret = iommu_domain_get_attr(iommu->domain,
+					  DOMAIN_ATTR_GEOMETRY, &geom);
+		pamu_attr->attr_info.attr.aperture_start = geom.aperture_start;
+		pamu_attr->attr_info.attr.aperture_end = geom.aperture_end;
+		break;
+	}
+	case VFIO_ATTR_WINDOWS: {
+		u32 count;
+		ret = iommu_domain_get_attr(iommu->domain,
+				      DOMAIN_ATTR_WINDOWS, &count);
+		pamu_attr->attr_info.windows = count;
+		break;
+	}
+	case VFIO_ATTR_PAMU_STASH: {
+		struct pamu_stash_attribute stash;
+		ret = iommu_domain_get_attr(iommu->domain,
+				      DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+		pamu_attr->attr_info.stash.cpu = stash.cpu;
+		pamu_attr->attr_info.stash.cache = stash.cache;
+		break;
+	}
+
+	default:
+		pr_err("%s Error: Invalid attribute (%d)\n",
+			 __func__, pamu_attr->attribute);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int vfio_handle_set_attr(struct vfio_iommu *iommu,
+			 struct vfio_pamu_attr *pamu_attr)
+{
+	int ret = 0;
+
+	switch (pamu_attr->attribute) {
+	case VFIO_ATTR_GEOMETRY: {
+		struct iommu_domain_geometry geom;
+
+		geom.aperture_start = pamu_attr->attr_info.attr.aperture_start;
+		geom.aperture_end = pamu_attr->attr_info.attr.aperture_end;
+		iommu->aperture_start = geom.aperture_start;
+		iommu->aperture_end = geom.aperture_end;
+		geom.force_aperture = 1;
+		ret = iommu_domain_set_attr(iommu->domain,
+					  DOMAIN_ATTR_GEOMETRY, &geom);
+		break;
+	}
+	case VFIO_ATTR_WINDOWS: {
+		u32 count = pamu_attr->attr_info.windows;
+		u64 size = iommu->aperture_end - iommu->aperture_start + 1;
+
+		ret = iommu_domain_set_attr(iommu->domain,
+				      DOMAIN_ATTR_WINDOWS, &count);
+		if (!ret) {
+			iommu->nsubwindows = pamu_attr->attr_info.windows;
+			iommu->page_size = size >> ilog2(count);
+		}
+
+		break;
+	}
+	case VFIO_ATTR_PAMU_STASH: {
+		struct pamu_stash_attribute stash;
+
+		stash.cpu = pamu_attr->attr_info.stash.cpu;
+		stash.cache = pamu_attr->attr_info.stash.cache;
+		ret = iommu_domain_set_attr(iommu->domain,
+				      DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+		break;
+	}
+
+	default:
+		pr_err("%s Error: Invalid attribute (%d)\n",
+			 __func__, pamu_attr->attribute);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int pci_msi_set_device_iova(struct device *dev, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_msi_dma *msi_dma = data;
+
+	return msi_set_iova(pdev, msi_dma->bank_id, msi_dma->iova, 1);
+}
+
+static int pci_msi_clear_device_iova(struct device *dev, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_msi_dma *msi_dma = data;
+
+	return msi_set_iova(pdev, msi_dma->bank_id, msi_dma->iova, 0);
+}
+
+static int vfio_iommu_set_msi_iova(struct vfio_iommu *iommu,
+				   struct vfio_msi_dma *msi_dma)
+{
+	struct vfio_group *group;
+	int ret = 0;
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		ret = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+					       pci_msi_set_device_iova);
+	}
+
+	return ret;
+}
+
+static int vfio_iommu_clear_msi_iova(struct vfio_iommu *iommu,
+				     struct vfio_msi_dma *msi_dma)
+{
+	struct vfio_group *group;
+	int ret = 0;
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		ret = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+					       pci_msi_clear_device_iova);
+	}
+
+	return ret;
+}
+
+static int vfio_do_msi_map(struct vfio_iommu *iommu,
+			struct vfio_pamu_msi_bank_map *msi_map)
+{
+	struct msi_region region;
+	struct vfio_msi_dma *msi_dma;
+	int window;
+	int prot = 0;
+	int ret;
+
+	/* READ/WRITE from device perspective */
+	if (msi_map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+		prot |= IOMMU_WRITE;
+	if (msi_map->flags & VFIO_DMA_MAP_FLAG_READ)
+		prot |= IOMMU_READ;
+
+	if (!prot)
+		return -EINVAL; /* No READ/WRITE? */
+
+	ret = msi_get_region(msi_map->msi_bank_index, &region);
+	if (ret) {
+		pr_err("%s MSI region (%d) not found\n", __func__,
+		       msi_map->msi_bank_index);
+		return ret;
+	}
+
+	mutex_lock(&iommu->lock);
+	/* Check for dma maping and msi_dma mapping */
+	if (vfio_find_dma(iommu, msi_map->iova, region.size) ||
+	    vfio_find_msi_dma(iommu, msi_map->iova, region.size)) {
+		ret = -EEXIST;
+		goto out_lock;
+	}
+
+	window = iova_to_win(iommu, msi_map->iova);
+	ret = iommu_domain_window_enable(iommu->domain, window, region.addr,
+					 region.size, prot);
+	if (ret) {
+		pr_err("%s Error: unable to map msi region\n", __func__);
+		goto out_lock;
+	}
+
+	msi_dma = kzalloc(sizeof(*msi_dma), GFP_KERNEL);
+	if (!msi_dma) {
+		ret = -ENOMEM;
+		goto out_lock;
+	}
+
+	msi_dma->iova = msi_map->iova;
+	msi_dma->size = region.size;
+	msi_dma->bank_id = msi_map->msi_bank_index;
+	list_add(&msi_dma->next, &iommu->msi_dma_list);
+
+	/* Set iova for all the device in iommu-group for the given msi-bank */
+	ret = vfio_iommu_set_msi_iova(iommu, msi_dma);
+
+out_lock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static void vfio_msi_unmap(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+	int window;
+	window = iova_to_win(iommu, iova);
+	iommu_domain_window_disable(iommu->domain, window);
+}
+
+static int vfio_do_msi_unmap(struct vfio_iommu *iommu,
+			     struct vfio_pamu_msi_bank_unmap *msi_unmap)
+{
+	struct vfio_msi_dma *mdma, *mdma_tmp;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry_safe(mdma, mdma_tmp, &iommu->msi_dma_list, next) {
+		if (mdma->iova == msi_unmap->iova) {
+			/* Clear mapping for msi iova page mapping */
+			vfio_iommu_clear_msi_iova(iommu, mdma);
+			/* Unmap in iommu (PAMU) */
+			vfio_msi_unmap(iommu, mdma->iova);
+			list_del(&mdma->next);
+			vfio_check_and_disable_iommu(iommu);
+			kfree(mdma);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+	}
+
+	mutex_unlock(&iommu->lock);
+	return -EINVAL;
+}
+static void *vfio_iommu_fsl_pamu_open(unsigned long arg)
+{
+	struct vfio_iommu *iommu;
+
+	if (arg != VFIO_FSL_PAMU_IOMMU)
+		return ERR_PTR(-EINVAL);
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&iommu->group_list);
+	iommu->dma_list = RB_ROOT;
+	INIT_LIST_HEAD(&iommu->msi_dma_list);
+	mutex_init(&iommu->lock);
+
+	/*
+	 * Wish we didn't have to know about bus_type here.
+	 */
+	iommu->domain = iommu_domain_alloc(&pci_bus_type);
+	if (!iommu->domain) {
+		kfree(iommu);
+		return ERR_PTR(-EIO);
+	}
+
+	return iommu;
+}
+
+static void vfio_iommu_fsl_pamu_release(void *iommu_data)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group, *group_tmp;
+	struct vfio_msi_dma *mdma, *mdma_tmp;
+	struct rb_node *node;
+
+	list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
+		iommu_detach_group(iommu->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	while ((node = rb_first(&iommu->dma_list))) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+		size_t size = dma->size;
+		vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
+		if (WARN_ON(!size))
+			break;
+	}
+
+	list_for_each_entry_safe(mdma, mdma_tmp, &iommu->msi_dma_list, next) {
+		vfio_msi_unmap(iommu, mdma->iova);
+		list_del(&mdma->next);
+		kfree(mdma);
+	}
+
+	/* Disable the iommu as there is no valid entry */
+	vfio_disable_iommu_domain(iommu);
+
+	iommu_domain_free(iommu->domain);
+	iommu->domain = NULL;
+	kfree(iommu);
+}
+
+static long vfio_iommu_fsl_pamu_ioctl(void *iommu_data,
+				      unsigned int cmd, unsigned long arg)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	unsigned long minsz;
+
+	if (cmd == VFIO_CHECK_EXTENSION) {
+		switch (arg) {
+		case VFIO_FSL_PAMU_IOMMU:
+			return 1;
+		default:
+			return 0;
+		}
+	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
+		struct vfio_iommu_type1_dma_map map;
+		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (map.argsz < minsz || map.flags & ~mask)
+			return -EINVAL;
+
+		return vfio_dma_do_map(iommu, &map);
+
+	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
+		struct vfio_iommu_type1_dma_unmap unmap;
+		long ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+		if (copy_from_user(&unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (unmap.argsz < minsz || unmap.flags)
+			return -EINVAL;
+
+		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &unmap, minsz);
+	} else if (cmd == VFIO_IOMMU_PAMU_GET_ATTR) {
+		struct vfio_pamu_attr pamu_attr;
+
+		minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+		if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (pamu_attr.argsz < minsz)
+			return -EINVAL;
+
+		vfio_handle_get_attr(iommu, &pamu_attr);
+
+		copy_to_user((void __user *)arg, &pamu_attr, minsz);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_SET_ATTR) {
+		struct vfio_pamu_attr pamu_attr;
+
+		minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+		if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (pamu_attr.argsz < minsz)
+			return -EINVAL;
+
+		vfio_handle_set_attr(iommu, &pamu_attr);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT) {
+		return msi_get_region_count();
+	} else if (cmd == VFIO_IOMMU_PAMU_MAP_MSI_BANK) {
+		struct vfio_pamu_msi_bank_map msi_map;
+
+		minsz = offsetofend(struct vfio_pamu_msi_bank_map, iova);
+		if (copy_from_user(&msi_map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (msi_map.argsz < minsz)
+			return -EINVAL;
+
+		vfio_do_msi_map(iommu, &msi_map);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_UNMAP_MSI_BANK) {
+		struct vfio_pamu_msi_bank_unmap msi_unmap;
+
+		minsz = offsetofend(struct vfio_pamu_msi_bank_unmap, iova);
+		if (copy_from_user(&msi_unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (msi_unmap.argsz < minsz)
+			return -EINVAL;
+
+		vfio_do_msi_unmap(iommu, &msi_unmap);
+		return 0;
+
+	}
+
+	return -ENOTTY;
+}
+
+static int vfio_iommu_fsl_pamu_attach_group(void *iommu_data,
+					 struct iommu_group *iommu_group)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group, *tmp;
+	int ret;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(tmp, &iommu->group_list, next) {
+		if (tmp->iommu_group == iommu_group) {
+			mutex_unlock(&iommu->lock);
+			kfree(group);
+			return -EINVAL;
+		}
+	}
+
+	ret = iommu_attach_group(iommu->domain, iommu_group);
+	if (ret) {
+		mutex_unlock(&iommu->lock);
+		kfree(group);
+		return ret;
+	}
+
+	group->iommu_group = iommu_group;
+	list_add(&group->next, &iommu->group_list);
+
+	mutex_unlock(&iommu->lock);
+
+	return 0;
+}
+
+static void vfio_iommu_fsl_pamu_detach_group(void *iommu_data,
+					  struct iommu_group *iommu_group)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		if (group->iommu_group == iommu_group) {
+			iommu_detach_group(iommu->domain, iommu_group);
+			list_del(&group->next);
+			kfree(group);
+			break;
+		}
+	}
+
+	mutex_unlock(&iommu->lock);
+}
+
+static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_fsl_pamu = {
+	.name		= "vfio-iommu-fsl_pamu",
+	.owner		= THIS_MODULE,
+	.open		= vfio_iommu_fsl_pamu_open,
+	.release	= vfio_iommu_fsl_pamu_release,
+	.ioctl		= vfio_iommu_fsl_pamu_ioctl,
+	.attach_group	= vfio_iommu_fsl_pamu_attach_group,
+	.detach_group	= vfio_iommu_fsl_pamu_detach_group,
+};
+
+static int __init vfio_iommu_fsl_pamu_init(void)
+{
+	if (!iommu_present(&pci_bus_type))
+		return -ENODEV;
+
+	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_fsl_pamu);
+}
+
+static void __exit vfio_iommu_fsl_pamu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_fsl_pamu);
+}
+
+module_init(vfio_iommu_fsl_pamu_init);
+module_exit(vfio_iommu_fsl_pamu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5..d359055 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,7 @@
 
 #define VFIO_TYPE1_IOMMU		1
 #define VFIO_SPAPR_TCE_IOMMU		2
+#define VFIO_FSL_PAMU_IOMMU		3
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -451,4 +452,103 @@ struct vfio_iommu_spapr_tce_info {
 
 /* ***************************************************************** */
 
+/*********** APIs for VFIO_PAMU type only ****************/
+/*
+ * VFIO_IOMMU_PAMU_GET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 17,
+ *				  struct vfio_pamu_attr)
+ *
+ * Gets the iommu attributes for the current vfio container.
+ * Caller sets argsz and attribute.  The ioctl fills in
+ * the provided struct vfio_pamu_attr based on the attribute
+ * value that was set.
+ * Return: 0 on success, -errno on failure
+ */
+struct vfio_pamu_attr {
+	__u32	argsz;
+	__u32	flags;	/* no flags currently */
+#define VFIO_ATTR_GEOMETRY	0
+#define VFIO_ATTR_WINDOWS	1
+#define VFIO_ATTR_PAMU_STASH	2
+	__u32	attribute;
+
+	union {
+		/* VFIO_ATTR_GEOMETRY */
+		struct {
+			/* first addr that can be mapped */
+			__u64 aperture_start;
+			/* last addr that can be mapped */
+			__u64 aperture_end;
+		} attr;
+
+		/* VFIO_ATTR_WINDOWS */
+		__u32 windows;  /* number of windows in the aperture
+				 * initially this will be the max number
+				 * of windows that can be set
+				 */
+		/* VFIO_ATTR_PAMU_STASH */
+		struct {
+			__u32 cpu;	/* CPU number for stashing */
+			__u32 cache;	/* cache ID for stashing */
+		} stash;
+	} attr_info;
+};
+#define VFIO_IOMMU_PAMU_GET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/*
+ * VFIO_IOMMU_PAMU_SET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 18,
+ *				  struct vfio_pamu_attr)
+ *
+ * Sets the iommu attributes for the current vfio container.
+ * Caller sets struct vfio_pamu attr, including argsz and attribute and
+ * setting any fields that are valid for the attribute.
+ * Return: 0 on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_SET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/*
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT - _IO(VFIO_TYPE, VFIO_BASE + 19, __u32)
+ *
+ * Returns the number of MSI banks for this platform.  This tells user space
+ * how many aperture windows should be reserved for MSI banks when setting
+ * the PAMU geometry and window count.
+ * Return: __u32 bank count on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT _IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/*
+ * VFIO_IOMMU_PAMU_MAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 20,
+ *				      struct vfio_pamu_msi_bank_map)
+ *
+ * Maps the MSI bank at the specified index and iova.  User space must
+ * call this ioctl once for each MSI bank (count of banks is returned by
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT).
+ * Caller provides struct vfio_pamu_msi_bank_map with all fields set.
+ * Return: 0 on success, -errno on failure
+ */
+
+struct vfio_pamu_msi_bank_map {
+	__u32	argsz;
+	__u32	flags;		/* no flags currently */
+	__u32	msi_bank_index;	/* the index of the MSI bank */
+	__u64	iova;		/* the iova the bank is to be mapped to */
+};
+#define VFIO_IOMMU_PAMU_MAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+/*
+ * VFIO_IOMMU_PAMU_UNMAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 21,
+ *					struct vfio_pamu_msi_bank_unmap)
+ *
+ * Unmaps the MSI bank at the specified iova.
+ * Caller provides struct vfio_pamu_msi_bank_unmap with all fields set.
+ * Operates on VFIO file descriptor (/dev/vfio/vfio).
+ * Return: 0 on success, -errno on failure
+ */
+
+struct vfio_pamu_msi_bank_unmap {
+	__u32	argsz;
+	__u32	flags;	/* no flags currently */
+	__u64	iova;	/* the iova to be unmapped to */
+};
+#define VFIO_IOMMU_PAMU_UNMAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 21)
+
 #endif /* _UAPIVFIO_H */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 8/9 v2] vfio: moving some functions in common file
From: Bharat Bhushan @ 2013-11-19  5:17 UTC (permalink / raw)
  To: alex.williamson, joro, bhelgaas, agraf, scottwood, stuart.yoder,
	iommu, linux-pci, linuxppc-dev, linux-kernel
  Cc: Bharat Bhushan
In-Reply-To: <1384838233-24847-1-git-send-email-Bharat.Bhushan@freescale.com>

Some function defined in vfio_iommu_type1.c are generic (not specific
or type1 iommu) and we want to use these for FSL IOMMU (PAMU) and
going forward in iommu-none driver.
So I have created a new file naming vfio_iommu_common.c and moved some
of generic functions into this file.

I Agree (with Alex Williamson and myself :-)) that some more functions
can be moved to this new common file (with some changes in type1/fsl_pamu
and others). But in this patch i avoided doing these changes and
just moved functions which are straight forward and allow me to
get fsl-powerpc vfio framework in place.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v1->v2
 - removed un-necessary header file inclusion
 - mark static function which are internal to *common.c

 drivers/vfio/Makefile            |    4 +-
 drivers/vfio/vfio_iommu_common.c |  227 ++++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio_iommu_common.h |   27 +++++
 drivers/vfio/vfio_iommu_type1.c  |  206 +----------------------------------
 4 files changed, 257 insertions(+), 207 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu_common.c
 create mode 100644 drivers/vfio/vfio_iommu_common.h

diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 72bfabc..c5792ec 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
-obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
-obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
+obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_common.c b/drivers/vfio/vfio_iommu_common.c
new file mode 100644
index 0000000..08eea71
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_common.c
@@ -0,0 +1,227 @@
+/*
+ * VFIO: Common code for vfio IOMMU support
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *     Author: Bharat Bhushan <bharat.bhushan@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/compat.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+static bool disable_hugepages;
+module_param_named(disable_hugepages,
+		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_hugepages,
+		 "Disable VFIO IOMMU support for IOMMU hugepages.");
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void vfio_lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+void vfio_lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm || !npage)
+		return; /* process exited or nothing to do */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * Some mappings aren't backed by a struct page, for example an mmap'd
+ * MMIO range for our own or another device.  These use a different
+ * pfn conversion and shouldn't be tracked as locked pages.
+ */
+static bool is_invalid_reserved_pfn(unsigned long pfn)
+{
+	if (pfn_valid(pfn)) {
+		bool reserved;
+		struct page *tail = pfn_to_page(pfn);
+		struct page *head = compound_trans_head(tail);
+		reserved = !!(PageReserved(head));
+		if (head != tail) {
+			/*
+			 * "head" is not a dangling pointer
+			 * (compound_trans_head takes care of that)
+			 * but the hugepage may have been split
+			 * from under us (and we may not hold a
+			 * reference count on the head page so it can
+			 * be reused before we run PageReferenced), so
+			 * we've to check PageTail before returning
+			 * what we just read.
+			 */
+			smp_rmb();
+			if (PageTail(tail))
+				return reserved;
+		}
+		return PageReserved(tail);
+	}
+
+	return true;
+}
+
+static int put_pfn(unsigned long pfn, int prot)
+{
+	if (!is_invalid_reserved_pfn(pfn)) {
+		struct page *page = pfn_to_page(pfn);
+		if (prot & IOMMU_WRITE)
+			SetPageDirty(page);
+		put_page(page);
+		return 1;
+	}
+	return 0;
+}
+
+static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+{
+	struct page *page[1];
+	struct vm_area_struct *vma;
+	int ret = -EFAULT;
+
+	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+		*pfn = page_to_pfn(page[0]);
+		return 0;
+	}
+
+	down_read(&current->mm->mmap_sem);
+
+	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+
+	if (vma && vma->vm_flags & VM_PFNMAP) {
+		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+		if (is_invalid_reserved_pfn(*pfn))
+			ret = 0;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+/*
+ * Attempt to pin pages.  We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+long vfio_pin_pages(unsigned long vaddr, long npage,
+			   int prot, unsigned long *pfn_base)
+{
+	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool lock_cap = capable(CAP_IPC_LOCK);
+	long ret, i;
+
+	if (!current->mm)
+		return -ENODEV;
+
+	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+	if (ret)
+		return ret;
+
+	if (is_invalid_reserved_pfn(*pfn_base))
+		return 1;
+
+	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+		put_pfn(*pfn_base, prot);
+		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+			limit << PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	if (unlikely(disable_hugepages)) {
+		vfio_lock_acct(1);
+		return 1;
+	}
+
+	/* Lock all the consecutive pages from pfn_base */
+	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
+		unsigned long pfn = 0;
+
+		ret = vaddr_get_pfn(vaddr, prot, &pfn);
+		if (ret)
+			break;
+
+		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+			put_pfn(pfn, prot);
+			break;
+		}
+
+		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+			put_pfn(pfn, prot);
+			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+				__func__, limit << PAGE_SHIFT);
+			break;
+		}
+	}
+
+	vfio_lock_acct(i);
+
+	return i;
+}
+
+long vfio_unpin_pages(unsigned long pfn, long npage,
+			     int prot, bool do_accounting)
+{
+	unsigned long unlocked = 0;
+	long i;
+
+	for (i = 0; i < npage; i++)
+		unlocked += put_pfn(pfn++, prot);
+
+	if (do_accounting)
+		vfio_lock_acct(-unlocked);
+
+	return unlocked;
+}
diff --git a/drivers/vfio/vfio_iommu_common.h b/drivers/vfio/vfio_iommu_common.h
new file mode 100644
index 0000000..2566ce6
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_common.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef _VFIO_IOMMU_COMMON_H
+#define _VFIO_IOMMU_COMMON_H
+
+void vfio_lock_acct(long npage);
+long vfio_pin_pages(unsigned long vaddr, long npage, int prot,
+		    unsigned long *pfn_base);
+long vfio_unpin_pages(unsigned long pfn, long npage,
+		      int prot, bool do_accounting);
+#endif
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a9807de..e9a58fa 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
+#include "vfio_iommu_common.h"
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -48,12 +49,6 @@ module_param_named(allow_unsafe_interrupts,
 MODULE_PARM_DESC(allow_unsafe_interrupts,
 		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
 
-static bool disable_hugepages;
-module_param_named(disable_hugepages,
-		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(disable_hugepages,
-		 "Disable VFIO IOMMU support for IOMMU hugepages.");
-
 struct vfio_iommu {
 	struct iommu_domain	*domain;
 	struct mutex		lock;
@@ -123,205 +118,6 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 	rb_erase(&old->node, &iommu->dma_list);
 }
 
-struct vwork {
-	struct mm_struct	*mm;
-	long			npage;
-	struct work_struct	work;
-};
-
-/* delayed decrement/increment for locked_vm */
-static void vfio_lock_acct_bg(struct work_struct *work)
-{
-	struct vwork *vwork = container_of(work, struct vwork, work);
-	struct mm_struct *mm;
-
-	mm = vwork->mm;
-	down_write(&mm->mmap_sem);
-	mm->locked_vm += vwork->npage;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-	kfree(vwork);
-}
-
-static void vfio_lock_acct(long npage)
-{
-	struct vwork *vwork;
-	struct mm_struct *mm;
-
-	if (!current->mm || !npage)
-		return; /* process exited or nothing to do */
-
-	if (down_write_trylock(&current->mm->mmap_sem)) {
-		current->mm->locked_vm += npage;
-		up_write(&current->mm->mmap_sem);
-		return;
-	}
-
-	/*
-	 * Couldn't get mmap_sem lock, so must setup to update
-	 * mm->locked_vm later. If locked_vm were atomic, we
-	 * wouldn't need this silliness
-	 */
-	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
-	if (!vwork)
-		return;
-	mm = get_task_mm(current);
-	if (!mm) {
-		kfree(vwork);
-		return;
-	}
-	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
-	vwork->mm = mm;
-	vwork->npage = npage;
-	schedule_work(&vwork->work);
-}
-
-/*
- * Some mappings aren't backed by a struct page, for example an mmap'd
- * MMIO range for our own or another device.  These use a different
- * pfn conversion and shouldn't be tracked as locked pages.
- */
-static bool is_invalid_reserved_pfn(unsigned long pfn)
-{
-	if (pfn_valid(pfn)) {
-		bool reserved;
-		struct page *tail = pfn_to_page(pfn);
-		struct page *head = compound_trans_head(tail);
-		reserved = !!(PageReserved(head));
-		if (head != tail) {
-			/*
-			 * "head" is not a dangling pointer
-			 * (compound_trans_head takes care of that)
-			 * but the hugepage may have been split
-			 * from under us (and we may not hold a
-			 * reference count on the head page so it can
-			 * be reused before we run PageReferenced), so
-			 * we've to check PageTail before returning
-			 * what we just read.
-			 */
-			smp_rmb();
-			if (PageTail(tail))
-				return reserved;
-		}
-		return PageReserved(tail);
-	}
-
-	return true;
-}
-
-static int put_pfn(unsigned long pfn, int prot)
-{
-	if (!is_invalid_reserved_pfn(pfn)) {
-		struct page *page = pfn_to_page(pfn);
-		if (prot & IOMMU_WRITE)
-			SetPageDirty(page);
-		put_page(page);
-		return 1;
-	}
-	return 0;
-}
-
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
-{
-	struct page *page[1];
-	struct vm_area_struct *vma;
-	int ret = -EFAULT;
-
-	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
-		*pfn = page_to_pfn(page[0]);
-		return 0;
-	}
-
-	down_read(&current->mm->mmap_sem);
-
-	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
-
-	if (vma && vma->vm_flags & VM_PFNMAP) {
-		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-		if (is_invalid_reserved_pfn(*pfn))
-			ret = 0;
-	}
-
-	up_read(&current->mm->mmap_sem);
-
-	return ret;
-}
-
-/*
- * Attempt to pin pages.  We really don't want to track all the pfns and
- * the iommu can only map chunks of consecutive pfns anyway, so get the
- * first page and all consecutive pages with the same locking.
- */
-static long vfio_pin_pages(unsigned long vaddr, long npage,
-			   int prot, unsigned long *pfn_base)
-{
-	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	bool lock_cap = capable(CAP_IPC_LOCK);
-	long ret, i;
-
-	if (!current->mm)
-		return -ENODEV;
-
-	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
-	if (ret)
-		return ret;
-
-	if (is_invalid_reserved_pfn(*pfn_base))
-		return 1;
-
-	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
-		put_pfn(*pfn_base, prot);
-		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
-			limit << PAGE_SHIFT);
-		return -ENOMEM;
-	}
-
-	if (unlikely(disable_hugepages)) {
-		vfio_lock_acct(1);
-		return 1;
-	}
-
-	/* Lock all the consecutive pages from pfn_base */
-	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
-		unsigned long pfn = 0;
-
-		ret = vaddr_get_pfn(vaddr, prot, &pfn);
-		if (ret)
-			break;
-
-		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
-			put_pfn(pfn, prot);
-			break;
-		}
-
-		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
-			put_pfn(pfn, prot);
-			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
-				__func__, limit << PAGE_SHIFT);
-			break;
-		}
-	}
-
-	vfio_lock_acct(i);
-
-	return i;
-}
-
-static long vfio_unpin_pages(unsigned long pfn, long npage,
-			     int prot, bool do_accounting)
-{
-	unsigned long unlocked = 0;
-	long i;
-
-	for (i = 0; i < npage; i++)
-		unlocked += put_pfn(pfn++, prot);
-
-	if (do_accounting)
-		vfio_lock_acct(-unlocked);
-
-	return unlocked;
-}
-
 static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 			    dma_addr_t iova, size_t *size)
 {
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 7/9 v2] pci: msi: Extend msi iova setting interface to powerpc arch
From: Bharat Bhushan @ 2013-11-19  5:17 UTC (permalink / raw)
  To: alex.williamson, joro, bhelgaas, agraf, scottwood, stuart.yoder,
	iommu, linux-pci, linuxppc-dev, linux-kernel
  Cc: Bharat Bhushan
In-Reply-To: <1384838233-24847-1-git-send-email-Bharat.Bhushan@freescale.com>

Now we Keep track of devices which have msi page mapping to specific
iova page for all msi bank. When composing MSI address and data then
this list will be traversed. If device found in the list then use
configured iova page otherwise iova page will be taken as before.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v2
 - new patch

 arch/powerpc/sysdev/fsl_msi.c |   90 +++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/sysdev/fsl_msi.h |   16 ++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index eeebbf0..52d2beb 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -137,6 +137,75 @@ static int fsl_msi_get_region(int region_num, struct msi_region *region)
 	return -ENODEV;
 }
 
+/* Add device to the list which have iova page mapping */
+static int fsl_msi_add_iova_device(struct fsl_msi *msi_data,
+				   struct pci_dev *pdev, dma_addr_t iova)
+{
+	struct fsl_msi_device *device;
+
+	mutex_lock(&msi_data->lock);
+	list_for_each_entry(device, &msi_data->device_list, list) {
+		/* If mapping already exits then update with new page mapping */
+		if (device->dev == pdev) {
+			device->iova = iova;
+			mutex_unlock(&msi_data->lock);
+			return 0;
+		}
+	}
+
+	device = kzalloc(sizeof(struct fsl_msi_device), GFP_KERNEL);
+	if (!device) {
+		pr_err("%s: Memory allocation failed\n", __func__);
+		mutex_unlock(&msi_data->lock);
+		return -ENOMEM;
+	}
+
+	device->dev = pdev;
+	device->iova = iova;
+	list_add_tail(&device->list, &msi_data->device_list);
+	mutex_unlock(&msi_data->lock);
+	return 0;
+}
+
+/* Remove device to the list which have iova page mapping */
+static int fsl_msi_del_iova_device(struct fsl_msi *msi_data,
+				   struct pci_dev *pdev)
+{
+	struct fsl_msi_device *device;
+
+	mutex_lock(&msi_data->lock);
+	list_for_each_entry(device, &msi_data->device_list, list) {
+		if (device->dev == pdev) {
+			list_del(&device->list);
+			kfree(device);
+			break;
+		}
+	}
+	mutex_unlock(&msi_data->lock);
+	return 0;
+}
+
+/* set/clear device iova mapping for the requested msi region */
+static int fsl_msi_set_iova(struct pci_dev *pdev, int region_num,
+			    dma_addr_t iova, bool set)
+{
+	struct fsl_msi *msi_data;
+	int ret = -EINVAL;
+
+	list_for_each_entry(msi_data, &msi_head, list) {
+		if (msi_data->bank_index != region_num)
+			continue;
+
+		if (set)
+			ret = fsl_msi_add_iova_device(msi_data, pdev, iova);
+		else
+			ret = fsl_msi_del_iova_device(msi_data, pdev);
+
+		break;
+	}
+	return ret;
+}
+
 static int fsl_msi_check_device(struct pci_dev *pdev, int nvec, int type)
 {
 	if (type == PCI_CAP_ID_MSIX)
@@ -167,6 +236,7 @@ static void fsl_compose_msi_msg(struct pci_dev *pdev, int hwirq,
 				struct msi_msg *msg,
 				struct fsl_msi *fsl_msi_data)
 {
+	struct fsl_msi_device *device;
 	struct fsl_msi *msi_data = fsl_msi_data;
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	u64 address; /* Physical address of the MSIIR */
@@ -181,6 +251,15 @@ static void fsl_compose_msi_msg(struct pci_dev *pdev, int hwirq,
 		address = fsl_pci_immrbar_base(hose) +
 			   (msi_data->msiir & 0xfffff);
 
+	mutex_lock(&msi_data->lock);
+	list_for_each_entry(device, &msi_data->device_list, list) {
+		if (device->dev == pdev) {
+			address = device->iova | (msi_data->msiir & 0xfff);
+			break;
+		}
+	}
+	mutex_unlock(&msi_data->lock);
+
 	msg->address_lo = lower_32_bits(address);
 	msg->address_hi = upper_32_bits(address);
 
@@ -356,6 +435,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 	struct fsl_msi *msi = platform_get_drvdata(ofdev);
 	int virq, i;
 	struct fsl_msi_cascade_data *cascade_data;
+	struct fsl_msi_device *device;
 
 	if (msi->list.prev != NULL)
 		list_del(&msi->list);
@@ -371,6 +451,13 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 		msi_bitmap_free(&msi->bitmap);
 	if ((msi->feature & FSL_PIC_IP_MASK) != FSL_PIC_IP_VMPIC)
 		iounmap(msi->msi_regs);
+
+	mutex_lock(&msi->lock);
+	list_for_each_entry(device, &msi->device_list, list) {
+		list_del(&device->list);
+		kfree(device);
+	}
+	mutex_unlock(&msi->lock);
 	kfree(msi);
 
 	return 0;
@@ -436,6 +523,8 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 		dev_err(&dev->dev, "No memory for MSI structure\n");
 		return -ENOMEM;
 	}
+	INIT_LIST_HEAD(&msi->device_list);
+	mutex_init(&msi->lock);
 	platform_set_drvdata(dev, msi);
 
 	msi->irqhost = irq_domain_add_linear(dev->dev.of_node,
@@ -558,6 +647,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 		ppc_md.msi_check_device = fsl_msi_check_device;
 		ppc_md.msi_get_region_count = fsl_msi_get_region_count;
 		ppc_md.msi_get_region = fsl_msi_get_region;
+		ppc_md.msi_set_iova = fsl_msi_set_iova;
 	} else if (ppc_md.setup_msi_irqs != fsl_setup_msi_irqs) {
 		dev_err(&dev->dev, "Different MSI driver already installed!\n");
 		err = -ENODEV;
diff --git a/arch/powerpc/sysdev/fsl_msi.h b/arch/powerpc/sysdev/fsl_msi.h
index a2cc5a2..4da2af9 100644
--- a/arch/powerpc/sysdev/fsl_msi.h
+++ b/arch/powerpc/sysdev/fsl_msi.h
@@ -27,9 +27,16 @@
 #define FSL_PIC_IP_IPIC   0x00000002
 #define FSL_PIC_IP_VMPIC  0x00000003
 
+/* List of devices having specific iova page mapping */
+struct fsl_msi_device {
+	struct list_head list;
+	struct pci_dev *dev;
+	dma_addr_t iova;
+};
+
 struct fsl_msi {
 	struct irq_domain *irqhost;
-
+	struct mutex lock;
 	unsigned long cascade_irq;
 	phys_addr_t msiir; /* MSIIR Address in CCSR */
 	u32 ibs_shift; /* Shift of interrupt bit select */
@@ -37,7 +44,12 @@ struct fsl_msi {
 	void __iomem *msi_regs;
 	u32 feature;
 	int msi_virqs[NR_MSI_REG_MAX];
-
+	/*
+	 * Keep track of devices which have msi page mapping to specific
+	 * iova page. Default this is NULL which means legacy way of
+	 * setting iova will be used.
+	 */
+	struct list_head device_list;
 	/*
 	 * During probe each bank is assigned a index number.
 	 * index number start from 0.
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 6/9 v2] powerpc: pci: Extend msi iova page setup to arch specific
From: Bharat Bhushan @ 2013-11-19  5:17 UTC (permalink / raw)
  To: alex.williamson, joro, bhelgaas, agraf, scottwood, stuart.yoder,
	iommu, linux-pci, linuxppc-dev, linux-kernel
  Cc: Bharat Bhushan
In-Reply-To: <1384838233-24847-1-git-send-email-Bharat.Bhushan@freescale.com>

This patch extend the interface to arch specific code for setting
msi iova address for a msi page. Machine specific code is not yet
implemented.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v2
 - new patch

 arch/powerpc/include/asm/machdep.h |    2 ++
 arch/powerpc/kernel/msi.c          |   10 ++++++++++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 8d1b787..e87b806 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -132,6 +132,8 @@ struct machdep_calls {
 	/* Returns the requested region's address and size */
 	int		(*msi_get_region)(int region_num,
 					  struct msi_region *region);
+	int		(*msi_set_iova)(struct pci_dev *pdev, int region_num,
+					dma_addr_t iova, bool set);
 #endif
 
 	void		(*restart)(char *cmd);
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 1a67787..e2bd555 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -13,6 +13,16 @@
 
 #include <asm/machdep.h>
 
+int arch_msi_set_iova(struct pci_dev *pdev, int region_num,
+		      dma_addr_t iova, bool set)
+{
+	if (ppc_md.msi_set_iova) {
+		pr_debug("msi: Using platform get_region_count routine.\n");
+		return ppc_md.msi_set_iova(pdev, region_num, iova, set);
+	}
+	return 0;
+}
+
 int arch_msi_get_region_count(void)
 {
 	if (ppc_md.msi_get_region_count) {
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 5/9 v2] pci/msi: interface to set an iova for a msi region
From: Bharat Bhushan @ 2013-11-19  5:17 UTC (permalink / raw)
  To: alex.williamson, joro, bhelgaas, agraf, scottwood, stuart.yoder,
	iommu, linux-pci, linuxppc-dev, linux-kernel
  Cc: Bharat Bhushan
In-Reply-To: <1384838233-24847-1-git-send-email-Bharat.Bhushan@freescale.com>

This patch defines an interface by which a msi page
can be mapped to a specific iova page.

This is a requirement in aperture type of IOMMUs (like Freescale PAMU),
where we map msi iova page just after guest memory iova address.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v2
 - new patch

 drivers/pci/msi.c   |   13 +++++++++++++
 include/linux/pci.h |    8 ++++++++
 2 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2643a29..040609f 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -77,6 +77,19 @@ int __weak arch_msi_get_region(int region_num, struct msi_region *region)
 	return 0;
 }
 
+int __weak arch_msi_set_iova(struct pci_dev *pdev, int region_num,
+			     dma_addr_t iova, bool set)
+{
+	return 0;
+}
+
+int msi_set_iova(struct pci_dev *pdev, int region_num,
+		 dma_addr_t iova, bool set)
+{
+	return arch_msi_set_iova(pdev, region_num, iova, set);
+}
+EXPORT_SYMBOL(msi_set_iova);
+
 int msi_get_region_count(void)
 {
 	return arch_msi_get_region_count();
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c587034..c6d3e58 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1195,6 +1195,12 @@ static inline int msi_get_region(int region_num, struct msi_region *region)
 {
 	return 0;
 }
+
+static inline int msi_set_iova(struct pci_dev *pdev, int region_num,
+			       dma_addr_t iova, bool set)
+{
+	return 0;
+}
 #else
 int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
 int pci_enable_msi_block_auto(struct pci_dev *dev, unsigned int *maxvec);
@@ -1209,6 +1215,8 @@ void pci_restore_msi_state(struct pci_dev *dev);
 int pci_msi_enabled(void);
 int msi_get_region_count(void);
 int msi_get_region(int region_num, struct msi_region *region);
+int msi_set_iova(struct pci_dev *pdev, int region_num,
+		 dma_addr_t iova, bool set);
 #endif
 
 #ifdef CONFIG_PCIEPORTBUS
-- 
1.7.0.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox