LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v2 1/4] powerpc/cputable: reserve bits in HWCAP2 for new features
From: Ryan Arnold @ 2013-05-07 15:07 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Michael Neuling, Nishanth Aravamudan, Steve Munroe, Peter Bergner,
	linuxppc-dev, Michael R Meissner
In-Reply-To: <1367876461.15842.66.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 938 bytes --]

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/06/2013
04:41:01 PM:

> Benjamin Herrenschmidt <benh@kernel.crashing.org>
> 05/06/2013 04:41 PM
>
> To
>
> Ryan Arnold/Rochester/IBM@IBMUS
>
> cc
>
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com>, linuxppc-
> dev@lists.ozlabs.org, michael@ellerman.id.au, Michael R Meissner/
> Cambridge/IBM@IBMUS, Michael Neuling <mikey@neuling.org>, Peter
> Bergner/Rochester/IBM@IBMUS, Steve Munroe/Rochester/IBM@IBMUS
>
> Subject
>
> Re: [PATCH v2 1/4] powerpc/cputable: reserve bits in HWCAP2 for new
features
>
> On Mon, 2013-05-06 at 14:07 -0500, Ryan Arnold wrote:
> > Notice that I changed DSCR to DSC.  The 'R' wasn't descriptive.
>
> The "R" is the name of the register for which we are exposing the
> availability to userspace... it's also the name of the sysfs entry so
> I'd rather keep it for consistency.

I'm fine with keeping the 'R' in the name.  Thanks for the input.

Ryan

[-- Attachment #2: Type: text/html, Size: 1554 bytes --]

^ permalink raw reply

* [PATCH][v2] powerpc: Bring all threads online prior to migration/hibernation
From: Robert Jennings @ 2013-05-07 14:34 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Nathan Fontenot, Robert Jennings, linuxppc-dev

This patch brings online all threads which are present but not online
prior to migration/hibernation.  After migration/hibernation those
threads are taken back offline.

During migration/hibernation all online CPUs must call H_JOIN, this is
required by the hypervisor.  Without this patch, threads that are offline
(H_CEDE'd) will not be woken to make the H_JOIN call and the OS will be
deadlocked (all threads either JOIN'd or CEDE'd).

Cc: <stable@kernel.org>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
---
v2: Fixed for UP build
---
 arch/powerpc/include/asm/rtas.h          |    2 +
 arch/powerpc/kernel/rtas.c               |  113 ++++++++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/suspend.c |   22 ++++++
 3 files changed, 137 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index a8bc2bb..34fd704 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -264,6 +264,8 @@ extern void rtas_progress(char *s, unsigned short hex);
 extern void rtas_initialize(void);
 extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data);
 extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data);
+extern int rtas_online_cpus_mask(cpumask_var_t cpus);
+extern int rtas_offline_cpus_mask(cpumask_var_t cpus);
 extern int rtas_ibm_suspend_me(struct rtas_args *);
 
 struct rtc_time;
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 1fd6e7b..52add6f 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
@@ -807,6 +808,95 @@ static void rtas_percpu_suspend_me(void *info)
 	__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
 }
 
+enum rtas_cpu_state {
+	DOWN,
+	UP,
+};
+
+#ifndef CONFIG_SMP
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	if (!cpumask_empty(cpus)) {
+		cpumask_clear(cpus);
+		return -EINVAL;
+	} else
+		return 0;
+}
+#else
+/* On return cpumask will be altered to indicate CPUs changed.
+ * CPUs with states changed will be set in the mask,
+ * CPUs with status unchanged will be unset in the mask. */
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	int cpu;
+	int cpuret = 0;
+	int ret = 0;
+
+	if (cpumask_empty(cpus))
+		return 0;
+
+	for_each_cpu(cpu, cpus) {
+		switch (state) {
+		case DOWN:
+			cpuret = cpu_down(cpu);
+			break;
+		case UP:
+			cpuret = cpu_up(cpu);
+			break;
+		}
+		if (cpuret) {
+			pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
+					__func__,
+					((state == UP) ? "up" : "down"),
+					cpu, cpuret);
+			if (!ret)
+				ret = cpuret;
+			if (state == UP) {
+				/* clear bits for unchanged cpus, return */
+				cpumask_shift_right(cpus, cpus, cpu);
+				cpumask_shift_left(cpus, cpus, cpu);
+				break;
+			} else {
+				/* clear bit for unchanged cpu, continue */
+				cpumask_clear_cpu(cpu, cpus);
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
+
+int rtas_online_cpus_mask(cpumask_var_t cpus)
+{
+	int ret;
+
+	ret = rtas_cpu_state_change_mask(UP, cpus);
+
+	if (ret) {
+		cpumask_var_t tmp_mask;
+
+		if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY))
+			return ret;
+
+		/* Use tmp_mask to preserve cpus mask from first failure */
+		cpumask_copy(tmp_mask, cpus);
+		rtas_offline_cpus_mask(tmp_mask);
+		free_cpumask_var(tmp_mask);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rtas_online_cpus_mask);
+
+int rtas_offline_cpus_mask(cpumask_var_t cpus)
+{
+	return rtas_cpu_state_change_mask(DOWN, cpus);
+}
+EXPORT_SYMBOL(rtas_offline_cpus_mask);
+
 int rtas_ibm_suspend_me(struct rtas_args *args)
 {
 	long state;
@@ -814,6 +904,8 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 	struct rtas_suspend_me_data data;
 	DECLARE_COMPLETION_ONSTACK(done);
+	cpumask_var_t offline_mask;
+	int cpuret;
 
 	if (!rtas_service_present("ibm,suspend-me"))
 		return -ENOSYS;
@@ -837,11 +929,24 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 		return 0;
 	}
 
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
 	atomic_set(&data.working, 0);
 	atomic_set(&data.done, 0);
 	atomic_set(&data.error, 0);
 	data.token = rtas_token("ibm,suspend-me");
 	data.complete = &done;
+
+	/* All present CPUs must be online */
+	cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
+	cpuret = rtas_online_cpus_mask(offline_mask);
+	if (cpuret) {
+		pr_err("%s: Could not bring present CPUs online.\n", __func__);
+		atomic_set(&data.error, cpuret);
+		goto out;
+	}
+
 	stop_topology_update();
 
 	/* Call function on all CPUs.  One of us will make the
@@ -857,6 +962,14 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 
 	start_topology_update();
 
+	/* Take down CPUs not online prior to suspend */
+	cpuret = rtas_offline_cpus_mask(offline_mask);
+	if (cpuret)
+		pr_warn("%s: Could not restore CPUs to offline state.\n",
+				__func__);
+
+out:
+	free_cpumask_var(offline_mask);
 	return atomic_read(&data.error);
 }
 #else /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index 47226e0..5f997e7 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -16,6 +16,7 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
   */
 
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/suspend.h>
 #include <linux/stat.h>
@@ -126,11 +127,15 @@ static ssize_t store_hibernate(struct device *dev,
 			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
+	cpumask_var_t offline_mask;
 	int rc;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
 	stream_id = simple_strtoul(buf, NULL, 16);
 
 	do {
@@ -140,15 +145,32 @@ static ssize_t store_hibernate(struct device *dev,
 	} while (rc == -EAGAIN);
 
 	if (!rc) {
+		/* All present CPUs must be online */
+		cpumask_andnot(offline_mask, cpu_present_mask,
+				cpu_online_mask);
+		rc = rtas_online_cpus_mask(offline_mask);
+		if (rc) {
+			pr_err("%s: Could not bring present CPUs online.\n",
+					__func__);
+			goto out;
+		}
+
 		stop_topology_update();
 		rc = pm_suspend(PM_SUSPEND_MEM);
 		start_topology_update();
+
+		/* Take down CPUs not online prior to suspend */
+		if (!rtas_offline_cpus_mask(offline_mask))
+			pr_warn("%s: Could not restore CPUs to offline "
+					"state.\n", __func__);
 	}
 
 	stream_id = 0;
 
 	if (!rc)
 		rc = count;
+out:
+	free_cpumask_var(offline_mask);
 	return rc;
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] powerpc/fsl_msi: fix error return code in fsl_of_msi_probe()
From: Wei Yongjun @ 2013-05-07 13:46 UTC (permalink / raw)
  To: benh, paulus, grant.likely, rob.herring, galak, u.kleine-koenig,
	agraf, gregkh
  Cc: yongjun_wei, linuxppc-dev, devicetree-discuss, linux-kernel

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Fix to return a negative error code in the MSI bitmap alloc error
handling case instead of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 arch/powerpc/sysdev/fsl_msi.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index ab02db3..f45556a 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -370,7 +370,6 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 	struct fsl_msi *msi;
 	struct resource res;
 	int err, i, j, irq_index, count;
-	int rc;
 	const u32 *p;
 	const struct fsl_msi_feature *features;
 	int len;
@@ -431,8 +430,8 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 	 */
 	msi->phandle = dev->dev.of_node->phandle;
 
-	rc = fsl_msi_init_allocator(msi);
-	if (rc) {
+	err = fsl_msi_init_allocator(msi);
+	if (err) {
 		dev_err(&dev->dev, "Error allocating MSI bitmap\n");
 		goto error_out;
 	}

^ permalink raw reply related

* [v1][KVM][PATCH 1/1] kvm:ppc:booehv: direct ISI exception to Guest
From: Tiejun Chen @ 2013-05-07 11:06 UTC (permalink / raw)
  To: agraf, scottwood; +Cc: linuxppc-dev, kvm, kvm-ppc

We also can direct ISI exception to Guest like DSI.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kvm/booke_emulate.c |    3 +++
 arch/powerpc/kvm/e500mc.c        |    3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 27a4b28..33b14e9 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -197,6 +197,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		break;
 	case SPRN_IVOR3:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR3, spr_val);
+#endif
 		break;
 	case SPRN_IVOR4:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index c3bdc0a..acf546a 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -123,6 +123,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	mtspr(SPRN_GIVPR, vcpu->arch.ivpr);
 	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+	mtspr(SPRN_GIVOR3, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]);
 	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
 	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0);
 	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1);
@@ -185,7 +186,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
 	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
-				 SPRN_EPCR_DUVD;
+				 SPRN_EPCR_DUVD | SPRN_EPCR_ISIGS;
 #ifdef CONFIG_64BIT
 	vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
 #endif
-- 
1.7.9.5

^ permalink raw reply related

* [v2][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with CONFIG_PPC_DOORBELL
From: Tiejun Chen @ 2013-05-07 10:23 UTC (permalink / raw)
  To: agraf; +Cc: linuxppc-dev, kvm, kvm-ppc

CONFIG_PPC_DOORBELL is enough to cover all variants.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kvm/booke.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1020119..62d4ece 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 		kvmppc_fill_pt_regs(&regs);
 		timer_interrupt(&regs);
 		break;
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+#if defined(CONFIG_PPC_DOORBELL)
 	case BOOKE_INTERRUPT_DOORBELL:
 		kvmppc_fill_pt_regs(&regs);
 		doorbell_exception(&regs);
-- 
1.7.9.5

^ permalink raw reply related

* Re: [v1][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with E500MC
From: tiejun.chen @ 2013-05-07 10:22 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <518859F5.7020503@windriver.com>

On 05/07/2013 09:33 AM, tiejun.chen wrote:
> On 05/06/2013 10:58 PM, Alexander Graf wrote:
>> On 05/06/2013 04:53 AM, Tiejun Chen wrote:
>>> Actually E500MC also support doorbell exception, and CONFIG_PPC_E500MC
>>> can cover BOOK3E/BOOK3E_64 as well.
>>>
>>> Signed-off-by: Tiejun Chen<tiejun.chen@windriver.com>
>>> ---
>>>   arch/powerpc/kvm/booke.c |    2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
>>> index 1020119..dc1f590 100644
>>> --- a/arch/powerpc/kvm/booke.c
>>> +++ b/arch/powerpc/kvm/booke.c
>>> @@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
>>>           kvmppc_fill_pt_regs(&regs);
>>>           timer_interrupt(&regs);
>>>           break;
>>> -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
>>> +#if defined(CONFIG_PPC_E500MC)
>>
>> I suppose you mean CONFIG_KVM_E500MC here? Why didn't this work for you before?
>
> This works for me.
>
> Here I just mean currently CONFIG_PPC_E500MC is always selected no matter what
> CONFIG_PPC_FSL_BOOK3E or CONFIG_PPC_BOOK3E_64 is enabled. And especially, this
> is already in the arch/powerpc/kvm/booke.c file, so I think one #ifdef
> (CONFIG_PPC_E500MC) is enough and also makes sense.
>
>> The ifdef above should cover the same range of CPUs.
>
> Or furthermore, the #ifdef CONFIG_PPC_DOORBELL is reasonable to cover this.
>

I think this may be better so I send next version with this. So please take a 
look at.

Tiejun

^ permalink raw reply

* Re: [PATCH v8 1/3] of/pci: Unify pci_process_bridge_OF_ranges from Microblaze and PowerPC
From: Benjamin Herrenschmidt @ 2013-05-07 10:19 UTC (permalink / raw)
  To: Andrew Murray
  Cc: linux-mips@linux-mips.org, siva.kallam@samsung.com,
	linux-pci@vger.kernel.org, linus.walleij@linaro.org,
	thierry.reding@avionic-design.de, Liviu Dudau, juhosg@openwrt.org,
	paulus@samba.org, linux-samsung-soc@vger.kernel.org,
	linux@arm.linux.org.uk, jg1.han@samsung.com,
	jgunthorpe@obsidianresearch.com, thomas.abraham@linaro.org,
	grant.likely@linaro.org, Rob Herring, arnd@arndb.de,
	devicetree-discuss@lists.ozlabs.org, kgene.kim@samsung.com,
	bhelgaas@google.com, linux-arm-kernel@lists.infradead.org,
	thomas.petazzoni@free-electrons.com, monstr@monstr.eu,
	linux-kernel@vger.kernel.org, suren.reddy@samsung.com,
	linuxppc-dev
In-Reply-To: <20130507080142.GA8808@arm.com>

On Tue, 2013-05-07 at 09:01 +0100, Andrew Murray wrote:
> 
> There were no objections to this latest revision until now and it is
> currently sitting with Jason Cooper (mvebu-next/pcie). [1]

Ok, well I've just sent Linus a pull request for my changes so at least
drop the powerpc changes from your tree for the time being.

> This is a view that was also shared by Bjorn [2] when I attempted to
> submit a patchset which moves struct pci_controller to asm-generic.

Right, it's the logical way to go

> The motativation for my patchsets were to give a way for ARM PCI host
> bridge drivers to parse the DT ranges property, but this snow-balled
> into unifying pci_process_bridge_OF_ranges.

Which I understand, I would probably have done the same thing in your
shoes :-)

> My v8 patchset provides a of_pci_range_parser which is used directly
> by a few ARM PCI DT host bridge drivers, this has been generally
> accepted and tested. I don't see why this can't remain and so I'd
> really like to keep this around. 

Sure, no objection, in fact I should/could probably update my new code
to use it as well.

> Grant, Benjamin would you be happy for me to resubmit this series
> which provides the of_pci_range_parser which will be used by the
> separate implementations of pci_process_bridge_OF_ranges in
> PowerPC/Microblaze?

Sure, in fact feel free to update my new code if you have more bandwidth
than I do, it should hit Linus tree soon hopefully unless he objects to
me having a second pull request this merge window...

> Benjamin are you able to still use of_pci_range_parser in your
> 'Support per-aperture memory offset' patch?

I see no reason why not. I just haven't looked into it much, I admit,
being bogged down with a pile of new HW bringup in the lab etc...

Cheers,
Ben.

^ permalink raw reply

* [git pull] Please pull powerpc.git merge branch
From: Benjamin Herrenschmidt @ 2013-05-07 10:10 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linuxppc-dev, Linux Kernel list

Hi Linus !

Here are a few more powerpc bits that I would like in 3.10. Mostly
remaining bolts & screw tightening of power8 support such as actually
exposing the new features via the previously added AT_HWCAP2, and a
few fixes, some of them for problems exposed recently like irqdomain
warnings or sysfs access permission issues, some exposed by power8
hardware.

The only change outside of arch/powerpc is a small one to irqdomain.c
to allow silent failure to fix a problem on Cell where we get a dozen
WARN_ON's tripping at boot for what is basically a normal case.

Grant Acked it but I forgot to put it in and didn't want to rebase.

Cheers,
Ben.

The following changes since commit 01227a889ed56ae53aeebb9f93be9d54dd8b2de8:

  Merge tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm (2013-05-05 14:47:31 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git test

for you to fetch changes up to 5737789c8340620d7b542d1d4e9b197de8eb2801:

  powerpc: Make hard_irq_disable() do the right thing vs. irq tracing (2013-05-07 17:13:57 +1000)

----------------------------------------------------------------
Anton Blanchard (1):
      powerpc: Emulate non privileged DSCR read and write

Benjamin Herrenschmidt (9):
      powerpc/powerpnv: Properly handle failure starting CPUs
      powerpc/pci: Don't add bogus empty resources to PHBs
      powerpc/pnv: Fix "compatible" property for P8 PHB
      irqdomain: Allow quiet failure mode
      powerpc/cell/spufs: Fix status attribute permission
      powerpc/cell/iommu: Improve error message for missing node
      powerpc/pci: Support per-aperture memory offset
      powerpc/topology: Fix spurr attribute permission
      powerpc: Make hard_irq_disable() do the right thing vs. irq tracing

Brian King (1):
      powerpc/pseries: Force 32 bit MSIs for devices that require it

Jeremy Kerr (1):
      powerpc/powernv: Defer OPAL exception handler registration

Kleber Sacilotto de Souza (1):
      powerpc/pseries: Perform proper max_bus_speed detection

Michael Neuling (1):
      powerpc/tm: Fix null pointer deference in flush_hash_page

Nishanth Aravamudan (4):
      powerpc/cputable: Reserve bits in HWCAP2 for new features
      powerpc/cputable: Advertise DSCR support on P7/P7+
      powerpc/cputable: Advertise ISEL support on appropriate embedded processors
      powerpc/cputable: Advertise support for ISEL/HTM/DSCR/TAR on POWER8

 arch/powerpc/include/asm/cputable.h         |    2 +
 arch/powerpc/include/asm/hw_irq.h           |   16 ++---
 arch/powerpc/include/asm/machdep.h          |    3 +
 arch/powerpc/include/asm/pci-bridge.h       |    8 +--
 arch/powerpc/include/asm/ppc-opcode.h       |    4 ++
 arch/powerpc/include/uapi/asm/cputable.h    |    9 +++
 arch/powerpc/kernel/cputable.c              |   14 ++++
 arch/powerpc/kernel/pci-common.c            |  101 ++++++++++-----------------
 arch/powerpc/kernel/pci_32.c                |    2 +-
 arch/powerpc/kernel/pci_64.c                |    2 +-
 arch/powerpc/kernel/sysfs.c                 |    2 +-
 arch/powerpc/kernel/traps.c                 |   10 ++-
 arch/powerpc/mm/hash_utils_64.c             |    1 +
 arch/powerpc/platforms/cell/iommu.c         |    2 +-
 arch/powerpc/platforms/cell/spu_base.c      |    2 +-
 arch/powerpc/platforms/embedded6xx/mpc10x.h |   11 ---
 arch/powerpc/platforms/powermac/pci.c       |    2 +-
 arch/powerpc/platforms/powernv/opal.c       |   15 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c   |   12 ++--
 arch/powerpc/platforms/powernv/smp.c        |    4 +-
 arch/powerpc/platforms/pseries/msi.c        |   21 +++++-
 arch/powerpc/platforms/pseries/pci.c        |   53 ++++++++++++++
 arch/powerpc/platforms/pseries/pseries.h    |    4 ++
 arch/powerpc/platforms/pseries/setup.c      |    2 +
 arch/powerpc/platforms/wsp/wsp_pci.c        |    2 +-
 arch/powerpc/sysdev/fsl_pci.c               |   11 +--
 arch/powerpc/sysdev/mpic.c                  |   14 +++-
 arch/powerpc/sysdev/ppc4xx_pci.c            |   15 ++--
 kernel/irq/irqdomain.c                      |   20 +++++-
 29 files changed, 236 insertions(+), 128 deletions(-)

^ permalink raw reply

* [PATCH] powerpc, cpu hotplug: Fix warning on boot regarding sysfs file permission
From: Srivatsa S. Bhat @ 2013-05-07  9:13 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel
  Cc: caiqian, gregkh, balbi, paulus, anton, Srivatsa S. Bhat

On boot, the following warning shows up on each CPU online operation:

------------[ cut here ]------------
WARNING: at drivers/base/core.c:575
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W    3.9.0+ #2
task: c0000006fa180000 ti: c0000006fa200000 task.ti: c0000006fa200000
NIP: c00000000053772c LR: c000000000537728 CTR: 0000000001764c5c
REGS: c0000006fa203780 TRAP: 0700   Tainted: G        W     (3.9.0+)
MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 28000024  XER: 00000002
SOFTE: 1
CFAR: c0000000008a6994

GPR00: c000000000537728 c0000006fa203a00 c000000000dafaf0 0000000000000020
GPR04: 0000000000000001 c000000000085d9c 0000000000000000 0000000000000002
GPR08: 0000000000000000 c0000006fa180000 c0000006fa180000 c000000000e38804
GPR12: 0000000000000000 c000000001db0000 c00000000000c1b0 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000001 c000000000e44880
GPR20: c000000000e44d18 c000000000c726a8 0000000000000000 0000000000000000
GPR24: c000000000bab158 000000000000007e c000000000e68200 c000000000c71d48
GPR28: c0000000130c0030 0000000000000002 c0000000130c0030 c000000000c71f58
NIP [c00000000053772c] .device_create_file+0xac/0xc0
LR [c000000000537728] .device_create_file+0xa8/0xc0
PACATMSCRATCH [8000000000009032]
Call Trace:
[c0000006fa203a00] [c000000000537728] .device_create_file+0xa8/0xc0 (unreliable)
[c0000006fa203a80] [c00000000089f3e4] .register_cpu_online+0x10c/0x238
[c0000006fa203b30] [c000000000bc9a80] .topology_init+0x1fc/0x200
[c0000006fa203c10] [c00000000000b870] .do_one_initcall+0x60/0x1e0
[c0000006fa203cd0] [c000000000bc3fc0] .kernel_init_freeable+0x22c/0x310
[c0000006fa203da0] [c00000000000c1cc] .kernel_init+0x1c/0x1c0
[c0000006fa203e30] [c00000000000a170] .ret_from_kernel_thread+0x64/0x74
Instruction dump:
0fe00000 4bffffb8 60000000 60000000 60000000 e8040028 2fa00000 409eff98
3c62ffd6 3863dc38 4836f221 60000000 <0fe00000> a13f0008 4bffff7c 60000000
---[ end trace b53dfda141d62ab0 ]---


The problem is that the 'spurr' per-cpu sysfs file sports a write permission
without the corresponding ->store() method. So remove the bogus write
permission to fix the warning.

Reported-by: CAI Qian <caiqian@redhat.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 arch/powerpc/kernel/sysfs.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 3ce1f86..e68a845 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -180,7 +180,7 @@ SYSFS_PMCSETUP(dscr, SPRN_DSCR);
 SYSFS_PMCSETUP(pir, SPRN_PIR);
 
 static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
-static DEVICE_ATTR(spurr, 0600, show_spurr, NULL);
+static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
 static DEVICE_ATTR(purr, 0600, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);

^ permalink raw reply related

* [PATCH v2] powerpc: Fix  MAX_STACK_TRACE_ENTRIES too low warning again
From: Li Zhong @ 2013-05-07  8:44 UTC (permalink / raw)
  To: PowerPC email list; +Cc: Paul Mackerras
In-Reply-To: <1367893978.3083.13.camel@ThinkPad-T5421>

Saw this warning again, and this time from the ret_from_fork path. 

It seems we could clear the back chain earlier in copy_thread(), which
could cover both path, and also fix potential lockdep usage in
schedule_tail(), or exception occurred before we clear the back chain.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
--- 
 arch/powerpc/kernel/entry_32.S |    2 --
 arch/powerpc/kernel/entry_64.S |    2 --
 arch/powerpc/kernel/process.c  |    1 +
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index e514de5..d22e73e 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -439,8 +439,6 @@ ret_from_fork:
 ret_from_kernel_thread:
 	REST_NVGPRS(r1)
 	bl	schedule_tail
-	li	r3,0
-	stw	r3,0(r1)
 	mtlr	r14
 	mr	r3,r15
 	PPC440EP_ERR42
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 3fe5259..48e8a86 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -376,8 +376,6 @@ _GLOBAL(ret_from_fork)
 _GLOBAL(ret_from_kernel_thread)
 	bl	.schedule_tail
 	REST_NVGPRS(r1)
-	li	r3,0
-	std	r3,0(r1)
 	ld	r14, 0(r14)
 	mtlr	r14
 	mr	r3,r15
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ceb4e7b..80af366 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -971,6 +971,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 	 * do some house keeping and then return from the fork or clone
 	 * system call, using the stack frame created above.
 	 */
+	((unsigned long *)sp)[0] = 0;
 	sp -= sizeof(struct pt_regs);
 	kregs = (struct pt_regs *) sp;
 	sp -= STACK_FRAME_OVERHEAD;

^ permalink raw reply related

* Re: [PATCH] powerpc: Fix  MAX_STACK_TRACE_ENTRIES too low warning again
From: Li Zhong @ 2013-05-07  8:29 UTC (permalink / raw)
  To: PowerPC email list; +Cc: Paul Mackerras
In-Reply-To: <1367893978.3083.13.camel@ThinkPad-T5421>

On Tue, 2013-05-07 at 10:32 +0800, Li Zhong wrote:
> Saw this warning again, and this time from the ret_from_fork path. 
> 
> It seems we could clear the back chain earlier in copy_thread(), which
> could cover both path, and also fix potential lockdep usage in
> schedule_tail(), or exception occurred before we clear the back chain.

Sorry, I made some mistake, please ignore this patch...
It seems clearing the back chain shouldn't use kregs->gpr[1] below, it
should be ((unsigned long*)sp)[0]. I'll send an updated version.

Thanks, Zhong

> 
> Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/entry_32.S |    2 --
>  arch/powerpc/kernel/entry_64.S |    2 --
>  arch/powerpc/kernel/process.c  |    1 +
>  3 files changed, 1 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
> index e514de5..d22e73e 100644
> --- a/arch/powerpc/kernel/entry_32.S
> +++ b/arch/powerpc/kernel/entry_32.S
> @@ -439,8 +439,6 @@ ret_from_fork:
>  ret_from_kernel_thread:
>  	REST_NVGPRS(r1)
>  	bl	schedule_tail
> -	li	r3,0
> -	stw	r3,0(r1)
>  	mtlr	r14
>  	mr	r3,r15
>  	PPC440EP_ERR42
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index 3fe5259..48e8a86 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -376,8 +376,6 @@ _GLOBAL(ret_from_fork)
>  _GLOBAL(ret_from_kernel_thread)
>  	bl	.schedule_tail
>  	REST_NVGPRS(r1)
> -	li	r3,0
> -	std	r3,0(r1)
>  	ld	r14, 0(r14)
>  	mtlr	r14
>  	mr	r3,r15
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index ceb4e7b..2c9fc5e 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -973,6 +973,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
>  	 */
>  	sp -= sizeof(struct pt_regs);
>  	kregs = (struct pt_regs *) sp;
> +	kregs->gpr[1] = 0;
>  	sp -= STACK_FRAME_OVERHEAD;
>  	p->thread.ksp = sp;
>  	p->thread.ksp_limit = (unsigned long)task_stack_page(p) +

^ permalink raw reply

* Re: [PATCH v8 1/3] of/pci: Unify pci_process_bridge_OF_ranges from Microblaze and PowerPC
From: Andrew Murray @ 2013-05-07  8:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-mips@linux-mips.org, siva.kallam@samsung.com,
	linux-pci@vger.kernel.org, linus.walleij@linaro.org,
	thierry.reding@avionic-design.de, Liviu Dudau, juhosg@openwrt.org,
	paulus@samba.org, linux-samsung-soc@vger.kernel.org,
	linux@arm.linux.org.uk, jg1.han@samsung.com,
	jgunthorpe@obsidianresearch.com, thomas.abraham@linaro.org,
	grant.likely@linaro.org, Rob Herring, arnd@arndb.de,
	devicetree-discuss@lists.ozlabs.org, kgene.kim@samsung.com,
	bhelgaas@google.com, linux-arm-kernel@lists.infradead.org,
	thomas.petazzoni@free-electrons.com, monstr@monstr.eu,
	linux-kernel@vger.kernel.org, suren.reddy@samsung.com,
	linuxppc-dev
In-Reply-To: <1367721709.11982.37.camel@pasglop>

On Sun, May 05, 2013 at 03:41:49AM +0100, Benjamin Herrenschmidt wrote:
> On Mon, 2013-04-22 at 11:41 +0100, Andrew Murray wrote:
> > The pci_process_bridge_OF_ranges function, used to parse the "ranges"
> > property of a PCI host device, is found in both Microblaze and PowerPC
> > architectures. These implementations are nearly identical. This patch
> > moves this common code to a common place.
> 
> What's happening with this ? I'd like to avoid that patch for now
> as I'm doing some changes to pci_process_bridge_OF_ranges
> which are fairly urgent (I might even stick them in the current
> merge window) to deal with memory windows having separate offsets.

There were no objections to this latest revision until now and it is
currently sitting with Jason Cooper (mvebu-next/pcie). [1]

> 
> There's also a few hacks in there that are really ppc specific...
> 
> I think the right long term approach is to change the way powerpc
> (and microblaze ?) initializes PCI host bridges. Move it away from
> setup_arch() (which is a PITA anyway since it's way too early) to
> an early init call of some sort, and encapsulate the new struct
> pci_host_bridge.
> 
> We can then directly configure the host bridge windows rather
> than having this "intermediary" set of resources in our pci_controller
> and in fact move most of the fields from pci_controller to
> pci_host_bridge to the point where the former can remain as a
> simple platform specific wrapper if needed.

This is a view that was also shared by Bjorn [2] when I attempted to
submit a patchset which moves struct pci_controller to asm-generic.

> 
> So for new stuff (hint: DT based ARM PCI) or stuff that has to deal with
> a lot less archaic platforms (hint: Microblaze), I'd recommend going
> straight for that approach rather than perpetuating the PowerPC code
> which I'll try to deal with in the next few monthes.

The motativation for my patchsets were to give a way for ARM PCI host
bridge drivers to parse the DT ranges property, but this snow-balled into
unifying pci_process_bridge_OF_ranges.

My v8 patchset provides a of_pci_range_parser which is used directly by a
few ARM PCI DT host bridge drivers, this has been generally accepted and
tested. I don't see why this can't remain and so I'd really like to keep this
around. 

Grant, Benjamin would you be happy for me to resubmit this series which provides
the of_pci_range_parser which will be used by the separate implementations of
pci_process_bridge_OF_ranges in PowerPC/Microblaze?

Benjamin are you able to still use of_pci_range_parser in your
'Support per-aperture memory offset' patch?

Thanks,

Andrew Murray

[1] https://lkml.org/lkml/2013/4/22/505
[2] https://patchwork.kernel.org/patch/2487671

> 
> Cheers,
> Ben.
>  
> 
> 

^ permalink raw reply

* Re: [PATCH] powerpc: Make hard_irq_disable() do the right thing vs. irq tracing
From: Benjamin Herrenschmidt @ 2013-05-07  7:25 UTC (permalink / raw)
  To: tiejun.chen; +Cc: Scott Wood, Mihai Caraman, linuxppc-dev, kvm-ppc
In-Reply-To: <5188AA2D.1040802@windriver.com>

On Tue, 2013-05-07 at 15:15 +0800, tiejun.chen wrote:
> Could we simplify this as follows:
> 
> +#define hard_irq_disable()     do {                    \
> +       __hard_irq_disable();                           \
> +       if (get_paca()->soft_enabled) {                 \
> +               trace_hardirqs_off();                   \
> +               get_paca()->soft_enabled = 0;           \
> +       }                                               \
> +       get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;  \
> +} while(0)

I'd rather not. When lockdep is not enabled trace_hardirqs_off() is a
nop and thus we have no conditional at all which is arguably better...

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] powerpc: Make hard_irq_disable() do the right thing vs. irq tracing
From: tiejun.chen @ 2013-05-07  7:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, Mihai Caraman, linuxppc-dev, kvm-ppc
In-Reply-To: <1367910242.5769.11.camel@pasglop>

On 05/07/2013 03:04 PM, Benjamin Herrenschmidt wrote:
> If hard_irq_disable() is called while interrupts are already soft-disabled
> (which is the most common case) all is already well.
>
> However you can (and in some cases want) to call it while everything is
> enabled (to make sure you don't get a lazy even, for example before entry
> into KVM guests) and in this case we need to inform the irq tracer that
> the irqs are going off.
>
> We have to change the inline into a macro to avoid an include circular
> dependency hell hole.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>
> Tested on pseries, Scott, I don't expect a problem with that patch especially
> since most callers already are soft disabled, so I'll merge it now along with
> my other pending stuff and you can simplify your KVM one.
>
> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
> index e45c494..d615b28 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -95,15 +95,13 @@ static inline bool arch_irqs_disabled(void)
>   #define __hard_irq_disable()	__mtmsrd(local_paca->kernel_msr, 1)
>   #endif
>
> -static inline void hard_irq_disable(void)
> -{
> -	__hard_irq_disable();
> -	get_paca()->soft_enabled = 0;
> -	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
> -}
> -
> -/* include/linux/interrupt.h needs hard_irq_disable to be a macro */
> -#define hard_irq_disable	hard_irq_disable
> +#define hard_irq_disable()	do {			\
> +	__hard_irq_disable();				\
> +	if (local_paca->soft_enabled)			\
> +		trace_hardirqs_off();			\
> +	get_paca()->soft_enabled = 0;			\

Could we simplify this as follows:

+#define hard_irq_disable()	do {			\
+	__hard_irq_disable();				\
+	if (get_paca()->soft_enabled) {			\
+		trace_hardirqs_off();			\
+		get_paca()->soft_enabled = 0;		\
+	}						\
+	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;	\
+} while(0)

Tiejun

> +	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;	\
> +} while(0)
>
>   static inline bool lazy_irq_pending(void)
>   {
>
>

^ permalink raw reply

* [PATCH] powerpc: Make hard_irq_disable() do the right thing vs. irq tracing
From: Benjamin Herrenschmidt @ 2013-05-07  7:04 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Scott Wood, Mihai Caraman, kvm-ppc, Tiejun Chen

If hard_irq_disable() is called while interrupts are already soft-disabled
(which is the most common case) all is already well.

However you can (and in some cases want) to call it while everything is
enabled (to make sure you don't get a lazy even, for example before entry
into KVM guests) and in this case we need to inform the irq tracer that
the irqs are going off.

We have to change the inline into a macro to avoid an include circular
dependency hell hole.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

Tested on pseries, Scott, I don't expect a problem with that patch especially
since most callers already are soft disabled, so I'll merge it now along with
my other pending stuff and you can simplify your KVM one.

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index e45c494..d615b28 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -95,15 +95,13 @@ static inline bool arch_irqs_disabled(void)
 #define __hard_irq_disable()	__mtmsrd(local_paca->kernel_msr, 1)
 #endif

-static inline void hard_irq_disable(void)
-{
-	__hard_irq_disable();
-	get_paca()->soft_enabled = 0;
-	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
-}
-
-/* include/linux/interrupt.h needs hard_irq_disable to be a macro */
-#define hard_irq_disable	hard_irq_disable
+#define hard_irq_disable()	do {			\
+	__hard_irq_disable();				\
+	if (local_paca->soft_enabled)			\
+		trace_hardirqs_off();			\
+	get_paca()->soft_enabled = 0;			\
+	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;	\
+} while(0)

 static inline bool lazy_irq_pending(void)
 {

^ permalink raw reply related

* Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
From: David Gibson @ 2013-05-07  6:54 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <51889EE5.5050807@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 1021 bytes --]

On Tue, May 07, 2013 at 04:27:49PM +1000, Alexey Kardashevskiy wrote:
> On 05/07/2013 04:02 PM, David Gibson wrote:
> > On Tue, May 07, 2013 at 03:51:31PM +1000, Alexey Kardashevskiy wrote:
> >> On 05/07/2013 03:29 PM, David Gibson wrote:
> >>> On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
[snip]
> >>>> +#define DRIVER_VERSION	"0.1"
> >>>> +#define DRIVER_AUTHOR	"Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>"
> >>>> +#define DRIVER_DESC	"POWERPC KVM driver"
> >>>
> >>> Really?
> >>
> >>
> >> What is wrong here?
> > 
> > Well, it seems entirely unrelated to the rest of the changes, 
> 
> 
> The patch adds a module parameter so I had to add those DRIVER_xxx.

Ah, ok.

> > and not obviously accurate.
> 
> Let's fix it then. How? Paul signed it...

Fair enough then.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
From: Alexey Kardashevskiy @ 2013-05-07  6:27 UTC (permalink / raw)
  To: David Gibson
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <20130507060240.GQ13692@truffula.fritz.box>

On 05/07/2013 04:02 PM, David Gibson wrote:
> On Tue, May 07, 2013 at 03:51:31PM +1000, Alexey Kardashevskiy wrote:
>> On 05/07/2013 03:29 PM, David Gibson wrote:
>>> On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
>>>> This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
>>>> and H_STUFF_TCE requests without passing them to QEMU, which should
>>>> save time on switching to QEMU and back.
>>>>
>>>> Both real and virtual modes are supported - whenever the kernel
>>>> fails to handle TCE request, it passes it to the virtual mode.
>>>> If it the virtual mode handlers fail, then the request is passed
>>>> to the user mode, for example, to QEMU.
>>>>
>>>> This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate
>>>> a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
>>>> in-kernel handling of IOMMU map/unmap.
>>>>
>>>> This adds a special case for huge pages (16MB).  The reference
>>>> counting cannot be easily done for such pages in real mode (when
>>>> MMU is off) so we added a list of huge pages.  It is populated in
>>>> virtual mode and get_page is called just once per a huge page.
>>>> Real mode handlers check if the requested page is huge and in the list,
>>>> then no reference counting is done, otherwise an exit to virtual mode
>>>> happens.  The list is released at KVM exit.  At the moment the fastest
>>>> card available for tests uses up to 9 huge pages so walking through this
>>>> list is not very expensive.  However this can change and we may want
>>>> to optimize this.
>>>>
>>>> This also adds the virt_only parameter to the KVM module
>>>> for debug and performance check purposes.
>>>>
>>>> Tests show that this patch increases transmission speed from 220MB/s
>>>> to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).
>>>>
>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>>>> ---
>>>>  Documentation/virtual/kvm/api.txt   |   28 ++++
>>>>  arch/powerpc/include/asm/kvm_host.h |    2 +
>>>>  arch/powerpc/include/asm/kvm_ppc.h  |    2 +
>>>>  arch/powerpc/include/uapi/asm/kvm.h |    7 +
>>>>  arch/powerpc/kvm/book3s_64_vio.c    |  242 ++++++++++++++++++++++++++++++++++-
>>>>  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++++++++++++++++++++++++++
>>>>  arch/powerpc/kvm/powerpc.c          |   12 ++
>>>>  include/uapi/linux/kvm.h            |    2 +
>>>>  8 files changed, 485 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>>> index f621cd6..2039767 100644
>>>> --- a/Documentation/virtual/kvm/api.txt
>>>> +++ b/Documentation/virtual/kvm/api.txt
>>>> @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously
>>>>  valid entries found.
>>>>  
>>>>  
>>>> +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
>>>> +
>>>> +Capability: KVM_CAP_SPAPR_TCE_IOMMU
>>>> +Architectures: powerpc
>>>> +Type: vm ioctl
>>>> +Parameters: struct kvm_create_spapr_tce_iommu (in)
>>>> +Returns: 0 on success, -1 on error
>>>> +
>>>> +This creates a link between IOMMU group and a hardware TCE (translation
>>>> +control entry) table. This link lets the host kernel know what IOMMU
>>>> +group (i.e. TCE table) to use for the LIOBN number passed with
>>>> +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
>>>> +
>>>> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
>>>> +struct kvm_create_spapr_tce_iommu {
>>>> +	__u64 liobn;
>>>> +	__u32 iommu_id;
>>>
>>> Wouldn't it be more in keeping 
>>
>>
>> pardon?
> 
> Sorry, I was going to suggest a change, but then realised it wasn't
> actually any better than what you have now.
> 
>>>> +	__u32 flags;
>>>> +};
>>>> +
>>>> +No flag is supported at the moment.
>>>> +
>>>> +When the guest issues TCE call on a liobn for which a TCE table has been
>>>> +registered, the kernel will handle it in real mode, updating the hardware
>>>> +TCE table. TCE table calls for other liobns will cause a vm exit and must
>>>> +be handled by userspace.
>>>> +
>>>> +
>>>>  5. The kvm_run structure
>>>>  ------------------------
>>>>  
>>>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>>>> index 36ceb0d..2b70cbc 100644
>>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>>> @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
>>>>  	struct kvm *kvm;
>>>>  	u64 liobn;
>>>>  	u32 window_size;
>>>> +	bool virtmode_only;
>>>
>>> I see this is now initialized from the global parameter, but I think
>>> it would be better to just check the global (debug) parameter
>>> directly, rather than duplicating it here.
>>
>>
>> The global parameter is in kvm.ko and the struct above is in the real mode
>> part which cannot go to the module.
> 
> Ah, ok.  I'm half inclined to just drop the virtmode_only thing
> entirely.
> 
>>>> +	struct iommu_group *grp;    /* used for IOMMU groups */
>>>>  	struct page *pages[0];
>>>>  };
>>>>  
>>>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
>>>> index d501246..bdfa140 100644
>>>> --- a/arch/powerpc/include/asm/kvm_ppc.h
>>>> +++ b/arch/powerpc/include/asm/kvm_ppc.h
>>>> @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
>>>>  
>>>>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>>>>  				struct kvm_create_spapr_tce *args);
>>>> +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>>>> +				struct kvm_create_spapr_tce_iommu *args);
>>>>  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
>>>>  		struct kvm_vcpu *vcpu, unsigned long liobn);
>>>>  extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
>>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>>> index 681b314..b67d44b 100644
>>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>>> @@ -291,6 +291,13 @@ struct kvm_create_spapr_tce {
>>>>  	__u32 window_size;
>>>>  };
>>>>  
>>>> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
>>>> +struct kvm_create_spapr_tce_iommu {
>>>> +	__u64 liobn;
>>>> +	__u32 iommu_id;
>>>> +	__u32 flags;
>>>> +};
>>>> +
>>>>  /* for KVM_ALLOCATE_RMA */
>>>>  struct kvm_allocate_rma {
>>>>  	__u64 rma_size;
>>>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
>>>> index 643ac1e..98cf949 100644
>>>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>>>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>>>> @@ -27,6 +27,9 @@
>>>>  #include <linux/hugetlb.h>
>>>>  #include <linux/list.h>
>>>>  #include <linux/anon_inodes.h>
>>>> +#include <linux/pci.h>
>>>> +#include <linux/iommu.h>
>>>> +#include <linux/module.h>
>>>>  
>>>>  #include <asm/tlbflush.h>
>>>>  #include <asm/kvm_ppc.h>
>>>> @@ -38,10 +41,19 @@
>>>>  #include <asm/kvm_host.h>
>>>>  #include <asm/udbg.h>
>>>>  #include <asm/iommu.h>
>>>> +#include <asm/tce.h>
>>>> +
>>>> +#define DRIVER_VERSION	"0.1"
>>>> +#define DRIVER_AUTHOR	"Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>"
>>>> +#define DRIVER_DESC	"POWERPC KVM driver"
>>>
>>> Really?
>>
>>
>> What is wrong here?
> 
> Well, it seems entirely unrelated to the rest of the changes, 


The patch adds a module parameter so I had to add those DRIVER_xxx.


> and not obviously accurate.

Let's fix it then. How? Paul signed it...



-- 
Alexey

^ permalink raw reply

* Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
From: David Gibson @ 2013-05-07  6:02 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <51889663.5060909@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 7585 bytes --]

On Tue, May 07, 2013 at 03:51:31PM +1000, Alexey Kardashevskiy wrote:
> On 05/07/2013 03:29 PM, David Gibson wrote:
> > On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
> >> This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
> >> and H_STUFF_TCE requests without passing them to QEMU, which should
> >> save time on switching to QEMU and back.
> >>
> >> Both real and virtual modes are supported - whenever the kernel
> >> fails to handle TCE request, it passes it to the virtual mode.
> >> If it the virtual mode handlers fail, then the request is passed
> >> to the user mode, for example, to QEMU.
> >>
> >> This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate
> >> a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
> >> in-kernel handling of IOMMU map/unmap.
> >>
> >> This adds a special case for huge pages (16MB).  The reference
> >> counting cannot be easily done for such pages in real mode (when
> >> MMU is off) so we added a list of huge pages.  It is populated in
> >> virtual mode and get_page is called just once per a huge page.
> >> Real mode handlers check if the requested page is huge and in the list,
> >> then no reference counting is done, otherwise an exit to virtual mode
> >> happens.  The list is released at KVM exit.  At the moment the fastest
> >> card available for tests uses up to 9 huge pages so walking through this
> >> list is not very expensive.  However this can change and we may want
> >> to optimize this.
> >>
> >> This also adds the virt_only parameter to the KVM module
> >> for debug and performance check purposes.
> >>
> >> Tests show that this patch increases transmission speed from 220MB/s
> >> to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).
> >>
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> Signed-off-by: Paul Mackerras <paulus@samba.org>
> >> ---
> >>  Documentation/virtual/kvm/api.txt   |   28 ++++
> >>  arch/powerpc/include/asm/kvm_host.h |    2 +
> >>  arch/powerpc/include/asm/kvm_ppc.h  |    2 +
> >>  arch/powerpc/include/uapi/asm/kvm.h |    7 +
> >>  arch/powerpc/kvm/book3s_64_vio.c    |  242 ++++++++++++++++++++++++++++++++++-
> >>  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++++++++++++++++++++++++++
> >>  arch/powerpc/kvm/powerpc.c          |   12 ++
> >>  include/uapi/linux/kvm.h            |    2 +
> >>  8 files changed, 485 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> >> index f621cd6..2039767 100644
> >> --- a/Documentation/virtual/kvm/api.txt
> >> +++ b/Documentation/virtual/kvm/api.txt
> >> @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously
> >>  valid entries found.
> >>  
> >>  
> >> +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
> >> +
> >> +Capability: KVM_CAP_SPAPR_TCE_IOMMU
> >> +Architectures: powerpc
> >> +Type: vm ioctl
> >> +Parameters: struct kvm_create_spapr_tce_iommu (in)
> >> +Returns: 0 on success, -1 on error
> >> +
> >> +This creates a link between IOMMU group and a hardware TCE (translation
> >> +control entry) table. This link lets the host kernel know what IOMMU
> >> +group (i.e. TCE table) to use for the LIOBN number passed with
> >> +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
> >> +
> >> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
> >> +struct kvm_create_spapr_tce_iommu {
> >> +	__u64 liobn;
> >> +	__u32 iommu_id;
> > 
> > Wouldn't it be more in keeping 
> 
> 
> pardon?

Sorry, I was going to suggest a change, but then realised it wasn't
actually any better than what you have now.

> >> +	__u32 flags;
> >> +};
> >> +
> >> +No flag is supported at the moment.
> >> +
> >> +When the guest issues TCE call on a liobn for which a TCE table has been
> >> +registered, the kernel will handle it in real mode, updating the hardware
> >> +TCE table. TCE table calls for other liobns will cause a vm exit and must
> >> +be handled by userspace.
> >> +
> >> +
> >>  5. The kvm_run structure
> >>  ------------------------
> >>  
> >> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> >> index 36ceb0d..2b70cbc 100644
> >> --- a/arch/powerpc/include/asm/kvm_host.h
> >> +++ b/arch/powerpc/include/asm/kvm_host.h
> >> @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
> >>  	struct kvm *kvm;
> >>  	u64 liobn;
> >>  	u32 window_size;
> >> +	bool virtmode_only;
> > 
> > I see this is now initialized from the global parameter, but I think
> > it would be better to just check the global (debug) parameter
> > directly, rather than duplicating it here.
> 
> 
> The global parameter is in kvm.ko and the struct above is in the real mode
> part which cannot go to the module.

Ah, ok.  I'm half inclined to just drop the virtmode_only thing
entirely.

> >> +	struct iommu_group *grp;    /* used for IOMMU groups */
> >>  	struct page *pages[0];
> >>  };
> >>  
> >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> >> index d501246..bdfa140 100644
> >> --- a/arch/powerpc/include/asm/kvm_ppc.h
> >> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> >> @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
> >>  
> >>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> >>  				struct kvm_create_spapr_tce *args);
> >> +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
> >> +				struct kvm_create_spapr_tce_iommu *args);
> >>  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
> >>  		struct kvm_vcpu *vcpu, unsigned long liobn);
> >>  extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >> index 681b314..b67d44b 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -291,6 +291,13 @@ struct kvm_create_spapr_tce {
> >>  	__u32 window_size;
> >>  };
> >>  
> >> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
> >> +struct kvm_create_spapr_tce_iommu {
> >> +	__u64 liobn;
> >> +	__u32 iommu_id;
> >> +	__u32 flags;
> >> +};
> >> +
> >>  /* for KVM_ALLOCATE_RMA */
> >>  struct kvm_allocate_rma {
> >>  	__u64 rma_size;
> >> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> >> index 643ac1e..98cf949 100644
> >> --- a/arch/powerpc/kvm/book3s_64_vio.c
> >> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> >> @@ -27,6 +27,9 @@
> >>  #include <linux/hugetlb.h>
> >>  #include <linux/list.h>
> >>  #include <linux/anon_inodes.h>
> >> +#include <linux/pci.h>
> >> +#include <linux/iommu.h>
> >> +#include <linux/module.h>
> >>  
> >>  #include <asm/tlbflush.h>
> >>  #include <asm/kvm_ppc.h>
> >> @@ -38,10 +41,19 @@
> >>  #include <asm/kvm_host.h>
> >>  #include <asm/udbg.h>
> >>  #include <asm/iommu.h>
> >> +#include <asm/tce.h>
> >> +
> >> +#define DRIVER_VERSION	"0.1"
> >> +#define DRIVER_AUTHOR	"Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>"
> >> +#define DRIVER_DESC	"POWERPC KVM driver"
> > 
> > Really?
> 
> 
> What is wrong here?

Well, it seems entirely unrelated to the rest of the changes, and not
obviously accurate.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
From: Alexey Kardashevskiy @ 2013-05-07  5:51 UTC (permalink / raw)
  To: David Gibson
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <20130507052933.GP13692@truffula.fritz.box>

On 05/07/2013 03:29 PM, David Gibson wrote:
> On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
>> This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
>> and H_STUFF_TCE requests without passing them to QEMU, which should
>> save time on switching to QEMU and back.
>>
>> Both real and virtual modes are supported - whenever the kernel
>> fails to handle TCE request, it passes it to the virtual mode.
>> If it the virtual mode handlers fail, then the request is passed
>> to the user mode, for example, to QEMU.
>>
>> This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate
>> a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
>> in-kernel handling of IOMMU map/unmap.
>>
>> This adds a special case for huge pages (16MB).  The reference
>> counting cannot be easily done for such pages in real mode (when
>> MMU is off) so we added a list of huge pages.  It is populated in
>> virtual mode and get_page is called just once per a huge page.
>> Real mode handlers check if the requested page is huge and in the list,
>> then no reference counting is done, otherwise an exit to virtual mode
>> happens.  The list is released at KVM exit.  At the moment the fastest
>> card available for tests uses up to 9 huge pages so walking through this
>> list is not very expensive.  However this can change and we may want
>> to optimize this.
>>
>> This also adds the virt_only parameter to the KVM module
>> for debug and performance check purposes.
>>
>> Tests show that this patch increases transmission speed from 220MB/s
>> to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>> ---
>>  Documentation/virtual/kvm/api.txt   |   28 ++++
>>  arch/powerpc/include/asm/kvm_host.h |    2 +
>>  arch/powerpc/include/asm/kvm_ppc.h  |    2 +
>>  arch/powerpc/include/uapi/asm/kvm.h |    7 +
>>  arch/powerpc/kvm/book3s_64_vio.c    |  242 ++++++++++++++++++++++++++++++++++-
>>  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++++++++++++++++++++++++++
>>  arch/powerpc/kvm/powerpc.c          |   12 ++
>>  include/uapi/linux/kvm.h            |    2 +
>>  8 files changed, 485 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index f621cd6..2039767 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously
>>  valid entries found.
>>  
>>  
>> +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
>> +
>> +Capability: KVM_CAP_SPAPR_TCE_IOMMU
>> +Architectures: powerpc
>> +Type: vm ioctl
>> +Parameters: struct kvm_create_spapr_tce_iommu (in)
>> +Returns: 0 on success, -1 on error
>> +
>> +This creates a link between IOMMU group and a hardware TCE (translation
>> +control entry) table. This link lets the host kernel know what IOMMU
>> +group (i.e. TCE table) to use for the LIOBN number passed with
>> +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
>> +
>> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
>> +struct kvm_create_spapr_tce_iommu {
>> +	__u64 liobn;
>> +	__u32 iommu_id;
> 
> Wouldn't it be more in keeping 


pardon?



>> +	__u32 flags;
>> +};
>> +
>> +No flag is supported at the moment.
>> +
>> +When the guest issues TCE call on a liobn for which a TCE table has been
>> +registered, the kernel will handle it in real mode, updating the hardware
>> +TCE table. TCE table calls for other liobns will cause a vm exit and must
>> +be handled by userspace.
>> +
>> +
>>  5. The kvm_run structure
>>  ------------------------
>>  
>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>> index 36ceb0d..2b70cbc 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
>>  	struct kvm *kvm;
>>  	u64 liobn;
>>  	u32 window_size;
>> +	bool virtmode_only;
> 
> I see this is now initialized from the global parameter, but I think
> it would be better to just check the global (debug) parameter
> directly, rather than duplicating it here.


The global parameter is in kvm.ko and the struct above is in the real mode
part which cannot go to the module.



>> +	struct iommu_group *grp;    /* used for IOMMU groups */
>>  	struct page *pages[0];
>>  };
>>  
>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
>> index d501246..bdfa140 100644
>> --- a/arch/powerpc/include/asm/kvm_ppc.h
>> +++ b/arch/powerpc/include/asm/kvm_ppc.h
>> @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
>>  
>>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>>  				struct kvm_create_spapr_tce *args);
>> +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>> +				struct kvm_create_spapr_tce_iommu *args);
>>  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
>>  		struct kvm_vcpu *vcpu, unsigned long liobn);
>>  extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index 681b314..b67d44b 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -291,6 +291,13 @@ struct kvm_create_spapr_tce {
>>  	__u32 window_size;
>>  };
>>  
>> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
>> +struct kvm_create_spapr_tce_iommu {
>> +	__u64 liobn;
>> +	__u32 iommu_id;
>> +	__u32 flags;
>> +};
>> +
>>  /* for KVM_ALLOCATE_RMA */
>>  struct kvm_allocate_rma {
>>  	__u64 rma_size;
>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
>> index 643ac1e..98cf949 100644
>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>> @@ -27,6 +27,9 @@
>>  #include <linux/hugetlb.h>
>>  #include <linux/list.h>
>>  #include <linux/anon_inodes.h>
>> +#include <linux/pci.h>
>> +#include <linux/iommu.h>
>> +#include <linux/module.h>
>>  
>>  #include <asm/tlbflush.h>
>>  #include <asm/kvm_ppc.h>
>> @@ -38,10 +41,19 @@
>>  #include <asm/kvm_host.h>
>>  #include <asm/udbg.h>
>>  #include <asm/iommu.h>
>> +#include <asm/tce.h>
>> +
>> +#define DRIVER_VERSION	"0.1"
>> +#define DRIVER_AUTHOR	"Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>"
>> +#define DRIVER_DESC	"POWERPC KVM driver"
> 
> Really?


What is wrong here?



-- 
Alexey

^ permalink raw reply

* Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
From: David Gibson @ 2013-05-07  5:29 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <1367825157-27231-6-git-send-email-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 21989 bytes --]

On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
> This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
> and H_STUFF_TCE requests without passing them to QEMU, which should
> save time on switching to QEMU and back.
> 
> Both real and virtual modes are supported - whenever the kernel
> fails to handle TCE request, it passes it to the virtual mode.
> If it the virtual mode handlers fail, then the request is passed
> to the user mode, for example, to QEMU.
> 
> This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate
> a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
> in-kernel handling of IOMMU map/unmap.
> 
> This adds a special case for huge pages (16MB).  The reference
> counting cannot be easily done for such pages in real mode (when
> MMU is off) so we added a list of huge pages.  It is populated in
> virtual mode and get_page is called just once per a huge page.
> Real mode handlers check if the requested page is huge and in the list,
> then no reference counting is done, otherwise an exit to virtual mode
> happens.  The list is released at KVM exit.  At the moment the fastest
> card available for tests uses up to 9 huge pages so walking through this
> list is not very expensive.  However this can change and we may want
> to optimize this.
> 
> This also adds the virt_only parameter to the KVM module
> for debug and performance check purposes.
> 
> Tests show that this patch increases transmission speed from 220MB/s
> to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
>  Documentation/virtual/kvm/api.txt   |   28 ++++
>  arch/powerpc/include/asm/kvm_host.h |    2 +
>  arch/powerpc/include/asm/kvm_ppc.h  |    2 +
>  arch/powerpc/include/uapi/asm/kvm.h |    7 +
>  arch/powerpc/kvm/book3s_64_vio.c    |  242 ++++++++++++++++++++++++++++++++++-
>  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++++++++++++++++++++++++++
>  arch/powerpc/kvm/powerpc.c          |   12 ++
>  include/uapi/linux/kvm.h            |    2 +
>  8 files changed, 485 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index f621cd6..2039767 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously
>  valid entries found.
>  
>  
> +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
> +
> +Capability: KVM_CAP_SPAPR_TCE_IOMMU
> +Architectures: powerpc
> +Type: vm ioctl
> +Parameters: struct kvm_create_spapr_tce_iommu (in)
> +Returns: 0 on success, -1 on error
> +
> +This creates a link between IOMMU group and a hardware TCE (translation
> +control entry) table. This link lets the host kernel know what IOMMU
> +group (i.e. TCE table) to use for the LIOBN number passed with
> +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
> +
> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
> +struct kvm_create_spapr_tce_iommu {
> +	__u64 liobn;
> +	__u32 iommu_id;

Wouldn't it be more in keeping 

> +	__u32 flags;
> +};
> +
> +No flag is supported at the moment.
> +
> +When the guest issues TCE call on a liobn for which a TCE table has been
> +registered, the kernel will handle it in real mode, updating the hardware
> +TCE table. TCE table calls for other liobns will cause a vm exit and must
> +be handled by userspace.
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 36ceb0d..2b70cbc 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
>  	struct kvm *kvm;
>  	u64 liobn;
>  	u32 window_size;
> +	bool virtmode_only;

I see this is now initialized from the global parameter, but I think
it would be better to just check the global (debug) parameter
directly, rather than duplicating it here.

> +	struct iommu_group *grp;    /* used for IOMMU groups */
>  	struct page *pages[0];
>  };
>  
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index d501246..bdfa140 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
>  
>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  				struct kvm_create_spapr_tce *args);
> +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
> +				struct kvm_create_spapr_tce_iommu *args);
>  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
>  		struct kvm_vcpu *vcpu, unsigned long liobn);
>  extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index 681b314..b67d44b 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -291,6 +291,13 @@ struct kvm_create_spapr_tce {
>  	__u32 window_size;
>  };
>  
> +/* for KVM_CAP_SPAPR_TCE_IOMMU */
> +struct kvm_create_spapr_tce_iommu {
> +	__u64 liobn;
> +	__u32 iommu_id;
> +	__u32 flags;
> +};
> +
>  /* for KVM_ALLOCATE_RMA */
>  struct kvm_allocate_rma {
>  	__u64 rma_size;
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 643ac1e..98cf949 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -27,6 +27,9 @@
>  #include <linux/hugetlb.h>
>  #include <linux/list.h>
>  #include <linux/anon_inodes.h>
> +#include <linux/pci.h>
> +#include <linux/iommu.h>
> +#include <linux/module.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/kvm_ppc.h>
> @@ -38,10 +41,19 @@
>  #include <asm/kvm_host.h>
>  #include <asm/udbg.h>
>  #include <asm/iommu.h>
> +#include <asm/tce.h>
> +
> +#define DRIVER_VERSION	"0.1"
> +#define DRIVER_AUTHOR	"Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>"
> +#define DRIVER_DESC	"POWERPC KVM driver"

Really?

>  
>  #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
>  #define ERROR_ADDR      (~(unsigned long)0x0)
>  
> +static bool kvmppc_tce_virt_only = false;
> +module_param_named(virt_only, kvmppc_tce_virt_only, bool, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(virt_only, "Disable realmode handling of IOMMU map/unmap");
> +
>  /*
>   * TCE tables handlers.
>   */
> @@ -58,8 +70,13 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
>  
>  	mutex_lock(&kvm->lock);
>  	list_del(&stt->list);
> -	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
> -		__free_page(stt->pages[i]);
> +#ifdef CONFIG_IOMMU_API
> +	if (stt->grp) {
> +		iommu_group_put(stt->grp);
> +	} else
> +#endif
> +		for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
> +			__free_page(stt->pages[i]);
>  	kfree(stt);
>  	mutex_unlock(&kvm->lock);
>  
> @@ -155,9 +172,127 @@ fail:
>  	return ret;
>  }
>  
> +#ifdef CONFIG_IOMMU_API
> +static const struct file_operations kvm_spapr_tce_iommu_fops = {
> +	.release	= kvm_spapr_tce_release,
> +};
> +
> +long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
> +		struct kvm_create_spapr_tce_iommu *args)
> +{
> +	struct kvmppc_spapr_tce_table *tt = NULL;
> +	struct iommu_group *grp;
> +	struct iommu_table *tbl;
> +
> +	/* Find an IOMMU table for the given ID */
> +	grp = iommu_group_get_by_id(args->iommu_id);
> +	if (!grp)
> +		return -ENXIO;
> +
> +	tbl = iommu_group_get_iommudata(grp);
> +	if (!tbl)
> +		return -ENXIO;
> +
> +	/* Check this LIOBN hasn't been previously allocated */
> +	list_for_each_entry(tt, &kvm->arch.spapr_tce_tables, list) {
> +		if (tt->liobn == args->liobn)
> +			return -EBUSY;
> +	}
> +
> +	tt = kzalloc(sizeof(*tt), GFP_KERNEL);
> +	if (!tt)
> +		return -ENOMEM;
> +
> +	tt->liobn = args->liobn;
> +	tt->kvm = kvm;
> +	tt->virtmode_only = kvmppc_tce_virt_only;
> +	tt->grp = grp;
> +
> +	kvm_get_kvm(kvm);
> +
> +	mutex_lock(&kvm->lock);
> +	list_add(&tt->list, &kvm->arch.spapr_tce_tables);
> +
> +	mutex_unlock(&kvm->lock);
> +
> +	pr_debug("LIOBN=%llX hooked to IOMMU %d, flags=%u\n",
> +			args->liobn, args->iommu_id, args->flags);
> +
> +	return anon_inode_getfd("kvm-spapr-tce-iommu",
> +			&kvm_spapr_tce_iommu_fops, tt, O_RDWR);
> +}
> +#else
> +long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
> +		struct kvm_create_spapr_tce_iommu *args)
> +{
> +	return -ENOSYS;
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
> +#ifdef CONFIG_IOMMU_API
>  /*
>   * Virtual mode handling of IOMMU map/unmap.
>   */
> +static int clear_tce_virt_mode(struct iommu_table *tbl,
> +		unsigned long ioba, unsigned long tce_value,
> +		unsigned long npages)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +
> +	ret = iommu_tce_clear_param_check(tbl, ioba, tce_value, npages);
> +	if (ret)
> +		return ret;
> +
> +	ret = iommu_clear_tces_and_put_pages(tbl, entry, npages);
> +	if (ret < 0)
> +		pr_err("iommu_tce: %s failed ioba=%lx, tce_value=%lx ret=%d\n",
> +				__func__, ioba, tce_value, ret);
> +
> +	return ret;
> +}
> +
> +static int put_tce_virt_mode(struct kvmppc_spapr_tce_table *tt,
> +		struct iommu_table *tbl,
> +		unsigned long ioba, unsigned long tce,
> +		pte_t pte, unsigned long pg_size)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +
> +	ret = iommu_tce_put_param_check(tbl, ioba, tce);
> +	if (ret)
> +		return ret;
> +
> +	/* System page size case, easy to handle */
> +	if (pg_size == PAGE_SIZE)
> +		return iommu_put_tce_user_mode(tbl, entry, tce);
> +
> +	return -EAGAIN;
> +}
> +
> +static pte_t va_to_linux_pte(struct kvm_vcpu *vcpu,
> +		unsigned long hva, bool writing, unsigned long *pg_sizep)
> +{
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	/* Find out the page pte and size if requested */
> +	pte_t pte;
> +	unsigned long pg_size = 0;
> +
> +	pte = lookup_linux_pte(vcpu->arch.pgdir, hva,
> +			writing, &pg_size);
> +	if (!pte_present(pte))
> +		return 0;
> +
> +	*pg_sizep = pg_size;
> +
> +	return pte;
> +#else
> +	return 0;
> +#endif
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
>  /* Converts guest physical address into host virtual */
>  static unsigned long get_virt_address(struct kvm_vcpu *vcpu,
>  		unsigned long gpa)
> @@ -188,6 +323,43 @@ long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
>  	if (!tt)
>  		return H_TOO_HARD;
>  
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		if (tce & (TCE_PCI_READ | TCE_PCI_WRITE)) {
> +			unsigned long hpa, pg_size = 0;
> +			pte_t pte;
> +
> +			hpa = get_virt_address(vcpu, tce);
> +			if (hpa == ERROR_ADDR)
> +				return -EFAULT;
> +
> +			pte = va_to_linux_pte(vcpu, hpa, tce & TCE_PCI_WRITE,
> +					&pg_size);
> +			if (!pte)
> +				return -EFAULT;
> +
> +			ret = put_tce_virt_mode(tt, tbl, ioba, hpa,
> +					pte, pg_size);
> +		} else {
> +			ret = clear_tce_virt_mode(tbl, ioba, 0, 1);
> +		}
> +		iommu_flush_tce(tbl);
> +
> +		WARN_ON(ret == -EAGAIN);
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	return kvmppc_emulated_h_put_tce(tt, ioba, tce);
>  }
> @@ -213,6 +385,52 @@ long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (tces == ERROR_ADDR)
>  		return H_TOO_HARD;
>  
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret = 0;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		for (i = 0; i < npages; ++i) {
> +			unsigned long hpa, pg_size = 0;
> +			pte_t pte = 0;
> +			unsigned long tce;
> +			unsigned long ptce = tces + i * sizeof(unsigned long);
> +
> +			if (get_user(tce, (unsigned long __user *)ptce))
> +				break;
> +
> +			hpa = get_virt_address(vcpu, tce);
> +			if (hpa == ERROR_ADDR)
> +				return -EFAULT;
> +
> +			pte = va_to_linux_pte(vcpu, hpa,
> +					tce & TCE_PCI_WRITE, &pg_size);
> +			if (!pte)
> +				return -EFAULT;
> +
> +			ret = put_tce_virt_mode(tt, tbl,
> +					ioba + (i << IOMMU_PAGE_SHIFT),
> +					hpa, pte, pg_size);
> +			if (ret)
> +				break;
> +		}
> +		if (ret)
> +			clear_tce_virt_mode(tbl, ioba, 0, i);
> +
> +		iommu_flush_tce(tbl);
> +
> +		WARN_ON(ret == -EAGAIN);
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
>  		return H_PARAMETER;
> @@ -253,6 +471,26 @@ long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!tt)
>  		return H_TOO_HARD;
>  
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		ret = clear_tce_virt_mode(tbl, ioba,
> +				tce_value, npages);
> +
> +		WARN_ON(ret == -EAGAIN);
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
>  		return H_PARAMETER;
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 55fdf7a..c5e5905 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -26,6 +26,7 @@
>  #include <linux/slab.h>
>  #include <linux/hugetlb.h>
>  #include <linux/list.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/kvm_ppc.h>
> @@ -161,6 +162,85 @@ static unsigned long get_real_address(struct kvm_vcpu *vcpu,
>  	return hwaddr;
>  }
>  
> +#ifdef CONFIG_IOMMU_API
> +static int clear_tce_real_mode(struct iommu_table *tbl,
> +		unsigned long ioba,
> +		unsigned long tce_value, unsigned long npages)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +
> +	ret = iommu_tce_clear_param_check(tbl, ioba, tce_value, npages);
> +	if (ret)
> +		return ret;
> +
> +	for ( ; npages; --npages, ++entry) {
> +		struct page *page;
> +		unsigned long oldtce;
> +
> +		oldtce = iommu_clear_tce(tbl, entry);
> +		if (!oldtce)
> +			continue;
> +
> +		page = realmode_pfn_to_page(oldtce >> PAGE_SHIFT);
> +		if (!page) {
> +			ret = -EAGAIN;
> +			break;
> +		}
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		ret = realmode_put_page(page);
> +		if (ret)
> +			break;
> +	}
> +	/* if (ret < 0)
> +		pr_err("iommu_tce: %s failed ioba=%lx, tce_value=%lx ret=%d\n",
> +				__func__, ioba, tce_value, ret); */
> +
> +	return ret;
> +}
> +
> +static int put_tce_real_mode(struct kvmppc_spapr_tce_table *tt,
> +		struct iommu_table *tbl,
> +		unsigned long ioba, unsigned long tce,
> +		pte_t pte, unsigned long pg_size)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +	struct page *page = NULL;
> +	enum dma_data_direction direction = iommu_tce_direction(tce);
> +
> +	ret = iommu_tce_put_param_check(tbl, ioba, tce);
> +	if (ret)
> +		return ret;
> +
> +	if (pg_size != PAGE_SIZE)
> +		return -EAGAIN;
> +
> +	/* Small page case, find page struct to increment a counter */
> +	page = realmode_pfn_to_page(tce >> PAGE_SHIFT);
> +	if (!page)
> +		return -EAGAIN;
> +
> +	ret = realmode_get_page(page);
> +	if (ret)
> +		return ret;
> +
> +	/* tce_build accepts virtual addresses */
> +	ret = iommu_tce_build(tbl, entry, (unsigned long) __va(tce), direction);
> +	if (ret)
> +		realmode_put_page(page);
> +
> +	/* if (ret < 0)
> +		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
> +				__func__, ioba, tce, ret); */
> +
> +	return ret;
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  		      unsigned long ioba, unsigned long tce)
>  {
> @@ -171,6 +251,44 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  	if (!tt)
>  		return H_TOO_HARD;
>  
> +	if (tt->virtmode_only)
> +		return H_TOO_HARD;
> +
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		if (tce & (TCE_PCI_READ | TCE_PCI_WRITE)) {
> +			unsigned long hpa, pg_size = 0;
> +			pte_t pte = 0;
> +
> +			hpa = get_real_address(vcpu, tce, tce & TCE_PCI_WRITE,
> +					&pte, &pg_size);
> +			if (hpa == ERROR_ADDR)
> +				return H_TOO_HARD;
> +
> +			ret = put_tce_real_mode(tt, tbl, ioba,
> +					hpa, pte, pg_size);
> +		} else {
> +			ret = clear_tce_real_mode(tbl, ioba, 0, 1);
> +		}
> +		iommu_flush_tce(tbl);
> +
> +		if (ret == -EAGAIN)
> +			return H_TOO_HARD;
> +
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	return kvmppc_emulated_h_put_tce(tt, ioba, tce);
>  }
> @@ -192,10 +310,58 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (!tt)
>  		return H_TOO_HARD;
>  
> +	if (tt->virtmode_only)
> +		return H_TOO_HARD;
> +
>  	tces = get_real_address(vcpu, tce_list, false, NULL, NULL);
>  	if (tces == ERROR_ADDR)
>  		return H_TOO_HARD;
>  
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret = 0;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		for (i = 0; i < npages; ++i) {
> +			unsigned long hpa, pg_size = 0;
> +			pte_t pte = 0;
> +			unsigned long tce;
> +			unsigned long ptce = tces + i * sizeof(unsigned long);
> +
> +			if (get_user(tce, (unsigned long __user *)ptce))
> +				break;
> +
> +			hpa = get_real_address(vcpu, tce,
> +					tce & TCE_PCI_WRITE,
> +					&pte, &pg_size);
> +			if (hpa == ERROR_ADDR)
> +				ret = -EAGAIN;
> +			else
> +				ret = put_tce_real_mode(tt, tbl,
> +						ioba + (i << IOMMU_PAGE_SHIFT),
> +						hpa, pte, pg_size);
> +			if (ret)
> +				break;
> +		}
> +		if (ret)
> +			clear_tce_real_mode(tbl, ioba, 0, i);
> +
> +		iommu_flush_tce(tbl);
> +
> +		if (ret == -EAGAIN)
> +			return H_TOO_HARD;
> +
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
>  		return H_PARAMETER;
> @@ -236,6 +402,32 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!tt)
>  		return H_TOO_HARD;
>  
> +	if (tt->virtmode_only)
> +		return H_TOO_HARD;
> +
> +#ifdef CONFIG_IOMMU_API
> +	if (tt->grp) {
> +		long ret;
> +		struct iommu_table *tbl = iommu_group_get_iommudata(tt->grp);
> +
> +		/* Return error if the group is being destroyed */
> +		if (!tbl)
> +			return H_RESCINDED;
> +
> +		ret = clear_tce_real_mode(tbl, ioba,
> +				tce_value, npages);
> +		iommu_flush_tce(tbl);
> +
> +		if (ret == -EAGAIN)
> +			return H_TOO_HARD;
> +
> +		if (ret < 0)
> +			return H_PARAMETER;
> +
> +		return H_SUCCESS;
> +	}
> +#endif
> +
>  	/* Emulated IO */
>  	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
>  		return H_PARAMETER;
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index b7ad589..269b0f6 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -385,6 +385,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  		break;
>  #endif
>  	case KVM_CAP_SPAPR_MULTITCE:
> +	case KVM_CAP_SPAPR_TCE_IOMMU:
>  		r = 1;
>  		break;
>  	default:
> @@ -935,6 +936,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
>  		goto out;
>  	}
> +	case KVM_CREATE_SPAPR_TCE_IOMMU: {
> +		struct kvm_create_spapr_tce_iommu create_tce_iommu;
> +		struct kvm *kvm = filp->private_data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&create_tce_iommu, argp,
> +				sizeof(create_tce_iommu)))
> +			goto out;
> +		r = kvm_vm_ioctl_create_spapr_tce_iommu(kvm, &create_tce_iommu);
> +		goto out;
> +	}
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  
>  #ifdef CONFIG_KVM_BOOK3S_64_HV
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c04da1..161e1d3 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -641,6 +641,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_RTAS (0x100000 + 87)
>  #define KVM_CAP_SPAPR_XICS (0x100000 + 88)
>  #define KVM_CAP_SPAPR_MULTITCE (0x110000 + 89)
> +#define KVM_CAP_SPAPR_TCE_IOMMU (0x110000 + 90)
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -885,6 +886,7 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
>  /* Available with KVM_CAP_PPC_RTAS */
>  #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xdc, struct kvm_rtas_token_args)
> +#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO,  0xaf, struct kvm_create_spapr_tce_iommu)
>  
>  /*
>   * ioctls for vcpu fds

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] powerpc/powernv: Don't configure IO window on PHB3
From: Gavin Shan @ 2013-05-07  5:12 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: yinghai, linux-pci, bhelgaas, linuxppc-dev, Gavin Shan
In-Reply-To: <1367876070.15842.60.camel@pasglop>

On Tue, May 07, 2013 at 07:34:30AM +1000, Benjamin Herrenschmidt wrote:
>On Mon, 2013-05-06 at 21:44 +0800, Gavin Shan wrote:
>> We needn't configure IO windows for the corresponding PEs on PHB3
>> since that doesn't support IO.
>
>Here too, no need for such a flag, just check that
>pci_controller->io_resource.flags is 0.
>
>BTW. Please work on top of the patch I sent already that avoids adding
>bogus resources to pci_host_bridge when their flags are 0. I'll send
>it to Linus today.
>

I've changed it accordingly on top of your patches in v2 :-)

Thanks,
Gavin

^ permalink raw reply

* Re: [PATCH 2/3] powerpc/powernv: Disable IO space for PCI buses
From: Gavin Shan @ 2013-05-07  5:11 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: bhelgaas, linux-pci, linuxppc-dev, yinghai, Gavin Shan
In-Reply-To: <1367875982.15842.58.camel@pasglop>

On Tue, May 07, 2013 at 07:33:02AM +1000, Benjamin Herrenschmidt wrote:
>On Mon, 2013-05-06 at 21:44 +0800, Gavin Shan wrote:
>> The patch intends to set the special flag (PCI_BUS_FLAGS_NO_IO) for
>> root buses so PCI core will skip assignment for IO stuff. Besides,
>> we also clear the IO resources on all PCI devices for PHB3.
>
>Why the new hook ? Can't this be detected simply because there is
>no aperture in the pci_host_bridge with IORESOURCE_IO set in the flags ?
>

Yeah, Ben. I've changed it accordingly in v2 and the problem is that we
need the proposed patches from Yinghai or mine :-)

Thanks,
Gavin 

^ permalink raw reply

* Re: [PATCH 2/6] KVM: PPC: Add support for multiple-TCE hcalls
From: David Gibson @ 2013-05-07  5:05 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, Joerg Roedel, Alexander Graf, kvm-ppc, Alex Williamson,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <1367825157-27231-3-git-send-email-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 1350 bytes --]

On Mon, May 06, 2013 at 05:25:53PM +1000, Alexey Kardashevskiy wrote:
> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
> H_STUFF_TCE hypercalls for QEMU emulated devices such as virtio
> devices or emulated PCI.  These calls allow adding multiple entries
> (up to 512) into the TCE table in one call which saves time on
> transition to/from real mode.
> 
> This adds a guest physical to host real address converter
> and calls the existing H_PUT_TCE handler. The converting function
> is going to be fully utilized by upcoming VFIO supporting patches.
> 
> This also implements the KVM_CAP_PPC_MULTITCE capability,
> so in order to support the functionality of this patch, QEMU
> needs to query for this capability and set the "hcall-multi-tce"
> hypertas property only if the capability is present, otherwise
> there will be serious performance degradation.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Paul Mackerras <paulus@samba.org>

Fwiw, it would be nice to get this patch merged, regardless of the
rest of the VFIO/powerpc patches.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* [PATCH] powerpc: Make CONFIG_RTAS_PROC depend on CONFIG_PROC_FS
From: Michael Ellerman @ 2013-05-07  4:43 UTC (permalink / raw)
  To: linuxppc-dev

We are getting build errors with CONFIG_PROC_FS=n:

arch/powerpc/kernel/rtas_flash.c
   In function 'rtas_flash_init':
  745:33: error: unused variable 'f' [-Werror=unused-variable]

But rtas_flash.c should not be built when CONFIG_PROC_FS=n, beacause all
it does is provide a /proc interface to the RTAS flash routines.

CONFIG_RTAS_FLASH already depends on CONFIG_RTAS_PROC, to indicate that
it depends on the RTAS proc support, but CONFIG_RTAS_PROC does not
depend on CONFIG_PROC_FS. So fix that.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
---
 arch/powerpc/platforms/Kconfig |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 34d224b..632444f 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -128,7 +128,7 @@ config PPC_RTAS_DAEMON

 config RTAS_PROC
 	bool "Proc interface to RTAS"
-	depends on PPC_RTAS
+	depends on PPC_RTAS && PROC_FS
 	default y

 config RTAS_FLASH
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] powerpc: fix numa distance for form0 device tree
From: Michael Ellerman @ 2013-05-07  4:23 UTC (permalink / raw)
  To: Greg KH; +Cc: linuxppc-dev, Anton Blanchard, stable
In-Reply-To: <20130507040614.GA19525@kroah.com>

On Mon, May 06, 2013 at 09:06:15PM -0700, Greg KH wrote:
> On Tue, May 07, 2013 at 01:49:34PM +1000, Michael Ellerman wrote:
> > From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> > 
> > Commit 7122beeee7bc1757682049780179d7c216dd1c83 upstream.
> > 
> > The following commit breaks numa distance setup for old powerpc
> > systems that use form0 encoding in device tree.
> > 
> > commit 41eab6f88f24124df89e38067b3766b7bef06ddb
> > powerpc/numa: Use form 1 affinity to setup node distance
> > 
> > Device tree node /rtas/ibm,associativity-reference-points would
> > index into /cpus/PowerPCxxxx/ibm,associativity based on form0 or
> > form1 encoding detected by ibm,architecture-vec-5 property.
> > 
> > All modern systems use form1 and current kernel code is correct.
> > However, on older systems with form0 encoding, the numa distance
> > will get hard coded as LOCAL_DISTANCE for all nodes.  This causes
> > task scheduling anomaly since scheduler will skip building numa
> > level domain (topmost domain with all cpus) if all numa distances
> > are same.  (value of 'level' in sched_init_numa() will remain 0)
> > 
> > Prior to the above commit:
> > ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
> > 
> > Restoring compatible behavior with this patch for old powerpc systems
> > with device tree where numa distance are encoded as form0.
> > 
> > Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
> > Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
> > ---
> >  arch/powerpc/mm/numa.c |    2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> What stable tree should this be applied to?

All of them please.

cheers

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox