LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 04/13] x86/PCI: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 arch/x86/pci/fixup.c |    4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 94ae9ae..e5f000c 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -337,9 +337,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
 		 * type BRIDGE, or CARDBUS. Host to PCI controllers use
 		 * PCI header type NORMAL.
 		 */
-		if (bridge
-		    && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
-		       || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
+		if (bridge && (pci_is_bridge(bridge))) {
 			pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
 						&config);
 			if (!(config & PCI_BRIDGE_CTL_VGA))
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 10/13] PCI, cpcihp: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 drivers/pci/hotplug/cpci_hotplug_pci.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/hotplug/cpci_hotplug_pci.c b/drivers/pci/hotplug/cpci_hotplug_pci.c
index 8c14648..9843371 100644
--- a/drivers/pci/hotplug/cpci_hotplug_pci.c
+++ b/drivers/pci/hotplug/cpci_hotplug_pci.c
@@ -289,8 +289,7 @@ int __ref cpci_configure_slot(struct slot *slot)
 	list_for_each_entry(dev, &parent->devices, bus_list)
 		if (PCI_SLOT(dev->devfn) != PCI_SLOT(slot->devfn))
 			continue;
-		if ((dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
-		    (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS))
+		if (pci_is_bridge(dev))
 			pci_hp_add_bridge(dev);
 
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 03/13] PCI: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 drivers/pci/pci-acpi.c  |    8 +-------
 drivers/pci/probe.c     |    3 +--
 drivers/pci/setup-bus.c |    4 +---
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index f49abef..ca4927b 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -309,13 +309,7 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev)
 	bool check_children;
 	u64 addr;
 
-	/*
-	 * pci_is_bridge() is not suitable here, because pci_dev->subordinate
-	 * is set only after acpi_pci_find_device() has been called for the
-	 * given device.
-	 */
-	check_children = pci_dev->hdr_type == PCI_HEADER_TYPE_BRIDGE
-			|| pci_dev->hdr_type == PCI_HEADER_TYPE_CARDBUS;
+	check_children = pci_is_bridge(pci_dev);
 	/* Please ref to ACPI spec for the syntax of _ADR */
 	addr = (PCI_SLOT(pci_dev->devfn) << 16) | PCI_FUNC(pci_dev->devfn);
 	return acpi_find_child_device(ACPI_COMPANION(dev->parent), addr,
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ef09f5f..f831dd8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1670,8 +1670,7 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus)
 
 	for (pass=0; pass < 2; pass++)
 		list_for_each_entry(dev, &bus->devices, bus_list) {
-			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-			    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+			if (pci_is_bridge(dev))
 				max = pci_scan_bridge(bus, dev, max, pass);
 		}
 
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 138bdd6..e399d00 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1629,9 +1629,7 @@ void pci_assign_unassigned_bus_resources(struct pci_bus *bus)
 
 	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &bus->devices, bus_list)
-		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
-			if (dev->subordinate)
+		if (pci_is_bridge(dev) && pci_has_subordinate(dev))
 				__pci_bus_size_bridges(dev->subordinate,
 							 &add_list);
 	up_read(&pci_bus_sem);
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 07/13] sparc/PCI: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 arch/sparc/kernel/pci.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 1555bbc..857ad77 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -543,8 +543,7 @@ static void pci_of_scan_bus(struct pci_pbm_info *pbm,
 			printk("PCI: dev header type: %x\n",
 			       dev->hdr_type);
 
-		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-		    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+		if (pci_is_bridge(dev))
 			of_scan_pci_bridge(pbm, child, dev);
 	}
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 13/13] PCI, pciehp: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 drivers/pci/hotplug/pciehp_pci.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index 1b53306..b6cb1df 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -62,8 +62,7 @@ int pciehp_configure_device(struct slot *p_slot)
 	}
 
 	list_for_each_entry(dev, &parent->devices, bus_list)
-		if ((dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
-				(dev->hdr_type == PCI_HEADER_TYPE_CARDBUS))
+		if (pci_is_bridge(dev))
 			pci_hp_add_bridge(dev);
 
 	pci_assign_unassigned_bridge_resources(bridge);
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 12/13] PCI, pcmcia: Use new pci_is_bridge() to simplify code
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

Now we can use the new pci_is_bridge() helper function
to simplify the code.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 drivers/pcmcia/cardbus.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/pcmcia/cardbus.c b/drivers/pcmcia/cardbus.c
index 8bde619..4fe4cc4 100644
--- a/drivers/pcmcia/cardbus.c
+++ b/drivers/pcmcia/cardbus.c
@@ -78,8 +78,7 @@ int __ref cb_alloc(struct pcmcia_socket *s)
 	max = bus->busn_res.start;
 	for (pass = 0; pass < 2; pass++)
 		list_for_each_entry(dev, &bus->devices, bus_list)
-			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-			    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+			if (pci_is_bridge(dev))
 				max = pci_scan_bridge(bus, dev, max, pass);
 
 	/*
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 02/13] PCI: Introduce new pci_is_bridge() helper function
From: Yijing Wang @ 2014-05-04  4:23 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Tony Luck, linux-ia64, x86, Yijing Wang, linux-pci, sparclinux,
	Thomas Gleixner, linuxppc-dev, David S. Miller
In-Reply-To: <1399177428-3784-1-git-send-email-wangyijing@huawei.com>

The PCIe spec defines a PCI bridge as a PCI device
whose header type (bits 0 through 6) is 0x1 (PCI bridge)
or 0x2 (CardBus bridge).

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
---
 include/linux/pci.h |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/include/linux/pci.h b/include/linux/pci.h
index aab57b4..9f5f89e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -477,6 +477,19 @@ static inline bool pci_is_root_bus(struct pci_bus *pbus)
 	return !(pbus->parent);
 }
 
+/**
+ * pci_is_bridge - check if the PCI device is a bridge
+ * @dev: PCI device
+ *
+ * Return true if the PCI device is bridge whether it has subordinate
+ * or not.
+ */
+static inline bool pci_is_bridge(struct pci_dev *dev)
+{
+	return dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+		dev->hdr_type == PCI_HEADER_TYPE_CARDBUS;
+}
+
 static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev)
 {
 	dev = pci_physfn(dev);
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v4 7/8] DMA: Freescale: use spin_lock_bh instead of spin_lock_irqsave
From: Hongbo Zhang @ 2014-05-04  8:40 UTC (permalink / raw)
  To: Vinod Koul
  Cc: leo.li, vkoul, linux-kernel, scottwood, dmaengine, dan.j.williams,
	linuxppc-dev
In-Reply-To: <20140502165143.GI32284@intel.com>


On 05/03/2014 12:51 AM, Vinod Koul wrote:
> On Fri, Apr 18, 2014 at 04:17:50PM +0800, hongbo.zhang@freescale.com wrote:
>> From: Hongbo Zhang <hongbo.zhang@freescale.com>
>>
>> The usage of spin_lock_irqsave() is a stronger locking mechanism than is
>> required throughout the driver. The minimum locking required should be used
>> instead. Interrupts will be turned off and context will be saved, it is
>> unnecessary to use irqsave.
>>
>> This patch changes all instances of spin_lock_irqsave() to spin_lock_bh(). All
>> manipulation of protected fields is done using tasklet context or weaker, which
>> makes spin_lock_bh() the correct choice.
>>
> This doesn't apply, perhaps because it depends on 6/8
>
So let's wait for the review result of 6/8.

^ permalink raw reply

* Re: [PATCH v4 8/8] DMA: Freescale: add suspend resume functions for DMA driver
From: Hongbo Zhang @ 2014-05-04 10:22 UTC (permalink / raw)
  To: Vinod Koul
  Cc: leo.li, vkoul, linux-kernel, scottwood, dmaengine, dan.j.williams,
	linuxppc-dev
In-Reply-To: <20140502164604.GB32284@intel.com>


On 05/03/2014 12:46 AM, Vinod Koul wrote:
> On Fri, Apr 18, 2014 at 04:17:51PM +0800, hongbo.zhang@freescale.com wrote:
>> From: Hongbo Zhang <hongbo.zhang@freescale.com>
>>
>> This patch adds suspend resume functions for Freescale DMA driver.
>> .prepare callback is used to stop further descriptors from being added into the
>> pending queue, and also issue pending queues into execution if there is any.
>> .suspend callback makes sure all the pending jobs are cleaned up and all the
>> channels are idle, and save the mode registers.
>> .resume callback re-initializes the channels by restore the mode registers.
>>
>> +
>> +static const struct dev_pm_ops fsldma_pm_ops = {
>> +	.prepare	= fsldma_prepare,
>> +	.suspend	= fsldma_suspend,
>> +	.resume		= fsldma_resume,
>> +};
> I think this is not correct. We discussed this some time back on the list. The
> DMAengine drivers should use late suspend and early resume to ensure they get
> suspended after clients (who should use normal ones) and resume before them
>
OK, I will update it like this:
use .suspend in place of the current .prepare
use .suspend_late in place of the current .suspend
use .resume_early in place of the current .resume

^ permalink raw reply

* Re: [PATCH 5/6] powerpc/corenet: Add DPAA FMan support to the SoC device tree(s)
From: Emil Medve @ 2014-05-04 10:59 UTC (permalink / raw)
  To: Scott Wood, Kanetkar Shruti-B44454, devicetree,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1398118442.1694.190.camel__272.432543761347$1398129129$gmane$org@snotra.buserror.net>

Hello Scott,


On 04/21/2014 05:14 PM, Scott Wood wrote:
> On Fri, 2014-04-18 at 07:21 -0500, Shruti Kanetkar wrote:
>> FMan 1 Gb/s MACs (dTSEC and mEMAC) have support for SGMII PHYs.
>> Add support for the internal SerDes TBI PHYs
>>
>> Based on prior work by Andy Fleming <afleming@gmail.com>
>>
>> Signed-off-by: Shruti Kanetkar <Shruti@Freescale.com>
>> ---
>>  arch/powerpc/boot/dts/fsl/b4860si-post.dtsi |  28 +++++
>>  arch/powerpc/boot/dts/fsl/b4si-post.dtsi    |  51 +++++++++
>>  arch/powerpc/boot/dts/fsl/p1023si-post.dtsi |  14 +++
>>  arch/powerpc/boot/dts/fsl/p2041si-post.dtsi |  64 ++++++++++++
>>  arch/powerpc/boot/dts/fsl/p3041si-post.dtsi |  64 ++++++++++++
>>  arch/powerpc/boot/dts/fsl/p4080si-post.dtsi | 104 +++++++++++++++++++
>>  arch/powerpc/boot/dts/fsl/p5020si-post.dtsi |  64 ++++++++++++
>>  arch/powerpc/boot/dts/fsl/p5040si-post.dtsi | 128 +++++++++++++++++++++++
>>  arch/powerpc/boot/dts/fsl/t4240si-post.dtsi | 154 ++++++++++++++++++++++++++++
>>  9 files changed, 671 insertions(+)
>>
>> diff --git a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
>> index cbc354b..45b0ff5 100644
>> --- a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
>> +++ b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
>> @@ -172,6 +172,34 @@
>>  		compatible = "fsl,b4860-rcpm", "fsl,qoriq-rcpm-2.0";
>>  	};
>>  
>> +/include/ "qoriq-fman3-0-1g-4.dtsi"
>> +/include/ "qoriq-fman3-0-1g-5.dtsi"
>> +/include/ "qoriq-fman3-0-10g-0.dtsi"
>> +/include/ "qoriq-fman3-0-10g-1.dtsi"
>> +	fman@400000 {
>> +		ethernet@e8000 {
>> +			tbi-handle = <&tbi4>;
>> +		};
> 
> Binding needed
> 
> Where is the "reg" for these unit addresses?

As I said, the bulk of the FMan work comes from another team. Here we
need just enough to hook up the MDIO and PHY nodes. I'd really like to
be able to make progress on this without waiting for that moment in time
we can get the entire FMan binding in place

>> +		mdio@e9000 {
>> +			tbi4: tbi-phy@8 {
>> +				reg = <0x8>;
>> +				device_type = "tbi-phy";
>> +			};
>> +		};
> 
> Binding needed for tbi-phy device_type

I guess that's fair (BTW, you accepted tbi-phy nodes/device-type before
without a binding)

> Why are we using device_type at all for this?

That's what the upstream driver is looking for. Anyway, most of the time PHYs
can be discovered, so they don't use/need compatible properties. That is, I
guess, part of the reason we don't have bindings for PHY nodes

However, what you can't discover is how they are wired to the MAC(s), so
we still need some nodes in the device tree to convey that. Also, when
looking for a specific kind of PHY, such as TBI, matching on device_type is
easier than parsing compatibles from various vendors


Cheers,

^ permalink raw reply

* Boot problems with a PA6T board
From: Christian Zigotzky @ 2014-05-04 16:02 UTC (permalink / raw)
  To: linuxppc-dev

Hi All,

RC1, RC2, and RC3 of kernel 3.15 don't boot on my PA6T board with a
Radeon HD 6870 graphics card.

Screenshot: 
http://forum.hyperion-entertainment.biz/download/file.php?id=1060&mode=view

Kernel 3.14 starts without any problems. Does anyone have a tip for me,
please?

Cheers,

Christian

^ permalink raw reply

* [PATCH V4] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest
From: Aneesh Kumar K.V @ 2014-05-04 17:18 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

This patch makes sure we inherit the LE bit correctly in the different cases
so that we can run a Little Endian distro in PR mode.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V3:
* Address review comments.
  * rebase to new kernel so that intr_msr is moved instead of adding a new variable
  * Drop the lock since we are single threaded for PR KVM

 arch/powerpc/include/asm/kvm_host.h |  2 +-
 arch/powerpc/kernel/asm-offsets.c   |  2 +-
 arch/powerpc/kvm/book3s_64_mmu.c    |  2 +-
 arch/powerpc/kvm/book3s_pr.c        | 29 ++++++++++++++++++++++++++++-
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1eaea2dea174..d342f8efc843 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -562,6 +562,7 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_PPC_BOOK3S
 	ulong fault_dar;
 	u32 fault_dsisr;
+	unsigned long intr_msr;
 #endif
 
 #ifdef CONFIG_BOOKE
@@ -654,7 +655,6 @@ struct kvm_vcpu_arch {
 	spinlock_t tbacct_lock;
 	u64 busy_stolen;
 	u64 busy_preempt;
-	unsigned long intr_msr;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index dba8140ebc20..6a4b77d197f3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -493,7 +493,6 @@ int main(void)
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
-	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -528,6 +527,7 @@ int main(void)
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 83da1f868fd5..8231b83c493b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,7 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-	kvmppc_set_msr(vcpu, MSR_SF);
+	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f30cdfee800d..466273aca58b 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -249,7 +249,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 	ulong smsr = vcpu->arch.shared->msr;
 
 	/* Guest MSR values */
-	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
 	/* Process MSR values */
 	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
 	/* External providers the guest reserved */
@@ -1118,6 +1118,15 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_HIOR:
 		*val = get_reg_val(id, to_book3s(vcpu)->hior);
 		break;
+	case KVM_REG_PPC_LPCR:
+		/*
+		 * We are only interested in the LPCR_ILE bit
+		 */
+		if (vcpu->arch.intr_msr & MSR_LE)
+			*val = get_reg_val(id, LPCR_ILE);
+		else
+			*val = get_reg_val(id, 0);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1126,6 +1135,20 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 	return r;
 }
 
+static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+	/*
+	 * If ILE (interrupt little-endian) has changed, update the
+	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
+	 */
+	if ((new_lpcr & LPCR_ILE) != (vcpu->arch.intr_msr & MSR_LE)) {
+		if (new_lpcr & LPCR_ILE)
+			vcpu->arch.intr_msr |= MSR_LE;
+		else
+			vcpu->arch.intr_msr &= ~MSR_LE;
+	}
+}
+
 static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 				 union kvmppc_one_reg *val)
 {
@@ -1136,6 +1159,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 		to_book3s(vcpu)->hior = set_reg_val(id, *val);
 		to_book3s(vcpu)->hior_explicit = true;
 		break;
+	case KVM_REG_PPC_LPCR:
+		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1188,6 +1214,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
 	vcpu->arch.pvr = 0x3C0301;
 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu->arch.intr_msr = MSR_SF;
 #else
 	/* default to book3s_32 (750) */
 	vcpu->arch.pvr = 0x84202;
-- 
1.9.1

^ permalink raw reply related

* [PATCH V4] POWERPC: BOOK3S: KVM: Use the saved dar value and generic make_dsisr
From: Aneesh Kumar K.V @ 2014-05-04 17:21 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

Although it's optional, IBM POWER CPUs have always set the DAR value on an
alignment interrupt, so don't try to compute it; use the saved DAR value and
the generic make_dsisr() instead.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V3:
* Use make_dsisr instead of checking feature flag to decide whether to use
  saved dsisr or not

 arch/powerpc/include/asm/disassemble.h | 34 +++++++++++++++++++++++++++
 arch/powerpc/kernel/align.c            | 34 +--------------------------
 arch/powerpc/kvm/book3s_emulate.c      | 43 ++++------------------------------
 3 files changed, 40 insertions(+), 71 deletions(-)

diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
index 856f8deb557a..6330a61b875a 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -81,4 +81,38 @@ static inline unsigned int get_oc(u32 inst)
 {
 	return (inst >> 11) & 0x7fff;
 }
+
+#define IS_XFORM(inst)	(get_op(inst)  == 31)
+#define IS_DSFORM(inst)	(get_op(inst) >= 56)
+
+/*
+ * Create a DSISR value from the instruction
+ */
+static inline unsigned make_dsisr(unsigned instr)
+{
+	unsigned dsisr;
+
+
+	/* bits  6:15 --> 22:31 */
+	dsisr = (instr & 0x03ff0000) >> 16;
+
+	if (IS_XFORM(instr)) {
+		/* bits 29:30 --> 15:16 */
+		dsisr |= (instr & 0x00000006) << 14;
+		/* bit     25 -->    17 */
+		dsisr |= (instr & 0x00000040) << 8;
+		/* bits 21:24 --> 18:21 */
+		dsisr |= (instr & 0x00000780) << 3;
+	} else {
+		/* bit      5 -->    17 */
+		dsisr |= (instr & 0x04000000) >> 12;
+		/* bits  1: 4 --> 18:21 */
+		dsisr |= (instr & 0x78000000) >> 17;
+		/* bits 30:31 --> 12:13 */
+		if (IS_DSFORM(instr))
+			dsisr |= (instr & 0x00000003) << 18;
+	}
+
+	return dsisr;
+}
 #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 94908af308d8..34f55524d456 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -25,14 +25,13 @@
 #include <asm/cputable.h>
 #include <asm/emulated_ops.h>
 #include <asm/switch_to.h>
+#include <asm/disassemble.h>
 
 struct aligninfo {
 	unsigned char len;
 	unsigned char flags;
 };
 
-#define IS_XFORM(inst)	(((inst) >> 26) == 31)
-#define IS_DSFORM(inst)	(((inst) >> 26) >= 56)
 
 #define INVALID	{ 0, 0 }
 
@@ -192,37 +191,6 @@ static struct aligninfo aligninfo[128] = {
 };
 
 /*
- * Create a DSISR value from the instruction
- */
-static inline unsigned make_dsisr(unsigned instr)
-{
-	unsigned dsisr;
-
-
-	/* bits  6:15 --> 22:31 */
-	dsisr = (instr & 0x03ff0000) >> 16;
-
-	if (IS_XFORM(instr)) {
-		/* bits 29:30 --> 15:16 */
-		dsisr |= (instr & 0x00000006) << 14;
-		/* bit     25 -->    17 */
-		dsisr |= (instr & 0x00000040) << 8;
-		/* bits 21:24 --> 18:21 */
-		dsisr |= (instr & 0x00000780) << 3;
-	} else {
-		/* bit      5 -->    17 */
-		dsisr |= (instr & 0x04000000) >> 12;
-		/* bits  1: 4 --> 18:21 */
-		dsisr |= (instr & 0x78000000) >> 17;
-		/* bits 30:31 --> 12:13 */
-		if (IS_DSFORM(instr))
-			dsisr |= (instr & 0x00000003) << 18;
-	}
-
-	return dsisr;
-}
-
-/*
  * The dcbz (data cache block zero) instruction
  * gives an alignment fault if used on non-cacheable
  * memory.  We handle the fault mainly for the
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 99d40f8977e8..04c38f049dfd 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -569,48 +569,14 @@ unprivileged:
 
 u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
 {
-	u32 dsisr = 0;
-
-	/*
-	 * This is what the spec says about DSISR bits (not mentioned = 0):
-	 *
-	 * 12:13		[DS]	Set to bits 30:31
-	 * 15:16		[X]	Set to bits 29:30
-	 * 17			[X]	Set to bit 25
-	 *			[D/DS]	Set to bit 5
-	 * 18:21		[X]	Set to bits 21:24
-	 *			[D/DS]	Set to bits 1:4
-	 * 22:26			Set to bits 6:10 (RT/RS/FRT/FRS)
-	 * 27:31			Set to bits 11:15 (RA)
-	 */
-
-	switch (get_op(inst)) {
-	/* D-form */
-	case OP_LFS:
-	case OP_LFD:
-	case OP_STFD:
-	case OP_STFS:
-		dsisr |= (inst >> 12) & 0x4000;	/* bit 17 */
-		dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */
-		break;
-	/* X-form */
-	case 31:
-		dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */
-		dsisr |= (inst << 8)  & 0x04000; /* bit 17 */
-		dsisr |= (inst << 3)  & 0x03c00; /* bits 18:21 */
-		break;
-	default:
-		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
-		break;
-	}
-
-	dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */
-
-	return dsisr;
+	return make_dsisr(inst);
 }
 
 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+	return vcpu->arch.fault_dar;
+#else
 	ulong dar = 0;
 	ulong ra = get_ra(inst);
 	ulong rb = get_rb(inst);
@@ -635,4 +601,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 	}
 
 	return dar;
+#endif
 }
-- 
1.9.1

^ permalink raw reply related

* [PATCH] KVM: PPC: BOOK3S: HV: Don't try to allocate from kernel page allocator for hash page table.
From: Aneesh Kumar K.V @ 2014-05-04 17:25 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

We reserve 5% of total RAM for CMA allocation, and not using it can
result in us running out of NUMA node memory with specific
configurations. One caveat is that we may not have a node-local HPT with a
pinned vcpu configuration. But currently libvirt also pins the vcpu to the
cpuset after creating the hash page table.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index fb25ebc0af0c..f32896ffd784 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -52,7 +52,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm);
 
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
-	unsigned long hpt;
+	unsigned long hpt = 0;
 	struct revmap_entry *rev;
 	struct page *page = NULL;
 	long order = KVM_DEFAULT_HPT_ORDER;
@@ -64,22 +64,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 	}
 
 	kvm->arch.hpt_cma_alloc = 0;
-	/*
-	 * try first to allocate it from the kernel page allocator.
-	 * We keep the CMA reserved for failed allocation.
-	 */
-	hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
-			       __GFP_NOWARN, order - PAGE_SHIFT);
-
-	/* Next try to allocate from the preallocated pool */
-	if (!hpt) {
-		VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
-		page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
-		if (page) {
-			hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-			kvm->arch.hpt_cma_alloc = 1;
-		} else
-			--order;
+	VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+	page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+	if (page) {
+		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+		kvm->arch.hpt_cma_alloc = 1;
 	}
 
 	/* Lastly try successively smaller sizes from the page allocator */
-- 
1.9.1

^ permalink raw reply related

* [PATCH] KVM: PPC: BOOK3S: PR: Fix WARN_ON with debug options on
From: Aneesh Kumar K.V @ 2014-05-04 17:26 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

With the debug option "sleep inside atomic section checking" enabled we get
the below WARN_ON during a PR KVM boot. This is because upstream now
has PREEMPT_COUNT enabled even if we have preempt disabled. Fix the
warning by adding preempt_disable/enable around floating point and altivec
enable.

WARNING: at arch/powerpc/kernel/process.c:156
Modules linked in: kvm_pr kvm
CPU: 1 PID: 3990 Comm: qemu-system-ppc Tainted: G        W     3.15.0-rc1+ #4
task: c0000000eb85b3a0 ti: c0000000ec59c000 task.ti: c0000000ec59c000
NIP: c000000000015c84 LR: d000000003334644 CTR: c000000000015c00
REGS: c0000000ec59f140 TRAP: 0700   Tainted: G        W      (3.15.0-rc1+)
MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 42000024  XER: 20000000
CFAR: c000000000015c24 SOFTE: 1
GPR00: d000000003334644 c0000000ec59f3c0 c000000000e2fa40 c0000000e2f80000
GPR04: 0000000000000800 0000000000002000 0000000000000001 8000000000000000
GPR08: 0000000000000001 0000000000000001 0000000000002000 c000000000015c00
GPR12: d00000000333da18 c00000000fb80900 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 00003fffce4e0fa1
GPR20: 0000000000000010 0000000000000001 0000000000000002 00000000100b9a38
GPR24: 0000000000000002 0000000000000000 0000000000000000 0000000000000013
GPR28: 0000000000000000 c0000000eb85b3a0 0000000000002000 c0000000e2f80000
NIP [c000000000015c84] .enable_kernel_fp+0x84/0x90
LR [d000000003334644] .kvmppc_handle_ext+0x134/0x190 [kvm_pr]
Call Trace:
[c0000000ec59f3c0] [0000000000000010] 0x10 (unreliable)
[c0000000ec59f430] [d000000003334644] .kvmppc_handle_ext+0x134/0x190 [kvm_pr]
[c0000000ec59f4c0] [d00000000324b380] .kvmppc_set_msr+0x30/0x50 [kvm]
[c0000000ec59f530] [d000000003337cac] .kvmppc_core_emulate_op_pr+0x16c/0x5e0 [kvm_pr]
[c0000000ec59f5f0] [d00000000324a944] .kvmppc_emulate_instruction+0x284/0xa80 [kvm]
[c0000000ec59f6c0] [d000000003336888] .kvmppc_handle_exit_pr+0x488/0xb70 [kvm_pr]
[c0000000ec59f790] [d000000003338d34] kvm_start_lightweight+0xcc/0xdc [kvm_pr]
[c0000000ec59f960] [d000000003336288] .kvmppc_vcpu_run_pr+0xc8/0x190 [kvm_pr]
[c0000000ec59f9f0] [d00000000324c880] .kvmppc_vcpu_run+0x30/0x50 [kvm]
[c0000000ec59fa60] [d000000003249e74] .kvm_arch_vcpu_ioctl_run+0x54/0x1b0 [kvm]
[c0000000ec59faf0] [d000000003244948] .kvm_vcpu_ioctl+0x478/0x760 [kvm]
[c0000000ec59fcb0] [c000000000224e34] .do_vfs_ioctl+0x4d4/0x790
[c0000000ec59fd90] [c000000000225148] .SyS_ioctl+0x58/0xb0
[c0000000ec59fe30] [c00000000000a1e4] syscall_exit+0x0/0x98

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_pr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c5c052a9729c..f30cdfee800d 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -683,16 +683,20 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 
 	if (msr & MSR_FP) {
+		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
 		t->fp_save_area = &vcpu->arch.fp;
+		preempt_enable();
 	}
 
 	if (msr & MSR_VEC) {
 #ifdef CONFIG_ALTIVEC
+		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
 		t->vr_save_area = &vcpu->arch.vr;
+		preempt_enable();
 #endif
 	}
 
@@ -716,13 +720,17 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 		return;
 
 	if (lost_ext & MSR_FP) {
+		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
+		preempt_enable();
 	}
 #ifdef CONFIG_ALTIVEC
 	if (lost_ext & MSR_VEC) {
+		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
+		preempt_enable();
 	}
 #endif
 	current->thread.regs->msr |= lost_ext;
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH] KVM: PPC: BOOK3S: HV: THP support for guest
From: Aneesh Kumar K.V @ 2014-05-04 17:30 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
 arch/powerpc/kvm/book3s_hv.c             |   7 ++
 2 files changed, 130 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 51388befeddb..f03ea8f90576 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	unsigned long rb, va_low;
+	int b_size, a_size;
+	unsigned int penc;
+	unsigned long rb = 0, va_low, sllp;
+	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+	if (!(v & HPTE_V_LARGE)) {
+		/* both base and actual psize is 4k */
+		b_size = MMU_PAGE_4K;
+		a_size = MMU_PAGE_4K;
+	} else {
+		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[b_size].shift)
+				continue;
 
+			a_size = __hpte_actual_psize(lp, b_size);
+			if (a_size != -1)
+				break;
+		}
+	}
+	/*
+	 * Ignore the top 14 bits of va
+	 * v have top two bits covering segment size, hence move
+	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
+	 * AVA field in v also have the lower 23 bits ignored.
+	 * For base page size 4K we need 14 .. 65 bits (so need to
+	 * collect extra 11 bits)
+	 * For others we need 14..14+i
+	 */
+	/* This covers 14..54 bits of va*/
 	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	/*
+	 * AVA in v had cleared lower 23 bits. We need to derive
+	 * that from pteg index
+	 */
 	va_low = pte_index >> 3;
 	if (v & HPTE_V_SECONDARY)
 		va_low = ~va_low;
-	/* xor vsid from AVA */
+	/*
+	 * get the vpn bits from va_low using reverse of hashing.
+	 * In v we have va with 23 bits dropped and then left shifted
+	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
+	 * right shift it with (SID_SHIFT - (23 - 7))
+	 */
 	if (!(v & HPTE_V_1TB_SEG))
-		va_low ^= v >> 12;
+		va_low ^= v >> (SID_SHIFT - 16);
 	else
-		va_low ^= v >> 24;
+		va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
-	if (v & HPTE_V_LARGE) {
-		rb |= 1;			/* L field */
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (r & 0xff000)) {
-			/* non-16MB large page, must be 64k */
-			/* (masks depend on page size) */
-			rb |= 0x1000;		/* page encoding in LP field */
-			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
-		}
-	} else {
-		/* 4kB page */
-		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+
+	switch (b_size) {
+	case MMU_PAGE_4K:
+		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
+		rb |= sllp << 5;	/*  AP field */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
+		break;
+	default:
+	{
+		int aval_shift;
+		/*
+		 * remaining 7bits of AVA/LP fields
+		 * Also contain the rr bits of LP
+		 */
+		rb |= (va_low & 0x7f) << 16;
+		/*
+		 * Now clear not needed LP bits based on actual psize
+		 */
+		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
+		/*
+		 * AVAL field 58..77 - base_page_shift bits of va
+		 * we have space for 58..64 bits, Missing bits should
+		 * be zero filled. +1 is to take care of L bit shift
+		 */
+		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
+		rb |= ((va_low << aval_shift) & 0xfe);
+
+		rb |= 1;		/* L field */
+		penc = mmu_psize_defs[b_size].penc[a_size];
+		rb |= penc << 12;	/* LP field */
+		break;
+	}
 	}
 	rb |= (v >> 54) & 0x300;		/* B field */
 	return rb;
@@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 
 static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 {
+	int size, a_size;
+	/* Look at the 8 bit LP value */
+	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
 	/* only handle 4k, 64k and 16M pages for now */
 	if (!(h & HPTE_V_LARGE))
-		return 1ul << 12;		/* 4k page */
-	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
-		return 1ul << 16;		/* 64k page */
-	if ((l & 0xff000) == 0)
-		return 1ul << 24;		/* 16M page */
-	return 0;				/* error */
+		return 1ul << 12;
+	else {
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
+
+			a_size = __hpte_actual_psize(lp, size);
+			if (a_size != -1)
+				return 1ul << mmu_psize_defs[a_size].shift;
+		}
+
+	}
+	return 0;
 }
 
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8227dba5af0f..a38d3289320a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
 	 * support pte_enc here
 	 */
 	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	/*
+	 * Add 16MB MPSS support
+	 */
+	if (linux_psize != MMU_PAGE_16M) {
+		(*sps)->enc[1].page_shift = 24;
+		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	}
 	(*sps)++;
 }
 
-- 
1.9.1

^ permalink raw reply related

* Re: [RFC PATCH] KVM: PPC: BOOK3S: HV: THP support for guest
From: Aneesh Kumar K.V @ 2014-05-04 17:36 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <1399224616-25142-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
>  arch/powerpc/kvm/book3s_hv.c             |   7 ++
>  2 files changed, 130 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 51388befeddb..f03ea8f90576 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
>  	return old == 0;
>  }
>
> +static inline int __hpte_actual_psize(unsigned int lp, int psize)
> +{
> +	int i, shift;
> +	unsigned int mask;
> +
> +	/* start from 1 ignoring MMU_PAGE_4K */
> +	for (i = 1; i < MMU_PAGE_COUNT; i++) {
> +
> +		/* invalid penc */
> +		if (mmu_psize_defs[psize].penc[i] == -1)
> +			continue;
> +		/*
> +		 * encoding bits per actual page size
> +		 *        PTE LP     actual page size
> +		 *    rrrr rrrz		>=8KB
> +		 *    rrrr rrzz		>=16KB
> +		 *    rrrr rzzz		>=32KB
> +		 *    rrrr zzzz		>=64KB
> +		 * .......
> +		 */
> +		shift = mmu_psize_defs[i].shift - LP_SHIFT;
> +		if (shift > LP_BITS)
> +			shift = LP_BITS;
> +		mask = (1 << shift) - 1;
> +		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>  					     unsigned long pte_index)
>  {
> -	unsigned long rb, va_low;
> +	int b_size, a_size;
> +	unsigned int penc;
> +	unsigned long rb = 0, va_low, sllp;
> +	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
> +	if (!(v & HPTE_V_LARGE)) {
> +		/* both base and actual psize is 4k */
> +		b_size = MMU_PAGE_4K;
> +		a_size = MMU_PAGE_4K;
> +	} else {
> +		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[b_size].shift)
> +				continue;
>
> +			a_size = __hpte_actual_psize(lp, b_size);
> +			if (a_size != -1)
> +				break;
> +		}
> +	}
> +	/*
> +	 * Ignore the top 14 bits of va
> +	 * v have top two bits covering segment size, hence move
> +	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
> +	 * AVA field in v also have the lower 23 bits ignored.
> +	 * For base page size 4K we need 14 .. 65 bits (so need to
> +	 * collect extra 11 bits)
> +	 * For others we need 14..14+i
> +	 */
> +	/* This covers 14..54 bits of va*/
>  	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	/*
> +	 * AVA in v had cleared lower 23 bits. We need to derive
> +	 * that from pteg index
> +	 */
>  	va_low = pte_index >> 3;
>  	if (v & HPTE_V_SECONDARY)
>  		va_low = ~va_low;
> -	/* xor vsid from AVA */
> +	/*
> +	 * get the vpn bits from va_low using reverse of hashing.
> +	 * In v we have va with 23 bits dropped and then left shifted
> +	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
> +	 * right shift it with (SID_SHIFT - (23 - 7))
> +	 */
>  	if (!(v & HPTE_V_1TB_SEG))
> -		va_low ^= v >> 12;
> +		va_low ^= v >> (SID_SHIFT - 16);
>  	else
> -		va_low ^= v >> 24;
> +		va_low ^= v >> (SID_SHIFT_1T - 16);
>  	va_low &= 0x7ff;
> -	if (v & HPTE_V_LARGE) {
> -		rb |= 1;			/* L field */
> -		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> -		    (r & 0xff000)) {
> -			/* non-16MB large page, must be 64k */
> -			/* (masks depend on page size) */
> -			rb |= 0x1000;		/* page encoding in LP field */
> -			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> -			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
> -		}
> -	} else {
> -		/* 4kB page */
> -		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +
> +	switch (b_size) {
> +	case MMU_PAGE_4K:
> +		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
> +			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
> +		rb |= sllp << 5;	/*  AP field */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
> +		break;
> +	default:
> +	{
> +		int aval_shift;
> +		/*
> +		 * remaining 7bits of AVA/LP fields
> +		 * Also contain the rr bits of LP
> +		 */
> +		rb |= (va_low & 0x7f) << 16;
> +		/*
> +		 * Now clear not needed LP bits based on actual psize
> +		 */
> +		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
> +		/*
> +		 * AVAL field 58..77 - base_page_shift bits of va
> +		 * we have space for 58..64 bits, Missing bits should
> +		 * be zero filled. +1 is to take care of L bit shift
> +		 */
> +		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
> +		rb |= ((va_low << aval_shift) & 0xfe);
> +
> +		rb |= 1;		/* L field */
> +		penc = mmu_psize_defs[b_size].penc[a_size];
> +		rb |= penc << 12;	/* LP field */
> +		break;
> +	}
>  	}
>  	rb |= (v >> 54) & 0x300;		/* B field */
>  	return rb;
> @@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>
>  static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>  {
> +	int size, a_size;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
>  	/* only handle 4k, 64k and 16M pages for now */
>  	if (!(h & HPTE_V_LARGE))
> -		return 1ul << 12;		/* 4k page */
> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
> -		return 1ul << 16;		/* 64k page */
> -	if ((l & 0xff000) == 0)
> -		return 1ul << 24;		/* 16M page */
> -	return 0;				/* error */
> +		return 1ul << 12;
> +	else {
> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +
> +			a_size = __hpte_actual_psize(lp, size);
> +			if (a_size != -1)
> +				return 1ul << mmu_psize_defs[a_size].shift;
> +		}
> +
> +	}
> +	return 0;
>  }
>
>  static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 8227dba5af0f..a38d3289320a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>  	 * support pte_enc here
>  	 */
>  	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> +	/*
> +	 * Add 16MB MPSS support
> +	 */
> +	if (linux_psize != MMU_PAGE_16M) {
> +		(*sps)->enc[1].page_shift = 24;
> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> +	}

We ideally want to do this only when the guest memory is backed by
hugetlbfs. I was thinking qemu should ensure that. But then I am not
sure existing qemu works that way. So we may want to look at how to
enable MPSS.

-aneesh

^ permalink raw reply

* [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <1399246144-20247-1-git-send-email-gwshan@linux.vnet.ibm.com>

Commit cb5b242c ("powerpc/eeh: Escalate error on non-existing PE")
escalates the frozen state on non-existing PE to fenced PHB. It
was to improve kdump reliability. After that, commit 361f2a2a
("powrpc/powernv: Reset PHB in kdump kernel") was introduced to
issue complete reset on all PHBs to increase the reliability of
kdump kernel.

Commit cb5b242c is therefore no longer useful, so it is reverted here.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 65feec6..1b5982f 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -884,13 +884,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * it again.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				*pe = phb_pe;
-				pr_err("EEH: Escalated frozen PHB#%x-"
-				       "PE#%llx (%s) detected\n",
-					hose->global_number,
-					frozen_pe_no,
-					eeh_pe_loc_get(phb_pe));
-				ret = EEH_NEXT_ERR_FENCED_PHB;
+				/* Try best to clear it */
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, frozen_pe_no);
+				opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
 			} else if ((*pe)->state & EEH_PE_ISOLATED) {
 				ret = EEH_NEXT_ERR_NONE;
 			} else {
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to child PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <1399246144-20247-1-git-send-email-gwshan@linux.vnet.ibm.com>

When we have the corner case of frozen parent and child PE at the
same time, we have to handle the frozen parent PE prior to the
child. Without clearing the frozen state on parent PE, the child
PE can't be recovered successfully.

The patch searches the EEH PE hierarchy tree and returns the topmost
frozen PE to be handled. It ensures the frozen parent PE will be
handled prior to child PE.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh.c                 | 27 ++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 30 ++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9f8de75..33d683a 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -357,10 +357,11 @@ out:
 int eeh_dev_check_failure(struct eeh_dev *edev)
 {
 	int ret;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location;
 
@@ -438,14 +439,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 */
 	if ((ret < 0) ||
 	    (ret == EEH_STATE_NOT_SUPPORT) ||
-	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+	    ((ret & active_flags) == active_flags)) {
 		eeh_stats.false_positives++;
 		pe->false_positives++;
 		rc = 0;
 		goto dn_unlock;
 	}
 
+	/*
+	 * It should be corner case that the parent PE has been
+	 * put into frozen state as well. We should take care
+	 * that at first.
+	 */
+	parent_pe = pe->parent;
+	while (parent_pe) {
+		/* Hit the ceiling ? */
+		if (parent_pe->type & EEH_PE_PHB)
+			break;
+
+		/* Frozen parent PE ? */
+		ret = eeh_ops->get_state(parent_pe, NULL);
+		if (ret > 0 &&
+		    (ret & active_flags) != active_flags)
+			pe = parent_pe;
+
+		/* Next parent level */
+		parent_pe = parent_pe->parent;
+	}
+
 	eeh_stats.slot_resets++;
 
 	/* Avoid repeated reports of this failure, including problems
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index e5b88c5..65feec6 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -783,11 +783,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	struct eeh_pe *phb_pe;
+	struct eeh_pe *phb_pe, *parent_pe;
 	u64 frozen_pe_no;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	u16 err_type, severity;
 	long rc;
-	int ret = EEH_NEXT_ERR_NONE;
+	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
 	 * While running here, it's safe to purge the event queue.
@@ -920,6 +921,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = ioda_eeh_get_state(parent_pe);
+				if (state > 0 &&
+				    (state & active_flags) != active_flags)
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+		}
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan

Since commit cb523e09 ("powerpc/eeh: Avoid I/O access during PE
reset"), the PE is kept as frozen state on hardware level until
the PE reset is done completely. After that, we explicitly clear
the frozen state of the affected PE. However, there might be
frozen child PEs of the affected PE and we need to clear their
frozen state as well. Otherwise, the recovery is going to fail.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh_driver.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7100a5b..8bb40e7 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -447,8 +447,9 @@ static void *eeh_pe_detach_dev(void *data, void *userdata)
  * PE reset (for 3 times), we try to clear the frozen state
  * for 3 times as well.
  */
-static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
 {
+	struct eeh_pe *pe = (struct eeh_pe *)data;
 	int i, rc;
 
 	for (i = 0; i < 3; i++) {
@@ -461,13 +462,24 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
 	}
 
 	/* The PE has been isolated, clear it */
-	if (rc)
+	if (rc) {
 		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
 			__func__, pe->phb->global_number, pe->addr, rc);
-	else
+		return (void *)pe;
+	}
+
+	return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+{
+	void *rc;
+
+	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
+	if (!rc)
 		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
 
-	return rc;
+	return rc ? -EIO : 0;
 }
 
 /**
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH V4] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest
From: Paul Mackerras @ 2014-05-04 23:59 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, agraf, kvm-ppc, kvm
In-Reply-To: <1399223932-17840-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Sun, May 04, 2014 at 10:48:52PM +0530, Aneesh Kumar K.V wrote:
> This patch make sure we inherit the LE bit correctly in different case
> so that we can run Little Endian distro in PR mode

[snip]

> +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
> +{
> +	/*
> +	 * If ILE (interrupt little-endian) has changed, update the
> +	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
> +	 */
> +	if ((new_lpcr & LPCR_ILE) != (vcpu->arch.intr_msr & MSR_LE)) {

Since LPCR_ILE != MSR_LE, this condition is always going to be true.
I suggest you remove this if statement and just do the body
unconditionally.

Paul.

^ permalink raw reply

* [PATCH RFC 00/22] EEH Support for VFIO PCI devices on PowerKVM guest
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan

The series of patches intends to support EEH for PCI devices, which have been
passed through to PowerKVM based guest via VFIO. The implementation is
straightforward based on the issues or problems we have to resolve to support
EEH for PowerKVM based guest.

- Emulation for EEH RTAS requests. Thankfully, we already have infrastructure
  to emulate XICS. Without introducing a new mechanism, we just extend that
  existing infrastructure to support EEH RTAS emulation. EEH RTAS requests
  initiated from guest are posted to host where the requests get handled or
  delivered to underlying firmware for further handling. For that, the host kernel
  has to maintain the PCI address (host domain/bus/slot/function to guest's
  PHB BUID/bus/slot/function) mapping via KVM VFIO device. The address mapping
  will be built when initializing the VFIO device in QEMU and destroyed when the
  VFIO device in QEMU is going offline, or the VM is destroyed.

- The infrastructure for error injection is introduced. The emulation for the
  related RTAS services is similar to what we do for EEH/XICS RTAS requests.
  For now, we just support PCI error injection. We need to extend it for injecting
  other types of errors in future.

The series of patches requires corresponding firmware changes from Mike Qiu to
support error injection and QEMU changes to support EEH for guest. It also needs
QEMU changes to support it. QEMU patchset will be sent separately.

I usually use command line (not virsh) to start PowerKVM based guests on Firebird-L
machine with different types of PCI devices assigned (passed through) to guest.
Following cases have been tested. The EEH error can be injected by utility "errinjct"
running on guest successfully and we can recover from the EEH error successfully.

Testing on P7
=============

- Emulex adapter
- USB (OHCI) PCI adapter

Testing on P8
=============

- MLX4 adapter (Partially)
- USB (xHCI) PCI adapter

-----

arch/powerpc/include/asm/book3s_errinjct.h     |  97 ++++++++++++++++++++++++
arch/powerpc/include/asm/eeh.h                 |  78 ++++++++++++++++++++
arch/powerpc/include/asm/kvm_ppc.h             |   7 ++
arch/powerpc/include/asm/opal.h                |  65 ++++++++++++++++
arch/powerpc/kernel/eeh.c                      |   8 ++
arch/powerpc/kernel/eeh_pe.c                   | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/kvm/Kconfig                       |  17 +++++
arch/powerpc/kvm/Makefile                      |   6 ++
arch/powerpc/kvm/book3s_errinjct.c             | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/kvm/book3s_hv.c                   |   2 +
arch/powerpc/kvm/book3s_rtas.c                 |  67 +++++++++++++++++
arch/powerpc/platforms/powernv/Makefile        |   2 +
arch/powerpc/platforms/powernv/eeh-ioda.c      |   3 +-
arch/powerpc/platforms/powernv/eeh-rtas.c      | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/errinjct.c      | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
include/linux/kvm_host.h                       |  21 ++++++
include/uapi/linux/kvm.h                       |  10 +++
virt/kvm/vfio.c                                |  60 ++++++++++++++-
19 files changed, 1834 insertions(+), 2 deletions(-)
create mode 100644 arch/powerpc/include/asm/book3s_errinjct.h
create mode 100644 arch/powerpc/kvm/book3s_errinjct.c
create mode 100644 arch/powerpc/platforms/powernv/eeh-rtas.c
create mode 100644 arch/powerpc/platforms/powernv/errinjct.c

Thanks,
Gavin

^ permalink raw reply

* [PATCH 02/22] powerpc/eeh: Info to trace passed devices
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The address of passed PCI devices (domain:bus:slot:func) might be
quite different from the perspective of host and guest. We have to
trace the address mapping so that we can emulate EEH RTAS requests
from guest. The patch introduces additional fields to eeh_pe and
eeh_dev for the purpose.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h | 49 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..8bfb167 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -48,6 +48,17 @@ struct device_node;
 #define EEH_PE_RST_HOLD_TIME		250
 #define EEH_PE_RST_SETTLE_TIME		1800
 
+#ifdef CONFIG_KVM_EEH
+struct eeh_vfio_pci_addr {
+	struct kvm	*kvm;		/* KVM identifier		*/
+	unsigned int	buid_hi;	/* PHB BUID high		*/
+	unsigned int	buid_lo;	/* PHB BUID low			*/
+	unsigned char	bus;		/* Bus number			*/
+	unsigned char	devfn;		/* Slot and function		*/
+	int		pe_addr;	/* PE configuration address	*/
+};
+#endif /* CONFIG_KVM_EEH */
+
 /*
  * The struct is used to trace PE related EEH functionality.
  * In theory, there will have one instance of the struct to
@@ -72,6 +83,7 @@ struct device_node;
 #define EEH_PE_RESET		(1 << 2)	/* PE reset in progress	*/
 
 #define EEH_PE_KEEP		(1 << 8)	/* Keep PE on hotplug	*/
+#define EEH_PE_PASSTHROUGH	(1 << 9)	/* PE owned by guest	*/
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device	*/
@@ -85,6 +97,9 @@ struct eeh_pe {
 	struct timeval tstamp;		/* Time on first-time freeze	*/
 	int false_positives;		/* Times of reported #ff's	*/
 	struct eeh_pe *parent;		/* Parent PE			*/
+#ifdef CONFIG_KVM_EEH
+	struct eeh_vfio_pci_addr gaddr;	/* Associated KVM guest address */
+#endif
 	struct list_head child_list;	/* Link PE to the child list	*/
 	struct list_head edevs;		/* Link list of EEH devices	*/
 	struct list_head child;		/* Child PEs			*/
@@ -93,6 +108,21 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
 		list_for_each_entry_safe(edev, tmp, &pe->edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+	return pe ? !!(pe->state & EEH_PE_PASSTHROUGH) : false;
+}
+
+static inline void eeh_pe_set_passed(struct eeh_pe *pe, bool passed)
+{
+	if (pe) {
+		if (passed)
+			pe->state |= EEH_PE_PASSTHROUGH;
+		else
+			pe->state &= ~EEH_PE_PASSTHROUGH;
+	}
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
@@ -110,6 +140,7 @@ struct eeh_pe {
 #define EEH_DEV_SYSFS		(1 << 9)	/* Sysfs created	*/
 #define EEH_DEV_REMOVED		(1 << 10)	/* Removed permanently	*/
 #define EEH_DEV_FRESET		(1 << 11)	/* Fundamental reset	*/
+#define EEH_DEV_PASSTHROUGH	(1 << 12)	/* Owned by guest	*/
 
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
@@ -126,6 +157,9 @@ struct eeh_dev {
 	struct device_node *dn;		/* Associated device node	*/
 	struct pci_dev *pdev;		/* Associated PCI device	*/
 	struct pci_bus *bus;		/* PCI bus for partial hotplug	*/
+#ifdef CONFIG_KVM_EEH
+	struct eeh_vfio_pci_addr gaddr;	/* Address in guest		*/
+#endif
 };
 
 static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev)
@@ -138,6 +172,21 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 	return edev ? edev->pdev : NULL;
 }
 
+static inline bool eeh_dev_passed(struct eeh_dev *dev)
+{
+	return dev ? !!(dev->mode & EEH_DEV_PASSTHROUGH) : false;
+}
+
+static inline void eeh_dev_set_passed(struct eeh_dev *dev, bool passed)
+{
+	if (dev) {
+		if (passed)
+			dev->mode |= EEH_DEV_PASSTHROUGH;
+		else
+			dev->mode &= ~EEH_DEV_PASSTHROUGH;
+	}
+}
+
 /* Return values from eeh_ops::next_error */
 enum {
 	EEH_NEXT_ERR_NONE = 0,
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 01/22] powerpc: Introduce CONFIG_KVM_EEH
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces kernel configuration option KVM_EEH, which
depends on KVM_BOOK3S_64, VFIO_IOMMU_SPAPR_TCE and EEH. The option
is to enable emulating EEH RTAS services that are required by the EEH
module in pSeries-based guest.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/Kconfig | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 141b202..743d2d9 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -189,6 +189,14 @@ config KVM_XICS
 	  Specification) interrupt controller architecture used on
 	  IBM POWER (pSeries) servers.
 
+config KVM_EEH
+	bool "KVM in-kernel EEH RTAS emulation"
+	depends on PPC_POWERNV && KVM_BOOK3S_64 && EEH && VFIO_IOMMU_SPAPR_TCE
+	default y
+	---help---
+	  Enable support for emulating EEH RTAS services used on IBM
+	  POWER (pSeries) servers.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 05/22] powerpc/eeh: Release VFIO dev on VM destruction
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

When the VM is destroyed, the EEH devices and PEs that have been
marked as being owned by guest should be returned to host. The
patch introduces kvmppc_vfio_pci_free() to do it.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |  6 +++++-
 arch/powerpc/kernel/eeh_pe.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c   |  2 ++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 3807167..677c719 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -380,6 +380,8 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
 
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
+static inline void kvmppc_eeh_vfio_release(struct kvm *kvm) { }
+
 #define EEH_POSSIBLE_ERROR(val, type) (0)
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
@@ -388,7 +390,9 @@ static inline void eeh_remove_device(struct pci_dev *dev) { }
 #ifdef CONFIG_KVM_EEH
 struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr);
 struct eeh_pe *eeh_vfio_pe_get(struct eeh_vfio_pci_addr *addr);
-
+void kvmppc_eeh_vfio_release(struct kvm *kvm);
+#else
+static inline void kvmppc_eeh_vfio_release(void *kvm) { };
 #endif /* CONFIG_KVM_EEH */
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1bd7b1f..9e73188 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -331,6 +331,48 @@ struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr)
 
 	return NULL;
 }
+
+static void *__kvmppc_eeh_vfio_release(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct kvm *kvm = (struct kvm *)flag;
+	struct eeh_dev *edev, *tmp;
+
+	if (!eeh_pe_passed(pe))
+		return NULL;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (!eeh_dev_passed(edev))
+			continue;
+
+		if (edev->gaddr.kvm == kvm)
+			eeh_dev_set_passed(edev, false);
+	}
+
+	eeh_pe_set_passed(pe, false);
+
+	return NULL;
+}
+
+/**
+ * kvmppc_eeh_vfio_release - Release VFIO devices for the given VM
+ * @kvm: VM indicator
+ *
+ * The function is expected to be called when the VM is destroyed.
+ * In turn, the PCI devices that have been passed to that VM are
+ * released and the address mappings maintained for them are destroyed.
+ */
+void kvmppc_eeh_vfio_release(struct kvm *kvm)
+{
+	struct eeh_pe *root;
+	void *ret;
+
+	list_for_each_entry(root, &eeh_phb_pe, child) {
+		ret = eeh_pe_traverse(root, __kvmppc_eeh_vfio_release, kvm);
+		if (ret) return;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_eeh_vfio_release);
 #endif /* CONFIG_KVM_EEH */
 
 /**
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8227dba..f07a12d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -49,6 +49,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/eeh.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -2344,6 +2345,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 		kvm->arch.rma = NULL;
 	}
 
+	kvmppc_eeh_vfio_release(kvm);
 	kvmppc_free_hpt(kvm);
 }
 
-- 
1.8.3.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox