LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 03/29] powerpc/85xx: Update SPI binding to match binding spec for P1020RDB
From: Kumar Gala @ 2011-11-17  7:15 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1321514181-28897-2-git-send-email-galak@kernel.crashing.org>

The SPI node is out of date with regard to the binding for fsl-espi and
driver support.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/boot/dts/p1020rdb.dts |   30 +++++++++++++-----------------
 arch/powerpc/boot/dts/p1020si.dtsi |    5 ++---
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/boot/dts/p1020rdb.dts b/arch/powerpc/boot/dts/p1020rdb.dts
index 8b1a7ee..b31e7ec 100644
--- a/arch/powerpc/boot/dts/p1020rdb.dts
+++ b/arch/powerpc/boot/dts/p1020rdb.dts
@@ -149,49 +149,45 @@
 		};
 
 		spi@7000 {
-
-			fsl_m25p80@0 {
+			flash@0 {
 				#address-cells = <1>;
 				#size-cells = <1>;
-				compatible = "fsl,espi-flash";
+				compatible = "spansion,s25sl12801";
 				reg = <0>;
-				linux,modalias = "fsl_m25p80";
-				modal = "s25sl128b";
-				spi-max-frequency = <50000000>;
-				mode = <0>;
+				spi-max-frequency = <40000000>; /* input clock */
 
-				partition@0 {
+				partition@u-boot {
 					/* 512KB for u-boot Bootloader Image */
 					reg = <0x0 0x00080000>;
-					label = "SPI (RO) U-Boot Image";
+					label = "u-boot";
 					read-only;
 				};
 
-				partition@80000 {
+				partition@dtb {
 					/* 512KB for DTB Image */
 					reg = <0x00080000 0x00080000>;
-					label = "SPI (RO) DTB Image";
+					label = "dtb";
 					read-only;
 				};
 
-				partition@100000 {
+				partition@kernel {
 					/* 4MB for Linux Kernel Image */
 					reg = <0x00100000 0x00400000>;
-					label = "SPI (RO) Linux Kernel Image";
+					label = "kernel";
 					read-only;
 				};
 
-				partition@500000 {
+				partition@fs {
 					/* 4MB for Compressed RFS Image */
 					reg = <0x00500000 0x00400000>;
-					label = "SPI (RO) Compressed RFS Image";
+					label = "file system";
 					read-only;
 				};
 
-				partition@900000 {
+				partition@jffs-fs {
 					/* 7MB for JFFS2 based RFS */
 					reg = <0x00900000 0x00700000>;
-					label = "SPI (RW) JFFS2 RFS";
+					label = "file system jffs2";
 				};
 			};
 		};
diff --git a/arch/powerpc/boot/dts/p1020si.dtsi b/arch/powerpc/boot/dts/p1020si.dtsi
index 58f6b30..25e10cf 100644
--- a/arch/powerpc/boot/dts/p1020si.dtsi
+++ b/arch/powerpc/boot/dts/p1020si.dtsi
@@ -112,14 +112,13 @@
 		};
 
 		spi@7000 {
-			cell-index = <0>;
 			#address-cells = <1>;
 			#size-cells = <0>;
-			compatible = "fsl,espi";
+			compatible = "fsl,p1020-espi", "fsl,mpc8536-espi";
 			reg = <0x7000 0x1000>;
 			interrupts = <59 0x2>;
 			interrupt-parent = <&mpic>;
-			mode = "cpu";
+			fsl,espi-num-chipselects = <4>;
 		};
 
 		gpio: gpio-controller@f000 {
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 01/29] powerpc/85xx: Simplify P1020RDB CAMP dts using includes
From: Kumar Gala @ 2011-11-17  7:15 UTC (permalink / raw)
  To: linuxppc-dev

If we include p1020rdb.dts instead of p1020si.dtsi we greatly reduce
duplication and maintenance.  We can just list which devices are
disabled for the given core and the mpic protected sources.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/boot/dts/p1020rdb_camp_core0.dts |  154 +------------------------
 arch/powerpc/boot/dts/p1020rdb_camp_core1.dts |   11 +--
 2 files changed, 4 insertions(+), 161 deletions(-)

diff --git a/arch/powerpc/boot/dts/p1020rdb_camp_core0.dts b/arch/powerpc/boot/dts/p1020rdb_camp_core0.dts
index f0bf7f4..41b4585 100644
--- a/arch/powerpc/boot/dts/p1020rdb_camp_core0.dts
+++ b/arch/powerpc/boot/dts/p1020rdb_camp_core0.dts
@@ -16,7 +16,7 @@
  * option) any later version.
  */
 
-/include/ "p1020si.dtsi"
+/include/ "p1020rdb.dts"
 
 / {
 	model = "fsl,P1020RDB";
@@ -32,7 +32,7 @@
 
 	cpus {
 		PowerPC,P1020@1 {
-		status = "disabled";
+			status = "disabled";
 		};
 	};
 
@@ -45,169 +45,19 @@
 	};
 
 	soc@ffe00000 {
-		i2c@3000 {
-			rtc@68 {
-				compatible = "dallas,ds1339";
-				reg = <0x68>;
-			};
-		};
-
 		serial1: serial@4600 {
 			status = "disabled";
 		};
 
-		spi@7000 {
-			fsl_m25p80@0 {
-				#address-cells = <1>;
-				#size-cells = <1>;
-				compatible = "fsl,espi-flash";
-				reg = <0>;
-				linux,modalias = "fsl_m25p80";
-				spi-max-frequency = <40000000>;
-
-				partition@0 {
-					/* 512KB for u-boot Bootloader Image */
-					reg = <0x0 0x00080000>;
-					label = "SPI (RO) U-Boot Image";
-					read-only;
-				};
-
-				partition@80000 {
-					/* 512KB for DTB Image */
-					reg = <0x00080000 0x00080000>;
-					label = "SPI (RO) DTB Image";
-					read-only;
-				};
-
-				partition@100000 {
-					/* 4MB for Linux Kernel Image */
-					reg = <0x00100000 0x00400000>;
-					label = "SPI (RO) Linux Kernel Image";
-					read-only;
-				};
-
-				partition@500000 {
-					/* 4MB for Compressed RFS Image */
-					reg = <0x00500000 0x00400000>;
-					label = "SPI (RO) Compressed RFS Image";
-					read-only;
-				};
-
-				partition@900000 {
-					/* 7MB for JFFS2 based RFS */
-					reg = <0x00900000 0x00700000>;
-					label = "SPI (RW) JFFS2 RFS";
-				};
-			};
-		};
-
-		mdio@24000 {
-			phy0: ethernet-phy@0 {
-				interrupt-parent = <&mpic>;
-				interrupts = <3 1>;
-				reg = <0x0>;
-			};
-			phy1: ethernet-phy@1 {
-				interrupt-parent = <&mpic>;
-				interrupts = <2 1>;
-				reg = <0x1>;
-			};
-		};
-
-		mdio@25000 {
-			tbi0: tbi-phy@11 {
-				reg = <0x11>;
-				device_type = "tbi-phy";
-			};
-		};
-
 		enet0: ethernet@b0000 {
 			status = "disabled";
 		};
 
-		enet1: ethernet@b1000 {
-			phy-handle = <&phy0>;
-			tbi-handle = <&tbi0>;
-			phy-connection-type = "sgmii";
-		};
-
-		enet2: ethernet@b2000 {
-			phy-handle = <&phy1>;
-			phy-connection-type = "rgmii-id";
-		};
-
-		usb@22000 {
-			phy_type = "ulpi";
-		};
-
-		/* USB2 is shared with localbus, so it must be disabled
-		   by default. We can't put 'status = "disabled";' here
-		   since U-Boot doesn't clear the status property when
-		   it enables USB2. OTOH, U-Boot does create a new node
-		   when there isn't any. So, just comment it out.
-		usb@23000 {
-			phy_type = "ulpi";
-		};
-		*/
-
 		mpic: pic@40000 {
 			protected-sources = <
 			42 29 30 34	/* serial1, enet0-queue-group0 */
 			17 18 24 45	/* enet0-queue-group1, crypto */
 			>;
 		};
-
-	};
-
-	pci0: pcie@ffe09000 {
-		ranges = <0x2000000 0x0 0xa0000000 0 0xa0000000 0x0 0x20000000
-			  0x1000000 0x0 0x00000000 0 0xffc10000 0x0 0x10000>;
-		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-		interrupt-map = <
-			/* IDSEL 0x0 */
-			0000 0x0 0x0 0x1 &mpic 0x4 0x1
-			0000 0x0 0x0 0x2 &mpic 0x5 0x1
-			0000 0x0 0x0 0x3 &mpic 0x6 0x1
-			0000 0x0 0x0 0x4 &mpic 0x7 0x1
-			>;
-		pcie@0 {
-			reg = <0x0 0x0 0x0 0x0 0x0>;
-			#size-cells = <2>;
-			#address-cells = <3>;
-			device_type = "pci";
-			ranges = <0x2000000 0x0 0xa0000000
-				  0x2000000 0x0 0xa0000000
-				  0x0 0x20000000
-
-				  0x1000000 0x0 0x0
-				  0x1000000 0x0 0x0
-				  0x0 0x100000>;
-		};
-	};
-
-	pci1: pcie@ffe0a000 {
-		ranges = <0x2000000 0x0 0x80000000 0 0x80000000 0x0 0x20000000
-			  0x1000000 0x0 0x00000000 0 0xffc00000 0x0 0x10000>;
-		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-		interrupt-map = <
-			/* IDSEL 0x0 */
-			0000 0x0 0x0 0x1 &mpic 0x0 0x1
-			0000 0x0 0x0 0x2 &mpic 0x1 0x1
-			0000 0x0 0x0 0x3 &mpic 0x2 0x1
-			0000 0x0 0x0 0x4 &mpic 0x3 0x1
-			>;
-		pcie@0 {
-			reg = <0x0 0x0 0x0 0x0 0x0>;
-			#size-cells = <2>;
-			#address-cells = <3>;
-			device_type = "pci";
-			ranges = <0x2000000 0x0 0x80000000
-				  0x2000000 0x0 0x80000000
-				  0x0 0x20000000
-
-				  0x1000000 0x0 0x0
-				  0x1000000 0x0 0x0
-				  0x0 0x100000>;
-		};
 	};
 };
diff --git a/arch/powerpc/boot/dts/p1020rdb_camp_core1.dts b/arch/powerpc/boot/dts/p1020rdb_camp_core1.dts
index 6ec0220..5174538 100644
--- a/arch/powerpc/boot/dts/p1020rdb_camp_core1.dts
+++ b/arch/powerpc/boot/dts/p1020rdb_camp_core1.dts
@@ -15,7 +15,7 @@
  * option) any later version.
  */
 
-/include/ "p1020si.dtsi"
+/include/ "p1020rdb.dts"
 
 / {
 	model = "fsl,P1020RDB";
@@ -28,7 +28,7 @@
 
 	cpus {
 		PowerPC,P1020@0 {
-		status = "disabled";
+			status = "disabled";
 		};
 	};
 
@@ -85,12 +85,6 @@
 			status = "disabled";
 		};
 
-		enet0: ethernet@b0000 {
-			fixed-link = <1 1 1000 0 0>;
-			phy-connection-type = "rgmii-id";
-
-		};
-
 		enet1: ethernet@b1000 {
 			status = "disabled";
 		};
@@ -135,7 +129,6 @@
 		global-utilities@e0000 {	//global utilities block
 			status = "disabled";
 		};
-
 	};
 
 	pci0: pcie@ffe09000 {
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 02/29] powerpc/85xx: Rework PCI nodes on P1020RDB
From: Kumar Gala @ 2011-11-17  7:15 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1321514181-28897-1-git-send-email-galak@kernel.crashing.org>

* Move SoC specific details like irq mapping to SoC dtsi
* Update interrupt property to cover both error interrupt and PCIe
  runtime interrupts

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/boot/dts/p1020rdb.dts |   26 +---------------------
 arch/powerpc/boot/dts/p1020si.dtsi |   40 ++++++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/boot/dts/p1020rdb.dts b/arch/powerpc/boot/dts/p1020rdb.dts
index d6a8ae4..8b1a7ee 100644
--- a/arch/powerpc/boot/dts/p1020rdb.dts
+++ b/arch/powerpc/boot/dts/p1020rdb.dts
@@ -257,19 +257,8 @@
 	pci0: pcie@ffe09000 {
 		ranges = <0x2000000 0x0 0xa0000000 0 0xa0000000 0x0 0x20000000
 			  0x1000000 0x0 0x00000000 0 0xffc10000 0x0 0x10000>;
-		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-		interrupt-map = <
-			/* IDSEL 0x0 */
-			0000 0x0 0x0 0x1 &mpic 0x4 0x1
-			0000 0x0 0x0 0x2 &mpic 0x5 0x1
-			0000 0x0 0x0 0x3 &mpic 0x6 0x1
-			0000 0x0 0x0 0x4 &mpic 0x7 0x1
-			>;
+		reg = <0 0xffe09000 0 0x1000>;
 		pcie@0 {
-			reg = <0x0 0x0 0x0 0x0 0x0>;
-			#size-cells = <2>;
-			#address-cells = <3>;
-			device_type = "pci";
 			ranges = <0x2000000 0x0 0xa0000000
 				  0x2000000 0x0 0xa0000000
 				  0x0 0x20000000
@@ -281,21 +270,10 @@
 	};
 
 	pci1: pcie@ffe0a000 {
+		reg = <0 0xffe0a000 0 0x1000>;
 		ranges = <0x2000000 0x0 0x80000000 0 0x80000000 0x0 0x20000000
 			  0x1000000 0x0 0x00000000 0 0xffc00000 0x0 0x10000>;
-		interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-		interrupt-map = <
-			/* IDSEL 0x0 */
-			0000 0x0 0x0 0x1 &mpic 0x0 0x1
-			0000 0x0 0x0 0x2 &mpic 0x1 0x1
-			0000 0x0 0x0 0x3 &mpic 0x2 0x1
-			0000 0x0 0x0 0x4 &mpic 0x3 0x1
-			>;
 		pcie@0 {
-			reg = <0x0 0x0 0x0 0x0 0x0>;
-			#size-cells = <2>;
-			#address-cells = <3>;
-			device_type = "pci";
 			ranges = <0x2000000 0x0 0x80000000
 				  0x2000000 0x0 0x80000000
 				  0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/p1020si.dtsi b/arch/powerpc/boot/dts/p1020si.dtsi
index 5c5acb6..58f6b30 100644
--- a/arch/powerpc/boot/dts/p1020si.dtsi
+++ b/arch/powerpc/boot/dts/p1020si.dtsi
@@ -352,26 +352,58 @@
 	pci0: pcie@ffe09000 {
 		compatible = "fsl,mpc8548-pcie";
 		device_type = "pci";
-		#interrupt-cells = <1>;
 		#size-cells = <2>;
 		#address-cells = <3>;
-		reg = <0 0xffe09000 0 0x1000>;
 		bus-range = <0 255>;
 		clock-frequency = <33333333>;
 		interrupt-parent = <&mpic>;
 		interrupts = <16 2>;
+
+		pcie@0 {
+			reg = <0 0 0 0 0>;
+			#interrupt-cells = <1>;
+			#size-cells = <2>;
+			#address-cells = <3>;
+			device_type = "pci";
+			interrupts = <16 2>;
+			interrupt-map-mask = <0xf800 0 0 7>;
+			interrupt-map = <
+				/* IDSEL 0x0 */
+				0000 0x0 0x0 0x1 &mpic 0x4 0x1
+				0000 0x0 0x0 0x2 &mpic 0x5 0x1
+				0000 0x0 0x0 0x3 &mpic 0x6 0x1
+				0000 0x0 0x0 0x4 &mpic 0x7 0x1
+				>;
+		};
+
 	};
 
 	pci1: pcie@ffe0a000 {
 		compatible = "fsl,mpc8548-pcie";
 		device_type = "pci";
-		#interrupt-cells = <1>;
 		#size-cells = <2>;
 		#address-cells = <3>;
-		reg = <0 0xffe0a000 0 0x1000>;
 		bus-range = <0 255>;
 		clock-frequency = <33333333>;
 		interrupt-parent = <&mpic>;
 		interrupts = <16 2>;
+
+		pcie@0 {
+			reg = <0 0 0 0 0>;
+			#interrupt-cells = <1>;
+			#size-cells = <2>;
+			#address-cells = <3>;
+			device_type = "pci";
+			interrupts = <16 2>;
+			interrupt-map-mask = <0xf800 0 0 7>;
+
+			interrupt-map = <
+				/* IDSEL 0x0 */
+				0000 0x0 0x0 0x1 &mpic 0x0 0x1
+				0000 0x0 0x0 0x2 &mpic 0x1 0x1
+				0000 0x0 0x0 0x3 &mpic 0x2 0x1
+				0000 0x0 0x0 0x4 &mpic 0x3 0x1
+				>;
+		};
 	};
 };
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH][v2] powerpc/p1023: set IRQ[4:6, 11] to active-high level sensitive for PCIe
From: Kumar Gala @ 2011-11-17  5:09 UTC (permalink / raw)
  To: linuxppc-dev

From: Roy Zang <tie-fei.zang@freescale.com>

P1023 external IRQ[4:6, 11] are not pinned out, but the interrupts are
utilized by the PCIe controllers.  As they are not exposed as pins we
need to set them as active-high (internal to the SoC these interrupts
are pulled down).

IRQs[0:3,7:10] are pulled up on the board so we have them set as
active-low.

Signed-off-by: Roy Zang <tie-fei.zang@freescale.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
v2:
* cleaned up commit message & added comments to be clear about IRQ usage

 arch/powerpc/boot/dts/p1023rds.dts |   17 +++++++++++++----
 1 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/boot/dts/p1023rds.dts b/arch/powerpc/boot/dts/p1023rds.dts
index 25f9897..319c4b4 100644
--- a/arch/powerpc/boot/dts/p1023rds.dts
+++ b/arch/powerpc/boot/dts/p1023rds.dts
@@ -698,6 +698,7 @@
 			interrupt-parent = <&mpic>;
 			interrupts = <16 2>;
 			interrupt-map-mask = <0xf800 0 0 7>;
+			/* IRQ[0:3] are pulled up on board, set to active-low */
 			interrupt-map = <
 				/* IDSEL 0x0 */
 				0000 0 0 1 &mpic 0 1
@@ -737,11 +738,15 @@
 			interrupt-parent = <&mpic>;
 			interrupts = <16 2>;
 			interrupt-map-mask = <0xf800 0 0 7>;
+			/*
+			 * IRQ[4:6] only for PCIe, set to active-high,
+			 * IRQ[7] is pulled up on board, set to active-low
+			 */
 			interrupt-map = <
 				/* IDSEL 0x0 */
-				0000 0 0 1 &mpic 4 1
-				0000 0 0 2 &mpic 5 1
-				0000 0 0 3 &mpic 6 1
+				0000 0 0 1 &mpic 4 2
+				0000 0 0 2 &mpic 5 2
+				0000 0 0 3 &mpic 6 2
 				0000 0 0 4 &mpic 7 1
 				>;
 			ranges = <0x2000000 0x0 0xa0000000
@@ -776,12 +781,16 @@
 			interrupt-parent = <&mpic>;
 			interrupts = <16 2>;
 			interrupt-map-mask = <0xf800 0 0 7>;
+			/*
+			 * IRQ[8:10] are pulled up on board, set to active-low
+			 * IRQ[11] only for PCIe, set to active-high,
+			 */
 			interrupt-map = <
 				/* IDSEL 0x0 */
 				0000 0 0 1 &mpic 8 1
 				0000 0 0 2 &mpic 9 1
 				0000 0 0 3 &mpic 10 1
-				0000 0 0 4 &mpic 11 1
+				0000 0 0 4 &mpic 11 2
 				>;
 			ranges = <0x2000000 0x0 0x80000000
 				  0x2000000 0x0 0x80000000
-- 
1.7.3.4

^ permalink raw reply related

* [stable] [PATCH] powerpc/ptrace: Fix build with gcc 4.6
From: Michael Neuling @ 2011-11-17  2:31 UTC (permalink / raw)
  To: stable; +Cc: linuxppc-dev

From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

powerpc/ptrace: Fix build with gcc 4.6

gcc (rightfully) complains that we are accessing beyond the
end of the fpr array (we do, to access the fpscr).

The only sane thing to do (whether anything in that code can be
called remotely sane is debatable) is to special case fpscr and
handle it as a separate statement.

I initially tried to do it by making the array access conditional
on index < PT_FPSCR and using a 3rd else leg, but for some reason gcc
was unable to understand it and still spewed the warning.

So I ended up with something a tad more intricate but it seems to
build on 32-bit and on 64-bit with and without VSX.

commit e69b742a6793dc5bf16f6eedca534d4bc10d68b2

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@kernel.org

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 05b7dd2..18447c4 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1497,9 +1497,14 @@ long arch_ptrace(struct task_struct *child, long request,
 		if (index < PT_FPR0) {
 			tmp = ptrace_get_reg(child, (int) index);
 		} else {
+			unsigned int fpidx = index - PT_FPR0;
+
 			flush_fp_to_thread(child);
-			tmp = ((unsigned long *)child->thread.fpr)
-				[TS_FPRWIDTH * (index - PT_FPR0)];
+			if (fpidx < (PT_FPSCR - PT_FPR0))
+				tmp = ((unsigned long *)child->thread.fpr)
+					[fpidx * TS_FPRWIDTH];
+			else
+				tmp = child->thread.fpscr.val;
 		}
 		ret = put_user(tmp, datalp);
 		break;
@@ -1525,9 +1530,14 @@ long arch_ptrace(struct task_struct *child, long request,
 		if (index < PT_FPR0) {
 			ret = ptrace_put_reg(child, index, data);
 		} else {
+			unsigned int fpidx = index - PT_FPR0;
+
 			flush_fp_to_thread(child);
-			((unsigned long *)child->thread.fpr)
-				[TS_FPRWIDTH * (index - PT_FPR0)] = data;
+			if (fpidx < (PT_FPSCR - PT_FPR0))
+				((unsigned long *)child->thread.fpr)
+					[fpidx * TS_FPRWIDTH] = data;
+			else
+				child->thread.fpscr.val = data;
 			ret = 0;
 		}
 		break;

^ permalink raw reply related

* Re: [PATCH net-next 0/5] Use ETH_ALEN
From: David Miller @ 2011-11-17  1:35 UTC (permalink / raw)
  To: joe; +Cc: netdev, linuxppc-dev, linux-kernel
In-Reply-To: <cover.1321472142.git.joe@perches.com>

From: Joe Perches <joe@perches.com>
Date: Wed, 16 Nov 2011 11:38:01 -0800

> Remove other #defines and uses in favor of ETH_ALEN
> 
> Joe Perches (5):
>   ethernet: Convert MAC_ADDR_LEN uses to ETH_ALEN
>   ethernet: Convert ETHER_ADDR_LEN uses to ETH_ALEN
>   bna: Convert MAC_ADDRLEN uses to ETH_ALEN
>   amd8111e:  Convert ETH_ADDR_LEN uses to ETH_ALEN
>   ucc_geth: Convert ENET_NUM_OCTETS_PER_ADDRESS uses to ETH_ALEN

All applied, thanks Joe.

^ permalink raw reply

* [PATCH 4/5] crypto: talitos - support for channel remap and 2nd IRQ
From: Kim Phillips @ 2011-11-17  0:20 UTC (permalink / raw)
  To: linux-crypto; +Cc: linuxppc-dev

Some later SEC v3.x are equipped with a second IRQ line.
By correctly assigning IRQ affinity, this feature can be
used to increase performance on dual core parts, like the
MPC8572E and P2020E.

The existence of the 2nd IRQ is determined from the device
node's interrupt property.  If present, the driver remaps
two of four channels, which in turn makes those channels
trigger their interrupts on the 2nd line instead of the first.
To handle single- and dual-IRQ combinations efficiently,
talitos gets two new interrupt handlers and back-half workers.

[includes a fix to MCR_LO's address.]

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
---
 drivers/crypto/talitos.c |  203 +++++++++++++++++++++++++++++++---------------
 drivers/crypto/talitos.h |   14 +++-
 2 files changed, 147 insertions(+), 70 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 7f82e91..92c0ca7 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -122,7 +122,7 @@ struct talitos_private {
 	struct device *dev;
 	struct platform_device *ofdev;
 	void __iomem *reg;
-	int irq;
+	int irq[2];
 
 	/* SEC version geometry (from device tree node) */
 	unsigned int num_channels;
@@ -146,7 +146,7 @@ struct talitos_private {
 	atomic_t last_chan ____cacheline_aligned;
 
 	/* request callback tasklet */
-	struct tasklet_struct done_task;
+	struct tasklet_struct done_task[2];
 
 	/* list of registered algorithms */
 	struct list_head alg_list;
@@ -226,13 +226,19 @@ static int reset_device(struct device *dev)
 {
 	struct talitos_private *priv = dev_get_drvdata(dev);
 	unsigned int timeout = TALITOS_TIMEOUT;
+	u32 mcr = TALITOS_MCR_SWR;
 
-	setbits32(priv->reg + TALITOS_MCR, TALITOS_MCR_SWR);
+	setbits32(priv->reg + TALITOS_MCR, mcr);
 
 	while ((in_be32(priv->reg + TALITOS_MCR) & TALITOS_MCR_SWR)
 	       && --timeout)
 		cpu_relax();
 
+	if (priv->irq[1] != NO_IRQ) {
+		mcr = TALITOS_MCR_RCA1 | TALITOS_MCR_RCA3;
+		setbits32(priv->reg + TALITOS_MCR, mcr);
+	}
+
 	if (timeout == 0) {
 		dev_err(dev, "failed to reset device\n");
 		return -EIO;
@@ -401,21 +407,32 @@ static void flush_channel(struct device *dev, int ch, int error, int reset_ch)
 /*
  * process completed requests for channels that have done status
  */
-static void talitos_done(unsigned long data)
-{
-	struct device *dev = (struct device *)data;
-	struct talitos_private *priv = dev_get_drvdata(dev);
-	int ch;
-
-	for (ch = 0; ch < priv->num_channels; ch++)
-		flush_channel(dev, ch, 0, 0);
-
-	/* At this point, all completed channels have been processed.
-	 * Unmask done interrupts for channels completed later on.
-	 */
-	setbits32(priv->reg + TALITOS_IMR, TALITOS_IMR_INIT);
-	setbits32(priv->reg + TALITOS_IMR_LO, TALITOS_IMR_LO_INIT);
+#define DEF_TALITOS_DONE(name, ch_done_mask)				\
+static void talitos_done_##name(unsigned long data)			\
+{									\
+	struct device *dev = (struct device *)data;			\
+	struct talitos_private *priv = dev_get_drvdata(dev);		\
+									\
+	if (ch_done_mask & 1)						\
+		flush_channel(dev, 0, 0, 0);				\
+	if (priv->num_channels == 1)					\
+		goto out;						\
+	if (ch_done_mask & (1 << 2))					\
+		flush_channel(dev, 1, 0, 0);				\
+	if (ch_done_mask & (1 << 4))					\
+		flush_channel(dev, 2, 0, 0);				\
+	if (ch_done_mask & (1 << 6))					\
+		flush_channel(dev, 3, 0, 0);				\
+									\
+out:									\
+	/* At this point, all completed channels have been processed */	\
+	/* Unmask done interrupts for channels completed later on. */	\
+	setbits32(priv->reg + TALITOS_IMR, ch_done_mask);		\
+	setbits32(priv->reg + TALITOS_IMR_LO, TALITOS_IMR_LO_INIT);	\
 }
+DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE)
+DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE)
+DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE)
 
 /*
  * locate current (offending) descriptor
@@ -584,7 +601,7 @@ static void talitos_error(unsigned long data, u32 isr, u32 isr_lo)
 			}
 		}
 	}
-	if (reset_dev || isr & ~TALITOS_ISR_CHERR || isr_lo) {
+	if (reset_dev || isr & ~TALITOS_ISR_4CHERR || isr_lo) {
 		dev_err(dev, "done overflow, internal time out, or rngu error: "
 		        "ISR 0x%08x_%08x\n", isr, isr_lo);
 
@@ -597,30 +614,35 @@ static void talitos_error(unsigned long data, u32 isr, u32 isr_lo)
 	}
 }
 
-static irqreturn_t talitos_interrupt(int irq, void *data)
-{
-	struct device *dev = data;
-	struct talitos_private *priv = dev_get_drvdata(dev);
-	u32 isr, isr_lo;
-
-	isr = in_be32(priv->reg + TALITOS_ISR);
-	isr_lo = in_be32(priv->reg + TALITOS_ISR_LO);
-	/* Acknowledge interrupt */
-	out_be32(priv->reg + TALITOS_ICR, isr);
-	out_be32(priv->reg + TALITOS_ICR_LO, isr_lo);
-
-	if (unlikely((isr & ~TALITOS_ISR_CHDONE) || isr_lo))
-		talitos_error((unsigned long)data, isr, isr_lo);
-	else
-		if (likely(isr & TALITOS_ISR_CHDONE)) {
-			/* mask further done interrupts. */
-			clrbits32(priv->reg + TALITOS_IMR, TALITOS_IMR_DONE);
-			/* done_task will unmask done interrupts at exit */
-			tasklet_schedule(&priv->done_task);
-		}
-
-	return (isr || isr_lo) ? IRQ_HANDLED : IRQ_NONE;
+#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, tlet)	       \
+static irqreturn_t talitos_interrupt_##name(int irq, void *data)	       \
+{									       \
+	struct device *dev = data;					       \
+	struct talitos_private *priv = dev_get_drvdata(dev);		       \
+	u32 isr, isr_lo;						       \
+									       \
+	isr = in_be32(priv->reg + TALITOS_ISR);				       \
+	isr_lo = in_be32(priv->reg + TALITOS_ISR_LO);			       \
+	/* Acknowledge interrupt */					       \
+	out_be32(priv->reg + TALITOS_ICR, isr & (ch_done_mask | ch_err_mask)); \
+	out_be32(priv->reg + TALITOS_ICR_LO, isr_lo);			       \
+									       \
+	if (unlikely((isr & ~TALITOS_ISR_4CHDONE) & ch_err_mask || isr_lo))    \
+		talitos_error((unsigned long)data, isr, isr_lo);	       \
+	else								       \
+		if (likely(isr & ch_done_mask)) {			       \
+			/* mask further done interrupts. */		       \
+			clrbits32(priv->reg + TALITOS_IMR, ch_done_mask);      \
+			/* done_task will unmask done interrupts at exit */    \
+			tasklet_schedule(&priv->done_task[tlet]);	       \
+		}							       \
+									       \
+	return (isr & (ch_done_mask | ch_err_mask) || isr_lo) ? IRQ_HANDLED :  \
+								IRQ_NONE;      \
 }
+DEF_TALITOS_INTERRUPT(4ch, TALITOS_ISR_4CHDONE, TALITOS_ISR_4CHERR, 0)
+DEF_TALITOS_INTERRUPT(ch0_2, TALITOS_ISR_CH_0_2_DONE, TALITOS_ISR_CH_0_2_ERR, 0)
+DEF_TALITOS_INTERRUPT(ch1_3, TALITOS_ISR_CH_1_3_DONE, TALITOS_ISR_CH_1_3_ERR, 1)
 
 /*
  * hwrng
@@ -2558,12 +2580,15 @@ static int talitos_remove(struct platform_device *ofdev)
 
 	kfree(priv->chan);
 
-	if (priv->irq != NO_IRQ) {
-		free_irq(priv->irq, dev);
-		irq_dispose_mapping(priv->irq);
-	}
+	for (i = 0; i < 2; i++)
+		if (priv->irq[i] != NO_IRQ) {
+			free_irq(priv->irq[i], dev);
+			irq_dispose_mapping(priv->irq[i]);
+		}
 
-	tasklet_kill(&priv->done_task);
+	tasklet_kill(&priv->done_task[0]);
+	if (priv->irq[1] != NO_IRQ)
+		tasklet_kill(&priv->done_task[1]);
 
 	iounmap(priv->reg);
 
@@ -2628,6 +2653,54 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 	return t_alg;
 }
 
+static int talitos_probe_irq(struct platform_device *ofdev)
+{
+	struct device *dev = &ofdev->dev;
+	struct device_node *np = ofdev->dev.of_node;
+	struct talitos_private *priv = dev_get_drvdata(dev);
+	int err;
+
+	priv->irq[0] = irq_of_parse_and_map(np, 0);
+	if (priv->irq[0] == NO_IRQ) {
+		dev_err(dev, "failed to map irq\n");
+		return -EINVAL;
+	}
+
+	priv->irq[1] = irq_of_parse_and_map(np, 1);
+
+	/* get the primary irq line */
+	if (priv->irq[1] == NO_IRQ) {
+		err = request_irq(priv->irq[0], talitos_interrupt_4ch, 0,
+				  dev_driver_string(dev), dev);
+		goto primary_out;
+	}
+
+	err = request_irq(priv->irq[0], talitos_interrupt_ch0_2, 0,
+			  dev_driver_string(dev), dev);
+	if (err)
+		goto primary_out;
+
+	/* get the secondary irq line */
+	err = request_irq(priv->irq[1], talitos_interrupt_ch1_3, 0,
+			  dev_driver_string(dev), dev);
+	if (err) {
+		dev_err(dev, "failed to request secondary irq\n");
+		irq_dispose_mapping(priv->irq[1]);
+		priv->irq[1] = NO_IRQ;
+	}
+
+	return err;
+
+primary_out:
+	if (err) {
+		dev_err(dev, "failed to request primary irq\n");
+		irq_dispose_mapping(priv->irq[0]);
+		priv->irq[0] = NO_IRQ;
+	}
+
+	return err;
+}
+
 static int talitos_probe(struct platform_device *ofdev)
 {
 	struct device *dev = &ofdev->dev;
@@ -2644,28 +2717,22 @@ static int talitos_probe(struct platform_device *ofdev)
 
 	priv->ofdev = ofdev;
 
-	tasklet_init(&priv->done_task, talitos_done, (unsigned long)dev);
-
-	INIT_LIST_HEAD(&priv->alg_list);
-
-	priv->irq = irq_of_parse_and_map(np, 0);
-
-	if (priv->irq == NO_IRQ) {
-		dev_err(dev, "failed to map irq\n");
-		err = -EINVAL;
+	err = talitos_probe_irq(ofdev);
+	if (err)
 		goto err_out;
-	}
 
-	/* get the irq line */
-	err = request_irq(priv->irq, talitos_interrupt, 0,
-			  dev_driver_string(dev), dev);
-	if (err) {
-		dev_err(dev, "failed to request irq %d\n", priv->irq);
-		irq_dispose_mapping(priv->irq);
-		priv->irq = NO_IRQ;
-		goto err_out;
+	if (priv->irq[1] == NO_IRQ) {
+		tasklet_init(&priv->done_task[0], talitos_done_4ch,
+			     (unsigned long)dev);
+	} else {
+		tasklet_init(&priv->done_task[0], talitos_done_ch0_2,
+			     (unsigned long)dev);
+		tasklet_init(&priv->done_task[1], talitos_done_ch1_3,
+			     (unsigned long)dev);
 	}
 
+	INIT_LIST_HEAD(&priv->alg_list);
+
 	priv->reg = of_iomap(np, 0);
 	if (!priv->reg) {
 		dev_err(dev, "failed to of_iomap\n");
@@ -2713,9 +2780,11 @@ static int talitos_probe(struct platform_device *ofdev)
 		goto err_out;
 	}
 
-	for (i = 0; i < priv->num_channels; i++)
-		priv->chan[i].reg = priv->reg + TALITOS_CH_BASE_OFFSET +
-				    TALITOS_CH_STRIDE * (i + 1);
+	for (i = 0; i < priv->num_channels; i++) {
+		priv->chan[i].reg = priv->reg + TALITOS_CH_STRIDE * (i + 1);
+		if ((priv->irq[1] == NO_IRQ) || !(i & 1))
+			priv->chan[i].reg += TALITOS_CH_BASE_OFFSET;
+	}
 
 	for (i = 0; i < priv->num_channels; i++) {
 		spin_lock_init(&priv->chan[i].head_lock);
diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
index 3ed319d..3c17395 100644
--- a/drivers/crypto/talitos.h
+++ b/drivers/crypto/talitos.h
@@ -34,16 +34,24 @@
 
 /* global register offset addresses */
 #define TALITOS_MCR			0x1030  /* master control register */
-#define TALITOS_MCR_LO			0x1038
+#define   TALITOS_MCR_RCA0		(1 << 15) /* remap channel 0 */
+#define   TALITOS_MCR_RCA1		(1 << 14) /* remap channel 1 */
+#define   TALITOS_MCR_RCA2		(1 << 13) /* remap channel 2 */
+#define   TALITOS_MCR_RCA3		(1 << 12) /* remap channel 3 */
 #define   TALITOS_MCR_SWR		0x1     /* s/w reset */
+#define TALITOS_MCR_LO			0x1034
 #define TALITOS_IMR			0x1008  /* interrupt mask register */
 #define   TALITOS_IMR_INIT		0x100ff /* enable channel IRQs */
 #define   TALITOS_IMR_DONE		0x00055 /* done IRQs */
 #define TALITOS_IMR_LO			0x100C
 #define   TALITOS_IMR_LO_INIT		0x20000 /* allow RNGU error IRQs */
 #define TALITOS_ISR			0x1010  /* interrupt status register */
-#define   TALITOS_ISR_CHERR		0xaa    /* channel errors mask */
-#define   TALITOS_ISR_CHDONE		0x55    /* channel done mask */
+#define   TALITOS_ISR_4CHERR		0xaa    /* 4 channel errors mask */
+#define   TALITOS_ISR_4CHDONE		0x55    /* 4 channel done mask */
+#define   TALITOS_ISR_CH_0_2_ERR	0x22    /* channels 0, 2 errors mask */
+#define   TALITOS_ISR_CH_0_2_DONE	0x11    /* channels 0, 2 done mask */
+#define   TALITOS_ISR_CH_1_3_ERR	0x88    /* channels 1, 3 errors mask */
+#define   TALITOS_ISR_CH_1_3_DONE	0x44    /* channels 1, 3 done mask */
 #define TALITOS_ISR_LO			0x1014
 #define TALITOS_ICR			0x1018  /* interrupt clear register */
 #define TALITOS_ICR_LO			0x101C
-- 
1.7.7

^ permalink raw reply related

* Re: [PATCH v2 4/7] powerpc/85xx: add support to JOG feature using cpufreq interface
From: Scott Wood @ 2011-11-17  0:17 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: linuxppc-dev
In-Reply-To: <1321437344-19253-4-git-send-email-chenhui.zhao@freescale.com>

On 11/16/2011 03:55 AM, Zhao Chenhui wrote:
> From: Li Yang <leoli@freescale.com>
> 
> Some 85xx silicons like MPC8536 and P1022 has the JOG PM feature.

P1023 as well -- any plan to support?

I see this in the p1022 and mpc8536 manuals:

> The system operates as if a request to enter sleep mode has occurred, with the exception that the
> values written into the PMCDR register (clock disable register for sleep/ deep sleep modes) are
> ignored, and it is treated as if every bit in PMCDR is a logic 1. This means that the eTSECs, USB
> controllers, DDR and eLBC will be stopped.

...which doesn't sound good.

> The patch adds the support to change CPU frequency using the standard
> cpufreq interface. Add the all PLL ratio core support. The ratio CORE
> to CCB can 1:1(except MPC8536), 3:2, 2:1, 5:2, 3:1, 7:2 and 4:1.

The ratios supported are implementation-specific.  Only p1022 supports
1:1.  p1023 supports only 3:2, 2:1, 5:2, and 3:1 (assuming the
preliminary manual I have is accurate).

> +	local_irq_save(flags);
> +	/*
> +	 * A Jog request can not be asserted when any core is in a low power
> +	 * state. Before executing a jog request, any core which is in
> +	 * a low power state must be waked by a interrupt.
> +	 */
> +	if (mpc85xx_freqs == p1022_freqs_table) {
> +		powersave = ppc_md.power_save;
> +		ppc_md.power_save = NULL;
> +		wmb();
> +		val = in_be32(guts + POWMGTCSR);
> +		for_each_online_cpu(i) {
> +			if (val & ((POWMGTCSR_CORE0_DOZING |
> +					POWMGTCSR_CORE0_NAPPING) << (i * 2)))
> +				smp_send_reschedule(i);
> +		}
> +	}

This is racy, what if another core read ppc_md.power_save just before
you wrote NULL, but hasn't yet entered a low power state?

You should send a reschedule to all cores regardless of what you see in
POWMGTCSR.

The p1022 also says that MSR[EE] should be zero -- it is on this core,
but what about the other?

> +	setbits32(guts + POWMGTCSR, POWMGTCSR_JOG_MASK);

This might work on p1022, but don't you have to go through a core reset
on mpc8536?  In that case, you can't just set the bit, you have to go
through the deep sleep code to save/restore state.

P1022 also says, "Mask all the interrupts to the cores by setting the
bits CORE_UDE_MSK, CORE_MCP_MSK, CORE_INT_MSK and CORE_CINT_MSK in the
POWMGTCSR," which I don't see happening.

Though, this directly contradicts where it later says, "The user must
not issue a jog request at the same time as issuing a request
for another low power mode, or while the system is in the process of
entering a low power mode. This means that a jog request must not be
asserted when any other bit of POWMGTCSR is non-zero. If the user tries
to do this, the jog request is ignored."

POWMGTCSR must be zero except for the JOG bit, but you must set other
POWMGTCSR bits.  Lovely. :-P  I assume that the "This means..."
statement is just wrong, and you really are supposed to set those other
bits.  P1023 refines the statement to, "This means that POWMGTCSR[JOG]
must not be asserted when any of the other power management request bits
(COREn_DOZ, SLP) in POWMGTCSR are set."

> +	if (powersave) {
> +		ppc_md.power_save = powersave;
> +		wmb();
> +	}

How do you know the jog has happened at this point?  Just because you've
issued a store that requests it doesn't mean it has taken effect by the
time you execute the next instruction.

> +	local_irq_restore(flags);
> +
> +	/* verify */
> +	if (!spin_event_timeout(get_pll(hw_cpu) == pll, 10000, 10)) {
> +		pr_err("%s: Fail to switch the core frequency. "
> +			"The current PLL of core %d is %d instead of %d.\n",
> +				__func__, hw_cpu, get_pll(hw_cpu), pll);
> +		ret = -EINVAL;
> +	}

Shouldn't the pll be where it's supposed to be as soon as we resume
execution?  I don't see a need to spin here, provided we properly wait
for the jog to happen earlier (which we want to do so that we don't
enable power_save and EE early).

> +static int mpc85xx_cpufreq_target(struct cpufreq_policy *policy,
> +			      unsigned int target_freq,
> +			      unsigned int relation)
> +{
> +	struct cpufreq_freqs freqs;
> +	unsigned int new;
> +	int ret = 0;
> +
> +	cpufreq_frequency_table_target(policy,
> +				       mpc85xx_freqs,
> +				       target_freq,
> +				       relation,
> +				       &new);
> +
> +	freqs.old = policy->cur;
> +	freqs.new = mpc85xx_freqs[new].frequency;
> +	freqs.cpu = policy->cpu;
> +
> +	mutex_lock(&mpc85xx_switch_mutex);
> +	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
> +
> +	ret = set_pll(policy->cpu, mpc85xx_freqs[new].index);
> +	if (!ret) {
> +		pr_info("cpufreq: Setting core%d frequency to %d kHz and " \
> +			 "PLL ratio to %d:2\n",
> +			 policy->cpu,
> +			 mpc85xx_freqs[new].frequency,
> +			 mpc85xx_freqs[new].index);
> +
> +		ppc_proc_freq = freqs.new * 1000ul;
> +	}
> +	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
> +	mutex_unlock(&mpc85xx_switch_mutex);

I still do not understand what sense it makes to set a global variable
(ppc_proc_freq) to the frequency of a specific CPU.

> +static int mpc85xx_job_probe(struct platform_device *ofdev)
> +{
> +	struct device_node *np = ofdev->dev.of_node;
> +
> +	if (of_device_is_compatible(np, "fsl,mpc8536-guts")) {
> +		threshold_freq = FREQ_800MHz;
> +		mpc85xx_freqs = mpc8536_freqs_table;
> +	} else if (of_device_is_compatible(np, "fsl,p1022-guts")) {
> +		threshold_freq = FREQ_533MHz;
> +		mpc85xx_freqs = p1022_freqs_table;
> +	}

Maybe use .data in the of_device_id table, similar to
arch/powerpc/platforms/83xx/suspend.c?  Though it's slightly less
convenient now that we need to call of_match_device() again in order to
get a match pointer.

-Scott

^ permalink raw reply

* [RFC PATCH 11/11] KVM: PPC: Eliminate global spinlock in kvmppc_h_enter
From: Paul Mackerras @ 2011-11-16 23:55 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf, kvm
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

From dfd5bcfac841f8a36593edf60d9fb15e0d633287 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 14 Nov 2011 13:30:38 +1100
Subject: 

Currently, kvmppc_h_enter takes a spinlock that is global to the guest,
kvm->mmu_lock, in order to check for pending PTE invalidations safely.
On some workloads, kvmppc_h_enter is called heavily and the use of a
global spinlock could compromise scalability.  We already use a
per-guest-page spinlock in the form of the bit spinlock on the rmap chain,
and this gives us synchronization with the PTE invalidation side, which
also takes the bit spinlock on the rmap chain for each page being
invalidated.  Thus it is sufficient to check for pending invalidations
while the rmap chain bit spinlock is held.  However, now we require
barriers in mmu_notifier_retry() and in the places where
mmu_notifier_count and mmu_notifier_seq are updated, since we can now
call mmu_notifier_retry() concurrently with updates to those fields.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
Cc'd to kvm@vger.kernel.org for review of the generic kvm changes.

 arch/powerpc/include/asm/kvm_book3s_64.h |   13 +++++
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |   19 ++++----
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |   75 ++++++++++++-----------------
 include/linux/kvm_host.h                 |   13 +++--
 virt/kvm/kvm_main.c                      |    4 ++
 5 files changed, 66 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 3745337..db6cbd5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -161,4 +161,17 @@ static inline unsigned long kvmppc_read_update_linux_pte(pte_t *p)
 	return pfn;
 }
 
+static inline void lock_rmap(unsigned long *rmap)
+{
+	do {
+		while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap))
+			cpu_relax();
+	} while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap));
+}
+
+static inline void unlock_rmap(unsigned long *rmap)
+{
+	__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap);
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8c497b8..bb75bfb 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -611,12 +611,6 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		goto out_put;
 	pfn = page_to_pfn(page);
 
-	/* Check if we might have been invalidated; let the guest retry if so */
-	ret = RESUME_GUEST;
-	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
-		goto out_unlock;
-
 	/* Set the HPTE to point to pfn */
 	ret = RESUME_GUEST;
 	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
@@ -627,19 +621,26 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	    rev->guest_rpte != hpte[2]) {
 		/* HPTE has been changed under us; let the guest retry */
 		hptep[0] &= ~HPTE_V_HVLOCK;
-		goto out_unlock;
+		goto out_put;
 	}
 	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 	hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
 		(pfn << PAGE_SHIFT);
 	rmap = &memslot->rmap[gfn - memslot->base_gfn];
+	lock_rmap(rmap);
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	ret = RESUME_GUEST;
+	if (mmu_notifier_retry(vcpu, mmu_seq)) {
+		unlock_rmap(rmap);
+		hptep[0] &= ~HPTE_V_HVLOCK;
+		goto out_put;
+	}
 	kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 	kvmppc_modify_hpte(kvm, hptep, hpte, index);
 	if (page)
 		SetPageDirty(page);
 
- out_unlock:
-	spin_unlock(&kvm->mmu_lock);
  out_put:
 	if (page)
 		put_page(page);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2cadd06..4070920 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -57,22 +57,16 @@ static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
 	return NULL;
 }
 
-static void lock_rmap(unsigned long *rmap)
-{
-	do {
-		while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap))
-			cpu_relax();
-	} while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap));
-}
-
-/* Add this HPTE into the chain for the real page */
+/*
+ * Add this HPTE into the chain for the real page.
+ * Must be called with the chain locked; it unlocks the chain.
+ */
 void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 			     unsigned long *rmap, long pte_index, int realmode)
 {
 	struct revmap_entry *head, *tail;
 	unsigned long i;
 
-	lock_rmap(rmap);
 	if (*rmap & KVMPPC_RMAP_PRESENT) {
 		i = *rmap & KVMPPC_RMAP_INDEX;
 		head = &kvm->arch.revmap[i];
@@ -125,7 +119,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 		else
 			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
 	}
-	__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap);
+	unlock_rmap(rmap);
 }
 
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
@@ -218,38 +212,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			return H_PARAMETER;
 	}
 
-	/*
-	 * Now that we're about to write the HPTE and thus give the guest
-	 * access to the page, check for any pending invalidations.
-	 * We don't need to worry about that if this is a non-present page.
-	 * Note that the HPTE bitlock has to nest inside the kvm->mmu_lock.
-	 */
-	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
-		/* inval in progress, write a non-present HPTE */
-		pa = 0;
-
-	err = H_PARAMETER;
-	if (!pa) {
-		/*
-		 * If this is a non-present page for any reason
-		 * and this is a POWER7, set the key to 31 and set N.
-		 * If this is a page which could be accessed in real mode
-		 * using VRMA (which ignores page class keys) we have
-		 * to make it invalid instead.
-		 * On 970 we have to have all pages present.
-		 */
-		if (!cpu_has_feature(CPU_FTR_ARCH_206))
-			goto out;
-		pteh |= HPTE_V_ABSENT;
-		if ((pteh & 0xffffffffff000000ul) ==
-		    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
-			pteh &= ~HPTE_V_VALID;
-		else
-			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
-	}
-
 	/* Find and lock the HPTEG slot to use */
+	err = H_PARAMETER;
 	if (pte_index >= HPT_NPTE)
 		goto out;
 	err = H_PTEG_FULL;
@@ -281,7 +245,31 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	/* Link HPTE into reverse-map chain */
 	if (pa) {
 		rmap = real_vmalloc_addr(rmap);
-		kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, 1);
+		lock_rmap(rmap);
+		/* Check for pending invalidations under the rmap chain lock */
+		if (mmu_notifier_retry(vcpu, mmu_seq)) {
+			/* inval in progress, write a non-present HPTE */
+			pa = 0;
+			unlock_rmap(rmap);
+		} else {
+			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, 1);
+		}
+	}
+
+	if (!pa) {
+		/*
+		 * If this is a non-present page for any reason
+		 * and this is a POWER7, set the key to 31 and set N.
+		 * If this is a page which could be accessed in real mode
+		 * using VRMA (which ignores page class keys) we have
+		 * to make it invalid instead.
+		 */
+		pteh |= HPTE_V_ABSENT;
+		if ((pteh & 0xffffffffff000000ul) ==
+		    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
+			pteh &= ~HPTE_V_VALID;
+		else
+			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
 
 	hpte[1] = ptel;
@@ -295,7 +283,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	err = H_SUCCESS;
 
  out:
-	spin_unlock(&kvm->mmu_lock);
 	return err;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c6a2ec9..9b5e61a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -672,12 +672,15 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 	if (unlikely(vcpu->kvm->mmu_notifier_count))
 		return 1;
 	/*
-	 * Both reads happen under the mmu_lock and both values are
-	 * modified under mmu_lock, so there's no need of smb_rmb()
-	 * here in between, otherwise mmu_notifier_count should be
-	 * read before mmu_notifier_seq, see
-	 * mmu_notifier_invalidate_range_end write side.
+	 * Ensure the read of mmu_notifier_count happens before the read
+	 * of mmu_notifier_seq.  This interacts with the smp_wmb() in
+	 * mmu_notifier_invalidate_range_end to make sure that the caller
+	 * either sees the old (non-zero) value of mmu_notifier_count or
+	 * the new (incremented) value of mmu_notifier_seq.
+	 * PowerPC Book3s HV KVM calls this under a per-page lock
+	 * rather than under kvm->mmu_lock, for scalability.
 	 */
+	smp_rmb();
 	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
 		return 1;
 	return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d9cfb78..95081f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -290,6 +290,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	kvm->mmu_notifier_seq++;
+	smp_wmb();
 	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -311,6 +312,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	kvm->mmu_notifier_seq++;
+	smp_wmb();
 	kvm_set_spte_hva(kvm, address, pte);
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -332,6 +334,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
+	smp_wmb();
 	for (; start < end; start += PAGE_SIZE)
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
 	need_tlb_flush |= kvm->tlbs_dirty;
@@ -357,6 +360,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * been freed.
 	 */
 	kvm->mmu_notifier_seq++;
+	smp_wmb();
 	/*
 	 * The above sequence increase must be visible before the
 	 * below count decrease but both values are read by the kvm
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 10/11] KVM: PPC: Implement MMU notifiers
From: Paul Mackerras @ 2011-11-16 23:52 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This implements the low-level functions called by the MMU notifiers in
the generic KVM code, and defines KVM_ARCH_WANT_MMU_NOTIFIER if
CONFIG_KVM_BOOK3S_64_HV so that the generic KVM MMU notifiers get
included.

That means we also have to take notice of when PTE invalidations are
in progress, as indicated by mmu_notifier_retry().  In kvmppc_h_enter,
if any invalidation is in progress we just install a non-present HPTE.
In kvmppc_book3s_hv_page_fault, if an invalidation is in progress we
just return without resolving the fault, causing the guest to encounter
another page fault immediately.  This is better than spinning inside
kvmppc_book3s_hv_page_fault because this way the guest can get preempted
by a hypervisor decrementer interrupt without us having to do any
special checks.

We currently maintain a referenced bit in the rmap array, and when we
clear it, we make all the HPTEs that map the corresponding page be
non-present, as if the page were invalidated.  In future we could use
the hardware reference bit in the guest HPT instead.

The kvm_set_spte_hva function is implemented as kvm_unmap_hva.  The
former appears to be unused anyway.

This all means that on processors that support virtual partition
memory (POWER7), we can claim support for the KVM_CAP_SYNC_MMU
capability, and we no longer have to pin all the guest memory.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h |   13 +++
 arch/powerpc/kvm/Kconfig            |    1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c |  160 ++++++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv.c        |   25 +++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   34 ++++++-
 arch/powerpc/kvm/powerpc.c          |    3 +
 6 files changed, 218 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3dfac3d..79bfc69 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,19 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+#include <linux/mmu_notifier.h>
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
+struct kvm;
+extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+#endif
+
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 78133de..8f64709 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -69,6 +69,7 @@ config KVM_BOOK3S_64
 config KVM_BOOK3S_64_HV
 	bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
 	depends on KVM_BOOK3S_64
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
 	  virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index e93c789..8c497b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -138,6 +138,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	hp1 = hpte1_pgsize_encoding(psize) |
 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
+	spin_lock(&kvm->mmu_lock);
+	/* wait until no invalidations are in progress */
+	while (kvm->mmu_notifier_count) {
+		spin_unlock(&kvm->mmu_lock);
+		while (kvm->mmu_notifier_count)
+			cpu_relax();
+		spin_lock(&kvm->mmu_lock);
+	}
+		
 	for (i = 0; i < npages; ++i) {
 		addr = i << porder;
 		if (pfns) {
@@ -185,6 +194,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 				KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT;
 		}
 	}
+	spin_unlock(&kvm->mmu_lock);
 }
 
 int kvmppc_mmu_hv_init(void)
@@ -506,7 +516,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_slb *slbe;
 	unsigned long *hptep, hpte[3];
-	unsigned long psize, pte_size;
+	unsigned long mmu_seq, psize, pte_size;
 	unsigned long gfn, hva, pfn, amr;
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
@@ -581,6 +591,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (kvm->arch.slot_pfns[memslot->id])
 		return -EFAULT;		/* should never get here */
 	hva = gfn_to_hva_memslot(memslot, gfn);
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
 	npages = get_user_pages_fast(hva, 1, 1, pages);
 	if (npages < 1)
 		return -EFAULT;
@@ -596,9 +611,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		goto out_put;
 	pfn = page_to_pfn(page);
 
+	/* Check if we might have been invalidated; let the guest retry if so */
+	ret = RESUME_GUEST;
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
+
 	/* Set the HPTE to point to pfn */
 	ret = RESUME_GUEST;
-	hptep = (unsigned long *)kvm->arch.hpt_virt + (index << 1);
+	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 	rev = &kvm->arch.revmap[index];
 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 		cpu_relax();
@@ -606,7 +627,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	    rev->guest_rpte != hpte[2]) {
 		/* HPTE has been changed under us; let the guest retry */
 		hptep[0] &= ~HPTE_V_HVLOCK;
-		goto out_put;
+		goto out_unlock;
 	}
 	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 	hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
@@ -617,6 +638,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (page)
 		SetPageDirty(page);
 
+ out_unlock:
+	spin_unlock(&kvm->mmu_lock);
  out_put:
 	if (page)
 		put_page(page);
@@ -635,6 +658,137 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return RESUME_GUEST;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long gfn))
+{
+	int i;
+	int ret;
+	int retval = 0;
+	struct kvm_memslots *slots;
+
+	slots = kvm_memslots(kvm);
+	for (i = 0; i < slots->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &slots->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
+			ret = handler(kvm, &memslot->rmap[gfn_offset],
+				      memslot->base_gfn + gfn_offset);
+			retval |= ret;
+		}
+	}
+
+	return retval;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long h, i, j;
+	unsigned long *hptep, new_hpte[2];
+	unsigned long ptel, psize;
+	int n = 0;
+
+	for (;;) {
+		while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+			cpu_relax();
+		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+			__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+			break;
+		}
+
+		/*
+		 * To avoid an ABBA deadlock with the HPTE lock bit,
+		 * we have to unlock the rmap chain before locking the HPTE.
+		 * Thus we remove the first entry, unlock the rmap chain,
+		 * lock the HPTE and then check that it is for the
+		 * page we're unmapping before changing it to non-present.
+		 */
+		i = *rmapp & KVMPPC_RMAP_INDEX;
+		j = rev[i].forw;
+		if (j == i) {
+			/* chain is now empty */
+			j = 0;
+		} else {
+			/* remove i from chain */
+			h = rev[i].back;
+			rev[h].forw = j;
+			rev[j].back = h;
+			rev[i].forw = rev[i].back = i;
+			j |= KVMPPC_RMAP_PRESENT;
+		}
+		smp_wmb();
+		*rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT);
+
+		/* Now lock, check and modify the HPTE */
+		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+		while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+			cpu_relax();
+		ptel = rev[i].guest_rpte;
+		psize = hpte_page_size(hptep[0], ptel);
+		if ((hptep[0] & HPTE_V_VALID) &&
+		    hpte_rpn(ptel, psize) == gfn) {
+			new_hpte[0] = hptep[0] | HPTE_V_ABSENT;
+			if ((new_hpte[0] & 0xffffffffff000000ul) ==
+			    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
+				new_hpte[0] &= ~HPTE_V_VALID;
+			new_hpte[1] = (ptel & ~(HPTE_R_PP0 - psize)) |
+				HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+			kvmppc_modify_hpte(kvm, hptep, new_hpte, i);
+			++n;
+		} else {
+			hptep[0] &= ~HPTE_V_HVLOCK;
+		}
+	}
+	return 0;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return 0;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			 unsigned long gfn)
+{
+	if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
+		return 0;
+	kvm_unmap_rmapp(kvm, rmapp, gfn);
+	while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+		cpu_relax();
+	__clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
+	__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+	return 1;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			      unsigned long gfn)
+{
+	return !!(*rmapp & KVMPPC_RMAP_REFERENCED);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 			    unsigned long *nb_ret)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 47053e9..9e67320 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1278,10 +1278,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-	pfns = vzalloc(npages * sizeof(unsigned long));
-	if (!pfns)
-		return -ENOMEM;
-	kvm->arch.slot_pfns[mem->slot] = pfns;
+	if (!cpu_has_feature(CPU_FTR_ARCH_206)) {
+		pfns = vzalloc(npages * sizeof(unsigned long));
+		if (!pfns)
+			return -ENOMEM;
+		kvm->arch.slot_pfns[mem->slot] = pfns;
+	}
 
 	return 0;
 
@@ -1305,12 +1307,14 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 		return;
 
 	pfns = kvm->arch.slot_pfns[mem->slot];
-	npages = mem->memory_size >> porder;
-	for (i = 0; i < npages; ++i) {
-		hva = mem->userspace_addr + (i << porder);
-		page = hva_to_page(hva);
-		if (page)
-			pfns[i] = page_to_pfn(page);
+	if (pfns) {
+		npages = mem->memory_size >> porder;
+		for (i = 0; i < npages; ++i) {
+			hva = mem->userspace_addr + (i << porder);
+			page = hva_to_page(hva);
+			if (page)
+				pfns[i] = page_to_pfn(page);
+		}
 	}
 
 	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
@@ -1384,6 +1388,7 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 				page = pfn_to_page(pfns[j]);
 				if (PageHuge(page))
 					page = compound_head(page);
+				SetPageDirty(page);
 				put_page(page);
 			}
 		}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 622bfcd..2cadd06 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -143,11 +143,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *rmap;
 	pte_t *ptep;
 	unsigned int shift;
+	unsigned long mmu_seq;
+	long err;
 
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
 
+	/* used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
 	/* Find the memslot (if any) for this address */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 	gfn = gpa >> PAGE_SHIFT;
@@ -212,6 +218,18 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			return H_PARAMETER;
 	}
 
+	/*
+	 * Now that we're about to write the HPTE and thus give the guest
+	 * access to the page, check for any pending invalidations.
+	 * We don't need to worry about that if this is a non-present page.
+	 * Note that the HPTE bitlock has to nest inside the kvm->mmu_lock.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		/* inval in progress, write a non-present HPTE */
+		pa = 0;
+
+	err = H_PARAMETER;
 	if (!pa) {
 		/*
 		 * If this is a non-present page for any reason
@@ -222,7 +240,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		 * On 970 we have to have all pages present.
 		 */
 		if (!cpu_has_feature(CPU_FTR_ARCH_206))
-			return H_PARAMETER;
+			goto out;
 		pteh |= HPTE_V_ABSENT;
 		if ((pteh & 0xffffffffff000000ul) ==
 		    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
@@ -231,14 +249,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
 
+	/* Find and lock the HPTEG slot to use */
 	if (pte_index >= HPT_NPTE)
-		return H_PARAMETER;
+		goto out;
+	err = H_PTEG_FULL;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		for (i = 0; ; ++i) {
 			if (i == 8)
-				return H_PTEG_FULL;
+				goto out;
 			if ((*hpte & HPTE_V_VALID) == 0 &&
 			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
 					  HPTE_V_ABSENT))
@@ -250,7 +270,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
 				   HPTE_V_ABSENT))
-			return H_PTEG_FULL;
+			goto out;
 	}
 
 	/* Save away the guest's idea of the second HPTE dword */
@@ -272,7 +292,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	asm volatile("ptesync" : : : "memory");
 
 	vcpu->arch.gpr[4] = pte_index;
-	return H_SUCCESS;
+	err = H_SUCCESS;
+
+ out:
+	spin_unlock(&kvm->mmu_lock);
+	return err;
 }
 
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 084d1c5..0f10a04 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -244,6 +244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		if (cpu_has_feature(CPU_FTR_ARCH_201))
 			r = 2;
 		break;
+	case KVM_CAP_SYNC_MMU:
+		r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+		break;
 #endif
 	default:
 		r = 0;
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 09/11] KVM: PPC: Maintain a doubly-linked list of guest HPTEs for each gfn
From: Paul Mackerras @ 2011-11-16 23:51 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This expands the reverse mapping array to contain two links for each
HPTE which are used to link together HPTEs that correspond to the
same guest logical page.  Each circular list of HPTEs is pointed to
by the rmap array entry for the guest logical page, which lives in
the relevant memslot.  Links are 32-bit HPT entry indexes rather than
full 64-bit pointers, to save space.  We use 3 of the remaining 32
bits in the rmap array entries as a lock bit, a referenced bit and
a present bit (the present bit is needed since HPTE index 0 is valid).
The bit lock for the rmap chain nests inside the HPTE lock bit.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |    2 +
 arch/powerpc/include/asm/kvm_host.h   |   17 ++++++-
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |    8 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   88 ++++++++++++++++++++++++++++++++-
 4 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index ac48438..8454a82 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -143,6 +143,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			unsigned long *rmap, long pte_index, int realmode);
 extern void kvmppc_modify_hpte(struct kvm *kvm, unsigned long *hptep,
 			unsigned long new_hpte[2], unsigned long pte_index);
 extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ababf17..3dfac3d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -179,12 +179,27 @@ struct kvmppc_slb {
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
- * (including the guest physical address of the mapping).
+ * (including the guest physical address of the mapping),
+ * plus forward and backward pointers in a doubly-linked ring
+ * of HPTEs that map the same host page.  The pointers in this
+ * ring are 32-bit HPTE indexes, to save space.
  */
 struct revmap_entry {
 	unsigned long guest_rpte;
+	unsigned int forw, back;
 };
 
+/*
+ * We use the top bit of each memslot->rmap entry as a lock bit, bit 33
+ * as a referenced flag and bit 32 as a present flag.  The bottom 32 bits
+ * are the index in the guest HPT of a HPTE that points to the page.
+ */
+#define KVMPPC_RMAP_LOCK_BIT	63
+#define KVMPPC_RMAP_REF_BIT	33
+#define KVMPPC_RMAP_REFERENCED	(1ul << KVMPPC_RMAP_REF_BIT)
+#define KVMPPC_RMAP_PRESENT	0x100000000ul
+#define KVMPPC_RMAP_INDEX	0xfffffffful
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 32c7d8c..e93c789 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -179,6 +179,11 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		/* Reverse map info */
 		rev = &kvm->arch.revmap[hash];
 		rev->guest_rpte = hp1 | addr;
+		if (pfn) {
+			rev->forw = rev->back = hash;
+			memslot->rmap[i << (porder - PAGE_SHIFT)] = hash |
+				KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT;
+		}
 	}
 }
 
@@ -504,6 +509,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned long psize, pte_size;
 	unsigned long gfn, hva, pfn, amr;
 	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
 	struct revmap_entry *rev;
 	struct page *page, *pages[1];
 	unsigned int pp, ok;
@@ -605,6 +611,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 	hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
 		(pfn << PAGE_SHIFT);
+	rmap = &memslot->rmap[gfn - memslot->base_gfn];
+	kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 	kvmppc_modify_hpte(kvm, hptep, hpte, index);
 	if (page)
 		SetPageDirty(page);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index b477e68..622bfcd 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -57,6 +57,77 @@ static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
 	return NULL;
 }
 
+static void lock_rmap(unsigned long *rmap)
+{
+	do {
+		while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap))
+			cpu_relax();
+	} while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap));
+}
+
+/* Add this HPTE into the chain for the real page */
+void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			     unsigned long *rmap, long pte_index, int realmode)
+{
+	struct revmap_entry *head, *tail;
+	unsigned long i;
+
+	lock_rmap(rmap);
+	if (*rmap & KVMPPC_RMAP_PRESENT) {
+		i = *rmap & KVMPPC_RMAP_INDEX;
+		head = &kvm->arch.revmap[i];
+		if (realmode)
+			head = real_vmalloc_addr(head);
+		tail = &kvm->arch.revmap[head->back];
+		if (realmode)
+			tail = real_vmalloc_addr(tail);
+		rev->forw = i;
+		rev->back = head->back;
+		tail->forw = pte_index;
+		head->back = pte_index;
+	} else {
+		rev->forw = rev->back = pte_index;
+		i = pte_index;
+	}
+	smp_wmb();
+	*rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+}
+EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
+
+/* Remove this HPTE from the chain for a real page */
+static void remove_revmap_chain(struct kvm *kvm, long pte_index,
+				unsigned long hpte_v)
+{
+	struct revmap_entry *rev, *next, *prev;
+	unsigned long gfn, ptel, head;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	ptel = rev->guest_rpte;
+	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
+	memslot = builtin_gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return;
+
+	rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
+	lock_rmap(rmap);
+
+	head = *rmap & KVMPPC_RMAP_INDEX;
+	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
+	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+	next->back = rev->back;
+	prev->forw = rev->forw;
+	if (head == pte_index) {
+		head = rev->forw;
+		if (head == pte_index)
+			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		else
+			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
+	}
+	__clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap);
+}
+
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		    long pte_index, unsigned long pteh, unsigned long ptel)
 {
@@ -69,6 +140,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct kvm_memory_slot *memslot;
 	unsigned long *pfnp, pte_size;
 	unsigned long is_io;
+	unsigned long *rmap;
 	pte_t *ptep;
 	unsigned int shift;
 
@@ -82,12 +154,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	memslot = builtin_gfn_to_memslot(kvm, gfn);
 	pa = 0;
 	is_io = 1;
+	rmap = NULL;
 	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
 		/* Check if the requested page fits entirely in the memslot. */
 		slot_fn = gfn - memslot->base_gfn;
 		if (slot_fn + (psize >> PAGE_SHIFT) > memslot->npages) 
 			return H_PARAMETER;
 		is_io = memslot->flags & KVM_MEMSLOT_IO;
+		rmap = &memslot->rmap[slot_fn];
 
 		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
 		if (pfnp) {
@@ -184,6 +258,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (rev)
 		rev->guest_rpte = g_ptel;
 
+	/* Link HPTE into reverse-map chain */
+	if (pa) {
+		rmap = real_vmalloc_addr(rmap);
+		kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, 1);
+	}
+
 	hpte[1] = ptel;
 
 	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
@@ -239,11 +319,14 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
 	vcpu->arch.gpr[5] = r = hpte[1];
 	rb = compute_tlbie_rb(v, r, pte_index);
+	if (!(v & HPTE_V_ABSENT))
+		remove_revmap_chain(kvm, pte_index, v);
+	smp_wmb();
 	hpte[0] = 0;
 	if (!(v & HPTE_V_VALID))
 		return H_SUCCESS;
 	if (!(flags & H_LOCAL)) {
-		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 			cpu_relax();
 		asm volatile("ptesync" : : : "memory");
 		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
@@ -315,6 +398,9 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
 		if (hp[0] & HPTE_V_VALID)
 			tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		if (!(hp[0] & HPTE_V_ABSENT))
+			remove_revmap_chain(kvm, pte_index, hp[0]);
+		smp_wmb();
 		hp[0] = 0;
 	}
 	if (n_inval == 0)
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 08/11] KVM: PPC: Add a page fault handler function
From: Paul Mackerras @ 2011-11-16 23:50 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This adds a kvmppc_book3s_hv_page_fault function that is capable of
handling the fault we get if the guest tries to access a non-present
page (one that we have marked with storage key 31 and no-execute),
and either doing MMIO emulation, or making the page resident and
rewriting the guest HPTE to point to it, if it is RAM.

We now call this for hypervisor instruction storage interrupts, and
for hypervisor data storage interrupts instead of the emulate-MMIO
function.  It can now be called for real-mode accesses through the
VRMA as well as virtual-mode accesses.

In order to identify non-present HPTEs, we use a second software-use
bit in the first dword of the HPTE, called HPTE_V_ABSENT.  We can't
just look for storage key 31 because non-present HPTEs for the VRMA
have to be actually invalid, as the storage key mechanism doesn't
operate in real mode.  Using this bit also means that we don't have
to restrict the guest from using key 31 any more.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h    |    6 +-
 arch/powerpc/include/asm/kvm_book3s_64.h |   11 ++-
 arch/powerpc/include/asm/kvm_host.h      |   30 ++--
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  259 +++++++++++++++++++++++-------
 arch/powerpc/kvm/book3s_hv.c             |   54 ++++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  121 ++++++++------
 6 files changed, 340 insertions(+), 141 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b5ee1ce..ac48438 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -121,7 +121,9 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
-extern int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
+			struct kvm_vcpu *vcpu, unsigned long addr,
+			unsigned long status);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -141,6 +143,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern void kvmppc_modify_hpte(struct kvm *kvm, unsigned long *hptep,
+			unsigned long new_hpte[2], unsigned long pte_index);
 extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
 			unsigned long *nb_ret);
 extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 307e649..3745337 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,6 +37,8 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 #define HPT_HASH_MASK	(HPT_NPTEG - 1)
 #endif
 
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
@@ -72,9 +74,11 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 
 /*
  * We use a lock bit in HPTE dword 0 to synchronize updates and
- * accesses to each HPTE.
+ * accesses to each HPTE, and another bit to indicate non-present
+ * HPTEs.
  */
 #define HPTE_V_HVLOCK	0x40UL
+#define HPTE_V_ABSENT	0x20UL
 
 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 {
@@ -106,6 +110,11 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 	return 0;				/* error */
 }
 
+static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
+{
+	return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
+}
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 static inline unsigned long *kvmppc_pfn_entry(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long gfn)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f211643..ababf17 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -162,6 +162,20 @@ struct kvmppc_rma_info {
 	atomic_t 	 use_count;
 };
 
+struct kvmppc_slb {
+	u64 esid;
+	u64 vsid;
+	u64 orige;
+	u64 origv;
+	bool valid	: 1;
+	bool Ks		: 1;
+	bool Kp		: 1;
+	bool nx		: 1;
+	bool large	: 1;	/* PTEs are 16MB */
+	bool tb		: 1;	/* 1TB segment */
+	bool class	: 1;
+};
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -184,6 +198,8 @@ struct kvm_arch {
 	unsigned long lpcr;
 	unsigned long rmor;
 	struct kvmppc_rma_info *rma;
+	struct kvmppc_slb vrma_slb;
+	unsigned long vrma_pgorder;
 	struct list_head spapr_tce_tables;
 	unsigned long *slot_pfns[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS];
 	int slot_page_order[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS];
@@ -251,20 +267,6 @@ struct kvmppc_mmu {
 	bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
 
-struct kvmppc_slb {
-	u64 esid;
-	u64 vsid;
-	u64 orige;
-	u64 origv;
-	bool valid	: 1;
-	bool Ks		: 1;
-	bool Kp		: 1;
-	bool nx		: 1;
-	bool large	: 1;	/* PTEs are 16MB */
-	bool tb		: 1;	/* 1TB segment */
-	bool class	: 1;
-};
-
 struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9c7e825..32c7d8c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -34,8 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 
-#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
-
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
 #define NR_LPIDS	(LPID_RSVD + 1)
@@ -125,6 +123,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	pfns = kvm->arch.slot_pfns[mem->slot];
 	porder = kvm->arch.slot_page_order[mem->slot];
 	psize = 1ul << porder;
+	kvm->arch.vrma_pgorder = porder;
 	npages = memslot->npages >> (porder - PAGE_SHIFT);
 
 	/* VRMA can't be > 1TB */
@@ -135,7 +134,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		npages = HPT_NPTEG;
 
 	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
-		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize) | HPTE_V_VALID;
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
 	hp1 = hpte1_pgsize_encoding(psize) |
 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
@@ -154,7 +153,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 			local_irq_enable();
 		}
 
-		if (!pfn) {
+		if (!pfn && !cpu_has_feature(CPU_FTR_ARCH_206)) {
 			pr_err("KVM: Couldn't find page for VRMA at %lx\n",
 			       addr);
 			break;
@@ -174,7 +173,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		hpte[1] = hp1 | (pfn << PAGE_SHIFT);
 		smp_wmb();
 		/* HPTE high word - virtual address, bolted, valid, large */
-		hpte[0] = hp0 | ((addr >> 16) & ~0x7fUL);
+		hpte[0] = hp0 | ((addr >> 16) & ~0x7fUL) |
+			(pfn ? HPTE_V_VALID : HPTE_V_ABSENT);
 
 		/* Reverse map info */
 		rev = &kvm->arch.revmap[hash];
@@ -216,10 +216,16 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
-							 gva_t eaddr)
+							 gva_t eaddr, bool data)
 {
 	u64 mask;
 	int i;
+	unsigned long xr;
+
+	xr = data ? MSR_DR : MSR_IR;
+	if (!(vcpu->arch.shregs.msr & xr))
+		/* real mode access, assume VRMA */
+		return &vcpu->kvm->arch.vrma_slb;
 
 	for (i = 0; i < vcpu->arch.slb_nr; i++) {
 		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
@@ -377,7 +383,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int index;
 
 	/* Get SLB entry */
-	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr, data);
 	if (!slbe)
 		return -EINVAL;
 
@@ -402,58 +408,14 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	return 0;
 }
 
-int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				  unsigned long gpa)
 {
-	struct kvm *kvm = vcpu->kvm;
-	struct kvmppc_slb *slbe;
-	unsigned long hpte[3];
+	int ret;
+	u32 last_inst;
 	unsigned long srr0 = kvmppc_get_pc(vcpu);
-	unsigned long ea = vcpu->arch.fault_dar;	
-	unsigned long gpa;
-	unsigned int pp, ok;
-	u32 last_inst, dsisr = vcpu->arch.fault_dsisr;
-	int index, ret = 0;
-
-	/*
-	 * Translate the access address.
-	 * If we can't find the HPTE, just return and re-execute the
-	 * instruction.
- 	 */
-	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea);
-	if (!slbe)
-		return RESUME_GUEST;
-	index = kvmppc_hv_find_hpte(kvm, ea, slbe, hpte);
-	if (index < 0)
-		return RESUME_GUEST;
-
-	/*
-	 * Check if this is a special HPTE (storage key = 31); if not then
-	 * this is just a key fault in the guest.
-	 */
-	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) !=
-	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
-		vcpu->arch.shregs.dsisr = dsisr;
-		vcpu->arch.shregs.dar = ea;
-		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
-		return RESUME_GUEST;
-	}
-
-	/* Check whether the attempted access was permitted */
-	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
-	ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] : pp_read_perm[pp];
-	if (!ok) {
-		vcpu->arch.shregs.dar = ea;
-		vcpu->arch.shregs.dsisr = (dsisr & DSISR_ISSTORE) |
-			DSISR_PROTFAULT;
-		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
-		return RESUME_GUEST;
-	}
 
-	/* Translate the logical address */
-	gpa = kvmppc_mmu_get_real_addr(hpte[0], hpte[2], ea);
-
-	/*
-	 * We try to load the last instruction.  We don't let
+	/* We try to load the last instruction.  We don't let
 	 * emulate_instruction do it as its failure mode is pretty bogus.
 	 * If we fail, we just return to the guest and try executing it again.
 	 */
@@ -475,11 +437,196 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	 * so right now we just do it badly and racily, but that will need
 	 * fixing
 	 */
+	/*
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime, at which
+	 * point performing the subsequent memory access on the old
+	 * physical address is a violation of the architecture and
+	 * a security hole.
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now
+	 */
 
 	vcpu->arch.paddr_accessed = gpa;
 	return kvmppc_emulate_mmio(run, vcpu);
 }
 
+/*
+ * Look for a VRMA hash entry.  This only looks in the last slot of
+ * the primary PTEG, and accepts VRMA entries that are absent and invalid.
+ */
+static int kvmppc_hv_find_vrma(struct kvm *kvm, unsigned long addr,
+			       unsigned long hpte[3])
+{
+	unsigned long v, r, gr;
+	unsigned long i, hash;
+	unsigned long *hp;
+	unsigned long mask, val;
+	unsigned long porder, psize;
+
+	porder = kvm->arch.vrma_pgorder;
+	psize = 1ul << porder;
+	i = addr >> porder;
+	addr &= ~(psize - 1);
+	hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+	hash = (hash << 3) + 7;
+	hp = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4));
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY | HPTE_V_LARGE;
+	val = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		((addr >> 16) & ~0x7fUL) | hpte0_pgsize_encoding(psize);
+	if ((hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    (hp[0] & mask) != val)
+		return -1;
+	while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = hp[0] & ~HPTE_V_HVLOCK;
+	r = hp[1];
+	gr = kvm->arch.revmap[hash].guest_rpte;
+	smp_wmb();
+	hp[0] = v;	/* unlock */
+	if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || (v & mask) != val)
+		return -1;
+	hpte[0] = v;
+	hpte[1] = r;
+	hpte[2] = gr;
+	return hash;
+}
+
+int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long *hptep, hpte[3];
+	unsigned long psize, pte_size;
+	unsigned long gfn, hva, pfn, amr;
+	struct kvm_memory_slot *memslot;
+	struct revmap_entry *rev;
+	struct page *page, *pages[1];
+	unsigned int pp, ok;
+	int index, ret, skey, npages;
+	bool data = vcpu->arch.trap == BOOK3S_INTERRUPT_H_DATA_STORAGE;
+	bool realmode = !(vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR));
+
+	/*
+	 * Translate the access address.
+	 * If we can't find the HPTE, just return and re-execute the
+	 * instruction.
+ 	 */
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea, data);
+	if (!slbe)
+		return RESUME_GUEST;
+	index = kvmppc_hv_find_hpte(kvm, ea, slbe, hpte);
+	/* if not found and real mode, look for an absent VRMA entry */
+	if (index < 0) {
+		if (!realmode)
+			return RESUME_GUEST;
+		index = kvmppc_hv_find_vrma(kvm, ea, hpte);
+		if (index < 0)
+			goto pass_to_guest;
+	}
+
+	/*
+	 * Check if this is a special HPTE (HPTE_V_ABSENT set); if not then
+	 * this is just a key fault or no-execute fault in the guest.
+	 * It could be that this was a special HPTE at the time of the
+	 * fault, but it has subsequently been turned into a normal HPTE
+	 * by another CPU, so check if the access should have been allowed.
+	 * If it should, just retry.
+	 */
+	if (!(hpte[0] & HPTE_V_ABSENT) && !realmode && data) {
+		skey = ((hpte[1] & HPTE_R_KEY_HI) >> 57) |
+			((hpte[1] & HPTE_R_KEY_LO) >> 9);
+		amr = vcpu->arch.amr << (2 * skey);
+		if (!(dsisr & DSISR_ISSTORE))
+			amr <<= 1;
+		if (amr & (1ul << 63))
+			goto pass_to_guest;
+	}
+
+	/* Check whether the attempted access was permitted */
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
+	if (data) {
+		ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] :
+			pp_read_perm[pp];
+		dsisr = (dsisr & DSISR_ISSTORE) | DSISR_PROTFAULT;
+	} else {
+		ok = pp_read_perm[pp] && (hpte[2] & (HPTE_R_N | HPTE_R_G)) == 0;
+	}
+	if (!ok)
+		goto pass_to_guest;
+	if (!(hpte[0] & HPTE_V_ABSENT))
+		return RESUME_GUEST;
+
+	/* Translate the logical address and get the page */
+	psize = hpte_page_size(hpte[0], hpte[1]);
+	gfn = hpte_rpn(hpte[2], psize);
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa);
+	}
+
+	if (kvm->arch.slot_pfns[memslot->id])
+		return -EFAULT;		/* should never get here */
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	npages = get_user_pages_fast(hva, 1, 1, pages);
+	if (npages < 1)
+		return -EFAULT;
+	page = pages[0];
+
+	pte_size = PAGE_SIZE;
+	if (PageHuge(page)) {
+		page = compound_head(page);
+		pte_size <<= compound_order(page);
+	}
+	ret = -EFAULT;
+	if (psize > pte_size)
+		goto out_put;
+	pfn = page_to_pfn(page);
+
+	/* Set the HPTE to point to pfn */
+	ret = RESUME_GUEST;
+	hptep = (unsigned long *)kvm->arch.hpt_virt + (index << 1);
+	rev = &kvm->arch.revmap[index];
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
+	    rev->guest_rpte != hpte[2]) {
+		/* HPTE has been changed under us; let the guest retry */
+		hptep[0] &= ~HPTE_V_HVLOCK;
+		goto out_put;
+	}
+	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+	hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
+		(pfn << PAGE_SHIFT);
+	kvmppc_modify_hpte(kvm, hptep, hpte, index);
+	if (page)
+		SetPageDirty(page);
+
+ out_put:
+	if (page)
+		put_page(page);
+	return ret;
+
+ pass_to_guest:
+	/* Pass the interrupt along to the guest */
+	if (data) {
+		vcpu->arch.shregs.dsisr = dsisr;
+		vcpu->arch.shregs.dar = ea;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+	} else {
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					vcpu->arch.shregs.msr & 0x78000000);
+	}
+	return RESUME_GUEST;
+}
+
 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 			    unsigned long *nb_ret)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ceb49d2..47053e9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -348,12 +348,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * as we have enabled VRMA (virtualized real mode area) mode in the
 	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
 	 *
-	 * We also get them for MMIO emulation via key faults
+	 * We also get them if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresponding
+	 * host page has been paged out.
 	 */
 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
-		/* We attempt MMIO emulation for key faults */
-		if (vcpu->arch.fault_dsisr & DSISR_KEYFAULT) {
-			r = kvmppc_book3s_hv_emulate_mmio(run, vcpu);
+		if ((vcpu->arch.fault_dsisr & DSISR_KEYFAULT) ||
+		    !(vcpu->arch.shregs.msr & MSR_DR)) {
+			r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 			break;
 		}
 		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
@@ -362,6 +366,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		if ((vcpu->arch.shregs.msr & SRR1_ISI_N_OR_G) ||
+		    !(vcpu->arch.shregs.msr & MSR_IR)) {
+			r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				kvmppc_get_pc(vcpu), 0);
+			break;
+		}
 		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
 					vcpu->arch.shregs.msr & 0x78000000);
 		r = RESUME_GUEST;
@@ -1108,6 +1118,18 @@ static struct page *hva_to_page(unsigned long addr)
 	return page[0];
 }
 
+static unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+	unsigned long senc = 0;
+
+	if (psize > 0x1000) {
+		senc = SLB_VSID_L;
+		if (psize == 0x10000)
+			senc |= SLB_VSID_LP_01;
+	}
+	return senc;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
@@ -1117,7 +1139,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	struct kvmppc_rma_info *ri = NULL;
 	struct vm_area_struct *vma;
 	unsigned long pfn;
-	unsigned long lpcr;
+	unsigned long lpcr, senc;
 	unsigned long *pfns = NULL;
 
 	npages = mem->memory_size >> PAGE_SHIFT;
@@ -1207,18 +1229,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		if (!(psize == 0x1000 || psize == 0x1000000 ||
 		      (psize == 0x10000 && cpu_has_feature(CPU_FTR_ARCH_206))))
 			goto err;
-		lpcr = kvm->arch.lpcr;
-		switch (porder) {
-		case 12:
-			lpcr &= ~(LPCR_VRMA_L);
-			break;
-		case 16:
-			lpcr |= (LPCR_VRMA_L | LPCR_VRMA_LP1);
-			break;
-		case 24:
-			lpcr |= LPCR_VRMA_L;
-			break;
-		}
+
+		senc = slb_pgsize_encoding(psize);
+		kvm->arch.vrma_slb.origv = senc | SLB_VSID_B_1T |
+			(VRMA_VSID << SLB_VSID_SHIFT_1T);
+		lpcr = kvm->arch.lpcr & ~(0x1fUL << LPCR_VRMASD_SH);
+		lpcr |= senc << (LPCR_VRMASD_SH - 4);
 		kvm->arch.lpcr = lpcr;
 	}
 
@@ -1262,7 +1278,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-
 	pfns = vzalloc(npages * sizeof(unsigned long));
 	if (!pfns)
 		return -ENOMEM;
@@ -1337,6 +1352,9 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		 *     only upon instruction from qemu... 
 		 */
 		lpcr |= LPCR_VPM1;
+		kvm->arch.vrma_slb.orige = SLB_ESID_V;
+		kvm->arch.vrma_slb.origv = SLB_VSID_B_1T | SLB_VSID_L |
+			(VRMA_VSID << SLB_VSID_SHIFT_1T);
 	}
 	kvm->arch.lpcr = lpcr;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 1778091..b477e68 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -75,27 +76,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (!psize)
 		return H_PARAMETER;
 
-	/*
-	 * We do not allow the guest to set key 31 which is reserved
-	 * for MMIO emulation and non-present RAM pages.  We don't want
-	 * to allow MMIO emulation to be used to access RAM due to possible
-	 * races between emulation and TLB invalidations.
-	 *
-	 * Emulated accesses are emulated by looking at the hash for
-	 * translation once, then performing the access later. The
-	 * translation could be invalidated in the meantime in which
-	 * point performing the subsequent memory access on the old
-	 * physical address is a violation of the architecture and
-	 * a security hole.
-	 *
-	 * This is less of an issue for MMIO stores since they aren't
-	 * globally visible. It could be an issue for MMIO loads to
-	 * a certain extent but we'll ignore it for now
-	 */
-	if ((ptel & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
-	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
-		return H_PARAMETER;
-
 	/* Find the memslot (if any) for this address */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 	gfn = gpa >> PAGE_SHIFT;
@@ -162,11 +142,19 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		/*
 		 * If this is a non-present page for any reason
 		 * and this is a POWER7, set the key to 31 and set N.
+		 * If this is a page which could be accessed in real mode
+		 * using VRMA (which ignores page class keys) we have
+		 * to make it invalid instead.
 		 * On 970 we have to have all pages present.
 		 */
 		if (!cpu_has_feature(CPU_FTR_ARCH_206))
 			return H_PARAMETER;
-		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+		pteh |= HPTE_V_ABSENT;
+		if ((pteh & 0xffffffffff000000ul) ==
+		    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
+			pteh &= ~HPTE_V_VALID;
+		else
+			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
 
 	if (pte_index >= HPT_NPTE)
@@ -178,14 +166,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			if (i == 8)
 				return H_PTEG_FULL;
 			if ((*hpte & HPTE_V_VALID) == 0 &&
-			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+					  HPTE_V_ABSENT))
 				break;
 			hpte += 2;
 		}
 		pte_index += i;
 	} else {
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+				   HPTE_V_ABSENT))
 			return H_PTEG_FULL;
 	}
 
@@ -238,7 +228,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
@@ -250,6 +240,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	vcpu->arch.gpr[5] = r = hpte[1];
 	rb = compute_tlbie_rb(v, r, pte_index);
 	hpte[0] = 0;
+	if (!(v & HPTE_V_VALID))
+		return H_SUCCESS;
 	if (!(flags & H_LOCAL)) {
 		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
 			cpu_relax();
@@ -298,7 +290,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 		while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
 			cpu_relax();
 		found = 0;
-		if (hp[0] & HPTE_V_VALID) {
+		if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
 			switch (flags & 3) {
 			case 0:		/* absolute */
 				found = 1;
@@ -321,7 +313,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 		/* insert R and C bits from PTE */
 		flags |= (hp[1] >> 5) & 0x0c;
 		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
-		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		if (hp[0] & HPTE_V_VALID)
+			tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
 		hp[0] = 0;
 	}
 	if (n_inval == 0)
@@ -356,14 +349,11 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 
 	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
-	/* Don't let it set a normal memory page to key 31 */
-	if (((flags >> 9) & 0x1f) == 0x1f)
-		return H_PARAMETER;
 
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
 		return H_NOT_FOUND;
@@ -386,9 +376,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		rev->guest_rpte = r;
 	}
 
- 	/* Don't let guest remove N or key from emulated MMIO pages */
-	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == 
-	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
+ 	/* Don't let guest remove N or key from non-present pages */
+	if (hpte[0] & HPTE_V_ABSENT)
 		mask = HPTE_R_PP0 | HPTE_R_PP;
 	else
 		mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
@@ -396,20 +385,22 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	r = (hpte[1] & ~mask) | (bits & mask);
 
 	/* Update HPTE */
-	rb = compute_tlbie_rb(v, r, pte_index);
-	hpte[0] = v & ~HPTE_V_VALID;
-	if (!(flags & H_LOCAL)) {
-		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-			cpu_relax();
-		asm volatile("ptesync" : : : "memory");
-		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-			     : : "r" (rb), "r" (kvm->arch.lpid));
-		asm volatile("ptesync" : : : "memory");
-		kvm->arch.tlbie_lock = 0;
-	} else {
-		asm volatile("ptesync" : : : "memory");
-		asm volatile("tlbiel %0" : : "r" (rb));
-		asm volatile("ptesync" : : : "memory");
+	if (v & HPTE_V_VALID) {
+		rb = compute_tlbie_rb(v, r, pte_index);
+		hpte[0] = v & ~HPTE_V_VALID;
+		if (!(flags & H_LOCAL)) {
+			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+				cpu_relax();
+			asm volatile("ptesync" : : : "memory");
+			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+				     : : "r" (rb), "r" (kvm->arch.lpid));
+			asm volatile("ptesync" : : : "memory");
+			kvm->arch.tlbie_lock = 0;
+		} else {
+			asm volatile("ptesync" : : : "memory");
+			asm volatile("tlbiel %0" : : "r" (rb));
+			asm volatile("ptesync" : : : "memory");
+		}
 	}
 	hpte[1] = r;
 	eieio();
@@ -422,7 +413,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		   unsigned long pte_index)
 {
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long *hpte, r;
+	unsigned long *hpte, v, r;
 	int i, n = 1;
 	struct revmap_entry *rev = NULL;
 
@@ -436,15 +427,43 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	for (i = 0; i < n; ++i, ++pte_index) {
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		v = hpte[0] & ~HPTE_V_HVLOCK;
 		r = hpte[1];
-		if (hpte[0] & HPTE_V_VALID) {
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+		}
+		if (v & HPTE_V_VALID) {
 			if (rev)
 				r = rev[i].guest_rpte;
 			else
 				r = hpte[1] | HPTE_R_RPN;
 		}
-		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		vcpu->arch.gpr[4 + i * 2] = v;
 		vcpu->arch.gpr[5 + i * 2] = r;
 	}
 	return H_SUCCESS;
 }
+
+void kvmppc_modify_hpte(struct kvm *kvm, unsigned long *hptep,
+			unsigned long new_hpte[2], unsigned long pte_index)
+{
+	unsigned long rb;
+
+	hptep[1] = new_hpte[1];
+	eieio();
+	if (hptep[0] & HPTE_V_VALID) {
+		/* previously valid, so need to tlbie */
+		rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	}
+	hptep[0] = new_hpte[0] & ~HPTE_V_HVLOCK;	/* unlocks it */
+	asm volatile("ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(kvmppc_modify_hpte);
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 07/11] KVM: PPC: Convert do_h_register_vpa to use Linux page tables
From: Paul Mackerras @ 2011-11-16 23:02 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This makes do_h_register_vpa use a new helper function,
kvmppc_pin_guest_page, to pin the page containing the virtual
processor area that the guest wants to register.  The logic of
whether to use the userspace Linux page tables or the slot_pfns
array is thus hidden in kvmppc_pin_guest_page.  There is also a
new kvmppc_unpin_guest_page to release a previously-pinned page,
which we call at VPA unregistration time, or when a new VPA is
registered, or when the vcpu is destroyed.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |    3 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |   44 +++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c          |   52 ++++++++++++++++++++++----------
 3 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index bd8345f..b5ee1ce 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -141,6 +141,9 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
+			unsigned long *nb_ret);
+extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
 
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 99187db..9c7e825 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -480,6 +480,50 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return kvmppc_emulate_mmio(run, vcpu);
 }
 
+void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
+			    unsigned long *nb_ret)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct page *pages[1];
+	int npages;
+	unsigned long hva, psize, offset;
+	unsigned long pfn;
+	unsigned long *pfnp;
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID) ||
+	    (memslot->flags & KVM_MEMSLOT_IO))
+		return NULL;
+	pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
+	if (pfnp) {
+		pfn = *pfnp;
+		if (!pfn)
+			return NULL;
+		psize = 1ul << kvm->arch.slot_page_order[memslot->id];
+		pages[0] = pfn_to_page(pfn);
+		get_page(pages[0]);
+	} else {
+		hva = gfn_to_hva_memslot(memslot, gfn);
+		npages = get_user_pages_fast(hva, 1, 1, pages);
+		if (npages < 1)
+			return NULL;
+		psize = PAGE_SIZE;
+	}
+	offset = gpa & (psize - 1);
+	if (nb_ret)
+		*nb_ret = psize - offset;
+	return page_address(pages[0]) + offset;
+}
+
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+{
+	struct page *page = virt_to_page(va);
+
+	page = compound_head(page);
+	put_page(page);
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cb21845..ceb49d2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -163,10 +163,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 				       unsigned long vcpuid, unsigned long vpa)
 {
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long ra, len;
-	unsigned long nb;
+	unsigned long len, nb;
 	void *va;
 	struct kvm_vcpu *tvcpu;
+	int err = H_PARAMETER;
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
@@ -179,40 +179,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	if (flags < 4) {
 		if (vpa & 0x7f)
 			return H_PARAMETER;
+		if (flags >= 2 && !tvcpu->arch.vpa)
+			return H_RESOURCE;
 		/* registering new area; convert logical addr to real */
-		ra = kvmppc_logical_to_real(kvm, vpa, &nb);
-		if (!ra)
+		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+		if (va == NULL)
 			return H_PARAMETER;
-		va = __va(ra);
 		if (flags <= 1)
 			len = *(unsigned short *)(va + 4);
 		else
 			len = *(unsigned int *)(va + 4);
 		if (len > nb)
-			return H_PARAMETER;
+			goto out_unpin;
 		switch (flags) {
 		case 1:		/* register VPA */
 			if (len < 640)
-				return H_PARAMETER;
+				goto out_unpin;
+			if (tvcpu->arch.vpa)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa);
 			tvcpu->arch.vpa = va;
 			init_vpa(vcpu, va);
 			break;
 		case 2:		/* register DTL */
 			if (len < 48)
-				return H_PARAMETER;
-			if (!tvcpu->arch.vpa)
-				return H_RESOURCE;
+				goto out_unpin;
 			len -= len % 48;
+			if (tvcpu->arch.dtl)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl);
 			tvcpu->arch.dtl = va;
 			tvcpu->arch.dtl_end = va + len;
 			break;
 		case 3:		/* register SLB shadow buffer */
-			if (len < 8)
-				return H_PARAMETER;
-			if (!tvcpu->arch.vpa)
-				return H_RESOURCE;
-			tvcpu->arch.slb_shadow = va;
-			len = (len - 16) / 16;
+			if (len < 16)
+				goto out_unpin;
+			if (tvcpu->arch.slb_shadow)
+				kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow);
 			tvcpu->arch.slb_shadow = va;
 			break;
 		}
@@ -221,17 +222,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 		case 5:		/* unregister VPA */
 			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
 				return H_RESOURCE;
+			if (!tvcpu->arch.vpa)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa);
 			tvcpu->arch.vpa = NULL;
 			break;
 		case 6:		/* unregister DTL */
+			if (!tvcpu->arch.dtl)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl);
 			tvcpu->arch.dtl = NULL;
 			break;
 		case 7:		/* unregister SLB shadow buffer */
+			if (!tvcpu->arch.slb_shadow)
+				break;
+			kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow);
 			tvcpu->arch.slb_shadow = NULL;
 			break;
 		}
 	}
 	return H_SUCCESS;
+
+ out_unpin:
+	kvmppc_unpin_guest_page(kvm, va);
+	return err;
 }
 
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -488,6 +502,12 @@ out:
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->arch.dtl)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl);
+	if (vcpu->arch.slb_shadow)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow);
+	if (vcpu->arch.vpa)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
 }
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 06/11] KVM: PPC: Use Linux page tables in h_enter and map_vrma
From: Paul Mackerras @ 2011-11-16 22:59 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This changes kvmppc_h_enter() and kvmppc_map_vrma() to get the real page
numbers that they put into the guest HPT from the Linux page tables
for our userspace, as an alternative to getting them from the slot_pfns
arrays.  In the future this will enable us to avoid pinning all of guest
memory on POWER7, but we will still have to pin all guest memory on
PPC970, as it doesn't support virtual partition memory.

This also exports find_linux_pte_or_hugepte() since we need it when
KVM is modular.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   31 +++++++
 arch/powerpc/include/asm/kvm_host.h      |    2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |   26 +++++-
 arch/powerpc/kvm/book3s_hv.c             |    1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  127 ++++++++++++++++--------------
 arch/powerpc/mm/hugetlbpage.c            |    2 +
 6 files changed, 125 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9243f35..307e649 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -121,4 +121,35 @@ static inline unsigned long *kvmppc_pfn_entry(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
+/*
+ * Lock and read a linux PTE.  If it's present and writable, atomically
+ * set dirty and referenced bits and return the PFN, otherwise return 0.
+ */
+static inline unsigned long kvmppc_read_update_linux_pte(pte_t *p)
+{
+	pte_t pte, tmp;
+	unsigned long pfn = 0;
+
+	/* wait until _PAGE_BUSY is clear then set it atomically */
+	__asm__ __volatile__ (
+		"1:	ldarx	%0,0,%3\n"
+		"	andi.	%1,%0,%4\n"
+		"	bne-	1b\n"
+		"	ori	%1,%0,%4\n"
+		"	stdcx.	%1,0,%3\n"
+		"	bne-	1b"
+		: "=&r" (pte), "=&r" (tmp), "=m" (*p)
+		: "r" (p), "i" (_PAGE_BUSY)
+		: "cc");
+
+	if (pte_present(pte) && pte_write(pte)) {
+		pfn = pte_pfn(pte);
+		pte = pte_mkdirty(pte_mkyoung(pte));
+	}
+
+	*p = pte;	/* clears _PAGE_BUSY */
+
+	return pfn;
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 93b7e04..f211643 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
+#include <asm/page.h>
 
 #define KVM_MAX_VCPUS		NR_CPUS
 #define KVM_MAX_VCORES		NR_CPUS
@@ -432,6 +433,7 @@ struct kvm_vcpu_arch {
 	struct list_head run_list;
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
+	pgd_t *pgdir;
 #endif
 };
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 4d558c4..99187db 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -111,13 +111,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	unsigned long npages;
 	unsigned long pfn;
 	unsigned long *hpte;
-	unsigned long addr, hash;
+	unsigned long addr, hash, hva;
 	unsigned long psize;
 	int porder;
 	struct revmap_entry *rev;
 	struct kvm_memory_slot *memslot;
 	unsigned long hp0, hp1;
 	unsigned long *pfns;
+	pte_t *p;
+	unsigned int shift;
 
 	memslot = &kvm->memslots->memslots[mem->slot];
 	pfns = kvm->arch.slot_pfns[mem->slot];
@@ -138,10 +140,26 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
 	for (i = 0; i < npages; ++i) {
-		pfn = pfns[i];
-		if (!pfn)
-			continue;
 		addr = i << porder;
+		if (pfns) {
+			pfn = pfns[i];
+		} else {
+			pfn = 0;
+			local_irq_disable();
+			hva = addr + mem->userspace_addr;
+			p = find_linux_pte_or_hugepte(current->mm->pgd, hva,
+						      &shift);
+			if (p && (psize == PAGE_SIZE || shift == porder))
+				pfn = kvmppc_read_update_linux_pte(p);
+			local_irq_enable();
+		}
+
+		if (!pfn) {
+			pr_err("KVM: Couldn't find page for VRMA at %lx\n",
+			       addr);
+			break;
+		}
+
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7434258..cb21845 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -868,6 +868,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	flush_altivec_to_thread(current);
 	flush_vsx_to_thread(current);
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+	vcpu->arch.pgdir = current->mm->pgd;
 
 	do {
 		r = kvmppc_run_vcpu(run, vcpu);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5438442..1778091 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -59,37 +59,27 @@ static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		    long pte_index, unsigned long pteh, unsigned long ptel)
 {
-	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long i, pa, gpa, gfn, psize;
+	unsigned long slot_fn, hva;
 	unsigned long *hpte;
 	struct revmap_entry *rev;
 	unsigned long g_ptel = ptel;
 	struct kvm_memory_slot *memslot;
 	unsigned long *pfnp, pte_size;
+	unsigned long is_io;
+	pte_t *ptep;
+	unsigned int shift;
 
-	/* only handle 4k, 64k and 16M pages for now */
-	porder = 12;
-	if (pteh & HPTE_V_LARGE) {
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (ptel & 0xf000) == 0x1000) {
-			/* 64k page */
-			porder = 16;
-		} else if ((ptel & 0xff000) == 0) {
-			/* 16M page */
-			porder = 24;
-			/* lowest AVA bit must be 0 for 16M pages */
-			if (pteh & 0x80)
-				return H_PARAMETER;
-		} else
-			return H_PARAMETER;
-	}
-	psize = (1ul << porder);
+	psize = hpte_page_size(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
 
-	/* We do not allow the guest to set key 31 which is reserved
-	 * for MMIO emulation. We don't want to allow MMIO emulation
-	 * to be used to access RAM due to possible races between
-	 * emulation and TLB invalidations.
+	/*
+	 * We do not allow the guest to set key 31 which is reserved
+	 * for MMIO emulation and non-present RAM pages.  We don't want
+	 * to allow MMIO emulation to be used to access RAM due to possible
+	 * races between emulation and TLB invalidations.
 	 *
 	 * Emulated accesses are emulated by looking at the hash for
 	 * translation once, then performing the access later. The
@@ -106,66 +96,79 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
 		return H_PARAMETER;
 
-	/* Figure out the type of page and handle accordingly,
-	 * first check for RAM pages
-	 */
+	/* Find the memslot (if any) for this address */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 	gfn = gpa >> PAGE_SHIFT;
 	memslot = builtin_gfn_to_memslot(kvm, gfn);
+	pa = 0;
+	is_io = 1;
 	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
-		unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
-
 		/* Check if the requested page fits entirely in the memslot. */
-		if ((egfn - memslot->base_gfn) > memslot->npages)
+		slot_fn = gfn - memslot->base_gfn;
+		if (slot_fn + (psize >> PAGE_SHIFT) > memslot->npages) 
 			return H_PARAMETER;
+		is_io = memslot->flags & KVM_MEMSLOT_IO;
 
-		/* Check for MMIO pass-through */
-		if (memslot->flags & KVM_MEMSLOT_IO) {
-			/* Check WIMG */
-			if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
-			    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
-				return H_PARAMETER;		
-		} else {
-			/* System RAM */
-			if (porder > kvm->arch.slot_page_order[memslot->id])
+		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
+		if (pfnp) {
+			pte_size = 1ul << kvm->arch.slot_page_order[memslot->id];
+			if (!is_io && psize > pte_size)
 				return H_PARAMETER;
-
-			/* Check WIMG */
-			if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+			pfnp = real_vmalloc_addr(pfnp);
+			pa = *pfnp << PAGE_SHIFT;
+			if (!pa)
 				return H_PARAMETER;
+		} else {
+			/* Translate to host virtual address */
+			hva = gfn_to_hva_memslot(memslot, gfn);
+
+			/* Look up the Linux PTE for the backing page */
+			ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva,
+							 &shift);
+			if (ptep) {
+				if (shift)
+					pte_size = 1ul << shift;
+				else
+					pte_size = PAGE_SIZE;
+				if (pte_size < psize)
+					return H_PARAMETER;
+				pa = kvmppc_read_update_linux_pte(ptep);
+				pa <<= PAGE_SHIFT;
+			}
 		}
-		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
-		if (!pfnp)
-			return H_PARAMETER;
-		pfnp = real_vmalloc_addr(pfnp);
-		pa = *pfnp << PAGE_SHIFT;
-		if (!pa)
-			return H_PARAMETER;
-		pte_size = 1ul << kvm->arch.slot_page_order[memslot->id];
-		pa |= gpa & (pte_size - 1);
+		if (pa && pte_size > psize)
+			pa |= gpa & (pte_size - 1);
 
 		/* check if the start pfn has page size alignment */
 		if (pa & (psize - 1))
 			return H_PARAMETER;
 		ptel &= ~(HPTE_R_PP0 - psize);
 		ptel |= pa;
-
+	}
+	pteh &= ~0x60UL;
+	
+	/* Check WIMG */
+	if (is_io) {
+		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+			return H_PARAMETER;
 	} else {
-		/* Else check for MMIO emulation */
-		if (!cpu_has_feature(CPU_FTR_ARCH_206))
+		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
 			return H_PARAMETER;
+	}
 
-		/* Leave RPN intact */
-		/* We force no-execute and set key to 1 to cause
-		 * faults on access.
-		 * XXX Should we instead just return H_PARAMETER if
-		 * N isn't already set ?
+	if (!pa) {
+		/*
+		 * If this is a non-present page for any reason
+		 * and this is a POWER7, set the key to 31 and set N.
+		 * On 970 we have to have all pages present.
 		 */
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
+			return H_PARAMETER;
 		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
-	pteh &= ~0x60UL;
-	
+
 	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
@@ -190,10 +193,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	if (rev)
 		rev->guest_rpte = g_ptel;
+
 	hpte[1] = ptel;
+
+	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
+
 	vcpu->arch.gpr[4] = pte_index;
 	return H_SUCCESS;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1..701e920 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -11,6 +11,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -105,6 +106,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 		*shift = hugepd_shift(*hpdp);
 	return hugepte_offset(hpdp, ea, pdshift);
 }
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 05/11] KVM: PPC: Use a separate vmalloc'd array to store pfns
From: Paul Mackerras @ 2011-11-16 22:59 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This changes the book3s_hv code to store the page frame numbers in
a separate vmalloc'd array, pointed to by an array in struct kvm_arch,
rather than the memslot->rmap arrays.  This frees up the rmap arrays
to be used later to store reverse mapping information.  For large page
regions, we now store only one pfn per large page rather than one pfn
per small page.  This reduces the size of the pfns arrays and eliminates
redundant get_page and put_page calls.

We also now pin the guest pages and store the pfns in the commit_memory
function rather than the prepare_memory function.  This avoids a memory
leak if adding the memory fails with an error after the prepare_memory
function has been called.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   15 ++++
 arch/powerpc/include/asm/kvm_host.h      |    4 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |   10 ++-
 arch/powerpc/kvm/book3s_hv.c             |  124 +++++++++++++++++++-----------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |   14 ++--
 5 files changed, 112 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 63542dd..9243f35 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -106,4 +106,19 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 	return 0;				/* error */
 }
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline unsigned long *kvmppc_pfn_entry(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long gfn)
+{
+	int id = memslot->id;
+	unsigned long index;
+
+	if (!kvm->arch.slot_pfns[id])
+		return NULL;
+	index = gfn - memslot->base_gfn;
+	index >>= kvm->arch.slot_page_order[id] - PAGE_SHIFT;
+	return &kvm->arch.slot_pfns[id][index];
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e0751e5..93b7e04 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -174,8 +174,6 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
-	unsigned long ram_psize;
-	unsigned long ram_porder;
 	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
@@ -186,6 +184,8 @@ struct kvm_arch {
 	unsigned long rmor;
 	struct kvmppc_rma_info *rma;
 	struct list_head spapr_tce_tables;
+	unsigned long *slot_pfns[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS];
+	int slot_page_order[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS];
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bed6c61..4d558c4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -112,13 +112,17 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	unsigned long pfn;
 	unsigned long *hpte;
 	unsigned long addr, hash;
-	unsigned long psize = kvm->arch.ram_psize;
-	unsigned long porder = kvm->arch.ram_porder;
+	unsigned long psize;
+	int porder;
 	struct revmap_entry *rev;
 	struct kvm_memory_slot *memslot;
 	unsigned long hp0, hp1;
+	unsigned long *pfns;
 
 	memslot = &kvm->memslots->memslots[mem->slot];
+	pfns = kvm->arch.slot_pfns[mem->slot];
+	porder = kvm->arch.slot_page_order[mem->slot];
+	psize = 1ul << porder;
 	npages = memslot->npages >> (porder - PAGE_SHIFT);
 
 	/* VRMA can't be > 1TB */
@@ -134,7 +138,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
 	for (i = 0; i < npages; ++i) {
-		pfn = memslot->rmap[i << (porder - PAGE_SHIFT)];
+		pfn = pfns[i];
 		if (!pfn)
 			continue;
 		addr = i << porder;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 48a0648..7434258 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -133,16 +133,40 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
 	vpa->yield_count = 1;
 }
 
+unsigned long kvmppc_logical_to_real(struct kvm *kvm, unsigned long gpa,
+				     unsigned long *nb_ret)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn, ra, offset;
+	unsigned long *pfnp;
+	unsigned long pg_size;
+
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return 0;
+	pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
+	if (!pfnp)
+		return 0;
+	ra = *pfnp << PAGE_SHIFT;
+	if (!ra)
+		return 0;
+	pg_size = 1ul << kvm->arch.slot_page_order[memslot->id];
+	offset = gpa & (pg_size - 1);
+	if (nb_ret)
+		*nb_ret = pg_size - offset;
+	return ra + offset;
+}
+
 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 				       unsigned long flags,
 				       unsigned long vcpuid, unsigned long vpa)
 {
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long pg_index, ra, len;
-	unsigned long pg_offset;
+	unsigned long ra, len;
+	unsigned long nb;
 	void *va;
 	struct kvm_vcpu *tvcpu;
-	struct kvm_memory_slot *memslot;
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
@@ -156,21 +180,15 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 		if (vpa & 0x7f)
 			return H_PARAMETER;
 		/* registering new area; convert logical addr to real */
-		pg_index = vpa >> PAGE_SHIFT;
-		pg_offset = vpa & (PAGE_SIZE - 1);
-		memslot = gfn_to_memslot(kvm, pg_index);
-		if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
-			return H_PARAMETER;
-		ra = memslot->rmap[pg_index - memslot->base_gfn] << PAGE_SHIFT;
+		ra = kvmppc_logical_to_real(kvm, vpa, &nb);
 		if (!ra)
 			return H_PARAMETER;
-		ra |= pg_offset;
 		va = __va(ra);
 		if (flags <= 1)
 			len = *(unsigned short *)(va + 4);
 		else
 			len = *(unsigned int *)(va + 4);
-		if (pg_offset + len > kvm->arch.ram_psize)
+		if (len > nb)
 			return H_PARAMETER;
 		switch (flags) {
 		case 1:		/* register VPA */
@@ -1077,9 +1095,11 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	unsigned long i, npages;
 	struct kvmppc_rma_info *ri = NULL;
 	struct vm_area_struct *vma;
-	struct page *page;
-	unsigned long hva, pfn;
+	unsigned long pfn;
 	unsigned long lpcr;
+	unsigned long *pfns = NULL;
+
+	npages = mem->memory_size >> PAGE_SHIFT;
 
 	/*
 	 * This could be an attempt at adding memory or it could be MMIO
@@ -1092,8 +1112,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (!vma || vma->vm_start > mem->userspace_addr)
 		goto err_unlock;
 
-	npages = mem->memory_size >> PAGE_SHIFT;
-
 	/* For now require the memory to be in one vma */
 	if (mem->userspace_addr + mem->memory_size > vma->vm_end) {
 		pr_err("not one vma %llx > %lx\n",
@@ -1120,12 +1138,17 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 		/*
 		 * Tag the memslot with a private flag and store the pfns
-		 * in the rmap array.
+		 * in the pfns array.
 		 */
+		pfns = vzalloc(npages * sizeof(unsigned long));
+		if (!pfns)
+			return -ENOMEM;
+		kvm->arch.slot_pfns[mem->slot] = pfns;
+		kvm->arch.slot_page_order[mem->slot] = PAGE_SHIFT;
 		memslot->flags |= KVM_MEMSLOT_IO;
 		pfn = vma->vm_pgoff + (offset >> PAGE_SHIFT);
 		for (i = 0; i < npages; ++i)
-			memslot->rmap[i] = pfn++;
+			pfns[i] = pfn++;
 		return 0;
 	}
 
@@ -1146,23 +1169,23 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	    (mem->guest_phys_addr & (psize - 1))) {
 		pr_err("bad memory_size=%llx @ %llx\n",
 		       mem->memory_size, mem->guest_phys_addr);
-		return -EINVAL;
+		goto err;
 	}
 
 	/* Do we already have an RMA registered? */
 	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
-		return -EINVAL;
+		goto err;
 
 	if (!ri && mem->guest_phys_addr == 0) {
 		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
 			pr_err("CPU requires an RMO\n");
-			return -EINVAL;
+			goto err;
 		}
 
 		/* We can handle 4k, 64k and 16M pages in the VRMA */
 		if (!(psize == 0x1000 || psize == 0x1000000 ||
 		      (psize == 0x10000 && cpu_has_feature(CPU_FTR_ARCH_206))))
-			return -EINVAL;
+			goto err;
 		lpcr = kvm->arch.lpcr;
 		switch (porder) {
 		case 12:
@@ -1178,10 +1201,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		kvm->arch.lpcr = lpcr;
 	}
 
-	if (!ri && psize < kvm->arch.ram_psize) {
-		kvm->arch.ram_psize = psize;
-		kvm->arch.ram_porder = porder;
-	}
+	kvm->arch.slot_page_order[mem->slot] = porder;
 
 	/* Handle pre-allocated RMAs */
 	if (ri) {
@@ -1194,7 +1214,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		rmls = lpcr_rmls(rma_size);
 		if (rmls < 0) {
 			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
-			return -EINVAL;
+			goto err;
 		}
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
@@ -1221,15 +1241,11 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-	for (i = 0; i < npages; ++i) {
-		hva = mem->userspace_addr + (i << PAGE_SHIFT);
-		page = hva_to_page(hva);
-		if (!page) {
-			pr_err("oops, no pfn for hva %lx\n", hva);
-			goto err;
-		}
-		memslot->rmap[i] = page_to_pfn(page);
-	}
+
+	pfns = vzalloc(npages * sizeof(unsigned long));
+	if (!pfns)
+		return -ENOMEM;
+	kvm->arch.slot_pfns[mem->slot] = pfns;
 
 	return 0;
 
@@ -1242,6 +1258,25 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
+	unsigned long i, npages, *pfns;
+	unsigned long hva;
+	unsigned long porder = kvm->arch.slot_page_order[mem->slot];
+	struct page *page;
+	struct kvm_memory_slot *memslot;
+
+	memslot = &kvm->memslots->memslots[mem->slot];
+	if (memslot->flags & KVM_MEMSLOT_IO)
+		return;
+
+	pfns = kvm->arch.slot_pfns[mem->slot];
+	npages = mem->memory_size >> porder;
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << porder);
+		page = hva_to_page(hva);
+		if (page)
+			pfns[i] = page_to_pfn(page);
+	}
+
 	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
 	    !kvm->arch.rma)
 		kvmppc_map_vrma(kvm, mem);
@@ -1259,10 +1294,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
-	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;	/* max page size */
-	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
 	kvm->arch.rma = NULL;
-
 	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
 	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
@@ -1295,25 +1327,29 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	unsigned long i, j, npages;
-	unsigned long *rmap;
+	unsigned long *pfns;
 	struct page *page;
+	unsigned long porder;
 
 	slots = kvm_memslots(kvm);
 	for (i = 0; i < slots->nmemslots; i++) {
 		memslot = &slots->memslots[i];
-		rmap = memslot->rmap;
-		npages = memslot->npages;
+		pfns = kvm->arch.slot_pfns[i];
+		porder = kvm->arch.slot_page_order[i];
+		npages = memslot->npages >> (porder - PAGE_SHIFT);
 
-		if ((memslot->flags & KVM_MEMSLOT_INVALID) || !rmap)
+		if ((memslot->flags & KVM_MEMSLOT_INVALID) || !pfns)
 			continue;
 		for (j = 0; j < npages; j++) {
-			if (rmap[j]) {
-				page = pfn_to_page(rmap[j]);
+			if (pfns[j]) {
+				page = pfn_to_page(pfns[j]);
 				if (PageHuge(page))
 					page = compound_head(page);
 				put_page(page);
 			}
 		}
+		vfree(pfns);
+		kvm->arch.slot_pfns[i] = NULL;
 	}
 
 	if (kvm->arch.rma) {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5a84791..5438442 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -66,7 +66,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct revmap_entry *rev;
 	unsigned long g_ptel = ptel;
 	struct kvm_memory_slot *memslot;
-	unsigned long *rmap_entry;
+	unsigned long *pfnp, pte_size;
 
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
@@ -127,7 +127,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 				return H_PARAMETER;		
 		} else {
 			/* System RAM */
-			if (porder > kvm->arch.ram_porder)
+			if (porder > kvm->arch.slot_page_order[memslot->id])
 				return H_PARAMETER;
 
 			/* Check WIMG */
@@ -135,13 +135,15 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
 				return H_PARAMETER;
 		}
-		rmap_entry = &memslot->rmap[gfn - memslot->base_gfn];
-		rmap_entry = real_vmalloc_addr(rmap_entry);
-		if (!rmap_entry)
+		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
+		if (!pfnp)
 			return H_PARAMETER;
-		pa = *rmap_entry << PAGE_SHIFT;
+		pfnp = real_vmalloc_addr(pfnp);
+		pa = *pfnp << PAGE_SHIFT;
 		if (!pa)
 			return H_PARAMETER;
+		pte_size = 1ul << kvm->arch.slot_page_order[memslot->id];
+		pa |= gpa & (pte_size - 1);
 
 		/* check if the start pfn has page size alignment */
 		if (pa & (psize - 1))
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 04/11] KVM: PPC: Remove io_slot_pfn array
From: Paul Mackerras @ 2011-11-16 22:58 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This stores the PFNs for I/O mappings in the slot->rmap array, as is
now done for system RAM.  This simplifies the h_enter code and allows
us to remove the io_slot_pfn array.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h |    2 --
 arch/powerpc/kvm/book3s_hv.c        |   16 +++++++++-------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   22 +++++++++++-----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 52fd741..e0751e5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,8 +188,6 @@ struct kvm_arch {
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
-	unsigned long io_slot_pfn[KVM_MEMORY_SLOTS +
-				  KVM_PRIVATE_MEM_SLOTS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bc512ef..48a0648 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1078,7 +1078,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	struct kvmppc_rma_info *ri = NULL;
 	struct vm_area_struct *vma;
 	struct page *page;
-	unsigned long hva;
+	unsigned long hva, pfn;
 	unsigned long lpcr;
 
 	/*
@@ -1092,6 +1092,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (!vma || vma->vm_start > mem->userspace_addr)
 		goto err_unlock;
 
+	npages = mem->memory_size >> PAGE_SHIFT;
+
 	/* For now require the memory to be in one vma */
 	if (mem->userspace_addr + mem->memory_size > vma->vm_end) {
 		pr_err("not one vma %llx > %lx\n",
@@ -1114,15 +1116,16 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		if ((vma->vm_flags & (VM_READ | VM_WRITE)) !=
 		    (VM_READ | VM_WRITE))
 			goto err_unlock;
+		up_read(&current->mm->mmap_sem);
 
 		/*
-		 * Tag the memslot with a private flag and store the pfn
-		 * in a separate array for use by H_ENTER
+		 * Tag the memslot with a private flag and store the pfns
+		 * in the rmap array.
 		 */
 		memslot->flags |= KVM_MEMSLOT_IO;
-		kvm->arch.io_slot_pfn[memslot->id] =
-			vma->vm_pgoff + (offset >> PAGE_SHIFT);
-		up_read(&current->mm->mmap_sem);
+		pfn = vma->vm_pgoff + (offset >> PAGE_SHIFT);
+		for (i = 0; i < npages; ++i)
+			memslot->rmap[i] = pfn++;
 		return 0;
 	}
 
@@ -1218,7 +1221,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-	npages = mem->memory_size >> PAGE_SHIFT;
 	for (i = 0; i < npages; ++i) {
 		hva = mem->userspace_addr + (i << PAGE_SHIFT);
 		page = hva_to_page(hva);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index b82da85..5a84791 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -121,12 +121,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 
 		/* Check for MMIO pass-through */
 		if (memslot->flags & KVM_MEMSLOT_IO) {
-			/* check if the start pfn has page size alignment */
-			pa = kvm->arch.io_slot_pfn[memslot->id] << PAGE_SHIFT;
-			pa += gpa - (memslot->base_gfn << PAGE_SHIFT);
-			if (pa & (psize - 1))
-				return H_PARAMETER;
-
 			/* Check WIMG */
 			if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
 			    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
@@ -135,17 +129,23 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			/* System RAM */
 			if (porder > kvm->arch.ram_porder)
 				return H_PARAMETER;
-			rmap_entry = &memslot->rmap[gfn - memslot->base_gfn];
-			rmap_entry = real_vmalloc_addr(rmap_entry);
-			pa = *rmap_entry << PAGE_SHIFT;
-			if (!pa)
-				return H_PARAMETER;
 
 			/* Check WIMG */
 			if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
 			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
 				return H_PARAMETER;
 		}
+		rmap_entry = &memslot->rmap[gfn - memslot->base_gfn];
+		rmap_entry = real_vmalloc_addr(rmap_entry);
+		if (!rmap_entry)
+			return H_PARAMETER;
+		pa = *rmap_entry << PAGE_SHIFT;
+		if (!pa)
+			return H_PARAMETER;
+
+		/* check if the start pfn has page size alignment */
+		if (pa & (psize - 1))
+			return H_PARAMETER;
 		ptel &= ~(HPTE_R_PP0 - psize);
 		ptel |= pa;
 
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 03/11] KVM: PPC: Allow use of small pages to back guest memory
From: Paul Mackerras @ 2011-11-16 22:58 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

From: Nishanth Aravamudan <nacc@us.ibm.com>

This puts the page frame numbers for the memory backing the guest in
the slot->rmap array for each slot, rather than using the ram_pginfo
array.  Since the rmap array is vmalloc'd, we use real_vmalloc_addr()
to access it when running in real mode in kvmppc_h_enter().
The rmap array contains one PFN for each small page, even if the
backing memory is large pages.

This lets us get rid of the ram_pginfo array.

[paulus@samba.org - Cleaned up and reorganized a bit, abstracted out
HPTE page size encoding functions, added check that memory being
added in kvmppc_core_prepare_memory_region is all in one VMA.]

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h |    8 --
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   47 +++++++----
 arch/powerpc/kvm/book3s_hv.c        |  153 +++++++++++++++++------------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   90 ++++++++++----------
 4 files changed, 151 insertions(+), 147 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 56f7046..52fd741 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -145,11 +145,6 @@ struct kvmppc_exit_timing {
 	};
 };
 
-struct kvmppc_pginfo {
-	unsigned long pfn;
-	atomic_t refcnt;
-};
-
 struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
@@ -179,17 +174,14 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
-	unsigned long ram_npages;
 	unsigned long ram_psize;
 	unsigned long ram_porder;
-	struct kvmppc_pginfo *ram_pginfo;
 	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
-	int n_rma_pages;
 	unsigned long lpcr;
 	unsigned long rmor;
 	struct kvmppc_rma_info *rma;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2b9b8be..bed6c61 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -34,8 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 
-/* Pages in the VRMA are 16MB pages */
-#define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
 
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
@@ -95,19 +93,33 @@ void kvmppc_free_hpt(struct kvm *kvm)
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
 }
 
+/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
+}
+
+/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize == 0x10000) ? 0x1000 : 0;
+}
+
 void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 {
 	unsigned long i;
-	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long npages;
 	unsigned long pfn;
 	unsigned long *hpte;
-	unsigned long hash;
+	unsigned long addr, hash;
+	unsigned long psize = kvm->arch.ram_psize;
 	unsigned long porder = kvm->arch.ram_porder;
 	struct revmap_entry *rev;
-	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+	struct kvm_memory_slot *memslot;
+	unsigned long hp0, hp1;
 
-	if (!pginfo)
-		return;
+	memslot = &kvm->memslots->memslots[mem->slot];
+	npages = memslot->npages >> (porder - PAGE_SHIFT);
 
 	/* VRMA can't be > 1TB */
 	if (npages > 1ul << (40 - porder))
@@ -116,10 +128,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	if (npages > HPT_NPTEG)
 		npages = HPT_NPTEG;
 
+	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize) | HPTE_V_VALID;
+	hp1 = hpte1_pgsize_encoding(psize) |
+		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+
 	for (i = 0; i < npages; ++i) {
-		pfn = pginfo[i].pfn;
+		pfn = memslot->rmap[i << (porder - PAGE_SHIFT)];
 		if (!pfn)
-			break;
+			continue;
+		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
@@ -131,17 +149,14 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		hash = (hash << 3) + 7;
 		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4));
 		/* HPTE low word - RPN, protection, etc. */
-		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
-			HPTE_R_M | PP_RWXX;
+		hpte[1] = hp1 | (pfn << PAGE_SHIFT);
 		smp_wmb();
-		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
-			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
-			HPTE_V_LARGE | HPTE_V_VALID;
+		/* HPTE high word - virtual address, bolted, valid, large */
+		hpte[0] = hp0 | ((addr >> 16) & ~0x7fUL);
 
 		/* Reverse map info */
 		rev = &kvm->arch.revmap[hash];
-		rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C |
-			HPTE_R_M | PP_RWXX;
+		rev->guest_rpte = hp1 | addr;
 	}
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d1f0774..bc512ef 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -47,14 +47,7 @@
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
-
-/*
- * For now, limit memory to 64GB and require it to be large pages.
- * This value is chosen because it makes the ram_pginfo array be
- * 64kB in size, which is about as large as we want to be trying
- * to allocate with kmalloc.
- */
-#define MAX_MEM_ORDER		36
+#include <linux/hugetlb.h>
 
 #define LARGE_PAGE_ORDER	24	/* 16MB pages */
 
@@ -149,6 +142,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	unsigned long pg_offset;
 	void *va;
 	struct kvm_vcpu *tvcpu;
+	struct kvm_memory_slot *memslot;
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
@@ -162,13 +156,14 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 		if (vpa & 0x7f)
 			return H_PARAMETER;
 		/* registering new area; convert logical addr to real */
-		pg_index = vpa >> kvm->arch.ram_porder;
-		pg_offset = vpa & (kvm->arch.ram_psize - 1);
-		if (pg_index >= kvm->arch.ram_npages)
+		pg_index = vpa >> PAGE_SHIFT;
+		pg_offset = vpa & (PAGE_SIZE - 1);
+		memslot = gfn_to_memslot(kvm, pg_index);
+		if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 			return H_PARAMETER;
-		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+		ra = memslot->rmap[pg_index - memslot->base_gfn] << PAGE_SHIFT;
+		if (!ra)
 			return H_PARAMETER;
-		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
 		ra |= pg_offset;
 		va = __va(ra);
 		if (flags <= 1)
@@ -1079,13 +1074,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_userspace_memory_region *mem)
 {
 	unsigned long psize, porder;
-	unsigned long i, npages, totalpages;
-	unsigned long pg_ix;
-	struct kvmppc_pginfo *pginfo;
+	unsigned long i, npages;
 	struct kvmppc_rma_info *ri = NULL;
 	struct vm_area_struct *vma;
 	struct page *page;
 	unsigned long hva;
+	unsigned long lpcr;
 
 	/*
 	 * This could be an attempt at adding memory or it could be MMIO
@@ -1098,6 +1092,13 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (!vma || vma->vm_start > mem->userspace_addr)
 		goto err_unlock;
 
+	/* For now require the memory to be in one vma */
+	if (mem->userspace_addr + mem->memory_size > vma->vm_end) {
+		pr_err("not one vma %llx > %lx\n",
+		       mem->userspace_addr + mem->memory_size, vma->vm_end);
+		goto err_unlock;
+	}
+
 	/* Anything with VM_IO will be handled as MMIO pass-through */
 	if (vma->vm_flags & VM_IO) {
 		unsigned long offset = mem->userspace_addr - vma->vm_start;
@@ -1125,6 +1126,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		return 0;
 	}
 
+	psize = vma_kernel_pagesize(vma);
+	porder = __ilog2(psize);
+
 	/* Is this one of our preallocated RMAs? */
 	if (mem->guest_phys_addr == 0) {
 		if (vma && vma->vm_file &&
@@ -1135,9 +1139,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 	up_read(&current->mm->mmap_sem);
 
-	/* For now, only allow 16MB pages for memory */
-	porder = LARGE_PAGE_ORDER;
-	psize = 1ul << porder;
 	if ((mem->memory_size & (psize - 1)) ||
 	    (mem->guest_phys_addr & (psize - 1))) {
 		pr_err("bad memory_size=%llx @ %llx\n",
@@ -1145,30 +1146,43 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		return -EINVAL;
 	}
 
-	npages = mem->memory_size >> porder;
-	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
-
-	/* More memory than we have space to track? */
-	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
-		return -EINVAL;
-
 	/* Do we already have an RMA registered? */
 	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
 		return -EINVAL;
 
-	if (totalpages > kvm->arch.ram_npages)
-		kvm->arch.ram_npages = totalpages;
+	if (!ri && mem->guest_phys_addr == 0) {
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("CPU requires an RMO\n");
+			return -EINVAL;
+		}
 
-	if (!ri && mem->guest_phys_addr == 0 &&
-	    cpu_has_feature(CPU_FTR_ARCH_201)) {
-		pr_err("CPU requires an RMO\n");
-		return -EINVAL;
+		/* We can handle 4k, 64k and 16M pages in the VRMA */
+		if (!(psize == 0x1000 || psize == 0x1000000 ||
+		      (psize == 0x10000 && cpu_has_feature(CPU_FTR_ARCH_206))))
+			return -EINVAL;
+		lpcr = kvm->arch.lpcr;
+		switch (porder) {
+		case 12:
+			lpcr &= ~(LPCR_VRMA_L);
+			break;
+		case 16:
+			lpcr |= (LPCR_VRMA_L | LPCR_VRMA_LP1);
+			break;
+		case 24:
+			lpcr |= LPCR_VRMA_L;
+			break;
+		}
+		kvm->arch.lpcr = lpcr;
+	}
+
+	if (!ri && psize < kvm->arch.ram_psize) {
+		kvm->arch.ram_psize = psize;
+		kvm->arch.ram_porder = porder;
 	}
 
 	/* Handle pre-allocated RMAs */
 	if (ri) {
 		unsigned long rma_size;
-		unsigned long lpcr;
 		long rmls;
 
 		rma_size = ri->npages << PAGE_SHIFT;
@@ -1181,7 +1195,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		}
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
-		kvm->arch.n_rma_pages = rma_size >> porder;
 
 		/* Update LPCR and RMOR */
 		lpcr = kvm->arch.lpcr;
@@ -1205,28 +1218,15 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-	pg_ix = mem->guest_phys_addr >> porder;
-	pginfo = kvm->arch.ram_pginfo + pg_ix;
-	for (i = 0; i < npages; ++i, ++pg_ix) {
-		if (ri && pg_ix < kvm->arch.n_rma_pages) {
-			pginfo[i].pfn = ri->base_pfn +
-				(pg_ix << (porder - PAGE_SHIFT));
-			continue;
-		}
-		hva = mem->userspace_addr + (i << porder);
+	npages = mem->memory_size >> PAGE_SHIFT;
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << PAGE_SHIFT);
 		page = hva_to_page(hva);
 		if (!page) {
 			pr_err("oops, no pfn for hva %lx\n", hva);
 			goto err;
 		}
-		/* Check it's a 16MB page */
-		if (!PageHead(page) ||
-		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
-			pr_err("page at %lx isn't 16MB (o=%d)\n",
-			       hva, compound_order(page));
-			goto err;
-		}
-		pginfo[i].pfn = page_to_pfn(page);
+		memslot->rmap[i] = page_to_pfn(page);
 	}
 
 	return 0;
@@ -1248,8 +1248,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
-	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
-	long err = -ENOMEM;
 	unsigned long lpcr;
 
 	/* Allocate hashed page table */
@@ -1259,19 +1257,9 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
-	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
-				       GFP_KERNEL);
-	if (!kvm->arch.ram_pginfo) {
-		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		goto out_free;
-	}
-
-	kvm->arch.ram_npages = 0;
-	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;	/* max page size */
 	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
 	kvm->arch.rma = NULL;
-	kvm->arch.n_rma_pages = 0;
 
 	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
@@ -1298,25 +1286,34 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	kvm->arch.lpcr = lpcr;
 
 	return 0;
-
- out_free:
-	kvmppc_free_hpt(kvm);
-	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-	struct kvmppc_pginfo *pginfo;
-	unsigned long i;
-
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
-			if (pginfo[i].pfn)
-				put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	unsigned long i, j, npages;
+	unsigned long *rmap;
+	struct page *page;
+
+	slots = kvm_memslots(kvm);
+	for (i = 0; i < slots->nmemslots; i++) {
+		memslot = &slots->memslots[i];
+		rmap = memslot->rmap;
+		npages = memslot->npages;
+
+		if ((memslot->flags & KVM_MEMSLOT_INVALID) || !rmap)
+			continue;
+		for (j = 0; j < npages; j++) {
+			if (rmap[j]) {
+				page = pfn_to_page(rmap[j]);
+				if (PageHuge(page))
+					page = compound_head(page);
+				put_page(page);
+			}
+		}
 	}
+
 	if (kvm->arch.rma) {
 		kvm_release_rma(kvm->arch.rma);
 		kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2da8fac..b82da85 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -61,10 +61,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long i, lpn, pa, gpa, psize;
+	unsigned long i, pa, gpa, gfn, psize;
 	unsigned long *hpte;
 	struct revmap_entry *rev;
 	unsigned long g_ptel = ptel;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap_entry;
 
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
@@ -108,59 +110,57 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	 * first check for RAM pages
 	 */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
-	if ((gpa >> kvm->arch.ram_porder) < kvm->arch.ram_npages) {
-		lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
-		if (porder > kvm->arch.ram_porder)
-			return H_PARAMETER;
-		pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
-		if (!pa)
-			return H_PARAMETER;
-		/* Check WIMG */
-		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = builtin_gfn_to_memslot(kvm, gfn);
+	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
+		unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
+
+		/* Check if the requested page fits entirely in the memslot. */
+		if ((egfn - memslot->base_gfn) > memslot->npages)
 			return H_PARAMETER;
-		ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
-		ptel |= pa;
-	} else {
-		struct kvm_memory_slot *memslot;
-
-		/* Check WIMG */
-		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
-		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
-			return H_PARAMETER;		
-
-		/* Else check for MMIO pass-through */
-		memslot = builtin_gfn_to_memslot(kvm, gpa >> PAGE_SHIFT);
-		if (memslot && memslot->flags & KVM_MEMSLOT_IO) {
-			unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
-
-			/* Check if the requested page fits entirely in
-			 * the memslot and check if the start pfn fits
-			 * out page size alignment
-			 */
-			if ((egfn - memslot->base_gfn) > memslot->npages)
-				return H_PARAMETER;
+
+		/* Check for MMIO pass-through */
+		if (memslot->flags & KVM_MEMSLOT_IO) {
+			/* check if the start pfn has page size alignment */
 			pa = kvm->arch.io_slot_pfn[memslot->id] << PAGE_SHIFT;
 			pa += gpa - (memslot->base_gfn << PAGE_SHIFT);
 			if (pa & (psize - 1))
 				return H_PARAMETER;
 
-			/* Make up HPTE */
-			ptel &= ~(HPTE_R_PP0 - psize);
-			ptel |= pa;
+			/* Check WIMG */
+			if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+			    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+				return H_PARAMETER;		
+		} else {
+			/* System RAM */
+			if (porder > kvm->arch.ram_porder)
+				return H_PARAMETER;
+			rmap_entry = &memslot->rmap[gfn - memslot->base_gfn];
+			rmap_entry = real_vmalloc_addr(rmap_entry);
+			pa = *rmap_entry << PAGE_SHIFT;
+			if (!pa)
+				return H_PARAMETER;
+
+			/* Check WIMG */
+			if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+				return H_PARAMETER;
 		}
+		ptel &= ~(HPTE_R_PP0 - psize);
+		ptel |= pa;
+
+	} else {
 		/* Else check for MMIO emulation */
-		else if (cpu_has_feature(CPU_FTR_ARCH_206)) {
-			/* Leave RPN intact */
-
-			/* We force no-execute and set key to 1 to cause
-			 * faults on access.
-			 * XXX Should we instead just return H_PARAMETER if
-			 * N isn't already set ?
-			 */
-			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
-		} else
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
 			return H_PARAMETER;
+
+		/* Leave RPN intact */
+		/* We force no-execute and set key to 1 to cause
+		 * faults on access.
+		 * XXX Should we instead just return H_PARAMETER if
+		 * N isn't already set ?
+		 */
+		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
 	pteh &= ~0x60UL;
 	
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 02/11] KVM: PPC: Keep a record of HV guest view of hashed page table entries
From: Paul Mackerras @ 2011-11-16 22:56 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This adds an array that parallels the guest hashed page table (HPT);
that is, it has one entry per HPTE, which stores the guest's view
of the second doubleword of the corresponding HPTE.  The first
doubleword in the HPTE is the same as the guest's idea of it, so we
don't need to store a copy, but the second doubleword in the HPTE has
the real page number rather than the guest's logical page number.
This allows us to remove the back_translate() and reverse_xlate()
functions.

This "reverse mapping" array is vmalloc'd, meaning that to access it
in real mode we have to walk the kernel's page tables explicitly.
That is done by the new real_vmalloc_addr() function.  (In fact this
returns an address in the linear mapping, so the result is usable
both in real mode and in virtual mode.)

This also corrects a couple of bugs in kvmppc_mmu_get_pp_value().

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   20 +++++
 arch/powerpc/include/asm/kvm_host.h      |   10 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  136 +++++++++++++-----------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |   95 +++++++++++++--------
 4 files changed, 147 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 53692c2..63542dd 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -29,6 +29,14 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 
 #define SPAPR_TCE_SHIFT		12
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_NPTE	(HPT_NPTEG << 3)		/* 8 PTEs per PTEG */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+#endif
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
@@ -86,4 +94,16 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+	/* only handle 4k, 64k and 16M pages for now */
+	if (!(h & HPTE_V_LARGE))
+		return 1ul << 12;		/* 4k page */
+	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
+		return 1ul << 16;		/* 64k page */
+	if ((l & 0xff000) == 0)
+		return 1ul << 24;		/* 16M page */
+	return 0;				/* error */
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f142a2d..56f7046 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -166,9 +166,19 @@ struct kvmppc_rma_info {
 	atomic_t 	 use_count;
 };
 
+/*
+ * The reverse mapping array has one entry for each HPTE,
+ * which stores the guest's view of the second word of the HPTE
+ * (including the guest physical address of the mapping).
+ */
+struct revmap_entry {
+	unsigned long guest_rpte;
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
+	struct revmap_entry *revmap;
 	unsigned long ram_npages;
 	unsigned long ram_psize;
 	unsigned long ram_porder;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index da8c2f4..2b9b8be 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -23,6 +23,7 @@
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -33,11 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
-
 /* Pages in the VRMA are 16MB pages */
 #define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
@@ -51,7 +47,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 {
 	unsigned long hpt;
 	unsigned long lpid;
+	struct revmap_entry *rev;
 
+	/* Allocate guest's hashed page table */
 	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
 			       HPT_ORDER - PAGE_SHIFT);
 	if (!hpt) {
@@ -60,12 +58,20 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 	}
 	kvm->arch.hpt_virt = hpt;
 
+	/* Allocate reverse map array */
+	rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+	if (!rev) {
+		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
+		goto out_freehpt;
+	}
+	kvm->arch.revmap = rev;
+
+	/* Allocate the guest's logical partition ID */
 	do {
 		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
 		if (lpid >= NR_LPIDS) {
 			pr_err("kvm_alloc_hpt: No LPIDs free\n");
-			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
-			return -ENOMEM;
+			goto out_freeboth;
 		}
 	} while (test_and_set_bit(lpid, lpid_inuse));
 
@@ -74,11 +80,18 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
+
+ out_freeboth:
+	vfree(rev);
+ out_freehpt:
+	free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+	return -ENOMEM;
 }
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
 	clear_bit(kvm->arch.lpid, lpid_inuse);
+	vfree(kvm->arch.revmap);
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
 }
 
@@ -89,14 +102,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	unsigned long pfn;
 	unsigned long *hpte;
 	unsigned long hash;
+	unsigned long porder = kvm->arch.ram_porder;
+	struct revmap_entry *rev;
 	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
 
 	if (!pginfo)
 		return;
 
 	/* VRMA can't be > 1TB */
-	if (npages > 1ul << (40 - kvm->arch.ram_porder))
-		npages = 1ul << (40 - kvm->arch.ram_porder);
+	if (npages > 1ul << (40 - porder))
+		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
 	if (npages > HPT_NPTEG)
 		npages = HPT_NPTEG;
@@ -113,15 +128,20 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		 * at most one HPTE per HPTEG, we just assume entry 7
 		 * is available and use it.
 		 */
-		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
-		hpte += 7 * 2;
+		hash = (hash << 3) + 7;
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4));
 		/* HPTE low word - RPN, protection, etc. */
 		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
 			HPTE_R_M | PP_RWXX;
-		wmb();
+		smp_wmb();
 		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
 			HPTE_V_LARGE | HPTE_V_VALID;
+
+		/* Reverse map info */
+		rev = &kvm->arch.revmap[hash];
+		rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
 	}
 }
 
@@ -192,22 +212,6 @@ static unsigned int kvmppc_mmu_book3s_hv_slb_pshift(struct kvmppc_slb *slbe)
 	return 12;     	/* Unsupported */
 }
 
-static unsigned long back_translate(struct kvm *kvm, unsigned long ra)
-{
-	unsigned long offset, rpn, i;
-
-	/* XXX handle MMIO  */
-	offset = ra & (kvm->arch.ram_psize - 1);
-	rpn = (ra - offset) >> PAGE_SHIFT;
-	for (i = 0; i < kvm->arch.ram_npages; ++i)
-		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << kvm->arch.ram_porder) + offset;
-
-	/* Error value */
-	return -1ull;
-}
-
-
 static char pp_read_perm[16] = {
 	/* key = 0 */	1, 1, 1, 1, 0, 0, 1, 0,
 	/* key = 1 */	0, 1, 1, 1, 0, 0, 0, 0
@@ -224,7 +228,7 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 	unsigned int i;
 	unsigned int pshift;
 	unsigned long somask;
-	unsigned long vsid, hash;
+	unsigned long vsid, hash, index;
 	unsigned long avpn;
 	unsigned long *hpte;
 
@@ -252,7 +256,7 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));	
 
 		for (i = 0; i < 16; i += 2) {
-			unsigned long oldv, v, r;
+			unsigned long oldv, v, r, gr;
 
 			/* Read the PTE racily */
 			oldv = hpte[i] & ~HPTE_V_HVLOCK;
@@ -267,6 +271,8 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 				cpu_relax();
 			v = hpte[i];
 			r = hpte[i+1];
+			index = (hash << 3) + (i >> 1);
+			gr = kvm->arch.revmap[index].guest_rpte;
 
 			/* Unlock the HPTE */
 			asm volatile("lwsync" : : : "memory");
@@ -280,7 +286,8 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 			}
 			ret[0] = v;
 			ret[1] = r;
-			return 1;
+			ret[2] = gr;
+			return index;
 		}
 
 		if (avpn & HPTE_V_SECONDARY)
@@ -288,32 +295,20 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 		avpn |= HPTE_V_SECONDARY;
 		hash = hash ^ HPT_HASH_MASK;
 	}
-	return 0;
+	return -1;
 }
 
-static unsigned long kvmppc_mmu_get_real_addr(unsigned long hpte[2],
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 			unsigned long ea)
 {
-	unsigned int hpshift;
-	unsigned long r = hpte[1];
 	unsigned long ra_mask;
 
-	/* Get page size */
-	hpshift = 12;
-	if (hpte[0] & HPTE_V_LARGE) {
-		if ((r & 0xf000) == 0x1000)
-			hpshift = 16;
-		else if ((r & 0xff000) == 0)
-			hpshift = 24;
-		/* XXX TODO: Add 16G */
-	}
-	ra_mask = (1 << hpshift) - 1;
-
+	ra_mask = hpte_page_size(v, r) - 1;
 	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 }
 
 static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
-			struct kvmppc_slb *slbe, unsigned long hpte[2])
+			struct kvmppc_slb *slbe, unsigned long hpte_r)
 {
 	unsigned int key, pp;
 
@@ -322,8 +317,8 @@ static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
 	else 
 		key = slbe->origv & SLB_VSID_KS;
 
-	pp = hpte[0] & HPTE_R_PP;
-	if (pp & HPTE_R_PP0)
+	pp = hpte_r & HPTE_R_PP;
+	if (hpte_r & HPTE_R_PP0)
 		pp |= 4;
 	if (key)
 		pp |= 8;
@@ -340,9 +335,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_slb *slbe;
-	unsigned int pp, skey;
-	unsigned long hpte[2];
-	unsigned long ra;
+	unsigned int pp;
+	unsigned long hpte[3];
+	int index;
 
 	/* Get SLB entry */
 	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -350,37 +345,23 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return -EINVAL;
 
 	/* Find the HPTE in the hash table */
-	if (!kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte))
+	index = kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte);
+	if (index < 0)
 		return -ENOENT;
 
 	gpte->eaddr = eaddr;
 	gpte->vpage = ((hpte[0] & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 
-	/* Get the real address from the HPTE */
-	ra = kvmppc_mmu_get_real_addr(hpte, eaddr);
-
 	/* Get PP bits and key for permission check */
-	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
 
 	/* Calculate permissions */
 	gpte->may_execute = !(hpte[1] & (HPTE_R_N | HPTE_R_G));
 	gpte->may_read = pp_read_perm[pp];
 	gpte->may_write = pp_write_perm[pp];
 
-	/*
-	 * Get the storage key value.  31 means a special no-access
-	 * HPTE that we have inserted, with the guest physical address
-	 * in the RPN field.  Other keys mean that the the RPN field
-	 * contains the real address.
-	 */
-	skey = ((hpte[1] & HPTE_R_KEY_HI) >> 57) |
-		((hpte[1] & HPTE_R_KEY_LO) >> 9);
-	if (skey == 31) {
-		gpte->raddr = ra;
-		return 0;
-	}
-
-	gpte->raddr = back_translate(kvm, ra);
+	/* Get the guest physical address */
+	gpte->raddr = kvmppc_mmu_get_real_addr(hpte[0], hpte[2], eaddr);
 	return 0;
 }
 
@@ -388,23 +369,24 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_slb *slbe;
-	unsigned long hpte[2];
+	unsigned long hpte[3];
 	unsigned long srr0 = kvmppc_get_pc(vcpu);
 	unsigned long ea = vcpu->arch.fault_dar;	
 	unsigned long gpa;
 	unsigned int pp, ok;
 	u32 last_inst, dsisr = vcpu->arch.fault_dsisr;
-	int ret = 0;
+	int index, ret = 0;
 
 	/*
 	 * Translate the access address.
 	 * If we can't find the HPTE, just return and re-execute the
-	 * instruction.f
+	 * instruction.
  	 */
 	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea);
 	if (!slbe)
 		return RESUME_GUEST;
-	if (!kvmppc_hv_find_hpte(kvm, ea, slbe, hpte))
+	index = kvmppc_hv_find_hpte(kvm, ea, slbe, hpte);
+	if (index < 0)
 		return RESUME_GUEST;
 
 	/*
@@ -420,7 +402,7 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	/* Check whether the attempted access was permitted */
-	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
 	ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] : pp_read_perm[pp];
 	if (!ok) {
 		vcpu->arch.shregs.dar = ea;
@@ -431,7 +413,7 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	/* Translate the logical address */
-	gpa = kvmppc_mmu_get_real_addr(hpte, ea);
+	gpa = kvmppc_mmu_get_real_addr(hpte[0], hpte[2], ea);
 
 	/*
 	 * We try to load the last instruction.  We don't let
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6cb2f23..2da8fac 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -20,10 +20,19 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+/* Translate address of a vmalloc'd thing to a linear map address */
+static void *real_vmalloc_addr(void *x)
+{
+	unsigned long addr = (unsigned long) x;
+	pte_t *p;
+
+	p = find_linux_pte(swapper_pg_dir, addr);
+	if (!p || !pte_present(*p))
+		return NULL;
+	/* assume we don't have huge pages in vmalloc space... */
+	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
+	return __va(addr);
+}
 
 /*
  * Since this file is built in even if KVM is a module, we need
@@ -54,6 +63,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long i, lpn, pa, gpa, psize;
 	unsigned long *hpte;
+	struct revmap_entry *rev;
+	unsigned long g_ptel = ptel;
 
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
@@ -153,7 +164,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	}
 	pteh &= ~0x60UL;
 	
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
@@ -166,18 +177,22 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 				break;
 			hpte += 2;
 		}
+		pte_index += i;
 	} else {
-		i = 0;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 			return H_PTEG_FULL;
 	}
+
+	/* Save away the guest's idea of the second HPTE dword */
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev)
+		rev->guest_rpte = g_ptel;
 	hpte[1] = ptel;
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
-	// XXX atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
-	vcpu->arch.gpr[4] = pte_index + i;
+	vcpu->arch.gpr[4] = pte_index;
 	return H_SUCCESS;
 }
 
@@ -209,7 +224,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *hpte;
 	unsigned long v, r, rb;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -264,7 +279,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 		if (req == 3)
 			break;
 		if (req != 1 || flags == 3 ||
-		    pte_index >= (HPT_NPTEG << 3)) {
+		    pte_index >= HPT_NPTE) {
 			/* parameter error */
 			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
 			ret = H_PARAMETER;
@@ -327,9 +342,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte;
-	unsigned long v, r, rb;
+	struct revmap_entry *rev;
+	unsigned long v, r, rb, mask, bits;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	/* Don't let it set a normal memory page to key 31 */
 	if (((flags >> 9) & 0x1f) == 0x1f)
@@ -347,17 +363,30 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		flags |= H_LOCAL;
 	v = hpte[0];
-	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
-			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
-	r |= (flags << 55) & HPTE_R_PP0;
-	r |= (flags << 48) & HPTE_R_KEY_HI;
-	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	bits = (flags << 55) & HPTE_R_PP0;
+	bits |= (flags << 48) & HPTE_R_KEY_HI;
+	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	/* Update guest view of 2nd HPTE dword */
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev) {
+		mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		r = rev->guest_rpte & ~mask;
+		r |= bits;
+		rev->guest_rpte = r;
+	}
 
  	/* Don't let guest remove N or key from emulated MMIO pages */
 	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == 
 	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
-		r |= HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO;
-		
+		mask = HPTE_R_PP0 | HPTE_R_PP;
+	else
+		mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+	r = (hpte[1] & ~mask) | (bits & mask);
+
+	/* Update HPTE */
 	rb = compute_tlbie_rb(v, r, pte_index);
 	hpte[0] = v & ~HPTE_V_VALID;
 	if (!(flags & H_LOCAL)) {
@@ -380,39 +409,31 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	return H_SUCCESS;
 }
 
-static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
-{
-	long int i;
-	unsigned long offset, rpn;
-
-	/* XXX handle MMIO and EMU */
-	offset = realaddr & (kvm->arch.ram_psize - 1);
-	rpn = (realaddr - offset) >> PAGE_SHIFT;
-	for (i = 0; i < kvm->arch.ram_npages; ++i)
-		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << kvm->arch.ram_porder) + offset;
-	return HPTE_R_RPN;	/* all 1s in the RPN field */
-}
-
 long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		   unsigned long pte_index)
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte, r;
 	int i, n = 1;
+	struct revmap_entry *rev = NULL;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
 		n = 4;
 	}
+	if (flags & H_R_XLATE)
+		rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	for (i = 0; i < n; ++i, ++pte_index) {
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		r = hpte[1];
-		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
-			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
-				(r & ~HPTE_R_RPN);
+		if (hpte[0] & HPTE_V_VALID) {
+			if (rev)
+				r = rev[i].guest_rpte;
+			else
+				r = hpte[1] | HPTE_R_RPN;
+		}
 		vcpu->arch.gpr[4 + i * 2] = hpte[0];
 		vcpu->arch.gpr[5 + i * 2] = r;
 	}
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 01/11] KVM: PPC: Add memory-mapping support for PCI passthrough and emulation
From: Paul Mackerras @ 2011-11-16 22:52 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds support for adding PCI device I/O regions to the guest memory
map, and for trapping guest accesses to emulated MMIO regions and
delivering them to qemu for MMIO emulation.  To trap guest accesses to
emulated MMIO regions, we reserve key 31 for the hypervisor's use and
set the VPM1 bit in LPCR, which sends all page faults to the host.
Any page fault that is not a key fault gets reflected immediately to the
guest.  We set HPTEs for emulated MMIO regions to have key = 31, and
don't allow the guest to create HPTEs with key = 31.  Any page fault
that is a key fault with key = 31 is then a candidate for MMIO
emulation and thus gets sent up to qemu.  We also load the instruction
that caused the fault for use later when qemu has done the emulation.

[paulus@samba.org: Cleaned up, moved kvmppc_book3s_hv_emulate_mmio()
 to book3s_64_mmu_hv.c]

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h    |    1 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   24 +++
 arch/powerpc/include/asm/kvm_host.h      |    2 +
 arch/powerpc/include/asm/kvm_ppc.h       |    1 +
 arch/powerpc/include/asm/reg.h           |    4 +
 arch/powerpc/kernel/exceptions-64s.S     |    8 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  301 +++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv.c             |   91 +++++++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  153 ++++++++++++----
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  131 ++++++++++++-
 arch/powerpc/kvm/book3s_pr.c             |    1 +
 arch/powerpc/kvm/booke.c                 |    1 +
 arch/powerpc/kvm/powerpc.c               |    2 +-
 include/linux/kvm.h                      |    3 +
 14 files changed, 656 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index deb8a4e..bd8345f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -121,6 +121,7 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+extern int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d0ac94f..53692c2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -62,4 +62,28 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	return rb;
 }
 
+/*
+ * We use a lock bit in HPTE dword 0 to synchronize updates and
+ * accesses to each HPTE.
+ */
+#define HPTE_V_HVLOCK	0x40UL
+
+static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+	/* Try once to set HPTE_V_HVLOCK in hpte[0]; fails if any of 'bits' is set. */
+	asm volatile("	ldarx	%0,0,%2\n"	/* tmp = hpte[0], with reservation */
+		     "	and.	%1,%0,%3\n"	/* old = tmp & bits */
+		     "	bne	2f\n"		/* a tested bit is set: give up, old != 0 */
+		     "	ori	%0,%0,%4\n"	/* tmp |= HPTE_V_HVLOCK */
+		     "  stdcx.	%0,0,%2\n"	/* conditionally store tmp back */
+		     "	beq+	2f\n"		/* store succeeded: old is still 0 */
+		     "	mr	%1,%3\n"	/* store failed: old = bits; "li" would load bits' register number */
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;	/* true only if this call stored the lock bit (bits must be non-zero) */
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bf8af5d..f142a2d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -186,6 +186,8 @@ struct kvm_arch {
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
+	unsigned long io_slot_pfn[KVM_MEMORY_SLOTS +
+				  KVM_PRIVATE_MEM_SLOTS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a284f20..8c372b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -132,6 +132,7 @@ extern void kvm_release_rma(struct kvmppc_rma_info *ri);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 559da19..ff3d627 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -216,6 +216,7 @@
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* STAB/SLB miss */
+#define   DSISR_KEYFAULT	0x00200000	/* Key fault */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_TBWL	0x11C	/* Time Base Lower Register (super, R/W) */
@@ -493,6 +494,9 @@
 #define SPRN_SPRG7	0x117	/* Special Purpose Register General 7 */
 #define SPRN_SRR0	0x01A	/* Save/Restore Register 0 */
 #define SPRN_SRR1	0x01B	/* Save/Restore Register 1 */
+#define   SRR1_ISI_NOPT		0x40000000 /* ISI: Not found in hash */
+#define   SRR1_ISI_N_OR_G	0x10000000 /* ISI: Access is no-exec or G */
+#define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKESYSERR	0x00300000 /* System error */
 #define   SRR1_WAKEEE		0x00200000 /* External interrupt */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 396d080..9c2f0e2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -100,14 +100,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST_PR, 0x300)
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -329,8 +329,8 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
 	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bc3a2ea..da8c2f4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -158,10 +158,307 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
 }
 
+static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
+							 gva_t eaddr)
+{
+	u64 mask;
+	int i;
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
+			continue;
+
+		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
+			mask = ESID_MASK_1T;
+		else
+			mask = ESID_MASK;
+
+		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
+			return &vcpu->arch.slb[i];
+	}
+	return NULL;
+}
+
+static unsigned int kvmppc_mmu_book3s_hv_slb_pshift(struct kvmppc_slb *slbe)
+{
+	if (!(slbe->origv & SLB_VSID_L))
+		return 12;	/*  4K */
+	switch ((slbe->origv >> 4) & 0x3) {
+	case 0: return 24;	/* 16M */
+	case 1: return 16;	/* 64K */
+	case 2: return 34;	/* 16G */
+	case 3: return 20;	/* 1M !!! but we don't support it */
+	}
+	return 12;     	/* Unsupported */
+}
+
+static unsigned long back_translate(struct kvm *kvm, unsigned long ra)
+{
+	unsigned long offset, rpn, i;
+
+	/* XXX handle MMIO  */
+	offset = ra & (kvm->arch.ram_psize - 1);
+	rpn = (ra - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << kvm->arch.ram_porder) + offset;
+
+	/* Error value */
+	return -1ull;
+}
+
+
+static char pp_read_perm[16] = {
+	/* key = 0 */	1, 1, 1, 1, 0, 0, 1, 0,
+	/* key = 1 */	0, 1, 1, 1, 0, 0, 0, 0
+};
+
+static char pp_write_perm[16] = {
+	/* key = 0 */	1, 1, 1, 0, 0, 0, 0, 0,
+	/* key = 1 */	0, 0, 1, 0, 0, 0, 0, 0
+};
+
+static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
+			struct kvmppc_slb *slbe, unsigned long *ret)
+{
+	unsigned int i;
+	unsigned int pshift;
+	unsigned long somask;
+	unsigned long vsid, hash;
+	unsigned long avpn;
+	unsigned long *hpte;
+
+	/* Get page shift, work out hash and AVPN etc. */
+	pshift = kvmppc_mmu_book3s_hv_slb_pshift(slbe);
+	if (slbe->origv & SLB_VSID_B_1T) {
+		somask = (1UL << 40) - 1;
+		vsid = (slbe->origv & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
+		vsid ^= vsid << 25;
+	} else {
+		somask = (1UL << 28) - 1;
+		vsid = (slbe->origv & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
+	}
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+	avpn = slbe->origv & ~(somask >> 16);	/* also includes B */
+	avpn |= (eaddr & somask) >> 16;
+
+	if (pshift >= 24)
+		avpn &= ~((1UL << (pshift - 16)) - 1);
+	else
+		avpn &= ~0x7fUL;
+	avpn |= HPTE_V_VALID;
+
+	for (;;) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));	
+
+		for (i = 0; i < 16; i += 2) {
+			unsigned long oldv, v, r;
+
+			/* Read the PTE racily */
+			oldv = hpte[i] & ~HPTE_V_HVLOCK;
+
+			/* Check valid, hash, segment size and AVPN */
+			if (avpn != (oldv & (SLB_VSID_B | HPTE_V_AVPN |
+					     HPTE_V_SECONDARY | HPTE_V_VALID)))
+				continue;
+
+			/* Lock the PTE and read it under the lock */
+			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
+				cpu_relax();
+			v = hpte[i];
+			r = hpte[i+1];
+
+			/* Unlock the HPTE */
+			asm volatile("lwsync" : : : "memory");
+			v &= ~HPTE_V_HVLOCK;
+			hpte[i] = v;
+
+			/* Still OK? */
+			if (v != oldv) {
+				i -= 2;
+				continue;
+			}
+			ret[0] = v;
+			ret[1] = r;
+			return 1;
+		}
+
+		if (avpn & HPTE_V_SECONDARY)
+			break;
+		avpn |= HPTE_V_SECONDARY;
+		hash = hash ^ HPT_HASH_MASK;
+	}
+	return 0;
+}
+
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long hpte[2],
+			unsigned long ea)
+{
+	unsigned int hpshift;
+	unsigned long r = hpte[1];
+	unsigned long ra_mask;
+
+	/* Get page size */
+	hpshift = 12;
+	if (hpte[0] & HPTE_V_LARGE) {
+		if ((r & 0xf000) == 0x1000)
+			hpshift = 16;
+		else if ((r & 0xff000) == 0)
+			hpshift = 24;
+		/* XXX TODO: Add 16G */
+	}
+	ra_mask = (1 << hpshift) - 1;
+
+	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
+}
+
+static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
+			struct kvmppc_slb *slbe, unsigned long hpte[2])
+{
+	unsigned int key, pp;
+
+	if (vcpu->arch.shared->msr & MSR_PR)
+		key = slbe->origv & SLB_VSID_KP;
+	else 
+		key = slbe->origv & SLB_VSID_KS;
+
+	pp = hpte[0] & HPTE_R_PP;
+	if (pp & HPTE_R_PP0)
+		pp |= 4;
+	if (key)
+		pp |= 8;
+	return pp;
+}
+
+/*
+ * XXX TODO: Handle key values from guest (add them to kvmppc_pte),
+ * for now we don't care tho as Linux guest doesn't use
+ * them. We also force key 31 for any MMIO emulation mapping
+ */
 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-				struct kvmppc_pte *gpte, bool data)
+			struct kvmppc_pte *gpte, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned int pp, skey;
+	unsigned long hpte[2];
+	unsigned long ra;
+
+	/* Get SLB entry */
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+	if (!slbe)
+		return -EINVAL;
+
+	/* Find the HPTE in the hash table */
+	if (!kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte))
+		return -ENOENT;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = ((hpte[0] & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
+
+	/* Get the real address from the HPTE */
+	ra = kvmppc_mmu_get_real_addr(hpte, eaddr);
+
+	/* Get PP bits and key for permission check */
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+
+	/* Calculate permissions */
+	gpte->may_execute = !(hpte[1] & (HPTE_R_N | HPTE_R_G));
+	gpte->may_read = pp_read_perm[pp];
+	gpte->may_write = pp_write_perm[pp];
+
+	/*
+	 * Get the storage key value.  31 means a special no-access
+	 * HPTE that we have inserted, with the guest physical address
+	 * in the RPN field.  Other keys mean that the the RPN field
+	 * contains the real address.
+	 */
+	skey = ((hpte[1] & HPTE_R_KEY_HI) >> 57) |
+		((hpte[1] & HPTE_R_KEY_LO) >> 9);
+	if (skey == 31) {
+		gpte->raddr = ra;
+		return 0;
+	}
+
+	gpte->raddr = back_translate(kvm, ra);
+	return 0;
+}
+
+int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
-	return -ENOENT;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long hpte[2];
+	unsigned long srr0 = kvmppc_get_pc(vcpu);
+	unsigned long ea = vcpu->arch.fault_dar;	
+	unsigned long gpa;
+	unsigned int pp, ok;
+	u32 last_inst, dsisr = vcpu->arch.fault_dsisr;
+	int ret = 0;
+
+	/*
+	 * Translate the access address.
+	 * If we can't find the HPTE, just return and re-execute the
+	 * instruction.f
+ 	 */
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea);
+	if (!slbe)
+		return RESUME_GUEST;
+	if (!kvmppc_hv_find_hpte(kvm, ea, slbe, hpte))
+		return RESUME_GUEST;
+
+	/*
+	 * Check if this is a special HPTE (storage key = 31); if not then
+	 * this is just a key fault in the guest.
+	 */
+	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) !=
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
+		vcpu->arch.shregs.dsisr = dsisr;
+		vcpu->arch.shregs.dar = ea;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		return RESUME_GUEST;
+	}
+
+	/* Check whether the attempted access was permitted */
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] : pp_read_perm[pp];
+	if (!ok) {
+		vcpu->arch.shregs.dar = ea;
+		vcpu->arch.shregs.dsisr = (dsisr & DSISR_ISSTORE) |
+			DSISR_PROTFAULT;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address */
+	gpa = kvmppc_mmu_get_real_addr(hpte, ea);
+
+	/*
+	 * We try to load the last instruction.  We don't let
+	 * emulate_instruction do it as its failure mode is pretty bogus.
+	 * If we fail, we just return to the guest and try executing it again.
+	 */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
+		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+		if (ret != EMULATE_DONE)
+			return RESUME_GUEST;
+		vcpu->arch.last_inst = last_inst;
+	}
+
+	/*
+	 * XXX WARNING: We do not know for sure whether the instruction we just
+	 * read from memory is the same that caused the fault in the first
+	 * place. We don't have a problem with the guest shooting itself in
+	 * the foot that way, however we must be careful that we enforce
+	 * the write permission based on the instruction we are actually
+	 * emulating, not based on dsisr. Unfortunately, the KVM code for
+	 * instruction emulation isn't smart enough for that to work
+	 * so right now we just do it badly and racily, but that will need
+	 * fixing
+	 */
+
+	vcpu->arch.paddr_accessed = gpa;
+	return kvmppc_emulate_mmio(run, vcpu);
 }
 
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b8ad233..d1f0774 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -320,8 +320,15 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * We get these next two if the guest does a bad real-mode access,
 	 * as we have enabled VRMA (virtualized real mode area) mode in the
 	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 *
+	 * We also get them for MMIO emulation via key faults
 	 */
 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		/* We attempt MMIO emulation for key faults */
+		if (vcpu->arch.fault_dsisr & DSISR_KEYFAULT) {
+			r = kvmppc_book3s_hv_emulate_mmio(run, vcpu);
+			break;
+		}
 		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
 		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
 		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
@@ -329,7 +336,7 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
 		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
-					0x08000000);
+					vcpu->arch.shregs.msr & 0x78000000);
 		r = RESUME_GUEST;
 		break;
 	/*
@@ -1068,17 +1075,67 @@ static struct page *hva_to_page(unsigned long addr)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_userspace_memory_region *mem)
 {
 	unsigned long psize, porder;
 	unsigned long i, npages, totalpages;
 	unsigned long pg_ix;
 	struct kvmppc_pginfo *pginfo;
-	unsigned long hva;
 	struct kvmppc_rma_info *ri = NULL;
+	struct vm_area_struct *vma;
 	struct page *page;
+	unsigned long hva;
+
+	/*
+	 * This could be an attempt at adding memory or it could be MMIO
+	 * pass-through. We need to treat them differently but the only
+	 * way for us to know what it is is to look at the VMA and play
+	 * guess work so let's just do that
+	 */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, mem->userspace_addr);
+	if (!vma || vma->vm_start > mem->userspace_addr)
+		goto err_unlock;
+
+	/* Anything with VM_IO will be handled as MMIO pass-through */
+	if (vma->vm_flags & VM_IO) {
+		unsigned long offset = mem->userspace_addr - vma->vm_start;
+
+		/* We require VM_PFNMAP for now */
+		if (!(vma->vm_flags & VM_PFNMAP))
+			goto err_unlock;
+
+		/*
+		 * We require read & write permission as we cannot yet
+		 * enforce guest read-only protection or no access.
+		 */
+		if ((vma->vm_flags & (VM_READ | VM_WRITE)) !=
+		    (VM_READ | VM_WRITE))
+			goto err_unlock;
+
+		/*
+		 * Tag the memslot with a private flag and store the pfn
+		 * in a separate array for use by H_ENTER
+		 */
+		memslot->flags |= KVM_MEMSLOT_IO;
+		kvm->arch.io_slot_pfn[memslot->id] =
+			vma->vm_pgoff + (offset >> PAGE_SHIFT);
+		up_read(&current->mm->mmap_sem);
+		return 0;
+	}
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+	}
+
+	up_read(&current->mm->mmap_sem);
 
-	/* For now, only allow 16MB pages */
+	/* For now, only allow 16MB pages for memory */
 	porder = LARGE_PAGE_ORDER;
 	psize = 1ul << porder;
 	if ((mem->memory_size & (psize - 1)) ||
@@ -1102,23 +1159,13 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (totalpages > kvm->arch.ram_npages)
 		kvm->arch.ram_npages = totalpages;
 
-	/* Is this one of our preallocated RMAs? */
-	if (mem->guest_phys_addr == 0) {
-		struct vm_area_struct *vma;
-
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, mem->userspace_addr);
-		if (vma && vma->vm_file &&
-		    vma->vm_file->f_op == &kvm_rma_fops &&
-		    mem->userspace_addr == vma->vm_start)
-			ri = vma->vm_file->private_data;
-		up_read(&current->mm->mmap_sem);
-		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
-			pr_err("CPU requires an RMO\n");
-			return -EINVAL;
-		}
+	if (!ri && mem->guest_phys_addr == 0 &&
+	    cpu_has_feature(CPU_FTR_ARCH_201)) {
+		pr_err("CPU requires an RMO\n");
+		return -EINVAL;
 	}
 
+	/* Handle pre-allocated RMAs */
 	if (ri) {
 		unsigned long rma_size;
 		unsigned long lpcr;
@@ -1184,6 +1231,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 	return 0;
 
+ err_unlock:
+	up_read(&current->mm->mmap_sem);
  err:
 	return -EINVAL;
 }
@@ -1241,6 +1290,10 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		lpcr &= LPCR_PECE | LPCR_LPES;
 		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
 			LPCR_VPM0 | LPCR_VRMA_L;
+		/* XXX Enable MMIO emu, we should probably do that
+		 *     only upon instruction from qemu... 
+		 */
+		lpcr |= LPCR_VPM1;
 	}
 	kvm->arch.lpcr = lpcr;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index bacb0cf..6cb2f23 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -25,24 +25,26 @@
 #define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
 #define HPT_HASH_MASK	(HPT_NPTEG - 1)
 
-#define HPTE_V_HVLOCK	0x40UL
-
-static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+/*
+ * Since this file is built in even if KVM is a module, we need
+ * a local copy of this function for the case where kvm_main.c is
+ * modular.
+ */
+static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
+						      gfn_t gfn)
 {
-	unsigned long tmp, old;
+	int i;
+	struct kvm_memslots *slots;
 
-	asm volatile("	ldarx	%0,0,%2\n"
-		     "	and.	%1,%0,%3\n"
-		     "	bne	2f\n"
-		     "	ori	%0,%0,%4\n"
-		     "  stdcx.	%0,0,%2\n"
-		     "	beq+	2f\n"
-		     "	li	%1,%3\n"
-		     "2:	isync"
-		     : "=&r" (tmp), "=&r" (old)
-		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
-		     : "cc", "memory");
-	return old == 0;
+	slots = kvm_memslots(kvm);
+	for (i = 0; i < slots->nmemslots; ++i) {
+		struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+		if (gfn >= memslot->base_gfn
+		    && gfn < memslot->base_gfn + memslot->npages)
+			return memslot;
+	}
+	return NULL;
 }
 
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
@@ -50,7 +52,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long i, lpn, pa;
+	unsigned long i, lpn, pa, gpa, psize;
 	unsigned long *hpte;
 
 	/* only handle 4k, 64k and 16M pages for now */
@@ -69,19 +71,88 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		} else
 			return H_PARAMETER;
 	}
-	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
-	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
-		return H_PARAMETER;
-	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
-	if (!pa)
-		return H_PARAMETER;
-	/* Check WIMG */
-	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+	psize = (1ul << porder);
+
+	/* We do not allow the guest to set key 31 which is reserved
+	 * for MMIO emulation. We don't want to allow MMIO emulation
+	 * to be used to access RAM due to possible races between
+	 * emulation and TLB invalidations.
+	 *
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime, at which
+	 * point performing the subsequent memory access on the old
+	 * physical address is a violation of the architecture and
+	 * a security hole.
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now
+	 */
+	if ((ptel & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
 		return H_PARAMETER;
+
+	/* Figure out the type of page and handle accordingly,
+	 * first check for RAM pages
+	 */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	if ((gpa >> kvm->arch.ram_porder) < kvm->arch.ram_npages) {
+		lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+		if (porder > kvm->arch.ram_porder)
+			return H_PARAMETER;
+		pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+		if (!pa)
+			return H_PARAMETER;
+		/* Check WIMG */
+		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+			return H_PARAMETER;
+		ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+		ptel |= pa;
+	} else {
+		struct kvm_memory_slot *memslot;
+
+		/* Check WIMG */
+		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+			return H_PARAMETER;		
+
+		/* Else check for MMIO pass-through */
+		memslot = builtin_gfn_to_memslot(kvm, gpa >> PAGE_SHIFT);
+		if (memslot && memslot->flags & KVM_MEMSLOT_IO) {
+			unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
+
+			/* Check if the requested page fits entirely in
+			 * the memslot and check if the start pfn fits
+			 * our page size alignment
+			 */
+			if ((egfn - memslot->base_gfn) > memslot->npages)
+				return H_PARAMETER;
+			pa = kvm->arch.io_slot_pfn[memslot->id] << PAGE_SHIFT;
+			pa += gpa - (memslot->base_gfn << PAGE_SHIFT);
+			if (pa & (psize - 1))
+				return H_PARAMETER;
+
+			/* Make up HPTE */
+			ptel &= ~(HPTE_R_PP0 - psize);
+			ptel |= pa;
+		}
+		/* Else check for MMIO emulation */
+		else if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+			/* Leave RPN intact */
+
+			/* We force no-execute and set key to 31 to cause
+			 * faults on access.
+			 * XXX Should we instead just return H_PARAMETER if
+			 * N isn't already set ?
+			 */
+			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+		} else
+			return H_PARAMETER;
+	}
 	pteh &= ~0x60UL;
-	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
-	ptel |= pa;
+	
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
@@ -91,21 +162,21 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			if (i == 8)
 				return H_PTEG_FULL;
 			if ((*hpte & HPTE_V_VALID) == 0 &&
-			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 				break;
 			hpte += 2;
 		}
 	} else {
 		i = 0;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 			return H_PTEG_FULL;
 	}
 	hpte[1] = ptel;
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
-	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	// XXX atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
 	vcpu->arch.gpr[4] = pte_index + i;
 	return H_SUCCESS;
 }
@@ -141,7 +212,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	if ((hpte[0] & HPTE_V_VALID) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
@@ -200,7 +271,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			break;
 		}
 		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+		while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
 			cpu_relax();
 		found = 0;
 		if (hp[0] & HPTE_V_VALID) {
@@ -260,14 +331,19 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
+	/* Don't let it set a normal memory page to key 31 */
+	if (((flags >> 9) & 0x1f) == 0x1f)
+		return H_PARAMETER;
+
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	if ((hpte[0] & HPTE_V_VALID) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
 		return H_NOT_FOUND;
 	}
+
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		flags |= H_LOCAL;
 	v = hpte[0];
@@ -276,6 +352,12 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	r |= (flags << 55) & HPTE_R_PP0;
 	r |= (flags << 48) & HPTE_R_KEY_HI;
 	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+ 	/* Don't let guest remove N or key from emulated MMIO pages */
+	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == 
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
+		r |= HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		
 	rb = compute_tlbie_rb(v, r, pte_index);
 	hpte[0] = v & ~HPTE_V_VALID;
 	if (!(flags & H_LOCAL)) {
@@ -303,11 +385,12 @@ static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
 	long int i;
 	unsigned long offset, rpn;
 
+	/* XXX handle MMIO and EMU */
 	offset = realaddr & (kvm->arch.ram_psize - 1);
 	rpn = (realaddr - offset) >> PAGE_SHIFT;
 	for (i = 0; i < kvm->arch.ram_npages; ++i)
 		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << PAGE_SHIFT) + offset;
+			return (i << kvm->arch.ram_porder) + offset;
 	return HPTE_R_RPN;	/* all 1s in the RPN field */
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 44d8829..7916e1d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -230,10 +230,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	mtspr	SPRN_DABR,r6
 
 BEGIN_FTR_SECTION
-	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	/* Restore AMR and UAMOR and set AMOR such that
+	 *
+	 *   - AMOR allows changes to all keys except 31
+	 *   - AMR disables access for key 31
+	 *   - Other AMR and UAMOR bits are under guest control
+	 *
+	 * Key 31 is thus protected for use by MMIO emulation
+	 */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
-	li	r7,-1
+	li	r7,-4 /* Disable access to key 31 */
+	ori	r5,r5,3
+	and	r6,r6,r7
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
 	mtspr	SPRN_AMOR,r7
@@ -544,13 +553,24 @@ kvmppc_interrupt:
 	 * Register contents:
 	 * R12		= interrupt vector
 	 * R13		= PACA
-	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest CR, R12 saved in PACA HSTATE_SCRATCH1/0
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
 	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
 	std	r9, HSTATE_HOST_R2(r13)
-	ld	r9, HSTATE_KVM_VCPU(r13)
 
+BEGIN_FTR_SECTION
+	/* check for HDSI/HISI for fast reflection to guest when
+	 * VPM is enabled
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + 2
+	beq	kvmppc_hdsi
+	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE + 2
+	beq	kvmppc_hisi
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+.Lhxsi_cont:
+	ld	r9, HSTATE_KVM_VCPU(r13)
 	/* Save registers */
 
 	std	r0, VCPU_GPR(r0)(r9)
@@ -631,7 +651,7 @@ hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 
 	/* Save HEIR (HV emulation assist reg) in last_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
-	li	r3,-1
+	li	r3,KVM_INST_FETCH_FAILED
 BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
 	bne	11f
@@ -649,7 +669,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	std	r6, VCPU_DAR(r9)
 	stw	r7, VCPU_DSISR(r9)
 	std	r8, VCPU_CTR(r9)
-	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI)
+	 * also try to load the instruction
+	 */
 BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
@@ -1091,11 +1113,108 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
+	/* Out of line H_DATA_STORAGE exception, grab HDAR and HDSISR
+	 * and try to load the instruction from guest memory. Note that
+	 * VCPU_LAST_INST has already been set to -1 at this point.
+	 */
 6:	mfspr	r6,SPRN_HDAR
 	mfspr	r7,SPRN_HDSISR
+
+	/* Only fetch instruction if guest IR relocation is enabled */
+	andi.	r0,r11,MSR_IR
+	beq	7b
+	
+	/* In case lwz faults */
+	li	r8,KVM_INST_FETCH_FAILED
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r0,KVM_GUEST_MODE_SKIP
+	stb	r0,HSTATE_IN_GUEST(r13)
+
+	/* Do the access with MSR:DR enabled */
+	mfmsr	r3
+	ori	r4,r3,MSR_DR		/* Enable paging for data */
+	mtmsrd	r4
+	sync
+	lwz	r8,0(r10)
+	mtmsr	r3
+	sync
+
+	/* Store the result */
+	stw	r8,VCPU_LAST_INST(r9)
+
+	/* Unset guest mode. XXX This is a dup, maybe we could
+	 * move the original later in the code flow, just before
+	 * starting the MMU switch
+	 */
+	li	r0,KVM_GUEST_MODE_NONE
+	stb	r0,HSTATE_IN_GUEST(r13)
 	b	7b
 
 /*
+ * See if this H[DI]SI interrupt is one that can be bounced to the guest.
+ * It can be bounced immediately if it is not in real mode and is
+ * not a key fault (DSI) or not a non-exec fault (ISI).
+ *
+ * Here, r9, r12 and cr are saved in the PACA, r13 is saved in SPRN_SCRATCH0.
+ */
+kvmppc_hdsi:
+	std	r0, PACA_EXGEN(r13)
+	mfspr	r9, SPRN_HDSISR
+	mfspr	r12, SPRN_HSRR1
+	andis.	r0, r9, DSISR_KEYFAULT@h
+	bne	1f
+	andi.	r0, r12, MSR_DR
+	beq	1f
+	mfspr	r0, SPRN_HSRR0		/* turn it into a DSI for the guest */
+	mtspr	SPRN_DSISR, r9
+	mtspr	SPRN_SRR1, r12
+	mtspr	SPRN_SRR0, r0
+	mfspr	r9, SPRN_HDAR
+	li	r0, BOOK3S_INTERRUPT_DATA_STORAGE
+	li	r12, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r12, r12, 63
+	mtspr	SPRN_DAR, r9
+	mtspr	SPRN_HSRR0, r0
+	mtspr	SPRN_HSRR1, r12
+	lwz	r0, HSTATE_SCRATCH1(r13)
+	mtocrf	0x80, r0
+	ld	r9, HSTATE_HOST_R2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	ld	r0, PACA_EXGEN(r13)
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+1:	ld	r0, PACA_EXGEN(r13)
+	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + 2
+	b	.Lhxsi_cont
+
+kvmppc_hisi:
+	mfspr	r9, SPRN_HSRR1
+	andi.	r12, r9, MSR_IR
+	beq	1f
+	andis.	r12, r9, SRR1_ISI_N_OR_G@h
+	bne	1f
+	mfspr	r12, SPRN_HSRR0		/* turn it into a ISI for the guest */
+	mtspr	SPRN_SRR1, r9
+	mtspr	SPRN_SRR0, r12
+	li	r9, BOOK3S_INTERRUPT_INST_STORAGE
+	li	r12, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r12, r12, 63
+	mtspr	SPRN_HSRR0, r9
+	mtspr	SPRN_HSRR1, r12
+	lwz	r9, HSTATE_SCRATCH1(r13)
+	mtocrf	0x80, r9
+	ld	r9, HSTATE_HOST_R2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+1:	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE + 2
+	b	.Lhxsi_cont
+
+/*
  * Try to handle an hcall in real mode.
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f3a6414..30e7c2e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1007,6 +1007,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9c78589..e9186e9 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -895,6 +895,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c33f6a7..084d1c5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -265,7 +265,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return kvmppc_core_prepare_memory_region(kvm, mem);
+	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c107fae..774b04d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -105,6 +105,9 @@ struct kvm_userspace_memory_region {
 #define KVM_MEM_LOG_DIRTY_PAGES  1UL
 #define KVM_MEMSLOT_INVALID      (1UL << 1)
 
+/* Kernel internal use */
+#define KVM_MEMSLOT_IO		 (1UL << 31)
+
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
 	/*
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 0/11] KVM: PPC: Update Book3S HV memory handling
From: Paul Mackerras @ 2011-11-16 22:50 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf

This series of patches updates the Book3S-HV KVM code that manages the
guest hashed page table (HPT) to enable several things:

* MMIO emulation and MMIO pass-through

* Use of small pages (4kB or 64kB, depending on config) to back the
  guest memory

* Pageable guest memory - i.e. backing pages can be removed from the
  guest and reinstated on demand, using the MMU notifier mechanism.

On PPC970 we have no way to get DSIs and ISIs to come to the
hypervisor, so we can't do MMIO emulation or pageable guest memory.
On POWER7 we set the VPM1 bit in the LPCR to make all DSIs and ISIs
come to the hypervisor (host) as HDSIs or HISIs.

This series is RFC for the moment, although the first 5 or so patches
are pretty solid and could go in.  I am going to rework the later
patches to use HPTEs with V=0 for the absent pages rather than key=31,
which will require handling the HPTE-not-present HDSIs we will get and
differentiating the case where the guest has created a HPTE but the
underlying page is not resident from the case where the guest has
created no HPTE for the address.

Paul.

^ permalink raw reply

* Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree
From: Kumar Gala @ 2011-11-16 21:50 UTC (permalink / raw)
  To: Timur Tabi; +Cc: linuxppc-dev
In-Reply-To: <4EC42F6E.2030500@freescale.com>


On Nov 16, 2011, at 3:47 PM, Timur Tabi wrote:

> wrote:
>> I just noticed this bug in the original p1022ds.dts, and I see you're
>> carrying it over here.  The reg property should look like this:
>> 
>> reg = <0xf 0xffe05000 0 0x1000>;
>>       ^^^
> 
> It looks like there's also a problem with the 'ranges' property:
> 
> 	ranges = <0x0 0x0 0xf 0xe8000000 0x08000000
> 		  0x1 0x0 0xf 0xe0000000 0x08000000
> 		  0x2 0x0 0x0 0xffa00000 0x00040000
> 		          ^^^
> 		  0x3 0x0 0xf 0xffdf0000 0x00008000>;

Gotcha, existing bug but will fix it.

- k

^ permalink raw reply

* Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree
From: Timur Tabi @ 2011-11-16 21:47 UTC (permalink / raw)
  Cc: linuxppc-dev
In-Reply-To: <CAOZdJXU6NXgWv=VwShOvEGH5rm0KJmrQgOCVDiSJtKoVMs5=eg@mail.gmail.com>

 wrote:
> I just noticed this bug in the original p1022ds.dts, and I see you're
> carrying it over here.  The reg property should look like this:
> 
> reg = <0xf 0xffe05000 0 0x1000>;
>        ^^^

It looks like there's also a problem with the 'ranges' property:

	ranges = <0x0 0x0 0xf 0xe8000000 0x08000000
		  0x1 0x0 0xf 0xe0000000 0x08000000
		  0x2 0x0 0x0 0xffa00000 0x00040000
		          ^^^
		  0x3 0x0 0xf 0xffdf0000 0x00008000>;

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: [PATCH v2 3/7] powerpc/85xx: add sleep and deep sleep support
From: Scott Wood @ 2011-11-16 21:42 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: linuxppc-dev
In-Reply-To: <1321437344-19253-3-git-send-email-chenhui.zhao@freescale.com>

On 11/16/2011 03:55 AM, Zhao Chenhui wrote:
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index ce4f7f1..d5cc385 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
>  ifeq ($(CONFIG_PPC32),y)
>  obj-$(CONFIG_E500)		+= idle_e500.o
>  endif
> +obj-$(CONFIG_PPC_85xx)		+= l2cr_85xx.o

Can you restrict this to e500v1/v2?

Also, don't call it "l2cr" -- that's a 6xx register that is not present
on 85xx.

> +	.section .data
> +	.align	5
> +mpc85xx_sleep_save_area:
> +	.space	STATE_SAVE_SIZE
> +ccsrbase_low:
> +	.long	0
> +ccsrbase_high:
> +	.long	0
> +powmgtreq:
> +	.long	POWMGTCSR_DPSLP_MASK
> +
> +	.section .text
> +	.align	12
> +
> +	/*
> +	 * r3 = high word of physical address of CCSR
> +	 * r4 = low word of physical address of CCSR
> +	 */

The whole point of powmgtreq is to store a dynamically-passed-in
value... and it doesn't look like you add it in the jog patch.

-Scott

^ permalink raw reply

* P4080: attempting to use the uio_pdrv to attach FPGA on localbus ...
From: Robert Sciuk @ 2011-11-16 21:19 UTC (permalink / raw)
  To: devicetree-discuss; +Cc: linuxppc-dev
In-Reply-To: <mailman.2323.1320874671.15294.devicetree-discuss@lists.ozlabs.org>

I have succeeded in using the i2c bus with GPIO expander to access the
programming pins of my FPGA devices, but the data port uses the
localbus.  I had initially thought that the uio platform driver would be
the ideal approach to creating a device which would allow configuration
from userland via the /dev/uio[0|1] device interface, but apparently the
device binding is not being accomplished as expected.

	localbus@ffe124000 {
		compatible = "fsl,p4080-elbc", "fsl,elbc", "simple-bus";
		reg = <0xf 0xfe124000 0 0x1000>;
		interrupts = <25 2 0 0>;
		interrupt-parent = <&mpic>;
		#address-cells = <2>;
		#size-cells = <1>;

                /* Local bus region mappings */
                ranges = <0 0 0xf 0xe8000000 0x08000000 	/* CS0: Boot flash */
                          1 0 0xf 0xd0000000 0x8000 		/* CS1: FPGA0 */
                          2 0 0xf 0xd1000000 0x8000 >; 	/* CS2: FPGA1 */

		flash@0,0 {
		  ........
            };

		lim: fpga@1,0 {
			compatible = "uio_pdrv";
			pin-handle=<&lim_ctrl>;
		};

		nitro: fpga@2,0 {
			compatible = "uio_pdrv";
			pin-handle=<&fpe0_ctrl &fpe1_ctrl>;
		};
	};

I have aliases pointing to the localbus nodes lim and nitro, but it
appears that the uio_pdrv driver does not bind to the device based upon
the compatible property of the tree.  I'm hoping to be able to mmap the
localbus port memory (0xf_d000_0000 and 0xf_d100_0000) respectively when
I open /dev/uio[0|1].  Is there additional driver registration needed in
order to use the uio_pdrv driver?  What am I missing??

Any pointers would be appreciated.

Cheers,
Rob

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Moffett, Kyle D @ 2011-11-16 20:52 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: B04825@freescale.com, linux-kernel@vger.kernel.org,
	paul.gortmaker@windriver.com, scottwood@freescale.com,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <20111116044032.GA26476@bloggs.ozlabs.ibm.com>

On Nov 15, 2011, at 23:40, Paul Mackerras wrote:
> On Tue, Nov 15, 2011 at 04:45:18PM -0600, Moffett, Kyle D wrote:
>>
>> I guess that's doable, although I have to admit that idea almost gives
>> me more of a headache than trying to fix up the 32-bit ASM.
>>
>> One thing that bothers me in particular is that both 32/64 versions of
>> __copy_tofrom_user() are dramatically overcomplicated for what they
>> ought to be doing.
>>
>> It would seem that if we get a page fault during an unaligned copy, we
>> ought to just give up and fall back to a simple byte-by-byte copy loop
>> from wherever we left off.  That would eliminate 90% of the ugly
>> special cases without actually hurting performance, right?
>
> That's basically what we do, IIRC, and most of the complexity comes
> from working out where we were up to.  We could probably use a simpler
> approximation that means we might copy some bytes twice.  In fact the
> greatest simplification would probably be to implement range entries
> in the exception table so we can just have one entry for all the loads
> and stores instead of an entry for each individual load and store.

Well, I spent some time tinkering with the GCC inline-assembly option,
which was probably a waste, but I figured I would post my code here for
other people to chuckle at.  :-D

Here's a basic, relatively easily extended "copy u8" macro that sets up
the exception table using "asm goto":

#define try_copy_u8(DST, SRC, LOAD_FAULT, STORE_FAULT) do {	\
	unsigned long try_copy_tmp__ = (try_copy_tmp__);	\
	asm goto (						\
		"1:	lbz %[tmp], %[src]\n"			\
		"2:	stb %[tmp], %[dst]\n"			\
		"	.pushsection __ex_table, \"a\"\n"	\
		"	.align 2\n"				\
		"	.long 1b, %l["#LOAD_FAULT"]\n"		\
		"	.long 2b, %l["#STORE_FAULT"]\n"		\
		"	.popsection\n"				\
		: /* No outputs allowed for "asm goto" */	\
		: [dst] "m"(*(__user u8 *)(DST)),		\
		  [src] "m"(*(const __user u8 *)(SRC)),		\
		  [tmp] "r"(try_copy_tmp__)			\
		: "memory"					\
		: LOAD_FAULT, STORE_FAULT			\
	);							\
} while(0)

If I put that into a function and compile it, the assembly and the
exception table look perfectly OK, even under register pressure.
With a few macros like that it looks like it should be possible to
write the copy function directly in C and get optimal results.

The only other variants you need would be "try_copy_ulong" and
"try_copy_4ulong"/"try_copy_8ulong" for 32/64-bit.

Unfortunately, as I mentioned before, GCC 4.4 and older don't have
"asm goto" support :-(.

Perhaps I could put __copy_tofrom_user() into its own file and make
the assembled 32/64 output files be ".shipped"?

On the other hand, perhaps this is overly complicated :-D.

I'll poke at it more tomorrow.


>> For a page-fault during a cacheline-aligned copy, we should be able to
>> handle the exception and retry from the last cacheline without much
>> logic, again with good performance.
>>
>> With that said, I'm curious about the origin of the PPC32 ASM.  In
>> particular, it looks like it was generated by GCC at some point in the
>> distant past, and I'm wondering if there's a good way to rewrite that
>> file in C and trick GCC into generating the relevant exception tables
>> for it?
>
> Why do you think it was generated by gcc?  I wrote the original
> version, but I think it got extended and macro-ized by others.

Ah, sorry,  when I first looked at it the large collection of numeric
labels and the very sparing comments made it look autogenerated.

Although, given how much of a pain in the neck it is maybe you would
rather people not think you wrote it at all. ;-)

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox