LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 15/15] KVM: PPC: book3s_hv: Add support for PPC970-family processors
From: Paul Mackerras @ 2011-06-18  8:39 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <20110618082705.GA28413@bloggs.ozlabs.ibm.com>

This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode.  Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.

There are several differences between the PPC970 and POWER7 in how
guests are managed.  These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits.  Notably, on PPC970:

* The LPCR, LPID or RMOR registers don't exist, and the functions of
  those registers are provided by bits in HID4 and one bit in HID0.

* External interrupts can be directed to the hypervisor, but unlike
  POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
  SRR0/1 not HSRR0/1.

* There is no virtual RMA (VRMA) mode; the guest must use an RMO
  (real mode offset) area.

* The TLB entries are not tagged with the LPID, so it is necessary to
  flush the whole TLB on partition switch.  Furthermore, when switching
  partitions we have to ensure that no other CPU is executing the tlbie
  or tlbsync instructions in either the old or the new partition,
  otherwise undefined behaviour can occur.

* The PMU has 8 counters (PMC registers) rather than 6.

* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.

* The SLB has 64 entries rather than 32.

* There is no mediated external interrupt facility, so if we switch to
  a guest that has a virtual external interrupt pending but the guest
  has MSR[EE] = 0, we have to arrange to have an interrupt pending for
  it so that we can get control back once it re-enables interrupts.  We
  do that by sending ourselves an IPI with smp_send_reschedule after
  hard-disabling interrupts.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/exception-64s.h  |    4 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |    2 +-
 arch/powerpc/include/asm/kvm_host.h       |    2 +-
 arch/powerpc/kernel/asm-offsets.c         |    1 +
 arch/powerpc/kernel/exceptions-64s.S      |    2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       |   36 ++++--
 arch/powerpc/kvm/book3s_hv.c              |   65 +++++++--
 arch/powerpc/kvm/book3s_hv_interrupts.S   |   57 +++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  216 ++++++++++++++++++++++++++++-
 arch/powerpc/kvm/powerpc.c                |    3 +
 arch/powerpc/mm/hash_native_64.c          |    2 +-
 11 files changed, 353 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 69435da..8057f4f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -246,6 +246,10 @@ label##_hv:						\
 	KVMTEST(vec);							\
 	_SOFTEN_TEST(EXC_HV)
 
+#define SOFTEN_TEST_HV_201(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_STD)
+
 #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
 	HMT_MEDIUM;							\
 	SET_SCRATCH0(r13);    /* save r13 */				\
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 3ac82f1..7b37a5b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -81,7 +81,7 @@ struct kvmppc_host_state {
 	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
-	u32 host_pmc[6];
+	u32 host_pmc[8];
 	u64 host_purr;
 	u64 host_spurr;
 	u64 host_dscr;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f572d9c..cc22b28 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -353,7 +353,7 @@ struct kvm_vcpu_arch {
 	u32 dbsr;
 
 	u64 mmcr[3];
-	u32 pmc[6];
+	u32 pmc[8];
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6832a06..0062875 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -128,6 +128,7 @@ int main(void)
 	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+	DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
 	DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
 	DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
 	DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e654165..8301bfd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -171,7 +171,7 @@ hardware_interrupt_hv:
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST_PR)
+					    EXC_STD, SOFTEN_TEST_HV_201)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 3f773b3..2cdc202 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -42,6 +42,8 @@
 #define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
 
+/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
+#define MAX_LPID_970	63
 #define NR_LPIDS	(LPID_RSVD + 1)
 unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
 
@@ -69,9 +71,6 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
 	kvm->arch.lpid = lpid;
-	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
-	kvm->arch.host_lpid = mfspr(SPRN_LPID);
-	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
@@ -157,7 +156,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
 	if (pteh & HPTE_V_LARGE) {
-		if ((ptel & 0xf000) == 0x1000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (ptel & 0xf000) == 0x1000) {
 			/* 64k page */
 			porder = 16;
 		} else if ((ptel & 0xff000) == 0) {
@@ -227,7 +227,8 @@ static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	va_low &= 0x7ff;
 	if (v & HPTE_V_LARGE) {
 		rb |= 1;			/* L field */
-		if (r & 0xff000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (r & 0xff000)) {
 			/* non-16MB large page, must be 64k */
 			/* (masks depend on page size) */
 			rb |= 0x1000;		/* page encoding in LP field */
@@ -470,12 +471,24 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 
 int kvmppc_mmu_hv_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_206))
+	unsigned long host_lpid, rsvd_lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return -EINVAL;
+
 	memset(lpid_inuse, 0, sizeof(lpid_inuse));
-	set_bit(mfspr(SPRN_LPID), lpid_inuse);
-	set_bit(LPID_RSVD, lpid_inuse);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
+		rsvd_lpid = LPID_RSVD;
+	} else {
+		host_lpid = 0;			/* PPC970 */
+		rsvd_lpid = MAX_LPID_970;
+	}
+
+	set_bit(host_lpid, lpid_inuse);
+	/* rsvd_lpid is reserved for use in partition switching */
+	set_bit(rsvd_lpid, lpid_inuse);
 
 	return 0;
 }
@@ -499,7 +512,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
 
-	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+	if (cpu_has_feature(CPU_FTR_ARCH_206))
+		vcpu->arch.slb_nr = 32;		/* POWER7 */
+	else
+		vcpu->arch.slb_nr = 64;
 
 	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ebc097d..1ae1693 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -444,8 +444,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvmppc_core_check_processor_compat(void)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE) &&
-	    cpu_has_feature(CPU_FTR_ARCH_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE))
 		return 0;
 	return -EIO;
 }
@@ -739,6 +738,10 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
+	/* On PPC970, check that we have an RMA region */
+	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+		return -EPERM;
+
 	kvm_run->exit_reason = 0;
 	vcpu->arch.ret = RESUME_GUEST;
 	vcpu->arch.trap = 0;
@@ -965,12 +968,14 @@ static LIST_HEAD(free_rmas);
 static DEFINE_SPINLOCK(rma_lock);
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static int lpcr_rmls(unsigned long rma_size)
 {
 	switch (rma_size) {
 	case 32ul << 20:	/* 32 MB */
-		return 8;
+		if (cpu_has_feature(CPU_FTR_ARCH_206))
+			return 8;	/* only supported on POWER7 */
+		return -1;
 	case 64ul << 20:	/* 64 MB */
 		return 3;
 	case 128ul << 20:	/* 128 MB */
@@ -1000,8 +1005,9 @@ void kvm_rma_init(void)
 	void *rma;
 	struct page *pg;
 
-	/* Only do this in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
+	/* Only do this on PPC970 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_201))
 		return;
 
 	if (!kvm_rma_size || !kvm_rma_count)
@@ -1166,6 +1172,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		    mem->userspace_addr == vma->vm_start)
 			ri = vma->vm_file->private_data;
 		up_read(&current->mm->mmap_sem);
+		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("CPU requires an RMO\n");
+			return -EINVAL;
+		}
 	}
 
 	if (ri) {
@@ -1184,10 +1194,25 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
 		kvm->arch.n_rma_pages = rma_size >> porder;
-		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
-		lpcr |= rmls << LPCR_RMLS_SH;
+
+		/* Update LPCR and RMOR */
+		lpcr = kvm->arch.lpcr;
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			/* PPC970; insert RMLS value (split field) in HID4 */
+			lpcr &= ~((1ul << HID4_RMLS0_SH) |
+				  (3ul << HID4_RMLS2_SH));
+			lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+				((rmls & 3) << HID4_RMLS2_SH);
+			/* RMOR is also in HID4 */
+			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+				<< HID4_RMOR_SH;
+		} else {
+			/* POWER7 */
+			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+			lpcr |= rmls << LPCR_RMLS_SH;
+			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		}
 		kvm->arch.lpcr = lpcr;
-		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
 		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
@@ -1258,11 +1283,25 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	kvm->arch.rma = NULL;
 	kvm->arch.n_rma_pages = 0;
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
-		LPCR_VPM0 | LPCR_VRMA_L;
-	kvm->arch.lpcr = lpcr;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
+	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+		/* PPC970; HID4 is effectively the LPCR */
+		unsigned long lpid = kvm->arch.lpid;
+		kvm->arch.host_lpid = 0;
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+			((lpid & 0xf) << HID4_LPID5_SH);
+	} else {
+		/* POWER7; init LPCR for virtual RMA mode */
+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+		lpcr &= LPCR_PECE | LPCR_LPES;
+		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+			LPCR_VPM0 | LPCR_VRMA_L;
+	}
+	kvm->arch.lpcr = lpcr;
 
 	return 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 5c6a387..527ea6e 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -77,17 +77,27 @@ _GLOBAL(__kvmppc_vcore_entry)
 	mfspr	r7, SPRN_PMC4
 	mfspr	r8, SPRN_PMC5
 	mfspr	r9, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, HSTATE_PMC(r13)
 	stw	r5, HSTATE_PMC + 4(r13)
 	stw	r6, HSTATE_PMC + 8(r13)
 	stw	r7, HSTATE_PMC + 12(r13)
 	stw	r8, HSTATE_PMC + 16(r13)
 	stw	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	stw	r10, HSTATE_PMC + 24(r13)
+	stw	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 31:
 
 	/* Save host DSCR */
+BEGIN_FTR_SECTION
 	mfspr	r3, SPRN_DSCR
 	std	r3, HSTATE_DSCR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save host DABR */
 	mfspr	r3, SPRN_DABR
@@ -106,6 +116,25 @@ _GLOBAL(__kvmppc_vcore_entry)
 	add	r8,r8,r7
 	std	r8,HSTATE_DECEXP(r13)
 
+	/*
+	 * On PPC970, if the guest vcpu has an external interrupt pending,
+	 * send ourselves an IPI so as to interrupt the guest once it
+	 * enables interrupts.  (It must have interrupts disabled,
+	 * otherwise we would already have delivered the interrupt.)
+	 */
+BEGIN_FTR_SECTION
+	ld	r0, VCPU_PENDING_EXC(r4)
+	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0, r0, r7
+	beq	32f
+	mr	r31, r4
+	lhz	r3, PACAPACAINDEX(r13)
+	bl	smp_send_reschedule
+	mr	r4, r31
+32:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+
 	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
 	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
 
@@ -153,12 +182,20 @@ kvmppc_handler_highmem:
 	lwz	r6, HSTATE_PMC + 12(r13)
 	lwz	r8, HSTATE_PMC + 16(r13)
 	lwz	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	lwz	r10, HSTATE_PMC + 24(r13)
+	lwz	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r4
 	mtspr	SPRN_PMC3, r5
 	mtspr	SPRN_PMC4, r6
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, HSTATE_MMCR(r13)
 	ld	r4, HSTATE_MMCR + 8(r13)
 	ld	r5, HSTATE_MMCR + 16(r13)
@@ -221,11 +258,14 @@ _GLOBAL(kvmppc_save_fp)
 	mfmsr	r9
 	ori	r8,r9,MSR_FP
 #ifdef CONFIG_ALTIVEC
-#ifdef CONFIG_VSX
-	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
-#else
+BEGIN_FTR_SECTION
 	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
 	mtmsrd	r8
 	isync
@@ -278,11 +318,14 @@ kvmppc_load_fp:
 	mfmsr	r9
 	ori	r8,r9,MSR_FP
 #ifdef CONFIG_ALTIVEC
-#ifdef CONFIG_VSX
-	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
-#else
+BEGIN_FTR_SECTION
 	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
 	mtmsrd	r8
 	isync
@@ -308,10 +351,10 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
 #endif
 
 #ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
 	li	r7,VCPU_VSCR
 	lvx	vr0,r7,r4
 	mtvscr	vr0
-BEGIN_FTR_SECTION
 	reg = 0
 	.rept	32
 	li	r7,reg*16+VCPU_VRS
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 697ef88..c3121cc 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -64,8 +64,10 @@ kvmppc_skip_Hinterrupt:
  */
 .global kvmppc_handler_lowmem_trampoline
 kvmppc_handler_lowmem_trampoline:
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	beq	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 	mtsrr0	r3
 	mtsrr1	r4
 	blr
@@ -178,12 +180,20 @@ kvmppc_handler_trampoline_enter:
 	lwz	r7, VCPU_PMC + 12(r4)
 	lwz	r8, VCPU_PMC + 16(r4)
 	lwz	r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+	lwz	r10, VCPU_PMC + 24(r4)
+	lwz	r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r5
 	mtspr	SPRN_PMC3, r6
 	mtspr	SPRN_PMC4, r7
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, VCPU_MMCR(r4)
 	ld	r5, VCPU_MMCR + 8(r4)
 	ld	r6, VCPU_MMCR + 16(r4)
@@ -195,9 +205,11 @@ kvmppc_handler_trampoline_enter:
 	/* Load up FP, VMX and VSX registers */
 	bl	kvmppc_load_fp
 
+BEGIN_FTR_SECTION
 	/* Switch DSCR to guest value */
 	ld	r5, VCPU_DSCR(r4)
 	mtspr	SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/*
 	 * Set the decrementer to the guest decrementer.
@@ -240,6 +252,7 @@ kvmppc_handler_trampoline_enter:
 	mtspr	SPRN_DABRX,r5
 	mtspr	SPRN_DABR,r6
 
+BEGIN_FTR_SECTION
 	/* Restore AMR and UAMOR, set AMOR to all 1s */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
@@ -247,6 +260,7 @@ kvmppc_handler_trampoline_enter:
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
 	mtspr	SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Clear out SLB */
 	li	r6,0
@@ -254,6 +268,14 @@ kvmppc_handler_trampoline_enter:
 	slbia
 	ptesync
 
+BEGIN_FTR_SECTION
+	b	30f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 host -> guest partition switch code.
+	 * We don't have to lock against concurrent tlbies,
+	 * but we do have to coordinate across hardware threads.
+	 */
 	/* Increment entry count iff exit count is zero. */
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	addi	r9,r5,VCORE_ENTRY_EXIT
@@ -345,9 +367,94 @@ kvmppc_handler_trampoline_enter:
 	ld	r8,VCPU_SPURR(r4)
 	mtspr	SPRN_PURR,r7
 	mtspr	SPRN_SPURR,r8
+	b	31f
+
+	/*
+	 * PPC970 host -> guest partition switch code.
+	 * We have to lock against concurrent tlbies,
+	 * using native_tlbie_lock to lock against host tlbies
+	 * and kvm->arch.tlbie_lock to lock against guest tlbies.
+	 * We also have to invalidate the TLB since its
+	 * entries aren't tagged with the LPID.
+	 */
+30:	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+
+	/* first take native_tlbie_lock */
+	.section ".toc","aw"
+toc_tlbie_lock:
+	.tc	native_tlbie_lock[TC],native_tlbie_lock
+	.previous
+	ld	r3,toc_tlbie_lock@toc(2)
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_LPCR(r9)		/* use kvm->arch.lpcr to store HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* Take the guest's tlbie_lock */
+	addi	r3,r9,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+	ld	r6,KVM_SDR1(r9)
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+
+	/* Set up HID4 with the guest's LPID etc. */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+
+	/* drop the guest's tlbie_lock */
+	li	r0,0
+	stw	r0,0(r3)
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/* Enable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,1
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* Load up guest SLB entries */
-	lwz	r5,VCPU_SLB_MAX(r4)
+31:	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
 	mtctr	r5
@@ -504,6 +611,7 @@ kvmppc_interrupt:
 hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
 	ld	r5,VCPU_KVM(r9)
@@ -513,6 +621,7 @@ hcall_real_cont:
 	andi.	r0,r5,LPCR_MER
 	bne	bounce_ext_interrupt
 1:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
@@ -524,9 +633,11 @@ hcall_real_cont:
 	/* Save HEIR (HV emulation assist reg) in last_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,-1
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
 	bne	11f
 	mfspr	r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 11:	stw	r3,VCPU_LAST_INST(r9)
 
 	/* Save more register state  */
@@ -540,8 +651,10 @@ hcall_real_cont:
 	stw	r7, VCPU_DSISR(r9)
 	std	r8, VCPU_CTR(r9)
 	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 7:	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -575,6 +688,7 @@ hcall_real_cont:
 	/*
 	 * Save the guest PURR/SPURR
 	 */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
 	ld	r7,VCPU_PURR(r9)
@@ -594,6 +708,7 @@ hcall_real_cont:
 	add	r4,r4,r6
 	mtspr	SPRN_PURR,r3
 	mtspr	SPRN_SPURR,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 
 	/* Clear out SLB */
 	li	r5,0
@@ -602,6 +717,14 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
+BEGIN_FTR_SECTION
+	b	32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 guest -> host partition switch code.
+	 * We don't have to lock against tlbies but we do
+	 * have to coordinate the hardware threads.
+	 */
 	/* Increment the threads-exiting-guest count in the 0xff00
 	   bits of vcore->entry_exit_count */
 	lwsync
@@ -672,9 +795,82 @@ hdec_soon:
 16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
+	b	33f
+
+	/*
+	 * PPC970 guest -> host partition switch code.
+	 * We have to lock against concurrent tlbies, and
+	 * we have to flush the whole TLB.
+	 */
+32:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+
+	/* Take the guest's tlbie_lock */
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+	addi	r3,r4,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_HOST_LPCR(r4)	/* use kvm->arch.host_lpcr for HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop guest tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* take native_tlbie_lock */
+	ld	r3,toc_tlbie_lock@toc(2)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r6,KVM_HOST_SDR1(r4)
+	mtspr	SPRN_SDR1,r6		/* switch to host page table */
+
+	/* Set up host HID4 value */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	/* Disable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,0
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* load host SLB entries */
-	ld	r8,PACA_SLBSHADOWPTR(r13)
+33:	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
 	ld	r5,SLBSHADOW_SAVEAREA(r8)
@@ -686,12 +882,14 @@ hdec_soon:
 	.endr
 
 	/* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_AMR
 	mfspr	r6,SPRN_UAMOR
 	std	r5,VCPU_AMR(r9)
 	std	r6,VCPU_UAMOR(r9)
 	li	r6,0
 	mtspr	SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Restore host DABR and DABRX */
 	ld	r5,HSTATE_DABR(r13)
@@ -700,10 +898,12 @@ hdec_soon:
 	mtspr	SPRN_DABRX,r6
 
 	/* Switch DSCR back to host value */
+BEGIN_FTR_SECTION
 	mfspr	r8, SPRN_DSCR
 	ld	r7, HSTATE_DSCR(r13)
 	std	r8, VCPU_DSCR(r7)
 	mtspr	SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save non-volatile GPRs */
 	std	r14, VCPU_GPR(r14)(r9)
@@ -767,21 +967,31 @@ hdec_soon:
 	mfspr	r6, SPRN_PMC4
 	mfspr	r7, SPRN_PMC5
 	mfspr	r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, VCPU_PMC(r9)
 	stw	r4, VCPU_PMC + 4(r9)
 	stw	r5, VCPU_PMC + 8(r9)
 	stw	r6, VCPU_PMC + 12(r9)
 	stw	r7, VCPU_PMC + 16(r9)
 	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	stw	r10, VCPU_PMC + 24(r9)
+	stw	r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 22:
 	/* save FP state */
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
-	/* Secondary threads go off to take a nap */
+	/* Secondary threads go off to take a nap on POWER7 */
+BEGIN_FTR_SECTION
 	lwz	r0,VCPU_PTID(r3)
 	cmpwi	r0,0
 	bne	secondary_nap
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* RFI into the highmem handler */
 	mfmsr	r7
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72c5065..a107c9b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -213,6 +213,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		break;
 	case KVM_CAP_PPC_RMA:
 		r = 1;
+		/* PPC970 requires an RMA */
+		if (cpu_has_feature(CPU_FTR_ARCH_201))
+			r = 2;
 		break;
 #endif
 	default:
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index b44f5f8..90039bc 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
 
 #define HPTE_LOCK_BIT 3
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 static inline void __tlbie(unsigned long va, int psize, int ssize)
 {
-- 
1.7.5.4

^ permalink raw reply related

* Re: [PATCH 0/15] Hypervisor-mode KVM on POWER7 and PPC970
From: Alexander Graf @ 2011-06-18 14:34 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm-ppc, kvm
In-Reply-To: <20110618082705.GA28413@bloggs.ozlabs.ibm.com>


On 18.06.2011, at 10:27, Paul Mackerras wrote:

> The following series of patches enable KVM to exploit the hardware
> hypervisor mode on 64-bit Power ISA Book3S machines.  At present,
> POWER7 and PPC970 processors are supported.  (Note that the PPC970
> processors in Apple G5 machines don't have a usable hypervisor mode
> and are not supported by these patches.)
>=20
> Running the KVM host in hypervisor mode means that the guest can use
> both supervisor mode and user mode.  That means that the guest can
> execute supervisor-privilege instructions and access supervisor-
> privilege registers.  In addition the hardware directs most exceptions
> to the guest.  Thus we don't need to emulate any instructions in the
> host.  Generally, the only times we need to exit the guest are when it
> does a hypercall or when an external interrupt or host timer
> (decrementer) interrupt occurs.
>=20
> The focus of this KVM implementation is to run guests that use the
> PAPR (Power Architecture Platform Requirements) paravirtualization
> interface, which is the interface supplied by PowerVM on IBM pSeries
> machines.  Currently the "pseries" machine type in qemu is only
> supported by book3s_hv KVM, and book3s_hv KVM only supports the
> "pseries" machine type.  That will hopefully change in future.
>=20
> These patches are against Alex Graf's kvm-ppc-next branch.

I just applied them on top of kvm-ppc-next and started compiling for =
book3s_64_pr=3DM:

arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
arch/powerpc/kernel/exceptions-64s.S:314: Error: undefined symbol =
`PACA_KVM_SVCPU' in operation
arch/powerpc/kernel/exceptions-64s.S:314: Error: undefined symbol =
`SVCPU_IN_GUEST' in operation
make[1]: *** [arch/powerpc/kernel/head_64.o] Error 1
make: *** [arch/powerpc/kernel] Error 2


Alex

^ permalink raw reply

* [PATCH 1/2][v2] powerpc: Move free_initmem to common code
From: Dave Carroll @ 2011-06-18 17:36 UTC (permalink / raw)
  To: Milton Miller, Benjamin Herrenschmidt; +Cc: Dave Carroll, LPPC, LKML
In-Reply-To: <1308350584.32158.58.camel@pasglop>

From: Dave Carroll <dcarroll@astekcorp.com>

The free_initmem function is basically duplicated in mm/init_32,
and init_64, and is moved to the common 32/64-bit mm/mem.c.

All other sections except init were removed in v2.6.15 by
6c45ab992e4299c869fb26427944a8f8ea177024 (powerpc: Remove section
free() and linker script bits), and therefore the bulk of the executed
code is identical.

This patch also removes updating ppc_md.progress to NULL in the powermac
late_initcall.

Suggested-by: Milton Miller <miltonm@bga.com>
Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Dave Carroll <dcarroll@astekcorp.com>
---
 [v2] This is a rebase of the original patch to Linus' current tree
 
 arch/powerpc/mm/init_32.c               |   32 -------------------------------
 arch/powerpc/mm/init_64.c               |   16 ---------------
 arch/powerpc/mm/mem.c                   |   19 ++++++++++++++++++
 arch/powerpc/platforms/powermac/setup.c |    3 --
 4 files changed, 19 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 5de0f25..c77fef5 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -191,38 +191,6 @@ void __init *early_get_page(void)
 		return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
 }
 
-/* Free up now-unused memory */
-static void free_sec(unsigned long start, unsigned long end, const char *name)
-{
-	unsigned long cnt = 0;
-
-	while (start < end) {
-		ClearPageReserved(virt_to_page(start));
-		init_page_count(virt_to_page(start));
-		free_page(start);
-		cnt++;
-		start += PAGE_SIZE;
- 	}
-	if (cnt) {
-		printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name);
-		totalram_pages += cnt;
-	}
-}
-
-void free_initmem(void)
-{
-#define FREESEC(TYPE) \
-	free_sec((unsigned long)(&__ ## TYPE ## _begin), \
-		 (unsigned long)(&__ ## TYPE ## _end), \
-		 #TYPE);
-
-	printk ("Freeing unused kernel memory:");
-	FREESEC(init);
- 	printk("\n");
-	ppc_md.progress = NULL;
-#undef FREESEC
-}
-
 #ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				phys_addr_t first_memblock_size)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index f6dbb4c..e94b57f 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -83,22 +83,6 @@ EXPORT_SYMBOL_GPL(memstart_addr);
 phys_addr_t kernstart_addr;
 EXPORT_SYMBOL_GPL(kernstart_addr);
 
-void free_initmem(void)
-{
-	unsigned long addr;
-
-	addr = (unsigned long)__init_begin;
-	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		free_page(addr);
-		totalram_pages++;
-	}
-	printk ("Freeing unused kernel memory: %luk freed\n",
-		((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
-}
-
 static void pgd_ctor(void *addr)
 {
 	memset(addr, 0, PGD_TABLE_SIZE);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 29d4dde..f4e6408 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -382,6 +382,25 @@ void __init mem_init(void)
 	mem_init_done = 1;
 }
 
+void free_initmem(void)
+{
+	unsigned long addr;
+
+	ppc_md.progress = NULL;
+
+	addr = (unsigned long)__init_begin;
+	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+		ClearPageReserved(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
+		free_page(addr);
+		totalram_pages++;
+	}
+	pr_info("Freeing unused kernel memory: %luk freed\n",
+		((unsigned long)__init_end -
+		(unsigned long)__init_begin) >> 10);
+}
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index aa45281..a028f08 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -355,9 +355,6 @@ static int initializing = 1;
 static int pmac_late_init(void)
 {
 	initializing = 0;
-	/* this is udbg (which is __init) and we can later use it during
-	 * cpu hotplug (in smp_core99_kick_cpu) */
-	ppc_md.progress = NULL;
 	return 0;
 }
 machine_late_initcall(powermac, pmac_late_init);
-- 
1.7.5

^ permalink raw reply related

* [PATCH 2/2][v2] powerpc: Add printk companion for ppc_md.progress
From: Dave Carroll @ 2011-06-18 17:36 UTC (permalink / raw)
  To: Milton Miller, Benjamin Herrenschmidt; +Cc: Dave Carroll, LPPC, LKML
In-Reply-To: <1308418600-9167-1-git-send-email-dcarroll@astekcorp.com>

From: Dave Carroll <dcarroll@astekcorp.com>

This patch adds a printk companion to replace the udbg progress function
when initmem is freed.

Suggested-by: Milton Miller <miltonm@bga.com>
Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Dave Carroll <dcarroll@astekcorp.com>
---
 [v2] This is a rebase of the original patch to Linus' current tree
 
 arch/powerpc/include/asm/setup.h   |    2 ++
 arch/powerpc/kernel/setup-common.c |    5 +++++
 arch/powerpc/mm/mem.c              |    2 +-
 3 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index dae1934..c77cb7a 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -3,4 +3,6 @@
 
 #include <asm-generic/setup.h>
 
+extern void ppc_printk_progress(char *s, unsigned short hex);
+
 #endif	/* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 79fca26..e053b16 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -704,6 +704,11 @@ static int powerpc_debugfs_init(void)
 arch_initcall(powerpc_debugfs_init);
 #endif
 
+void ppc_printk_progress(char *s, unsigned short hex)
+{
+	pr_info("%s\n", s);
+}
+
 static int ppc_dflt_bus_notify(struct notifier_block *nb,
 				unsigned long action, void *data)
 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index f4e6408..e0d2aa0 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -386,7 +386,7 @@ void free_initmem(void)
 {
 	unsigned long addr;
 
-	ppc_md.progress = NULL;
+	ppc_md.progress = ppc_printk_progress;
 
 	addr = (unsigned long)__init_begin;
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-- 
1.7.5

^ permalink raw reply related

* RE: NAND BBT corruption on MPC83xx
From: Mike Hench @ 2011-06-18 17:55 UTC (permalink / raw)
  To: Scott Wood, Matthew L. Creech; +Cc: linuxppc-dev, linux-mtd
In-Reply-To: <20110617163442.204348a0@schlenkerla.am.freescale.net>

Scott Wood wrote:
> As for the corruption, could it be degradation from repeated reads of
that
> one page?

Read Disturb. I Did not know SLC did that.
It just takes 10x as long as MLC, on the order of a million reads.
Supposedly erasing the block fixes it.
It is not a permanent damage thing.
I was seeing ~9 hours before failure with heavy writes.
~4GByte/hour =3D 2M pages, total ~18 million reads before errors in that
last block showed up.

Cool. Now we know.
Thanks.

Mike Hench

^ permalink raw reply

* Re: [PATCH 0/15] Hypervisor-mode KVM on POWER7 and PPC970
From: Paul Mackerras @ 2011-06-19  1:01 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm-ppc, kvm
In-Reply-To: <FF184B48-CFBB-42E9-9CAE-897A56172F5E@suse.de>

On Sat, Jun 18, 2011 at 04:34:14PM +0200, Alexander Graf wrote:

> I just applied them on top of kvm-ppc-next and started compiling for book3s_64_pr=M:
> 
> arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
> arch/powerpc/kernel/exceptions-64s.S:314: Error: undefined symbol `PACA_KVM_SVCPU' in operation
> arch/powerpc/kernel/exceptions-64s.S:314: Error: undefined symbol `SVCPU_IN_GUEST' in operation
> make[1]: *** [arch/powerpc/kernel/head_64.o] Error 1
> make: *** [arch/powerpc/kernel] Error 2

Ah, good catch, you must be compiling with CONFIG_POWER4_ONLY=n.
I did test compiling for book3s_pr but I had CONFIG_POWER4_ONLY=y.
I'll post revised versions of patches 8/15 and 9/15.

Paul.

^ permalink raw reply

* [PATCH 08/15 revised] KVM: PPC: Split host-state fields out of kvmppc_book3s_shadow_vcpu
From: Paul Mackerras @ 2011-06-19  1:03 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <20110618083524.GI28413@bloggs.ozlabs.ibm.com>

There are several fields in struct kvmppc_book3s_shadow_vcpu that
temporarily store bits of host state while a guest is running,
rather than anything relating to the particular guest or vcpu.
This splits them out into a new kvmppc_host_state structure and
modifies the definitions in asm-offsets.c to suit.

On 32-bit, we have a kvmppc_host_state structure inside the
kvmppc_book3s_shadow_vcpu since the assembly code needs to be able
to get to them both with one pointer.  On 64-bit they are separate
fields in the PACA.  This means that on 64-bit we don't need to
copy the kvmppc_host_state in and out on vcpu load/unload, and
in future will mean that the book3s_hv code doesn't need a
shadow_vcpu struct in the PACA at all.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
Fixed compilation with CONFIG_POWER4_ONLY=n

 arch/powerpc/include/asm/exception-64s.h  |   10 ++--
 arch/powerpc/include/asm/kvm_book3s_asm.h |   27 ++++++---
 arch/powerpc/include/asm/paca.h           |    1 +
 arch/powerpc/kernel/asm-offsets.c         |   94 ++++++++++++++--------------
 arch/powerpc/kernel/exceptions-64s.S      |    2 +-
 arch/powerpc/kvm/book3s_interrupts.S      |   17 +++---
 arch/powerpc/kvm/book3s_rmhandlers.S      |   18 +++---
 arch/powerpc/kvm/book3s_segment.S         |   72 +++++++++++-----------
 8 files changed, 125 insertions(+), 116 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index b6a3a44..296c9b6 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -96,16 +96,16 @@
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
 #define __KVMTEST(n)							\
-	lbz	r10,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13);			\
+	lbz	r10,HSTATE_IN_GUEST(r13);			\
 	cmpwi	r10,0;							\
 	bne	do_kvm_##n
 
 #define __KVM_HANDLER(area, h, n)					\
 do_kvm_##n:								\
 	ld	r10,area+EX_R10(r13);					\
-	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	stw	r9,HSTATE_SCRATCH1(r13);			\
 	ld	r9,area+EX_R9(r13);					\
-	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	std	r12,HSTATE_SCRATCH0(r13);			\
 	li	r12,n;							\
 	b	kvmppc_interrupt
 
@@ -114,9 +114,9 @@ do_kvm_##n:								\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
 	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	stw	r9,HSTATE_SCRATCH1(r13);			\
 	ld	r9,area+EX_R9(r13);					\
-	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	std	r12,HSTATE_SCRATCH0(r13);			\
 	li	r12,n;							\
 	b	kvmppc_interrupt;					\
 89:	mtocrf	0x80,r9;						\
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d5a8a38..3126175 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -60,6 +60,22 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+/*
+ * This struct goes in the PACA on 64-bit processors.  It is used
+ * to store host state that needs to be saved when we enter a guest
+ * and restored when we exit, but isn't specific to any particular
+ * guest or vcpu.  It also has some scratch fields used by the guest
+ * exit code.
+ */
+struct kvmppc_host_state {
+	ulong host_r1;
+	ulong host_r2;
+	ulong vmhandler;
+	ulong scratch0;
+	ulong scratch1;
+	u8 in_guest;
+};
+
 struct kvmppc_book3s_shadow_vcpu {
 	ulong gpr[14];
 	u32 cr;
@@ -73,17 +89,12 @@ struct kvmppc_book3s_shadow_vcpu {
 	ulong shadow_srr1;
 	ulong fault_dar;
 
-	ulong host_r1;
-	ulong host_r2;
-	ulong handler;
-	ulong scratch0;
-	ulong scratch1;
-	ulong vmhandler;
-	u8 in_guest;
-
 #ifdef CONFIG_PPC_BOOK3S_32
 	u32     sr[16];			/* Guest SRs */
+
+	struct kvmppc_host_state hstate;
 #endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 	u8 slb_max;			/* highest used guest slb entry */
 	struct  {
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7412676..58f4a18 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -149,6 +149,7 @@ struct paca_struct {
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+	struct kvmppc_host_state kvm_hstate;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index faf8461..dabfb73 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -198,11 +198,6 @@ int main(void)
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
 	DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
-	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
-	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
-#endif
 #endif /* CONFIG_PPC64 */
 
 	/* RTAS */
@@ -416,49 +411,54 @@ int main(void)
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
-	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
-			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
-	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
-	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
-	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
-	DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
-	DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
-	DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
-	DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
-	DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
-	DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
-	DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
-	DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
-	DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
-	DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
-	DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
-	DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
-	DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
-	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
-	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
-	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
-	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
-	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
-	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 vmhandler));
-	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch0));
-	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch1));
-	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					in_guest));
-	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					   fault_dsisr));
-	DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 fault_dar));
-	DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 last_inst));
-	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					   shadow_srr1));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else	/* 32-bit */
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
+#endif
+
+	SVCPU_FIELD(SVCPU_CR, cr);
+	SVCPU_FIELD(SVCPU_XER, xer);
+	SVCPU_FIELD(SVCPU_CTR, ctr);
+	SVCPU_FIELD(SVCPU_LR, lr);
+	SVCPU_FIELD(SVCPU_PC, pc);
+	SVCPU_FIELD(SVCPU_R0, gpr[0]);
+	SVCPU_FIELD(SVCPU_R1, gpr[1]);
+	SVCPU_FIELD(SVCPU_R2, gpr[2]);
+	SVCPU_FIELD(SVCPU_R3, gpr[3]);
+	SVCPU_FIELD(SVCPU_R4, gpr[4]);
+	SVCPU_FIELD(SVCPU_R5, gpr[5]);
+	SVCPU_FIELD(SVCPU_R6, gpr[6]);
+	SVCPU_FIELD(SVCPU_R7, gpr[7]);
+	SVCPU_FIELD(SVCPU_R8, gpr[8]);
+	SVCPU_FIELD(SVCPU_R9, gpr[9]);
+	SVCPU_FIELD(SVCPU_R10, gpr[10]);
+	SVCPU_FIELD(SVCPU_R11, gpr[11]);
+	SVCPU_FIELD(SVCPU_R12, gpr[12]);
+	SVCPU_FIELD(SVCPU_R13, gpr[13]);
+	SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+	SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+	SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+	SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
 #ifdef CONFIG_PPC_BOOK3S_32
-	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+	SVCPU_FIELD(SVCPU_SR, sr);
 #endif
-#else
+#ifdef CONFIG_PPC64
+	SVCPU_FIELD(SVCPU_SLB, slb);
+	SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+#endif
+
+	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+	HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+
+#else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -468,7 +468,7 @@ int main(void)
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
-#endif
+#endif /* CONFIG_KVM */
 
 #ifdef CONFIG_KVM_GUEST
 	DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e76472c..6da0055 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -298,7 +298,7 @@ data_access_check_stab:
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
+	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
 	mfcr	r9
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 8c5e0e1..91c6de6 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -29,8 +29,7 @@
 #define ULONG_SIZE 		8
 #define FUNC(name) 		GLUE(.,name)
 
-#define GET_SHADOW_VCPU(reg)    \
-        addi    reg, r13, PACA_KVM_SVCPU
+#define GET_SHADOW_VCPU_R13
 
 #define DISABLE_INTERRUPTS	\
 	mfmsr   r0;		\
@@ -43,8 +42,8 @@
 #define ULONG_SIZE              4
 #define FUNC(name)		name
 
-#define GET_SHADOW_VCPU(reg)    \
-        lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+#define GET_SHADOW_VCPU_R13	\
+	lwz	r13, (THREAD + THREAD_KVM_SVCPU)(r2)
 
 #define DISABLE_INTERRUPTS	\
 	mfmsr   r0;		\
@@ -107,15 +106,15 @@ kvm_start_entry:
 	/* Load non-volatile guest state from the vcpu */
 	VCPU_LOAD_NVGPRS(r4)
 
-	GET_SHADOW_VCPU(r5)
+	GET_SHADOW_VCPU_R13
 
-	/* Save R1/R2 in the PACA */
-	PPC_STL	r1, SVCPU_HOST_R1(r5)
-	PPC_STL	r2, SVCPU_HOST_R2(r5)
+	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
+	PPC_STL	r1, HSTATE_HOST_R1(r13)
+	PPC_STL	r2, HSTATE_HOST_R2(r13)
 
 	/* XXX swap in/out on load? */
 	PPC_LL	r3, VCPU_HIGHMEM_HANDLER(r4)
-	PPC_STL	r3, SVCPU_VMHANDLER(r5)
+	PPC_STL	r3, HSTATE_VMHANDLER(r13)
 
 kvm_start_lightweight:
 
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index dd03689..c1f877c 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -36,7 +36,6 @@
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define LOAD_SHADOW_VCPU(reg)	GET_PACA(reg)					
-#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
 #define MSR_NOIRQ		MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name) 		GLUE(.,name)
 
@@ -66,7 +65,6 @@ kvmppc_skip_Hinterrupt:
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
-#define SHADOW_VCPU_OFF		0
 #define MSR_NOIRQ		MSR_KERNEL
 #define FUNC(name)		name
 
@@ -96,14 +94,14 @@ kvmppc_trampoline_\intno:
 	b	kvmppc_resume_\intno		/* Get back original handler */
 
 1:	tophys(r13, r13)
-	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	stw	r12, HSTATE_SCRATCH1(r13)
 	mfspr	r12, SPRN_SPRG_SCRATCH1
-	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	lbz	r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stw	r12, HSTATE_SCRATCH0(r13)
+	lbz	r12, HSTATE_IN_GUEST(r13)
 	cmpwi	r12, KVM_GUEST_MODE_NONE
 	bne	..kvmppc_handler_hasmagic_\intno
 	/* No KVM guest? Then jump back to the Linux handler! */
-	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	lwz	r12, HSTATE_SCRATCH1(r13)
 	b	2b
 
 	/* Now we know we're handling a KVM guest */
@@ -146,8 +144,8 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
  *
  * R12            = free
  * R13            = Shadow VCPU (PACA)
- * SVCPU.SCRATCH0 = guest R12
- * SVCPU.SCRATCH1 = guest CR
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CR
  * SPRG_SCRATCH0  = guest R13
  *
  */
@@ -159,9 +157,9 @@ kvmppc_handler_skip_ins:
 	mtsrr0	r12
 
 	/* Clean up all state */
-	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	lwz	r12, HSTATE_SCRATCH1(r13)
 	mtcr	r12
-	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+	PPC_LL	r12, HSTATE_SCRATCH0(r13)
 	GET_SCRATCH0(r13)
 
 	/* And get back into the code */
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 4a623eb..b0ae527 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -22,7 +22,7 @@
 #if defined(CONFIG_PPC_BOOK3S_64)
 
 #define GET_SHADOW_VCPU(reg)    \
-	addi    reg, r13, PACA_KVM_SVCPU
+	mr	reg, r13
 
 #elif defined(CONFIG_PPC_BOOK3S_32)
 
@@ -78,7 +78,7 @@ kvmppc_handler_trampoline_enter:
 
 	/* Activate guest mode, so faults get handled by KVM */
 	li	r11, KVM_GUEST_MODE_GUEST
-	stb	r11, SVCPU_IN_GUEST(r3)
+	stb	r11, HSTATE_IN_GUEST(r3)
 
 	/* Switch to guest segment. This is subarch specific. */
 	LOAD_GUEST_SEGMENTS
@@ -132,30 +132,30 @@ kvmppc_interrupt:
 	 *
 	 * SPRG_SCRATCH0  = guest R13
 	 * R12            = exit handler id
-	 * R13            = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
-	 * SVCPU.SCRATCH0 = guest R12
-	 * SVCPU.SCRATCH1 = guest CR
+	 * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CR
 	 *
 	 */
 
 	/* Save registers */
 
-	PPC_STL	r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13)
-	PPC_STL	r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13)
-	PPC_STL	r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13)
-	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13)
-	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13)
-	PPC_STL	r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13)
-	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13)
-	PPC_STL	r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13)
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13)
-	PPC_STL	r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13)
-	PPC_STL	r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13)
+	PPC_STL	r0, SVCPU_R0(r13)
+	PPC_STL	r1, SVCPU_R1(r13)
+	PPC_STL	r2, SVCPU_R2(r13)
+	PPC_STL	r3, SVCPU_R3(r13)
+	PPC_STL	r4, SVCPU_R4(r13)
+	PPC_STL	r5, SVCPU_R5(r13)
+	PPC_STL	r6, SVCPU_R6(r13)
+	PPC_STL	r7, SVCPU_R7(r13)
+	PPC_STL	r8, SVCPU_R8(r13)
+	PPC_STL	r9, SVCPU_R9(r13)
+	PPC_STL	r10, SVCPU_R10(r13)
+	PPC_STL	r11, SVCPU_R11(r13)
 
 	/* Restore R1/R2 so we can handle faults */
-	PPC_LL	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
-	PPC_LL	r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+	PPC_LL	r1, HSTATE_HOST_R1(r13)
+	PPC_LL	r2, HSTATE_HOST_R2(r13)
 
 	/* Save guest PC and MSR */
 #ifdef CONFIG_PPC64
@@ -171,17 +171,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 1:	mfsrr0	r3
 	mfsrr1	r4
 2:
-	PPC_STL	r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13)
-	PPC_STL	r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13)
+	PPC_STL	r3, SVCPU_PC(r13)
+	PPC_STL	r4, SVCPU_SHADOW_SRR1(r13)
 
 	/* Get scratch'ed off registers */
 	GET_SCRATCH0(r9)
-	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	lwz	r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	PPC_LL	r8, HSTATE_SCRATCH0(r13)
+	lwz	r7, HSTATE_SCRATCH1(r13)
 
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13)
-	stw	r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13)
+	PPC_STL	r9, SVCPU_R13(r13)
+	PPC_STL	r8, SVCPU_R12(r13)
+	stw	r7, SVCPU_CR(r13)
 
 	/* Save more register state  */
 
@@ -191,11 +191,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 	mfctr	r8
 	mflr	r9
 
-	stw	r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13)
-	PPC_STL	r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13)
-	stw	r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13)
-	PPC_STL	r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13)
-	PPC_STL	r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13)
+	stw	r5, SVCPU_XER(r13)
+	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
+	stw	r7, SVCPU_FAULT_DSISR(r13)
+	PPC_STL	r8, SVCPU_CTR(r13)
+	PPC_STL	r9, SVCPU_LR(r13)
 
 	/*
 	 * In order for us to easily get the last instruction,
@@ -225,7 +225,7 @@ ld_last_inst:
 	/* Set guest mode to 'jump over instruction' so if lwz faults
 	 * we'll just continue at the next IP. */
 	li	r9, KVM_GUEST_MODE_SKIP
-	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stb	r9, HSTATE_IN_GUEST(r13)
 
 	/*    1) enable paging for data */
 	mfmsr	r9
@@ -239,13 +239,13 @@ ld_last_inst:
 	sync
 
 #endif
-	stw	r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13)
+	stw	r0, SVCPU_LAST_INST(r13)
 
 no_ld_last_inst:
 
 	/* Unset guest mode */
 	li	r9, KVM_GUEST_MODE_NONE
-	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+	stb	r9, HSTATE_IN_GUEST(r13)
 
 	/* Switch back to host MMU */
 	LOAD_HOST_SEGMENTS
@@ -255,7 +255,7 @@ no_ld_last_inst:
 	 * R1       = host R1
 	 * R2       = host R2
 	 * R12      = exit handler id
-	 * R13      = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)
 	 * SVCPU.*  = guest *
 	 *
 	 */
@@ -265,7 +265,7 @@ no_ld_last_inst:
 	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
 	mtsrr1	r7
 	/* Load highmem handler address */
-	PPC_LL	r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13)
+	PPC_LL	r8, HSTATE_VMHANDLER(r13)
 	mtsrr0	r8
 
 	RFI
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 09/15 revised] KVM: PPC: Add support for Book3S processors in hypervisor mode
From: Paul Mackerras @ 2011-06-19  1:04 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <20110618083558.GJ28413@bloggs.ozlabs.ibm.com>

This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Using hypervisor mode means
that the guest can use the processor's supervisor mode.  That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host.  This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
Re-diffed due to context changes from revised patch 8/15

 Documentation/virtual/kvm/api.txt         |   17 +
 arch/powerpc/include/asm/exception-64s.h  |   19 +-
 arch/powerpc/include/asm/kvm_asm.h        |    4 +
 arch/powerpc/include/asm/kvm_book3s.h     |  136 +++++-
 arch/powerpc/include/asm/kvm_book3s_64.h  |    2 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   11 +
 arch/powerpc/include/asm/kvm_booke.h      |    4 +
 arch/powerpc/include/asm/kvm_host.h       |   53 ++-
 arch/powerpc/include/asm/kvm_ppc.h        |    6 +
 arch/powerpc/include/asm/mmu-hash64.h     |   10 +-
 arch/powerpc/include/asm/paca.h           |    2 +
 arch/powerpc/include/asm/reg.h            |    4 +
 arch/powerpc/kernel/asm-offsets.c         |   78 ++++
 arch/powerpc/kernel/exceptions-64s.S      |   60 ++--
 arch/powerpc/kvm/Kconfig                  |   41 ++-
 arch/powerpc/kvm/Makefile                 |   16 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       |  258 +++++++++++
 arch/powerpc/kvm/book3s_hv.c              |  452 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  325 ++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  667 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_segment.S         |   34 +-
 arch/powerpc/kvm/powerpc.c                |   22 +-
 arch/powerpc/kvm/trace.h                  |    2 +-
 include/linux/kvm.h                       |    6 +
 24 files changed, 2147 insertions(+), 82 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 0ebe922..cc5064c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1530,6 +1530,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu.  It occurs when the
+guest does a hypercall using the 'sc 1' instruction.  The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12).  Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 296c9b6..69435da 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -134,6 +134,17 @@ do_kvm_##n:								\
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_PR
+#define KVMTEST_PR(n)			__KVMTEST(n)
+#define KVM_HANDLER_PR(area, h, n)	__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST_PR(n)
+#define KVM_HANDLER_PR(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)
+#endif
+
 #define NOTEST(n)
 
 /*
@@ -210,7 +221,7 @@ label##_pSeries:					\
 	HMT_MEDIUM;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST, vec)
+				 EXC_STD, KVMTEST_PR, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
@@ -227,8 +238,8 @@ label##_hv:						\
 	beq	masked_##h##interrupt
 #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
 
-#define SOFTEN_TEST(vec)						\
-	KVMTEST(vec);							\
+#define SOFTEN_TEST_PR(vec)						\
+	KVMTEST_PR(vec);						\
 	_SOFTEN_TEST(EXC_STD)
 
 #define SOFTEN_TEST_HV(vec)						\
@@ -248,7 +259,7 @@ label##_hv:						\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
-				    EXC_STD, SOFTEN_TEST)
+				    EXC_STD, SOFTEN_TEST_PR)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0951b17..7b1f0e0 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -64,8 +64,12 @@
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
 #define BOOK3S_INTERRUPT_DECREMENTER	0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
 #define BOOK3S_INTERRUPT_SYSCALL	0xc00
 #define BOOK3S_INTERRUPT_TRACE		0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 480fff6..6135a98 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -116,6 +116,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
 extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -127,10 +128,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
@@ -151,6 +154,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
 	return to_book3s(vcpu)->hior;
@@ -165,16 +181,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 		vcpu->arch.shared->int_pending = 0;
 }
 
-static inline ulong dsisr(void)
-{
-	ulong r;
-	asm ( "mfdsisr %0 " : "=r" (r) );
-	return r;
-}
-
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
-
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	if ( num < 14 ) {
@@ -281,6 +287,108 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 	return crit;
 }
+#else /* CONFIG_KVM_BOOK3S_PR */
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	/* Recalculate LPCR:MER based on the presence of
+	 * a pending external interrupt
+	 */
+	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
+	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
+		vcpu->arch.lpcr |= LPCR_MER;
+	else
+		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+	return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault_dar;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+#endif
 
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
@@ -289,12 +397,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4cadd61..5f73388 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,9 +20,11 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#ifdef CONFIG_KVM_BOOK3S_PR
 static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 {
 	return &get_paca()->shadow_vcpu;
 }
+#endif
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 3126175..550cf07 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -74,6 +74,17 @@ struct kvmppc_host_state {
 	ulong scratch0;
 	ulong scratch1;
 	u8 in_guest;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu *kvm_vcpu;
+	u64 dabr;
+	u64 host_mmcr[3];
+	u32 host_pmc[6];
+	u64 host_purr;
+	u64 host_spurr;
+	u64 host_dscr;
+	u64 dec_expires;
+#endif
 };
 
 struct kvmppc_book3s_shadow_vcpu {
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index 9c9ba3d..a90e091 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 069eb9f..4a3f790 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,7 +33,9 @@
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
@@ -133,7 +135,26 @@ struct kvmppc_exit_timing {
 	};
 };
 
+struct kvmppc_pginfo {
+	unsigned long pfn;
+	atomic_t refcnt;
+};
+
 struct kvm_arch {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	unsigned long hpt_virt;
+	unsigned long ram_npages;
+	unsigned long ram_psize;
+	unsigned long ram_porder;
+	struct kvmppc_pginfo *ram_pginfo;
+	unsigned int lpid;
+	unsigned int host_lpid;
+	unsigned long host_lpcr;
+	unsigned long sdr1;
+	unsigned long host_sdr1;
+	int tlbie_lock;
+	unsigned short last_vcpu[NR_CPUS];
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
 struct kvmppc_pte {
@@ -190,7 +211,7 @@ struct kvm_vcpu_arch {
 	ulong rmcall;
 	ulong host_paca_phys;
 	struct kvmppc_slb slb[64];
-	int slb_max;		/* # valid entries in slb[] */
+	int slb_max;		/* 1 + index of last valid entry in slb[] */
 	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
@@ -212,7 +233,7 @@ struct kvm_vcpu_arch {
 #endif
 
 #ifdef CONFIG_VSX
-	u64 vsr[32];
+	u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -220,18 +241,24 @@ struct kvm_vcpu_arch {
 	u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
 	ulong pc;
 	ulong ctr;
 	ulong lr;
 
 	ulong xer;
 	u32 cr;
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S
 	ulong hflags;
 	ulong guest_owned_ext;
+	ulong purr;
+	ulong spurr;
+	ulong lpcr;
+	ulong dscr;
+	ulong amr;
+	ulong uamor;
+	u32 ctrl;
+	ulong dabr;
 #endif
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
@@ -270,6 +297,9 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 	u32 dbsr;
 
+	u64 mmcr[3];
+	u32 pmc[6];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
 	struct kvmppc_exit_timing timing_exit;
@@ -284,8 +314,12 @@ struct kvm_vcpu_arch {
 	struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+	ulong fault_dar;
+	u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-	u32 last_inst;
 	ulong fault_dear;
 	ulong fault_esr;
 	ulong queued_dear;
@@ -300,16 +334,25 @@ struct kvm_vcpu_arch {
 	u8 dcr_is_write;
 	u8 osi_needed;
 	u8 osi_enabled;
+	u8 hcall_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 	struct hrtimer dec_timer;
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
+	u64 dec_expires;
 	unsigned long pending_exceptions;
+	u16 last_cpu;
+	u32 last_inst;
+	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu_arch_shared shregs;
+#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 48b7ab7..0dafd53 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -112,6 +112,12 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index d865bd9..b445e0a 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT	12
-#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
 #define HPTE_R_N		ASM_CONST(0x0000000000000004)
+#define HPTE_R_G		ASM_CONST(0x0000000000000008)
+#define HPTE_R_M		ASM_CONST(0x0000000000000010)
+#define HPTE_R_I		ASM_CONST(0x0000000000000020)
+#define HPTE_R_W		ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
 #define HPTE_R_C		ASM_CONST(0x0000000000000080)
 #define HPTE_R_R		ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 58f4a18..a6da128 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -147,8 +147,10 @@ struct paca_struct {
 	struct dtl_entry *dtl_curr;	/* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+#endif
 	struct kvmppc_host_state kvm_hstate;
 #endif
 };
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d879a6b..36a611b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -189,6 +189,9 @@
 #define SPRN_CTR	0x009	/* Count Register */
 #define SPRN_DSCR	0x11
 #define SPRN_CFAR	0x1c	/* Come From Address Register */
+#define SPRN_AMR	0x1d	/* Authority Mask Register */
+#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
+#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
 #define SPRN_ACOP	0x1F	/* Available Coprocessor Register */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
@@ -252,6 +255,7 @@
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
+#define   LPID_RSVD	0x3ff		/* Reserved LPID for partn switching */
 #define	SPRN_HMER	0x150	/* Hardware m? error recovery */
 #define	SPRN_HMEER	0x151	/* Hardware m? enable error recovery */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index dabfb73..4f73572 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -187,6 +187,7 @@ int main(void)
 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
@@ -392,6 +393,29 @@ int main(void)
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
@@ -403,17 +427,60 @@ int main(void)
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
 	/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+#endif
 #ifdef CONFIG_PPC_BOOK3S
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
+	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
+			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
+	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
 
 #ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
 # define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
 #else	/* 32-bit */
 # define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
@@ -458,6 +525,17 @@ int main(void)
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+	HSTATE_FIELD(HSTATE_PMC, host_pmc);
+	HSTATE_FIELD(HSTATE_PURR, host_purr);
+	HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+	HSTATE_FIELD(HSTATE_DABR, dabr);
+	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #else /* CONFIG_PPC_BOOK3S */
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6da0055..163c041 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -87,14 +87,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST, 0x300)
+				 KVMTEST_PR, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -125,7 +125,7 @@ data_access_slb_pSeries:
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 #ifdef __DISABLED__
@@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST)
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
-	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+	FTR_SECTION_ELSE
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST_PR)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
 
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
 
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
@@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
@@ -297,7 +297,7 @@ data_access_check_stab:
 	mfspr	r9,SPRN_DSISR
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 	lbz	r9,HSTATE_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
@@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
 	.align	7
@@ -336,11 +336,11 @@ do_stab_bolted_pSeries:
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
+	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -417,7 +417,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
 	.align	7
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b7baff7..3662ecc 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
 	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
-	select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
 	bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_PR
+	bool
+	select KVM_MMIO
+
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
 	select KVM
 	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_PR
 	---help---
 	  Support running unmodified book3s_32 guest kernels
 	  in virtual machines on book3s_32 host processors.
@@ -48,10 +53,38 @@ config KVM_BOOK3S_32
 	  If unsure, say N.
 
 config KVM_BOOK3S_64
-	tristate "KVM support for PowerPC book3s_64 processors"
+	bool
+	select KVM_BOOK3S_64_HANDLER
+
+config KVM_BOOK3S_64_HV
+	bool "KVM support for POWER7 using hypervisor mode in host"
 	depends on EXPERIMENTAL && PPC_BOOK3S_64
 	select KVM
-	select KVM_BOOK3S_64_HANDLER
+	select KVM_BOOK3S_64
+	---help---
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 processors that have hypervisor
+	  mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 (or later) processors, and can only emulate
+	  POWER5+, POWER6 and POWER7 processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+	tristate "KVM support for PowerPC book3s_64 processors"
+	depends on EXPERIMENTAL && PPC_BOOK3S_64 && !KVM_BOOK3S_64_HV
+	select KVM
+	select KVM_BOOK3S_64
+	select KVM_BOOK3S_PR
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
@@ -65,6 +98,7 @@ config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified 440 guest kernels in virtual machines on
 	  440 host processors.
@@ -89,6 +123,7 @@ config KVM_E500
 	bool "KVM support for PowerPC E500 processors"
 	depends on EXPERIMENTAL && E500
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index bf9854f..ca7ed27 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -14,7 +14,7 @@ CFLAGS_emulate.o  := -I.
 
 common-objs-y += powerpc.o emulate.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
-obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
+obj-$(CONFIG_KVM_BOOK3S_PR) += book3s_exports.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
@@ -38,7 +38,7 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
+kvm-book3s_64_pr-objs := \
 	$(common-objs-y) \
 	fpu.o \
 	book3s_paired_singles.o \
@@ -50,7 +50,17 @@ kvm-book3s_64-objs := \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+kvm-objs-$(CONFIG_KVM_BOOK3S_64_PR) := $(kvm-book3s_64_pr-objs)
+
+kvm-book3s_64_hv-objs := \
+	../../../virt/kvm/kvm_main.o \
+	powerpc.o \
+	emulate.o \
+	book3s.o \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 0000000..4a4fbec
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,258 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER	24
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
+#define NR_LPIDS	(LPID_RSVD + 1)
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+	unsigned long hpt;
+	unsigned long lpid;
+
+	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+			       HPT_ORDER - PAGE_SHIFT);
+	if (!hpt) {
+		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+		return -ENOMEM;
+	}
+	kvm->arch.hpt_virt = hpt;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+		if (lpid >= NR_LPIDS) {
+			pr_err("kvm_alloc_hpt: No LPIDs free\n");
+			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
+	kvm->arch.lpid = lpid;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	kvm->arch.host_lpid = mfspr(SPRN_LPID);
+	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
+
+	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	return 0;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvmppc_pginfo *pginfo;
+
+	clear_bit(kvm->arch.lpid, lpid_inuse);
+	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = 0; i < kvm->arch.ram_npages; ++i)
+			put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+}
+
+static unsigned long user_page_size(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long size = PAGE_SIZE;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, addr);
+	if (vma)
+		size = vma_kernel_pagesize(vma);
+	up_read(&current->mm->mmap_sem);
+	return size;
+}
+
+static pfn_t hva_to_pfn(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page_to_pfn(page[0]);
+}
+
+long kvmppc_prepare_vrma(struct kvm *kvm,
+			 struct kvm_userspace_memory_region *mem)
+{
+	unsigned long psize, porder;
+	unsigned long i, npages;
+	struct kvmppc_pginfo *pginfo;
+	pfn_t pfn;
+	unsigned long hva;
+
+	/* First see what page size we have */
+	psize = user_page_size(mem->userspace_addr);
+	/* For now, only allow 16MB pages */
+	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
+		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
+		       psize, mem->memory_size, mem->userspace_addr);
+		return -EINVAL;
+	}
+	porder = __ilog2(psize);
+
+	npages = mem->memory_size >> porder;
+	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
+	if (!pginfo) {
+		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << porder);
+		if (user_page_size(hva) != psize)
+			goto err;
+		pfn = hva_to_pfn(hva);
+		if (pfn == 0) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
+			pr_err("oops, unaligned pfn %llx\n", pfn);
+			put_page(pfn_to_page(pfn));
+			goto err;
+		}
+		pginfo[i].pfn = pfn;
+	}
+
+	kvm->arch.ram_npages = npages;
+	kvm->arch.ram_psize = psize;
+	kvm->arch.ram_porder = porder;
+	kvm->arch.ram_pginfo = pginfo;
+
+	return 0;
+
+ err:
+	kfree(pginfo);
+	return -EINVAL;
+}
+
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+	unsigned long i;
+	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long pfn;
+	unsigned long *hpte;
+	unsigned long hash;
+	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+	if (!pginfo)
+		return;
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - kvm->arch.ram_porder))
+		npages = 1ul << (40 - kvm->arch.ram_porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > HPT_NPTEG)
+		npages = HPT_NPTEG;
+
+	for (i = 0; i < npages; ++i) {
+		pfn = pginfo[i].pfn;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+		hpte += 7 * 2;
+		/* HPTE low word - RPN, protection, etc. */
+		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
+		wmb();
+		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+			HPTE_V_LARGE | HPTE_V_VALID;
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return -EINVAL;
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+	set_bit(mfspr(SPRN_LPID), lpid_inuse);
+	set_bit(LPID_RSVD, lpid_inuse);
+
+	return 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				struct kvmppc_pte *gpte, bool data)
+{
+	return -ENOENT;
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 0000000..321bf38
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	u64 now;
+	unsigned long dec_nsec;
+
+	now = get_tb();
+	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_queue_dec(vcpu);
+	if (vcpu->arch.pending_exceptions)
+		return;
+	if (vcpu->arch.dec_expires != ~(u64)0) {
+		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+			tb_ticks_per_sec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	kvm_vcpu_block(vcpu);
+	vcpu->stat.halt_wakeup++;
+
+	if (vcpu->arch.dec_expires != ~(u64)0)
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.shregs.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		if (vcpu->arch.shregs.msr & MSR_PR) {
+			/* sc 1 from userspace - reflect to guest syscall */
+			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+			r = RESUME_GUEST;
+			break;
+		}
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest does a bad real-mode access,
+	 * as we have enabled VRMA (virtualized real mode area) mode in the
+	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					0x08000000);
+		r = RESUME_GUEST;
+		break;
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * We just generate a program interrupt to the guest, since
+	 * we don't emulate any guest instructions at this stage.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		kvmppc_core_queue_program(vcpu, 0x80000);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(tsk)) {
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long lpcr;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+	vcpu->arch.last_cpu = -1;
+	vcpu->arch.host_msr = mfmsr();
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to host PVR, since we can't spoof it */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+	/* remember where some real-mode handlers are */
+	vcpu->arch.trampoline_lowmem = (ulong)kvmppc_handler_lowmem_trampoline;
+	vcpu->arch.trampoline_enter = (ulong)kvmppc_handler_trampoline_enter;
+	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
+	vcpu->arch.lpcr = lpcr;
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	return vcpu;
+
+free_vcpu:
+	kfree(vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	preempt_disable();
+
+	/*
+	 * Make sure we are running on thread 0, and that
+	 * secondary threads are offline.
+	 * XXX we should also block attempts to bring any
+	 * secondary threads online.
+	 */
+	if (threads_per_core > 1) {
+		int cpu = smp_processor_id();
+		int thr = cpu_thread_in_core(cpu);
+
+		if (thr)
+			goto out;
+		while (++thr < threads_per_core)
+			if (cpu_online(cpu + thr))
+				goto out;
+	}
+
+	kvm_guest_enter();
+
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	/* cancel pending dec exception if dec is positive */
+	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	return kvmppc_handle_exit(run, vcpu, current);
+
+ out:
+	preempt_enable();
+	return -EBUSY;
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		return kvmppc_prepare_vrma(kvm, mem);
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	long r;
+
+	/* Allocate hashed page table */
+	r = kvmppc_alloc_hpt(kvm);
+
+	return r;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_hpt(kvm);
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hv_init();
+
+	return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 0000000..5c6a387
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,325 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+#define DISABLE_INTERRUPTS	\
+	mfmsr   r0;		\
+	rldicl  r0,r0,48,1;	\
+	rotldi  r0,r0,16;	\
+	mtmsrd  r0,1;		\
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save host PMU registers and load guest PMU registers */
+	/* R4 is live here (vcpu pointer) but not r3 or r5 */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	isync
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r5, LPPACA_PMCINUSE(r3)
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r7, HSTATE_MMCR(r13)
+	std	r5, HSTATE_MMCR + 8(r13)
+	std	r6, HSTATE_MMCR + 16(r13)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC(r13)
+	stw	r5, HSTATE_PMC + 4(r13)
+	stw	r6, HSTATE_PMC + 8(r13)
+	stw	r7, HSTATE_PMC + 12(r13)
+	stw	r8, HSTATE_PMC + 16(r13)
+	stw	r9, HSTATE_PMC + 20(r13)
+31:
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, HSTATE_DSCR(r13)
+
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, HSTATE_DABR(r13)
+
+	DISABLE_INTERRUPTS
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 */
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	mtspr	SPRN_HDEC,r8
+	extsw	r8,r8
+	add	r8,r8,r7
+	std	r8,HSTATE_DECEXP(r13)
+
+	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
+	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+
+	/* Jump to segment patching handler and into our guest */
+	b	kvmppc_rmcall
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 *
+	 */
+
+	/* R7 = vcpu */
+	ld	r7, HSTATE_KVM_VCPU(r13)
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r4, LPPACA_PMCINUSE(r3)
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+	lwz	r3, HSTATE_PMC(r13)
+	lwz	r4, HSTATE_PMC + 4(r13)
+	lwz	r5, HSTATE_PMC + 8(r13)
+	lwz	r6, HSTATE_PMC + 12(r13)
+	lwz	r8, HSTATE_PMC + 16(r13)
+	lwz	r9, HSTATE_PMC + 20(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR(r13)
+	ld	r4, HSTATE_MMCR + 8(r13)
+	ld	r5, HSTATE_MMCR + 16(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_MMCR0, r3
+	isync
+23:
+
+	/* Restore host msr -> SRR1 */
+	ld	r4, VCPU_HOST_MSR(r7)
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Call Linux for hardware interrupts/decrementer
+	 * r3 = address of interrupt handler (exit reason)
+	 */
+	/* Note: preemption is disabled at this point */
+
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	1f
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	1f
+
+	/* Back to EE=1 */
+	mtmsr	r4
+	sync
+	b	kvm_return_point
+
+1:	bl	call_linux_handler
+
+.global kvm_return_point
+kvm_return_point:
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+call_linux_handler:
+	/* Restore host IP -> SRR0 */
+	mflr	r3
+	mtlr	r12
+
+	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r7)
+	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+	b	kvmppc_rmcall
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+*/
+_GLOBAL(kvmppc_save_fp)
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+#ifdef CONFIG_VSX
+	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
+#else
+	oris	r8,r8,MSR_VEC@h
+#endif
+#endif
+	mtmsrd	r8
+	isync
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VSRS
+	stxvd2x	reg,r6,r3
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	stfd	reg,reg*8+VCPU_FPRS(r3)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+	mffs	fr0
+	stfd	fr0,VCPU_FPSCR(r3)
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VRS
+	stvx	reg,r6,r3
+	reg = reg + 1
+	.endr
+	mfvscr	vr0
+	li	r6,VCPU_VSCR
+	stvx	vr0,r6,r3
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r3)
+	mtmsrd	r9
+	isync
+	blr
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ */
+	.globl	kvmppc_load_fp
+kvmppc_load_fp:
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+#ifdef CONFIG_VSX
+	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
+#else
+	oris	r8,r8,MSR_VEC@h
+#endif
+#endif
+	mtmsrd	r8
+	isync
+	lfd	fr0,VCPU_FPSCR(r4)
+	MTFSF_L(fr0)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VSRS
+	lxvd2x	reg,r7,r4
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	lfd	reg,reg*8+VCPU_FPRS(r4)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	li	r7,VCPU_VSCR
+	lvx	vr0,r7,r4
+	mtvscr	vr0
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VRS
+	lvx	reg,r7,r4
+	reg = reg + 1
+	.endr
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r4)
+	mtspr	SPRN_VRSAVE,r7
+	blr
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 0000000..f0dc0ff
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,667 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
+
+	.globl	kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+	mfspr	r13,SPRN_SRR0
+	addi	r13,r13,4
+	mtspr	SPRN_SRR0,r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
+
+	.globl	kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+	mfspr	r13,SPRN_HSRR0
+	addi	r13,r13,4
+	mtspr	SPRN_HSRR0,r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+
+/*
+ * This trampoline brings us back to a real mode handler
+ *
+ * Input Registers:
+ *
+ * R5 = SRR0
+ * R6 = SRR1
+ * R12 = real-mode IP
+ * LR = real-mode IP
+ *
+ * We have external interrupts set to come to the hypervisor using
+ * HSRR0/1, so if we're handling an external interrupt, we have to
+ * set HSRR0/1 rather than SRR0/1.
+ */
+.global kvmppc_handler_lowmem_trampoline
+kvmppc_handler_lowmem_trampoline:
+	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
+	beq	1f
+	mtsrr0	r3
+	mtsrr1	r4
+	blr
+1:	mtspr	SPRN_HSRR0,r3
+	mtspr	SPRN_HSRR1,r4
+	blr
+
+/*
+ * Call a function in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * R5 = function
+ * R6 = MSR
+ * R7 = scratch register
+ *
+ */
+_GLOBAL(kvmppc_rmcall)
+	mfmsr	r7
+	li	r0,MSR_RI		/* clear RI in MSR */
+	andc	r7,r7,r0
+	mtmsrd	r7,1
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI
+
+.global kvmppc_trampoline_lowmem
+kvmppc_trampoline_lowmem:
+	PPC_LONG kvmppc_handler_lowmem_trampoline - _stext
+
+.global kvmppc_trampoline_enter
+kvmppc_trampoline_enter:
+	PPC_LONG kvmppc_handler_trampoline_enter - _stext
+
+#define ULONG_SIZE 		8
+#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * all other volatile GPRS = free
+	 */
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Save R1 in the PACA */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Set partition DABR */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+
+	/* Clear out SLB */
+	li	r6,0
+	slbmte	r6,r6
+	slbia
+	ptesync
+
+	/* Switch to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	ld	r6,KVM_SDR1(r9)
+	lwz	r7,KVM_LPID(r9)
+	li	r0,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	ld	r8,VCPU_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/*
+	 * Invalidate the TLB if we could possibly have stale TLB
+	 * entries for this partition on this core due to the use
+	 * of tlbiel.
+	 */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	lwz	r5,VCPU_VCPUID(r4)
+	lhz	r6,PACAPACAINDEX(r13)
+	lhz	r8,VCPU_LAST_CPU(r4)
+	sldi	r7,r6,1			/* see if this is the same vcpu */
+	add	r7,r7,r9		/* as last ran on this pcpu */
+	lhz	r0,KVM_LAST_VCPU(r7)
+	cmpw	r6,r8			/* on the same cpu core as last time? */
+	bne	3f
+	cmpw	r0,r5			/* same vcpu as this core last ran? */
+	beq	1f
+3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
+	sth	r5,KVM_LAST_VCPU(r7)
+	li	r6,128
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+	/* Load up guest SLB entries */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
+	/* Restore state of CTRL run bit; assume 1 on entry */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	mfspr	r6,SPRN_CTRLF
+	clrrdi	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	ld	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_XER(r4)
+
+	mtctr	r6
+	mtxer	r7
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+	ld	r10, VCPU_PC(r4)
+
+	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+fast_guest_return:
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/* Enter guest */
+
+	ld	r5, VCPU_LR(r4)
+	lwz	r6, VCPU_CR(r4)
+	mtlr	r5
+	mtcr	r6
+
+	ld	r0, VCPU_GPR(r0)(r4)
+	ld	r1, VCPU_GPR(r1)(r4)
+	ld	r2, VCPU_GPR(r2)(r4)
+	ld	r3, VCPU_GPR(r3)(r4)
+	ld	r5, VCPU_GPR(r5)(r4)
+	ld	r6, VCPU_GPR(r6)(r4)
+	ld	r7, VCPU_GPR(r7)(r4)
+	ld	r8, VCPU_GPR(r8)(r4)
+	ld	r9, VCPU_GPR(r9)(r4)
+	ld	r10, VCPU_GPR(r10)(r4)
+	ld	r11, VCPU_GPR(r11)(r4)
+	ld	r12, VCPU_GPR(r12)(r4)
+	ld	r13, VCPU_GPR(r13)(r4)
+
+	ld	r4, VCPU_GPR(r4)(r4)
+
+	hrfid
+	b	.
+kvmppc_handler_trampoline_enter_end:
+
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt
+kvmppc_interrupt:
+	/*
+	 * Register contents:
+	 * R12		= interrupt vector
+	 * R13		= PACA
+	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+	std	r9, HSTATE_HOST_R2(r13)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(r0)(r9)
+	std	r1, VCPU_GPR(r1)(r9)
+	std	r2, VCPU_GPR(r2)(r9)
+	std	r3, VCPU_GPR(r3)(r9)
+	std	r4, VCPU_GPR(r4)(r9)
+	std	r5, VCPU_GPR(r5)(r9)
+	std	r6, VCPU_GPR(r6)(r9)
+	std	r7, VCPU_GPR(r7)(r9)
+	std	r8, VCPU_GPR(r8)(r9)
+	ld	r0, HSTATE_HOST_R2(r13)
+	std	r0, VCPU_GPR(r9)(r9)
+	std	r10, VCPU_GPR(r10)(r9)
+	std	r11, VCPU_GPR(r11)(r9)
+	ld	r3, HSTATE_SCRATCH0(r13)
+	lwz	r4, HSTATE_SCRATCH1(r13)
+	std	r3, VCPU_GPR(r12)(r9)
+	stw	r4, VCPU_CR(r9)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(r13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	stw	r12,VCPU_TRAP(r9)
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,0
+	bge	ignore_hdec
+2:
+
+	/* Check for mediated interrupts (could be done earlier really ...) */
+	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
+	bne+	1f
+	ld	r5,VCPU_LPCR(r9)
+	andi.	r0,r11,MSR_EE
+	beq	1f
+	andi.	r0,r5,LPCR_MER
+	bne	bounce_ext_interrupt
+1:
+
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Save HEIR (HV emulation assist reg) in last_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,-1
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	stw	r3,VCPU_LAST_INST(r9)
+
+	/* Save more register state  */
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+
+	stw	r5, VCPU_XER(r9)
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	std	r8, VCPU_CTR(r9)
+	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	6f
+7:	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Read the guest SLB and save it away */
+	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
+	mtctr	r0
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+	li	r5,0
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	bdnz	1b
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,HSTATE_PURR(r13)
+	ld	r4,HSTATE_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+	/* Clear out SLB */
+	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+
+hdec_soon:
+	/* Switch back to host partition */
+	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r6,KVM_HOST_SDR1(r4)
+	lwz	r7,KVM_HOST_LPID(r4)
+	li	r8,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r8)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r7)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(r14)(r9)
+	std	r15, VCPU_GPR(r15)(r9)
+	std	r16, VCPU_GPR(r16)(r9)
+	std	r17, VCPU_GPR(r17)(r9)
+	std	r18, VCPU_GPR(r18)(r9)
+	std	r19, VCPU_GPR(r19)(r9)
+	std	r20, VCPU_GPR(r20)(r9)
+	std	r21, VCPU_GPR(r21)(r9)
+	std	r22, VCPU_GPR(r22)(r9)
+	std	r23, VCPU_GPR(r23)(r9)
+	std	r24, VCPU_GPR(r24)(r9)
+	std	r25, VCPU_GPR(r25)(r9)
+	std	r26, VCPU_GPR(r26)(r9)
+	std	r27, VCPU_GPR(r27)(r9)
+	std	r28, VCPU_GPR(r28)(r9)
+	std	r29, VCPU_GPR(r29)(r9)
+	std	r30, VCPU_GPR(r30)(r9)
+	std	r31, VCPU_GPR(r31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* Save PMU registers */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+22:
+	/* save FP state */
+	mr	r3, r9
+	bl	.kvmppc_save_fp
+
+	/* RFI into the highmem handler */
+	mfmsr	r7
+	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
+	mtsrr1	r7
+	/* Load highmem handler address */
+	ld	r8, VCPU_HIGHMEM_HANDLER(r3)
+	mtsrr0	r8
+
+	RFI
+kvmppc_handler_trampoline_exit_end:
+
+6:	mfspr	r6,SPRN_HDAR
+	mfspr	r7,SPRN_HDSISR
+	b	7b
+
+ignore_hdec:
+	mr	r4,r9
+	b	fast_guest_return
+
+bounce_ext_interrupt:
+	mr	r4,r9
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10,BOOK3S_INTERRUPT_EXTERNAL
+	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+	b	fast_guest_return
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index b0ae527..174d59c 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -85,29 +85,29 @@ kvmppc_handler_trampoline_enter:
 
 	/* Enter guest */
 
-	PPC_LL	r4, (SVCPU_CTR)(r3)
-	PPC_LL	r5, (SVCPU_LR)(r3)
-	lwz	r6, (SVCPU_CR)(r3)
-	lwz	r7, (SVCPU_XER)(r3)
+	PPC_LL	r4, SVCPU_CTR(r3)
+	PPC_LL	r5, SVCPU_LR(r3)
+	lwz	r6, SVCPU_CR(r3)
+	lwz	r7, SVCPU_XER(r3)
 
 	mtctr	r4
 	mtlr	r5
 	mtcr	r6
 	mtxer	r7
 
-	PPC_LL	r0, (SVCPU_R0)(r3)
-	PPC_LL	r1, (SVCPU_R1)(r3)
-	PPC_LL	r2, (SVCPU_R2)(r3)
-	PPC_LL	r4, (SVCPU_R4)(r3)
-	PPC_LL	r5, (SVCPU_R5)(r3)
-	PPC_LL	r6, (SVCPU_R6)(r3)
-	PPC_LL	r7, (SVCPU_R7)(r3)
-	PPC_LL	r8, (SVCPU_R8)(r3)
-	PPC_LL	r9, (SVCPU_R9)(r3)
-	PPC_LL	r10, (SVCPU_R10)(r3)
-	PPC_LL	r11, (SVCPU_R11)(r3)
-	PPC_LL	r12, (SVCPU_R12)(r3)
-	PPC_LL	r13, (SVCPU_R13)(r3)
+	PPC_LL	r0, SVCPU_R0(r3)
+	PPC_LL	r1, SVCPU_R1(r3)
+	PPC_LL	r2, SVCPU_R2(r3)
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r5, SVCPU_R5(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r7, SVCPU_R7(r3)
+	PPC_LL	r8, SVCPU_R8(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
+	PPC_LL	r10, SVCPU_R10(r3)
+	PPC_LL	r11, SVCPU_R11(r3)
+	PPC_LL	r12, SVCPU_R12(r3)
+	PPC_LL	r13, SVCPU_R13(r3)
 
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 026036e..3a4f379 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,8 +38,12 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
+#ifndef CONFIG_KVM_BOOK3S_64_HV
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
+#else
+	return 1;
+#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -184,10 +188,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 #endif
-	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
+		r = 1;
+		break;
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
@@ -195,6 +202,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -291,6 +299,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+	vcpu->arch.dec_expires = ~(u64)0;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
@@ -317,6 +326,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 #endif
 	kvmppc_core_vcpu_load(vcpu, cpu);
+	vcpu->cpu = smp_processor_id();
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -325,6 +335,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_BOOKE
 	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 #endif
+	vcpu->cpu = -1;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -496,6 +507,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		for (i = 0; i < 32; i++)
 			kvmppc_set_gpr(vcpu, i, gprs[i]);
 		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 9; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
 	}
 
 	kvmppc_core_deliver_interrupts(vcpu);
@@ -518,6 +536,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
+	} else if (vcpu->cpu != -1) {
+		smp_send_reschedule(vcpu->cpu);
 	}
 
 	return 0;
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index d62a14b..b135d3d 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
  *                         Book3S trace points                           *
  *************************************************************************/
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_KVM_BOOK3S_PR
 
 TRACE_EVENT(kvm_book3s_exit,
 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9c9ca7c..a156294 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
1.7.5.4

^ permalink raw reply related

* ??: [PATCH] fs_enet: fix freescale FCC ethernet dp buffer alignment
From: ?? @ 2011-06-20  4:01 UTC (permalink / raw)
  To: 'David Laight', 'Holger Brunck', linuxppc-dev
  Cc: netdev, 'Clive Stubbings', 'Vitaly Bordug'
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8ADA6@saturn3.aculab.com>

Hi David Laight, Thank you so much~~
1. Hardware based on FPGA, it dose not support buffer chaining.
2. How dose the dma map support buffer chaining function?
3. On the align issue, the hardware designer start to fix it, maybe it's a
better way.  

-----????-----
???: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] ??
David Laight
????: 2011?6?17? 18:16
???: yangyong@neusoft.com; Holger Brunck; linuxppc-dev@lists.ozlabs.org
??: netdev@vger.kernel.org; Clive Stubbings; Vitaly Bordug
??: RE: [PATCH] fs_enet: fix freescale FCC ethernet dp buffer alignment

 
> Hello,
> Motioned to the memory aligned, now there is such requirement:
> When the driver send an packet to hardware, the skb's address passed
by
> stack do a dma map into hardware, the skb's dma address must 
> be 64-byte aligned.

Does the hardware support buffer chaining?
In which case you only need to copy the data upto the first
64 byte boundary into another buffer.

Actually, given that you are likely to have to fixup every
fragment of the frame being transmitted, if might be worth
allocating a fixed transmnit buffer area and copying the
frames into it prior to sending.
Certainly you need to allow for transmits made up of a
significant number of small buffers linked together.

Really you should beat up the hardware designers!

Copying the data to even a 4 byte boundary is almost
always a misaligned copy. Typically this only applies
to the receive dma - when writing a 2 byte pad before
the frame data would be much better.

	David


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

---------------------------------------------------------------------------------------------------
Confidentiality Notice: The information contained in this e-mail and any accompanying attachment(s) 
is intended only for the use of the intended recipient and may be confidential and/or privileged of 
Neusoft Corporation, its subsidiaries and/or its affiliates. If any reader of this communication is 
not the intended recipient, unauthorized use, forwarding, printing,  storing, disclosure or copying 
is strictly prohibited, and may be unlawful.If you have received this communication in error,please 
immediately notify the sender by return e-mail, and delete the original message and all copies from 
your system. Thank you. 
---------------------------------------------------------------------------------------------------

^ permalink raw reply

* [PATCH] MAINTAINERS: add arch/powerpc/platforms/85xx/ to the 85xx entry
From: Baruch Siach @ 2011-06-20  5:00 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Baruch Siach

Cc: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
---
 MAINTAINERS |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e50fc6e..8294613 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3904,6 +3904,7 @@ W:	http://www.penguinppc.org/
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Maintained
 F:	arch/powerpc/platforms/83xx/
+F:	arch/powerpc/platforms/85xx/
 
 LINUX FOR POWERPC PA SEMI PWRFICIENT
 M:	Olof Johansson <olof@lixom.net>
-- 
1.7.5.3

^ permalink raw reply related

* [RFC PATCH] powerpc: 85xx: Make e500/e500v2 depend on !E500MC
From: Baruch Siach @ 2011-06-20  4:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Baruch Siach

CONFIG_E500MC breaks e500/e500v2 systems. It defines L1_CACHE_SHIFT to 6, thus
breaking clear_pages(), probably others too.

Cc: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
---
Is this the right approach?

 arch/powerpc/platforms/85xx/Kconfig |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index b6976e1..5b8546d 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -13,6 +13,8 @@ if FSL_SOC_BOOKE
 
 if PPC32
 
+if !PPC_E500MC
+
 config MPC8540_ADS
 	bool "Freescale MPC8540 ADS"
 	select DEFAULT_UIMAGE
@@ -155,6 +157,8 @@ config SBC8560
 	help
 	  This option enables support for the Wind River SBC8560 board
 
+endif # !PPC_E500MC
+
 config P3041_DS
 	bool "Freescale P3041 DS"
 	select DEFAULT_UIMAGE
-- 
1.7.5.3

^ permalink raw reply related

* RE: NAND BBT corruption on MPC83xx
From: Atlant Schmidt @ 2011-06-20 11:22 UTC (permalink / raw)
  To: 'Mike Hench', Scott Wood, Matthew L. Creech
  Cc: linux-mtd@lists.infradead.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <F87D23B7E1F84E4AB52E4FA4A55F85DC0259E343@tpamail.elutions.com>

Mike:

> It is not a permanent damage thing.

  A "read disturb" does no permanent damage to the chip
  but if the read disturb event involves more bits than
  can be corrected by your ECC code, it can do permanent
  damage to the *DATA* you've stored in that block.

  For this reason, a good flash management system manages
  to at least occasionally read through *ALL* of the in-use
  blocks in the device so that single-bit errors can be
  scrubbed out (read and successfully corrected) before
  an adjacent bit in the block also fails (which would
  eventually lead to a multi-bit error that might be
  beyond the ability to be corrected by the ECC).

  As far as I know (and I'm sure the list will correct
  me if I'm wrong! ;-) ), neither UBI nor UBIFS nor any
  Linux layer provides this routine scrubbing; you have
  to code it up yourself, probably by accessing the
  device at the UBI (underlying block device/LEB) layer.

                        Atlant

-----Original Message-----
From: linux-mtd-bounces@lists.infradead.org [mailto:linux-mtd-bounces@lists=
.infradead.org] On Behalf Of Mike Hench
Sent: Saturday, June 18, 2011 13:55
To: Scott Wood; Matthew L. Creech
Cc: linuxppc-dev@lists.ozlabs.org; linux-mtd@lists.infradead.org
Subject: RE: NAND BBT corruption on MPC83xx

Scott Wood wrote:
> As for the corruption, could it be degradation from repeated reads of
that
> one page?

Read Disturb. I Did not know SLC did that.
It just takes 10x as long as MLC, on the order of a million reads.
Supposedly erasing the block fixes it.
It is not a permanent damage thing.
I was seeing ~9 hours before failure with heavy writes.
~4GByte/hour =3D 2M pages, total ~18 million reads before errors in that
last block showed up.

Cool. Now we know.
Thanks.

Mike Hench



______________________________________________________
Linux MTD discussion mailing list
http://lists.infradead.org/mailman/listinfo/linux-mtd/

This e-mail and the information, including any attachments, it contains are=
 intended to be a confidential communication only to the person or entity t=
o whom it is addressed and may contain information that is privileged. If t=
he reader of this message is not the intended recipient, you are hereby not=
ified that any dissemination, distribution or copying of this communication=
 is strictly prohibited. If you have received this communication in error, =
please immediately notify the sender and destroy the original message.

Thank you.

Please consider the environment before printing this email.

^ permalink raw reply

* Re: NAND BBT corruption on MPC83xx
From: Matthew L. Creech @ 2011-06-20 15:20 UTC (permalink / raw)
  To: Scott Wood; +Cc: linux-mtd, linuxppc-dev, mhench
In-Reply-To: <20110617163442.204348a0@schlenkerla.am.freescale.net>

On Fri, Jun 17, 2011 at 5:34 PM, Scott Wood <scottwood@freescale.com> wrote:
>
> As for the corruption, could it be degradation from repeated reads of that
> one page?
>

Could be.  I think Mike's theory was that the -1 page_addr sort of
"wrapped around", and caused us to read in the last block on flash
each time NAND_CMD_PAGEPROG was performed.  So with a lot of writes
happening, we could end up with a BBT that looks like this.

That makes sense I guess, since set_addr() in fsl_elbc_nand.c uses
page_addr to set FBAR.  I don't see anything about it in the manual,
but if FBAR wraps beyond the end of the chip, maybe the bits that
don't make sense are simply ignored.  (In which case we should
probably add a check in set_addr() to prevent anything like this in
the future)

In theory I should be able to prove it out by running 2 devices in
parallel - one with that block of code still there, and one with it
removed.  If the former device sees bit-flips in the BBT and the
latter one doesn't, we'll be sure of the culprit.  I'll try this and
come back with the results.

Thanks!

-- 
Matthew L. Creech

^ permalink raw reply

* Re: [PATCH v3 2/2] powerpc: add support for MPIC message register API
From: Scott Wood @ 2011-06-20 15:59 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Meador Inge, openmcapi-dev, devicetree-discuss, Hollis Blanchard,
	linuxppc-dev
In-Reply-To: <1308351533.32158.61.camel@pasglop>

On Sat, 18 Jun 2011 08:58:53 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Fri, 2011-06-17 at 11:58 -0500, Scott Wood wrote:
> > When did this change from "considered an internal implementation
> > issue, and not really an interface" to "all new interfaces"?
> 
> Interesting blurb... that's not everybody's opinion and I would argue
> that supporting AMP kernels isn't something we want to do with closed
> source crap.

I'm not advocating "closed source crap", just that if something is
"policy" (as opposed to opinion), it'd be nice if the documentation
actually matched.

-Scott

^ permalink raw reply

* [PATCH] spi/fsl_spi: fix CPM spi driver
From: Holger Brunck @ 2011-06-20 16:31 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: spi-devel-general, Holger Brunck

This patch fixes the freescale spi driver for CPM. Without this
patch SPI on CPM failed because cpm_muram_alloc_fixed tries to
allocate muram in an preserved area. The error reported was:

mpc8xxx_spi f0011a80.spi: can't allocate spi parameter ram
mpc8xxx_spi: probe of f0011a80.spi failed with error -12

Now the driver uses of_iomap to get access to this area
similar to i2c driver driver in the i2c-cpm.c which has a
similar device tree node. This is tested on a MPC8247 with CPM2.

Signed-off-by: Holger Brunck <holger.brunck@keymile.com>
cc: Grant Likely <grant.likely@secretlab.ca>
cc: spi-devel-general@lists.sourceforge.net
---
This was the same problem reported and discussed on ppc-dev for CPM1:
http://lists.ozlabs.org/pipermail/linuxppc-dev/2010-September/085739.html

 drivers/spi/spi_fsl_spi.c |   28 +++++++++++-----------------
 1 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/drivers/spi/spi_fsl_spi.c b/drivers/spi/spi_fsl_spi.c
index 7963c9b..ca57edf 100644
--- a/drivers/spi/spi_fsl_spi.c
+++ b/drivers/spi/spi_fsl_spi.c
@@ -684,7 +684,7 @@ static unsigned long fsl_spi_cpm_get_pram(struct mpc8xxx_spi *mspi)
 	struct device_node *np = dev->of_node;
 	const u32 *iprop;
 	int size;
-	unsigned long spi_base_ofs;
+	void __iomem *spi_base;
 	unsigned long pram_ofs = -ENOMEM;
 
 	/* Can't use of_address_to_resource(), QE muram isn't at 0. */
@@ -702,33 +702,27 @@ static unsigned long fsl_spi_cpm_get_pram(struct mpc8xxx_spi *mspi)
 		return pram_ofs;
 	}
 
-	/* CPM1 and CPM2 pram must be at a fixed addr. */
-	if (!iprop || size != sizeof(*iprop) * 4)
-		return -ENOMEM;
-
-	spi_base_ofs = cpm_muram_alloc_fixed(iprop[2], 2);
-	if (IS_ERR_VALUE(spi_base_ofs))
-		return -ENOMEM;
+	spi_base = of_iomap(np, 1);
+	if (spi_base == NULL)
+		return -EINVAL;
 
 	if (mspi->flags & SPI_CPM2) {
 		pram_ofs = cpm_muram_alloc(SPI_PRAM_SIZE, 64);
-		if (!IS_ERR_VALUE(pram_ofs)) {
-			u16 __iomem *spi_base = cpm_muram_addr(spi_base_ofs);
-
-			out_be16(spi_base, pram_ofs);
-		}
+		out_be16(spi_base, pram_ofs);
 	} else {
-		struct spi_pram __iomem *pram = cpm_muram_addr(spi_base_ofs);
+		struct spi_pram __iomem *pram = spi_base;
 		u16 rpbase = in_be16(&pram->rpbase);
 
 		/* Microcode relocation patch applied? */
 		if (rpbase)
 			pram_ofs = rpbase;
-		else
-			return spi_base_ofs;
+		else {
+			pram_ofs = cpm_muram_alloc(SPI_PRAM_SIZE, 64);
+			out_be16(spi_base, pram_ofs);
+		}
 	}
 
-	cpm_muram_free(spi_base_ofs);
+	iounmap(spi_base);
 	return pram_ofs;
 }
 
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH] perf_events: Enable idle state tracing for pseries (ppc64)
From: deepthi @ 2011-06-20 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <1308284659.32158.4.camel@pasglop>

On Friday 17 June 2011 09:54 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2011-06-01 at 18:05 +0530, Deepthi Dharwar wrote:
>> Hi,
>>
>> Please find below a patch, which has perf_events added for pseries (ppc64)
>> platform in order to emit the trace required for perf timechart. 
>> It essentially enables perf timechart for pseries platfrom to analyse
>> power savings events like cpuidle states.
> 
> Unless I'm mistaken, you added traces to dedicated CPU idle sleep but
> not shared processor. Any reason ?
> 
Yes, the traces were added only to dedicated CPU idle sleep and not for 
shared processor. This was added only for RFC purpose, and looking for 
comments from trace implementation point of view. This can be
easily extended to the latter too.

> Also I don't really know that tracing stuff but what's the point of
> having start/end _and trace_cpu_idle if you're going to always start &
> end around a single occurence of trace_cpu_idle ?
> 
power_start/end are the APIs that were used initially
and they are going to be deprecated in the upcoming kernel releases.
trace_cpu_idle call is going to replace power start/end routines. 
To maintain backward compatibility and uniformity, both the routines 
have been used.
(ref:https://lkml.org/lkml/2010/11/14/60)

> Wouldn't there be a way to start/end and then trace the snooze and
> subsequent cede within the same start/end section or that makes no
> sense ?
> 
We wanted to find the residency time of both Snooze as well as cede 
separately. Knowing this will help us tweak our cpuidle code. So, both 
have been captured separately.

> Also would there be any interest in doing the tracing more generically
> in idle.c ?
> 
Yes, this tracing is already implemented for Intel platform. This would
be a part of cpuidle framework. Going further, once the power cpuidle 
framework is ported and ready, we will extend this trace there as well.
(ref:https://lkml.org/lkml/2011/6/7/375)

> Cheers,
> Ben.
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

Regards,
Deepthi

^ permalink raw reply

* Re: [PATCH] perf_events: Enable idle state tracing for pseries (ppc64)
From: Benjamin Herrenschmidt @ 2011-06-20 21:42 UTC (permalink / raw)
  To: deepthi; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <4DFF80FD.8080600@linux.vnet.ibm.com>

On Mon, 2011-06-20 at 22:48 +0530, deepthi wrote:
> On Friday 17 June 2011 09:54 AM, Benjamin Herrenschmidt wrote:
> > On Wed, 2011-06-01 at 18:05 +0530, Deepthi Dharwar wrote:
> >> Hi,
> >>
> >> Please find below a patch, which has perf_events added for pseries (ppc64)
> >> platform in order to emit the trace required for perf timechart. 
> >> It essentially enables perf timechart for pseries platfrom to analyse
> >> power savings events like cpuidle states.
> > 
> > Unless I'm mistaken, you added traces to dedicated CPU idle sleep but
> > not shared processor. Any reason ?
> > 
> Yes, the traces were added only to dedicated CPU idle sleep and not for 
> shared processor. This was added only for RFC purpose, and looking for 
> comments from trace implementation point of view. This can be
> easily extended to the latter too.

Please do both.

> > Also I don't really know that tracing stuff but what's the point of
> > having start/end _and trace_cpu_idle if you're going to always start &
> > end around a single occurence of trace_cpu_idle ?
> > 
> power_start/end are the APIs that were used initially
> and they are going to be deprecated in the upcoming kernel releases.
> trace_cpu_idle call is going to replace power start/end routines. 
> To maintain backward compatibility and uniformity, both the routines 
> have been used.
> (ref:https://lkml.org/lkml/2010/11/14/60ref:https://lkml.org/lkml/2010/11/14/60)

Backward compatible with what ? Userspace ? Do we care in that specific
case since it's a new feature ?

> > Wouldn't there be a way to start/end and then trace the snooze and
> > subsequent cede within the same start/end section or that makes no
> > sense ?
> > 
> We wanted to find the residency time of both Snooze as well as cede 
> separately. Knowing this will help us tweak our cpuidle code. So, both 
> have been captured separately.
> 
> > Also would there be any interest in doing the tracing more generically
> > in idle.c ?
> > 
> Yes, this tracing is already implemented for Intel platform. This would
> be a part of cpuidle framework. Going further, once the power cpuidle 
> framework is ported and ready, we will extend this trace there as well.
> (ref:https://lkml.org/lkml/2011/6/7/375)

So do we need to apply this patch at all since the cpuidle stuff is
happening too ?

Cheers,
Ben.

^ permalink raw reply

* Re: [RFC PATCH V1 1/7] cpuidle: create bootparam "cpuidle.off=1"
From: Trinabh Gupta @ 2011-06-21  4:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <1308284948.32158.6.camel@pasglop>



On 06/17/2011 09:59 AM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-06-07 at 21:59 +0530, Trinabh Gupta wrote:
>> From: Len Brown<len.brown@intel.com>
>>
>> useful for disabling cpuidle to fall back
>> to architecture-default idle loop
>>
>> cpuidle drivers and governors will fail to register.
>> on x86 they'll say so:
>>
>> intel_idle: intel_idle yielding to (null)
>> ACPI: acpi_idle yielding to (null)
>>
>> Signed-off-by: Len Brown<len.brown@intel.com>
>> ---

Hi Ben,

Thanks for the review.
>
> When you carry over somebody's patch like this you need to also add your
> own signed-off-by.

Ok, thanks
>
> Have those generic changes been reviewed by whoever is in charge of that
> cpuidle framework ?

These patches were posted by Len Brown himself who is ACPI, Intel Idle
cpuidle driver maintainer. He pulled in most of the patches that were
part of that series (https://lkml.org/lkml/2011/4/2/8)
in 3.0-rc1, but these few patches are still out there. These changes
(removal of pm_idle) have already been agreed upon as they were
initially reported by Peter Zijlstra himself
(http://lkml.org/lkml/2009/8/28/43).

Thanks
-Trinabh
>
> Cheers,
> Ben.
>
>>   Documentation/kernel-parameters.txt |    3 +++
>>   drivers/cpuidle/cpuidle.c           |   10 ++++++++++
>>   drivers/cpuidle/cpuidle.h           |    1 +
>>   drivers/cpuidle/driver.c            |    3 +++
>>   drivers/cpuidle/governor.c          |    3 +++
>>   5 files changed, 20 insertions(+), 0 deletions(-)
>>
>> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
>> index d9a203b..5697faf 100644
>> --- a/Documentation/kernel-parameters.txt
>> +++ b/Documentation/kernel-parameters.txt
>> @@ -546,6 +546,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>>   			/proc/<pid>/coredump_filter.
>>   			See also Documentation/filesystems/proc.txt.
>>
>> +	cpuidle.off=1	[CPU_IDLE]
>> +			disable the cpuidle sub-system
>> +
>>   	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
>>   			Format:
>>   			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
>> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
>> index 406be83..a171b9e 100644
>> --- a/drivers/cpuidle/cpuidle.c
>> +++ b/drivers/cpuidle/cpuidle.c
>> @@ -28,6 +28,12 @@ LIST_HEAD(cpuidle_detected_devices);
>>   static void (*pm_idle_old)(void);
>>
>>   static int enabled_devices;
>> +static int off __read_mostly;
>> +
>> +int cpuidle_disabled(void)
>> +{
>> +	return off;
>> +}
>>
>>   #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
>>   static void cpuidle_kick_cpus(void)
>> @@ -397,6 +403,9 @@ static int __init cpuidle_init(void)
>>   {
>>   	int ret;
>>
>> +	if (cpuidle_disabled())
>> +		return -ENODEV;
>> +
>>   	pm_idle_old = pm_idle;
>>
>>   	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
>> @@ -408,4 +417,5 @@ static int __init cpuidle_init(void)
>>   	return 0;
>>   }
>>
>> +module_param(off, int, 0444);
>>   core_initcall(cpuidle_init);
>> diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
>> index 33e50d5..38c3fd8 100644
>> --- a/drivers/cpuidle/cpuidle.h
>> +++ b/drivers/cpuidle/cpuidle.h
>> @@ -13,6 +13,7 @@ extern struct list_head cpuidle_governors;
>>   extern struct list_head cpuidle_detected_devices;
>>   extern struct mutex cpuidle_lock;
>>   extern spinlock_t cpuidle_driver_lock;
>> +extern int cpuidle_disabled(void);
>>
>>   /* idle loop */
>>   extern void cpuidle_install_idle_handler(void);
>> diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
>> index 33e3189..284d7af 100644
>> --- a/drivers/cpuidle/driver.c
>> +++ b/drivers/cpuidle/driver.c
>> @@ -50,6 +50,9 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
>>   	if (!drv)
>>   		return -EINVAL;
>>
>> +	if (cpuidle_disabled())
>> +		return -ENODEV;
>> +
>>   	spin_lock(&cpuidle_driver_lock);
>>   	if (cpuidle_curr_driver) {
>>   		spin_unlock(&cpuidle_driver_lock);
>> diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
>> index 724c164..ea2f8e7 100644
>> --- a/drivers/cpuidle/governor.c
>> +++ b/drivers/cpuidle/governor.c
>> @@ -81,6 +81,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
>>   	if (!gov || !gov->select)
>>   		return -EINVAL;
>>
>> +	if (cpuidle_disabled())
>> +		return -ENODEV;
>> +
>>   	mutex_lock(&cpuidle_lock);
>>   	if (__cpuidle_find_governor(gov->name) == NULL) {
>>   		ret = 0;
>>
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
>

^ permalink raw reply

* Re: [RFC PATCH V1 4/7] cpuidle: (powerpc) Add cpu_idle_wait() to allow switching idle routines
From: Trinabh Gupta @ 2011-06-21  6:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <1308285128.32158.8.camel@pasglop>



On 06/17/2011 10:02 AM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-06-07 at 22:00 +0530, Trinabh Gupta wrote:
>
>> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
>> index 39a2baa..932392b 100644
>> --- a/arch/powerpc/kernel/idle.c
>> +++ b/arch/powerpc/kernel/idle.c
>> @@ -102,6 +102,24 @@ void cpu_idle(void)
>>   	}
>>   }
>>
>> +static void do_nothing(void *unused)
>> +{
>> +}
>> +
>> +/*
>> + * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
>> + * idle loop and start using the new idle loop.
>> + * Required while changing idle handler on SMP systems.
>> + * Caller must have changed idle handler to the new value before the call.
>> + */
>> +void cpu_idle_wait(void)
>> +{
>> +	smp_mb();
>> +	/* kick all the CPUs so that they exit out of old idle routine */
>> +	smp_call_function(do_nothing, NULL, 1);
>> +}
>> +EXPORT_SYMBOL_GPL(cpu_idle_wait);
>> +
>>   int powersave_nap;
>>
>>   #ifdef CONFIG_SYSCTL
>
> This is gross :-)

Well this is what exists today for x86; so didn't think too
much into this. Maybe there is cleaner way. The requirement
is to completely exit the idle loop and call cpuidle_idle_call()
again. I think sending reschedule may be enough.

With respect to current implementation the arch-independent cpuidle
code needs a cpu_idle_wait() function for any architecture where
CONFIG_SMP is defined. This cpu_idle_wait function is called
whenever we have to pause usage of cpuidle; to switch driver
or governor etc. So maybe there is a cleaner implementation of
cpu_idle_wait instead of smp_call_function(do_nothing...); sending
reschedule may work.

Thanks
-Trinabh

>
> Do you need to absolutely ensure the idle task has changed or just
> kicking it with a send reschedule is enough ?


>
> Cheers,
> Ben.
>
>

^ permalink raw reply

* Re: [PATCH v3 2/2] powerpc: add support for MPIC message register API
From: Benjamin Herrenschmidt @ 2011-06-21  7:47 UTC (permalink / raw)
  To: Scott Wood
  Cc: Meador Inge, openmcapi-dev, devicetree-discuss, Hollis Blanchard,
	linuxppc-dev
In-Reply-To: <20110620105946.23708a43@schlenkerla.am.freescale.net>

On Mon, 2011-06-20 at 10:59 -0500, Scott Wood wrote:
> On Sat, 18 Jun 2011 08:58:53 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Fri, 2011-06-17 at 11:58 -0500, Scott Wood wrote:
> > > When did this change from "considered an internal implementation
> > > issue, and not really an interface" to "all new interfaces"?
> > 
> > Interesting blurb... that's not everybody's opinion and I would argue
> > that supporting AMP kernels isn't something we want to do with closed
> > source crap.
> 
> I'm not advocating "closed source crap", just that if something is
> "policy" (as opposed to opinion), it'd be nice if the documentation
> actually matched.

Well, in Linux, the line between opinion and policy is quite blurred.

I don't know for sure what Linus himself thinks here and various other
maintainers have expressed various opinions as well. As far as I'm
concerned, I don't see the point in encouraging binary junk, especially
for low level interfaces like this one.

Cheers,
Ben.
 

^ permalink raw reply

* Re: [PATCH 09/15] KVM: PPC: Add support for Book3S processors in hypervisor mode
From: Alexander Graf @ 2011-06-21  8:55 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm-ppc, kvm
In-Reply-To: <20110618083558.GJ28413@bloggs.ozlabs.ibm.com>


On 18.06.2011, at 10:35, Paul Mackerras wrote:

> This adds support for KVM running on 64-bit Book 3S processors,
> specifically POWER7, in hypervisor mode.  Using hypervisor mode means
> that the guest can use the processor's supervisor mode.  That means
> that the guest can execute privileged instructions and access =
privileged
> registers itself without trapping to the host.  This gives excellent
> performance, but does mean that KVM cannot emulate a processor
> architecture other than the one that the hardware implements.
>=20
> This code assumes that the guest is running paravirtualized using the
> PAPR (Power Architecture Platform Requirements) interface, which is =
the
> interface that IBM's PowerVM hypervisor uses.  That means that =
existing
> Linux distributions that run on IBM pSeries machines will also run
> under KVM without modification.  In order to communicate the PAPR
> hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
> to include/linux/kvm.h.
>=20
> Currently the choice between book3s_hv support and book3s_pr support
> (i.e. the existing code, which runs the guest in user mode) has to be
> made at kernel configuration time, so a given kernel binary can only
> do one or the other.
>=20
> This new book3s_hv code doesn't support MMIO emulation at present.
> Since we are running paravirtualized guests, this isn't a serious
> restriction.
>=20
> With the guest running in supervisor mode, most exceptions go straight
> to the guest.  We will never get data or instruction storage or =
segment
> interrupts, alignment interrupts, decrementer interrupts, program
> interrupts, single-step interrupts, etc., coming to the hypervisor =
from
> the guest.  Therefore this introduces a new KVMTEST_NONHV macro for =
the
> exception entry path so that we don't have to do the KVM test on entry
> to those exception handlers.
>=20
> We do however get hypervisor decrementer, hypervisor data storage,
> hypervisor instruction storage, and hypervisor emulation assist
> interrupts, so we have to handle those.
>=20
> In hypervisor mode, real-mode accesses can access all of RAM, not just
> a limited amount.  Therefore we put all the guest state in the =
vcpu.arch
> and use the shadow_vcpu in the PACA only for temporary scratch space.
> We allocate the vcpu with kzalloc rather than vzalloc, and we don't =
use
> anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
> We don't have a shared page with the guest, but we still need a
> kvm_vcpu_arch_shared struct to store the values of various registers,
> so we include one in the vcpu_arch struct.
>=20
> The POWER7 processor has a restriction that all threads in a core have
> to be in the same partition.  MMU-on kernel code counts as a partition
> (partition 0), so we have to do a partition switch on every entry to =
and
> exit from the guest.  At present we require the host and guest to run
> in single-thread mode because of this hardware restriction.
>=20
> This code allocates a hashed page table for the guest and initializes
> it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
> require that the guest memory is allocated using 16MB huge pages, in
> order to simplify the low-level memory management.  This also means =
that
> we can get away without tracking paging activity in the host for now,
> since huge pages can't be paged or swapped.
>=20
> Signed-off-by: Paul Mackerras <paulus@samba.org>
>=20


> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index b7baff7..3662ecc 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -20,7 +20,6 @@ config KVM
> 	bool
> 	select PREEMPT_NOTIFIERS
> 	select ANON_INODES
> -	select KVM_MMIO
>=20
> config KVM_BOOK3S_HANDLER
> 	bool
> @@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
> config KVM_BOOK3S_32_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> +	select KVM_MMIO
>=20
> config KVM_BOOK3S_64_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
>=20
> +config KVM_BOOK3S_PR
> +	bool
> +	select KVM_MMIO
> +
> config KVM_BOOK3S_32
> 	tristate "KVM support for PowerPC book3s_32 processors"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
> 	select KVM
> 	select KVM_BOOK3S_32_HANDLER
> +	select KVM_BOOK3S_PR

KVM_BOOK3S_32 is the equivalent to KVM_BOOK3S_64_PR, right? We should =
rename both or none to stay consistent.

> 	---help---
> 	  Support running unmodified book3s_32 guest kernels
> 	  in virtual machines on book3s_32 host processors.
> @@ -48,10 +53,38 @@ config KVM_BOOK3S_32
> 	  If unsure, say N.
>=20
> config KVM_BOOK3S_64
> -	tristate "KVM support for PowerPC book3s_64 processors"
> +	bool

This means that if a user has selected KVM in his config before, it's =
unset after. Is there any good way to keep users from doing that? Maybe =
we could define KVM_BOOK3S_64 to be the PR type and introduce a new =
option that fulfills the role of KVM_BOOK3S_64 as you intended it here?


Alex

^ permalink raw reply

* Re: [RFC PATCH V1 5/7] cpuidle: (POWER) cpuidle driver for pSeries
From: Trinabh Gupta @ 2011-06-21  9:00 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, linux-pm, linux-kernel
In-Reply-To: <1308285398.32158.12.camel@pasglop>



On 06/17/2011 10:06 AM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-06-07 at 22:00 +0530, Trinabh Gupta wrote:
>
>> +static int snooze_loop(struct cpuidle_device *dev,
>> +			struct cpuidle_driver *drv,
>> +			int index)
>> +{
>> +	unsigned long in_purr, out_purr;
>> +	ktime_t kt_before, kt_after;
>> +	s64 usec_delta;
>> +
>> +	/*
>> +	 * Indicate to the HV that we are idle. Now would be
>> +	 * a good time to find other work to dispatch.
>> +	 */
>> +	get_lppaca()->idle = 1;
>> +	get_lppaca()->donate_dedicated_cpu = 1;
>> +	in_purr = mfspr(SPRN_PURR);
>> +
>> +	kt_before = ktime_get_real();
>
> Don't you want to timestamp before you tell the HV that you are idle ?
> Or is the above stuff only polled by phyp when partition interrupts are
> enabled ?

Hi Ben,

Yes, timestamp should be before telling HV that we are idle. Thanks

>
>> +	local_irq_enable();
>> +	set_thread_flag(TIF_POLLING_NRFLAG);
>> +	while (!need_resched()) {
>> +		ppc64_runlatch_off();
>> +		HMT_low();
>> +		HMT_very_low();
>> +	}
>> +	HMT_medium();
>> +	clear_thread_flag(TIF_POLLING_NRFLAG);
>> +	smp_mb();
>> +	local_irq_disable();
>> +
>> +	kt_after = ktime_get_real();
>> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
>> +
>> +	out_purr = mfspr(SPRN_PURR);
>> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
>> +	get_lppaca()->donate_dedicated_cpu = 0;
>> +	get_lppaca()->idle = 0;
>> +
>> +	dev->last_residency = (int)usec_delta;
>> +
>> +	return index;
>> +}
>> +
>> +static int dedicated_cede_loop(struct cpuidle_device *dev,
>> +				struct cpuidle_driver *drv,
>> +				int index)
>> +{
>> +	unsigned long in_purr, out_purr;
>> +	ktime_t kt_before, kt_after;
>> +	s64 usec_delta;
>> +
>> +	/*
>> +	 * Indicate to the HV that we are idle. Now would be
>> +	 * a good time to find other work to dispatch.
>> +	 */
>> +	get_lppaca()->idle = 1;
>> +	get_lppaca()->donate_dedicated_cpu = 1;
>> +	in_purr = mfspr(SPRN_PURR);
>> +
>> +	kt_before = ktime_get_real();
>
> There's a bit too much code duplication for my taste here between the
> two functions. Not sure if it can be helped, maybe with some inlines
> for the prolog/epilogue ... Looks like stuff that's easy to "fix" in one
> place and forget the other...
>

Yes, I agree that there is too much code duplication in these idle
routines; will fix this.

Thanks
-Trinabh

>> +	ppc64_runlatch_off();
>> +	HMT_medium();
>> +	cede_processor();
>> +
>> +	kt_after = ktime_get_real();
>> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
>> +
>> +	out_purr = mfspr(SPRN_PURR);
>> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
>> +	get_lppaca()->donate_dedicated_cpu = 0;
>> +	get_lppaca()->idle = 0;
>> +
>> +	dev->last_residency = (int)usec_delta;
>> +
>> +	return index;
>> +}
>> +
>> +static int shared_cede_loop(struct cpuidle_device *dev,
>> +			struct cpuidle_driver *drv,
>> +			int index)
>> +{
>> +	unsigned long in_purr, out_purr;
>> +	ktime_t kt_before, kt_after;
>> +	s64 usec_delta;
>> +
>> +	/*
>> +	 * Indicate to the HV that we are idle. Now would be
>> +	 * a good time to find other work to dispatch.
>> +	 */
>> +	get_lppaca()->idle = 1;
>> +	get_lppaca()->donate_dedicated_cpu = 1;
>> +	in_purr = mfspr(SPRN_PURR);
>> +
>> +	kt_before = ktime_get_real();
>> +	/*
>> +	 * Yield the processor to the hypervisor.  We return if
>> +	 * an external interrupt occurs (which are driven prior
>> +	 * to returning here) or if a prod occurs from another
>> +	 * processor. When returning here, external interrupts
>> +	 * are enabled.
>> +	 */
>> +	cede_processor();
>> +
>> +	kt_after = ktime_get_real();
>> +
>> +	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));
>> +
>> +	out_purr = mfspr(SPRN_PURR);
>> +	get_lppaca()->wait_state_cycles += out_purr - in_purr;
>> +	get_lppaca()->donate_dedicated_cpu = 0;
>> +	get_lppaca()->idle = 0;
>> +
>> +	dev->last_residency = (int)usec_delta;
>> +
>> +	return index;
>> +}
>> +
>> +/*
>> + * States for dedicated partition case.
>> + */
>> +static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
>> +	{ /* Snooze */
>> +		.name = "snooze",
>> +		.desc = "snooze",
>> +		.flags = CPUIDLE_FLAG_TIME_VALID,
>> +		.exit_latency = 0,
>> +		.target_residency = 0,
>> +		.enter =&snooze_loop },
>> +	{ /* CEDE */
>> +		.name = "CEDE",
>> +		.desc = "CEDE",
>> +		.flags = CPUIDLE_FLAG_TIME_VALID,
>> +		.exit_latency = 1,
>> +		.target_residency = 10,
>> +		.enter =&dedicated_cede_loop },
>> +};
>> +
>> +/*
>> + * States for shared partition case.
>> + */
>> +static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
>> +	{ /* Shared Cede */
>> +		.name = "Shared Cede",
>> +		.desc = "Shared Cede",
>> +		.flags = CPUIDLE_FLAG_TIME_VALID,
>> +		.exit_latency = 0,
>> +		.target_residency = 0,
>> +		.enter =&shared_cede_loop },
>> +};
>> +
>> +int pseries_notify_cpuidle_add_cpu(int cpu)
>> +{
>> +	struct cpuidle_device *dev =
>> +			per_cpu_ptr(pseries_idle_cpuidle_devices, cpu);
>> +	if (dev&&  cpuidle_get_driver()) {
>> +		cpuidle_disable_device(dev);
>> +		cpuidle_enable_device(dev);
>> +	}
>> +	return 0;
>> +}
>> +
>> +/*
>> + * pseries_idle_cpuidle_driver_init()
>> + */
>> +static int pseries_idle_cpuidle_driver_init(void)
>> +{
>> +	int cstate;
>> +	struct cpuidle_driver *drv =&pseries_idle_driver;
>> +
>> +	drv->state_count = 0;
>> +
>> +	for (cstate = 0; cstate<  MAX_IDLE_STATE_COUNT; ++cstate) {
>> +
>> +		if (cstate>  max_cstate)
>> +			break;
>> +
>> +		/* is the state not enabled? */
>> +		if (cpuidle_state_table[cstate].enter == NULL)
>> +			continue;
>> +
>> +		drv->states[drv->state_count] =	/* structure copy */
>> +			cpuidle_state_table[cstate];
>> +
>> +		if (cpuidle_state_table == dedicated_states)
>> +			drv->states[drv->state_count].target_residency =
>> +				__get_cpu_var(smt_snooze_delay);
>> +
>> +		drv->state_count += 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/* pseries_idle_devices_uninit(void)
>> + * unregister cpuidle devices and de-allocate memory
>> + */
>> +static void pseries_idle_devices_uninit(void)
>> +{
>> +	int i;
>> +	struct cpuidle_device *dev;
>> +
>> +	for_each_possible_cpu(i) {
>> +		dev = per_cpu_ptr(pseries_idle_cpuidle_devices, i);
>> +		cpuidle_unregister_device(dev);
>> +	}
>> +
>> +	free_percpu(pseries_idle_cpuidle_devices);
>> +	return;
>> +}
>> +
>> +/* pseries_idle_devices_init()
>> + * allocate, initialize and register cpuidle device
>> + */
>> +static int pseries_idle_devices_init(void)
>> +{
>> +	int i;
>> +	struct cpuidle_driver *drv =&pseries_idle_driver;
>> +	struct cpuidle_device *dev;
>> +
>> +	pseries_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
>> +	if (pseries_idle_cpuidle_devices == NULL)
>> +		return -ENOMEM;
>> +
>> +	for_each_possible_cpu(i) {
>> +		dev = per_cpu_ptr(pseries_idle_cpuidle_devices, i);
>> +		dev->state_count = drv->state_count;
>> +		dev->cpu = i;
>> +		if (cpuidle_register_device(dev)) {
>> +			printk(KERN_DEBUG "cpuidle_register_device %d failed!\n",
>> +				 i);
>> +			return -EIO;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * pseries_idle_probe()
>> + * Choose state table for shared versus dedicated partition
>> + */
>> +static int pseries_idle_probe(void)
>> +{
>> +	if (max_cstate == 0) {
>> +		printk(KERN_DEBUG "pseries processor idle disabled.\n");
>> +		return -EPERM;
>> +	}
>> +
>> +	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
>> +		printk(KERN_DEBUG "Using default idle\n");
>> +		return -ENODEV;
>> +	}
>> +
>> +	if (get_lppaca()->shared_proc)
>> +		cpuidle_state_table = shared_states;
>> +	else
>> +		cpuidle_state_table = dedicated_states;
>> +
>> +	return 0;
>> +}
>> +
>> +static int __init pseries_processor_idle_init(void)
>> +{
>> +	int retval;
>> +
>> +	retval = pseries_idle_probe();
>> +	if (retval)
>> +		return retval;
>> +
>> +	pseries_idle_cpuidle_driver_init();
>> +	retval = cpuidle_register_driver(&pseries_idle_driver);
>> +	if (retval) {
>> +		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
>> +		return retval;
>> +	}
>> +
>> +	retval = pseries_idle_devices_init();
>> +	if (retval) {
>> +		pseries_idle_devices_uninit();
>> +		cpuidle_unregister_driver(&pseries_idle_driver);
>> +		return retval;
>> +	}
>> +
>> +	printk(KERN_DEBUG "pseries_idle_driver registered\n");
>> +
>> +	return 0;
>> +}
>> +
>> +device_initcall(pseries_processor_idle_init);
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index e9f6d28..7c60380 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -56,4 +56,7 @@ extern struct device_node *dlpar_configure_connector(u32);
>>   extern int dlpar_attach_node(struct device_node *);
>>   extern int dlpar_detach_node(struct device_node *);
>>
>> +/* Snooze Delay, pseries_idle */
>> +DECLARE_PER_CPU(long, smt_snooze_delay);
>> +
>>   #endif /* _PSERIES_PSERIES_H */
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 593acce..6893a0c 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -584,9 +584,6 @@ static int __init pSeries_probe(void)
>>   	return 1;
>>   }
>>
>> -
>> -DECLARE_PER_CPU(long, smt_snooze_delay);
>> -
>>   static void pseries_dedicated_idle_sleep(void)
>>   {
>>   	unsigned int cpu = smp_processor_id();
>> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
>> index fbffd7e..2e46883 100644
>> --- a/arch/powerpc/platforms/pseries/smp.c
>> +++ b/arch/powerpc/platforms/pseries/smp.c
>> @@ -150,6 +150,7 @@ static void __devinit smp_xics_setup_cpu(int cpu)
>>   	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
>>   	set_default_offline_state(cpu);
>>   #endif
>> +	pseries_notify_cpuidle_add_cpu(cpu);
>>   }
>>
>>   static int __devinit smp_pSeries_kick_cpu(int nr)
>>
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
>

^ permalink raw reply

* Re: [PATCH 0/15] Hypervisor-mode KVM on POWER7 and PPC970
From: Alexander Graf @ 2011-06-21  9:53 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm-ppc, kvm
In-Reply-To: <20110618082705.GA28413@bloggs.ozlabs.ibm.com>


On 18.06.2011, at 10:27, Paul Mackerras wrote:

> The following series of patches enable KVM to exploit the hardware
> hypervisor mode on 64-bit Power ISA Book3S machines.  At present,
> POWER7 and PPC970 processors are supported.  (Note that the PPC970
> processors in Apple G5 machines don't have a usable hypervisor mode
> and are not supported by these patches.)
>=20
> Running the KVM host in hypervisor mode means that the guest can use
> both supervisor mode and user mode.  That means that the guest can
> execute supervisor-privilege instructions and access supervisor-
> privilege registers.  In addition the hardware directs most exceptions
> to the guest.  Thus we don't need to emulate any instructions in the
> host.  Generally, the only times we need to exit the guest are when it
> does a hypercall or when an external interrupt or host timer
> (decrementer) interrupt occurs.
>=20
> The focus of this KVM implementation is to run guests that use the
> PAPR (Power Architecture Platform Requirements) paravirtualization
> interface, which is the interface supplied by PowerVM on IBM pSeries
> machines.  Currently the "pseries" machine type in qemu is only
> supported by book3s_hv KVM, and book3s_hv KVM only supports the
> "pseries" machine type.  That will hopefully change in future.
>=20
> These patches are against Alex Graf's kvm-ppc-next branch.

Could you please also add some mechanism that sets the guest machine =
type? We currently support 2:

  * bare metal
  * PAPR

BOOK3S_PR can't do PAPR (yet), BOOK3S_HV can't do bare metal (ever). =
User space needs to know about these limitations, so it can set the type =
accordingly and fail to start if kernel space can't provide it.

Since we might end up supporting PAPR even with _pr, it can't just be as =
simple as a CAP bitmap, since we really need to set the type.


Alex

^ permalink raw reply

* [PATCH] crypto: crypto4xx - Perform read/modify/write on device control register
From: Josh Boyer @ 2011-06-21 12:13 UTC (permalink / raw)
  To: James Hsiao, linux-crypto; +Cc: linuxppc-dev, Herbert Xu

The Security function on the AMCC SoCs has multiple engines within a
single MMIO range.  The crypto driver currently enables the 3DES
functionality by doing a blind write to the device control register.
This can unintentionally disable other functions like the PKA or TRNG
when the driver is loaded.

Perform a read/modify/write to enable the 3DES function instead.

Signed-off-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>

---

diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c
index 1891252..1d103f9 100644
--- a/drivers/crypto/amcc/crypto4xx_core.c
+++ b/drivers/crypto/amcc/crypto4xx_core.c
@@ -51,6 +51,7 @@ static void crypto4xx_hw_init(struct crypto4xx_device *dev)
 	union ce_io_threshold io_threshold;
 	u32 rand_num;
 	union ce_pe_dma_cfg pe_dma_cfg;
+	u32 device_ctrl;
 
 	writel(PPC4XX_BYTE_ORDER, dev->ce_base + CRYPTO4XX_BYTE_ORDER_CFG);
 	/* setup pe dma, include reset sg, pdr and pe, then release reset */
@@ -84,7 +85,9 @@ static void crypto4xx_hw_init(struct crypto4xx_device *dev)
 	writel(ring_size.w, dev->ce_base + CRYPTO4XX_RING_SIZE);
 	ring_ctrl.w = 0;
 	writel(ring_ctrl.w, dev->ce_base + CRYPTO4XX_RING_CTRL);
-	writel(PPC4XX_DC_3DES_EN, dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
+	device_ctrl = readl(dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
+	device_ctrl |= PPC4XX_DC_3DES_EN;
+	writel(device_ctrl, dev->ce_base + CRYPTO4XX_DEVICE_CTRL);
 	writel(dev->gdr_pa, dev->ce_base + CRYPTO4XX_GATH_RING_BASE);
 	writel(dev->sdr_pa, dev->ce_base + CRYPTO4XX_SCAT_RING_BASE);
 	part_ring_size.w = 0;

^ permalink raw reply related

* [PATCH] hwrng: ppc4xx - add support for ppc4xx TRNG
From: Josh Boyer @ 2011-06-21 12:19 UTC (permalink / raw)
  To: Matt Mackall, Herbert Xu; +Cc: linuxppc-dev

Various PowerPC 4xx SoCs contain a TRNG embedded in the Security function.
This adds a device driver for that TRNG.

Signed-off-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>

---
 drivers/char/hw_random/Kconfig      |   12 +++
 drivers/char/hw_random/Makefile     |    1 +
 drivers/char/hw_random/ppc4xx-rng.c |  156 +++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 0 deletions(-)
 create mode 100644 drivers/char/hw_random/ppc4xx-rng.c

diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index a60043b..1d2ebc7 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -210,3 +210,15 @@ config HW_RANDOM_PICOXCELL
 	  module will be called picoxcell-rng.
 
 	  If unsure, say Y.
+
+config HW_RANDOM_PPC4XX
+	tristate "PowerPC 4xx generic true random number generator support"
+	depends on HW_RANDOM && PPC && 4xx
+	---help---
+	 This driver provides the kernel-side support for the TRNG hardware
+	 found in the security function of some PowerPC 4xx SoCs.
+
+	 To compile this driver as a module, choose M here: the
+	 module will be called ppc4xx-rng.
+
+	 If unsure, say N.
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index 3db4eb8..c88f244 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HW_RANDOM_MXC_RNGA) += mxc-rnga.o
 obj-$(CONFIG_HW_RANDOM_OCTEON) += octeon-rng.o
 obj-$(CONFIG_HW_RANDOM_NOMADIK) += nomadik-rng.o
 obj-$(CONFIG_HW_RANDOM_PICOXCELL) += picoxcell-rng.o
+obj-$(CONFIG_HW_RANDOM_PPC4XX) += ppc4xx-rng.o
diff --git a/drivers/char/hw_random/ppc4xx-rng.c b/drivers/char/hw_random/ppc4xx-rng.c
new file mode 100644
index 0000000..b8afa6a
--- /dev/null
+++ b/drivers/char/hw_random/ppc4xx-rng.c
@@ -0,0 +1,156 @@
+/*
+ * Generic PowerPC 44x RNG driver
+ *
+ * Copyright 2011 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/hw_random.h>
+#include <linux/delay.h>
+#include <linux/of_platform.h>
+#include <asm/io.h>
+
+#define PPC4XX_TRNG_DEV_CTRL 0x60080
+
+#define PPC4XX_TRNGE 0x00020000
+#define PPC4XX_TRNG_CTRL 0x0008
+#define PPC4XX_TRNG_CTRL_DALM 0x20
+#define PPC4XX_TRNG_STAT 0x0004
+#define PPC4XX_TRNG_STAT_B 0x1
+#define PPC4XX_TRNG_DATA 0x0000
+
+#define MODULE_NAME "ppc4xx_rng"
+
+static int ppc4xx_rng_data_present(struct hwrng *rng, int wait)
+{
+	void __iomem *rng_regs = (void __iomem *) rng->priv;
+	int busy, i, present = 0;
+
+	for (i = 0; i < 20; i++) {
+		busy = (in_le32(rng_regs + PPC4XX_TRNG_STAT) & PPC4XX_TRNG_STAT_B);
+		if (!busy || !wait) {
+			present = 1;
+			break;
+		}
+		udelay(10);
+	}
+	return present;
+}
+
+static int ppc4xx_rng_data_read(struct hwrng *rng, u32 *data)
+{
+	void __iomem *rng_regs = (void __iomem *) rng->priv;
+	*data = in_le32(rng_regs + PPC4XX_TRNG_DATA);
+	return 4;
+}
+
+static int ppc4xx_rng_enable(int enable)
+{
+	struct device_node *ctrl;
+	void __iomem *ctrl_reg;
+	int err = 0;
+	u32 val;
+
+	/* Find the main crypto device node and map it to turn the TRNG on */
+	ctrl = of_find_compatible_node(NULL, NULL, "amcc,ppc4xx-crypto");
+	if (!ctrl)
+		return -ENODEV;
+
+	ctrl_reg = of_iomap(ctrl, 0);
+	if (!ctrl_reg) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	val = in_le32(ctrl_reg + PPC4XX_TRNG_DEV_CTRL);
+
+	if (enable)
+		val |= PPC4XX_TRNGE;
+	else
+		val = val & ~PPC4XX_TRNGE;
+
+	out_le32(ctrl_reg + PPC4XX_TRNG_DEV_CTRL, val);
+	iounmap(ctrl_reg);
+
+out:
+	of_node_put(ctrl);
+
+	return err;
+}
+
+static struct hwrng ppc4xx_rng = {
+	.name = MODULE_NAME,
+	.data_present = ppc4xx_rng_data_present,
+	.data_read = ppc4xx_rng_data_read,
+};
+
+static int __devinit ppc4xx_rng_probe(struct platform_device *dev)
+{
+	void __iomem *rng_regs;
+	int err = 0;
+
+	rng_regs = of_iomap(dev->dev.of_node, 0);
+	if (!rng_regs)
+		return -ENODEV;
+
+	err = ppc4xx_rng_enable(1);
+	if (err)
+		return err;
+
+	out_le32(rng_regs + PPC4XX_TRNG_CTRL, PPC4XX_TRNG_CTRL_DALM);
+	ppc4xx_rng.priv = (unsigned long) rng_regs;
+
+	err = hwrng_register(&ppc4xx_rng);
+
+	return err;
+}
+
+static int __devexit ppc4xx_rng_remove(struct platform_device *dev)
+{
+	void __iomem *rng_regs = (void __iomem *) ppc4xx_rng.priv;
+
+	hwrng_unregister(&ppc4xx_rng);
+	ppc4xx_rng_enable(0);
+	iounmap(rng_regs);
+
+	return 0;
+}
+
+static struct of_device_id ppc4xx_rng_match[] = {
+	{ .compatible = "ppc4xx-rng", },
+	{ .compatible = "amcc,ppc460ex-rng", },
+	{ .compatible = "amcc,ppc440epx-rng", },
+	{},
+};
+
+static struct platform_driver ppc4xx_rng_driver = {
+	.driver = {
+		.name = MODULE_NAME,
+		.owner = THIS_MODULE,
+		.of_match_table = ppc4xx_rng_match,
+	},
+	.probe = ppc4xx_rng_probe,
+	.remove = ppc4xx_rng_remove,
+};
+
+static int __init ppc4xx_rng_init(void)
+{
+	return platform_driver_register(&ppc4xx_rng_driver);
+}
+module_init(ppc4xx_rng_init);
+
+static void __exit ppc4xx_rng_exit(void)
+{
+	platform_driver_unregister(&ppc4xx_rng_driver);
+}
+module_exit(ppc4xx_rng_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Josh Boyer <jwboyer@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("HW RNG driver for PPC 4xx processors");
-- 
1.7.4.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox