LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 03/11] KVM: PPC: Allow use of small pages to back guest memory
From: Paul Mackerras @ 2011-11-16 22:58 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

From: Nishanth Aravamudan <nacc@us.ibm.com>

This puts the page frame numbers for the memory backing the guest in
the slot->rmap array for each slot, rather than using the ram_pginfo
array.  Since the rmap array is vmalloc'd, we use real_vmalloc_addr()
to access it from real mode in kvmppc_h_enter().
The rmap array contains one PFN for each small page, even if the
backing memory consists of large pages.

This lets us get rid of the ram_pginfo array.

[paulus@samba.org - Cleaned up and reorganized a bit, abstracted out
HPTE page size encoding functions, added a check that the memory being
added in kvmppc_core_prepare_memory_region() is all in one VMA.]

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h |    8 --
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   47 +++++++----
 arch/powerpc/kvm/book3s_hv.c        |  153 +++++++++++++++++------------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   90 ++++++++++----------
 4 files changed, 151 insertions(+), 147 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 56f7046..52fd741 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -145,11 +145,6 @@ struct kvmppc_exit_timing {
 	};
 };
 
-struct kvmppc_pginfo {
-	unsigned long pfn;
-	atomic_t refcnt;
-};
-
 struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
@@ -179,17 +174,14 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
-	unsigned long ram_npages;
 	unsigned long ram_psize;
 	unsigned long ram_porder;
-	struct kvmppc_pginfo *ram_pginfo;
 	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
-	int n_rma_pages;
 	unsigned long lpcr;
 	unsigned long rmor;
 	struct kvmppc_rma_info *rma;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2b9b8be..bed6c61 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -34,8 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 
-/* Pages in the VRMA are 16MB pages */
-#define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
 
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
@@ -95,19 +93,33 @@ void kvmppc_free_hpt(struct kvm *kvm)
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
 }
 
+/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
+}
+
+/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize == 0x10000) ? 0x1000 : 0;
+}
+
 void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 {
 	unsigned long i;
-	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long npages;
 	unsigned long pfn;
 	unsigned long *hpte;
-	unsigned long hash;
+	unsigned long addr, hash;
+	unsigned long psize = kvm->arch.ram_psize;
 	unsigned long porder = kvm->arch.ram_porder;
 	struct revmap_entry *rev;
-	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+	struct kvm_memory_slot *memslot;
+	unsigned long hp0, hp1;
 
-	if (!pginfo)
-		return;
+	memslot = &kvm->memslots->memslots[mem->slot];
+	npages = memslot->npages >> (porder - PAGE_SHIFT);
 
 	/* VRMA can't be > 1TB */
 	if (npages > 1ul << (40 - porder))
@@ -116,10 +128,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	if (npages > HPT_NPTEG)
 		npages = HPT_NPTEG;
 
+	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize) | HPTE_V_VALID;
+	hp1 = hpte1_pgsize_encoding(psize) |
+		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+
 	for (i = 0; i < npages; ++i) {
-		pfn = pginfo[i].pfn;
+		pfn = memslot->rmap[i << (porder - PAGE_SHIFT)];
 		if (!pfn)
-			break;
+			continue;
+		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
@@ -131,17 +149,14 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		hash = (hash << 3) + 7;
 		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4));
 		/* HPTE low word - RPN, protection, etc. */
-		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
-			HPTE_R_M | PP_RWXX;
+		hpte[1] = hp1 | (pfn << PAGE_SHIFT);
 		smp_wmb();
-		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
-			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
-			HPTE_V_LARGE | HPTE_V_VALID;
+		/* HPTE high word - virtual address, bolted, valid, large */
+		hpte[0] = hp0 | ((addr >> 16) & ~0x7fUL);
 
 		/* Reverse map info */
 		rev = &kvm->arch.revmap[hash];
-		rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C |
-			HPTE_R_M | PP_RWXX;
+		rev->guest_rpte = hp1 | addr;
 	}
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d1f0774..bc512ef 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -47,14 +47,7 @@
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
-
-/*
- * For now, limit memory to 64GB and require it to be large pages.
- * This value is chosen because it makes the ram_pginfo array be
- * 64kB in size, which is about as large as we want to be trying
- * to allocate with kmalloc.
- */
-#define MAX_MEM_ORDER		36
+#include <linux/hugetlb.h>
 
 #define LARGE_PAGE_ORDER	24	/* 16MB pages */
 
@@ -149,6 +142,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	unsigned long pg_offset;
 	void *va;
 	struct kvm_vcpu *tvcpu;
+	struct kvm_memory_slot *memslot;
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
@@ -162,13 +156,14 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 		if (vpa & 0x7f)
 			return H_PARAMETER;
 		/* registering new area; convert logical addr to real */
-		pg_index = vpa >> kvm->arch.ram_porder;
-		pg_offset = vpa & (kvm->arch.ram_psize - 1);
-		if (pg_index >= kvm->arch.ram_npages)
+		pg_index = vpa >> PAGE_SHIFT;
+		pg_offset = vpa & (PAGE_SIZE - 1);
+		memslot = gfn_to_memslot(kvm, pg_index);
+		if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 			return H_PARAMETER;
-		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+		ra = memslot->rmap[pg_index - memslot->base_gfn] << PAGE_SHIFT;
+		if (!ra)
 			return H_PARAMETER;
-		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
 		ra |= pg_offset;
 		va = __va(ra);
 		if (flags <= 1)
@@ -1079,13 +1074,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_userspace_memory_region *mem)
 {
 	unsigned long psize, porder;
-	unsigned long i, npages, totalpages;
-	unsigned long pg_ix;
-	struct kvmppc_pginfo *pginfo;
+	unsigned long i, npages;
 	struct kvmppc_rma_info *ri = NULL;
 	struct vm_area_struct *vma;
 	struct page *page;
 	unsigned long hva;
+	unsigned long lpcr;
 
 	/*
 	 * This could be an attempt at adding memory or it could be MMIO
@@ -1098,6 +1092,13 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (!vma || vma->vm_start > mem->userspace_addr)
 		goto err_unlock;
 
+	/* For now require the memory to be in one vma */
+	if (mem->userspace_addr + mem->memory_size > vma->vm_end) {
+		pr_err("not one vma %llx > %lx\n",
+		       mem->userspace_addr + mem->memory_size, vma->vm_end);
+		goto err_unlock;
+	}
+
 	/* Anything with VM_IO will be handled as MMIO pass-through */
 	if (vma->vm_flags & VM_IO) {
 		unsigned long offset = mem->userspace_addr - vma->vm_start;
@@ -1125,6 +1126,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		return 0;
 	}
 
+	psize = vma_kernel_pagesize(vma);
+	porder = __ilog2(psize);
+
 	/* Is this one of our preallocated RMAs? */
 	if (mem->guest_phys_addr == 0) {
 		if (vma && vma->vm_file &&
@@ -1135,9 +1139,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 	up_read(&current->mm->mmap_sem);
 
-	/* For now, only allow 16MB pages for memory */
-	porder = LARGE_PAGE_ORDER;
-	psize = 1ul << porder;
 	if ((mem->memory_size & (psize - 1)) ||
 	    (mem->guest_phys_addr & (psize - 1))) {
 		pr_err("bad memory_size=%llx @ %llx\n",
@@ -1145,30 +1146,43 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		return -EINVAL;
 	}
 
-	npages = mem->memory_size >> porder;
-	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
-
-	/* More memory than we have space to track? */
-	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
-		return -EINVAL;
-
 	/* Do we already have an RMA registered? */
 	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
 		return -EINVAL;
 
-	if (totalpages > kvm->arch.ram_npages)
-		kvm->arch.ram_npages = totalpages;
+	if (!ri && mem->guest_phys_addr == 0) {
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("CPU requires an RMO\n");
+			return -EINVAL;
+		}
 
-	if (!ri && mem->guest_phys_addr == 0 &&
-	    cpu_has_feature(CPU_FTR_ARCH_201)) {
-		pr_err("CPU requires an RMO\n");
-		return -EINVAL;
+		/* We can handle 4k, 64k and 16M pages in the VRMA */
+		if (!(psize == 0x1000 || psize == 0x1000000 ||
+		      (psize == 0x10000 && cpu_has_feature(CPU_FTR_ARCH_206))))
+			return -EINVAL;
+		lpcr = kvm->arch.lpcr;
+		switch (porder) {
+		case 12:
+			lpcr &= ~(LPCR_VRMA_L);
+			break;
+		case 16:
+			lpcr |= (LPCR_VRMA_L | LPCR_VRMA_LP1);
+			break;
+		case 24:
+			lpcr |= LPCR_VRMA_L;
+			break;
+		}
+		kvm->arch.lpcr = lpcr;
+	}
+
+	if (!ri && psize < kvm->arch.ram_psize) {
+		kvm->arch.ram_psize = psize;
+		kvm->arch.ram_porder = porder;
 	}
 
 	/* Handle pre-allocated RMAs */
 	if (ri) {
 		unsigned long rma_size;
-		unsigned long lpcr;
 		long rmls;
 
 		rma_size = ri->npages << PAGE_SHIFT;
@@ -1181,7 +1195,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		}
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
-		kvm->arch.n_rma_pages = rma_size >> porder;
 
 		/* Update LPCR and RMOR */
 		lpcr = kvm->arch.lpcr;
@@ -1205,28 +1218,15 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
 
-	pg_ix = mem->guest_phys_addr >> porder;
-	pginfo = kvm->arch.ram_pginfo + pg_ix;
-	for (i = 0; i < npages; ++i, ++pg_ix) {
-		if (ri && pg_ix < kvm->arch.n_rma_pages) {
-			pginfo[i].pfn = ri->base_pfn +
-				(pg_ix << (porder - PAGE_SHIFT));
-			continue;
-		}
-		hva = mem->userspace_addr + (i << porder);
+	npages = mem->memory_size >> PAGE_SHIFT;
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << PAGE_SHIFT);
 		page = hva_to_page(hva);
 		if (!page) {
 			pr_err("oops, no pfn for hva %lx\n", hva);
 			goto err;
 		}
-		/* Check it's a 16MB page */
-		if (!PageHead(page) ||
-		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
-			pr_err("page at %lx isn't 16MB (o=%d)\n",
-			       hva, compound_order(page));
-			goto err;
-		}
-		pginfo[i].pfn = page_to_pfn(page);
+		memslot->rmap[i] = page_to_pfn(page);
 	}
 
 	return 0;
@@ -1248,8 +1248,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
-	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
-	long err = -ENOMEM;
 	unsigned long lpcr;
 
 	/* Allocate hashed page table */
@@ -1259,19 +1257,9 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
-	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
-				       GFP_KERNEL);
-	if (!kvm->arch.ram_pginfo) {
-		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		goto out_free;
-	}
-
-	kvm->arch.ram_npages = 0;
-	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;	/* max page size */
 	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
 	kvm->arch.rma = NULL;
-	kvm->arch.n_rma_pages = 0;
 
 	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
@@ -1298,25 +1286,34 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	kvm->arch.lpcr = lpcr;
 
 	return 0;
-
- out_free:
-	kvmppc_free_hpt(kvm);
-	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-	struct kvmppc_pginfo *pginfo;
-	unsigned long i;
-
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
-			if (pginfo[i].pfn)
-				put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	unsigned long i, j, npages;
+	unsigned long *rmap;
+	struct page *page;
+
+	slots = kvm_memslots(kvm);
+	for (i = 0; i < slots->nmemslots; i++) {
+		memslot = &slots->memslots[i];
+		rmap = memslot->rmap;
+		npages = memslot->npages;
+
+		if ((memslot->flags & KVM_MEMSLOT_INVALID) || !rmap)
+			continue;
+		for (j = 0; j < npages; j++) {
+			if (rmap[j]) {
+				page = pfn_to_page(rmap[j]);
+				if (PageHuge(page))
+					page = compound_head(page);
+				put_page(page);
+			}
+		}
 	}
+
 	if (kvm->arch.rma) {
 		kvm_release_rma(kvm->arch.rma);
 		kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2da8fac..b82da85 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -61,10 +61,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long i, lpn, pa, gpa, psize;
+	unsigned long i, pa, gpa, gfn, psize;
 	unsigned long *hpte;
 	struct revmap_entry *rev;
 	unsigned long g_ptel = ptel;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap_entry;
 
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
@@ -108,59 +110,57 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	 * first check for RAM pages
 	 */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
-	if ((gpa >> kvm->arch.ram_porder) < kvm->arch.ram_npages) {
-		lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
-		if (porder > kvm->arch.ram_porder)
-			return H_PARAMETER;
-		pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
-		if (!pa)
-			return H_PARAMETER;
-		/* Check WIMG */
-		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = builtin_gfn_to_memslot(kvm, gfn);
+	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
+		unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
+
+		/* Check if the requested page fits entirely in the memslot. */
+		if ((egfn - memslot->base_gfn) > memslot->npages)
 			return H_PARAMETER;
-		ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
-		ptel |= pa;
-	} else {
-		struct kvm_memory_slot *memslot;
-
-		/* Check WIMG */
-		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
-		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
-			return H_PARAMETER;		
-
-		/* Else check for MMIO pass-through */
-		memslot = builtin_gfn_to_memslot(kvm, gpa >> PAGE_SHIFT);
-		if (memslot && memslot->flags & KVM_MEMSLOT_IO) {
-			unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
-
-			/* Check if the requested page fits entirely in
-			 * the memslot and check if the start pfn fits
-			 * out page size alignment
-			 */
-			if ((egfn - memslot->base_gfn) > memslot->npages)
-				return H_PARAMETER;
+
+		/* Check for MMIO pass-through */
+		if (memslot->flags & KVM_MEMSLOT_IO) {
+			/* check if the start pfn has page size alignment */
 			pa = kvm->arch.io_slot_pfn[memslot->id] << PAGE_SHIFT;
 			pa += gpa - (memslot->base_gfn << PAGE_SHIFT);
 			if (pa & (psize - 1))
 				return H_PARAMETER;
 
-			/* Make up HPTE */
-			ptel &= ~(HPTE_R_PP0 - psize);
-			ptel |= pa;
+			/* Check WIMG */
+			if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+			    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+				return H_PARAMETER;		
+		} else {
+			/* System RAM */
+			if (porder > kvm->arch.ram_porder)
+				return H_PARAMETER;
+			rmap_entry = &memslot->rmap[gfn - memslot->base_gfn];
+			rmap_entry = real_vmalloc_addr(rmap_entry);
+			pa = *rmap_entry << PAGE_SHIFT;
+			if (!pa)
+				return H_PARAMETER;
+
+			/* Check WIMG */
+			if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+				return H_PARAMETER;
 		}
+		ptel &= ~(HPTE_R_PP0 - psize);
+		ptel |= pa;
+
+	} else {
 		/* Else check for MMIO emulation */
-		else if (cpu_has_feature(CPU_FTR_ARCH_206)) {
-			/* Leave RPN intact */
-
-			/* We force no-execute and set key to 1 to cause
-			 * faults on access.
-			 * XXX Should we instead just return H_PARAMETER if
-			 * N isn't already set ?
-			 */
-			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
-		} else
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
 			return H_PARAMETER;
+
+		/* Leave RPN intact */
+		/* We force no-execute and set key to 1 to cause
+		 * faults on access.
+		 * XXX Should we instead just return H_PARAMETER if
+		 * N isn't already set ?
+		 */
+		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
 	pteh &= ~0x60UL;
 	
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 02/11] KVM: PPC: Keep a record of HV guest view of hashed page table entries
From: Paul Mackerras @ 2011-11-16 22:56 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This adds an array that parallels the guest hashed page table (HPT);
that is, it has one entry per HPTE, which stores the guest's view
of the second doubleword of the corresponding HPTE.  The first
doubleword in the HPTE is the same as the guest's idea of it, so we
don't need to store a copy, but the second doubleword in the HPTE has
the real page number rather than the guest's logical page number.
This allows us to remove the back_translate() and reverse_xlate()
functions.

This "reverse mapping" array is vmalloc'd, meaning that to access it
in real mode we have to walk the kernel's page tables explicitly.
That is done by the new real_vmalloc_addr() function.  (In fact this
returns an address in the linear mapping, so the result is usable
both in real mode and in virtual mode.)

This also corrects a couple of bugs in kvmppc_mmu_get_pp_value().

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   20 +++++
 arch/powerpc/include/asm/kvm_host.h      |   10 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  136 +++++++++++++-----------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |   95 +++++++++++++--------
 4 files changed, 147 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 53692c2..63542dd 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -29,6 +29,14 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 
 #define SPAPR_TCE_SHIFT		12
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_NPTE	(HPT_NPTEG << 3)		/* 8 PTEs per PTEG */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+#endif
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
@@ -86,4 +94,16 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+	/* only handle 4k, 64k and 16M pages for now */
+	if (!(h & HPTE_V_LARGE))
+		return 1ul << 12;		/* 4k page */
+	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
+		return 1ul << 16;		/* 64k page */
+	if ((l & 0xff000) == 0)
+		return 1ul << 24;		/* 16M page */
+	return 0;				/* error */
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f142a2d..56f7046 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -166,9 +166,19 @@ struct kvmppc_rma_info {
 	atomic_t 	 use_count;
 };
 
+/*
+ * The reverse mapping array has one entry for each HPTE,
+ * which stores the guest's view of the second word of the HPTE
+ * (including the guest physical address of the mapping).
+ */
+struct revmap_entry {
+	unsigned long guest_rpte;
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
+	struct revmap_entry *revmap;
 	unsigned long ram_npages;
 	unsigned long ram_psize;
 	unsigned long ram_porder;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index da8c2f4..2b9b8be 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -23,6 +23,7 @@
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -33,11 +34,6 @@
 #include <asm/ppc-opcode.h>
 #include <asm/cputable.h>
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
-
 /* Pages in the VRMA are 16MB pages */
 #define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
@@ -51,7 +47,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 {
 	unsigned long hpt;
 	unsigned long lpid;
+	struct revmap_entry *rev;
 
+	/* Allocate guest's hashed page table */
 	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
 			       HPT_ORDER - PAGE_SHIFT);
 	if (!hpt) {
@@ -60,12 +58,20 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 	}
 	kvm->arch.hpt_virt = hpt;
 
+	/* Allocate reverse map array */
+	rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
+	if (!rev) {
+		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
+		goto out_freehpt;
+	}
+	kvm->arch.revmap = rev;
+
+	/* Allocate the guest's logical partition ID */
 	do {
 		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
 		if (lpid >= NR_LPIDS) {
 			pr_err("kvm_alloc_hpt: No LPIDs free\n");
-			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
-			return -ENOMEM;
+			goto out_freeboth;
 		}
 	} while (test_and_set_bit(lpid, lpid_inuse));
 
@@ -74,11 +80,18 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
+
+ out_freeboth:
+	vfree(rev);
+ out_freehpt:
+	free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+	return -ENOMEM;
 }
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
 	clear_bit(kvm->arch.lpid, lpid_inuse);
+	vfree(kvm->arch.revmap);
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
 }
 
@@ -89,14 +102,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	unsigned long pfn;
 	unsigned long *hpte;
 	unsigned long hash;
+	unsigned long porder = kvm->arch.ram_porder;
+	struct revmap_entry *rev;
 	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
 
 	if (!pginfo)
 		return;
 
 	/* VRMA can't be > 1TB */
-	if (npages > 1ul << (40 - kvm->arch.ram_porder))
-		npages = 1ul << (40 - kvm->arch.ram_porder);
+	if (npages > 1ul << (40 - porder))
+		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
 	if (npages > HPT_NPTEG)
 		npages = HPT_NPTEG;
@@ -113,15 +128,20 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		 * at most one HPTE per HPTEG, we just assume entry 7
 		 * is available and use it.
 		 */
-		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
-		hpte += 7 * 2;
+		hash = (hash << 3) + 7;
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4));
 		/* HPTE low word - RPN, protection, etc. */
 		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
 			HPTE_R_M | PP_RWXX;
-		wmb();
+		smp_wmb();
 		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
 			HPTE_V_LARGE | HPTE_V_VALID;
+
+		/* Reverse map info */
+		rev = &kvm->arch.revmap[hash];
+		rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
 	}
 }
 
@@ -192,22 +212,6 @@ static unsigned int kvmppc_mmu_book3s_hv_slb_pshift(struct kvmppc_slb *slbe)
 	return 12;     	/* Unsupported */
 }
 
-static unsigned long back_translate(struct kvm *kvm, unsigned long ra)
-{
-	unsigned long offset, rpn, i;
-
-	/* XXX handle MMIO  */
-	offset = ra & (kvm->arch.ram_psize - 1);
-	rpn = (ra - offset) >> PAGE_SHIFT;
-	for (i = 0; i < kvm->arch.ram_npages; ++i)
-		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << kvm->arch.ram_porder) + offset;
-
-	/* Error value */
-	return -1ull;
-}
-
-
 static char pp_read_perm[16] = {
 	/* key = 0 */	1, 1, 1, 1, 0, 0, 1, 0,
 	/* key = 1 */	0, 1, 1, 1, 0, 0, 0, 0
@@ -224,7 +228,7 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 	unsigned int i;
 	unsigned int pshift;
 	unsigned long somask;
-	unsigned long vsid, hash;
+	unsigned long vsid, hash, index;
 	unsigned long avpn;
 	unsigned long *hpte;
 
@@ -252,7 +256,7 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));	
 
 		for (i = 0; i < 16; i += 2) {
-			unsigned long oldv, v, r;
+			unsigned long oldv, v, r, gr;
 
 			/* Read the PTE racily */
 			oldv = hpte[i] & ~HPTE_V_HVLOCK;
@@ -267,6 +271,8 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 				cpu_relax();
 			v = hpte[i];
 			r = hpte[i+1];
+			index = (hash << 3) + (i >> 1);
+			gr = kvm->arch.revmap[index].guest_rpte;
 
 			/* Unlock the HPTE */
 			asm volatile("lwsync" : : : "memory");
@@ -280,7 +286,8 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 			}
 			ret[0] = v;
 			ret[1] = r;
-			return 1;
+			ret[2] = gr;
+			return index;
 		}
 
 		if (avpn & HPTE_V_SECONDARY)
@@ -288,32 +295,20 @@ static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
 		avpn |= HPTE_V_SECONDARY;
 		hash = hash ^ HPT_HASH_MASK;
 	}
-	return 0;
+	return -1;
 }
 
-static unsigned long kvmppc_mmu_get_real_addr(unsigned long hpte[2],
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 			unsigned long ea)
 {
-	unsigned int hpshift;
-	unsigned long r = hpte[1];
 	unsigned long ra_mask;
 
-	/* Get page size */
-	hpshift = 12;
-	if (hpte[0] & HPTE_V_LARGE) {
-		if ((r & 0xf000) == 0x1000)
-			hpshift = 16;
-		else if ((r & 0xff000) == 0)
-			hpshift = 24;
-		/* XXX TODO: Add 16G */
-	}
-	ra_mask = (1 << hpshift) - 1;
-
+	ra_mask = hpte_page_size(v, r) - 1;
 	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 }
 
 static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
-			struct kvmppc_slb *slbe, unsigned long hpte[2])
+			struct kvmppc_slb *slbe, unsigned long hpte_r)
 {
 	unsigned int key, pp;
 
@@ -322,8 +317,8 @@ static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
 	else 
 		key = slbe->origv & SLB_VSID_KS;
 
-	pp = hpte[0] & HPTE_R_PP;
-	if (pp & HPTE_R_PP0)
+	pp = hpte_r & HPTE_R_PP;
+	if (hpte_r & HPTE_R_PP0)
 		pp |= 4;
 	if (key)
 		pp |= 8;
@@ -340,9 +335,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_slb *slbe;
-	unsigned int pp, skey;
-	unsigned long hpte[2];
-	unsigned long ra;
+	unsigned int pp;
+	unsigned long hpte[3];
+	int index;
 
 	/* Get SLB entry */
 	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -350,37 +345,23 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return -EINVAL;
 
 	/* Find the HPTE in the hash table */
-	if (!kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte))
+	index = kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte);
+	if (index < 0)
 		return -ENOENT;
 
 	gpte->eaddr = eaddr;
 	gpte->vpage = ((hpte[0] & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 
-	/* Get the real address from the HPTE */
-	ra = kvmppc_mmu_get_real_addr(hpte, eaddr);
-
 	/* Get PP bits and key for permission check */
-	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
 
 	/* Calculate permissions */
 	gpte->may_execute = !(hpte[1] & (HPTE_R_N | HPTE_R_G));
 	gpte->may_read = pp_read_perm[pp];
 	gpte->may_write = pp_write_perm[pp];
 
-	/*
-	 * Get the storage key value.  31 means a special no-access
-	 * HPTE that we have inserted, with the guest physical address
-	 * in the RPN field.  Other keys mean that the the RPN field
-	 * contains the real address.
-	 */
-	skey = ((hpte[1] & HPTE_R_KEY_HI) >> 57) |
-		((hpte[1] & HPTE_R_KEY_LO) >> 9);
-	if (skey == 31) {
-		gpte->raddr = ra;
-		return 0;
-	}
-
-	gpte->raddr = back_translate(kvm, ra);
+	/* Get the guest physical address */
+	gpte->raddr = kvmppc_mmu_get_real_addr(hpte[0], hpte[2], eaddr);
 	return 0;
 }
 
@@ -388,23 +369,24 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_slb *slbe;
-	unsigned long hpte[2];
+	unsigned long hpte[3];
 	unsigned long srr0 = kvmppc_get_pc(vcpu);
 	unsigned long ea = vcpu->arch.fault_dar;	
 	unsigned long gpa;
 	unsigned int pp, ok;
 	u32 last_inst, dsisr = vcpu->arch.fault_dsisr;
-	int ret = 0;
+	int index, ret = 0;
 
 	/*
 	 * Translate the access address.
 	 * If we can't find the HPTE, just return and re-execute the
-	 * instruction.f
+	 * instruction.
  	 */
 	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea);
 	if (!slbe)
 		return RESUME_GUEST;
-	if (!kvmppc_hv_find_hpte(kvm, ea, slbe, hpte))
+	index = kvmppc_hv_find_hpte(kvm, ea, slbe, hpte);
+	if (index < 0)
 		return RESUME_GUEST;
 
 	/*
@@ -420,7 +402,7 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	/* Check whether the attempted access was permitted */
-	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte[1]);
 	ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] : pp_read_perm[pp];
 	if (!ok) {
 		vcpu->arch.shregs.dar = ea;
@@ -431,7 +413,7 @@ int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	/* Translate the logical address */
-	gpa = kvmppc_mmu_get_real_addr(hpte, ea);
+	gpa = kvmppc_mmu_get_real_addr(hpte[0], hpte[2], ea);
 
 	/*
 	 * We try to load the last instruction.  We don't let
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6cb2f23..2da8fac 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -20,10 +20,19 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-/* For now use fixed-size 16MB page table */
-#define HPT_ORDER	24
-#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
-#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+/* Translate address of a vmalloc'd thing to a linear map address */
+static void *real_vmalloc_addr(void *x)
+{
+	unsigned long addr = (unsigned long) x;
+	pte_t *p;
+
+	p = find_linux_pte(swapper_pg_dir, addr);
+	if (!p || !pte_present(*p))
+		return NULL;
+	/* assume we don't have huge pages in vmalloc space... */
+	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
+	return __va(addr);
+}
 
 /*
  * Since this file is built in even if KVM is a module, we need
@@ -54,6 +63,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long i, lpn, pa, gpa, psize;
 	unsigned long *hpte;
+	struct revmap_entry *rev;
+	unsigned long g_ptel = ptel;
 
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
@@ -153,7 +164,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	}
 	pteh &= ~0x60UL;
 	
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
@@ -166,18 +177,22 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 				break;
 			hpte += 2;
 		}
+		pte_index += i;
 	} else {
-		i = 0;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 			return H_PTEG_FULL;
 	}
+
+	/* Save away the guest's idea of the second HPTE dword */
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev)
+		rev->guest_rpte = g_ptel;
 	hpte[1] = ptel;
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
-	// XXX atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
-	vcpu->arch.gpr[4] = pte_index + i;
+	vcpu->arch.gpr[4] = pte_index;
 	return H_SUCCESS;
 }
 
@@ -209,7 +224,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *hpte;
 	unsigned long v, r, rb;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
@@ -264,7 +279,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 		if (req == 3)
 			break;
 		if (req != 1 || flags == 3 ||
-		    pte_index >= (HPT_NPTEG << 3)) {
+		    pte_index >= HPT_NPTE) {
 			/* parameter error */
 			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
 			ret = H_PARAMETER;
@@ -327,9 +342,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte;
-	unsigned long v, r, rb;
+	struct revmap_entry *rev;
+	unsigned long v, r, rb, mask, bits;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	/* Don't let it set a normal memory page to key 31 */
 	if (((flags >> 9) & 0x1f) == 0x1f)
@@ -347,17 +363,30 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		flags |= H_LOCAL;
 	v = hpte[0];
-	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
-			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
-	r |= (flags << 55) & HPTE_R_PP0;
-	r |= (flags << 48) & HPTE_R_KEY_HI;
-	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	bits = (flags << 55) & HPTE_R_PP0;
+	bits |= (flags << 48) & HPTE_R_KEY_HI;
+	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	/* Update guest view of 2nd HPTE dword */
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev) {
+		mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		r = rev->guest_rpte & ~mask;
+		r |= bits;
+		rev->guest_rpte = r;
+	}
 
  	/* Don't let guest remove N or key from emulated MMIO pages */
 	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == 
 	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
-		r |= HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO;
-		
+		mask = HPTE_R_PP0 | HPTE_R_PP;
+	else
+		mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+	r = (hpte[1] & ~mask) | (bits & mask);
+
+	/* Update HPTE */
 	rb = compute_tlbie_rb(v, r, pte_index);
 	hpte[0] = v & ~HPTE_V_VALID;
 	if (!(flags & H_LOCAL)) {
@@ -380,39 +409,31 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	return H_SUCCESS;
 }
 
-static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
-{
-	long int i;
-	unsigned long offset, rpn;
-
-	/* XXX handle MMIO and EMU */
-	offset = realaddr & (kvm->arch.ram_psize - 1);
-	rpn = (realaddr - offset) >> PAGE_SHIFT;
-	for (i = 0; i < kvm->arch.ram_npages; ++i)
-		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << kvm->arch.ram_porder) + offset;
-	return HPTE_R_RPN;	/* all 1s in the RPN field */
-}
-
 long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		   unsigned long pte_index)
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte, r;
 	int i, n = 1;
+	struct revmap_entry *rev = NULL;
 
-	if (pte_index >= (HPT_NPTEG << 3))
+	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
 		n = 4;
 	}
+	if (flags & H_R_XLATE)
+		rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	for (i = 0; i < n; ++i, ++pte_index) {
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
 		r = hpte[1];
-		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
-			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
-				(r & ~HPTE_R_RPN);
+		if (hpte[0] & HPTE_V_VALID) {
+			if (rev)
+				r = rev[i].guest_rpte;
+			else
+				r = hpte[1] | HPTE_R_RPN;
+		}
 		vcpu->arch.gpr[4 + i * 2] = hpte[0];
 		vcpu->arch.gpr[5 + i * 2] = r;
 	}
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH 01/11] KVM: PPC: Add memory-mapping support for PCI passthrough and emulation
From: Paul Mackerras @ 2011-11-16 22:52 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

From: Benjamin Herrenschmidt <benh@kernel.crashing.org>

This adds support for adding PCI device I/O regions to the guest memory
map, and for trapping guest accesses to emulated MMIO regions and
delivering them to qemu for MMIO emulation.  To trap guest accesses to
emulated MMIO regions, we reserve key 31 for the hypervisor's use and
set the VPM1 bit in LPCR, which sends all page faults to the host.
Any page fault that is not a key fault gets reflected immediately to the
guest.  We set HPTEs for emulated MMIO regions to have key = 31, and
don't allow the guest to create HPTEs with key = 31.  Any page fault
that is a key fault with key = 31 is then a candidate for MMIO
emulation and thus gets sent up to qemu.  We also load the instruction
that caused the fault for use later when qemu has done the emulation.

[paulus@samba.org: Cleaned up, moved kvmppc_book3s_hv_emulate_mmio()
 to book3s_64_mmu_hv.c]

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h    |    1 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   24 +++
 arch/powerpc/include/asm/kvm_host.h      |    2 +
 arch/powerpc/include/asm/kvm_ppc.h       |    1 +
 arch/powerpc/include/asm/reg.h           |    4 +
 arch/powerpc/kernel/exceptions-64s.S     |    8 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  301 +++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv.c             |   91 +++++++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  153 ++++++++++++----
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  131 ++++++++++++-
 arch/powerpc/kvm/book3s_pr.c             |    1 +
 arch/powerpc/kvm/booke.c                 |    1 +
 arch/powerpc/kvm/powerpc.c               |    2 +-
 include/linux/kvm.h                      |    3 +
 14 files changed, 656 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index deb8a4e..bd8345f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -121,6 +121,7 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+extern int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d0ac94f..53692c2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -62,4 +62,28 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	return rb;
 }
 
+/*
+ * We use a lock bit in HPTE dword 0 to synchronize updates and
+ * accesses to each HPTE.
+ */
+#define HPTE_V_HVLOCK	0x40UL
+
+static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile("	ldarx	%0,0,%2\n"
+		     "	and.	%1,%0,%3\n"
+		     "	bne	2f\n"
+		     "	ori	%0,%0,%4\n"
+		     "  stdcx.	%0,0,%2\n"
+		     "	beq+	2f\n"
+		     "	li	%1,%3\n"
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bf8af5d..f142a2d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -186,6 +186,8 @@ struct kvm_arch {
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
+	unsigned long io_slot_pfn[KVM_MEMORY_SLOTS +
+				  KVM_PRIVATE_MEM_SLOTS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a284f20..8c372b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -132,6 +132,7 @@ extern void kvm_release_rma(struct kvmppc_rma_info *ri);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 559da19..ff3d627 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -216,6 +216,7 @@
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* STAB/SLB miss */
+#define   DSISR_KEYFAULT	0x00200000	/* Key fault */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_TBWL	0x11C	/* Time Base Lower Register (super, R/W) */
@@ -493,6 +494,9 @@
 #define SPRN_SPRG7	0x117	/* Special Purpose Register General 7 */
 #define SPRN_SRR0	0x01A	/* Save/Restore Register 0 */
 #define SPRN_SRR1	0x01B	/* Save/Restore Register 1 */
+#define   SRR1_ISI_NOPT		0x40000000 /* ISI: Not found in hash */
+#define   SRR1_ISI_N_OR_G	0x10000000 /* ISI: Access is no-exec or G */
+#define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKESYSERR	0x00300000 /* System error */
 #define   SRR1_WAKEEE		0x00200000 /* External interrupt */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 396d080..9c2f0e2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -100,14 +100,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST_PR, 0x300)
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -329,8 +329,8 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
 	KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bc3a2ea..da8c2f4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -158,10 +158,307 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
 }
 
+static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
+							 gva_t eaddr)
+{
+	u64 mask;
+	int i;
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
+			continue;
+
+		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
+			mask = ESID_MASK_1T;
+		else
+			mask = ESID_MASK;
+
+		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
+			return &vcpu->arch.slb[i];
+	}
+	return NULL;
+}
+
+static unsigned int kvmppc_mmu_book3s_hv_slb_pshift(struct kvmppc_slb *slbe)
+{
+	if (!(slbe->origv & SLB_VSID_L))
+		return 12;	/*  4K */
+	switch ((slbe->origv >> 4) & 0x3) {
+	case 0: return 24;	/* 16M */
+	case 1: return 16;	/* 64K */
+	case 2: return 34;	/* 16G */
+	case 3: return 20;	/* 1M !!! but we don't support it */
+	}
+	return 12;     	/* Unsupported */
+}
+
+static unsigned long back_translate(struct kvm *kvm, unsigned long ra)
+{
+	unsigned long offset, rpn, i;
+
+	/* XXX handle MMIO  */
+	offset = ra & (kvm->arch.ram_psize - 1);
+	rpn = (ra - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << kvm->arch.ram_porder) + offset;
+
+	/* Error value */
+	return -1ull;
+}
+
+
+static char pp_read_perm[16] = {
+	/* key = 0 */	1, 1, 1, 1, 0, 0, 1, 0,
+	/* key = 1 */	0, 1, 1, 1, 0, 0, 0, 0
+};
+
+static char pp_write_perm[16] = {
+	/* key = 0 */	1, 1, 1, 0, 0, 0, 0, 0,
+	/* key = 1 */	0, 0, 1, 0, 0, 0, 0, 0
+};
+
+static int kvmppc_hv_find_hpte(struct kvm *kvm, gva_t eaddr,
+			struct kvmppc_slb *slbe, unsigned long *ret)
+{
+	unsigned int i;
+	unsigned int pshift;
+	unsigned long somask;
+	unsigned long vsid, hash;
+	unsigned long avpn;
+	unsigned long *hpte;
+
+	/* Get page shift, work out hash and AVPN etc. */
+	pshift = kvmppc_mmu_book3s_hv_slb_pshift(slbe);
+	if (slbe->origv & SLB_VSID_B_1T) {
+		somask = (1UL << 40) - 1;
+		vsid = (slbe->origv & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
+		vsid ^= vsid << 25;
+	} else {
+		somask = (1UL << 28) - 1;
+		vsid = (slbe->origv & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
+	}
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
+	avpn = slbe->origv & ~(somask >> 16);	/* also includes B */
+	avpn |= (eaddr & somask) >> 16;
+
+	if (pshift >= 24)
+		avpn &= ~((1UL << (pshift - 16)) - 1);
+	else
+		avpn &= ~0x7fUL;
+	avpn |= HPTE_V_VALID;
+
+	for (;;) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));	
+
+		for (i = 0; i < 16; i += 2) {
+			unsigned long oldv, v, r;
+
+			/* Read the PTE racily */
+			oldv = hpte[i] & ~HPTE_V_HVLOCK;
+
+			/* Check valid, hash, segment size and AVPN */
+			if (avpn != (oldv & (SLB_VSID_B | HPTE_V_AVPN |
+					     HPTE_V_SECONDARY | HPTE_V_VALID)))
+				continue;
+
+			/* Lock the PTE and read it under the lock */
+			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
+				cpu_relax();
+			v = hpte[i];
+			r = hpte[i+1];
+
+			/* Unlock the HPTE */
+			asm volatile("lwsync" : : : "memory");
+			v &= ~HPTE_V_HVLOCK;
+			hpte[i] = v;
+
+			/* Still OK? */
+			if (v != oldv) {
+				i -= 2;
+				continue;
+			}
+			ret[0] = v;
+			ret[1] = r;
+			return 1;
+		}
+
+		if (avpn & HPTE_V_SECONDARY)
+			break;
+		avpn |= HPTE_V_SECONDARY;
+		hash = hash ^ HPT_HASH_MASK;
+	}
+	return 0;
+}
+
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long hpte[2],
+			unsigned long ea)
+{
+	unsigned int hpshift;
+	unsigned long r = hpte[1];
+	unsigned long ra_mask;
+
+	/* Get page size */
+	hpshift = 12;
+	if (hpte[0] & HPTE_V_LARGE) {
+		if ((r & 0xf000) == 0x1000)
+			hpshift = 16;
+		else if ((r & 0xff000) == 0)
+			hpshift = 24;
+		/* XXX TODO: Add 16G */
+	}
+	ra_mask = (1 << hpshift) - 1;
+
+	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
+}
+
+static unsigned int kvmppc_mmu_get_pp_value(struct kvm_vcpu *vcpu,
+			struct kvmppc_slb *slbe, unsigned long hpte[2])
+{
+	unsigned int key, pp;
+
+	if (vcpu->arch.shared->msr & MSR_PR)
+		key = slbe->origv & SLB_VSID_KP;
+	else 
+		key = slbe->origv & SLB_VSID_KS;
+
+	pp = hpte[0] & HPTE_R_PP;
+	if (pp & HPTE_R_PP0)
+		pp |= 4;
+	if (key)
+		pp |= 8;
+	return pp;
+}
+
+/*
+ * XXX TODO: Handle key values from guest (add them to kvmppc_pte),
+ * for now we don't care tho as Linux guest doesn't use
+ * them. We also force key 31 for any MMIO emulation mapping
+ */
 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-				struct kvmppc_pte *gpte, bool data)
+			struct kvmppc_pte *gpte, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned int pp, skey;
+	unsigned long hpte[2];
+	unsigned long ra;
+
+	/* Get SLB entry */
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+	if (!slbe)
+		return -EINVAL;
+
+	/* Find the HPTE in the hash table */
+	if (!kvmppc_hv_find_hpte(kvm, eaddr, slbe, hpte))
+		return -ENOENT;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = ((hpte[0] & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
+
+	/* Get the real address from the HPTE */
+	ra = kvmppc_mmu_get_real_addr(hpte, eaddr);
+
+	/* Get PP bits and key for permission check */
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+
+	/* Calculate permissions */
+	gpte->may_execute = !(hpte[1] & (HPTE_R_N | HPTE_R_G));
+	gpte->may_read = pp_read_perm[pp];
+	gpte->may_write = pp_write_perm[pp];
+
+	/*
+	 * Get the storage key value.  31 means a special no-access
+	 * HPTE that we have inserted, with the guest physical address
+	 * in the RPN field.  Other keys mean that the RPN field
+	 * contains the real address.
+	 */
+	skey = ((hpte[1] & HPTE_R_KEY_HI) >> 57) |
+		((hpte[1] & HPTE_R_KEY_LO) >> 9);
+	if (skey == 31) {
+		gpte->raddr = ra;
+		return 0;
+	}
+
+	gpte->raddr = back_translate(kvm, ra);
+	return 0;
+}
+
+int kvmppc_book3s_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
-	return -ENOENT;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long hpte[2];
+	unsigned long srr0 = kvmppc_get_pc(vcpu);
+	unsigned long ea = vcpu->arch.fault_dar;	
+	unsigned long gpa;
+	unsigned int pp, ok;
+	u32 last_inst, dsisr = vcpu->arch.fault_dsisr;
+	int ret = 0;
+
+	/*
+	 * Translate the access address.
+	 * If we can't find the HPTE, just return and re-execute the
+	 * instruction.
+ 	 */
+	slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, ea);
+	if (!slbe)
+		return RESUME_GUEST;
+	if (!kvmppc_hv_find_hpte(kvm, ea, slbe, hpte))
+		return RESUME_GUEST;
+
+	/*
+	 * Check if this is a special HPTE (storage key = 31); if not then
+	 * this is just a key fault in the guest.
+	 */
+	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) !=
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
+		vcpu->arch.shregs.dsisr = dsisr;
+		vcpu->arch.shregs.dar = ea;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		return RESUME_GUEST;
+	}
+
+	/* Check whether the attempted access was permitted */
+	pp = kvmppc_mmu_get_pp_value(vcpu, slbe, hpte);
+	ok = (dsisr & DSISR_ISSTORE) ? pp_write_perm[pp] : pp_read_perm[pp];
+	if (!ok) {
+		vcpu->arch.shregs.dar = ea;
+		vcpu->arch.shregs.dsisr = (dsisr & DSISR_ISSTORE) |
+			DSISR_PROTFAULT;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address */
+	gpa = kvmppc_mmu_get_real_addr(hpte, ea);
+
+	/*
+	 * We try to load the last instruction.  We don't let
+	 * emulate_instruction do it as its failure mode is pretty bogus.
+	 * If we fail, we just return to the guest and try executing it again.
+	 */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
+		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+		if (ret != EMULATE_DONE)
+			return RESUME_GUEST;
+		vcpu->arch.last_inst = last_inst;
+	}
+
+	/*
+	 * XXX WARNING: We do not know for sure whether the instruction we just
+	 * read from memory is the same that caused the fault in the first
+	 * place. We don't have a problem with the guest shooting itself in
+	 * the foot that way, however we must be careful that we enforce
+	 * the write permission based on the instruction we are actually
+	 * emulating, not based on dsisr. Unfortunately, the KVM code for
+	 * instruction emulation isn't smart enough for that to work
+	 * so right now we just do it badly and racily, but that will need
+	 * fixing
+	 */
+
+	vcpu->arch.paddr_accessed = gpa;
+	return kvmppc_emulate_mmio(run, vcpu);
 }
 
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b8ad233..d1f0774 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -320,8 +320,15 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * We get these next two if the guest does a bad real-mode access,
 	 * as we have enabled VRMA (virtualized real mode area) mode in the
 	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 *
+	 * We also get them for MMIO emulation via key faults
 	 */
 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		/* We attempt MMIO emulation for key faults */
+		if (vcpu->arch.fault_dsisr & DSISR_KEYFAULT) {
+			r = kvmppc_book3s_hv_emulate_mmio(run, vcpu);
+			break;
+		}
 		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
 		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
 		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
@@ -329,7 +336,7 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
 		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
-					0x08000000);
+					vcpu->arch.shregs.msr & 0x78000000);
 		r = RESUME_GUEST;
 		break;
 	/*
@@ -1068,17 +1075,67 @@ static struct page *hva_to_page(unsigned long addr)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_userspace_memory_region *mem)
 {
 	unsigned long psize, porder;
 	unsigned long i, npages, totalpages;
 	unsigned long pg_ix;
 	struct kvmppc_pginfo *pginfo;
-	unsigned long hva;
 	struct kvmppc_rma_info *ri = NULL;
+	struct vm_area_struct *vma;
 	struct page *page;
+	unsigned long hva;
+
+	/*
+	 * This could be an attempt at adding memory or it could be MMIO
+	 * pass-through. We need to treat them differently but the only
+	 * way for us to know what it is is to look at the VMA and play
+	 * guess work so let's just do that
+	 */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, mem->userspace_addr);
+	if (!vma || vma->vm_start > mem->userspace_addr)
+		goto err_unlock;
+
+	/* Anything with VM_IO will be handled as MMIO pass-through */
+	if (vma->vm_flags & VM_IO) {
+		unsigned long offset = mem->userspace_addr - vma->vm_start;
+
+		/* We require VM_PFNMAP for now */
+		if (!(vma->vm_flags & VM_PFNMAP))
+			goto err_unlock;
+
+		/*
+		 * We require read & write permission as we cannot yet
+		 * enforce guest read-only protection or no access.
+		 */
+		if ((vma->vm_flags & (VM_READ | VM_WRITE)) !=
+		    (VM_READ | VM_WRITE))
+			goto err_unlock;
+
+		/*
+		 * Tag the memslot with a private flag and store the pfn
+		 * in a separate array for use by H_ENTER
+		 */
+		memslot->flags |= KVM_MEMSLOT_IO;
+		kvm->arch.io_slot_pfn[memslot->id] =
+			vma->vm_pgoff + (offset >> PAGE_SHIFT);
+		up_read(&current->mm->mmap_sem);
+		return 0;
+	}
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+	}
+
+	up_read(&current->mm->mmap_sem);
 
-	/* For now, only allow 16MB pages */
+	/* For now, only allow 16MB pages for memory */
 	porder = LARGE_PAGE_ORDER;
 	psize = 1ul << porder;
 	if ((mem->memory_size & (psize - 1)) ||
@@ -1102,23 +1159,13 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 	if (totalpages > kvm->arch.ram_npages)
 		kvm->arch.ram_npages = totalpages;
 
-	/* Is this one of our preallocated RMAs? */
-	if (mem->guest_phys_addr == 0) {
-		struct vm_area_struct *vma;
-
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, mem->userspace_addr);
-		if (vma && vma->vm_file &&
-		    vma->vm_file->f_op == &kvm_rma_fops &&
-		    mem->userspace_addr == vma->vm_start)
-			ri = vma->vm_file->private_data;
-		up_read(&current->mm->mmap_sem);
-		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
-			pr_err("CPU requires an RMO\n");
-			return -EINVAL;
-		}
+	if (!ri && mem->guest_phys_addr == 0 &&
+	    cpu_has_feature(CPU_FTR_ARCH_201)) {
+		pr_err("CPU requires an RMO\n");
+		return -EINVAL;
 	}
 
+	/* Handle pre-allocated RMAs */
 	if (ri) {
 		unsigned long rma_size;
 		unsigned long lpcr;
@@ -1184,6 +1231,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 	return 0;
 
+ err_unlock:
+	up_read(&current->mm->mmap_sem);
  err:
 	return -EINVAL;
 }
@@ -1241,6 +1290,10 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		lpcr &= LPCR_PECE | LPCR_LPES;
 		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
 			LPCR_VPM0 | LPCR_VRMA_L;
+		/* XXX Enable MMIO emu, we should probably do that
+		 *     only upon instruction from qemu... 
+		 */
+		lpcr |= LPCR_VPM1;
 	}
 	kvm->arch.lpcr = lpcr;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index bacb0cf..6cb2f23 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -25,24 +25,26 @@
 #define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
 #define HPT_HASH_MASK	(HPT_NPTEG - 1)
 
-#define HPTE_V_HVLOCK	0x40UL
-
-static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+/*
+ * Since this file is built in even if KVM is a module, we need
+ * a local copy of this function for the case where kvm_main.c is
+ * modular.
+ */
+static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
+						      gfn_t gfn)
 {
-	unsigned long tmp, old;
+	int i;
+	struct kvm_memslots *slots;
 
-	asm volatile("	ldarx	%0,0,%2\n"
-		     "	and.	%1,%0,%3\n"
-		     "	bne	2f\n"
-		     "	ori	%0,%0,%4\n"
-		     "  stdcx.	%0,0,%2\n"
-		     "	beq+	2f\n"
-		     "	li	%1,%3\n"
-		     "2:	isync"
-		     : "=&r" (tmp), "=&r" (old)
-		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
-		     : "cc", "memory");
-	return old == 0;
+	slots = kvm_memslots(kvm);
+	for (i = 0; i < slots->nmemslots; ++i) {
+		struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+		if (gfn >= memslot->base_gfn
+		    && gfn < memslot->base_gfn + memslot->npages)
+			return memslot;
+	}
+	return NULL;
 }
 
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
@@ -50,7 +52,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 {
 	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long i, lpn, pa;
+	unsigned long i, lpn, pa, gpa, psize;
 	unsigned long *hpte;
 
 	/* only handle 4k, 64k and 16M pages for now */
@@ -69,19 +71,88 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		} else
 			return H_PARAMETER;
 	}
-	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
-	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
-		return H_PARAMETER;
-	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
-	if (!pa)
-		return H_PARAMETER;
-	/* Check WIMG */
-	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+	psize = (1ul << porder);
+
+	/* We do not allow the guest to set key 31 which is reserved
+	 * for MMIO emulation. We don't want to allow MMIO emulation
+	 * to be used to access RAM due to possible races between
+	 * emulation and TLB invalidations.
+	 *
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime at which
+	 * point performing the subsequent memory access on the old
+	 * physical address is a violation of the architecture and
+	 * a security hole.
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now
+	 */
+	if ((ptel & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
 		return H_PARAMETER;
+
+	/* Figure out the type of page and handle accordingly,
+	 * first check for RAM pages
+	 */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	if ((gpa >> kvm->arch.ram_porder) < kvm->arch.ram_npages) {
+		lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+		if (porder > kvm->arch.ram_porder)
+			return H_PARAMETER;
+		pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+		if (!pa)
+			return H_PARAMETER;
+		/* Check WIMG */
+		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+			return H_PARAMETER;
+		ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+		ptel |= pa;
+	} else {
+		struct kvm_memory_slot *memslot;
+
+		/* Check WIMG */
+		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+			return H_PARAMETER;		
+
+		/* Else check for MMIO pass-through */
+		memslot = builtin_gfn_to_memslot(kvm, gpa >> PAGE_SHIFT);
+		if (memslot && memslot->flags & KVM_MEMSLOT_IO) {
+			unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
+
+			/* Check if the requested page fits entirely in
+			 * the memslot and check if the start pfn fits
+			 * our page size alignment
+			 */
+			if ((egfn - memslot->base_gfn) > memslot->npages)
+				return H_PARAMETER;
+			pa = kvm->arch.io_slot_pfn[memslot->id] << PAGE_SHIFT;
+			pa += gpa - (memslot->base_gfn << PAGE_SHIFT);
+			if (pa & (psize - 1))
+				return H_PARAMETER;
+
+			/* Make up HPTE */
+			ptel &= ~(HPTE_R_PP0 - psize);
+			ptel |= pa;
+		}
+		/* Else check for MMIO emulation */
+		else if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+			/* Leave RPN intact */
+
+			/* We force no-execute and set key to 31 to cause
+			 * faults on access.
+			 * XXX Should we instead just return H_PARAMETER if
+			 * N isn't already set ?
+			 */
+			ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+		} else
+			return H_PARAMETER;
+	}
 	pteh &= ~0x60UL;
-	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
-	ptel |= pa;
+	
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
@@ -91,21 +162,21 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			if (i == 8)
 				return H_PTEG_FULL;
 			if ((*hpte & HPTE_V_VALID) == 0 &&
-			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 				break;
 			hpte += 2;
 		}
 	} else {
 		i = 0;
 		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
 			return H_PTEG_FULL;
 	}
 	hpte[1] = ptel;
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
-	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	// XXX atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
 	vcpu->arch.gpr[4] = pte_index + i;
 	return H_SUCCESS;
 }
@@ -141,7 +212,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	if ((hpte[0] & HPTE_V_VALID) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
@@ -200,7 +271,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			break;
 		}
 		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+		while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
 			cpu_relax();
 		found = 0;
 		if (hp[0] & HPTE_V_VALID) {
@@ -260,14 +331,19 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 
 	if (pte_index >= (HPT_NPTEG << 3))
 		return H_PARAMETER;
+	/* Don't let it set a normal memory page to key 31 */
+	if (((flags >> 9) & 0x1f) == 0x1f)
+		return H_PARAMETER;
+
 	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
-	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	if ((hpte[0] & HPTE_V_VALID) == 0 ||
 	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
 		hpte[0] &= ~HPTE_V_HVLOCK;
 		return H_NOT_FOUND;
 	}
+
 	if (atomic_read(&kvm->online_vcpus) == 1)
 		flags |= H_LOCAL;
 	v = hpte[0];
@@ -276,6 +352,12 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	r |= (flags << 55) & HPTE_R_PP0;
 	r |= (flags << 48) & HPTE_R_KEY_HI;
 	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+ 	/* Don't let guest remove N or key from emulated MMIO pages */
+	if ((hpte[1] & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == 
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
+		r |= HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		
 	rb = compute_tlbie_rb(v, r, pte_index);
 	hpte[0] = v & ~HPTE_V_VALID;
 	if (!(flags & H_LOCAL)) {
@@ -303,11 +385,12 @@ static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
 	long int i;
 	unsigned long offset, rpn;
 
+	/* XXX handle MMIO and EMU */
 	offset = realaddr & (kvm->arch.ram_psize - 1);
 	rpn = (realaddr - offset) >> PAGE_SHIFT;
 	for (i = 0; i < kvm->arch.ram_npages; ++i)
 		if (rpn == kvm->arch.ram_pginfo[i].pfn)
-			return (i << PAGE_SHIFT) + offset;
+			return (i << kvm->arch.ram_porder) + offset;
 	return HPTE_R_RPN;	/* all 1s in the RPN field */
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 44d8829..7916e1d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -230,10 +230,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	mtspr	SPRN_DABR,r6
 
 BEGIN_FTR_SECTION
-	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	/* Restore AMR and UAMOR and set AMOR such that
+	 *
+	 *   - AMOR allows changes to all keys except 31
+	 *   - AMR disables access for key 31
+	 *   - Other AMR and UAMOR bits are under guest control
+	 *
+	 * Key 31 is thus protected for use by MMIO emulation
+	 */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
-	li	r7,-1
+	li	r7,-4 /* Disable access to key 31 */
+	ori	r5,r5,3
+	and	r6,r6,r7
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
 	mtspr	SPRN_AMOR,r7
@@ -544,13 +553,24 @@ kvmppc_interrupt:
 	 * Register contents:
 	 * R12		= interrupt vector
 	 * R13		= PACA
-	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest CR, R12 saved in PACA HSTATE_SCRATCH1/0
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
 	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
 	std	r9, HSTATE_HOST_R2(r13)
-	ld	r9, HSTATE_KVM_VCPU(r13)
 
+BEGIN_FTR_SECTION
+	/* check for HDSI/HISI for fast reflection to guest when
+	 * VPM is enabled
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + 2
+	beq	kvmppc_hdsi
+	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE + 2
+	beq	kvmppc_hisi
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+.Lhxsi_cont:
+	ld	r9, HSTATE_KVM_VCPU(r13)
 	/* Save registers */
 
 	std	r0, VCPU_GPR(r0)(r9)
@@ -631,7 +651,7 @@ hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 
 	/* Save HEIR (HV emulation assist reg) in last_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
-	li	r3,-1
+	li	r3,KVM_INST_FETCH_FAILED
 BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
 	bne	11f
@@ -649,7 +669,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	std	r6, VCPU_DAR(r9)
 	stw	r7, VCPU_DSISR(r9)
 	std	r8, VCPU_CTR(r9)
-	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI)
+	 * also try to load the instruction
+	 */
 BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
@@ -1091,11 +1113,108 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
+	/* Out of line H_DATA_STORAGE exception, grab HDAR and HDSISR
+	 * and try to load the instruction from guest memory. Note that
+	 * VCPU_LAST_INST has already been set to KVM_INST_FETCH_FAILED at this point.
+	 */
 6:	mfspr	r6,SPRN_HDAR
 	mfspr	r7,SPRN_HDSISR
+
+	/* Only fetch instruction if guest IR relocation is enabled */
+	andi.	r0,r11,MSR_IR
+	beq	7b
+	
+	/* In case lwz faults */
+	li	r8,KVM_INST_FETCH_FAILED
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r0,KVM_GUEST_MODE_SKIP
+	stb	r0,HSTATE_IN_GUEST(r13)
+
+	/* Do the access with MSR:DR enabled */
+	mfmsr	r3
+	ori	r4,r3,MSR_DR		/* Enable paging for data */
+	mtmsrd	r4
+	sync
+	lwz	r8,0(r10)
+	mtmsr	r3
+	sync
+
+	/* Store the result */
+	stw	r8,VCPU_LAST_INST(r9)
+
+	/* Unset guest mode. XXX This is a dup, maybe we could
+	 * move the original later in the code flow, just before
+	 * starting the MMU switch
+	 */
+	li	r0,KVM_GUEST_MODE_NONE
+	stb	r0,HSTATE_IN_GUEST(r13)
 	b	7b
 
 /*
+ * See if this H[DI]SI interrupt is one that can be bounced to the guest.
+ * It can be bounced immediately if it is not in real mode and is
+ * not a key fault (DSI) or not a non-exec fault (ISI).
+ *
+ * Here, r9, r12 and cr are saved in the PACA, r13 is saved in SPRN_SCRATCH0.
+ */
+kvmppc_hdsi:
+	std	r0, PACA_EXGEN(r13)
+	mfspr	r9, SPRN_HDSISR
+	mfspr	r12, SPRN_HSRR1
+	andis.	r0, r9, DSISR_KEYFAULT@h
+	bne	1f
+	andi.	r0, r12, MSR_DR
+	beq	1f
+	mfspr	r0, SPRN_HSRR0		/* turn it into a DSI for the guest */
+	mtspr	SPRN_DSISR, r9
+	mtspr	SPRN_SRR1, r12
+	mtspr	SPRN_SRR0, r0
+	mfspr	r9, SPRN_HDAR
+	li	r0, BOOK3S_INTERRUPT_DATA_STORAGE
+	li	r12, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r12, r12, 63
+	mtspr	SPRN_DAR, r9
+	mtspr	SPRN_HSRR0, r0
+	mtspr	SPRN_HSRR1, r12
+	lwz	r0, HSTATE_SCRATCH1(r13)
+	mtocrf	0x80, r0
+	ld	r9, HSTATE_HOST_R2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	ld	r0, PACA_EXGEN(r13)
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+1:	ld	r0, PACA_EXGEN(r13)
+	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + 2
+	b	.Lhxsi_cont
+
+kvmppc_hisi:
+	mfspr	r9, SPRN_HSRR1
+	andi.	r12, r9, MSR_IR
+	beq	1f
+	andis.	r12, r9, SRR1_ISI_N_OR_G@h
+	bne	1f
+	mfspr	r12, SPRN_HSRR0		/* turn it into a ISI for the guest */
+	mtspr	SPRN_SRR1, r9
+	mtspr	SPRN_SRR0, r12
+	li	r9, BOOK3S_INTERRUPT_INST_STORAGE
+	li	r12, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r12, r12, 63
+	mtspr	SPRN_HSRR0, r9
+	mtspr	SPRN_HSRR1, r12
+	lwz	r9, HSTATE_SCRATCH1(r13)
+	mtocrf	0x80, r9
+	ld	r9, HSTATE_HOST_R2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+1:	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE + 2
+	b	.Lhxsi_cont
+
+/*
  * Try to handle an hcall in real mode.
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f3a6414..30e7c2e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1007,6 +1007,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9c78589..e9186e9 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -895,6 +895,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c33f6a7..084d1c5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -265,7 +265,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return kvmppc_core_prepare_memory_region(kvm, mem);
+	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c107fae..774b04d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -105,6 +105,9 @@ struct kvm_userspace_memory_region {
 #define KVM_MEM_LOG_DIRTY_PAGES  1UL
 #define KVM_MEMSLOT_INVALID      (1UL << 1)
 
+/* Kernel internal use */
+#define KVM_MEMSLOT_IO		 (1UL << 31)
+
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
 	/*
-- 
1.7.7.2

^ permalink raw reply related

* [RFC PATCH 0/11] KVM: PPC: Update Book3S HV memory handling
From: Paul Mackerras @ 2011-11-16 22:50 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Alexander Graf

This series of patches updates the Book3S-HV KVM code that manages the
guest hashed page table (HPT) to enable several things:

* MMIO emulation and MMIO pass-through

* Use of small pages (4kB or 64kB, depending on config) to back the
  guest memory

* Pageable guest memory - i.e. backing pages can be removed from the
  guest and reinstated on demand, using the MMU notifier mechanism.

On PPC970 we have no way to get DSIs and ISIs to come to the
hypervisor, so we can't do MMIO emulation or pageable guest memory.
On POWER7 we set the VPM1 bit in the LPCR to make all DSIs and ISIs
come to the hypervisor (host) as HDSIs or HISIs.

This series is RFC for the moment, although the first 5 or so patches
are pretty solid and could go in.  I am going to rework the later
patches to use HPTEs with V=0 for the absent pages rather than key=31,
which will require handling the HPTE-not-present HDSIs we will get and
differentiating the case where the guest has created a HPTE but the
underlying page is not resident from the case where the guest has
created no HPTE for the address.

Paul.

^ permalink raw reply

* Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree
From: Kumar Gala @ 2011-11-16 21:50 UTC (permalink / raw)
  To: Timur Tabi; +Cc: linuxppc-dev
In-Reply-To: <4EC42F6E.2030500@freescale.com>


On Nov 16, 2011, at 3:47 PM, Timur Tabi wrote:

> wrote:
>> I just noticed this bug in the original p1022ds.dts, and I see you're
>> carrying it over here.  The reg property should look like this:
>> 
>> reg = <0xf 0xffe05000 0 0x1000>;
>>       ^^^
> 
> It looks like there's also a problem with the 'ranges' property:
> 
> 	ranges = <0x0 0x0 0xf 0xe8000000 0x08000000
> 		  0x1 0x0 0xf 0xe0000000 0x08000000
> 		  0x2 0x0 0x0 0xffa00000 0x00040000
> 		          ^^^
> 		  0x3 0x0 0xf 0xffdf0000 0x00008000>;

Gotcha, existing bug but will fix it.

- k

^ permalink raw reply

* Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree
From: Timur Tabi @ 2011-11-16 21:47 UTC (permalink / raw)
  Cc: linuxppc-dev
In-Reply-To: <CAOZdJXU6NXgWv=VwShOvEGH5rm0KJmrQgOCVDiSJtKoVMs5=eg@mail.gmail.com>

 wrote:
> I just noticed this bug in the original p1022ds.dts, and I see you're
> carrying it over here.  The reg property should look like this:
> 
> reg = <0xf 0xffe05000 0 0x1000>;
>        ^^^

It looks like there's also a problem with the 'ranges' property:

	ranges = <0x0 0x0 0xf 0xe8000000 0x08000000
		  0x1 0x0 0xf 0xe0000000 0x08000000
		  0x2 0x0 0x0 0xffa00000 0x00040000
		          ^^^
		  0x3 0x0 0xf 0xffdf0000 0x00008000>;

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: [PATCH v2 3/7] powerpc/85xx: add sleep and deep sleep support
From: Scott Wood @ 2011-11-16 21:42 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: linuxppc-dev
In-Reply-To: <1321437344-19253-3-git-send-email-chenhui.zhao@freescale.com>

On 11/16/2011 03:55 AM, Zhao Chenhui wrote:
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index ce4f7f1..d5cc385 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
>  ifeq ($(CONFIG_PPC32),y)
>  obj-$(CONFIG_E500)		+= idle_e500.o
>  endif
> +obj-$(CONFIG_PPC_85xx)		+= l2cr_85xx.o

Can you restrict this to e500v1/v2?

Also, don't call it "l2cr" -- that's a 6xx register that is not present
on 85xx.

> +	.section .data
> +	.align	5
> +mpc85xx_sleep_save_area:
> +	.space	STATE_SAVE_SIZE
> +ccsrbase_low:
> +	.long	0
> +ccsrbase_high:
> +	.long	0
> +powmgtreq:
> +	.long	POWMGTCSR_DPSLP_MASK
> +
> +	.section .text
> +	.align	12
> +
> +	/*
> +	 * r3 = high word of physical address of CCSR
> +	 * r4 = low word of physical address of CCSR
> +	 */

The whole point of powmgtreq is to store a dynamically-passed-in
value... and it doesn't look like you add it in the jog patch.

-Scott

^ permalink raw reply

* P4080: attempting to use the uio_pdrv to attach FPGA on localbus ...
From: Robert Sciuk @ 2011-11-16 21:19 UTC (permalink / raw)
  To: devicetree-discuss; +Cc: linuxppc-dev
In-Reply-To: <mailman.2323.1320874671.15294.devicetree-discuss@lists.ozlabs.org>

I have succeeded in using the i2c bus with GPIO expander to access the
programming pins of my FPGA devices, but the data port uses the
localbus.  I had initially thought that the uio platform driver would be
the ideal approach to creating a device which would allow configuration
from userland via the /dev/uio[0|1] device interface, but apparently the
device binding is not being accomplished as expected.

	localbus@ffe124000 {
		compatible =3D "fsl,p4080-elbc", "fsl,elbc", "simple-bus";
		reg =3D <0xf 0xfe124000 0 0x1000>;
		interrupts =3D <25 2 0 0>;
		interrupt-parent =3D <&mpic>;
		#address-cells =3D <2>;
		#size-cells =3D <1>;

                /* Local bus region mappings */
                ranges =3D <0 0 0xf 0xe8000000 0x08000000 	/* CS0: Boot =
flash */
                          1 0 0xf 0xd0000000 0x8000 		/* CS1: FPGA0 */
                          2 0 0xf 0xd1000000 0x8000 >; 	/* CS2: FPGA1 */

		flash@0,0 {
		  ........
            };=20

		lim: fpga@1,0 {
			compatible =3D "uio_pdrv";
			pin-handle=3D<&lim_ctrl>;
		};=20

		nitro: fpga@2,0 {
			compatible =3D "uio_pdrv";
			pin-handle=3D<&fpe0_ctrl &fpe1_ctrl>;
		};=20
	};

I have aliases pointing to the localbus nodes lim and nitro, but it
appears that the uio_pdrv driver does not bind to the device based upon
the compatible property of the tree.  I'm hoping to be able to mmap the
localbus port memory (0xf_d000_0000 and 0xf_d100_0000) respectively when
I open /dev/uio[0|1].  Is there additional driver registration needed in
order to use the uio_pdrv driver?  What am I missing??

Any pointers would be appreciated.

Cheers,
Rob

^ permalink raw reply

* Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup
From: Moffett, Kyle D @ 2011-11-16 20:52 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: B04825@freescale.com, linux-kernel@vger.kernel.org,
	paul.gortmaker@windriver.com, scottwood@freescale.com,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <20111116044032.GA26476@bloggs.ozlabs.ibm.com>

On Nov 15, 2011, at 23:40, Paul Mackerras wrote:
> On Tue, Nov 15, 2011 at 04:45:18PM -0600, Moffett, Kyle D wrote:
>>
>> I guess that's doable, although I have to admit that idea almost gives
>> me more of a headache than trying to fix up the 32-bit ASM.
>>
>> One thing that bothers me in particular is that both 32/64 versions of
>> __copy_tofrom_user() are dramatically overcomplicated for what they
>> ought to be doing.
>>
>> It would seem that if we get a page fault during an unaligned copy, we
>> ought to just give up and fall back to a simple byte-by-byte copy loop
>> from wherever we left off.  That would eliminate 90% of the ugly
>> special cases without actually hurting performance, right?
>
> That's basically what we do, IIRC, and most of the complexity comes
> from working out where we were up to.  We could probably use a simpler
> approximation that means we might copy some bytes twice.  In fact the
> greatest simplification would probably be to implement range entries
> in the exception table so we can just have one entry for all the loads
> and stores instead of an entry for each individual load and store.

Well, I spent some time tinkering with the GCC inline-assembly option,
which was probably a waste, but I figured I would post my code here for
other people to chuckle at.  :-D

Here's a basic, relatively easily extended "copy u8" macro that sets up
the exception table using "asm goto":

/*
 * try_copy_u8 - copy a single byte from SRC to DST using "asm goto".
 *
 * DST:          destination address, written with stb through an "m" operand
 * SRC:          source address, read with lbz through an "m" operand
 * LOAD_FAULT:   C label named in the __ex_table entry for the load
 * STORE_FAULT:  C label named in the __ex_table entry for the store
 *
 * Two __ex_table entries are emitted, pairing the load (1:) and the store
 * (2:) with their respective labels.  "asm goto" allows no output operands,
 * so the scratch register is passed as an input operand.
 * NOTE(review): the self-initialisation of try_copy_tmp__ presumably only
 * silences an uninitialised-variable warning, and the asm writes a register
 * declared input-only -- confirm the compiler tolerates both.
 */
#define try_copy_u8(DST, SRC, LOAD_FAULT, STORE_FAULT) do {	\
	unsigned long try_copy_tmp__ = (try_copy_tmp__);	\
	asm goto (						\
		"1:	lbz %[tmp], %[src]\n"			\
		"2:	stb %[tmp], %[dst]\n"			\
		"	.pushsection __ex_table, \"a\"\n"	\
		"	.align 2\n"				\
		"	.long 1b, %l["#LOAD_FAULT"]\n"		\
		"	.long 2b, %l["#STORE_FAULT"]\n"		\
		"	.popsection\n"				\
		: /* No outputs allowed for "asm goto" */	\
		: [dst] "m"(*(__user u8 *)(DST)),		\
		  [src] "m"(*(const __user u8 *)(SRC)),		\
		  [tmp] "r"(try_copy_tmp__)			\
		: "memory"					\
		: LOAD_FAULT, STORE_FAULT			\
	);							\
} while(0)

If I put that into a function and compile it, the assembly and the
exception table look perfectly OK, even under register pressure.
With a few macros like that it looks like it should be possible to
write the copy function directly in C and get optimal results.

The only other variants you need would be "try_copy_ulong" and
"try_copy_4ulong"/"try_copy_8ulong" for 32/64-bit.

Unfortunately, as I mentioned before, GCC 4.4 and older don't have
"asm goto" support :-(.

Perhaps I could put __copy_tofrom_user() into its own file and make
the assembled 32/64 output files be ".shipped"?

On the other hand, perhaps this is overly complicated :-D.

I'll poke at it more tomorrow.


>> For a page-fault during a cacheline-aligned copy, we should be able to
>> handle the exception and retry from the last cacheline without much
>> logic, again with good performance.
>>
>> With that said, I'm curious about the origin of the PPC32 ASM.  In
>> particular, it looks like it was generated by GCC at some point in the
>> distant past, and I'm wondering if there's a good way to rewrite that
>> file in C and trick GCC into generating the relevant exception tables
>> for it?
>
> Why do you think it was generated by gcc?  I wrote the original
> version, but I think it got extended and macro-ized by others.

Ah, sorry,  when I first looked at it the large collection of numeric
labels and the very sparing comments made it look autogenerated.

Although, given how much of a pain in the neck it is maybe you would
rather people not think you wrote it at all. ;-)

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/

^ permalink raw reply

* Re: [PATCH v2 2/7] powerpc/85xx: add HOTPLUG_CPU support
From: Scott Wood @ 2011-11-16 19:02 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: linuxppc-dev
In-Reply-To: <1321437344-19253-2-git-send-email-chenhui.zhao@freescale.com>

On 11/16/2011 03:55 AM, Zhao Chenhui wrote:
> +static void __cpuinit smp_85xx_mach_cpu_die(void)
> +{
> +	unsigned int cpu = smp_processor_id();
> +	register u32 tmp;
> +
> +	local_irq_disable();
> +	idle_task_exit();
> +	generic_set_cpu_dead(cpu);
> +	mb();
> +
> +	mtspr(SPRN_TCR, 0);
> +	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);

Clearing these bits in TSR should be unnecessary since we clear TCR --
and doesn't really accomplish anything since the TSR bits can continue
to be set.

If watchdog is in use, we need to set the period to the highest possible
to effectively disable it.

> +	if (cpu_has_feature(CPU_FTR_CAN_NAP)) {

Again, don't check this.  On 85xx, we *always* can and should use nap
here.  At best this is noise, at worst this will cause problems if
CONFIG_BDI_SWITCH is enabled, or if CPU_FTR_CAN_NAP is cleared for any
other reason (e.g. it's not set on e500mc, and the reason isn't that the
nap implementation is different (which it is), but that it's not usable
in the idle loop).

> +static int __cpuinit smp_85xx_kick_cpu(int nr)
> +
>  {
>  	unsigned long flags;
>  	const u64 *cpu_rel_addr;
> -	__iomem u32 *bptr_vaddr;
> +	__iomem struct epapr_spin_table *epapr;

Please don't call this just "epapr".  That's like calling a reference to
any powerpc-specific struct "powerpc".

How about "spin_table"?

> -	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, hw_cpu);
> +	out_be32(&epapr->pir, hw_cpu);
>  #ifdef CONFIG_PPC32
> -	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
> +#ifdef CONFIG_HOTPLUG_CPU
> +	/* Corresponding to generic_set_cpu_dead() */
> +	generic_set_cpu_up(nr);
> +
> +	if (system_state == SYSTEM_RUNNING) {
> +		out_be32(&epapr->addr_l, 0);
> +
> +		smp_85xx_set_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));

As previously requested, please document why you're setting the boot
page here.  This should really be done when you resume from deep sleep,
rather than here, and should be a restoration of the value that the
register held prior to deep sleep.

>  struct smp_ops_t smp_85xx_ops = {
>  	.kick_cpu = smp_85xx_kick_cpu,
> +	.setup_cpu	= smp_85xx_setup_cpu,
> +#ifdef CONFIG_HOTPLUG_CPU
> +	.cpu_disable	= generic_cpu_disable,
> +	.cpu_die	= generic_cpu_die,
> +#endif

Only fill these fields in on e500v1/v2, until we properly support
e500mc.  Likewise in ppc_md.cpu_die and anywhere else we advertise this
functionality.

> +	of_node_put(np);
> +#ifdef CONFIG_HOTPLUG_CPU
> +	bptr = NULL;
> +	np = of_find_node_by_name(NULL, "ecm-law");
> +	if (!np) {
> +		pr_err("%s: can't find ecm-law node in dts\n", __func__);
> +		return;
> +	}

Look up by compatible, not name.

-Scott

^ permalink raw reply

* Re: [PATCH v2 1/7] powerpc/85xx: re-enable timebase sync disabled by KEXEC patch
From: Scott Wood @ 2011-11-16 18:42 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: linuxppc-dev
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

On 11/16/2011 03:55 AM, Zhao Chenhui wrote:
> From: Li Yang <leoli@freescale.com>
> 
> The timebase sync is not only necessary when using KEXEC. It should also
> be used by normal boot up and cpu hotplug. Remove the ifdef added by
> the KEXEC patch.

Again, no it should not be used by normal boot up (whether KEXEC support
is enabled or not).  We should only do timebase sync when we actually
need to (when we've actually just reset a core), and we should do it the
way U-Boot does rather than with smp-tbsync.c.

-Scott

^ permalink raw reply

* Re: [PATCH 4/5, v3] powerpc/8xxx: Update device tree bus probe for new RapidIO node binding
From: Kumar Gala @ 2011-11-16 15:33 UTC (permalink / raw)
  To: Bounine, Alexandre
  Cc: r58472, Kai Jiang, linux-kernel, r61911, Liu Gang, B11780,
	linuxppc-dev, akpm
In-Reply-To: <0CE8B6BE3C4AD74AB97D9D29BD24E552024917D5@CORPEXCH1.na.ads.idt.com>


On Nov 16, 2011, at 7:48 AM, Bounine, Alexandre wrote:

>> -----Original Message-----
>> From: Liu Gang [mailto:Gang.Liu@freescale.com]
>> Sent: Saturday, November 12, 2011 7:03 AM
>> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
>> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Kai
>> Jiang; Kumar Gala
>> Subject: [PATCH 4/5,v3] powerpc/8xxx: Update device tree bus probe for
>> new RapidIO node binding
>> 
>> From: Kai Jiang <Kai.Jiang@freescale.com>
>> 
>> Update of_platform_bus_probe() RapidIO node to be compatible with
>> new RapidIO dts compatible property.
>> 
>> Signed-off-by: Kai Jiang <Kai.Jiang@freescale.com>
>> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
>> ---
>> arch/powerpc/platforms/85xx/corenet_ds.c   |    2 +-
>> arch/powerpc/platforms/85xx/mpc85xx_mds.c  |    2 +-
>> arch/powerpc/platforms/86xx/mpc86xx_hpcn.c |    2 +-
>> 3 files changed, 3 insertions(+), 3 deletions(-)
>> 
> 
> Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>


applied to 'next'

- k

^ permalink raw reply

* Re: [PATCH 3/5,v3] powerpc/85xx: Update SRIO device tree nodes
From: Kumar Gala @ 2011-11-16 15:35 UTC (permalink / raw)
  To: Liu Gang
  Cc: r58472, linux-kernel, r61911, Alexandre.Bounine, akpm,
	linuxppc-dev, B11780
In-Reply-To: <1321099352-21462-3-git-send-email-Gang.Liu@freescale.com>


On Nov 12, 2011, at 6:02 AM, Liu Gang wrote:

> From: Kumar Gala <galak@kernel.crashing.org>
>
> Update all dts files that support SRIO controllers to match the new
> fsl,srio device tree binding.
>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
> arch/powerpc/boot/dts/mpc8568mds.dts   |   66 ++++++++++++++++++++++-------
> arch/powerpc/boot/dts/mpc8569mds.dts   |   72 +++++++++++++++++++++++++------
> arch/powerpc/boot/dts/mpc8641_hpcn.dts |   69 ++++++++++++++++++++++++-------
> arch/powerpc/boot/dts/p2041rdb.dts     |   11 +++++
> arch/powerpc/boot/dts/p2041si.dtsi     |   20 +++++++++
> arch/powerpc/boot/dts/p3041ds.dts      |   11 +++++
> arch/powerpc/boot/dts/p3041si.dtsi     |   26 ++++++++---
> arch/powerpc/boot/dts/p4080ds.dts      |   12 ++++-
> arch/powerpc/boot/dts/p4080si.dtsi     |   64 +++++++++++++++++++++++-----
> arch/powerpc/boot/dts/p5020ds.dts      |   11 +++++
> arch/powerpc/boot/dts/p5020si.dtsi     |   26 ++++++++---
> 11 files changed, 314 insertions(+), 74 deletions(-)

I'm holding off on this, since I'm reworking the .dts.  I'll work up a
new patch based on top of the dts changes.

- k

^ permalink raw reply

* Re: [PATCH 2/5, v3] fsl-rio: Add two ports and rapidio message units support
From: Kumar Gala @ 2011-11-16 15:33 UTC (permalink / raw)
  To: Bounine, Alexandre
  Cc: Jin Qing, r58472, r61911, linux-kernel, Liu Gang, akpm,
	linuxppc-dev, B11780
In-Reply-To: <0CE8B6BE3C4AD74AB97D9D29BD24E552024917CC@CORPEXCH1.na.ads.idt.com>


On Nov 16, 2011, at 7:39 AM, Bounine, Alexandre wrote:

>> -----Original Message-----
>> From: linuxppc-dev-bounces+alexandre.bounine=idt.com@lists.ozlabs.org
>> [mailto:linuxppc-dev-
>> bounces+alexandre.bounine=idt.com@lists.ozlabs.org] On Behalf Of Liu
>> Gang
>> Sent: Saturday, November 12, 2011 7:02 AM
>> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
>> Cc: Jin Qing; r58472@freescale.com; r61911@freescale.com; linux-
>> kernel@vger.kernel.org; Liu Gang; akpm@linux-foundation.org;
>> B11780@freescale.com
>> Subject: [PATCH 2/5, v3] fsl-rio: Add two ports and rapidio message
>> units support
>> 
>> Usually, freescale rapidio endpoint can support one or two 1x or 4X
>> LP-Serial link interfaces, and rapidio message transactions can be
>> implemented by two message units. This adds the support of two
>> rapidio ports and initializes message unit 0 and message unit 1. And
>> these ports and message units can work simultaneously.
>> 
>> Signed-off-by: Li Yang <leoli@freescale.com>
>> Signed-off-by: Jin Qing <b24347@freescale.com>
>> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
>> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
>> ---
>> arch/powerpc/sysdev/fsl_rio.c |  391 +++++++++++++++++++++-----------
>> arch/powerpc/sysdev/fsl_rio.h |   75 ++++++-
>> arch/powerpc/sysdev/fsl_rmu.c |  502
> ++++++++++++++++++---------------
>> --------
>> 3 files changed, 545 insertions(+), 423 deletions(-)
>> 
> 
> Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>
> 

applied to 'next'

- k

^ permalink raw reply

* Re: [PATCH 1/5, v3] fsl-rio: Split rio driver into two parts, RapidIO endpoint and message unit
From: Kumar Gala @ 2011-11-16 15:33 UTC (permalink / raw)
  To: Bounine, Alexandre
  Cc: r58472, linux-kernel, r61911, Lian Minghuan, Liu Gang, B11780,
	linuxppc-dev, akpm
In-Reply-To: <0CE8B6BE3C4AD74AB97D9D29BD24E552024917C9@CORPEXCH1.na.ads.idt.com>


On Nov 16, 2011, at 7:36 AM, Bounine, Alexandre wrote:

>> -----Original Message-----
>> From: Liu Gang [mailto:Gang.Liu@freescale.com]
>> Sent: Saturday, November 12, 2011 7:02 AM
>> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
>> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Liu
>> Gang; Lian Minghuan; Kumar Gala
>> Subject: [PATCH 1/5,v3] fsl-rio: Split rio driver into two parts,
>> RapidIO endpoint and message unit
>> 
>> The Freescale PowerPC RapidIO controller consists of a RapidIO
> endpoint
>> and
>> a RapidIO message unit(RMU). Or use RapidIO message manager(RMan) to
>> replace the RMU in DPAA architecture. Therefore, we should split the
>> code
>> into two function modules according to the hardware architecture. Add
>> new
>> struct for RMU module, and new initialization function to set up RMU
>> module. This policy is very conducive to adding new module like RMan,
>> or
>> adding multi-ports or message units support.
>> 
>> Signed-off-by: Lian Minghuan <Minghuan.Lian@freescale.com>
>> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
>> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
>> ---
>> arch/powerpc/sysdev/Makefile  |    2 +-
>> arch/powerpc/sysdev/fsl_rio.c | 1152
> +--------------------------------
>> -------
>> arch/powerpc/sysdev/fsl_rio.h |   78 +++
>> arch/powerpc/sysdev/fsl_rmu.c | 1163
>> +++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 1267 insertions(+), 1128 deletions(-)
>> create mode 100644 arch/powerpc/sysdev/fsl_rio.h
>> create mode 100644 arch/powerpc/sysdev/fsl_rmu.c
>> 
> 
> Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>
> 

applied to 'next'

- k

^ permalink raw reply

* Re: [PATCH 5/5, v3] powerpc/fsl: Document rapidio node binding-information
From: Kumar Gala @ 2011-11-16 15:31 UTC (permalink / raw)
  To: Bounine, Alexandre
  Cc: Jin Qing, r58472, linux-kernel, r61911, Liu Gang, B11780,
	linuxppc-dev, akpm
In-Reply-To: <0CE8B6BE3C4AD74AB97D9D29BD24E552024917D6@CORPEXCH1.na.ads.idt.com>


On Nov 16, 2011, at 7:49 AM, Bounine, Alexandre wrote:

>> -----Original Message-----
>> From: Liu Gang [mailto:Gang.Liu@freescale.com]
>> Sent: Saturday, November 12, 2011 7:03 AM
>> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
>> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Liu
>> Gang; Li Yang; Jin Qing; Kumar Gala
>> Subject: [PATCH 5/5,v3] powerpc/fsl: Document rapidio node binding-
>> information
>> 
>> This document is created for powerpc rapidio and rmu nodes in dts
> file.
>> These nodes can support two rapidio ports and message units. In
>> addition,
>> It explicates the properties and gives examples about rapidio and rmu
>> nodes.
>> 
>> Signed-off-by: Li Yang <leoli@freescale.com>
>> Signed-off-by: Jin Qing <b24347@freescale.com>
>> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
>> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
>> ---
>> .../devicetree/bindings/powerpc/fsl/srio-rmu.txt   |  163
>> ++++++++++++++++++++
>> .../devicetree/bindings/powerpc/fsl/srio.txt       |  103
> ++++++++++++
>> 2 files changed, 266 insertions(+), 0 deletions(-)
>> create mode 100644
> Documentation/devicetree/bindings/powerpc/fsl/srio-
>> rmu.txt
>> create mode 100644
>> Documentation/devicetree/bindings/powerpc/fsl/srio.txt
>> 
> 
> Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

applied to 'next'

- k

^ permalink raw reply

* RE: [PATCH 5/5, v3] powerpc/fsl: Document rapidio node binding-information
From: Bounine, Alexandre @ 2011-11-16 13:49 UTC (permalink / raw)
  To: Liu Gang, linuxppc-dev
  Cc: Jin Qing, r58472, r61911, linux-kernel, akpm, B11780
In-Reply-To: <1321099352-21462-5-git-send-email-Gang.Liu@freescale.com>

> -----Original Message-----
> From: Liu Gang [mailto:Gang.Liu@freescale.com]
> Sent: Saturday, November 12, 2011 7:03 AM
> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Liu
> Gang; Li Yang; Jin Qing; Kumar Gala
> Subject: [PATCH 5/5,v3] powerpc/fsl: Document rapidio node binding-
> information
>
> This document is created for powerpc rapidio and rmu nodes in dts
file.
> These nodes can support two rapidio ports and message units. In
> addition,
> It explicates the properties and gives examples about rapidio and rmu
> nodes.
>
> Signed-off-by: Li Yang <leoli@freescale.com>
> Signed-off-by: Jin Qing <b24347@freescale.com>
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  .../devicetree/bindings/powerpc/fsl/srio-rmu.txt   |  163
> ++++++++++++++++++++
>  .../devicetree/bindings/powerpc/fsl/srio.txt       |  103
++++++++++++
>  2 files changed, 266 insertions(+), 0 deletions(-)
>  create mode 100644
Documentation/devicetree/bindings/powerpc/fsl/srio-
> rmu.txt
>  create mode 100644
> Documentation/devicetree/bindings/powerpc/fsl/srio.txt
>

Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

^ permalink raw reply

* RE: [PATCH 4/5, v3] powerpc/8xxx: Update device tree bus probe for new RapidIO node binding
From: Bounine, Alexandre @ 2011-11-16 13:48 UTC (permalink / raw)
  To: Liu Gang, linuxppc-dev
  Cc: r58472, Kai Jiang, r61911, linux-kernel, akpm, B11780
In-Reply-To: <1321099352-21462-4-git-send-email-Gang.Liu@freescale.com>

> -----Original Message-----
> From: Liu Gang [mailto:Gang.Liu@freescale.com]
> Sent: Saturday, November 12, 2011 7:03 AM
> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Kai
> Jiang; Kumar Gala
> Subject: [PATCH 4/5,v3] powerpc/8xxx: Update device tree bus probe for
> new RapidIO node binding
>
> From: Kai Jiang <Kai.Jiang@freescale.com>
>
> Update of_platform_bus_probe() RapidIO node to be compatible with
> new RapidIO dts compatible property.
>
> Signed-off-by: Kai Jiang <Kai.Jiang@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  arch/powerpc/platforms/85xx/corenet_ds.c   |    2 +-
>  arch/powerpc/platforms/85xx/mpc85xx_mds.c  |    2 +-
>  arch/powerpc/platforms/86xx/mpc86xx_hpcn.c |    2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
>

Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

^ permalink raw reply

* Re: [PATCH] powerpc: Fix atomic_xxx_return barrier semantics
From: Paul E. McKenney @ 2011-11-16 13:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Anton Blanchard, linuxppc-dev, Paul Mackerras
In-Reply-To: <1321413087.3170.27.camel@pasglop>

On Wed, Nov 16, 2011 at 02:11:27PM +1100, Benjamin Herrenschmidt wrote:
> The Documentation/memory-barriers.txt document requires that atomic
> operations that return a value act as a memory barrier both before
> and after the actual atomic operation.
> 
> Our current implementation doesn't guarantee this. More specifically,
> while a load following the isync can not be issued before stwcx. has
> completed, that completion doesn't architecturally mean that the
> result of stwcx. is visible to other processors (or any previous stores
> for that matter) (typically, the other processors L1 caches can still
> hold the old value).
> 
> This has caused an actual crash in RCU torture testing on Power 7
> 
> This fixes it by changing those atomic ops to use new macros instead
> of RELEASE/ACQUIRE barriers, called ATOMIC_ENTRY and ATOMIC_EXIT barriers,
> which are then defined respectively to lwsync and sync.
> 
> I haven't had a chance to measure the performance impact (or rather
> what I measured with kernel compiles is in the noise; I have yet to
> find a more precise benchmark)
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> ---
> 
> diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
> index e2a4c26..02e41b5 100644
> --- a/arch/powerpc/include/asm/atomic.h
> +++ b/arch/powerpc/include/asm/atomic.h
> @@ -49,13 +49,13 @@ static __inline__ int atomic_add_return(int a, atomic_t *v)
>  	int t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%2		# atomic_add_return\n\
>  	add	%0,%1,%0\n"
>  	PPC405_ERR77(0,%2)
>  "	stwcx.	%0,0,%2 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (a), "r" (&v->counter)
>  	: "cc", "memory");
> @@ -85,13 +85,13 @@ static __inline__ int atomic_sub_return(int a, atomic_t *v)
>  	int t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%2		# atomic_sub_return\n\
>  	subf	%0,%1,%0\n"
>  	PPC405_ERR77(0,%2)
>  "	stwcx.	%0,0,%2 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (a), "r" (&v->counter)
>  	: "cc", "memory");
> @@ -119,13 +119,13 @@ static __inline__ int atomic_inc_return(atomic_t *v)
>  	int t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%1		# atomic_inc_return\n\
>  	addic	%0,%0,1\n"
>  	PPC405_ERR77(0,%1)
>  "	stwcx.	%0,0,%1 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (&v->counter)
>  	: "cc", "xer", "memory");
> @@ -163,13 +163,13 @@ static __inline__ int atomic_dec_return(atomic_t *v)
>  	int t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%1		# atomic_dec_return\n\
>  	addic	%0,%0,-1\n"
>  	PPC405_ERR77(0,%1)
>  "	stwcx.	%0,0,%1\n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (&v->counter)
>  	: "cc", "xer", "memory");
> @@ -194,7 +194,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
>  	int t;
> 
>  	__asm__ __volatile__ (
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%1		# __atomic_add_unless\n\
>  	cmpw	0,%0,%3 \n\
>  	beq-	2f \n\
> @@ -202,7 +202,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
>  	PPC405_ERR77(0,%2)
>  "	stwcx.	%0,0,%1 \n\
>  	bne-	1b \n"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  "	subf	%0,%2,%0 \n\
>  2:"
>  	: "=&r" (t)
> @@ -226,7 +226,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
>  	int t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	lwarx	%0,0,%1		# atomic_dec_if_positive\n\
>  	cmpwi	%0,1\n\
>  	addi	%0,%0,-1\n\
> @@ -234,7 +234,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
>  	PPC405_ERR77(0,%1)
>  "	stwcx.	%0,0,%1\n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	"\n\
>  2:"	: "=&b" (t)
>  	: "r" (&v->counter)
> @@ -285,12 +285,12 @@ static __inline__ long atomic64_add_return(long a, atomic64_t *v)
>  	long t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%2		# atomic64_add_return\n\
>  	add	%0,%1,%0\n\
>  	stdcx.	%0,0,%2 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (a), "r" (&v->counter)
>  	: "cc", "memory");
> @@ -319,12 +319,12 @@ static __inline__ long atomic64_sub_return(long a, atomic64_t *v)
>  	long t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%2		# atomic64_sub_return\n\
>  	subf	%0,%1,%0\n\
>  	stdcx.	%0,0,%2 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (a), "r" (&v->counter)
>  	: "cc", "memory");
> @@ -351,12 +351,12 @@ static __inline__ long atomic64_inc_return(atomic64_t *v)
>  	long t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%1		# atomic64_inc_return\n\
>  	addic	%0,%0,1\n\
>  	stdcx.	%0,0,%1 \n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (&v->counter)
>  	: "cc", "xer", "memory");
> @@ -393,12 +393,12 @@ static __inline__ long atomic64_dec_return(atomic64_t *v)
>  	long t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%1		# atomic64_dec_return\n\
>  	addic	%0,%0,-1\n\
>  	stdcx.	%0,0,%1\n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	: "=&r" (t)
>  	: "r" (&v->counter)
>  	: "cc", "xer", "memory");
> @@ -418,13 +418,13 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
>  	long t;
> 
>  	__asm__ __volatile__(
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%1		# atomic64_dec_if_positive\n\
>  	addic.	%0,%0,-1\n\
>  	blt-	2f\n\
>  	stdcx.	%0,0,%1\n\
>  	bne-	1b"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  	"\n\
>  2:"	: "=&r" (t)
>  	: "r" (&v->counter)
> @@ -450,14 +450,14 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
>  	long t;
> 
>  	__asm__ __volatile__ (
> -	PPC_RELEASE_BARRIER
> +	PPC_ATOMIC_ENTRY_BARRIER
>  "1:	ldarx	%0,0,%1		# __atomic_add_unless\n\
>  	cmpd	0,%0,%3 \n\
>  	beq-	2f \n\
>  	add	%0,%2,%0 \n"
>  "	stdcx.	%0,0,%1 \n\
>  	bne-	1b \n"
> -	PPC_ACQUIRE_BARRIER
> +	PPC_ATOMIC_EXIT_BARRIER
>  "	subf	%0,%2,%0 \n\
>  2:"
>  	: "=&r" (t)
> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
> index e137afc..efdc926 100644
> --- a/arch/powerpc/include/asm/bitops.h
> +++ b/arch/powerpc/include/asm/bitops.h
> @@ -124,14 +124,14 @@ static __inline__ unsigned long fn(			\
>  	return (old & mask);				\
>  }
> 
> -DEFINE_TESTOP(test_and_set_bits, or, PPC_RELEASE_BARRIER,
> -	      PPC_ACQUIRE_BARRIER, 0)
> +DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
> +	      PPC_ATOMIC_EXIT_BARRIER, 0)
>  DEFINE_TESTOP(test_and_set_bits_lock, or, "",
>  	      PPC_ACQUIRE_BARRIER, 1)
> -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_RELEASE_BARRIER,
> -	      PPC_ACQUIRE_BARRIER, 0)
> -DEFINE_TESTOP(test_and_change_bits, xor, PPC_RELEASE_BARRIER,
> -	      PPC_ACQUIRE_BARRIER, 0)
> +DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
> +	      PPC_ATOMIC_EXIT_BARRIER, 0)
> +DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
> +	      PPC_ATOMIC_EXIT_BARRIER, 0)
> 
>  static __inline__ int test_and_set_bit(unsigned long nr,
>  				       volatile unsigned long *addr)
> diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
> index c94e4a3..2a9cf84 100644
> --- a/arch/powerpc/include/asm/futex.h
> +++ b/arch/powerpc/include/asm/futex.h
> @@ -11,12 +11,13 @@
> 
>  #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
>    __asm__ __volatile ( \
> -	PPC_RELEASE_BARRIER \
> +	PPC_ATOMIC_ENTRY_BARRIER \
>  "1:	lwarx	%0,0,%2\n" \
>  	insn \
>  	PPC405_ERR77(0, %2) \
>  "2:	stwcx.	%1,0,%2\n" \
>  	"bne-	1b\n" \
> +	PPC_ATOMIC_EXIT_BARRIER \
>  	"li	%1,0\n" \
>  "3:	.section .fixup,\"ax\"\n" \
>  "4:	li	%1,%3\n" \
> @@ -92,14 +93,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
>  		return -EFAULT;
> 
>          __asm__ __volatile__ (
> -        PPC_RELEASE_BARRIER
> +        PPC_ATOMIC_ENTRY_BARRIER
>  "1:     lwarx   %1,0,%3         # futex_atomic_cmpxchg_inatomic\n\
>          cmpw    0,%1,%4\n\
>          bne-    3f\n"
>          PPC405_ERR77(0,%3)
>  "2:     stwcx.  %5,0,%3\n\
>          bne-    1b\n"
> -        PPC_ACQUIRE_BARRIER
> +        PPC_ATOMIC_EXIT_BARRIER
>  "3:	.section .fixup,\"ax\"\n\
>  4:	li	%0,%6\n\
>  	b	3b\n\
> diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h
> index d7cab44..24fc618 100644
> --- a/arch/powerpc/include/asm/synch.h
> +++ b/arch/powerpc/include/asm/synch.h
> @@ -41,11 +41,15 @@ static inline void isync(void)
>  	START_LWSYNC_SECTION(97);			\
>  	isync;						\
>  	MAKE_LWSYNC_SECTION_ENTRY(97, __lwsync_fixup);
> -#define PPC_ACQUIRE_BARRIER	"\n" stringify_in_c(__PPC_ACQUIRE_BARRIER)
> -#define PPC_RELEASE_BARRIER	stringify_in_c(LWSYNC) "\n"
> +#define PPC_ACQUIRE_BARRIER	 "\n" stringify_in_c(__PPC_ACQUIRE_BARRIER)
> +#define PPC_RELEASE_BARRIER	 stringify_in_c(LWSYNC) "\n"
> +#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(LWSYNC) "\n"
> +#define PPC_ATOMIC_EXIT_BARRIER	 "\n" stringify_in_c(sync) "\n"
>  #else
>  #define PPC_ACQUIRE_BARRIER
>  #define PPC_RELEASE_BARRIER
> +#define PPC_ATOMIC_ENTRY_BARRIER
> +#define PPC_ATOMIC_EXIT_BARRIER
>  #endif
> 
>  #endif /* __KERNEL__ */
> 
> 

^ permalink raw reply

* RE: [PATCH 3/5,v3] powerpc/85xx: Update SRIO device tree nodes
From: Bounine, Alexandre @ 2011-11-16 13:40 UTC (permalink / raw)
  To: Liu Gang, linuxppc-dev; +Cc: r58472, r61911, linux-kernel, akpm, B11780
In-Reply-To: <1321099352-21462-3-git-send-email-Gang.Liu@freescale.com>

> -----Original Message-----
> From: Liu Gang [mailto:Gang.Liu@freescale.com]
> Sent: Saturday, November 12, 2011 7:03 AM
> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com;
Kumar
> Gala
> Subject: [PATCH 3/5,v3] powerpc/85xx: Update SRIO device tree nodes
>
> From: Kumar Gala <galak@kernel.crashing.org>
>
> Update all dts files that support SRIO controllers to match the new
> fsl,srio device tree binding.
>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  arch/powerpc/boot/dts/mpc8568mds.dts   |   66
++++++++++++++++++++++--
> -----
>  arch/powerpc/boot/dts/mpc8569mds.dts   |   72
> +++++++++++++++++++++++++------
>  arch/powerpc/boot/dts/mpc8641_hpcn.dts |   69
> ++++++++++++++++++++++++-------
>  arch/powerpc/boot/dts/p2041rdb.dts     |   11 +++++
>  arch/powerpc/boot/dts/p2041si.dtsi     |   20 +++++++++
>  arch/powerpc/boot/dts/p3041ds.dts      |   11 +++++
>  arch/powerpc/boot/dts/p3041si.dtsi     |   26 ++++++++---
>  arch/powerpc/boot/dts/p4080ds.dts      |   12 ++++-
>  arch/powerpc/boot/dts/p4080si.dtsi     |   64
+++++++++++++++++++++++-
> ----
>  arch/powerpc/boot/dts/p5020ds.dts      |   11 +++++
>  arch/powerpc/boot/dts/p5020si.dtsi     |   26 ++++++++---
>  11 files changed, 314 insertions(+), 74 deletions(-)
>

Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

^ permalink raw reply

* RE: [PATCH 2/5, v3] fsl-rio: Add two ports and rapidio message units support
From: Bounine, Alexandre @ 2011-11-16 13:39 UTC (permalink / raw)
  To: Liu Gang, linuxppc-dev
  Cc: Jin Qing, r58472, linux-kernel, r61911, akpm, B11780
In-Reply-To: <1321099352-21462-2-git-send-email-Gang.Liu@freescale.com>

> -----Original Message-----
> From: linuxppc-dev-bounces+alexandre.bounine=idt.com@lists.ozlabs.org
> [mailto:linuxppc-dev-
> bounces+alexandre.bounine=idt.com@lists.ozlabs.org] On Behalf Of Liu
> Gang
> Sent: Saturday, November 12, 2011 7:02 AM
> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
> Cc: Jin Qing; r58472@freescale.com; r61911@freescale.com; linux-
> kernel@vger.kernel.org; Liu Gang; akpm@linux-foundation.org;
> B11780@freescale.com
> Subject: [PATCH 2/5, v3] fsl-rio: Add two ports and rapidio message
> units support
>
> Usually, freescale rapidio endpoint can support one or two 1x or 4X
> LP-Serial link interfaces, and rapidio message transactions can be
> implemented by two message units. This adds the support of two
> rapidio ports and initializes message unit 0 and message unit 1. And
> these ports and message units can work simultaneously.
>
> Signed-off-by: Li Yang <leoli@freescale.com>
> Signed-off-by: Jin Qing <b24347@freescale.com>
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  arch/powerpc/sysdev/fsl_rio.c |  391 +++++++++++++++++++++-----------
>  arch/powerpc/sysdev/fsl_rio.h |   75 ++++++-
>  arch/powerpc/sysdev/fsl_rmu.c |  502
++++++++++++++++++---------------
> --------
>  3 files changed, 545 insertions(+), 423 deletions(-)
>

Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

^ permalink raw reply

* RE: [PATCH 1/5, v3] fsl-rio: Split rio driver into two parts, RapidIO endpoint and message unit
From: Bounine, Alexandre @ 2011-11-16 13:36 UTC (permalink / raw)
  To: Liu Gang, linuxppc-dev
  Cc: r58472, r61911, linux-kernel, Lian Minghuan, akpm, B11780
In-Reply-To: <1321099352-21462-1-git-send-email-Gang.Liu@freescale.com>

> -----Original Message-----
> From: Liu Gang [mailto:Gang.Liu@freescale.com]
> Sent: Saturday, November 12, 2011 7:02 AM
> To: linuxppc-dev@lists.ozlabs.org; Bounine, Alexandre
> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org;
> r58472@freescale.com; B11780@freescale.com; r61911@freescale.com; Liu
> Gang; Lian Minghuan; Kumar Gala
> Subject: [PATCH 1/5,v3] fsl-rio: Split rio driver into two parts,
> RapidIO endpoint and message unit
>
> The Freescale PowerPC RapidIO controller consists of a RapidIO
endpoint
> and
> a RapidIO message unit(RMU). Or use RapidIO message manager(RMan) to
> replace the RMU in DPAA architecture. Therefore, we should split the
> code
> into two function modules according to the hardware architecture. Add
> new
> struct for RMU module, and new initialization function to set up RMU
> module. This policy is very conducive to adding new module like RMan,
> or
> adding multi-ports or message units support.
>
> Signed-off-by: Lian Minghuan <Minghuan.Lian@freescale.com>
> Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  arch/powerpc/sysdev/Makefile  |    2 +-
>  arch/powerpc/sysdev/fsl_rio.c | 1152
+--------------------------------
> -------
>  arch/powerpc/sysdev/fsl_rio.h |   78 +++
>  arch/powerpc/sysdev/fsl_rmu.c | 1163
> +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 1267 insertions(+), 1128 deletions(-)
>  create mode 100644 arch/powerpc/sysdev/fsl_rio.h
>  create mode 100644 arch/powerpc/sysdev/fsl_rmu.c
>

Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>

^ permalink raw reply

* [PATCH v2 5/7] fsl_pmc: update device bindings
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

From: Li Yang <leoli@freescale.com>

Signed-off-by: Li Yang <leoli@freescale.com>
---
 .../devicetree/bindings/powerpc/fsl/pmc.txt        |   63 +++++++++++--------
 1 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/pmc.txt b/Documentation/devicetree/bindings/powerpc/fsl/pmc.txt
index 07256b7..d84b4f8 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/pmc.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/pmc.txt
@@ -9,22 +9,27 @@ Properties:
 
   "fsl,mpc8548-pmc" should be listed for any chip whose PMC is
   compatible.  "fsl,mpc8536-pmc" should also be listed for any chip
-  whose PMC is compatible, and implies deep-sleep capability.
+  whose PMC is compatible, and implies deep-sleep capability and
+  wake on user defined packet(wakeup on ARP).
+
+  "fsl,p1022-pmc" should be listed for any chip whose PMC is
+  compatible, and implies lossless Ethernet capability during sleep.
 
   "fsl,mpc8641d-pmc" should be listed for any chip whose PMC is
   compatible; all statements below that apply to "fsl,mpc8548-pmc" also
   apply to "fsl,mpc8641d-pmc".
 
   Compatibility does not include bit assignments in SCCR/PMCDR/DEVDISR; these
-  bit assignments are indicated via the sleep specifier in each device's
-  sleep property.
+  bit assignments are indicated via the clock nodes.  A device that has a
+  controllable clock source should have a "clk-handle" property pointing
+  to the clock node.
 
 - reg: For devices compatible with "fsl,mpc8349-pmc", the first resource
   is the PMC block, and the second resource is the Clock Configuration
   block.
 
-  For devices compatible with "fsl,mpc8548-pmc", the first resource
-  is a 32-byte block beginning with DEVDISR.
+  For devices compatible with "fsl,mpc8548-pmc", the second resource
+  is a 32-byte block beginning with DEVDISR if supported.
 
 - interrupts: For "fsl,mpc8349-pmc"-compatible devices, the first
   resource is the PMC block interrupt.
@@ -33,31 +38,35 @@ Properties:
   this is a phandle to an "fsl,gtm" node on which timer 4 can be used as
   a wakeup source from deep sleep.
 
-Sleep specifiers:
+Clock nodes:
+The clock nodes are to describe the masks in PM controller registers for each
+soc clock.
+- fsl,pmcdr-mask: For "fsl,mpc8548-pmc"-compatible devices, the mask will be
+  ORed into PMCDR before suspend if the device using this clock is the wake-up
+  source and needs to be running during low power mode; clear the mask if
+  otherwise.
 
-  fsl,mpc8349-pmc: Sleep specifiers consist of one cell.  For each bit
-  that is set in the cell, the corresponding bit in SCCR will be saved
-  and cleared on suspend, and restored on resume.  This sleep controller
-  supports disabling and resuming devices at any time.
+- fsl,sccr-mask: For "fsl,mpc8349-pmc"-compatible devices, the corresponding
+  bit specified by the mask in SCCR will be saved and cleared on suspend, and
+  restored on resume.
 
-  fsl,mpc8536-pmc: Sleep specifiers consist of three cells, the third of
-  which will be ORed into PMCDR upon suspend, and cleared from PMCDR
-  upon resume.  The first two cells are as described for fsl,mpc8578-pmc.
-  This sleep controller only supports disabling devices during system
-  sleep, or permanently.
-
-  fsl,mpc8548-pmc: Sleep specifiers consist of one or two cells, the
-  first of which will be ORed into DEVDISR (and the second into
-  DEVDISR2, if present -- this cell should be zero or absent if the
-  hardware does not have DEVDISR2) upon a request for permanent device
-  disabling.  This sleep controller does not support configuring devices
-  to disable during system sleep (unless supported by another compatible
-  match), or dynamically.
+- fsl,devdisr-mask: Contain one or two cells, depending on the availability of
+  DEVDISR2 register.  For compatible devices, the mask will be ORed into DEVDISR
+  or DEVDISR2 when the clock should be permanently disabled.
 
 Example:
 
-	power@b00 {
-		compatible = "fsl,mpc8313-pmc", "fsl,mpc8349-pmc";
-		reg = <0xb00 0x100 0xa00 0x100>;
-		interrupts = <80 8>;
+	power@e0070 {
+		compatible = "fsl,mpc8536-pmc", "fsl,mpc8548-pmc";
+		reg = <0xe0070 0x20>;
+
+		etsec1_clk: soc-clk@24 {
+			fsl,pmcdr-mask = <0x00000080>;
+		};
+		etsec2_clk: soc-clk@25 {
+			fsl,pmcdr-mask = <0x00000040>;
+		};
+		etsec3_clk: soc-clk@26 {
+			fsl,pmcdr-mask = <0x00000020>;
+		};
 	};
-- 
1.6.4.1

^ permalink raw reply related

* [PATCH v2 4/7] powerpc/85xx: add support to JOG feature using cpufreq interface
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

From: Li Yang <leoli@freescale.com>

Some 85xx silicons like MPC8536 and P1022 have the JOG PM feature.

The patch adds support for changing the CPU frequency using the standard
cpufreq interface. All core PLL ratios are supported: the CORE to CCB
ratio can be 1:1 (except on MPC8536), 3:2, 2:1, 5:2, 3:1, 7:2 or 4:1.

Signed-off-by: Dave Liu <daveliu@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jerry Huang <Chang-Ming.Huang@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
Changes for v2:
 - rework set_pll(). wakeup all cores before issuing a jog request.
 - use the platform driver framework

 arch/powerpc/platforms/85xx/Makefile      |    1 +
 arch/powerpc/platforms/85xx/cpufreq-jog.c |  322 +++++++++++++++++++++++++++++
 arch/powerpc/platforms/Kconfig            |    8 +
 3 files changed, 331 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/platforms/85xx/cpufreq-jog.c

diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile
index cec54c7..49a865a 100644
--- a/arch/powerpc/platforms/85xx/Makefile
+++ b/arch/powerpc/platforms/85xx/Makefile
@@ -3,6 +3,7 @@
 #
 obj-$(CONFIG_SMP) += smp.o
 obj-$(CONFIG_SUSPEND)	+= sleep.o
+obj-$(CONFIG_MPC85xx_CPUFREQ) += cpufreq-jog.o
 
 obj-$(CONFIG_MPC8540_ADS) += mpc85xx_ads.o
 obj-$(CONFIG_MPC8560_ADS) += mpc85xx_ads.o
diff --git a/arch/powerpc/platforms/85xx/cpufreq-jog.c b/arch/powerpc/platforms/85xx/cpufreq-jog.c
new file mode 100644
index 0000000..efe62b9
--- /dev/null
+++ b/arch/powerpc/platforms/85xx/cpufreq-jog.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc.
+ * Author: Dave Liu <daveliu@freescale.com>
+ * Modifier: Chenhui Zhao <chenhui.zhao@freescale.com>
+ *
+ * The cpufreq driver is for Freescale 85xx processor,
+ * based on arch/powerpc/platforms/cell/cbe_cpufreq.c
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *	Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/cpufreq.h>
+#include <linux/of_platform.h>
+
+#include <asm/prom.h>
+#include <asm/time.h>
+#include <asm/reg.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+
+#include <sysdev/fsl_soc.h>
+
+static DEFINE_MUTEX(mpc85xx_switch_mutex);
+static void __iomem *guts;
+static u32 sysfreq, threshold_freq;
+static struct cpufreq_frequency_table *mpc85xx_freqs;
+
+static struct cpufreq_frequency_table mpc8536_freqs_table[] = {
+	{3,	0},
+	{4,	0},
+	{5,	0},
+	{6,	0},
+	{7,	0},
+	{8,	0},
+	{0,	CPUFREQ_TABLE_END},
+};
+
+static struct cpufreq_frequency_table p1022_freqs_table[] = {
+	{2,	0},
+	{3,	0},
+	{4,	0},
+	{5,	0},
+	{6,	0},
+	{7,	0},
+	{8,	0},
+	{0,	CPUFREQ_TABLE_END},
+};
+
+#define FREQ_533MHz	533340000
+#define FREQ_800MHz	800000000
+
+#define CORE_RATIO_BITS		8
+#define CORE_RATIO_MASK		0x3f
+#define CORE0_RATIO_SHIFT	16
+
+#define PORPLLSR	0x0
+
+#define PMJCR		0x7c
+#define PMJCR_CORE0_SPD_MASK	0x00001000
+
+#define POWMGTCSR	0x80
+#define POWMGTCSR_LOSSLESS_MASK	0x00400000
+#define POWMGTCSR_JOG_MASK	0x00200000
+#define POWMGTCSR_CORE0_IRQ_MSK	0x80000000
+#define POWMGTCSR_CORE0_CI_MSK	0x40000000
+#define POWMGTCSR_CORE0_DOZING	0x00000008
+#define POWMGTCSR_CORE0_NAPPING	0x00000004
+
+/*
+ * hardware specific functions
+ */
+static int get_pll(int hw_cpu)
+{
+	int ret, shift;
+	u32 cur_pll = in_be32(guts + PORPLLSR);
+
+	shift = hw_cpu * CORE_RATIO_BITS + CORE0_RATIO_SHIFT;
+	ret = (cur_pll >> shift) & CORE_RATIO_MASK;
+	return ret;
+}
+
+static int set_pll(unsigned int cpu, unsigned int pll)
+{
+	void *powersave = NULL;
+	int hw_cpu = get_hard_smp_processor_id(cpu);
+	int shift, i;
+	u32 corefreq, val;
+	u32 mask;
+	unsigned long flags;
+	int ret = 0;
+
+	if (pll == get_pll(hw_cpu))
+		return 0;
+
+	shift = hw_cpu * CORE_RATIO_BITS + CORE0_RATIO_SHIFT;
+	val = (pll & CORE_RATIO_MASK) << shift;
+
+	corefreq = sysfreq * pll / 2;
+	/*
+	 * Set the COREx_SPD bit if the requested core frequency
+	 * is larger than the threshold frequency.
+	 */
+	if (corefreq > threshold_freq)
+		val |= PMJCR_CORE0_SPD_MASK << hw_cpu;
+
+	mask = (CORE_RATIO_MASK << shift) | (PMJCR_CORE0_SPD_MASK << hw_cpu);
+	clrsetbits_be32(guts + PMJCR, mask, val);
+
+	/* readback to sync write */
+	val = in_be32(guts + PMJCR);
+
+	local_irq_save(flags);
+	/*
+	 * A Jog request can not be asserted when any core is in a low power
+	 * state. Before executing a jog request, any core which is in
+	 * a low power state must be waked by a interrupt.
+	 */
+	if (mpc85xx_freqs == p1022_freqs_table) {
+		powersave = ppc_md.power_save;
+		ppc_md.power_save = NULL;
+		wmb();
+		val = in_be32(guts + POWMGTCSR);
+		for_each_online_cpu(i) {
+			if (val & ((POWMGTCSR_CORE0_DOZING |
+					POWMGTCSR_CORE0_NAPPING) << (i * 2)))
+				smp_send_reschedule(i);
+		}
+	}
+	setbits32(guts + POWMGTCSR, POWMGTCSR_JOG_MASK);
+
+	if (powersave) {
+		ppc_md.power_save = powersave;
+		wmb();
+	}
+
+	local_irq_restore(flags);
+
+	/* verify */
+	if (!spin_event_timeout(get_pll(hw_cpu) == pll, 10000, 10)) {
+		pr_err("%s: Fail to switch the core frequency. "
+			"The current PLL of core %d is %d instead of %d.\n",
+				__func__, hw_cpu, get_pll(hw_cpu), pll);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * cpufreq functions
+ */
+static int mpc85xx_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	unsigned int i, cur_pll;
+	int hw_cpu = get_hard_smp_processor_id(policy->cpu);
+
+	if (!cpu_present(policy->cpu))
+		return -EINVAL;
+
+	/* the latency of a transition, the unit is ns */
+	policy->cpuinfo.transition_latency = 2000;
+
+	cur_pll = get_pll(hw_cpu);
+
+	/* initialize frequency table */
+	pr_debug("core%d frequency table:\n", hw_cpu);
+	for (i = 0; mpc85xx_freqs[i].frequency != CPUFREQ_TABLE_END; i++) {
+		/* The frequency unit is kHz. */
+		mpc85xx_freqs[i].frequency =
+				(sysfreq * mpc85xx_freqs[i].index / 2) / 1000;
+		pr_debug("%d: %dkHz\n", i, mpc85xx_freqs[i].frequency);
+
+		if (mpc85xx_freqs[i].index == cur_pll)
+			policy->cur = mpc85xx_freqs[i].frequency;
+	}
+	pr_debug("current pll is at %d, and core freq is%d\n",
+					cur_pll, policy->cur);
+
+	cpufreq_frequency_table_get_attr(mpc85xx_freqs, policy->cpu);
+
+	/*
+	 * This ensures that policy->cpuinfo_min
+	 * and policy->cpuinfo_max are set correctly.
+	 */
+	return cpufreq_frequency_table_cpuinfo(policy, mpc85xx_freqs);
+}
+
+static int mpc85xx_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+static int mpc85xx_cpufreq_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, mpc85xx_freqs);
+}
+
+static int mpc85xx_cpufreq_target(struct cpufreq_policy *policy,
+			      unsigned int target_freq,
+			      unsigned int relation)
+{
+	struct cpufreq_freqs freqs;
+	unsigned int new;
+	int ret = 0;
+
+	cpufreq_frequency_table_target(policy,
+				       mpc85xx_freqs,
+				       target_freq,
+				       relation,
+				       &new);
+
+	freqs.old = policy->cur;
+	freqs.new = mpc85xx_freqs[new].frequency;
+	freqs.cpu = policy->cpu;
+
+	mutex_lock(&mpc85xx_switch_mutex);
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+	ret = set_pll(policy->cpu, mpc85xx_freqs[new].index);
+	if (!ret) {
+		pr_info("cpufreq: Setting core%d frequency to %d kHz and " \
+			 "PLL ratio to %d:2\n",
+			 policy->cpu,
+			 mpc85xx_freqs[new].frequency,
+			 mpc85xx_freqs[new].index);
+
+		ppc_proc_freq = freqs.new * 1000ul;
+	}
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	mutex_unlock(&mpc85xx_switch_mutex);
+
+	return ret;
+}
+
+static struct cpufreq_driver mpc85xx_cpufreq_driver = {
+	.verify		= mpc85xx_cpufreq_verify,
+	.target		= mpc85xx_cpufreq_target,
+	.init		= mpc85xx_cpufreq_cpu_init,
+	.exit		= mpc85xx_cpufreq_cpu_exit,
+	.name		= "mpc85xx-JOG",
+	.owner		= THIS_MODULE,
+	.flags		= CPUFREQ_CONST_LOOPS,
+};
+
+static int mpc85xx_job_probe(struct platform_device *ofdev)
+{
+	struct device_node *np = ofdev->dev.of_node;
+
+	if (of_device_is_compatible(np, "fsl,mpc8536-guts")) {
+		threshold_freq = FREQ_800MHz;
+		mpc85xx_freqs = mpc8536_freqs_table;
+	} else if (of_device_is_compatible(np, "fsl,p1022-guts")) {
+		threshold_freq = FREQ_533MHz;
+		mpc85xx_freqs = p1022_freqs_table;
+	}
+
+	sysfreq = fsl_get_sys_freq();
+
+	guts = of_iomap(np, 0);
+	if (guts == NULL)
+		return -ENOMEM;
+
+	pr_info("Freescale MPC85xx CPU frequency switching(JOG) driver\n");
+
+	return cpufreq_register_driver(&mpc85xx_cpufreq_driver);
+}
+
+static int mpc85xx_jog_remove(struct platform_device *ofdev)
+{
+	iounmap(guts);
+	cpufreq_unregister_driver(&mpc85xx_cpufreq_driver);
+
+	return 0;
+}
+
+static struct of_device_id mpc85xx_jog_ids[] = {
+	{ .compatible = "fsl,mpc8536-guts", },
+	{ .compatible = "fsl,p1022-guts", },
+	{}
+};
+
+static struct platform_driver mpc85xx_jog_driver = {
+	.driver = {
+		.name = "mpc85xx_cpufreq_jog",
+		.owner = THIS_MODULE,
+		.of_match_table = mpc85xx_jog_ids,
+	},
+	.probe = mpc85xx_job_probe,
+	.remove = mpc85xx_jog_remove,
+};
+
+static int __init mpc85xx_jog_init(void)
+{
+	return platform_driver_register(&mpc85xx_jog_driver);
+}
+
+static void __exit mpc85xx_jog_exit(void)
+{
+	platform_driver_unregister(&mpc85xx_jog_driver);
+}
+
+module_init(mpc85xx_jog_init);
+module_exit(mpc85xx_jog_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dave Liu <daveliu@freescale.com>");
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e458872..63bd32a 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -200,6 +200,14 @@ config CPU_FREQ_PMAC64
 	  This adds support for frequency switching on Apple iMac G5,
 	  and some of the more recent desktop G5 machines as well.
 
+config MPC85xx_CPUFREQ
+	bool "Support for Freescale MPC85xx CPU freq"
+	depends on PPC_85xx && PPC32
+	select CPU_FREQ_TABLE
+	help
+	  This adds support for frequency switching on Freescale MPC85xx,
+	  currently including P1022 and MPC8536.
+
 config PPC_PASEMI_CPUFREQ
 	bool "Support for PA Semi PWRficient"
 	depends on PPC_PASEMI
-- 
1.6.4.1

^ permalink raw reply related

* [PATCH v2 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Zhao Chenhui @ 2011-11-16  9:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: scottwood
In-Reply-To: <1321437344-19253-1-git-send-email-chenhui.zhao@freescale.com>

Add APIs for setting a wakeup source and lossless Ethernet in low power modes.
These APIs can be used by the wake-on-packet feature.

Signed-off-by: Dave Liu <daveliu@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
Changes for v2:
 - rename functions
 - add pmc_flag

 arch/powerpc/sysdev/fsl_pmc.c |   72 ++++++++++++++++++++++++++++++++++++++++-
 arch/powerpc/sysdev/fsl_soc.h |    9 +++++
 2 files changed, 80 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c
index d6c65a7..6a2f8b4 100644
--- a/arch/powerpc/sysdev/fsl_pmc.c
+++ b/arch/powerpc/sysdev/fsl_pmc.c
@@ -40,13 +40,83 @@ static unsigned int pmc_flag;
 
 #define PMC_SLEEP	0x1
 #define PMC_DEEP_SLEEP	0x2
+#define PMC_LOSSLESS	0x4
 
 #define POWMGTCSR_SLP_MASK	0x00020000
+#define POWMGTCSR_LOSSLESS_MASK	0x00400000
 
 /* Cast the ccsrbar to 64-bit parameter so that the assembly
  * code can be compatible with both 32-bit & 36-bit */
 extern void mpc85xx_enter_deep_sleep(u64 ccsrbar);
 
+#ifdef CONFIG_FSL_PMC
+/**
+ * mpc85xx_pmc_set_wake - enable OF device as wakeup event source
+ * @pdev: platform device affected
+ * @enable: True to enable event generation; false to disable
+ *
+ * Looks up the node referenced by the device's "clk-handle" property and
+ * uses its "fsl,pmcdr-mask" value to clear (enable) or set (disable) the
+ * matching bits in the pmcdr register of the PMC block.
+ *
+ * RETURN VALUE:
+ * 0 is returned on success
+ * -ENODEV is returned if the PMC registers are not available
+ * -EINVAL is returned if device is not supposed to wake up the system, or
+ * if "clk-handle" or "fsl,pmcdr-mask" cannot be read from the device tree
+ */
+int mpc85xx_pmc_set_wake(struct platform_device *pdev, bool enable)
+{
+	struct device_node *clk_node;
+	u32 mask;
+	int rc = 0;
+
+	if (pmc_regs == NULL) {
+		pr_err("%s: PMC is unavailable\n", __func__);
+		return -ENODEV;
+	}
+
+	/* Refuse to arm a device that is not allowed to wake the system. */
+	if (enable && !device_may_wakeup(&pdev->dev))
+		return -EINVAL;
+
+	clk_node = of_parse_phandle(pdev->dev.of_node, "clk-handle", 0);
+	if (clk_node == NULL)
+		return -EINVAL;
+
+	if (of_property_read_u32(clk_node, "fsl,pmcdr-mask", &mask))
+		rc = -EINVAL;
+	else if (enable)
+		/* clear to enable clock in low power mode */
+		clrbits32(&pmc_regs->pmcdr, mask);
+	else
+		setbits32(&pmc_regs->pmcdr, mask);
+
+	/* Drop the reference taken by of_parse_phandle(). */
+	of_node_put(clk_node);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mpc85xx_pmc_set_wake);
+
+/**
+ * mpc85xx_pmc_set_lossless_ethernet - enable lossless ethernet
+ * in (deep) sleep mode
+ * @enable: non-zero to enable lossless ethernet; zero to disable
+ *
+ * Does nothing unless pmc_flag has PMC_LOSSLESS set.
+ */
+void mpc85xx_pmc_set_lossless_ethernet(int enable)
+{
+	if (!(pmc_flag & PMC_LOSSLESS))
+		return;
+	if (enable)
+		setbits32(&pmc_regs->powmgtcsr, POWMGTCSR_LOSSLESS_MASK);
+	else
+		clrbits32(&pmc_regs->powmgtcsr, POWMGTCSR_LOSSLESS_MASK);
+}
+EXPORT_SYMBOL_GPL(mpc85xx_pmc_set_lossless_ethernet);
+#endif
+
 static int pmc_suspend_enter(suspend_state_t state)
 {
 	int ret = 0;
@@ -120,7 +190,7 @@ static int pmc_probe(struct platform_device *pdev)
 		pmc_flag |= PMC_DEEP_SLEEP;
 
 	if (of_device_is_compatible(np, "fsl,p1022-pmc"))
-		pmc_flag |= PMC_DEEP_SLEEP;
+		pmc_flag |= PMC_DEEP_SLEEP | PMC_LOSSLESS;
 
 	suspend_set_ops(&pmc_suspend_ops);
 
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index c6d0073..3422b0d 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #include <asm/mmu.h>
+#include <linux/platform_device.h>
 
 struct spi_device;
 
@@ -21,6 +22,14 @@ struct device_node;
 
 extern void fsl_rstcr_restart(char *cmd);
 
+#ifdef CONFIG_FSL_PMC
+int mpc85xx_pmc_set_wake(struct platform_device *pdev, bool enable);
+void mpc85xx_pmc_set_lossless_ethernet(int enable);
+#else /* !CONFIG_FSL_PMC: typed no-op stubs so callers need no #ifdef */
+static inline int mpc85xx_pmc_set_wake(struct platform_device *pdev, bool enable) { return -ENODEV; }
+static inline void mpc85xx_pmc_set_lossless_ethernet(int enable) { }
+#endif /* CONFIG_FSL_PMC */
+
 #if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
 
 /* The different ports that the DIU can be connected to */
-- 
1.6.4.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox