LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH -V4 24/25] powerpc: Optimize hugepage invalidate
From: Aneesh Kumar K.V @ 2013-03-20 19:35 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1363808110-25748-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

Hugepage invalidate involves invalidating multiple hpte entries.
Optimize the operation using H_BULK_REMOVE on lpar platforms.
On native, reduce the number of TLB flushes.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/machdep.h    |    3 +
 arch/powerpc/mm/hash_native_64.c      |   78 ++++++++++++++++++++
 arch/powerpc/mm/pgtable.c             |   13 +++-
 arch/powerpc/platforms/pseries/lpar.c |  126 +++++++++++++++++++++++++++++++--
 4 files changed, 210 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 6cee6e0..3bc7816 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -56,6 +56,9 @@ struct machdep_calls {
 	void            (*hpte_removebolted)(unsigned long ea,
 					     int psize, int ssize);
 	void		(*flush_hash_range)(unsigned long number, int local);
+	void		(*hugepage_invalidate)(struct mm_struct *mm,
+					       unsigned char *hpte_slot_array,
+					       unsigned long addr, int psize);
 
 	/* special for kexec, to be called in real mode, linear mapping is
 	 * destroyed as well */
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ac84fa6..59f29bf 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -450,6 +450,83 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	local_irq_restore(flags);
 }
 
+static void native_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i;
+	int lock_tlbie;
+	struct hash_pte *hptep;
+	int actual_psize = MMU_PAGE_16M;
+	unsigned int max_hpte_count, valid;
+	unsigned long flags, s_addr = addr;
+	unsigned long hpte_v, want_v, shift;
+	unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
+
+	local_irq_save(flags);
+	for (i = 0; i < max_hpte_count; i++) {
+		/*
+		 * 8 bits per each hpte entries
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_slot_array[i] & 0x1;
+		if (!valid)
+			continue;
+		hidx =  hpte_slot_array[i]  >> 1;
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		hptep = htab_address + slot;
+		want_v = hpte_encode_avpn(vpn, psize, ssize);
+		native_lock_hpte(hptep);
+		hpte_v = hptep->v;
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+			native_unlock_hpte(hptep);
+		else
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+	}
+	/*
+	 * Since this is a hugepage, we just need a single tlbie.
+	 * use the last vpn.
+	 */
+	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	if (lock_tlbie)
+		raw_spin_lock(&native_tlbie_lock);
+
+	asm volatile("ptesync":::"memory");
+	__tlbie(vpn, psize, actual_psize, ssize);
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+	if (lock_tlbie)
+		raw_spin_unlock(&native_tlbie_lock);
+
+	local_irq_restore(flags);
+}
+
+
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -678,4 +755,5 @@ void __init hpte_init_native(void)
 	ppc_md.hpte_remove	= native_hpte_remove;
 	ppc_md.hpte_clear_all	= native_hpte_clear;
 	ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index fbff062..386cab8 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -433,6 +433,7 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 {
 	int ssize, i;
 	unsigned long s_addr;
+	int max_hpte_count;
 	unsigned int psize, valid;
 	unsigned char *hpte_slot_array;
 	unsigned long hidx, vpn, vsid, hash, shift, slot;
@@ -446,12 +447,18 @@ void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 	 * second half of the PMD
 	 */
 	hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
-
 	/* get the base page size */
 	psize = get_slice_psize(mm, s_addr);
-	shift = mmu_psize_defs[psize].shift;
 
-	for (i = 0; i < HUGE_PAGE_SIZE/(1ul << shift); i++) {
+	if (ppc_md.hugepage_invalidate)
+		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+						  s_addr, psize);
+	/*
+	 * No bulk hpte removal support, invalidate each entry
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
+	for (i = 0; i < max_hpte_count; i++) {
 		/*
 		 * 8 bits per each hpte entries
 		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 3daced3..5fcc621 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -45,6 +45,13 @@
 #include "plpar_wrappers.h"
 #include "pseries.h"
 
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST	0x4000000000000000UL
+#define HBR_RESPONSE	0x8000000000000000UL
+#define HBR_END		0xc000000000000000UL
+#define HBR_AVPN	0x0200000000000000UL
+#define HBR_ANDCOND	0x0100000000000000UL
+
 
 /* in hvCall.S */
 EXPORT_SYMBOL(plpar_hcall);
@@ -339,6 +346,117 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	BUG_ON(lpar_rc != H_SUCCESS);
 }
 
+/*
+ * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+					     unsigned long *vpn, int count,
+					     int psize, int ssize)
+{
+	unsigned long param[9];
+	int i = 0, pix = 0, rc;
+	unsigned long flags = 0;
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	for (i = 0; i < count; i++) {
+
+		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize,
+						     ssize, 0);
+		} else {
+			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+			pix += 2;
+			if (pix == 8) {
+				rc = plpar_hcall9(H_BULK_REMOVE, param,
+						  param[0], param[1], param[2],
+						  param[3], param[4], param[5],
+						  param[6], param[7]);
+				BUG_ON(rc != H_SUCCESS);
+				pix = 0;
+			}
+		}
+	}
+	if (pix) {
+		param[pix] = HBR_END;
+		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+				  param[2], param[3], param[4], param[5],
+				  param[6], param[7]);
+		BUG_ON(rc != H_SUCCESS);
+	}
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i, index = 0;
+	unsigned long s_addr = addr;
+	unsigned int max_hpte_count, valid;
+	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HUGE_PAGE_SIZE/(1ul << shift);
+
+	for (i = 0; i < max_hpte_count; i++) {
+		/*
+		 * 8 bits per each hpte entries
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_slot_array[i] & 0x1;
+		if (!valid)
+			continue;
+		hidx =  hpte_slot_array[i]  >> 1;
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		slot_array[index] = slot;
+		vpn_array[index] = vpn;
+		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+			/*
+			 * Now do a bulk invalidate
+			 */
+			__pSeries_lpar_hugepage_invalidate(slot_array,
+							   vpn_array,
+							   PPC64_HUGE_HPTE_BATCH,
+							   psize, ssize);
+			index = 0;
+		} else
+			index++;
+	}
+	if (index)
+		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+						   index, psize, ssize);
+}
+
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 					   int psize, int ssize)
 {
@@ -354,13 +472,6 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0);
 }
 
-/* Flag bits for H_BULK_REMOVE */
-#define HBR_REQUEST	0x4000000000000000UL
-#define HBR_RESPONSE	0x8000000000000000UL
-#define HBR_END		0xc000000000000000UL
-#define HBR_AVPN	0x0200000000000000UL
-#define HBR_ANDCOND	0x0100000000000000UL
-
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
@@ -446,6 +557,7 @@ void __init hpte_init_lpar(void)
 	ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
 	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
 	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+	ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
 #ifdef CONFIG_PPC_SMLPAR
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V4 20/25] powerpc/THP: Add code to handle HPTE faults for large pages
From: Aneesh Kumar K.V @ 2013-03-20 19:35 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1363808110-25748-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We now have pmd entries covering a 16MB range. To implement THP on powerpc,
we double the size of the PMD. The second half is used to deposit the pgtable (PTE page).
We also use the deposited PTE page for tracking the HPTE information. The information
includes [ secondary group | 3 bit hidx | valid ]. We use one byte per HPTE entry.
With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need
4096 entries. Both will fit in a 4K PTE page.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/mmu-hash64.h    |    5 +
 arch/powerpc/include/asm/pgtable-ppc64.h |   31 +----
 arch/powerpc/kernel/io-workarounds.c     |    3 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |    2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |    4 +-
 arch/powerpc/mm/Makefile                 |    1 +
 arch/powerpc/mm/hash_utils_64.c          |   16 ++-
 arch/powerpc/mm/hugepage-hash64.c        |  185 ++++++++++++++++++++++++++++++
 arch/powerpc/mm/hugetlbpage.c            |   31 ++++-
 arch/powerpc/mm/pgtable.c                |   38 ++++++
 arch/powerpc/mm/tlb_hash64.c             |    5 +-
 arch/powerpc/perf/callchain.c            |    2 +-
 arch/powerpc/platforms/pseries/eeh.c     |    5 +-
 13 files changed, 286 insertions(+), 42 deletions(-)
 create mode 100644 arch/powerpc/mm/hugepage-hash64.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index e187254..a74a3de 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -322,6 +322,11 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, int local, int ssize,
 		     unsigned int shift, unsigned int mmu_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __hash_page_thp(unsigned long ea, unsigned long access,
+			   unsigned long vsid, pmd_t *pmdp, unsigned long trap,
+			   int local, int ssize, unsigned int psize);
+#endif
 extern void hash_failure_debug(unsigned long ea, unsigned long access,
 			       unsigned long vsid, unsigned long trap,
 			       int ssize, int psize, int lpsize,
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index d4e845c..9b81283 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -345,39 +345,18 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
-/*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory.  If not found, it returns zero.
- */
-static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt = NULL;
-
-	pg = pgdir + pgd_index(ea);
-	if (!pgd_none(*pg)) {
-		pu = pud_offset(pg, ea);
-		if (!pud_none(*pu)) {
-			pm = pmd_offset(pu, ea);
-			if (pmd_present(*pm))
-				pt = pte_offset_kernel(pm, ea);
-		}
-	}
-	return pt;
-}
-
+pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *thp);
 #ifdef CONFIG_HUGETLB_PAGE
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				 unsigned *shift);
+				 unsigned *shift, unsigned int *hugepage);
 #else
 static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-					       unsigned *shift)
+					       unsigned *shift,
+					       unsigned int *hugepage)
 {
 	if (shift)
 		*shift = 0;
-	return find_linux_pte(pgdir, ea);
+	return find_linux_pte(pgdir, ea, hugepage);
 }
 #endif /* !CONFIG_HUGETLB_PAGE */
 
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7..a9c904f 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -70,7 +70,8 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
 			return NULL;
 
-		ptep = find_linux_pte(init_mm.pgd, vaddr);
+		/* we won't find hugepages here */
+		ptep = find_linux_pte(init_mm.pgd, vaddr, NULL);
 		if (ptep == NULL)
 			paddr = 0;
 		else
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8cc18ab..4f2a7dc 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -683,7 +683,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 */
 			rcu_read_lock_sched();
 			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-							 hva, NULL);
+							 hva, NULL, NULL);
 			if (ptep && pte_present(*ptep)) {
 				pte = kvmppc_read_update_linux_pte(ptep, 1);
 				if (pte_write(pte))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19c93ba..7c8e1ed 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
 	unsigned long addr = (unsigned long) x;
 	pte_t *p;
 
-	p = find_linux_pte(swapper_pg_dir, addr);
+	p = find_linux_pte(swapper_pg_dir, addr, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
 	/* assume we don't have huge pages in vmalloc space... */
@@ -152,7 +152,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
 	unsigned long ps = *pte_sizep;
 	unsigned int shift;
 
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift, NULL);
 	if (!ptep)
 		return __pte(0);
 	if (shift)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3787b61..997deb4 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -33,6 +33,7 @@ obj-y				+= hugetlbpage.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1f2ebbd..cd3ecd8 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -955,7 +955,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	unsigned long vsid;
 	struct mm_struct *mm;
 	pte_t *ptep;
-	unsigned hugeshift;
+	unsigned hugeshift, hugepage;
 	const struct cpumask *tmp;
 	int rc, user_region = 0, local = 0;
 	int psize, ssize;
@@ -1021,7 +1021,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift, &hugepage);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		return 1;
@@ -1044,6 +1044,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 					ssize, hugeshift, psize);
 #endif /* CONFIG_HUGETLB_PAGE */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (hugepage)
+		return __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+				       trap, local, ssize, psize);
+#endif
+
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
 #else
@@ -1149,7 +1155,11 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
+	/*
+	 * We haven't implemented update_mmu_cache_pmd yet. We get called
+	 * only for non hugepages. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte(pgdir, ea, NULL);
 	if (!ptep)
 		return;
 
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 0000000..3f6140d
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
+ * With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need
+ * 4096 entries. Both will fit in a 4K pgtable_t.
+ */
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+		    pmd_t *pmdp, unsigned long trap, int local, int ssize,
+		    unsigned int psize)
+{
+	unsigned int index, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long rflags, pa, hidx;
+	unsigned long old_pmd, new_pmd;
+	int ret, lpsize = MMU_PAGE_16M;
+	unsigned long vpn, hash, shift, slot;
+
+	/*
+	 * atomically mark the linux large page PMD busy and dirty
+	 */
+	do {
+		old_pmd = pmd_val(*pmdp);
+		/* If PMD busy, retry the access */
+		if (unlikely(old_pmd & PMD_HUGE_BUSY))
+			return 0;
+		/* If PMD permissions don't match, take page fault */
+		if (unlikely(access & ~old_pmd))
+			return 1;
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access
+		 */
+		new_pmd = old_pmd | PMD_HUGE_BUSY | PMD_HUGE_ACCESSED;
+		if (access & _PAGE_RW)
+			new_pmd |= PMD_HUGE_DIRTY;
+	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+					  old_pmd, new_pmd));
+	/*
+	 * PP bits. PMD_HUGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = new_pmd & PMD_HUGE_USER;
+	if ((new_pmd & PMD_HUGE_USER) && !((new_pmd & PMD_HUGE_RW) &&
+					   (new_pmd & PMD_HUGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * PMD_HUGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((new_pmd & PMD_HUGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0 /* FIXME!! */
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+	}
+#endif
+	/*
+	 * Find the slot index details for this ea, using base page size.
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	index = (ea & (HUGE_PAGE_SIZE - 1)) >> shift;
+	BUG_ON(index > 4096);
+
+	vpn = hpt_vpn(ea, vsid, ssize);
+	hash = hpt_hash(vpn, shift, ssize);
+	/*
+	 * The hpte hindex are stored in the pgtable whose address is in the
+	 * second half of the PMD
+	 */
+	hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
+
+	valid = hpte_slot_array[index]  & 0x1;
+	if (unlikely(valid)) {
+		/* update the hpte bits */
+		hidx =  hpte_slot_array[index]  >> 1;
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+					   psize, ssize, local);
+		/*
+		 * We failed to update, try to insert a new entry.
+		 */
+		if (ret == -1) {
+			/*
+			 * large pte is marked busy, so we can be sure
+			 * nobody is looking at hpte_slot_array. hence we can
+			 * safely update this here.
+			 */
+			hpte_slot_array[index] = 0;
+			valid = 0;
+		}
+	}
+
+	if (likely(!valid)) {
+		unsigned long hpte_group;
+
+		/* insert new entry */
+		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear the busy bits and set the hash pte bits */
+		new_pmd = (new_pmd & ~PMD_HUGE_HPTEFLAGS) | PMD_HUGE_HASHPTE;
+
+		/*
+		 * WIMG bits.
+		 * We always have _PAGE_COHERENT enabled for system RAM
+		 */
+		rflags |= _PAGE_COHERENT;
+
+		if (new_pmd & PMD_HUGE_SAO)
+			rflags |= _PAGE_SAO;
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+					  psize, lpsize, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+						  rflags, HPTE_V_SECONDARY,
+						  psize, lpsize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP) & ~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pmd and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*pmdp = __pmd(old_pmd);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   psize, lpsize, old_pmd);
+			return -1;
+		}
+		/*
+		 * large pte is marked busy, so we can be sure
+		 * nobody is looking at hpte_slot_array. hence we can
+		 * safely update this here.
+		 */
+		hpte_slot_array[index] = slot << 1 | 0x1;
+	}
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*pmdp = __pmd(new_pmd & ~PMD_HUGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1a6de0a..7f11fa0 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -67,7 +67,8 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift, unsigned int *hugepage)
 {
 	pgd_t *pg;
 	pud_t *pu;
@@ -77,6 +78,8 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 
 	if (shift)
 		*shift = 0;
+	if (hugepage)
+		*hugepage = 0;
 
 	pg = pgdir + pgd_index(ea);
 	if (is_hugepd(pg)) {
@@ -91,12 +94,24 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			pm = pmd_offset(pu, ea);
 			if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
-			else if (!pmd_none(*pm)) {
+			else if (pmd_large(*pm)) {
+				/* THP page */
+				if (hugepage) {
+					*hugepage = 1;
+					/*
+					 * This should be ok, except for few
+					 * flags. Most of the pte and hugepage
+					 * pmd bits overlap. We don't use the
+					 * returned value as pte_t in the caller.
+					 */
+					return (pte_t *)pm;
+				} else
+					return NULL;
+			} else if (!pmd_none(*pm)) {
 				return pte_offset_kernel(pm, ea);
 			}
 		}
 	}
-
 	if (!hpdp)
 		return NULL;
 
@@ -108,7 +123,8 @@ EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+	/* Only called for HugeTLB pages, hence can ignore THP */
+	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -613,8 +629,11 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	struct page *page;
 	unsigned shift;
 	unsigned long mask;
-
-	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+	/*
+	 * Transparent hugepages are handled by generic code. We can skip them
+	 * here.
+	 */
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift, NULL);
 
 	/* Verify it is a huge page else bail. */
 	if (!ptep || !shift)
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index cf3ca8e..fbff062 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -557,3 +557,41 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory.  If not found, it returns zero.
+ */
+pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *hugepage)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt = NULL;
+
+	if (hugepage)
+		*hugepage = 0;
+	pg = pgdir + pgd_index(ea);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, ea);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, ea);
+			if (pmd_large(*pm)) {
+				/* THP page */
+				if (hugepage) {
+					*hugepage = 1;
+					/*
+					 * This should be ok, except for few
+					 * flags. Most of the pte and hugepage
+					 * pmd bits overlap. We don't use the
+					 * returned value as pte_t in the caller.
+					 */
+					return (pte_t *)pm;
+				} else
+					return NULL;
+			} else if (pmd_present(*pm))
+				pt = pte_offset_kernel(pm, ea);
+		}
+	}
+	return pt;
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a..be0066f 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -206,7 +206,10 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		/*
+		 * We won't find hugepages here.
+		 */
+		pte_t *ptep = find_linux_pte(mm->pgd, start, NULL);
 		unsigned long pte;
 
 		if (ptep == NULL)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 74d1e78..578cac7 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -125,7 +125,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
 	if (!pgdir)
 		return -EFAULT;
 
-	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift, NULL);
 	if (!shift)
 		shift = PAGE_SHIFT;
 
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 9a04322..44c931a 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -261,7 +261,10 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	pte_t *ptep;
 	unsigned long pa;
 
-	ptep = find_linux_pte(init_mm.pgd, token);
+	/*
+	 * We won't find hugepages here
+	 */
+	ptep = find_linux_pte(init_mm.pgd, token, NULL);
 	if (!ptep)
 		return token;
 	pa = pte_pfn(*ptep) << PAGE_SHIFT;
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V4 23/25] powerpc/THP: Enable THP on PPC64
From: Aneesh Kumar K.V @ 2013-03-20 19:35 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1363808110-25748-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We enable THP only if we support the 16MB page size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable.h |   31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9681de4..5617dee 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -81,8 +81,35 @@ static inline int pmd_trans_huge(pmd_t pmd)
 	return ((pmd_val(pmd) & PMD_ISHUGE) ==  PMD_ISHUGE);
 }
 
-/* We will enable it in the last patch */
-#define has_transparent_hugepage() 0
+static inline int has_transparent_hugepage(void)
+{
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return 0;
+	/*
+	 * We support THP only if HPAGE_SHIFT is 16MB.
+	 */
+	if (!HPAGE_SHIFT || (HPAGE_SHIFT != mmu_psize_defs[MMU_PAGE_16M].shift))
+		return 0;
+	/*
+	 * We need to make sure that we support 16MB hugepage in a segment
+	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+	 * of 64K.
+	 */
+	/*
+	 * If we have 64K HPTE, we will be using that by default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+		return 0;
+	/*
+	 * Ok we only have 4K HPTE
+	 */
+	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+		return 0;
+
+	return 1;
+}
+
 #else
 #define pmd_large(pmd)		0
 #define has_transparent_hugepage() 0
-- 
1.7.10

^ permalink raw reply related

* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command
From: Alex Williamson @ 2013-03-20 19:46 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: aik, linuxppc-dev, Gavin Shan, kvm
In-Reply-To: <1363807906.17680.11.camel@pasglop>

On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote:
> On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote:
> > Perhaps my problem is that I don't have a clear picture of where
> > you're
> > going with this like I do for AER.  For AER we're starting with
> > notification of an error, from that we build into how to retrieve the
> > error information, and finally how to perform corrective action.  Each
> > of these will be done through vfio-pci.
> > 
> > Here we're starting by registering a mapping that's really only useful
> > to the vfio "accelerator" path, but we don't even have a hint of what
> > the non-accelerated path is and how vfio is involved with it.  Thanks,
> 
> I'm surprised that you are building so much policy around AER ... can't
> you just pass the raw stuff down to the guest and let the guest do its
> own corrective actions ?

How does the guest get the raw stuff?  We need to get the AER interrupt
out to the guest so it can be injected into the virtual PCIe port, then
we need to be able to retrieve the physical device log and pass it to
the qemu to mangle to match the guest topology.  We don't have existing
firmware interfaces for the guest to do that, so it's all being routed
through vfio-pci.

> As for EEH, I will let Gavin describe in more details what he is doing,
> though I wouldn't be surprised if so far he doesn't have a
> non-accelerated path :-) Which indeed makes things oddball, granted ...
> at least for now. I *think* what Gavin's doing right now is a
> pass-through to the host EEH directly in the kernel, so without a slow
> path...
> 
> Gavin, it really boils down to that. In-kernel EEH for guests is a
> KVMism that ends up not involving VFIO in any other way than
> establishing the mapping, then arguably it could be done via a VM ioctl.
> 
> If there's more going through VFIO and shared state, then it should
> probably go through VFIO-PCI.

Exactly my thinking.  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH 3/3] powerpc/fsl: add MPIC timer wakeup support
From: Scott Wood @ 2013-03-20 21:48 UTC (permalink / raw)
  To: Wang Dongsheng-B40534
  Cc: Wood Scott-B07421, Gala Kumar-B11780,
	linuxppc-dev@lists.ozlabs.org, Li Yang-R58472,
	Zhao Chenhui-B35336
In-Reply-To: <ABB05CD9C9F68C46A5CEDC7F15439259EB7712@039-SN2MPN1-022.039d.mgd.msft.net>

On 03/19/2013 10:48:53 PM, Wang Dongsheng-B40534 wrote:
>=20
>=20
> > -----Original Message-----
> > From: Wood Scott-B07421
> > Sent: Wednesday, March 20, 2013 6:55 AM
> > To: Wang Dongsheng-B40534
> > Cc: Wood Scott-B07421; Gala Kumar-B11780; =20
> linuxppc-dev@lists.ozlabs.org;
> > Zhao Chenhui-B35336; Li Yang-R58472
> > Subject: Re: [PATCH 3/3] powerpc/fsl: add MPIC timer wakeup support
> >
> > On 03/19/2013 01:25:42 AM, Wang Dongsheng-B40534 wrote:
> > > > -----Original Message-----
> > > > From: Wood Scott-B07421
> > > > Sent: Tuesday, March 19, 2013 8:31 AM
> > > > To: Wang Dongsheng-B40534
> > > > Cc: Gala Kumar-B11780; linuxppc-dev@lists.ozlabs.org; Wang
> > > Dongsheng-
> > > > B40534; Zhao Chenhui-B35336; Li Yang-R58472
> > > > Subject: Re: [PATCH 3/3] powerpc/fsl: add MPIC timer wakeup =20
> support
> > > >
> > > > On 03/08/2013 01:38:47 AM, Wang Dongsheng wrote:
> > > > > +static ssize_t fsl_timer_wakeup_store(struct device *dev,
> > > > > +				struct device_attribute *attr,
> > > > > +				const char *buf,
> > > > > +				size_t count)
> > > > > +{
> > > > > +	struct timeval interval;
> > > > > +	int ret;
> > > > > +
> > > > > +	interval.tv_usec =3D 0;
> > > > > +	if (kstrtol(buf, 0, &interval.tv_sec))
> > > > > +		return -EINVAL;
> > > >
> > > > I don't think the buffer will be NUL-terminated...  Ordinarily
> > > there'll be
> > > > an LF terminator, but you can't rely on that (many other sysfs
> > > attributes
> > > > seem to, though...).
> > > >
> > > I think we don't need to care about LF terminator.
> > > The kstrtol--> _kstrtoull has been done.
> >
> > My point is, what happens if userspace passes in a buffer that has =20
> no
> > terminator of any sort?  kstrtol will continue reading beyond the =20
> end of
> > the buffer.
> >
> Do not care about terminator.

kstrtol() obviously *does* because it doesn't take the buffer length as
a parameter.

> kstrtol--> _kstrtoull--> _parse_integer
>=20
> _kstrtoull(...) {
> 	...
> 	rv =3D _parse_integer(s, base, &_res);
> 	if (rv & KSTRTOX_OVERFLOW)
> 		return -ERANGE;
> 	rv &=3D ~KSTRTOX_OVERFLOW;
> 	if (rv =3D=3D 0)
> 		return -EINVAL;
> 	s +=3D rv;
>=20
> 	if (*s =3D=3D '\n')
> 		s++;
> 	if (*s)
> 		return -EINVAL;
> 	...
> }
>=20
> _parse_integer(...) {
> 	...
> 	while (*s) {
> 		if ('0' <=3D *s && *s <=3D '9')
> 			val =3D *s - '0';
> 		else if ('a' <=3D _tolower(*s) && _tolower(*s) <=3D 'f')
> 			val =3D _tolower(*s) - 'a' + 10;
> 		else
> 			break;	//this will break out to convert.

Really?  How do you know that the next byte after the buffer isn't a
valid hex digit?  How do you even know that we won't take a fault
accessing it?

> > Echoing a nonzero value wouldn't just be to cancel, it would be to =20
> set a
> > new timer after cancelling the old.
> >
> If you think this way is better, I can change.

I do.

> But why should do it?
> Explicitly stop the timer (echo 0) before reuse it is more reasonable =20
> for me.

It's an unnecessary restriction, and eliminating it doesn't make
anything simpler.

-Scott=

^ permalink raw reply

* Re: [PATCH 2/3] powerpc/mpic: add global timer support
From: Scott Wood @ 2013-03-20 22:59 UTC (permalink / raw)
  To: Wang Dongsheng-B40534
  Cc: Wood Scott-B07421, Gala Kumar-B11780,
	linuxppc-dev@lists.ozlabs.org, Li Yang-R58472
In-Reply-To: <ABB05CD9C9F68C46A5CEDC7F15439259EB77BB@039-SN2MPN1-022.039d.mgd.msft.net>

On 03/20/2013 01:45:03 AM, Wang Dongsheng-B40534 wrote:
>=20
>=20
> > -----Original Message-----
> > From: Wood Scott-B07421
> > Sent: Wednesday, March 20, 2013 6:59 AM
> > To: Wang Dongsheng-B40534
> > Cc: Wood Scott-B07421; Gala Kumar-B11780; =20
> linuxppc-dev@lists.ozlabs.org;
> > Li Yang-R58472
> > Subject: Re: [PATCH 2/3] powerpc/mpic: add global timer support
> >
> > On 03/19/2013 02:55:58 AM, Wang Dongsheng-B40534 wrote:
> > > > > +static void convert_ticks_to_time(struct timer_group_priv =20
> *priv,
> > > > > +		const u64 ticks, struct timeval *time) {
> > > > > +	u64 tmp_sec;
> > > > > +	u32 rem_us;
> > > > > +	u32 div;
> > > > > +
> > > > > +	if (!(priv->flags & FSL_GLOBAL_TIMER)) {
> > > > > +		time->tv_sec =3D (__kernel_time_t)
> > > > > +			div_u64_rem(ticks, priv->timerfreq, =20
> &rem_us);
> > > > > +		tmp_sec =3D (u64)time->tv_sec * =20
> (u64)priv->timerfreq;
> > > > > +		time->tv_usec =3D (__kernel_suseconds_t)
> > > > > +			div_u64((ticks - tmp_sec) * 1000000,
> > > > > priv->timerfreq);
> > > > > +
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	div =3D (1 << (MPIC_TIMER_TCR_CLKDIV_64 >> 8)) * 8;
> > > > > +
> > > > > +	time->tv_sec =3D (__kernel_time_t)div_u64(ticks, priv-
> > >timerfreq
> > > > > / div);
> > > > > +	tmp_sec =3D div_u64((u64)time->tv_sec * =20
> (u64)priv->timerfreq,
> > > > > div);
> > > > > +
> > > > > +	time->tv_usec =3D (__kernel_suseconds_t)
> > > > > +		div_u64((ticks - tmp_sec) * 1000000, =20
> priv->timerfreq /
> > > > > div);
> > > > > +
> > > > > +	return;
> > > >
> > > > Why don't you just adjust the clock frequency up front for
> > > CLKDIV_64,
> > > > rather than introduce alternate (and untested!) code paths
> > > throughout the
> > > > driver?
> > > >
> > > No, It cannot be integrated. The div cannot be removed.
> > > Because if do priv->timerfreq /=3D div, that will affect the =20
> accuracy.
> > >
> > > Like:
> > > 3 * 5 / 2 =3D 7;
> > > 3 / 2 * 5 =3D 5;
> >
> > I don't follow -- a change in the clock speed is a change in the =20
> clock
> > speed, no matter how you accomplish it.
> >
> This is not change hardware clock frequency.

Citation needed.  It looks like a change in the timer frequency to me:

   Clock ratio. Specifies the ratio of the timer frequency to the MPIC
   input clock (platform clock/2). The following clock ratios are
   supported:
   00  Default. Divide by 8
   01  Divide by 16
   10  Divide by 32
   11  Divide by 64

The end result is that the counter in the timer register changes only
1/64 as often as the input clock.  There's nothing special about that,
compared to having an input clock that is 1/64 the speed.

> The mpic timer hardware clock is not be changed after initialization. =20
> This is just conversion ticks.
> These calculated ticks will be set to the hardware.
>=20
> > How you round is a different question.  You should probably be =20
> rounding
> > up always, based on the final clock frequency -- though it's =20
> unlikely to
> > matter much given the high precision of the timer relative to the =20
> input
> > granularity.
> >
> Each ticks are based on the mpic timer hardware clock frequency.
> The conversion and calculation are in order to make the tick value is =20
> more
> accurate, more close to real time.
> If echo 40 seconds may be difference is not obvious. But echo =20
> 315360000(10 years)
> difference is obvious.

So basically you're taking advantage of the fact that you have what
appears to be a more precise value of the frequency than is expressible
in integer Hz -- but I think that's false precision; odds are the
frequency is not accurate to 1 Hz to begin with.  Even if it is, I doubt
it's worth worrying about.  The error as a percentage will still be very
small with an input frequency of many MHz.  Does an error of a few
minutes really matter if you're delaying for 10 years?  That's acceptable
clock drift for something not synced to network time.  The main thing is
to ensure that you round up, not down, so that software doesn't see an
early wakeup as measured by its own timers.

BTW, the input clock frequency has been similarly scaled, yet you don't
try to scrounge up that information to get further precision...

-Scott=

^ permalink raw reply

* [PATCH] powerpc/mm/nohash: ignore NULL stale_map entries
From: Scott Wood @ 2013-03-21  0:06 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

This happens with threads that are offline due to CPU hotplug
(including threads that were never "plugged in" to begin with because
SMT is disabled).

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/mm/mmu_context_nohash.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642..810f8e4 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
 		 */
 		for_each_cpu(cpu, mm_cpumask(mm)) {
 			for (i = cpu_first_thread_sibling(cpu);
-			     i <= cpu_last_thread_sibling(cpu); i++)
-				__set_bit(id, stale_map[i]);
+			     i <= cpu_last_thread_sibling(cpu); i++) {
+				if (stale_map[i])
+					__set_bit(id, stale_map[i]);
+			}
 			cpu = i - 1;
 		}
 		return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_sibling(cpu);
 		     i <= cpu_last_thread_sibling(cpu); i++) {
-			__clear_bit(id, stale_map[i]);
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
 		}
 	}
 
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] powerpc: Add accounting for Doorbell interrupts
From: Michael Neuling @ 2013-03-21  0:28 UTC (permalink / raw)
  To: Ian Munsie; +Cc: linuxppc-dev
In-Reply-To: <1363765332-28461-1-git-send-email-imunsie@au1.ibm.com>

> From: Ian Munsie <imunsie@au1.ibm.com>
> 
> This patch adds a new line to /proc/interrupts to account for the
> doorbell interrupts that each hardware thread has received. The total
> interrupt count in /proc/stat will now also include doorbells.

It's probably worth noting in the comment that these are not being
accounted at all currently, even in the existing IPI line in
/proc/interrupts.

Mikey

>  # cat /proc/interrupts
>            CPU0       CPU1       CPU2       CPU3
>  16:        551       1267        281        175      XICS Level     IPI
> LOC:       2037       1503       1688       1625   Local timer interrupts
> SPU:          0          0          0          0   Spurious interrupts
> CNT:          0          0          0          0   Performance monitoring interrupts
> MCE:          0          0          0          0   Machine check exceptions
> DBL:         42        550         20         91   Doorbell interrupts
>
> 
> Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
> ---
>  arch/powerpc/include/asm/hardirq.h |    1 +
>  arch/powerpc/kernel/dbell.c        |    2 ++
>  arch/powerpc/kernel/irq.c          |    8 ++++++++
>  3 files changed, 11 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h
> index 3147a29..e88c5f2 100644
> --- a/arch/powerpc/include/asm/hardirq.h
> +++ b/arch/powerpc/include/asm/hardirq.h
> @@ -10,6 +10,7 @@ typedef struct {
>  	unsigned int pmu_irqs;
>  	unsigned int mce_exceptions;
>  	unsigned int spurious_irqs;
> +	unsigned int doorbell_irqs;
>  } ____cacheline_aligned irq_cpustat_t;
>  
>  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
> diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
> index 9ebbc24..d55c76c 100644
> --- a/arch/powerpc/kernel/dbell.c
> +++ b/arch/powerpc/kernel/dbell.c
> @@ -41,6 +41,8 @@ void doorbell_exception(struct pt_regs *regs)
>  
>  	may_hard_irq_enable();
>  
> +	__get_cpu_var(irq_stat).doorbell_irqs++;
> +
>  	smp_ipi_demux();
>  
>  	irq_exit();
> diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
> index 4f97fe3..3a3c3b9 100644
> --- a/arch/powerpc/kernel/irq.c
> +++ b/arch/powerpc/kernel/irq.c
> @@ -374,6 +374,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
>  		seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions);
>  	seq_printf(p, "  Machine check exceptions\n");
>  
> +	if (cpu_has_feature(CPU_FTR_DBELL)) {
> +		seq_printf(p, "%*s: ", prec, "DBL");
> +		for_each_online_cpu(j)
> +			seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs);
> +		seq_printf(p, "  Doorbell interrupts\n");
> +	}
> +
>  	return 0;
>  }
>  
> @@ -387,6 +394,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
>  	sum += per_cpu(irq_stat, cpu).pmu_irqs;
>  	sum += per_cpu(irq_stat, cpu).mce_exceptions;
>  	sum += per_cpu(irq_stat, cpu).spurious_irqs;
> +	sum += per_cpu(irq_stat, cpu).doorbell_irqs;
>  
>  	return sum;
>  }
> -- 
> 1.7.10.4
> 

^ permalink raw reply

* Re: [PATCH 3/3] VFIO: Direct access config reg without capability
From: Alex Williamson @ 2013-03-21  0:58 UTC (permalink / raw)
  To: Gavin Shan; +Cc: aik, linuxppc-dev, kvm
In-Reply-To: <1363332390-12754-4-git-send-email-shangw@linux.vnet.ibm.com>

On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote:
> The config registers in [0, 0x40] is being supported by VFIO. Apart
> from that, the other config registers should be covered by PCI or
> PCIe capability. However, there might have some PCI devices (be2net)
> who has config registers (0x7c) out of [0, 0x40], and don't have
> corresponding PCI or PCIe capability. VFIO will return 0x0 on reading
> those registers and writing is dropped. It caused the be2net driver
> fails to be loaded because 0x0 returned from its config register 0x7c.
> 
> The patch changes the behaviour so that those config registers out
> of [0, 0x40] and don't have corresponding PCI or PCIe capability
> will be accessed directly.
> 
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---

Hi Gavin,

I'm onboard with making this change now, but this patch isn't
sufficient.  The config space map uses a byte per dword to index the
capability since both standard and extended capabilities are dword
aligned.  We currently have a bug that this patch exposes that we round
the length down, ex. a 14 byte MSI capability becomes 12 bytes leaving
the message data now exposed and writable with this patch.  That bug can
be fixed by aligning the length so the capability fills the dword, but
notice that 0x7c on the be2net is filling one of these gaps.  So fixing
that bug attaches that gap to the previous capability instead of
allowing direct access.

So, before we can make this change we need to fix the config map to have
byte granularity.  Thanks,

Alex

>  drivers/vfio/pci/vfio_pci_config.c |   31 ++++++++++++++++++++-----------
>  1 files changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 964ff22..5ea3afb 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
>  
>  	cap_id = vdev->pci_config_map[*ppos / 4];
>  
> +	/*
> +	 * Some PCI device config registers might not be covered by
> +	 * capability and useful. We will enable direct access to
> +	 * those registers.
> +	 */
>  	if (cap_id == PCI_CAP_ID_INVALID) {
> -		if (iswrite)
> -			return ret; /* drop */
> -
> -		/*
> -		 * Per PCI spec 3.0, section 6.1, reads from reserved and
> -		 * unimplemented registers return 0
> -		 */
> -		if (copy_to_user(buf, &val, count))
> -			return -EFAULT;
> -
> -		return ret;
> +		if (iswrite) {
> +			if (copy_from_user(&val, buf, count))
> +				return -EFAULT;
> +			ret = vfio_user_config_write(vdev->pdev, (int)(*ppos),
> +						     val, count);
> +			return ret ? ret : count;
> +		} else {
> +			ret = vfio_user_config_read(vdev->pdev, (int)(*ppos),
> +						    &val, count);
> +			if (ret)
> +				return ret;
> +			if (copy_to_user(buf, &val, count))
> +				return -EFAULT;
> +			return count;
> +		}
>  	}
>  
>  	/*

^ permalink raw reply

* Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command
From: Gavin Shan @ 2013-03-21  2:09 UTC (permalink / raw)
  To: Alex Williamson; +Cc: aik, linuxppc-dev, Gavin Shan, kvm
In-Reply-To: <1363808782.24132.534.camel@bling.home>

On Wed, Mar 20, 2013 at 01:46:22PM -0600, Alex Williamson wrote:
>On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote:
>> On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote:

.../...

>> As for EEH, I will let Gavin describe in more details what he is doing,
>> though I wouldn't be surprised if so far he doesn't have a
>> non-accelerated path :-) Which indeed makes things oddball, granted ...
>> at least for now. I *think* what Gavin's doing right now is a
>> pass-through to the host EEH directly in the kernel, so without a slow
>> path...
>> 

Yes, I don't have non-accelerated path. I'm trying to describe what I'm
doing: On the host side, the interrupt will be triggered while detecting
frozen PE, which has been passed to guest. We won't send EEH event to
EEH core on host side and we're waiting for guest to be involved to handle
the EEH error. In guest, any access to config or MMIO of the frozen PE will
trigger EEH event and in turn, the guest utilizes existing (exactly same
to pSeries on phyp case) RTAS calls to recover the error. The RTAS calls are
being emulated in host kernel.

Part of the RTAS call arguments is PCI domain/bus/slot/function viewed from
guest perspective. That's different from that for same physical PCI device
in host side. So I used VFIO-PCI to do the mapping and maintain the information
in host kernel.

>> Gavin, it really boils down to that. In-kernel EEH for guests is a
>> KVMism that ends up not involving VFIO in any other way than
>> establishing the mapping, then arguably it could be done via a VM ioctl.
>> 
>> If there's more going through VFIO and shared state, then it should
>> probably go through VFIO-PCI.
>

Ben, you're right. I use VFIO for nothing other than doing address mapping.
So I will do the address mapping in VM IOCTL instead of in VFIO-PCI.

Thanks,
Gavin

^ permalink raw reply

* Re: [Suggestion] PowerPC: kernel: cross compiling issue with allmodconfig
From: Chen Gang @ 2013-03-21  5:55 UTC (permalink / raw)
  To: Michael Neuling, Benjamin Herrenschmidt
  Cc: sfr, matt, linux-kernel@vger.kernel.org, paulus@samba.org,
	imunsie, linuxppc-dev
In-Reply-To: <5142AE27.7060003@asianux.com>

Hello All:

summary:
  the root cause is not enough room in exception area (0x5500 -- 0x7000).

  it is caused by the patches "for saving/restore PPR":
    they consumed much space of this area (0x5500 -- 0x7000).
    for pseries_defconfig and ppc64_defconfig, it is still ok.
    but for allmodconfig and "some additional config", it will cause issue.

  the solving patch "Make room in exception vector area" can make room larger.
    it can let "some additional config" ok.
    but for allmodconfig, it is still not enough.


details
  reason:
    it is caused by:
       commit number: 13e7a8e846c2ea38a552b986ea49332f965bbb7a
       commit number: 44e9309f1f357794b7ae93d5f3e3e6f11d2b8a7f
    they are "for saving/restore PPR"
    by Haren Myneni <haren@linux.vnet.ibm.com> Thu, 6 Dec 2012
    compiling result:
      pseries_defconfig: pass   (cpu for POWER7)
      ppc64_defconfig:   pass   (cpu for POWER7)
      allmodconfig:      failed (cpu for POWER7)

  analysing:
    solving patch:
      ------------------------------------------------------------------
      commit number: 61383407677aef05928541a00678591abea2d84c
      Author: Benjamin Herrenschmidt <benh@kernel.crashing.org>
      Date:   Thu Jan 10 17:44:19 2013 +1100

        powerpc: Make room in exception vector area
    
        The FWNMI region is fixed at 0x7000 and the vector are now
        overflowing that with some configurations. Fix that by moving
        some hash management code out of that region as it doesn't need
        to be that close to the call sites (isn't accessed using
        conditional branches).
      ------------------------------------------------------------------

      but for allmodconfig (not only for "some configurations"):
        it really can reduce much overflow bytes,
          (maybe from hundreds bytes to dozens bytes)
        but still not enough (still contains overflow bytes)

    additional trying:
	after del CONFIG_VSX and CONFIG_PPC_970_NAP in allmodconfig,
          (will reduce dozens bytes in the region .0x5500 -- .0x7000)
        it can pass compiling (not overflow).


next:
  I am sorry:
      I am not quite familiar with the detail features of powerpc.
      it seems I am not the suitable member to continue trying.

  I prefer Benjamin to continue trying (just like what he has done).

  if Benjamin will not do it (e.g. maybe no time to do)
    I should continue: "make additional room in exception vector area".
      (if get no reply within a week: before 2013-03-28, I should continue)



  welcome any members' (especially Benjamin) suggestions or completions.

  thanks.

  :-)


On 2013年03月15日 13:14, Chen Gang wrote:
> 于 2013年03月15日 12:52, Michael Neuling 写道:
>> Yep it's a known problem but no one has bothered to fix it since it
>> doesn't happen in a config that anyone cares about like
>> pseries_defconfig and ppc64_defconfig.  We've been moving code around in
>> this area a lot recently hence the breakage.
>>
>> It should be fixed though.  Patches welcome. :-)
> 
>   thanks, and I should try, and very glad to try.
> 
>   :-)  :-)
> 
>   excuse me, I try to provide related patch within this month (2013-03-31), is it ok ?
>   the reason is:
>     I am not familiar with ppc assembly code, neither ppc kernel,
>     so need additional time resource.
>       (originally, I worked for x86(_64) core dump analysing for kernel and user programs)
> 
>   thanks.
> 


-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* linux-next: build failure after merge of the final tree (linus' tree related)
From: Stephen Rothwell @ 2013-03-21  6:23 UTC (permalink / raw)
  To: Marcelo Tosatti, Gleb Natapov
  Cc: Kevin Hilman, linuxppc-dev, linux-kernel, linux-next,
	Paul Mackerras, Linus

[-- Attachment #1: Type: text/plain, Size: 1813 bytes --]

Hi all,

After merging the final tree, today's linux-next build (powerpc64
allnoconfig) failed like this:

In file included from arch/powerpc/include/asm/kvm_ppc.h:33:0,
                 from arch/powerpc/kernel/setup_64.c:67:
arch/powerpc/include/asm/kvm_book3s.h:65:20: error: field 'pte' has incomplete type
arch/powerpc/include/asm/kvm_book3s.h:69:18: error: field 'vcpu' has incomplete type
arch/powerpc/include/asm/kvm_book3s.h:98:34: error: 'HPTEG_HASH_NUM_PTE' undeclared here (not in a function)
arch/powerpc/include/asm/kvm_book3s.h:99:39: error: 'HPTEG_HASH_NUM_PTE_LONG' undeclared here (not in a function)
arch/powerpc/include/asm/kvm_book3s.h:100:35: error: 'HPTEG_HASH_NUM_VPTE' undeclared here (not in a function)
arch/powerpc/include/asm/kvm_book3s.h:101:40: error: 'HPTEG_HASH_NUM_VPTE_LONG' undeclared here (not in a function)
arch/powerpc/include/asm/kvm_book3s.h:129:4: error: 'struct kvm_run' declared inside parameter list [-Werror]
arch/powerpc/include/asm/kvm_book3s.h:129:4: error: its scope is only this definition or declaration, which is probably not what you want [-Werror]

And it went downhill from there.  This build does not have CONFIG_KVM
defined.

Caused by commit f445f11eb2cc ("KVM: allow host header to be included
even for !CONFIG_KVM") which clearly never saw the light of day in
linux-next :-(

It would have been nice if the "compile failure when KVM is not enabled"
was included in the commit log so that we could figure out exactly what
needed to be protected instead of just effectively removing the whole
file.

I just reverted that commit for today.  Can someone please supply a
better solution or even just more information about what that commit was
solving.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/uprobes: teach uprobes to ignore gdb breakpoints
From: Ananth N Mavinakayanahalli @ 2013-03-21  7:15 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: ppcdev, Srikar Dronamraju, stable
In-Reply-To: <20130320160644.GA20352@redhat.com>

On Wed, Mar 20, 2013 at 05:06:44PM +0100, Oleg Nesterov wrote:
> On 03/20, Ananth N Mavinakayanahalli wrote:
> >
> > On Wed, Mar 20, 2013 at 01:26:39PM +0100, Oleg Nesterov wrote:
> >
> > > But, at the same time, is the new definition fine for verify_opcode()?
> > >
> > > IOW, powerpc has another is_trap() insn(s) used by gdb, lets denote it X.
> > > X != UPROBE_SWBP_INSN.
> > >
> > > Suppose that gdb installs the trap X at some addr, and then uprobe_register()
> > > tries to install uprobe at the same address. Then set_swbp() will do nothing,
> > > assuming the uprobe was already installed.

I think that is not right... see below...

> > > But we did not install UPROBE_SWBP_INSN. Is it fine? I hope yes, just to
> > > verify. If not, we need 2 definitions. is_uprobe_insn() should still check
> > > insns == UPROBE_SWBP_INSN, and is_swbp_insn() should check is_trap().

It's fine from gdb's perspective with my patch.

> > is_trap() checks for all trap variants on powerpc, including the one
> > uprobe uses. It returns true if the instruction is *any* trap variant.
> 
> I understand,
> 
> > So, install_breakpoint()->prepare_uprobe()->is_swbp_insn() will return
> > ENOTSUPP. In fact, arch_uprobe_analyze_insn() will also do the same.
> 
> Yes and the check in arch_uprobe_analyze_insn() should go away.
> 
> But you missed my point. Please forget about prepare_uprobe(), it is
> wrong anyway. And, prepare_uprobe() inspects the _original_ insn from
> the file, this has nothing install_breakpoint/etc.
>
> I meant verify_opcode() called by install_breakpoint/etc.

For the case where X already exists, verify_opcode() currently returns 0.
IMO, it should return -EEXIST, unless you are proposing that uprobes
should ride on the existing trap (even if its a variant).

If you are proposing that uprobes ride on X if it already exists, that's
not always possible and is a big can of worms... see below...

> > This itself should take care of all the cases.
> >
> > > And I am just curious, could you explain how X and UPROBE_SWBP_INSN
> > > differ?
> >
> > Powerpc has numerous variants of the trap instruction based on
> > comparison of two registers or a register and immediate value and a condition
> > (less than, greater than, [signed forms thereof], or equal to).
> >
> > Uprobes uses 0x7fe00008 which is 'tw 31,0,0'  which essentially is an
> > unconditional trap.
> >
> > Gdb uses many traps, one of which is 0x7d821008 which is twge r2,r2,
> > which is basically trap if r2 greater than or equal to r2.
> 
> OK. So, if I understand correctly, gdb can use some conditional
> breakpoint, and it is possible that this insn won't generate the
> trap?

Yes it is possible if the condition is not met. If the condition is
met, the instruction will generate a trap, and uprobes will do a
send_sig(SIGTRAP) from handle_swbp().

> Then this patch is not right, or at least we need another change
> on top?
> 
> Once again. Suppose that gdb installs the TRAP_IF_R1_GT_R2.
> 
> After that uprobe_register() is called, but it won't change this
> insn because verify_opcode() returns 0.
> 
> Then the probed task hits this breakoint with "r1 < r2" and we do
> not report this event.

At this time, the condition for the trap is not satisfied, so no
exception occurs. If the expectation is that the trap always trigger,
then all such trap variants need to be replaced with the unconditional
trap and we should either add logic to re-execute the conditional trap
after uprobe handling and send_sig() via handle_swbp() or emulate the
condition in software and do a send_sig() if needed.

> So. I still think that we actually need something like below, and
> powerpc should reimplement is_trap_insn() to use is_trap(insn).
> 
> No?

I don't see how this will help, especially since the gdb<->uprobes is
fraught with races.

With your proposed patch, we refuse to insert a uprobe if the underlying
instruction is a UPROBE_SWBP_INSTRUCTION; changing is_swbp_at_addr()
will need changes in handle_swbp() too. But, unlike x86, we cannot
expect a uprobe with an underlying trap variant (X) to always trigger.

IMHO, it's not a good idea to do that for x86 either, since you'll run
into many other complications (what if the entity that put the original
breakpoint, removed it, etc).

IMHO, I really think we should not allow uprobe_register() to succeed if
the underlying instruction is a breakpoint (or a variant thereof).

Ananth

^ permalink raw reply

* Re: [PATCH] powerpc/uprobes: teach uprobes to ignore gdb breakpoints
From: Ananth N Mavinakayanahalli @ 2013-03-21  7:17 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: ppcdev, Srikar Dronamraju, stable
In-Reply-To: <20130320160728.GB20352@redhat.com>

On Wed, Mar 20, 2013 at 05:07:28PM +0100, Oleg Nesterov wrote:
> On 03/20, Ananth N Mavinakayanahalli wrote:
> >
> > On Wed, Mar 20, 2013 at 01:43:01PM +0100, Oleg Nesterov wrote:
> > > On 03/20, Oleg Nesterov wrote:
> > > >
> > > > But we did not install UPROBE_SWBP_INSN. Is it fine? I hope yes, just to
> > > > verify. If not, we need 2 definitions. is_uprobe_insn() should still check
> > > > insns == UPROBE_SWBP_INSN, and is_swbp_insn() should check is_trap().
> > > >
> > > > And I am just curious, could you explain how X and UPROBE_SWBP_INSN
> > > > differ?
> > >
> > > IOW, if I wasn't clear... Lets forget about gdb/etc for the moment.
> > > Suppose we apply the patch below. Will uprobes on powerpc work?
> > >
> > > If yes, then your patch should be fine. If not, we probably need more
> > > changes.
> >
> > Yes, it will work fine.
> 
> Even if this new insn is conditional?

Yes.

^ permalink raw reply

* Re: [RFC PATCH -V2 00/21] THP support for PPC64
From: Simon Jeons @ 2013-03-21  8:17 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

Hi Aneesh,
On 02/22/2013 12:47 AM, Aneesh Kumar K.V wrote:
> Hi,
>
> This patchset adds transparent huge page support for PPC64.
>
> I am marking the series to linux-mm because the PPC64 implementation
> required few interface changes to core THP code. I still have considerable
> number of FIXME!! in the patchset mostly related to PPC64 mm subsystem.
> Those would require closer review and once we are clear on those changes,
> I will drop those FIXME!! with necessary comments.
>
> Some numbers:
>
> The latency measurements code from Anton  found at
> http://ozlabs.org/~anton/junkcode/latency2001.c

Can this benchmark be used on x86?

>
> THP disabled 64K page size
> ------------------------
> [root@llmp24l02 ~]# ./latency2001 8G
>   8589934592    731.73 cycles    205.77 ns
> [root@llmp24l02 ~]# ./latency2001 8G
>   8589934592    743.39 cycles    209.05 ns
> [root@llmp24l02 ~]#
>
> THP disabled large page via hugetlbfs
> -------------------------------------
> [root@llmp24l02 ~]# ./latency2001  -l 8G
>   8589934592    416.09 cycles    117.01 ns
> [root@llmp24l02 ~]# ./latency2001  -l 8G
>   8589934592    415.74 cycles    116.91 ns
>
> THP enabled 64K page size.
> ----------------
> [root@llmp24l02 ~]# ./latency2001 8G
>   8589934592    405.07 cycles    113.91 ns
> [root@llmp24l02 ~]# ./latency2001 8G
>   8589934592    411.82 cycles    115.81 ns
> [root@llmp24l02 ~]#
>
>
> We are close to hugetlbfs in latency and we can achieve this with zero
> config/page reservation. Most of the allocations above are fault allocated.
> I haven't really measured the collapse alloc impact.
>
> Another test that does 50000000 random access over 1GB area goes from
> 2.65 seconds to 1.07 seconds with this patchset.
>
> Changes from RFC V1:
> * HugeTLB fs now works
> * Compile issues fixed
> * rebased to v3.8
> * Patch series reordered so that ppc64 cleanups and MM THP changes are moved
>    early in the series. This should help in picking those patches early.
>
> Thanks,
> -aneesh
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] powerpc: add CONFIG(s) require for using flash controller
From: Kumar Gala @ 2013-03-21  9:04 UTC (permalink / raw)
  To: Prabhakar Kushwaha; +Cc: linuxppc-dev
In-Reply-To: <1363595483-7248-1-git-send-email-prabhakar@freescale.com>


On Mar 18, 2013, at 3:31 AM, Prabhakar Kushwaha wrote:

> Add CONFIG(s) required for NAND and NOR flash controller usage.
> It defines MTD, Jffs2 and UBIFS file system required for controllers.
>=20
> It also enables IFC controller
>=20
> Signed-off-by: Prabhakar Kushwaha <prabhakar@freescale.com>
> ---
> Based upon =
http://git.kernel.org/pub/scm/linux/kernel/git/galak/powerpc.git
> branch next
>=20
> arch/powerpc/configs/corenet64_smp_defconfig |   35 =
+++++++++++++++++++++++++-
> arch/powerpc/configs/mpc85xx_defconfig       |   31 =
+++++++++++++++++++++++
> arch/powerpc/configs/mpc85xx_smp_defconfig   |   31 =
+++++++++++++++++++++++
> 3 files changed, 96 insertions(+), 1 deletion(-)

applied to next

- k=

^ permalink raw reply

* MPC5121e, Linux, simple IO ports
From: CF Studelec @ 2013-03-21  8:54 UTC (permalink / raw)
  To: linuxppc-dev

Hello,

This is a simple board based on the mpc5121e MCU.

Gpio is detected: kernel is compiled with its support - i got
gpiochip_find_base: found new base @224 in dmesg - on kernel 3.0.4.

But i'm unable to access it through /sys/class/gpio. I can successfully
export a pin (ie, if i type cat 224 > export, gpio224 is created), but i
can't successfully control it:

echo "out" > /sys/class/gpio/gpio224/direction

cat /sys/class/gpio/gpio224/value

0

echo 1 > /sys/class/gpio/gpio224/value

cat /sys/class/gpio/gpio224/value

0

My need is a simple chipselect (well, 3 chipselect exactly ), for a
custom design. Running linux is mandatory. I suspect the MPC5121e to be
in the bad function mode, so:

- does anyone know how to change the mode? Which register can I access, and
how? Is it worth a try?

- has anyone successfully performed simple IO control?

Last thing, I have tried to access internal registers through /dev/mem,
but without success. There are very few resources available for this
microcontroller, but I'm stuck with it. Perhaps somebody knows how to access
(read) internal registers with /proc or sysfs?

Thank you for your help.

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/fsl-booke: Update DCSR EPU device tree entries
From: Kumar Gala @ 2013-03-21  9:05 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Stephen George
In-Reply-To: <1363633870-3894-2-git-send-email-galak@kernel.crashing.org>


On Mar 18, 2013, at 2:11 PM, Kumar Gala wrote:

> From: Stephen George <Stephen.George@freescale.com>
> 
> Identifies the epu as compatible with Chassis v1 Debug IP.
> 
> Signed-off-by: Stephen George <Stephen.George@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
> arch/powerpc/boot/dts/fsl/p2041si-post.dtsi |    2 +-
> arch/powerpc/boot/dts/fsl/p3041si-post.dtsi |    2 +-
> arch/powerpc/boot/dts/fsl/p4080si-post.dtsi |    2 +-
> arch/powerpc/boot/dts/fsl/p5020si-post.dtsi |    2 +-
> arch/powerpc/boot/dts/fsl/p5040si-post.dtsi |    2 +-
> 5 files changed, 5 insertions(+), 5 deletions(-)

applied to next

- k

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/fsl-booke: Added device tree DCSR entries for
From: Kumar Gala @ 2013-03-21  9:06 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Stephen George
In-Reply-To: <1363633870-3894-1-git-send-email-galak@kernel.crashing.org>


On Mar 18, 2013, at 2:11 PM, Kumar Gala wrote:

> From: Stephen George <Stephen.George@freescale.com>
>=20
> Signed-off-by: Stephen George <Stephen.George@freescale.com>
> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
> arch/powerpc/boot/dts/fsl/t4240si-post.dtsi |  131 =
+++++++++++++++++++++++++++
> arch/powerpc/boot/dts/fsl/t4240si-pre.dtsi  |   25 ++---
> arch/powerpc/boot/dts/t4240qds.dts          |    4 +
> 3 files changed, 148 insertions(+), 12 deletions(-)

applied to next

- k=

^ permalink raw reply

* [PATCH] memblock: kill "config MAX_ACTIVE_REGIONS"
From: Paul Bolle @ 2013-03-21  9:27 UTC (permalink / raw)
  To: James Hogan, Benjamin Herrenschmidt, Paul Mackerras, Paul Mundt
  Cc: Tejun Heo, linuxppc-dev, linux-kernel, linux-sh

The Kconfig symbol MAX_ACTIVE_REGIONS is unused. Commit
0ee332c1451869963626bf9cac88f165a90990e1 ("memblock: Kill
early_node_map[]") removed the only place where it was actually used. But
it did not remove its Kconfig entries (for powerpc and sh).

Remove those two entries (and the entry for metag, that popped up in
v3.9-rc1).

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
0) Eyeball tested again.

1) It felt silly to split this clean up patch into three patches. But if
the maintainers involved disagree I'm happy to split and resend it. 

 arch/metag/mm/Kconfig | 5 -----
 arch/powerpc/Kconfig  | 5 -----
 arch/sh/mm/Kconfig    | 7 -------
 3 files changed, 17 deletions(-)

diff --git a/arch/metag/mm/Kconfig b/arch/metag/mm/Kconfig
index 794f26a..03fb8f1 100644
--- a/arch/metag/mm/Kconfig
+++ b/arch/metag/mm/Kconfig
@@ -93,11 +93,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 
-config MAX_ACTIVE_REGIONS
-	int
-	default "2" if SPARSEMEM
-	default "1"
-
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 867ace7..b674397 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -427,11 +427,6 @@ config NODES_SHIFT
 	default "4"
 	depends on NEED_MULTIPLE_NODES
 
-config MAX_ACTIVE_REGIONS
-	int
-	default "256" if PPC64
-	default "32"
-
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on PPC64
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 5a43a87..dba285e 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -137,13 +137,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
 
-config MAX_ACTIVE_REGIONS
-	int
-	default "6" if (CPU_SUBTYPE_SHX3 && SPARSEMEM)
-	default "2" if SPARSEMEM && (CPU_SUBTYPE_SH7722 || \
-		       CPU_SUBTYPE_SH7785)
-	default "1"
-
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 
-- 
1.7.11.7

^ permalink raw reply related

* Re: [PATCH] memblock: kill "config MAX_ACTIVE_REGIONS"
From: Paul Mundt @ 2013-03-21  9:34 UTC (permalink / raw)
  To: Paul Bolle
  Cc: James Hogan, linux-sh, linux-kernel, Paul Mackerras, Tejun Heo,
	linuxppc-dev
In-Reply-To: <1363858076.1390.104.camel@x61.thuisdomein>

On Thu, Mar 21, 2013 at 10:27:56AM +0100, Paul Bolle wrote:
> The Kconfig symbol MAX_ACTIVE_REGIONS is unused. Commit
> 0ee332c1451869963626bf9cac88f165a90990e1 ("memblock: Kill
> early_node_map[]") removed the only place where it was actually used. But
> it did not remove its Kconfig entries (for powerpc and sh).
> 
> Remove those two entries (and the entry for metag, that popped up in
> v3.9-rc1).
> 
> Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
> ---
> 0) Eyeball tested again.
> 
> 1) It felt silly to split this clean up patch into three patches. But if
> the maintainers involved disagree I'm happy to split and resend it. 
> 
Given that it's unused now it doesn't really matter how it gets applied,
it looks fine to me.

Acked-by: Paul Mundt <lethal@linux-sh.org>

^ permalink raw reply

* Re: [PATCH] memblock: kill "config MAX_ACTIVE_REGIONS"
From: James Hogan @ 2013-03-21  9:48 UTC (permalink / raw)
  To: Paul Bolle
  Cc: linux-sh, linux-kernel, Paul Mundt, Paul Mackerras, Tejun Heo,
	linuxppc-dev
In-Reply-To: <20130321093455.GA8027@linux-sh.org>

On 21/03/13 09:34, Paul Mundt wrote:
> On Thu, Mar 21, 2013 at 10:27:56AM +0100, Paul Bolle wrote:
>> The Kconfig symbol MAX_ACTIVE_REGIONS is unused. Commit
>> 0ee332c1451869963626bf9cac88f165a90990e1 ("memblock: Kill
>> early_node_map[]") removed the only place where it was actually used. But
>> it did not remove its Kconfig entries (for powerpc and sh).
>>
>> Remove those two entries (and the entry for metag, that popped up in
>> v3.9-rc1).
>>
>> Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
>> ---
>> 0) Eyeball tested again.
>>
>> 1) It felt silly to split this clean up patch into three patches. But if
>> the maintainers involved disagree I'm happy to split and resend it. 
>>
> Given that it's unused now it doesn't really matter how it gets applied,
> it looks fine to me.
> 
> Acked-by: Paul Mundt <lethal@linux-sh.org>
> 

Acked-by: James Hogan <james.hogan@imgtec.com>

^ permalink raw reply

* [PATCH] powerpc/dts: Add qe support for 36bit
From: Zhicheng Fan @ 2013-03-21  9:26 UTC (permalink / raw)
  To: galak, linuxppc-dev; +Cc: Zhicheng Fan

fixed the following errors:
	Error: arch/powerpc/boot/dts/p1025rdb.dtsi:326.2-3 label or path, 'qe', not found
	Error: arch/powerpc/boot/dts/fsl/p1021si-post.dtsi:242.2-3 label or path, 'qe', not found
	FATAL ERROR: Syntax error parsing input tree

Signed-off-by: Zhicheng Fan <B32736@freescale.com>
---
 arch/powerpc/boot/dts/p1025rdb_36b.dts |   48 ++++++++++++++++++++++++++++++++
 1 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/boot/dts/p1025rdb_36b.dts b/arch/powerpc/boot/dts/p1025rdb_36b.dts
index 4ce4bfa..c74c39b 100644
--- a/arch/powerpc/boot/dts/p1025rdb_36b.dts
+++ b/arch/powerpc/boot/dts/p1025rdb_36b.dts
@@ -82,6 +82,54 @@
 				  0x0 0x100000>;
 		};
 	};
+
+	qe: qe@fffe80000 {
+		ranges = <0x0 0xf 0xffe80000 0x40000>;
+		reg = <0xf 0xffe80000 0 0x480>;
+		brg-frequency = <0>;
+		bus-frequency = <0>;
+		status = "disabled"; /* no firmware loaded */
+
+		enet3: ucc@2000 {
+			device_type = "network";
+			compatible = "ucc_geth";
+			rx-clock-name = "clk12";
+			tx-clock-name = "clk9";
+			pio-handle = <&pio1>;
+			phy-handle = <&qe_phy0>;
+			phy-connection-type = "mii";
+		};
+
+		mdio@2120 {
+			qe_phy0: ethernet-phy@0 {
+				interrupt-parent = <&mpic>;
+				interrupts = <4 1 0 0>;
+				reg = <0x6>;
+				device_type = "ethernet-phy";
+			};
+			qe_phy1: ethernet-phy@03 {
+				interrupt-parent = <&mpic>;
+				interrupts = <5 1 0 0>;
+				reg = <0x3>;
+				device_type = "ethernet-phy";
+			};
+			tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		enet4: ucc@2400 {
+			device_type = "network";
+			compatible = "ucc_geth";
+			rx-clock-name = "none";
+			tx-clock-name = "clk13";
+			pio-handle = <&pio2>;
+			phy-handle = <&qe_phy1>;
+			phy-connection-type = "rmii";
+		};
+	};
+
 };
 
 /include/ "p1025rdb.dtsi"
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH] KVM: PPC: e500: Expose MMU registers via ONE_REG
From: Alexander Graf @ 2013-03-21 10:06 UTC (permalink / raw)
  To: Scott Wood; +Cc: Mihai Caraman, linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <1363713985.16671.12@snotra>


On 19.03.2013, at 18:26, Scott Wood wrote:

> On 03/19/2013 12:17:11 PM, Mihai Caraman wrote:
>> diff --git a/arch/powerpc/kvm/e500_mmu.c =
b/arch/powerpc/kvm/e500_mmu.c
>> index 66b6e31..b77b855 100644
>> --- a/arch/powerpc/kvm/e500_mmu.c
>> +++ b/arch/powerpc/kvm/e500_mmu.c
>> @@ -596,6 +596,95 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu =
*vcpu, struct kvm_sregs *sregs)
>> 	return 0;
>> }
>> +int kvmppc_get_one_reg_500_tlb(struct kvm_vcpu *vcpu, u64 id,
>> +				union kvmppc_one_reg *val)
>=20
> s/500/e500/
>=20
>> +int kvmppc_set_one_reg_500_tlb(struct kvm_vcpu *vcpu, u64 id,
>> +			       union kvmppc_one_reg *val)
>> +{
>> +	int r =3D 0;
>> +	long int i;
>> +
>> +	switch (id) {
>> +	case KVM_REG_PPC_MAS0:
>> +		vcpu->arch.shared->mas0 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MAS1:
>> +		vcpu->arch.shared->mas1 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MAS2:
>> +		vcpu->arch.shared->mas2 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MAS7_3:
>> +		vcpu->arch.shared->mas7_3 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MAS4:
>> +		vcpu->arch.shared->mas4 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MAS6:
>> +		vcpu->arch.shared->mas6 =3D set_reg_val(id, *val);
>> +		break;
>> +	case KVM_REG_PPC_MMUCFG: {
>> +		u32 mmucfg =3D set_reg_val(id, *val);
>> +		vcpu->arch.mmucfg =3D mmucfg & ~MMUCFG_LPIDSIZE;
>> +		break;
>> +	}
>=20
> Do we really want to allow arbitrary MMUCFG changes?  It won't =
magically make us able to support larger RAs, PIDs, different MAVN, etc.

Only if we update the actual shadow mmu configuration as well.

>=20
>> +	case KVM_REG_PPC_TLB0CFG:
>> +	case KVM_REG_PPC_TLB1CFG:
>> +	case KVM_REG_PPC_TLB2CFG:
>> +	case KVM_REG_PPC_TLB3CFG: {
>> +		u32 tlbncfg =3D set_reg_val(id, *val);			=09=

>> +		u32 geometry_mask =3D TLBnCFG_N_ENTRY | TLBnCFG_ASSOC;
>> +		i =3D id - KVM_REG_PPC_TLB0CFG;
>> +
>> +		/* MMU geometry (way/size) can be set only using SW_TLB =
*/
>> +		if ((vcpu->arch.tlbcfg[i] & geometry_mask) !=3D
>> +		    (tlbncfg & geometry_mask))
>> +			r =3D -EINVAL;
>> +
>> +		vcpu->arch.tlbcfg[i] =3D set_reg_val(id, *val);
>> +		break;
>> +	}
>=20
> Likewise -- just because QEMU sets a bit here doesn't mean KVM can =
support it.
>=20
> I thought the initial plan for setting these config registers was to =
accept it if it exactly matches what KVM already has, and give an error =
otherwise -- thus allowing for the possibility of accepting certain =
specific updates in the future.

Yes, that was the idea :).


Alex

^ permalink raw reply

* Re: [PATCH] KVM: PPC: e500: Add separate functions for vcpu's MMU configuration
From: Alexander Graf @ 2013-03-21 10:07 UTC (permalink / raw)
  To: Mihai Caraman; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <1363713407-27886-1-git-send-email-mihai.caraman@freescale.com>


On 19.03.2013, at 18:16, Mihai Caraman wrote:

> Move vcpu's MMU default configuration and geometry update into their =
own
> functions.

Mind to explain why?


Alex

>=20
> Signed-off-by: Mihai Caraman <mihai.caraman@freescale.com>
> ---
> arch/powerpc/kvm/e500_mmu.c |   59 =
+++++++++++++++++++++++++++----------------
> 1 files changed, 37 insertions(+), 22 deletions(-)
>=20
> diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
> index 5c44759..66b6e31 100644
> --- a/arch/powerpc/kvm/e500_mmu.c
> +++ b/arch/powerpc/kvm/e500_mmu.c
> @@ -596,6 +596,20 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu =
*vcpu, struct kvm_sregs *sregs)
> 	return 0;
> }
>=20
> +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
> +		struct kvm_book3e_206_tlb_params *params)
> +{
> +	vcpu->arch.tlbcfg[0] &=3D ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> +	if (params->tlb_sizes[0] <=3D 2048)
> +		vcpu->arch.tlbcfg[0] |=3D params->tlb_sizes[0];
> +	vcpu->arch.tlbcfg[0] |=3D params->tlb_ways[0] << =
TLBnCFG_ASSOC_SHIFT;
> +
> +	vcpu->arch.tlbcfg[1] &=3D ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> +	vcpu->arch.tlbcfg[1] |=3D params->tlb_sizes[1];
> +	vcpu->arch.tlbcfg[1] |=3D params->tlb_ways[1] << =
TLBnCFG_ASSOC_SHIFT;
> +	return 0;	=09
> +}
> +
> int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
> 			      struct kvm_config_tlb *cfg)
> {
> @@ -692,16 +706,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu =
*vcpu,
> 	vcpu_e500->gtlb_offset[0] =3D 0;
> 	vcpu_e500->gtlb_offset[1] =3D params.tlb_sizes[0];
>=20
> -	vcpu->arch.mmucfg =3D mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
> -
> -	vcpu->arch.tlbcfg[0] &=3D ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> -	if (params.tlb_sizes[0] <=3D 2048)
> -		vcpu->arch.tlbcfg[0] |=3D params.tlb_sizes[0];
> -	vcpu->arch.tlbcfg[0] |=3D params.tlb_ways[0] << =
TLBnCFG_ASSOC_SHIFT;
> -
> -	vcpu->arch.tlbcfg[1] &=3D ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> -	vcpu->arch.tlbcfg[1] |=3D params.tlb_sizes[1];
> -	vcpu->arch.tlbcfg[1] |=3D params.tlb_ways[1] << =
TLBnCFG_ASSOC_SHIFT;
> +	/* Update vcpu's MMU geometry based on SW_TLB input */
> +	vcpu_mmu_geometry_update(vcpu, &params);
>=20
> 	vcpu_e500->shared_tlb_pages =3D pages;
> 	vcpu_e500->num_shared_tlb_pages =3D num_pages;
> @@ -737,6 +743,26 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu =
*vcpu,
> 	return 0;
> }
>=20
> +/* vcpu's MMU default configuration */
> +static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
> +		       struct kvmppc_e500_tlb_params *params)
> +{
> +	/* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host =
values*/
> +	vcpu->arch.mmucfg =3D mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
> +
> +	/* Initialize IPROT field with host value*/
> +	vcpu->arch.tlbcfg[0] =3D mfspr(SPRN_TLB0CFG) &
> +			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> +	vcpu->arch.tlbcfg[0] |=3D params[0].entries;
> +	vcpu->arch.tlbcfg[0] |=3D params[0].ways << TLBnCFG_ASSOC_SHIFT;
> +
> +	vcpu->arch.tlbcfg[1] =3D mfspr(SPRN_TLB1CFG) &
> +			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> +	vcpu->arch.tlbcfg[1] |=3D params[1].entries;
> +	vcpu->arch.tlbcfg[1] |=3D params[1].ways << TLBnCFG_ASSOC_SHIFT;
> +	return 0;
> +}
> +
> int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
> {
> 	struct kvm_vcpu *vcpu =3D &vcpu_e500->vcpu;
> @@ -781,18 +807,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 =
*vcpu_e500)
> 	if (!vcpu_e500->g2h_tlb1_map)
> 		goto err;
>=20
> -	/* Init TLB configuration register */
> -	vcpu->arch.tlbcfg[0] =3D mfspr(SPRN_TLB0CFG) &
> -			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> -	vcpu->arch.tlbcfg[0] |=3D vcpu_e500->gtlb_params[0].entries;
> -	vcpu->arch.tlbcfg[0] |=3D
> -		vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
> -
> -	vcpu->arch.tlbcfg[1] =3D mfspr(SPRN_TLB1CFG) &
> -			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
> -	vcpu->arch.tlbcfg[1] |=3D vcpu_e500->gtlb_params[1].entries;
> -	vcpu->arch.tlbcfg[1] |=3D
> -		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
> +	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
>=20
> 	kvmppc_recalc_tlb1map_range(vcpu_e500);
> 	return 0;
> --=20
> 1.7.4.1
>=20
>=20
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox