LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH -V2 10/26] powerpc: Decode the pte-lp-encoding bits correctly.
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We look at both the segment base page size and actual page size and store
the pte-lp-encodings in an array per base page size.

We also update all relevant functions to take actual page size argument
so that we can use the correct PTE LP encoding in HPTE. This should also
get the basic Multiple Page Size per Segment (MPSS) support. This is needed
to enable THP on ppc64.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/machdep.h      |    3 +-
 arch/powerpc/include/asm/mmu-hash64.h   |   33 +++++----
 arch/powerpc/kvm/book3s_hv.c            |    2 +-
 arch/powerpc/mm/hash_low_64.S           |   18 +++--
 arch/powerpc/mm/hash_native_64.c        |  116 +++++++++++++++++++++---------
 arch/powerpc/mm/hash_utils_64.c         |  118 ++++++++++++++++++++-----------
 arch/powerpc/mm/hugetlbpage-hash64.c    |    4 +-
 arch/powerpc/platforms/cell/beat_htab.c |   16 ++---
 arch/powerpc/platforms/ps3/htab.c       |    6 +-
 arch/powerpc/platforms/pseries/lpar.c   |    6 +-
 10 files changed, 208 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 19d9d96..6cee6e0 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -50,7 +50,8 @@ struct machdep_calls {
 				       unsigned long prpn,
 				       unsigned long rflags,
 				       unsigned long vflags,
-				       int psize, int ssize);
+				       int psize, int apsize,
+				       int ssize);
 	long		(*hpte_remove)(unsigned long hpte_group);
 	void            (*hpte_removebolted)(unsigned long ea,
 					     int psize, int ssize);
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index feac737..a0322bf 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -154,7 +154,7 @@ extern unsigned long htab_hash_mask;
 struct mmu_psize_def
 {
 	unsigned int	shift;	/* number of bits */
-	unsigned int	penc;	/* HPTE encoding */
+	unsigned int	penc[MMU_PAGE_COUNT];	/* HPTE encoding */
 	unsigned int	tlbiel;	/* tlbiel supported for that page size */
 	unsigned long	avpnm;	/* bits to mask out in AVPN in the HPTE */
 	unsigned long	sllp;	/* SLB L||LP (exact mask to use in slbmte) */
@@ -181,6 +181,13 @@ struct mmu_psize_def
  */
 #define VPN_SHIFT	12
 
+/*
+ * HPTE Large Page (LP) details
+ */
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
 #ifndef __ASSEMBLY__
 
 static inline int segment_shift(int ssize)
@@ -237,14 +244,14 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
 
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
- * for the page size
+ * using the base page size and actual page size.
  */
-static inline unsigned long hpte_encode_v(unsigned long vpn,
-					  int psize, int ssize)
+static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
+					  int actual_psize, int ssize)
 {
 	unsigned long v;
-	v = hpte_encode_avpn(vpn, psize, ssize);
-	if (psize != MMU_PAGE_4K)
+	v = hpte_encode_avpn(vpn, base_psize, ssize);
+	if (actual_psize != MMU_PAGE_4K)
 		v |= HPTE_V_LARGE;
 	return v;
 }
@@ -254,19 +261,17 @@ static inline unsigned long hpte_encode_v(unsigned long vpn,
  * for the page size. We assume the pa is already "clean" that is properly
  * aligned for the requested page size
  */
-static inline unsigned long hpte_encode_r(unsigned long pa, int psize)
+static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
+					  int actual_psize)
 {
-	unsigned long r;
-
 	/* A 4K page needs no special encoding */
-	if (psize == MMU_PAGE_4K)
+	if (actual_psize == MMU_PAGE_4K)
 		return pa & HPTE_R_RPN;
 	else {
-		unsigned int penc = mmu_psize_defs[psize].penc;
-		unsigned int shift = mmu_psize_defs[psize].shift;
-		return (pa & ~((1ul << shift) - 1)) | (penc << 12);
+		unsigned int penc = mmu_psize_defs[base_psize].penc[actual_psize];
+		unsigned int shift = mmu_psize_defs[actual_psize].shift;
+		return (pa & ~((1ul << shift) - 1)) | (penc << LP_SHIFT);
 	}
-	return r;
 }
 
 /*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 71d0c90..48f6d99 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1515,7 +1515,7 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
 	(*sps)->page_shift = def->shift;
 	(*sps)->slb_enc = def->sllp;
 	(*sps)->enc[0].page_shift = def->shift;
-	(*sps)->enc[0].pte_enc = def->penc;
+	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
 	(*sps)++;
 }
 
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index abdd5e2..0e980ac 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -196,7 +196,8 @@ htab_insert_pte:
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(htab_call_hpte_insert1)
 	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
@@ -219,7 +220,8 @@ _GLOBAL(htab_call_hpte_insert1)
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(htab_call_hpte_insert2)
 	bl	.			/* Patched by htab_finish_init() */
 	cmpdi	0,r3,0
@@ -515,7 +517,8 @@ htab_special_pfn:
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(htab_call_hpte_insert1)
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
@@ -542,7 +545,8 @@ _GLOBAL(htab_call_hpte_insert1)
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_4K		/* page size */
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_4K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(htab_call_hpte_insert2)
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
@@ -840,7 +844,8 @@ ht64_insert_pte:
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,0			/* !bolted, !secondary */
 	li	r8,MMU_PAGE_64K
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_64K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(ht64_call_hpte_insert1)
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
@@ -863,7 +868,8 @@ _GLOBAL(ht64_call_hpte_insert1)
 	mr	r4,r29			/* Retrieve vpn */
 	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
 	li	r8,MMU_PAGE_64K
-	ld	r9,STK_PARAM(R9)(r1)	/* segment size */
+	li	r9,MMU_PAGE_64K		/* actual page size */
+	ld	r10,STK_PARAM(R9)(r1)	/* segment size */
 _GLOBAL(ht64_call_hpte_insert2)
 	bl	.			/* patched by htab_finish_init() */
 	cmpdi	0,r3,0
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 9d8983a..7b5bf94 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -39,7 +39,7 @@
 
 DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
-static inline void __tlbie(unsigned long vpn, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
@@ -68,7 +68,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
 		break;
 	default:
 		/* We need 14 to 14 + i bits of va */
-		penc = mmu_psize_defs[psize].penc;
+		penc = mmu_psize_defs[psize].penc[apsize];
 		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 		va |= penc << 12;
 		va |= ssize << 8;
@@ -80,7 +80,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
 	}
 }
 
-static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
@@ -102,7 +102,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
 		break;
 	default:
 		/* We need 14 to 14 + i bits of va */
-		penc = mmu_psize_defs[psize].penc;
+		penc = mmu_psize_defs[psize].penc[apsize];
 		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 		va |= penc << 12;
 		va |= ssize << 8;
@@ -114,7 +114,8 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
 
 }
 
-static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+			 int ssize, int local)
 {
 	unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
@@ -125,10 +126,10 @@ static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
 		raw_spin_lock(&native_tlbie_lock);
 	asm volatile("ptesync": : :"memory");
 	if (use_local) {
-		__tlbiel(vpn, psize, ssize);
+		__tlbiel(vpn, psize, apsize, ssize);
 		asm volatile("ptesync": : :"memory");
 	} else {
-		__tlbie(vpn, psize, ssize);
+		__tlbie(vpn, psize, apsize, ssize);
 		asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 	if (lock_tlbie && !use_local)
@@ -156,7 +157,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
 
 static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 			unsigned long pa, unsigned long rflags,
-			unsigned long vflags, int psize, int ssize)
+			unsigned long vflags, int psize, int apsize, int ssize)
 {
 	struct hash_pte *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
@@ -183,8 +184,8 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	if (i == HPTES_PER_GROUP)
 		return -1;
 
-	hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED)) {
 		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
@@ -244,6 +245,42 @@ static long native_hpte_remove(unsigned long hpte_group)
 	return i;
 }
 
+static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+	/* Look at the 8 bit LP value */
+	unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+	/* First check if it is large page */
+	if (!(hptep->v & HPTE_V_LARGE))
+		return MMU_PAGE_4K;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+		/* valid entries have a shift value */
+		if (!mmu_psize_defs[i].shift)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		≥8KB
+		 *    rrrr rrzz		≥16KB
+		 *    rrrr rzzz		≥32KB
+		 *    rrrr zzzz		≥64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift -
+				mmu_psize_defs[MMU_PAGE_4K].shift;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 				 unsigned long vpn, int psize, int ssize,
 				 int local)
@@ -251,6 +288,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v, want_v;
 	int ret = 0;
+	int actual_psize;
 
 	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
@@ -260,6 +298,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	native_lock_hpte(hptep);
 
 	hpte_v = hptep->v;
+	actual_psize = hpte_actual_psize(hptep, psize);
 
 	/* Even if we miss, we need to invalidate the TLB */
 	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
@@ -274,7 +313,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	native_unlock_hpte(hptep);
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, ssize, local);
+	tlbie(vpn, psize, actual_psize, ssize, local);
 
 	return ret;
 }
@@ -315,6 +354,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 				       int psize, int ssize)
 {
+	int actual_psize;
 	unsigned long vpn;
 	unsigned long vsid;
 	long slot;
@@ -327,13 +367,14 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
+	actual_psize = hpte_actual_psize(hptep, psize);
 
 	/* Update the HPTE */
 	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 		(newpp & (HPTE_R_PP | HPTE_R_N));
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, ssize, 0);
+	tlbie(vpn, psize, actual_psize, ssize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
@@ -343,6 +384,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	unsigned long hpte_v;
 	unsigned long want_v;
 	unsigned long flags;
+	int actual_psize;
 
 	local_irq_save(flags);
 
@@ -352,6 +394,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
+	actual_psize = hpte_actual_psize(hptep, psize);
 	/* Even if we miss, we need to invalidate the TLB */
 	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
@@ -360,27 +403,24 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 		hptep->v = 0;
 
 	/* Invalidate the TLB */
-	tlbie(vpn, psize, ssize, local);
+	tlbie(vpn, psize, actual_psize, ssize, local);
 
 	local_irq_restore(flags);
 }
 
-#define LP_SHIFT	12
-#define LP_BITS		8
-#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
-
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
-			int *psize, int *ssize, unsigned long *vpn)
+			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
 	unsigned long avpn, pteg, vpi;
 	unsigned long hpte_r = hpte->r;
 	unsigned long hpte_v = hpte->v;
 	unsigned long vsid, seg_off;
-	int i, size, shift, penc;
+	int i, size, a_size, shift, penc;
 
-	if (!(hpte_v & HPTE_V_LARGE))
-		size = MMU_PAGE_4K;
-	else {
+	if (!(hpte_v & HPTE_V_LARGE)) {
+		size   = MMU_PAGE_4K;
+		a_size = MMU_PAGE_4K;
+	} else {
 		for (i = 0; i < LP_BITS; i++) {
 			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
 				break;
@@ -388,19 +428,26 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 		penc = LP_MASK(i+1) >> LP_SHIFT;
 		for (size = 0; size < MMU_PAGE_COUNT; size++) {
 
-			/* 4K pages are not represented by LP */
-			if (size == MMU_PAGE_4K)
-				continue;
-
 			/* valid entries have a shift value */
 			if (!mmu_psize_defs[size].shift)
 				continue;
+			for (a_size = 0; a_size < MMU_PAGE_COUNT; a_size++) {
 
-			if (penc == mmu_psize_defs[size].penc)
-				break;
+				/* 4K pages are not represented by LP */
+				if (a_size == MMU_PAGE_4K)
+					continue;
+
+				/* valid entries have a shift value */
+				if (!mmu_psize_defs[a_size].shift)
+					continue;
+
+				if (penc == mmu_psize_defs[size].penc[a_size])
+					goto out;
+			}
 		}
 	}
 
+out:
 	/* This works for all page sizes, and for 256M and 1T segments */
 	*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
 	shift = mmu_psize_defs[size].shift;
@@ -433,7 +480,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 	default:
 		*vpn = size = 0;
 	}
-	*psize = size;
+	*psize  = size;
+	*apsize = a_size;
 }
 
 /*
@@ -451,7 +499,7 @@ static void native_hpte_clear(void)
 	struct hash_pte *hptep = htab_address;
 	unsigned long hpte_v;
 	unsigned long pteg_count;
-	int psize, ssize;
+	int psize, apsize, ssize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -477,9 +525,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
-			hpte_decode(hptep, slot, &psize, &ssize, &vpn);
+			hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
 			hptep->v = 0;
-			__tlbie(vpn, psize, ssize);
+			__tlbie(vpn, psize, apsize, ssize);
 		}
 	}
 
@@ -540,7 +588,7 @@ static void native_flush_hash_range(unsigned long number, int local)
 
 			pte_iterate_hashed_subpages(pte, psize,
 						    vpn, index, shift) {
-				__tlbiel(vpn, psize, ssize);
+				__tlbiel(vpn, psize, psize, ssize);
 			} pte_iterate_hashed_end();
 		}
 		asm volatile("ptesync":::"memory");
@@ -557,7 +605,7 @@ static void native_flush_hash_range(unsigned long number, int local)
 
 			pte_iterate_hashed_subpages(pte, psize,
 						    vpn, index, shift) {
-				__tlbie(vpn, psize, ssize);
+				__tlbie(vpn, psize, psize, ssize);
 			} pte_iterate_hashed_end();
 		}
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bfeab83..6cf6c66 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -125,7 +125,6 @@ static struct mmu_psize_def mmu_psize_defaults_old[] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
 		.sllp	= 0,
-		.penc	= 0,
 		.avpnm	= 0,
 		.tlbiel = 0,
 	},
@@ -139,14 +138,12 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
 		.sllp	= 0,
-		.penc	= 0,
 		.avpnm	= 0,
 		.tlbiel = 1,
 	},
 	[MMU_PAGE_16M] = {
 		.shift	= 24,
 		.sllp	= SLB_VSID_L,
-		.penc	= 0,
 		.avpnm	= 0x1UL,
 		.tlbiel = 0,
 	},
@@ -208,7 +205,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 
 		BUG_ON(!ppc_md.hpte_insert);
 		ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
-					 HPTE_V_BOLTED, psize, ssize);
+					 HPTE_V_BOLTED, psize, psize, ssize);
 
 		if (ret < 0)
 			break;
@@ -275,6 +272,30 @@ static void __init htab_init_seg_sizes(void)
 	of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
 }
 
+static int __init get_idx_from_shift(unsigned int shift)
+{
+	int idx = -1;
+
+	switch (shift) {
+	case 0xc:
+		idx = MMU_PAGE_4K;
+		break;
+	case 0x10:
+		idx = MMU_PAGE_64K;
+		break;
+	case 0x14:
+		idx = MMU_PAGE_1M;
+		break;
+	case 0x18:
+		idx = MMU_PAGE_16M;
+		break;
+	case 0x22:
+		idx = MMU_PAGE_16G;
+		break;
+	}
+	return idx;
+}
+
 static int __init htab_dt_scan_page_sizes(unsigned long node,
 					  const char *uname, int depth,
 					  void *data)
@@ -294,60 +315,61 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
 		size /= 4;
 		cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
 		while(size > 0) {
-			unsigned int shift = prop[0];
+			unsigned int base_shift = prop[0];
 			unsigned int slbenc = prop[1];
 			unsigned int lpnum = prop[2];
-			unsigned int lpenc = 0;
 			struct mmu_psize_def *def;
-			int idx = -1;
+			int idx, base_idx;
 
 			size -= 3; prop += 3;
-			while(size > 0 && lpnum) {
-				if (prop[0] == shift)
-					lpenc = prop[1];
-				prop += 2; size -= 2;
-				lpnum--;
+			base_idx = get_idx_from_shift(base_shift);
+			if (base_idx < 0) {
+				/*
+				 * skip the pte encoding also
+				 */
+				prop += lpnum * 2; size -= lpnum * 2;
+				continue;
 			}
-			switch(shift) {
-			case 0xc:
-				idx = MMU_PAGE_4K;
-				break;
-			case 0x10:
-				idx = MMU_PAGE_64K;
-				break;
-			case 0x14:
-				idx = MMU_PAGE_1M;
-				break;
-			case 0x18:
-				idx = MMU_PAGE_16M;
+			def = &mmu_psize_defs[base_idx];
+			if (base_idx == MMU_PAGE_16M)
 				cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
-				break;
-			case 0x22:
-				idx = MMU_PAGE_16G;
-				break;
-			}
-			if (idx < 0)
-				continue;
-			def = &mmu_psize_defs[idx];
-			def->shift = shift;
-			if (shift <= 23)
+
+			def->shift = base_shift;
+			if (base_shift <= 23)
 				def->avpnm = 0;
 			else
-				def->avpnm = (1 << (shift - 23)) - 1;
+				def->avpnm = (1 << (base_shift - 23)) - 1;
 			def->sllp = slbenc;
-			def->penc = lpenc;
-			/* We don't know for sure what's up with tlbiel, so
+			/*
+			 * We don't know for sure what's up with tlbiel, so
 			 * for now we only set it for 4K and 64K pages
 			 */
-			if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+			if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
 				def->tlbiel = 1;
 			else
 				def->tlbiel = 0;
 
-			DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
-			    "tlbiel=%d, penc=%d\n",
-			    idx, shift, def->sllp, def->avpnm, def->tlbiel,
-			    def->penc);
+			while (size > 0 && lpnum) {
+				unsigned int shift = prop[0];
+				unsigned int penc  = prop[1];
+
+				prop += 2; size -= 2;
+				lpnum--;
+
+				idx = get_idx_from_shift(shift);
+				if (idx < 0)
+					continue;
+
+				if ((signed)penc == -1)
+					pr_err("Invalid penc for base_shift=%d "
+					       "shift=%d\n", base_shift, shift);
+
+				def->penc[idx] = penc;
+				DBG(" %d: shift=%02x, sllp=%04lx, "
+				    "avpnm=%08lx, tlbiel=%d, penc=%d\n",
+				    idx, shift, def->sllp, def->avpnm,
+				    def->tlbiel, def->penc[idx]);
+			}
 		}
 		return 1;
 	}
@@ -396,10 +418,20 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
+static void mmu_psize_set_default_penc(struct mmu_psize_def *mmu_psize)
+{
+	int bpsize, apsize;
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+		for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+			mmu_psize[bpsize].penc[apsize] = -1;
+}
+
 static void __init htab_init_page_sizes(void)
 {
 	int rc;
 
+	mmu_psize_set_default_penc(mmu_psize_defaults_old);
+
 	/* Default to 4K pages only */
 	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
 	       sizeof(mmu_psize_defaults_old));
@@ -411,6 +443,8 @@ static void __init htab_init_page_sizes(void)
 	if (rc != 0)  /* Found */
 		goto found;
 
+	mmu_psize_set_default_penc(mmu_psize_defaults_gp);
+
 	/*
 	 * Not in the device-tree, let's fallback on known size
 	 * list for 16M capable GP & GR
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index cecad34..e0d52ee 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -103,7 +103,7 @@ repeat:
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-					  mmu_psize, ssize);
+					  mmu_psize, mmu_psize, ssize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
@@ -111,7 +111,7 @@ repeat:
 				      HPTES_PER_GROUP) & ~0x7UL;
 			slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  mmu_psize, ssize);
+						  mmu_psize, mmu_psize, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index 472f9a7..246e1d8 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -90,7 +90,7 @@ static inline unsigned int beat_read_mask(unsigned hpte_group)
 static long beat_lpar_hpte_insert(unsigned long hpte_group,
 				  unsigned long vpn, unsigned long pa,
 				  unsigned long rflags, unsigned long vflags,
-				  int psize, int ssize)
+				  int psize, int apsize, int ssize)
 {
 	unsigned long lpar_rc;
 	u64 hpte_v, hpte_r, slot;
@@ -103,9 +103,9 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group,
 			"rflags=%lx, vflags=%lx, psize=%d)\n",
 		hpte_group, va, pa, rflags, vflags, psize);
 
-	hpte_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M) |
+	hpte_v = hpte_encode_v(vpn, psize, apsize, MMU_SEGSIZE_256M) |
 		vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED))
 		DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
@@ -314,7 +314,7 @@ void __init hpte_init_beat(void)
 static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
 				  unsigned long vpn, unsigned long pa,
 				  unsigned long rflags, unsigned long vflags,
-				  int psize, int ssize)
+				  int psize, int apsize, int ssize)
 {
 	unsigned long lpar_rc;
 	u64 hpte_v, hpte_r, slot;
@@ -327,9 +327,9 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
 			"rflags=%lx, vflags=%lx, psize=%d)\n",
 		hpte_group, vpn, pa, rflags, vflags, psize);
 
-	hpte_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M) |
+	hpte_v = hpte_encode_v(vpn, psize, apsize, MMU_SEGSIZE_256M) |
 		vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED))
 		DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
@@ -373,7 +373,7 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 	unsigned long pss;
 
 	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
-	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
+	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc[psize];
 
 	DBG_LOW("    update: "
 		"avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... ",
@@ -403,7 +403,7 @@ static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
 	DBG_LOW("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
 		slot, vpn, psize, local);
 	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
-	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
+	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc[psize];
 
 	lpar_rc = beat_invalidate_htab_entry3(0, slot, want_v, pss);
 
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 07a4bba..44f06d2 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -45,7 +45,7 @@ static DEFINE_SPINLOCK(ps3_htab_lock);
 
 static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	unsigned long pa, unsigned long rflags, unsigned long vflags,
-	int psize, int ssize)
+	int psize, int apsize, int ssize)
 {
 	int result;
 	u64 hpte_v, hpte_r;
@@ -61,8 +61,8 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	 */
 	vflags &= ~HPTE_V_SECONDARY;
 
-	hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize) | rflags;
+	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
 
 	spin_lock_irqsave(&ps3_htab_lock, flags);
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index a77c35b..3daced3 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -109,7 +109,7 @@ void vpa_init(int cpu)
 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 				     unsigned long vpn, unsigned long pa,
 				     unsigned long rflags, unsigned long vflags,
-				     int psize, int ssize)
+				     int psize, int apsize, int ssize)
 {
 	unsigned long lpar_rc;
 	unsigned long flags;
@@ -121,8 +121,8 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
 			 hpte_group, vpn,  pa, rflags, vflags, psize);
 
-	hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize) | rflags;
+	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED))
 		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 09/26] powerpc: Use encode avpn where we need only avpn values
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

In all these cases we are doing something similar to

HPTE_V_COMPARE(hpte_v, want_v) which ignores the HPTE_V_LARGE bit

With MPSS support we would need actual page size to set HPTE_V_LARGE
bit and that won't be available in most of these cases. Since we are ignoring
HPTE_V_LARGE bit, use the  avpn value instead. There should not be any change
in behaviour after this patch.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hash_native_64.c        |    8 ++++----
 arch/powerpc/platforms/cell/beat_htab.c |   10 +++++-----
 arch/powerpc/platforms/ps3/htab.c       |    2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ffc1e00..9d8983a 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -252,7 +252,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	unsigned long hpte_v, want_v;
 	int ret = 0;
 
-	want_v = hpte_encode_v(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -288,7 +288,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 	unsigned long want_v, hpte_v;
 
 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
-	want_v = hpte_encode_v(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	/* Bolted mappings are only ever in the primary group */
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -348,7 +348,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 
 	DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-	want_v = hpte_encode_v(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
@@ -520,7 +520,7 @@ static void native_flush_hash_range(unsigned long number, int local)
 			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot += hidx & _PTEIDX_GROUP_IX;
 			hptep = htab_address + slot;
-			want_v = hpte_encode_v(vpn, psize, ssize);
+			want_v = hpte_encode_avpn(vpn, psize, ssize);
 			native_lock_hpte(hptep);
 			hpte_v = hptep->v;
 			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index 0f6f839..472f9a7 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -191,7 +191,7 @@ static long beat_lpar_hpte_updatepp(unsigned long slot,
 	u64 dummy0, dummy1;
 	unsigned long want_v;
 
-	want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
 
 	DBG_LOW("    update: "
 		"avpnv=%016lx, slot=%016lx, psize: %d, newpp %016lx ... ",
@@ -228,7 +228,7 @@ static long beat_lpar_hpte_find(unsigned long vpn, int psize)
 	unsigned long want_v, hpte_v;
 
 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, MMU_SEGSIZE_256M);
-	want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
 
 	for (j = 0; j < 2; j++) {
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -283,7 +283,7 @@ static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
 
 	DBG_LOW("    inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
 		slot, va, psize, local);
-	want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
 
 	raw_spin_lock_irqsave(&beat_htab_lock, flags);
 	dummy1 = beat_lpar_hpte_getword0(slot);
@@ -372,7 +372,7 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 	unsigned long want_v;
 	unsigned long pss;
 
-	want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
 	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
 
 	DBG_LOW("    update: "
@@ -402,7 +402,7 @@ static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
 
 	DBG_LOW("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
 		slot, vpn, psize, local);
-	want_v = hpte_encode_v(vpn, psize, MMU_SEGSIZE_256M);
+	want_v = hpte_encode_avpn(vpn, psize, MMU_SEGSIZE_256M);
 	pss = (psize == MMU_PAGE_4K) ? -1UL : mmu_psize_defs[psize].penc;
 
 	lpar_rc = beat_invalidate_htab_entry3(0, slot, want_v, pss);
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index d00d7b0..07a4bba 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -115,7 +115,7 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	unsigned long flags;
 	long ret;
 
-	want_v = hpte_encode_v(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, psize, ssize);
 
 	spin_lock_irqsave(&ps3_htab_lock, flags);
 
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 08/26] powerpc: Clarify __pgtable_cache_add by renaming shift to index
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

With table_size as second argument, first argument of the function
is not the shift value, but rather index into the array. Rename the
variable to clarify the same.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable-ppc64.h |    2 +-
 arch/powerpc/mm/init_64.c                |   16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index d51d893..658ba7c 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -338,7 +338,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define pgoff_to_pte(off)	((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_RPN_SHIFT)
 
-extern void __pgtable_cache_add(unsigned shift, unsigned long table_size,
+extern void __pgtable_cache_add(unsigned index, unsigned long table_size,
 				void (*ctor)(void *));
 void pgtable_cache_init(void);
 static inline void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index d6df419..2e09ec1 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -100,7 +100,7 @@ struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
  * everything else.  Caches created by this function are used for all
  * the higher level pagetables, and for hugepage pagetables.
  */
-void __pgtable_cache_add(unsigned int shift, unsigned long table_size,
+void __pgtable_cache_add(unsigned int index, unsigned long table_size,
 			 void (*ctor)(void *))
 {
 	char *name;
@@ -110,8 +110,8 @@ void __pgtable_cache_add(unsigned int shift, unsigned long table_size,
 	 * the index size in the low bits.  Table alignment must be
 	 * big enough to fit it.
 	 *
-	 * Likewise, hugeapge pagetable pointers contain a (different)
-	 * shift value in the low bits.  All tables must be aligned so
+	 * Likewise, hugepage pagetable pointers contain a (different)
+	 * huge page size in the low bits.  All tables must be aligned so
 	 * as to leave enough 0 bits in the address to contain it. */
 	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
 				     HUGEPD_SHIFT_MASK + 1);
@@ -121,17 +121,17 @@ void __pgtable_cache_add(unsigned int shift, unsigned long table_size,
 	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
 	 * constant expression, so so much for that. */
 	BUG_ON(!is_power_of_2(minalign));
-	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+	BUG_ON((index < 1) || (index > MAX_PGTABLE_INDEX_SIZE));
 
-	if (PGT_CACHE(shift))
+	if (PGT_CACHE(index))
 		return; /* Already have a cache of this size */
 
 	align = max_t(unsigned long, align, minalign);
-	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", index);
 	new = kmem_cache_create(name, table_size, align, 0, ctor);
-	PGT_CACHE(shift) = new;
+	PGT_CACHE(index) = new;
 
-	pr_debug("Allocated pgtable cache for order %d\n", shift);
+	pr_debug("Allocated pgtable cache for order %d\n", index);
 }
 
 
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 07/26] powerpc: Add size argument to pgtable_cache_add
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We will use this later with THP changes to request for pmd table of double the size.
THP code does PTE page allocation along with large page request and deposit them
for later use. This is to ensure that we won't have any failures when we split
huge pages to regular pages.

On powerpc we want to use the deposited PTE page for storing hash pte slot and
secondary bit information for the HPTEs. Hence we save them in the second half
of the pmd table.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable-ppc64.h |    7 ++++++-
 arch/powerpc/mm/init_64.c                |    4 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 0182c20..d51d893 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -338,8 +338,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define pgoff_to_pte(off)	((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS	(BITS_PER_LONG - PTE_RPN_SHIFT)
 
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
+extern void __pgtable_cache_add(unsigned shift, unsigned long table_size,
+				void (*ctor)(void *));
 void pgtable_cache_init(void);
+static inline void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+	return __pgtable_cache_add(shift, sizeof(void *) << shift, ctor);
+}
 
 /*
  * find_linux_pte returns the address of a linux pte for a given
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a4529..d6df419 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -100,10 +100,10 @@ struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
  * everything else.  Caches created by this function are used for all
  * the higher level pagetables, and for hugepage pagetables.
  */
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+void __pgtable_cache_add(unsigned int shift, unsigned long table_size,
+			 void (*ctor)(void *))
 {
 	char *name;
-	unsigned long table_size = sizeof(void *) << shift;
 	unsigned long align = table_size;
 
 	/* When batching pgtable pointers for RCU freeing, we store
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 04/26] powerpc: Reduce the PTE_INDEX_SIZE
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

This make one PMD cover 16MB range. That helps in easier implementation of THP
on power. THP core code make use of one pmd entry to track the huge page and
the range mapped by a single pmd entry should be equal to the huge page size
supported by the hardware.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable-ppc64-64k.h |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index be4e287..3c529b4 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -4,10 +4,10 @@
 #include <asm-generic/pgtable-nopud.h>
 
 
-#define PTE_INDEX_SIZE  12
+#define PTE_INDEX_SIZE  8
 #define PMD_INDEX_SIZE  12
 #define PUD_INDEX_SIZE	0
-#define PGD_INDEX_SIZE  6
+#define PGD_INDEX_SIZE  10
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(real_pte_t) << PTE_INDEX_SIZE)
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 03/26] powerpc: Don't hard code the size of pte page
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

USE PTRS_PER_PTE to indicate the size of pte page. To support THP,
later patches will be changing PTRS_PER_PTE value.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pgtable.h |    6 ++++++
 arch/powerpc/mm/hash_low_64.S      |    4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index a9cbd3b..4b52726 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -17,6 +17,12 @@ struct mm_struct;
 #  include <asm/pgtable-ppc32.h>
 #endif
 
+/*
+ * We save the slot number & secondary bit in the second half of the
+ * PTE page. We use the 8 bytes per each pte entry.
+ */
+#define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
+
 #ifndef __ASSEMBLY__
 
 #include <asm/tlbflush.h>
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 7443481..abdd5e2 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -490,7 +490,7 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 	beq	htab_inval_old_hpte
 
 	ld	r6,STK_PARAM(R6)(r1)
-	ori	r26,r6,0x8000		/* Load the hidx mask */
+	ori	r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
 	ld	r26,0(r26)
 	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
 	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
@@ -607,7 +607,7 @@ htab_pte_insert_ok:
 	sld	r4,r4,r5
 	andc	r26,r26,r4
 	or	r26,r26,r3
-	ori	r5,r6,0x8000
+	ori	r5,r6,PTE_PAGE_HIDX_OFFSET
 	std	r26,0(r5)
 	lwsync
 	std	r30,0(r6)
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 02/26] powerpc: Save DAR and DSISR in pt_regs on MCE
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

We were not saving DAR and DSISR on MCE. Save then and also print the values
along with exception details in xmon.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/exceptions-64s.S |    9 +++++++++
 arch/powerpc/xmon/xmon.c             |    2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 0e9c48c..d02e730 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -640,9 +640,18 @@ slb_miss_user_pseries:
 	.align	7
 	.globl machine_check_common
 machine_check_common:
+
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
 	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
 	FINISH_NAP
 	DISABLE_INTS
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
 	bl	.save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.machine_check_exception
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 1f8d2f1..a72e490 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1423,7 +1423,7 @@ static void excprint(struct pt_regs *fp)
 	printf("    sp: %lx\n", fp->gpr[1]);
 	printf("   msr: %lx\n", fp->msr);
 
-	if (trap == 0x300 || trap == 0x380 || trap == 0x600) {
+	if (trap == 0x300 || trap == 0x380 || trap == 0x600 || trap == 0x200) {
 		printf("   dar: %lx\n", fp->dar);
 		if (trap != 0x380)
 			printf(" dsisr: %lx\n", fp->dsisr);
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 01/26] powerpc: Use signed formatting when printing error
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1362550227-575-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

PAPR defines these errors as negative values. So print them accordingly
for easy debugging.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/lpar.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 0da39fe..a77c35b 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -155,7 +155,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	 */
 	if (unlikely(lpar_rc != H_SUCCESS)) {
 		if (!(vflags & HPTE_V_BOLTED))
-			pr_devel(" lpar err %lu\n", lpar_rc);
+			pr_devel(" lpar err %ld\n", lpar_rc);
 		return -2;
 	}
 	if (!(vflags & HPTE_V_BOLTED))
-- 
1.7.10

^ permalink raw reply related

* [PATCH -V2 00/26]T HP support for PPC64
From: Aneesh Kumar K.V @ 2013-03-06  6:10 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev

Hi,

This patchset adds transparent huge page support for PPC64.

TODO:
* ppc64 KVM related changes
* powernv still doesn't boot
* hash preload support in update_mmu_cache_pmd

Some numbers:

The latency measurements code from Anton  found at
http://ozlabs.org/~anton/junkcode/latency2001.c

THP disabled 64K page size
------------------------
[root@llmp24l02 ~]# ./latency2001 8G
 8589934592    731.73 cycles    205.77 ns
[root@llmp24l02 ~]# ./latency2001 8G
 8589934592    743.39 cycles    209.05 ns
[root@llmp24l02 ~]#

THP disabled large page via hugetlbfs
-------------------------------------
[root@llmp24l02 ~]# ./latency2001  -l 8G
 8589934592    416.09 cycles    117.01 ns
[root@llmp24l02 ~]# ./latency2001  -l 8G
 8589934592    415.74 cycles    116.91 ns

THP enabled 64K page size.
----------------
[root@llmp24l02 ~]# ./latency2001 8G
 8589934592    405.07 cycles    113.91 ns
[root@llmp24l02 ~]# ./latency2001 8G
 8589934592    411.82 cycles    115.81 ns
[root@llmp24l02 ~]#

We are close to hugetlbfs in latency and we can achieve this with zero
config/page reservation. Most of the allocations above are fault allocated.
I haven't really measured the collapse alloc impact.

Another test that does 50000000 random access over 1GB area goes from
2.65 seconds to 1.07 seconds with this patchset.

Changes from V1
* Address review comments
* More patch split
* Add batch hpte invalidate for hugepages.

Changes from RFC V2:
* Address review comments
* More code cleanup and patch split

Changes from RFC V1:
* HugeTLB fs now works
* Compile issues fixed
* rebased to v3.8
* Patch series reorded so that ppc64 cleanups and MM THP changes are moved
  early in the series. This should help in picking those patches early.

Thanks,
-aneesh

^ permalink raw reply

* 3.9-rc1 powerpc ptrace.c: 'brk.len' is used uninitialized
From: Philippe De Muyter @ 2013-03-06  6:00 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, linux-kernel

Hello Michael,

bisect tells me that since your commit 9422de3e953d0e60eb95f5430a9dd803eec1c6d7 
"powerpc: Hardware breakpoints rewrite to handle non DABR breakpoint registers",
compiling linux fails with :

  cc1: warnings being treated as errors
  arch/powerpc/kernel/ptrace.c: In function 'arch_ptrace':
  arch/powerpc/kernel/ptrace.c:1450: warning: 'brk.len' is used uninitialized in this function
  arch/powerpc/kernel/ptrace.c:1352: note: 'brk.len' was declared here

could you look at that ?

Thanks

Philippe

^ permalink raw reply

* Re: [PATCH 5/6][v4]: perf: Create a sysfs entry for Power event format
From: Sukadev Bhattiprolu @ 2013-03-06  5:48 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Andi Kleen, Peter Zijlstra, robert.richter, Anton Blanchard,
	linux-kernel, Stephane Eranian, linuxppc-dev, Ingo Molnar,
	Paul Mackerras, Arnaldo Carvalho de Melo, Jiri Olsa
In-Reply-To: <20130227011725.GA5819@concordia>

Michael Ellerman [michael@ellerman.id.au] wrote:
| I suspect Arnaldo was either waiting for an ACK from Ben, or was
| expecting Ben to take it?

Arnaldo, here is an updated patch. If it is acked by Paul Mackerras,
Michael Ellerman or Ben, will you add it to your tree so the whole
patchset comes from one place ?

Sukadev

---
>From 50c7a46f14083c0ed10d66b7aed66ba76e798550 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 5 Mar 2013 21:20:56 -0800
Subject: [PATCH] [PATCH 5/6][v4]: perf Create a sysfs format entry for Power7 events

Create a sysfs entry, '/sys/bus/event_source/devices/cpu/format/event'
which describes the format of the POWER7 PMU events.

This code is based on corresponding code in x86.

Changelog[v4]:  [Michael Ellerman, Paul Mckerras] The event format is different
		for other POWER cpus. So move the code to POWER7-specific,
		power7-pmu.c Also, the POWER7 format uses bits 0-19 not 0-20.

Changelog[v2]: [Jiri Osla] Use PMU_FORMAT_ATTR rather than duplicating code.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
 arch/powerpc/perf/power7-pmu.c |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index b554879..3c475d6 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -420,7 +420,20 @@ static struct attribute_group power7_pmu_events_group = {
 	.attrs = power7_events_attr,
 };
 
+PMU_FORMAT_ATTR(event, "config:0-19");
+
+static struct attribute *power7_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+struct attribute_group power7_pmu_format_group = {
+	.name = "format",
+	.attrs = power7_pmu_format_attr,
+};
+
 static const struct attribute_group *power7_pmu_attr_groups[] = {
+	&power7_pmu_format_group,
 	&power7_pmu_events_group,
 	NULL,
 };
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH 2/3] irq: Add hw continuous IRQs map to virtual continuous IRQs support
From: Michael Ellerman @ 2013-03-06  5:42 UTC (permalink / raw)
  To: Mike Qiu; +Cc: tglx, linuxppc-dev, linux-kernel
In-Reply-To: <5136D582.80101@linux.vnet.ibm.com>

On Wed, Mar 06, 2013 at 01:34:58PM +0800, Mike Qiu wrote:
> 于 2013/3/6 11:54, Michael Ellerman 写道:
> >On Tue, Mar 05, 2013 at 03:19:57PM +0800, Mike Qiu wrote:
> >>于 2013/3/5 10:23, Michael Ellerman 写道:
> >>>On Tue, Jan 15, 2013 at 03:38:55PM +0800, Mike Qiu wrote:
> >>>>diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
> >>>>index 96f3a1d..38648e6 100644
> >>>>--- a/kernel/irq/irqdomain.c
> >>>>+++ b/kernel/irq/irqdomain.c
> >>>>@@ -636,6 +636,67 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
> >>>>+/**
> >>>>+ * irq_create_mapping_many - Map a range of hw IRQs to a range of virtual IRQs
> >>>>+ * @domain: domain owning the interrupt range
> >>>>+ * @hwirq_base: beginning of continuous hardware IRQ range
> >>>>+ * @count: Number of interrupts to map
> >>>For multiple-MSI the allocated interrupt numbers must be a power-of-2,
> >>>and must be naturally aligned. I don't /think/ that's a requirement for
> >>>the virtual numbers, but it's probably best that we do it anyway.
> >>>
> >>>So this API needs to specify that it will give you back a power-of-2
> >>>block that is naturally aligned - otherwise you can't use it for MSI.
> >>rtas_call will return the numbers of hardware interrupt, and it
> >>should be power-of-2, as this I think do not need to specify
> >You're confusing hardware interrupt numbers and virtual interrupt
> >numbers. My comment is about irq_create_mapping_many(), which returns
> >virtual interrupt numbers.
> >
> >As I said I don't think there is a requirement that the virtual
> >interrupt numbers are also a power-of-2 naturally aligned block, but we
> >should allocate them as one anyway, to avoid any issues in future.

> But for virtual interrupt numbersit should be a power-of-2 naturally
> aligned block, because it must be continuous, as the MSI-HOWTO.txt says:
> 
>     4.2.2 pci_enable_msi_block
>     int pci_enable_msi_block(struct pci_dev *dev, int count)
>     This variation on the above call allows a device driver to request
>     multiple MSIs.  The MSI specification only allows interrupts to be
>     allocated in powers of two, up to a maximum of 2^5 (32).
>     If this function returns 0, it has succeeded in allocating at least
>     as many interrupts as the driver requested
>     (it may have allocated more in order to satisfy the power-of-two
>     requirement). In this case, the function enables MSI on this device
>     and updates dev->irq to be the lowest of the new interrupts
>     assigned to it. The other interrupts assigned to the device are in
>     the range dev->irq to dev->irq + count - 1.
> 
> See the last line, that means for the virtual interrupts must be a
> continuous block.

In practice I think things could work if we didn't, because we are not
using the mask routines that assume that layout.

But you're right, we must implement the API as it's specified, so the
virtual interrupt numbers must be a naturally aligned power-of-2.

cheers

^ permalink raw reply

* Re: [PATCH 2/3] irq: Add hw continuous IRQs map to virtual continuous IRQs support
From: Mike Qiu @ 2013-03-06  5:34 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: tglx, linuxppc-dev, linux-kernel
In-Reply-To: <20130306035443.GB3493@concordia>

[-- Attachment #1: Type: text/plain, Size: 5555 bytes --]

于 2013/3/6 11:54, Michael Ellerman 写道:
> On Tue, Mar 05, 2013 at 03:19:57PM +0800, Mike Qiu wrote:
>> 于 2013/3/5 10:23, Michael Ellerman 写道:
>>> On Tue, Jan 15, 2013 at 03:38:55PM +0800, Mike Qiu wrote:
>>>> Adding a function irq_create_mapping_many() which can associate
>>>> multiple MSIs to a continous irq mapping.
>>>>
>>>> This is needed to enable multiple MSI support for pSeries.
>>>>
>>>> Signed-off-by: Mike Qiu <qiudayu@linux.vnet.ibm.com>
>>>> ---
>>>>   include/linux/irq.h       |    2 +
>>>>   include/linux/irqdomain.h |    3 ++
>>>>   kernel/irq/irqdomain.c    |   61 +++++++++++++++++++++++++++++++++++++++++++++
>>>>   3 files changed, 66 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/include/linux/irq.h b/include/linux/irq.h
>>>> index 60ef45b..e00a7ec 100644
>>>> --- a/include/linux/irq.h
>>>> +++ b/include/linux/irq.h
>>>> @@ -592,6 +592,8 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
>>>>   #define irq_alloc_desc_from(from, node)		\
>>>>   	irq_alloc_descs(-1, from, 1, node)
>>>> +#define irq_alloc_desc_n(nevc, node)		\
>>>> +	irq_alloc_descs(-1, 0, nevc, node)
>>> This has been superseeded by irq_alloc_descs_from(), which is the right
>>> way to do it.
>> Yes, but irq_alloc_descs_from() just for 1 irq
> No it's not, look again.
>
> #define irq_alloc_descs_from(from, cnt, node)   \
> 	irq_alloc_descs(-1, from, cnt, node)
Sorry, I see as irq_alloc_desc_from(from, node)
you are right
>
>
>>>> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
>>>> index 96f3a1d..38648e6 100644
>>>> --- a/kernel/irq/irqdomain.c
>>>> +++ b/kernel/irq/irqdomain.c
>>>> @@ -636,6 +636,67 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
>>>> +/**
>>>> + * irq_create_mapping_many - Map a range of hw IRQs to a range of virtual IRQs
>>>> + * @domain: domain owning the interrupt range
>>>> + * @hwirq_base: beginning of continuous hardware IRQ range
>>>> + * @count: Number of interrupts to map
>>> For multiple-MSI the allocated interrupt numbers must be a power-of-2,
>>> and must be naturally aligned. I don't /think/ that's a requirement for
>>> the virtual numbers, but it's probably best that we do it anyway.
>>>
>>> So this API needs to specify that it will give you back a power-of-2
>>> block that is naturally aligned - otherwise you can't use it for MSI.
>> rtas_call will return the numbers of hardware interrupt, and it
>> should be power-of-2, as this I think do not need to specify
> You're confusing hardware interrupt numbers and virtual interrupt
> numbers. My comment is about irq_create_mapping_many(), which returns
> virtual interrupt numbers.
>
> As I said I don't think there is a requirement that the virtual
> interrupt numbers are also a power-of-2 naturally aligned block, but we
> should allocate them as one anyway, to avoid any issues in future.
But for virtual interrupt numbersit should be a power-of-2 naturally
aligned block, because it must be continuous, as the MSI-HOWTO.txt says:

     4.2.2 pci_enable_msi_block
     int pci_enable_msi_block(struct pci_dev *dev, int count)
     This variation on the above call allows a device driver to request
     multiple MSIs.  The MSI specification only allows interrupts to be
     allocated in powers of two, up to a maximum of 2^5 (32).
     If this function returns 0, it has succeeded in allocating at least
     as many interrupts as the driver requested
     (it may have allocated more in order to satisfy the power-of-two
     requirement). In this case, the function enables MSI on this device
     and updates dev->irq to be the lowest of the new interrupts
     assigned to it. The other interrupts assigned to the device are in
     the range dev->irq to dev->irq + count - 1.

See the last line, that means for the virtual interrupts must be a
continuous block.
> And so this API, which returns virtual interrupt numbers, must satisfy
> that specification.
>
>>>> +	/* Look for default domain if nececssary */
>>>> +	if (!domain)
>>>> +		domain = irq_default_domain;
>>>> +	if (!domain) {
>>>> +		pr_warn("irq_create_mapping called for NULL domain, hwirq=%lx\n"
>>>> +			, hwirq_base);
>>>> +		WARN_ON(1);
>>>> +		return 0;
>>>> +	}
>>>> +	pr_debug("-> using domain @%p\n", domain);
>>>> +
>>>> +	/* For IRQ_DOMAIN_MAP_LEGACY, get the first virtual interrupt number */
>>>> +	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
>>>> +		return irq_domain_legacy_revmap(domain, hwirq_base);
>>> The above doesn't work.
>> Why it doesn't work ?
> Because irq_domain_legacy_revmap() only allocates a single interrupt
> number.
OK, your right.
>>>> +	/* Check if mapping already exists */
>>>> +	for (i = 0; i < count; i++) {
>>>> +		virq = irq_find_mapping(domain, hwirq_base+i);
>>>> +		if (virq) {
>>>> +			pr_debug("existing mapping on virq %d,"
>>>> +					" now dispose it first\n", virq);
>>>> +			irq_dispose_mapping(virq);
>>> You might have just disposed of someone elses mapping, we shouldn't do
>>> that. It should be an error to the caller.
>> It's a good question. If the interrupt used for someone elses, why I
>> can apply it from the system?
> I agree, that would be a bug. But disposing of someone elses mapping is
> not OK.
>
>> So it may someone else forget to dispose mapping, and it never be
>> used for others as I have got the interrupt I think.
> Perhaps, but that is a bug that needs to be fixed in the code that
> forgets to dispose of the mapping.
>
> cheers
>


[-- Attachment #2: Type: text/html, Size: 8453 bytes --]

^ permalink raw reply

* Re: [PATCH -V1 06/24] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-03-06  5:03 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130305021219.GC2888@iris.ozlabs.ibm.com>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Mar 04, 2013 at 04:28:42PM +0530, Aneesh Kumar K.V wrote:
>> The last one that ends up doing atomic_xor_bits which cause the mapcount
>> to go zero, will take the page off the list and free the page. 
>
> No, look at the example again.  page_table_free_rcu() won't take it
> off the list because it uses the (mask & FRAG_MASK) == 0 test, which
> fails (one fragment is still in use).  page_table_free() won't take it
> off the list because it uses the mask == 0 test, which also fails (one
> fragment is still waiting for the RCU grace period).  Finally,
> __page_table_free_rcu() doesn't take it off the list, it just frees
> the page.  Oops. :)


How about the below

--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -425,7 +425,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        bit = 1 << ((__pa(table) & ~PAGE_MASK) / PTE_FRAG_SIZE);
        spin_lock(&mm->page_table_lock);
        mask = atomic_xor_bits(&page->_mapcount, bit);
-       if (mask == 0)
+       if (!(mask & FRAG_MASK))
                list_del(&page->lru);
        else if (mask & FRAG_MASK) {
                /*
@@ -446,7 +446,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)


ie, we always remove the page from the list, when the lower half is
zero or lower half is FRAG_MASK.  We free the page when _mapcount is 0.

-aneesh

^ permalink raw reply

* Re: [PATCH -V1 09/24] powerpc: Decode the pte-lp-encoding bits correctly.
From: Aneesh Kumar K.V @ 2013-03-06  4:30 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130305020205.GB2888@iris.ozlabs.ibm.com>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Mar 04, 2013 at 05:11:53PM +0530, Aneesh Kumar K.V wrote:
>> Paul Mackerras <paulus@samba.org> writes:
>> >> +static inline int hpte_actual_psize(struct hash_pte *hptep, int psiz=
e)
>> >> +{
>> >> +	unsigned int mask;
>> >> +	int i, penc, shift;
>> >> +	/* Look at the 8 bit LP value */
>> >> +	unsigned int lp =3D (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
>> >> +
>> >> +	penc =3D 0;
>> >> +	for (i =3D 0; i < MMU_PAGE_COUNT; i++) {
>> >> +		/* valid entries have a shift value */
>> >> +		if (!mmu_psize_defs[i].shift)
>> >> +			continue;
>> >> +
>> >> +		/* encoding bits per actual page size */
>> >> +		shift =3D mmu_psize_defs[i].shift - 11;
>> >> +		if (shift > 9)
>> >> +			shift =3D 9;
>> >> +		mask =3D (1 << shift) - 1;
>> >> +		if ((lp & mask) =3D=3D mmu_psize_defs[psize].penc[i])
>> >> +			return i;
>> >> +	}
>> >> +	return -1;
>> >> +}
>> >
>> > This doesn't look right to me.  First, it's not clear what the 11 and
>> > 9 refer to, and I think the 9 should be LP_BITS (i.e. 8).  Secondly,
>> > the mask for the comparison needs to depend on the actual page size
>> > not the base page size.
>>=20
>> That 11 should be 12.That depends on the fact that we have below mapping
>
> And the 12 should be LP_SHIFT, shouldn't it?

LP_SHIFT would indicate how many bit poisition need to be shifted to get
to the LP field in HPTE. I guess what we want here is shift value for 4K
page.  How about=20

shift =3D mmu_psize_defs[i].shift - mmu_psize_defs[MMU_PAGE_4K].shift;


>
>>  rrrr rrrz 	=E2=89=A58KB
>>=20
>> Yes, that 9 should be LP_BITs.=20
>>=20
>> We are generating mask based on actual page size above (variable i in
>> the for loop).
>
> OK, yes, you're right.
>
>> > I don't see where in this function you set the penc[] elements for
>> > invalid actual page sizes to -1.
>>=20
>> We do the below
>>=20
>> --- a/arch/powerpc/mm/hash_utils_64.c
>> +++ b/arch/powerpc/mm/hash_utils_64.c
>> @@ -125,7 +125,7 @@ static struct mmu_psize_def mmu_psize_defaults_old[]=
 =3D {
>>         [MMU_PAGE_4K] =3D {
>>                 .shift  =3D 12,
>>                 .sllp   =3D 0,
>> -               .penc   =3D 0,
>> +               .penc   =3D { [0 ... MMU_PAGE_COUNT - 1] =3D -1 },
>>                 .avpnm  =3D 0,
>
> Yes, which sets them for the entries you initialize, but not for the
> others.  For example, the entry for MMU_PAGE_64K will initially be all
> zeroes.  Then we find an entry in the ibm,segment-page-sizes property
> for 64k pages, so we set mmu_psize_defs[MMU_PAGE_64K].shift to 16,
> making that entry valid, but we never set any of the .penc[] entries
> to -1, leading your other code to think that it can do (say) 1M pages
> in a 64k segment using an encoding of 0.
>

Noticed that earlier. This is what i currently have.

+static void mmu_psize_set_default_penc(struct mmu_psize_def *mmu_psize)
+{
+	int bpsize, apsize;
+	for (bpsize =3D 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+		for (apsize =3D 0; apsize < MMU_PAGE_COUNT; apsize++)
+			mmu_psize[bpsize].penc[apsize] =3D -1;
+}
+
 static void __init htab_init_page_sizes(void)
 {
 	int rc;
=20
+	mmu_psize_set_default_penc(mmu_psize_defaults_old);
+
 	/* Default to 4K pages only */
 	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
 	       sizeof(mmu_psize_defaults_old));
@@ -411,6 +443,8 @@ static void __init htab_init_page_sizes(void)
 	if (rc !=3D 0)  /* Found */
 		goto found;
=20
+	mmu_psize_set_default_penc(mmu_psize_defaults_gp);
+
 	/*
 	 * Not in the device-tree, let's fallback on known size
 	 * list for 16M capable GP & GR
	Modified   arch/powerpc/mm/hugetlbpage-hash64.c



> Also, I noticed that the code in the if (base_idx < 0) statement is
> wrong.  It needs to advance prop (and decrease size) by 2 * lpnum,
> not just 2.
>

Ok. Fixed now.

-aneesh

^ permalink raw reply

* Re: [PATCH -V1 07/24] powerpc: Add size argument to pgtable_cache_add
From: Aneesh Kumar K.V @ 2013-03-06  4:23 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130305015041.GA2888@iris.ozlabs.ibm.com>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Mar 04, 2013 at 04:32:24PM +0530, Aneesh Kumar K.V wrote:
>> 
>> Now with table_size argument, the first arg is no more the shift value,
>> rather it is index into the array. Hence i changed the variable name. I
>> will split that patch to make it easy for review.
>
> OK, so you're saying that the simple relation between index and the
> size of the objects in PGT_CACHE(index) no longer holds. That worries
> me, because now, what guarantees that two callers won't use the same
> index value with two different sizes?  And what guarantees that we
> won't have two callers using different index values but the same size
> (which wouldn't be a disaster but would be a waste of space)?
>
> I think it would be preferable to keep the relation between shift and
> the size of the objects and just arrange to use a different shift
> value for the pmd objects when you need to.

Most of the places we get the cache pointer by doing something like.
PGT_CACHE(PMD_INDEX_SIZE). What we need is that kmem_cache to return an
object twice the size of PMD_TABLE_SIZE. The relevant diff in the later
patch is below.

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/*
+	 * we store the pgtable details in the second half of PMD
+	 */
+	if (PGT_CACHE(PMD_INDEX_SIZE))
+		pr_err("PMD Page cache already initialized with different size\n");
+	__pgtable_cache_add(PMD_INDEX_SIZE, PMD_TABLE_SIZE * 2, pmd_ctor);
+#else
 	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+#endif

^ permalink raw reply

* Re: [PATCH] powerpc/powernv: Fix next available MSI IRQ
From: Gavin Shan @ 2013-03-06  4:09 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <20130306032454.GA3493@concordia>

On Wed, Mar 06, 2013 at 02:24:54PM +1100, Michael Ellerman wrote:
>On Tue, Mar 05, 2013 at 02:59:16PM +0800, Gavin Shan wrote:
>> The allocation of MSI is implemented based on bitmap and working
>> like the mechanism of strict round through the traced next available
>> cursor. However, the next available MSI is never updated in current
>> implementation. The patch fixes the issue.
>> 
>> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/pci.c |    5 +++++
>>  1 files changed, 5 insertions(+), 0 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 6f464dc..9cf18c4 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -66,6 +66,11 @@ static unsigned int pnv_get_one_msi(struct pnv_phb *phb)
>>  		rc = 0;
>>  		goto out;
>>  	}
>> +
>> +	if (id >= phb->msi_count - 1)
>> +		phb->msi_next = 0;
>> +	else
>> +		phb->msi_next = id + 1;
>>  	__set_bit(id, phb->msi_map);
>
>
>There is code in arch/powerpc/sysdev/msi_bitmap.c that implements a
>bitmap allocator for MSI. It may not do what you need but please take a
>look at it if you haven't already.
>

Thanks, Michael. I neve know that you've implemented bitmaps to manage
MSI interrupts. It seems arch/powerpc/sysdev/msi_bitmap.c meets our
requirment here except that needs device tree node. Fortunately, we
can set the corresponding device tree node to NULL and functions playing
with the device tree nodes (of_node_get/of_node_put) works well for NULL
device tree node.

I'll update powernv platform to use msi_bitmap.c to manage it MSI interrupts.

Thanks,
Gavin

^ permalink raw reply

* Re: [PATCH] ppc32: Fix compile of sha1-powerpc-asm.S
From: Michael Ellerman @ 2013-03-06  4:09 UTC (permalink / raw)
  To: Christian Kujau; +Cc: LinuxPPC-dev
In-Reply-To: <alpine.DEB.2.10.1303041656010.22410@trent.utfs.org>

On Mon, Mar 04, 2013 at 05:23:14PM -0800, Christian Kujau wrote:
> On Tue, 26 Feb 2013 at 13:20, Tony Breeds wrote:
> > When building with CRYPTO_SHA1_PPC enabled we fail with:
> > ---
> > powerpc/crypto/sha1-powerpc-asm.S: Assembler messages:
> > powerpc/crypto/sha1-powerpc-asm.S:116: Error: can't resolve `0' {*ABS* section} - `STACKFRAMESIZE' {*UND* section}
> > powerpc/crypto/sha1-powerpc-asm.S:116: Error: expression too complex
> > powerpc/crypto/sha1-powerpc-asm.S:178: Error: unsupported relocation against STACKFRAMESIZE
> > ---
> > 
> > Use INT_FRAME_SIZE instead.
> > 
> > Signed-off-by: Tony Breeds <tony@bakeyournoodle.com>
> 
> Thanks for the fix! Ran into this as well, with your patch 3.9-rc1 
> compiles again (and it even boots :-))
> 
> Tested-by: Christian Kujau <lists@nerdbynature.de>
> 
> $ grep -A10 sha1 /proc/crypto
> name         : sha1
> driver       : sha1-powerpc
> module       : kernel
> priority     : 0
> refcnt       : 1
> selftest     : passed
> type         : shash
> blocksize    : 64
> digestsize   : 20

Thanks Christian. What hardware are you on?

cheers

^ permalink raw reply

* Re: [PATCH -V1 06/24] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-03-06  4:08 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130305021219.GC2888@iris.ozlabs.ibm.com>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Mar 04, 2013 at 04:28:42PM +0530, Aneesh Kumar K.V wrote:
>> Paul Mackerras <paulus@samba.org> writes:
>> 
>> > The other general comment I have is that it's not really clear when a
>> > page will be on the mm->context.pgtable_list and when it won't.  I
>> > would like to see an invariant that says something like "the page is
>> > on the pgtable_list if and only if (page->_mapcount & FRAG_MASK) is
>> > neither 0 nor FRAG_MASK".  But that doesn't seem to be the case
>> > exactly, and I can't see any consistent rule, which makes me think
>> > there are going to be bugs in corner cases.
>> >
>> 
>> 
>> I added the below comment when initializing the list.
>> 
>> +#ifdef CONFIG_PPC_64K_PAGES
>> +       /*
>> +        * Used to support 4K PTE fragment. The pages are added to list,
>> +        * when we have free framents in the page. We track the whether
>> +        * a page frament is available using page._mapcount. A value of
>> +        * zero indicate none of the fragments are used and page can be
>> +        * freed. A value of FRAG_MASK indicate all the fragments are used
>> +        * and hence the page will be removed from the below list.
>> +        */
>> +       INIT_LIST_HEAD(&init_mm.context.pgtable_list);
>> +#endif
>> 
>> I am not sure about why you say there is no consistent rule. Can you
>> elaborate on that ?
>
> Well, sometimes you take things off the list when mask == 0, and
> sometimes when (mask & FRAG_MASK) == 0.  So it's not clear whether the
> page is supposed to be on the list when (mask & FRAG_MASK) == 0 but
> mask != 0.  If you stated in a comment what the rule was supposed to
> be then reviewers could check whether your code implemented that rule.
> Also, if you had a consistent rule you could more easily add debug
> code to check that the rule was being followed.


I guess you are looking at this in page_table_free_rcu ?

+	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << FRAG_MASK_BITS));
+	if (!(mask & FRAG_MASK))
+		list_del(&page->lru);
+	else {

We want to remove the page from list looking at the lower half bits. If
all the bits are cleared, that indicate nobody is using that page. But
then we may have pending rcu, which is indicated by higher half. So if
the lower half is 0 we can remove from the list. _mapcount is 0 we can
free the page. 

Now if all the bits in the lower half is set then also we remove the
page from the list, because we don't have any free fragments in the
page.


>
> Also, that comment above doesn't say anything about the upper bits and
> whether they have any influence on whether the page should be on the
> list or not.


will add more to the comment.

>
>> > Consider, for example, the case where a page has two fragments still
>> > in use, and one of them gets queued up by RCU for freeing via a call
>> > to page_table_free_rcu, and then the other one gets freed through
>> > page_table_free().  Neither the call to page_table_free_rcu nor the
>> > call to page_table_free will take the page off the list AFAICS, and
>> > then __page_table_free_rcu() will free the page while it's still on
>> > the pgtable_list.
>> 
>> The last one that ends up doing atomic_xor_bits which cause the mapcount
>> to go zero, will take the page off the list and free the page. 
>
> No, look at the example again.  page_table_free_rcu() won't take it
> off the list because it uses the (mask & FRAG_MASK) == 0 test, which
> fails (one fragment is still in use).  page_table_free() won't take it
> off the list because it uses the mask == 0 test, which also fails (one
> fragment is still waiting for the RCU grace period).  Finally,
> __page_table_free_rcu() doesn't take it off the list, it just frees
> the page.  Oops. :)

Got it, I will see how we can fix that.

-aneesh

^ permalink raw reply

* Re: [PATCH -V1 06/24] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-03-06  4:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Paul Mackerras, linux-mm
In-Reply-To: <1362440204.21357.20.camel@pasglop>

Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Mon, 2013-03-04 at 16:28 +0530, Aneesh Kumar K.V wrote:
>> I added the below comment when initializing the list.
>> 
>> +#ifdef CONFIG_PPC_64K_PAGES
>> +       /*
>> +        * Used to support 4K PTE fragment. The pages are added to list,
>> +        * when we have free framents in the page. We track the whether
>> +        * a page frament is available using page._mapcount. A value of
>> +        * zero indicate none of the fragments are used and page can be
>> +        * freed. A value of FRAG_MASK indicate all the fragments are used
>> +        * and hence the page will be removed from the below list.
>> +        */
>> +       INIT_LIST_HEAD(&init_mm.context.pgtable_list);
>> +#endif
>> 
>> I am not sure about why you say there is no consistent rule. Can you
>> elaborate on that ?
>
> Do you really need that list ? I assume it's meant to allow you to find
> free frags when allocating but my worry is that you'll end up losing
> quite a bit of node locality of PTE pages....
>
> It may or may not work but can you investigate doing things differently
> here ? The idea I want you to consider is to always allocate a full
> page, but make the relationship of the fragments to PTE pages fixed. IE.
> the fragment in the page is a function of the VA.
>
> Basically, the algorithm for allocation is roughly:
>
>  - Walk the tree down to the PMD ptr (* that can be improved with a
> generic change, see below)
>
>  - Check if any of the neighbouring PMDs is populated. If yes, you have
> your page and pick the appropriate fragment based on the VA
>
>  - If not, allocate and populate
>
> On free, similarly, you checked if all neighbouring PMDs have been
> cleared, in which case you can fire off the page for RCU freeing.
>
> (*) By changing pte_alloc_one to take the PMD ptr (which the call side
> has right at hand) you can avoid the tree lookup.
>

Will try this.

-aneesh

^ permalink raw reply

* Re: [PATCH 2/3] irq: Add hw continuous IRQs map to virtual continuous IRQs support
From: Michael Ellerman @ 2013-03-06  3:54 UTC (permalink / raw)
  To: Mike Qiu; +Cc: tglx, linuxppc-dev, linux-kernel
In-Reply-To: <51359C9D.5030009@linux.vnet.ibm.com>

On Tue, Mar 05, 2013 at 03:19:57PM +0800, Mike Qiu wrote:
> 于 2013/3/5 10:23, Michael Ellerman 写道:
> >On Tue, Jan 15, 2013 at 03:38:55PM +0800, Mike Qiu wrote:
> >>Adding a function irq_create_mapping_many() which can associate
> >>multiple MSIs to a continous irq mapping.
> >>
> >>This is needed to enable multiple MSI support for pSeries.
> >>
> >>Signed-off-by: Mike Qiu <qiudayu@linux.vnet.ibm.com>
> >>---
> >>  include/linux/irq.h       |    2 +
> >>  include/linux/irqdomain.h |    3 ++
> >>  kernel/irq/irqdomain.c    |   61 +++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 66 insertions(+), 0 deletions(-)
> >>
> >>diff --git a/include/linux/irq.h b/include/linux/irq.h
> >>index 60ef45b..e00a7ec 100644
> >>--- a/include/linux/irq.h
> >>+++ b/include/linux/irq.h
> >>@@ -592,6 +592,8 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> >>  #define irq_alloc_desc_from(from, node)		\
> >>  	irq_alloc_descs(-1, from, 1, node)
> >>+#define irq_alloc_desc_n(nevc, node)		\
> >>+	irq_alloc_descs(-1, 0, nevc, node)
> >This has been superseeded by irq_alloc_descs_from(), which is the right
> >way to do it.

> Yes, but irq_alloc_descs_from() just for 1 irq

No it's not, look again.

#define irq_alloc_descs_from(from, cnt, node)   \
	irq_alloc_descs(-1, from, cnt, node)


> >>diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
> >>index 96f3a1d..38648e6 100644
> >>--- a/kernel/irq/irqdomain.c
> >>+++ b/kernel/irq/irqdomain.c
> >>@@ -636,6 +636,67 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
> >>  }
> >>  EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
> >>+/**
> >>+ * irq_create_mapping_many - Map a range of hw IRQs to a range of virtual IRQs
> >>+ * @domain: domain owning the interrupt range
> >>+ * @hwirq_base: beginning of continuous hardware IRQ range
> >>+ * @count: Number of interrupts to map

> >For multiple-MSI the allocated interrupt numbers must be a power-of-2,
> >and must be naturally aligned. I don't /think/ that's a requirement for
> >the virtual numbers, but it's probably best that we do it anyway.
> >
> >So this API needs to specify that it will give you back a power-of-2
> >block that is naturally aligned - otherwise you can't use it for MSI.

> rtas_call will return the numbers of hardware interrupt, and it
> should be power-of-2, as this I think do not need to specify

You're confusing hardware interrupt numbers and virtual interrupt
numbers. My comment is about irq_create_mapping_many(), which returns
virtual interrupt numbers.

As I said I don't think there is a requirement that the virtual
interrupt numbers are also a power-of-2 naturally aligned block, but we
should allocate them as one anyway, to avoid any issues in future.

And so this API, which returns virtual interrupt numbers, must satisfy
that specification.

> >>+	/* Look for default domain if nececssary */
> >>+	if (!domain)
> >>+		domain = irq_default_domain;
> >>+	if (!domain) {
> >>+		pr_warn("irq_create_mapping called for NULL domain, hwirq=%lx\n"
> >>+			, hwirq_base);
> >>+		WARN_ON(1);
> >>+		return 0;
> >>+	}
> >>+	pr_debug("-> using domain @%p\n", domain);
> >>+
> >>+	/* For IRQ_DOMAIN_MAP_LEGACY, get the first virtual interrupt number */
> >>+	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
> >>+		return irq_domain_legacy_revmap(domain, hwirq_base);
> >The above doesn't work.
> Why it doesn't work ?

Because irq_domain_legacy_revmap() only allocates a single interrupt
number.

> >>+	/* Check if mapping already exists */
> >>+	for (i = 0; i < count; i++) {
> >>+		virq = irq_find_mapping(domain, hwirq_base+i);
> >>+		if (virq) {
> >>+			pr_debug("existing mapping on virq %d,"
> >>+					" now dispose it first\n", virq);
> >>+			irq_dispose_mapping(virq);

> >You might have just disposed of someone elses mapping, we shouldn't do
> >that. It should be an error to the caller.

> It's a good question. If the interrupt used for someone elses, why I
> can apply it from the system?

I agree, that would be a bug. But disposing of someone elses mapping is
not OK.

> So it may someone else forget to dispose mapping, and it never be
> used for others as I have got the interrupt I think.

Perhaps, but that is a bug that needs to be fixed in the code that
forgets to dispose of the mapping.

cheers

^ permalink raw reply

* [PATCH 2/2] powerpc: Setup in HFSCR for POWER8
From: Michael Neuling @ 2013-03-06  3:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Michael Neuling, linuxppc-dev
In-Reply-To: <1362540924-15253-1-git-send-email-mikey@neuling.org>

Setup the HFSCR (Hypervisor Facility Status and Control Register) for POWER8
when running HV=1.  The HFSCR is the same as the FSCR except it's for
hypervisors.  It controls the available of various facilities in OS and
userspace levels.  It also indicates the cause of a hypervisor facility
unavailable interrupt (although we are not using this here).

This patch sets the facilities Linux knows about incase the firmware doesn't.

Signed-off-by: Michael Neuling <mikey@neuling.org>
---
 arch/powerpc/kernel/cpu_setup_power.S |    8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index ea847ab..2e6ad11 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -57,6 +57,7 @@ _GLOBAL(__setup_cpu_power8)
 	mfspr	r3,SPRN_LPCR
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
+	bl	__init_HFSCR
 	bl	__init_TLB
 	mtlr	r11
 	blr
@@ -72,6 +73,7 @@ _GLOBAL(__restore_cpu_power8)
 	mfspr   r3,SPRN_LPCR
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
+	bl	__init_HFSCR
 	bl	__init_TLB
 	mtlr	r11
 	blr
@@ -120,6 +122,12 @@ __init_FSCR:
 	mtspr	SPRN_FSCR,r3
 	blr
 
+__init_HFSCR:
+	mfspr	r3,SPRN_HFSCR
+	ori	r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP
+	mtspr	SPRN_HFSCR,r3
+	blr
+
 __init_TLB:
 	/* Clear the TLB */
 	li	r6,128
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/2] powerpc: Add HFSCR SPR definitions
From: Michael Neuling @ 2013-03-06  3:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Michael Neuling, linuxppc-dev
In-Reply-To: <1362540924-15253-1-git-send-email-mikey@neuling.org>

Add SPR number and bit definitions for the HFSCR (Hypervisor Facility Status
and Control Register).

Signed-off-by: Michael Neuling <mikey@neuling.org>
---
 arch/powerpc/include/asm/reg.h |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c9c67fc..4ae2d44 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -268,6 +268,12 @@
 #define SPRN_FSCR	0x099	/* Facility Status & Control Register */
 #define   FSCR_TAR	(1 << (63-55)) /* Enable Target Address Register */
 #define   FSCR_DSCR	(1 << (63-61)) /* Enable Data Stream Control Register */
+#define SPRN_HFSCR	0xbe	/* HV=1 Facility Status & Control Register */
+#define   HFSCR_TAR	(1 << (63-55)) /* Enable Target Address Register */
+#define   HFSCR_TM	(1 << (63-58)) /* Enable Transactional Memory */
+#define   HFSCR_DSCR	(1 << (63-61)) /* Enable Data Stream Control Register */
+#define   HFSCR_VECVSX	(1 << (63-62)) /* Enable VMX/VSX  */
+#define   HFSCR_FP	(1 << (63-63)) /* Enable Floating Point */
 #define SPRN_TAR	0x32f	/* Target Address Register */
 #define SPRN_LPCR	0x13E	/* LPAR Control Register */
 #define   LPCR_VPM0	(1ul << (63-0))
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 0/2] powerpc: HFSCR enablement for POWER8
From: Michael Neuling @ 2013-03-06  3:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Michael Neuling, linuxppc-dev

Benh, 

This small series adds support for the HFSCR (Hypervisor Facility Status &
Control Register) in POWER8.  It just sets the bits we know about at this
stage.  This is useful only when MSR HV=1.

The HFSCR is the same as the FSCR except it's for hypervisors.  It controls the
available of various facilities in OS and userspace levels.  It also indicates
the cause of a hypervisor facility unavailable interrupt (although we are not
using this here).

v2:
  Make description of the HFSCR more verbose as suggested by benh.

Mikey

^ permalink raw reply

* Re: [PATCH] powerpc/powernv: Fix next available MSI IRQ
From: Michael Ellerman @ 2013-03-06  3:24 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev
In-Reply-To: <1362466756-16113-1-git-send-email-shangw@linux.vnet.ibm.com>

On Tue, Mar 05, 2013 at 02:59:16PM +0800, Gavin Shan wrote:
> The allocation of MSI is implemented based on bitmap and working
> like the mechanism of strict round through the traced next available
> cursor. However, the next available MSI is never updated in current
> implementation. The patch fixes the issue.
> 
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/pci.c |    5 +++++
>  1 files changed, 5 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 6f464dc..9cf18c4 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -66,6 +66,11 @@ static unsigned int pnv_get_one_msi(struct pnv_phb *phb)
>  		rc = 0;
>  		goto out;
>  	}
> +
> +	if (id >= phb->msi_count - 1)
> +		phb->msi_next = 0;
> +	else
> +		phb->msi_next = id + 1;
>  	__set_bit(id, phb->msi_map);


There is code in arch/powerpc/sysdev/msi_bitmap.c that implements a
bitmap allocator for MSI. It may not do what you need but please take a
look at it if you haven't already.

cheers

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox