linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] 64K page support for kexec
@ 2007-04-24 18:31 Luke Browning
  2007-04-24 19:43 ` Olof Johansson
  2007-04-24 22:48 ` [PATCH] " Benjamin Herrenschmidt
  0 siblings, 2 replies; 28+ messages in thread
From: Luke Browning @ 2007-04-24 18:31 UTC (permalink / raw)
  To: Arnd Bergmann, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, cbe-oss-dev

This patch fixes a couple of kexec problems related to 64K page 
support in the kernel.  kexec issues a tlbie for each pte.  The 
parameters for the tlbie are the page size and the virtual address.
Support was missing for the computation of these two parameters
for 64K pages.  This patch adds that support.  

Patch is updated from previous version to address Ben's comments
and to make it easier to add 16G page support in the future.

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>

Index: linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.21-rc4.orig/arch/powerpc/mm/hash_native_64.c
+++ linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
@@ -340,31 +340,77 @@ static void native_hpte_invalidate(unsig
 	local_irq_restore(flags);
 }
 
-/*
- * XXX This need fixing based on page size. It's only used by
- * native_hpte_clear() for now which needs fixing too so they
- * make a good pair...
- */
-static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
-
-	va = avpn << 23;
-
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
-
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((((1 << LP_BITS) - 1) >> (i)) << LP_SHIFT)
+
+static void hpte_decode(hpte_t *hpte, unsigned long slot, 
+			int *psize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc, avpnm_bits;
+		
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+#if 0
+	else if (hpte_v & 0x4000000000000000UL)
+		size = MMU_PAGE_16G;
+#endif
+	else if (!(hpte_r & LP_MASK(0)))
+		size = MMU_PAGE_16M;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = MMU_PAGE_64K; size < MMU_PAGE_16M; size++) {
+			if (!mmu_psize_defs[size].shift)
+				continue;
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+	/*
+	 * FIXME, this could be made more efficient by storing the type 
+	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
+	 * the number of bits in the va representing the offset in the
+	 * page is less than 23. This affects the hash algorithm that is
+	 * used. When 16G pages are supported, a new hash algorithm
+	 * needs to be provided.  See POWER ISA Book III.
+	 *
+	 * The code below works for 16M, 64K, and 4K pages.
+	 */
+	shift = mmu_psize_defs[size].shift;
+	if (mmu_psize_defs[size].avpnm)
+		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
+	else
+		avpnm_bits = 0;
+	if (shift - avpnm_bits <= 23) {
+		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
+
+		if (shift < 23) {
+			unsigned long vpi, pteg;
+
+			pteg = slot / HPTES_PER_GROUP;
+			if (hpte_v & HPTE_V_SECONDARY)
+				pteg = ~pteg;
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			avpn |= (vpi << mmu_psize_defs[size].shift);
+		}
+	}
+#if 0
+	/* 16GB page hash, p > 23 */
+	else {
 
-		va |= vpi << PAGE_SHIFT;
 	}
+#endif
 
-	return va;
+	*va = avpn;
+	*psize = size;
 }
 
 /*
@@ -374,8 +420,6 @@ static unsigned long slot2va(unsigned lo
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
- *
- * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
@@ -383,6 +427,7 @@ static void native_hpte_clear(void)
 	hpte_t *hptep = htab_address;
 	unsigned long hpte_v;
 	unsigned long pteg_count;
+	int psize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -408,8 +453,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &hpte_v);
 			hptep->v = 0;
-			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
+			__tlbie(hpte_v, psize);
 		}
 	}
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 18:31 [PATCH] 64K page support for kexec Luke Browning
@ 2007-04-24 19:43 ` Olof Johansson
  2007-04-24 22:50   ` Benjamin Herrenschmidt
  2007-04-24 22:48 ` [PATCH] " Benjamin Herrenschmidt
  1 sibling, 1 reply; 28+ messages in thread
From: Olof Johansson @ 2007-04-24 19:43 UTC (permalink / raw)
  To: Luke Browning; +Cc: Paul Mackerras, cbe-oss-dev, Arnd Bergmann, linuxppc-dev

Hi,

Didn't see this posted before, but see some comments below.


Thanks,

Olof
On Tue, Apr 24, 2007 at 03:31:53PM -0300, Luke Browning wrote:

> +#define LP_SHIFT	12
> +#define LP_BITS		8
> +#define LP_MASK(i)	((((1 << LP_BITS) - 1) >> (i)) << LP_SHIFT)

Seems to me that something like:
#define LP_MASK(i)	((0xff >> (i)) << LP_SHIFT)

is considerably easier to read.

> +static void hpte_decode(hpte_t *hpte, unsigned long slot, 
> +			int *psize, unsigned long *va)
> +{
> +	unsigned long hpte_r = hpte->r;
> +	unsigned long hpte_v = hpte->v;
> +	unsigned long avpn;
> +	int i, size, shift, penc, avpnm_bits;
> +		
> +	if (!(hpte_v & HPTE_V_LARGE))
> +		size = MMU_PAGE_4K;
> +#if 0
> +	else if (hpte_v & 0x4000000000000000UL)
> +		size = MMU_PAGE_16G;
> +#endif

No ifdefs please. Just take it out if it's not needed.

> +	else if (!(hpte_r & LP_MASK(0)))
> +		size = MMU_PAGE_16M;
> +	else {
> +		for (i = 0; i < LP_BITS; i++) {
> +			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
> +				break;
> +		}
> +		penc = LP_MASK(i+1) >> LP_SHIFT;

No need for braces. Also, why iterate from 0 to LP_BITS if you're doing i+1 everywhere?

> +		for (size = MMU_PAGE_64K; size < MMU_PAGE_16M; size++) {

This assumes that the page sizes are ordered. Why not just iterate from
0 to MMU_PAGE_COUNT?

> +			if (!mmu_psize_defs[size].shift)
> +				continue;

A comment to the effect of "unused entries have a shift value of 0" could be
useful.

> +			if (penc == mmu_psize_defs[size].penc)
> +				break;
> +		}
> +	}
>  
> -		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
> +	/*
> +	 * FIXME, this could be made more efficient by storing the type 
> +	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
> +	 * the number of bits in the va representing the offset in the
> +	 * page is less than 23. This affects the hash algorithm that is
> +	 * used. When 16G pages are supported, a new hash algorithm
> +	 * needs to be provided.  See POWER ISA Book III.
> +	 *
> +	 * The code below works for 16M, 64K, and 4K pages.
> +	 */

A BUG_ON() when other sizes are hit could be a good idea?

> +	shift = mmu_psize_defs[size].shift;
> +	if (mmu_psize_defs[size].avpnm)
> +		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
> +	else
> +		avpnm_bits = 0;
> +	if (shift - avpnm_bits <= 23) {
> +		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
> +
> +		if (shift < 23) {
> +			unsigned long vpi, pteg;
> +
> +			pteg = slot / HPTES_PER_GROUP;
> +			if (hpte_v & HPTE_V_SECONDARY)
> +				pteg = ~pteg;
> +			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
> +			avpn |= (vpi << mmu_psize_defs[size].shift);
> +		}
> +	}
> +#if 0
> +	/* 16GB page hash, p > 23 */
> +	else {

Same thing here w.r.t. ifdefs

>  
> -		va |= vpi << PAGE_SHIFT;
>  	}
> +#endif
>  
> -	return va;
> +	*va = avpn;
> +	*psize = size;
>  }
>  
>  /*
> @@ -374,8 +420,6 @@ static unsigned long slot2va(unsigned lo
>   *
>   * TODO: add batching support when enabled.  remember, no dynamic memory here,
>   * athough there is the control page available...
> - *
> - * XXX FIXME: 4k only for now !
>   */
>  static void native_hpte_clear(void)
>  {
> @@ -383,6 +427,7 @@ static void native_hpte_clear(void)
>  	hpte_t *hptep = htab_address;
>  	unsigned long hpte_v;
>  	unsigned long pteg_count;
> +	int psize;
>  
>  	pteg_count = htab_hash_mask + 1;
>  
> @@ -408,8 +453,9 @@ static void native_hpte_clear(void)
>  		 * already hold the native_tlbie_lock.
>  		 */
>  		if (hpte_v & HPTE_V_VALID) {
> +			hpte_decode(hptep, slot, &psize, &hpte_v);
>  			hptep->v = 0;
> -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> +			__tlbie(hpte_v, psize);

Using hpte_v as variable name is a bit misleading.  avpn or va would be
a better variable name.


-Olof

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 18:31 [PATCH] 64K page support for kexec Luke Browning
  2007-04-24 19:43 ` Olof Johansson
@ 2007-04-24 22:48 ` Benjamin Herrenschmidt
  2007-04-25 13:06   ` Luke Browning
  1 sibling, 1 reply; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-24 22:48 UTC (permalink / raw)
  To: Luke Browning; +Cc: linuxppc-dev, Paul Mackerras, cbe-oss-dev, Arnd Bergmann

Getting better :-)

Sorry for the constant nagging, let's say I'm a bit perfectionist...

> -static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
> -{
> -	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
> -	unsigned long va;
> -
> -	va = avpn << 23;
> -
> -	if (! (hpte_v & HPTE_V_LARGE)) {
> -		unsigned long vpi, pteg;
> -
> -		pteg = slot / HPTES_PER_GROUP;
> -		if (hpte_v & HPTE_V_SECONDARY)
> -			pteg = ~pteg;

Hrm... hpte_decode ends up being a pretty big function... I suppose
that's ok.

> +#define LP_SHIFT	12
> +#define LP_BITS		8
> +#define LP_MASK(i)	((((1 << LP_BITS) - 1) >> (i)) << LP_SHIFT)
> +
> +static void hpte_decode(hpte_t *hpte, unsigned long slot, 
> +			int *psize, unsigned long *va)
> +{
> +	unsigned long hpte_r = hpte->r;
> +	unsigned long hpte_v = hpte->v;
> +	unsigned long avpn;
> +	int i, size, shift, penc, avpnm_bits;
> +		
> +	if (!(hpte_v & HPTE_V_LARGE))
> +		size = MMU_PAGE_4K;
> +#if 0
> +	else if (hpte_v & 0x4000000000000000UL)
> +		size = MMU_PAGE_16G;
> +#endif

Remove the above. I don't think it's right anyway. We'll deal with 16G
pages when we start using them.

> +	else if (!(hpte_r & LP_MASK(0)))
> +		size = MMU_PAGE_16M;

Is that correct ? (The above). I haven't quite noticed in the previous
instances of the patch, sorry about that but I don't think that's the
way to detect the "old style" 16M pages... I -suspect- that the normal
algorithm will work for them, that is, they'll have a penc of 0 which
will match what's in the mmu_psize_defs[MMU_PAGE_16M] but it's worth
actually testing it.

Now that I think about it, it's possible that I lead you on the wrong
track there initially... Sorry about that.

I think your above code will treat anything with a penc of 0 as a 16M
page, which might be true with current implementations, is also, I
think, not mandated by the arch, is it ? (I don't have my 2.03 at hand
as I'm writing this email).

> +	else {
> +		for (i = 0; i < LP_BITS; i++) {
> +			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
> +				break;
> +		}
> +		penc = LP_MASK(i+1) >> LP_SHIFT;
> +		for (size = MMU_PAGE_64K; size < MMU_PAGE_16M; size++) {
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +			if (penc == mmu_psize_defs[size].penc)
> +				break;
> +		}
> +	}
>  
> -		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
> +	/*
> +	 * FIXME, this could be made more efficient by storing the type 
> +	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
> +	 * the number of bits in the va representing the offset in the
> +	 * page is less than 23. This affects the hash algorithm that is
> +	 * used. When 16G pages are supported, a new hash algorithm
> +	 * needs to be provided.  See POWER ISA Book III.
> +	 *
> +	 * The code below works for 16M, 64K, and 4K pages.
> +	 */

I'm not 100% certain about your comment. The by type of hash algorithm
you mean the segment size right ? This is not directly related to the
page size. While 1T segments are mandatory for 16G pages, they can also
hold normal page sizes... If we're going to implement support for 1T
segment, we should get the segment size (and thus the hash algorithm)
from the B bit of the PTE.

In fact, when doing 1T segments, we'll have to deal with them regardless
of the page size (the hashing will be different for all page sizes).

> +	shift = mmu_psize_defs[size].shift;
> +	if (mmu_psize_defs[size].avpnm)
> +		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
> +	else
> +		avpnm_bits = 0;
> +	if (shift - avpnm_bits <= 23) {
> +		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
> +
> +		if (shift < 23) {
> +			unsigned long vpi, pteg;
> +
> +			pteg = slot / HPTES_PER_GROUP;
> +			if (hpte_v & HPTE_V_SECONDARY)
> +				pteg = ~pteg;
> +			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
> +			avpn |= (vpi << mmu_psize_defs[size].shift);
> +		}
> +	}
> +#if 0
> +	/* 16GB page hash, p > 23 */
> +	else {
>  
> -		va |= vpi << PAGE_SHIFT;
>  	}
> +#endif

Just don't keep the code in #if 0, just a comment about something
needing to be done for 16G ...

> -	return va;
> +	*va = avpn;
> +	*psize = size;
>  }
>  
>  /*
> @@ -374,8 +420,6 @@ static unsigned long slot2va(unsigned lo
>   *
>   * TODO: add batching support when enabled.  remember, no dynamic memory here,
>   * athough there is the control page available...
> - *
> - * XXX FIXME: 4k only for now !
>   */
>  static void native_hpte_clear(void)
>  {
> @@ -383,6 +427,7 @@ static void native_hpte_clear(void)
>  	hpte_t *hptep = htab_address;
>  	unsigned long hpte_v;
>  	unsigned long pteg_count;
> +	int psize;
>  
>  	pteg_count = htab_hash_mask + 1;
>  
> @@ -408,8 +453,9 @@ static void native_hpte_clear(void)
>  		 * already hold the native_tlbie_lock.
>  		 */
>  		if (hpte_v & HPTE_V_VALID) {
> +			hpte_decode(hptep, slot, &psize, &hpte_v);
>  			hptep->v = 0;
> -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> +			__tlbie(hpte_v, psize);
>  		}
>  	}
>  
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 19:43 ` Olof Johansson
@ 2007-04-24 22:50   ` Benjamin Herrenschmidt
  2007-04-24 23:07     ` Olof Johansson
  2007-04-25 19:35     ` [PATCH v2] powerpc: " Luke Browning
  0 siblings, 2 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-24 22:50 UTC (permalink / raw)
  To: Olof Johansson; +Cc: Paul Mackerras, cbe-oss-dev, Arnd Bergmann, linuxppc-dev


> This assumes that the page sizes are ordered. Why not just iterate from
> 0 to MMU_PAGE_COUNT?

You don't want to hit MMU_PAGE_4K which is guaranteed to be 0 ... maybe
just having an if (size == MMU_PAGE_4K) continue; statement in the loop
would be better ?

> > +			if (!mmu_psize_defs[size].shift)
> > +				continue;
> 
> A comment to the effect of "unused entries have a shift value of 0" could be
> useful.
> 
> > +			if (penc == mmu_psize_defs[size].penc)
> > +				break;
> > +		}
> > +	}
> >  
> > -		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
> > +	/*
> > +	 * FIXME, this could be made more efficient by storing the type 
> > +	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
> > +	 * the number of bits in the va representing the offset in the
> > +	 * page is less than 23. This affects the hash algorithm that is
> > +	 * used. When 16G pages are supported, a new hash algorithm
> > +	 * needs to be provided.  See POWER ISA Book III.
> > +	 *
> > +	 * The code below works for 16M, 64K, and 4K pages.
> > +	 */
> 
> A BUG_ON() when other sizes are hit could be a good idea?

a BUG_ON if the B bit is set would be useful too. (that is 1T segment
HPTE).

> > +	shift = mmu_psize_defs[size].shift;
> > +	if (mmu_psize_defs[size].avpnm)
> > +		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
> > +	else
> > +		avpnm_bits = 0;
> > +	if (shift - avpnm_bits <= 23) {
> > +		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
> > +
> > +		if (shift < 23) {
> > +			unsigned long vpi, pteg;
> > +
> > +			pteg = slot / HPTES_PER_GROUP;
> > +			if (hpte_v & HPTE_V_SECONDARY)
> > +				pteg = ~pteg;
> > +			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
> > +			avpn |= (vpi << mmu_psize_defs[size].shift);
> > +		}
> > +	}
> > +#if 0
> > +	/* 16GB page hash, p > 23 */
> > +	else {
> 
> Same thing here w.r.t. ifdefs
> 
> >  
> > -		va |= vpi << PAGE_SHIFT;
> >  	}
> > +#endif
> >  
> > -	return va;
> > +	*va = avpn;
> > +	*psize = size;
> >  }
> >  
> >  /*
> > @@ -374,8 +420,6 @@ static unsigned long slot2va(unsigned lo
> >   *
> >   * TODO: add batching support when enabled.  remember, no dynamic memory here,
> >   * athough there is the control page available...
> > - *
> > - * XXX FIXME: 4k only for now !
> >   */
> >  static void native_hpte_clear(void)
> >  {
> > @@ -383,6 +427,7 @@ static void native_hpte_clear(void)
> >  	hpte_t *hptep = htab_address;
> >  	unsigned long hpte_v;
> >  	unsigned long pteg_count;
> > +	int psize;
> >  
> >  	pteg_count = htab_hash_mask + 1;
> >  
> > @@ -408,8 +453,9 @@ static void native_hpte_clear(void)
> >  		 * already hold the native_tlbie_lock.
> >  		 */
> >  		if (hpte_v & HPTE_V_VALID) {
> > +			hpte_decode(hptep, slot, &psize, &hpte_v);
> >  			hptep->v = 0;
> > -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> > +			__tlbie(hpte_v, psize);
> 
> Using hpte_v as variable name is a bit misleading.  avpn or va would be
> a better variable name.

No. The variable doesn't contain only the va, it contains the whole "V"
part of the HPTE which includes other things like the valid bit.

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 22:50   ` Benjamin Herrenschmidt
@ 2007-04-24 23:07     ` Olof Johansson
  2007-04-25  5:48       ` Milton Miller
  2007-04-25 19:35     ` [PATCH v2] powerpc: " Luke Browning
  1 sibling, 1 reply; 28+ messages in thread
From: Olof Johansson @ 2007-04-24 23:07 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Paul Mackerras, cbe-oss-dev, Arnd Bergmann, linuxppc-dev

On Wed, Apr 25, 2007 at 08:50:14AM +1000, Benjamin Herrenschmidt wrote:
> 
> > This assumes that the page sizes are ordered. Why not just iterate from
> > 0 to MMU_PAGE_COUNT?
> 
> You don't want to hit MMU_PAGE_4K which is guaranteed to be 0 ... maybe
> just having an if (size == MMU_PAGE_4K) continue; statement in the loop
> would be better ?

Either that, or from 1 to MMU_PAGE_COUNT (with a comment as to why we're skipping
0).

> > > +	/*
> > > +	 * FIXME, this could be made more efficient by storing the type 
> > > +	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
> > > +	 * the number of bits in the va representing the offset in the
> > > +	 * page is less than 23. This affects the hash algorithm that is
> > > +	 * used. When 16G pages are supported, a new hash algorithm
> > > +	 * needs to be provided.  See POWER ISA Book III.
> > > +	 *
> > > +	 * The code below works for 16M, 64K, and 4K pages.
> > > +	 */
> > 
> > A BUG_ON() when other sizes are hit could be a good idea?
> 
> a BUG_ON if the B bit is set would be useful too. (that is 1T segment
> HPTE).

Yep.

> > > @@ -408,8 +453,9 @@ static void native_hpte_clear(void)
> > >  		 * already hold the native_tlbie_lock.
> > >  		 */
> > >  		if (hpte_v & HPTE_V_VALID) {
> > > +			hpte_decode(hptep, slot, &psize, &hpte_v);
> > >  			hptep->v = 0;
> > > -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> > > +			__tlbie(hpte_v, psize);
> > 
> > Using hpte_v as variable name is a bit misleading.  avpn or va would be
> > a better variable name.
> 
> No. The variable doesn't contain only the va, it contains the whole "V"
> part of the HPTE which includes other things like the valid bit.

Ok. The previous usage was confusing me (given that they did a translation
from hpte_v to va, or at least that's what the function name implies).


-Olof

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 23:07     ` Olof Johansson
@ 2007-04-25  5:48       ` Milton Miller
  0 siblings, 0 replies; 28+ messages in thread
From: Milton Miller @ 2007-04-25  5:48 UTC (permalink / raw)
  To: Olof Johansson, Luke Browning, Benjamin Herrenschmidt; +Cc: ppcdev, cbe-oss-dev

>
>
> > > > + /*
> > > > +  * FIXME, this could be made more efficient by storing the type
> > > > +  * of hash algorithm in mmu_psize_defs[].  The code below 
> assumes
> > > > +  * the number of bits in the va representing the offset in the
> > > > +  * page is less than 23. This affects the hash algorithm that 
> is
> > > > +  * used. When 16G pages are supported, a new hash algorithm
> > > > +  * needs to be provided.  See POWER ISA Book III.
> > > > +  *
> > > > +  * The code below works for 16M, 64K, and 4K pages.
> > > > +  */
> > >
> > > A BUG_ON() when other sizes are hit could be a good idea?
> >
> > a BUG_ON if the B bit is set would be useful too. (that is 1T segment
> > HPTE).
>
> Yep.


NNNNOOOOO!!!!


Do NOT add any BUG() in this code!


Look at the context:

(1) We are tearing down *ALL* mappings.   That includes the
kernel linear mapping and the mapping of the kernel text.

(2) We are in real mode.   There is no way back to virtural
mode.  See (1).

(3) We have put the new kernel in memory.  There is no data
that was not a part of the static data or bss sections.   There
are no per-cpu variables.  Nothing with vmalloc.   Nothing with
kmalloc.  Nothing with alloc_pages.

(4) If this is the panic kernel case, we are already crashed
and trying to get into a new environment to dump memory.
Intentionally failing that is the last thing you want to do.

(5) This hook is only used by kexec_sequence, called by
machine_kexec.   At the beginning of that function there
is the comment "its too late to fail here."



Bottom line: Don't even think of causing any trap here, just
execute as best as you know how.   You don't have any way to
tell the user you failed anyways.


milton

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-24 22:48 ` [PATCH] " Benjamin Herrenschmidt
@ 2007-04-25 13:06   ` Luke Browning
  2007-04-25 22:11     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-04-25 13:06 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev, Paul Mackerras, cbe-oss-dev, Arnd Bergmann

On Wed, 2007-04-25 at 08:48 +1000, Benjamin Herrenschmidt wrote:
> Getting better :-)
> 
> Sorry for the constant nagging, let's say I'm a bit perfectionist...
> 
so am I.  That is why I redid the patch instead of going with the
previous version.

> > -static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
> > -{
> > -	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
> > -	unsigned long va;
> > -
> > -	va = avpn << 23;
> > -
> > -	if (! (hpte_v & HPTE_V_LARGE)) {
> > -		unsigned long vpi, pteg;
> > -
> > -		pteg = slot / HPTES_PER_GROUP;
> > -		if (hpte_v & HPTE_V_SECONDARY)
> > -			pteg = ~pteg;
> 
> Hrm... hpte_decode ends up being a pretty big function... I suppose
> that's ok.
> 
It is bigger because it combines two functions.  I think it is better
this way as it will make it easier to add support for 16G pages and 1T
segments in the future. 

> > +#define LP_SHIFT	12
> > +#define LP_BITS		8
> > +#define LP_MASK(i)	((((1 << LP_BITS) - 1) >> (i)) << LP_SHIFT)
> > +
> > +static void hpte_decode(hpte_t *hpte, unsigned long slot, 
> > +			int *psize, unsigned long *va)
> > +{
> > +	unsigned long hpte_r = hpte->r;
> > +	unsigned long hpte_v = hpte->v;
> > +	unsigned long avpn;
> > +	int i, size, shift, penc, avpnm_bits;
> > +		
> > +	if (!(hpte_v & HPTE_V_LARGE))
> > +		size = MMU_PAGE_4K;
> > +#if 0
> > +	else if (hpte_v & 0x4000000000000000UL)
> > +		size = MMU_PAGE_16G;
> > +#endif
> 
> Remove the above. I don't think it's right anyway. We'll deal with 16G
> pages when we start using them.

OK.  I put it there mainly as a place holder.  How many page sizes do
you expect to support in a 1TB segment? 

> 
> > +	else if (!(hpte_r & LP_MASK(0)))
> > +		size = MMU_PAGE_16M;
> 
> Is that correct ? (The above). 
> I haven't quite noticed in the previous
> instances of the patch, sorry about that but I don't think that's the
> way to detect the "old style" 16M pages... I -suspect- that the normal
> algorithm will work for them, that is, they'll have a penc of 0 which
> will match what's in the mmu_psize_defs[MMU_PAGE_16M] but it's worth
> actually testing it.

Ok.   

> 
> Now that I think about it, it's possible that I lead you on the wrong
> track there initially... Sorry about that.
> 
> I think your above code will treat anything with a penc of 0 as a 16M
> page, which might be true with current implementations, is also, I
> think, not mandated by the arch, is it ? (I don't have my 2.03 at hand
> as I'm writing this email).

right.  The encoding is implementation dependent.  There is a minimum
size requirement that comes into play, but that works the other way.
penc of zero has a minimum page size requirement of 8K.

I will remove the test and use the array for the comparison.

> 
> > +	else {
> > +		for (i = 0; i < LP_BITS; i++) {
> > +			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
> > +				break;
> > +		}
> > +		penc = LP_MASK(i+1) >> LP_SHIFT;
> > +		for (size = MMU_PAGE_64K; size < MMU_PAGE_16M; size++) {
> > +			if (!mmu_psize_defs[size].shift)
> > +				continue;
> > +			if (penc == mmu_psize_defs[size].penc)
> > +				break;
> > +		}
> > +	}
> >  
> > -		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
> > +	/*
> > +	 * FIXME, this could be made more efficient by storing the type 
> > +	 * of hash algorithm in mmu_psize_defs[].  The code below assumes 
> > +	 * the number of bits in the va representing the offset in the
> > +	 * page is less than 23. This affects the hash algorithm that is
> > +	 * used. When 16G pages are supported, a new hash algorithm
> > +	 * needs to be provided.  See POWER ISA Book III.
> > +	 *
> > +	 * The code below works for 16M, 64K, and 4K pages.
> > +	 */
> 
> I'm not 100% certain about your comment. The by type of hash algorithm
> you mean the segment size right ? This is not directly related to the
> page size. While 1T segments are mandatory for 16G pages, they can also
> hold normal page sizes... If we're going to implement support for 1T
> segment, we should get the segment size (and thus the hash algorithm)
> from the B bit of the PTE.

yes. what you said is correct.  I was thinking that you might want to
double the number of array elements so that you could apply a different
hash algorithm based on the type of segments. 

> 
> In fact, when doing 1T segments, we'll have to deal with them regardless
> of the page size (the hashing will be different for all page sizes).
> 
> > +	shift = mmu_psize_defs[size].shift;
> > +	if (mmu_psize_defs[size].avpnm)
> > +		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
> > +	else
> > +		avpnm_bits = 0;
> > +	if (shift - avpnm_bits <= 23) {
> > +		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
> > +
> > +		if (shift < 23) {
> > +			unsigned long vpi, pteg;
> > +
> > +			pteg = slot / HPTES_PER_GROUP;
> > +			if (hpte_v & HPTE_V_SECONDARY)
> > +				pteg = ~pteg;
> > +			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
> > +			avpn |= (vpi << mmu_psize_defs[size].shift);
> > +		}
> > +	}
> > +#if 0
> > +	/* 16GB page hash, p > 23 */
> > +	else {
> >  
> > -		va |= vpi << PAGE_SHIFT;
> >  	}
> > +#endif
> 
> Just don't keep the code in #if 0, just a comment about something
> needing to be done for 16G ...
> 
ok.

> > -	return va;
> > +	*va = avpn;
> > +	*psize = size;
> >  }
> >  
> >  /*
> > @@ -374,8 +420,6 @@ static unsigned long slot2va(unsigned lo
> >   *
> >   * TODO: add batching support when enabled.  remember, no dynamic memory here,
> >   * athough there is the control page available...
> > - *
> > - * XXX FIXME: 4k only for now !
> >   */
> >  static void native_hpte_clear(void)
> >  {
> > @@ -383,6 +427,7 @@ static void native_hpte_clear(void)
> >  	hpte_t *hptep = htab_address;
> >  	unsigned long hpte_v;
> >  	unsigned long pteg_count;
> > +	int psize;
> >  
> >  	pteg_count = htab_hash_mask + 1;
> >  
> > @@ -408,8 +453,9 @@ static void native_hpte_clear(void)
> >  		 * already hold the native_tlbie_lock.
> >  		 */
> >  		if (hpte_v & HPTE_V_VALID) {
> > +			hpte_decode(hptep, slot, &psize, &hpte_v);
> >  			hptep->v = 0;
> > -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> > +			__tlbie(hpte_v, psize);
> >  		}
> >  	}
> >  
> > 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] powerpc: 64K page support for kexec
  2007-04-24 22:50   ` Benjamin Herrenschmidt
  2007-04-24 23:07     ` Olof Johansson
@ 2007-04-25 19:35     ` Luke Browning
  2007-04-25 22:19       ` Benjamin Herrenschmidt
  2007-04-26  7:15       ` [PATCH v2] " Olof Johansson
  1 sibling, 2 replies; 28+ messages in thread
From: Luke Browning @ 2007-04-25 19:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Olof Johansson, linuxppc-dev, Paul Mackerras, cbe-oss-dev,
	Arnd Bergmann


This patch fixes a couple of kexec problems related to 64K page 
support in the kernel.  kexec issues a tlbie for each pte.  The 
parameters for the tlbie are the page size and the virtual address.
Support was missing for the computation of these two parameters
for 64K pages.  This patch adds that support.  

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>

Index: linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.21-rc4.orig/arch/powerpc/mm/hash_native_64.c
+++ linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
@@ -340,31 +340,74 @@ static void native_hpte_invalidate(unsig
 	local_irq_restore(flags);
 }
 
-/*
- * XXX This need fixing based on page size. It's only used by
- * native_hpte_clear() for now which needs fixing too so they
- * make a good pair...
- */
-static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
-
-	va = avpn << 23;
-
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
+static void hpte_decode(hpte_t *hpte, unsigned long slot, 
+			int *psize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc, avpnm_bits;
+		
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = 0; size < MMU_PAGE_COUNT - 1; size++) {
 
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+			/* 4K pages are not represented by LP */
+			if (size == MMU_PAGE_4K)
+				continue;
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
 
-		va |= vpi << PAGE_SHIFT;
+	/*
+	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
+	 * fall under the p<=23 rules for calculating the virtual address.
+	 * In the case of 16M pages, an extra bit is stolen from the AVPN
+	 * field to achieve the requisite 24 bits. 
+	 *
+	 * 16G pages are not supported by the code below.
+	 */
+	BUG_ON(hpte_v & 0x4000000000000000UL);		/* 1T segment */
+	BUG_ON(size == MMU_PAGE_16G);
+	BUG_ON(size == MMU_PAGE_64K_AP);
+
+	shift = mmu_psize_defs[size].shift;
+	if (mmu_psize_defs[size].avpnm)
+		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
+	else
+		avpnm_bits = 0;
+	if (shift - avpnm_bits <= 23) {
+		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
+
+		if (shift < 23) {
+			unsigned long vpi, pteg;
+
+			pteg = slot / HPTES_PER_GROUP;
+			if (hpte_v & HPTE_V_SECONDARY)
+				pteg = ~pteg;
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			avpn |= (vpi << mmu_psize_defs[size].shift);
+		}
 	}
 
-	return va;
+	*va = avpn;
+	*psize = size;
 }
 
 /*
@@ -374,15 +417,14 @@ static unsigned long slot2va(unsigned lo
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
- *
- * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
 	unsigned long slot, slots, flags;
 	hpte_t *hptep = htab_address;
-	unsigned long hpte_v;
+	unsigned long hpte_v, va;
 	unsigned long pteg_count;
+	int psize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -408,8 +450,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &va);
 			hptep->v = 0;
-			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
+			__tlbie(va, psize);
 		}
 	}
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] 64K page support for kexec
  2007-04-25 13:06   ` Luke Browning
@ 2007-04-25 22:11     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-25 22:11 UTC (permalink / raw)
  To: Luke Browning; +Cc: linuxppc-dev, Paul Mackerras, cbe-oss-dev, Arnd Bergmann


> OK.  I put it there mainly as a place holder.  How many page sizes do
> you expect to support in a 1TB segment? 

Not quite sure yet.. at least 16M and possibly 4K and 64K.

 .../...

> right.  The encoding is implementation dependent.  There is a minimum
> size requirement that comes into play, but that works the other way.
> penc of zero has a minimum page size requirement of 8K.
> 
> I will remove the test and use the array for the comparison.

Ok.

> yes. what you said is correct.  I was thinking that you might want to
> double the number of array elements so that you could apply a different
> hash algorithm based on the type of segments. 

The way our WIP patches for 1T segments do is we have separate psize and
ssize at every level where it matters. Anyway, let's leave that alone
for now. It will be easy enough to fixup.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] powerpc: 64K page support for kexec
  2007-04-25 19:35     ` [PATCH v2] powerpc: " Luke Browning
@ 2007-04-25 22:19       ` Benjamin Herrenschmidt
  2007-04-26 15:28         ` Luke Browning
  2007-04-26  7:15       ` [PATCH v2] " Olof Johansson
  1 sibling, 1 reply; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-25 22:19 UTC (permalink / raw)
  To: Luke Browning
  Cc: Olof Johansson, linuxppc-dev, Paul Mackerras, cbe-oss-dev,
	Arnd Bergmann

On Wed, 2007-04-25 at 16:35 -0300, Luke Browning wrote:
> This patch fixes a couple of kexec problems related to 64K page 
> support in the kernel.  kexec issues a tlbie for each pte.  The 
> parameters for the tlbie are the page size and the virtual address.
> Support was missing for the computation of these two parameters
> for 64K pages.  This patch adds that support.  
> 
> Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>

Quick look: looks good to me. I suppose you verified it works well
too :-)

(Have you added some debug to check we get the 16M case right ?)

Note that Milton is against using BUG_ON's in here since that code is
used for crash dumps.

Appart from that,

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] powerpc: 64K page support for kexec
  2007-04-25 19:35     ` [PATCH v2] powerpc: " Luke Browning
  2007-04-25 22:19       ` Benjamin Herrenschmidt
@ 2007-04-26  7:15       ` Olof Johansson
  1 sibling, 0 replies; 28+ messages in thread
From: Olof Johansson @ 2007-04-26  7:15 UTC (permalink / raw)
  To: Luke Browning; +Cc: Paul Mackerras, cbe-oss-dev, Arnd Bergmann, linuxppc-dev

Hi,
                                                                                                                                             
Two comments below, besides that it looks good!
                                                                                                                                             
On Wed, Apr 25, 2007 at 04:35:39PM -0300, Luke Browning wrote:

> +#define LP_SHIFT	12
> +#define LP_BITS		8
> +#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
> +
> +static void hpte_decode(hpte_t *hpte, unsigned long slot, 
> +			int *psize, unsigned long *va)
> +{
> +	unsigned long hpte_r = hpte->r;
> +	unsigned long hpte_v = hpte->v;
> +	unsigned long avpn;
> +	int i, size, shift, penc, avpnm_bits;
> +		
> +	if (!(hpte_v & HPTE_V_LARGE))
> +		size = MMU_PAGE_4K;
> +	else {
> +		for (i = 0; i < LP_BITS; i++) {
> +			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
> +				break;
> +		}
> +		penc = LP_MASK(i+1) >> LP_SHIFT;
> +		for (size = 0; size < MMU_PAGE_COUNT - 1; size++) {

This will never consider the last page size. Either <= MMU_PAGE_COUNT-1
or < MMU_PAGE_COUNT. The latter would be more natural.

> -		pteg = slot / HPTES_PER_GROUP;
> -		if (hpte_v & HPTE_V_SECONDARY)
> -			pteg = ~pteg;
> +			/* 4K pages are not represented by LP */
> +			if (size == MMU_PAGE_4K)
> +				continue;
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
>  
> -		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
> +			if (penc == mmu_psize_defs[size].penc)
> +				break;
> +		}
> +	}
>  
> -		va |= vpi << PAGE_SHIFT;
> +	/*
> +	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
> +	 * fall under the p<=23 rules for calculating the virtual address.
> +	 * In the case of 16M pages, an extra bit is stolen from the AVPN
> +	 * field to achieve the requisite 24 bits. 
> +	 *
> +	 * 16G pages are not supported by the code below.
> +	 */
> +	BUG_ON(hpte_v & 0x4000000000000000UL);		/* 1T segment */
> +	BUG_ON(size == MMU_PAGE_16G);
> +	BUG_ON(size == MMU_PAGE_64K_AP);

Milton didn't like BUG_ON() here, and I think I agree after hearing his
motivations. I know I was the one suggesting them, but they're better
to keep out at the moment.

[...]



Thanks,

-Olof

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] powerpc: 64K page support for kexec
  2007-04-25 22:19       ` Benjamin Herrenschmidt
@ 2007-04-26 15:28         ` Luke Browning
  2007-04-27  4:36           ` [PATCH v3] " Milton Miller
  0 siblings, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-04-26 15:28 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Olof Johansson, linuxppc-dev, Paul Mackerras, cbe-oss-dev,
	Arnd Bergmann

On Thu, 2007-04-26 at 08:19 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2007-04-25 at 16:35 -0300, Luke Browning wrote:
> > This patch fixes a couple of kexec problems related to 64K page 
> > support in the kernel.  kexec issues a tlbie for each pte.  The 
> > parameters for the tlbie are the page size and the virtual address.
> > Support was missing for the computation of these two parameters
> > for 64K pages.  This patch adds that support.  
> > 
> > Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
> 
> Quick look: looks good to me. I suppose you verified it works well
> too :-)

yes.  but only on cell.  

> 
> (Have you added some debug to check we get the 16M case right ?)
> 
> Note that Milton is against using BUG_ON's in here since that code is
> used for crash dumps.

I would prefer to leave BUG_ON()s in the code as they work in many
cases.  It depends on how far you get in the algorithm.  I added
BUG_ON(size == 16M) which is hit after a hundred entries or so have been
processed.  See output below.  I also put a BUG_ON() at the end of the
table scan but no output was presented so there are limitations, but I
don't believe that there is a downside.  The BUG_ON() at the end of the
sequence presented the original symptom so there is no difference from a
user perspective when the algorithm was completely broken.  During the
development of this feature, we encountered a lot of false hits though
as the system continued and experienced a bunch of false symptoms. This
is worse, as it is better to have the system fail in a deterministic way
than to fail in a random way.  Some of the failures that we experienced
were dma, timer, and module initialization problems.  These were all red
herrings.  Having BUG_ONs in the code allows developers to make
assertions about the code which is important when diagnosing strange
system crashes and provides a clue to future developers that they need
to add support for something.  Comments are fine, but asserts are better
in that they show up in cscope and other development tools.  So all
things considered I think it is better to include them.

Here's the 16M failure I mentioned above.

------------[ cut here ]------------
kernel BUG
at /home/luke/Desktop/code/cell/SDK3.0/Kexec2/linux-2.6.21-rc4/arch/!
cpu 0x0: Vector: 700 (Program Check) at [c000000000527bd0]
    pc: c00000000002f648: .native_hpte_clear+0x12c/0x220
    lr: c00000000002f568: .native_hpte_clear+0x4c/0x220
    sp: c000000000527e50
   msr: 9000000000021002
  current = 0xc0000000009fe860
  paca    = 0xc000000000454e80
    pid   = 1831, comm = sh
kernel BUG
at /home/luke/Desktop/code/cell/SDK3.0/Kexec2/linux-2.6.21-rc4/arch/!
enter ? for help
[c000000000527e50] 0000000000000000 .__start+0x4000000000000000/0x8
(unreliable)
[c000000000527ee0] c0000000000256b4 .kexec_sequence+0x78/0xac
[c000000000527f90] 0000000000000000 .__start+0x4000000000000000/0x8
[c000000001d13830] c00000000002ae00 .default_machine_kexec+0x1ec/0x1f0
[c000000001d138e0] c00000000002a58c .machine_kexec+0x3c/0x54
[c000000001d13950] c0000000000820a8 .crash_kexec+0x130/0x16c
[c000000001d13b30] c0000000001c0cb0 .sysrq_handle_crashdump+0x28/0x40
[c000000001d13bb0] c0000000001c087c .__handle_sysrq+0xe8/0x1c0
[c000000001d13c60] c000000000114dd8 .write_sysrq_trigger+0x7c/0xa8
[c000000001d13cf0] c0000000000c2364 .vfs_write+0xd8/0x1a4
[c000000001d13d90] c0000000000c2d2c .sys_write+0x4c/0x8c
[c000000001d13e30] c000000000008634 syscall_exit+0x0/0x40
--- Exception: c01 (System Call) at 000000000ff1a8fc
SP (f997f280) is in userspace


> Appart from that,
> 
> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> 
> Cheers,
> Ben.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3] powerpc: 64K page support for kexec
@ 2007-04-26 22:23 Luke Browning
  2007-04-26 22:32 ` Olof Johansson
  0 siblings, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-04-26 22:23 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Olof Johansson, Paul Mackerras,
	cbe-oss-dev, Arnd Bergmann, miltonm
  Cc: linuxppc-dev, cbe-oss-dev

This patch fixes a couple of kexec problems related to 64K page 
support in the kernel.  kexec issues a tlbie for each pte.  The 
parameters for the tlbie are the page size and the virtual address.
Support was missing for the computation of these two parameters
for 64K pages.  This patch adds that support.  

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Index: linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.21-rc4.orig/arch/powerpc/mm/hash_native_64.c
+++ linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
@@ -340,31 +340,74 @@ static void native_hpte_invalidate(unsig
 	local_irq_restore(flags);
 }
 
-/*
- * XXX This need fixing based on page size. It's only used by
- * native_hpte_clear() for now which needs fixing too so they
- * make a good pair...
- */
-static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
-
-	va = avpn << 23;
-
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
+static void hpte_decode(hpte_t *hpte, unsigned long slot, 
+			int *psize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc, avpnm_bits;
+		
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
 
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+			/* 4K pages are not represented by LP */
+			if (size == MMU_PAGE_4K)
+				continue;
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
 
-		va |= vpi << PAGE_SHIFT;
+	/*
+	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
+	 * fall under the p<=23 rules for calculating the virtual address.
+	 * In the case of 16M pages, an extra bit is stolen from the AVPN
+	 * field to achieve the requisite 24 bits. 
+	 *
+	 * 16G pages are not supported by the code below.
+	 */
+	BUG_ON(hpte_v & 0x4000000000000000UL);		/* 1T segment */
+	BUG_ON(size == MMU_PAGE_16G);
+	BUG_ON(size == MMU_PAGE_64K_AP);
+
+	shift = mmu_psize_defs[size].shift;
+	if (mmu_psize_defs[size].avpnm)
+		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
+	else
+		avpnm_bits = 0;
+	if (shift - avpnm_bits <= 23) {
+		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
+
+		if (shift < 23) {
+			unsigned long vpi, pteg;
+
+			pteg = slot / HPTES_PER_GROUP;
+			if (hpte_v & HPTE_V_SECONDARY)
+				pteg = ~pteg;
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			avpn |= (vpi << mmu_psize_defs[size].shift);
+		}
 	}
 
-	return va;
+	*va = avpn;
+	*psize = size;
 }
 
 /*
@@ -374,15 +417,14 @@ static unsigned long slot2va(unsigned lo
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
- *
- * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
 	unsigned long slot, slots, flags;
 	hpte_t *hptep = htab_address;
-	unsigned long hpte_v;
+	unsigned long hpte_v, va;
 	unsigned long pteg_count;
+	int psize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -408,8 +450,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &va);
 			hptep->v = 0;
-			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
+			__tlbie(va, psize);
 		}
 	}
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3] powerpc: 64K page support for kexec
  2007-04-26 22:23 [PATCH v3] powerpc: " Luke Browning
@ 2007-04-26 22:32 ` Olof Johansson
  2007-05-02 14:19   ` [PATCH v4] " Luke Browning
  0 siblings, 1 reply; 28+ messages in thread
From: Olof Johansson @ 2007-04-26 22:32 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, miltonm, linuxppc-dev, Paul Mackerras, cbe-oss-dev

On Thu, Apr 26, 2007 at 07:23:56PM -0300, Luke Browning wrote:
> This patch fixes a couple of kexec problems related to 64K page 
> support in the kernel.  kexec issues a tlbie for each pte.  The 
> parameters for the tlbie are the page size and the virtual address.
> Support was missing for the computation of these two parameters
> for 64K pages.  This patch adds that support.  
> 
> Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Acked-by: Olof Johansson <olof@lixom.net>


-Olof

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3] powerpc: 64K page support for kexec
  2007-04-26 15:28         ` Luke Browning
@ 2007-04-27  4:36           ` Milton Miller
  2007-04-27 14:42             ` Luke Browning
  2007-04-27 16:22             ` [PATCH v4] " Luke Browning
  0 siblings, 2 replies; 28+ messages in thread
From: Milton Miller @ 2007-04-27  4:36 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

On Apr 26, 2007, at 5:23 PM, Luke Browning wrote:
> This patch fixes a couple of kexec problems related to 64K page
> support in the kernel.  kexec issues a tlbie for each pte.  The
> parameters for the tlbie are the page size and the virtual address.
> Support was missing for the computation of these two parameters
> for 64K pages.  This patch adds that support.
>
> Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>

> -		va |= vpi << PAGE_SHIFT;
> +	/*
> +	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
> +	 * fall under the p<=23 rules for calculating the virtual address.
> +	 * In the case of 16M pages, an extra bit is stolen from the AVPN
> +	 * field to achieve the requisite 24 bits.
> +	 *
> +	 * 16G pages are not supported by the code below.
> +	 */
> +	BUG_ON(hpte_v & 0x4000000000000000UL);		/* 1T segment */
> +	BUG_ON(size == MMU_PAGE_16G);
> +	BUG_ON(size == MMU_PAGE_64K_AP);
> +
> +	shift = mmu_psize_defs[size].shift;
> +	if (mmu_psize_defs[size].avpnm)
> +		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
> +	else
> +		avpnm_bits = 0;
>


I have to continue to disagree on the above BUG_ONs.

>  /*
> @@ -374,15 +417,14 @@ static unsigned long slot2va(unsigned lo
>   *
>   * TODO: add batching support when enabled.  remember, no dynamic 
> memory here,
>   * athough there is the control page available...
> - *
> - * XXX FIXME: 4k only for now !
>   */
>  static void native_hpte_clear(void)
>  {
>  	unsigned long slot, slots, flags;
>  	hpte_t *hptep = htab_address;
> -	unsigned long hpte_v;
> +	unsigned long hpte_v, va;
>  	unsigned long pteg_count;
> +	int psize;
>
>  	pteg_count = htab_hash_mask + 1;
>
> @@ -408,8 +450,9 @@ static void native_hpte_clear(void)
>  		 * already hold the native_tlbie_lock.
>  		 */
>  		if (hpte_v & HPTE_V_VALID) {
> +			hpte_decode(hptep, slot, &psize, &va);
>  			hptep->v = 0;
> -			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
> +			__tlbie(va, psize);
>  		}
>  	}
>

<speculation severity=minor>
I'm guessing hpte_decode is not inlined by gcc anymore?  (I
didn't get a chance to try it).   I believed you said
hpte_decode is basically doing two things, (1) calculating
the page size, and (2) using that additional information to
calculate the va.   If we split those two functions apart,
does it change the inlining decisions?

Actually this is all optimizing an infrequent path, although
it is a path that counts as downtime.
</speculation>


Luke later wrote:
> Ben H Wrote:
>> (Have you added some debug to check we get the 16M case right ?)
>>
>> Note that Milton is against using BUG_ON's in here since that code is
>> used for crash dumps.
>
> I would prefer to leave BUG_ON()s in the code as they work in many
> cases.  It depends on how far you have get in the algorithm.  I added
> BUG_ON(size == 16M) which is hit after a hundred entries or so have 
> been
> processed.  See output below.

BUG_ON will fail as soon as you invalidate the page
containing the program_check handler, or anything leading up
to it.   Since the kernel text is normally mapped with 16M
pages, it doesn't surprise me when you BUG before unmapping
any 16M pages.

It also depends on your output device.   Your cell blade has
this nice real mode RTAS.  Go to a real serial port and try
to debug the mess (recursive faults) when the port iomap is
unmapped.

> I also put a BUG_ON() at the end of the
> table scan but no output was presented so there are limitations, but I
> don't believe that there is a downside.  The BUG_ON() at the end of the
> sequence presented the original symptom so there is no difference from 
> a
> user perspective when the algorithm was completely broken.

I have had very different experiences when stopping execution, loading
a new kernel, setting the entrypoint, and starting the new kernel.   I
usually die when starting init, but get through all of the kernel init
without incident.   This is going from a 4k kernel to 4k kernel, with
most used drivers built in. In effect, this is doing a kexec without
any hash table clear.   I've even gotten away with it when I do this
because I forgot the initramfs the first time or a driver init paniced.

> During the
> development of this feature, we encountered a lot of false hits though
> as the system continued and experienced a bunch of false symptoms. This
> is worse as it is better to have the system fail in a deterministic way
> than to fail in random way.  Some of the failures that we experienced
> were dma, timer, and module initialization problems.  These were all 
> red
> herrings.

Was your development testing going from 64k to 64k base
kernel?   Or 64k to 4k or 4k to 64k?   Did your development
break 4k pages along the way?

The reason I ask is because when starting a similar kernel,
I expect any failures of invalidating the kernel linear
mapping to be mapped with the same mapping the next time.
If you were going to a dissimilar kernel, or possibly a
modular kernel with modules loaded in random order, I would
expect incorrect io-mapping and vmalloc could also pose
problems you mentioned.

It appears the distros want to use a similar kernel for
their dump kernel.  The would prefer it be the same binary;
I'm trying to influence people that it is a softer requirement
than not slowing down the primary kernel.

> Having BUG_ONs in the code allows developers to make
> assertions about the code which is important when diagnosing strange
> system crashes and provides a clue to future developers that they need
> to add support for something.  Comments are fine, but asserts are 
> better
> in that they show up in cscope and other development tools.  So all
> things considered I think it is better to include them.

I think a better way to debug this code is to call it from a
debugfs hook or xmon dump command to scan the table and do
the computation.  That code would have the full debugger to
notice and print the assert.

Having a xmon function to dump the hash table or a slot
might be useful for other purposes.

If you think you need the assert, then I ask it be put under
an ifdef or it not be triggered when kexec is called with
panic=1  (ie BUG_ON(x && !panic).  Alternatively you could
run the table with dry-run sometime between cpu_down and the
kernel copy.

> > Appart from that,
> >
> > Acked-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> >

milton

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3] powerpc: 64K page support for kexec
  2007-04-27  4:36           ` [PATCH v3] " Milton Miller
@ 2007-04-27 14:42             ` Luke Browning
  2007-04-27 16:51               ` Milton Miller
  2007-04-27 16:22             ` [PATCH v4] " Luke Browning
  1 sibling, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-04-27 14:42 UTC (permalink / raw)
  To: Milton Miller
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

On Thu, 2007-04-26 at 23:36 -0500, Milton Miller wrote:

> Was your development testing going from 64k to 64k base
> kernel?   Or 64k to 4k or 4k to 64k?   Did your development
> break 4k pages along the way?
> 

The test was from a 64K to 64K kernel.  The primary fix was to 4K
mappings as the PAGE_SHIFT macro resolves to 16, so the 4K page hash was
incorrectly applied.  The 16M and 64K pages were correctly cleared.
Presumably, this is the reason we had problems with I/O.

> The reason I ask is because when starting a similar kernel,
> I expect any failures of invalidating the kernel linear
> mapping to be mapped with the same mapping the next time.
> If you were going to a dissimilar kernel, or possibly a
> modular kernel with modules loaded in random order, I would
> expect incorrect io-mapping and vmalloc could also pose
> problems you mentioned.
> 
> It appears the distros want to use a similar kernel for
> their dump kernel.  The would prefer it be the same binary;
> I'm trying to influence people that it is a softer requirement
> than not slowing down the primary kernel.

Here's a non-related question.  What is the pSeries strategy for
automating the capture of the dump image and rebooting to a usable
kernel.  Has anybody provided a customized initrd for this purpose that
ultimately reboots the system to the default kernel. Seems like that is
more important in terms of minimizing downtime than inlining a function.

> I think a better way to debug this code is to call it from a
> debugfs hook or xmon dump command to scan the table and do
> the computation.  That code would have the full debugger to
> notice and print the assert.

I am not familiar with debugfs but I suspect that wasn't an option,
because the system hung immediately.  xmon was not invoked either.

> 
> Having a xmon function to dump the hash table or a slot
> might be useful for other purposes.
> 

agreed.

> If you think you need the assert, then I ask it be put under
> an ifdef or it not be triggered when kexec is called with
> panic=1  (ie BUG_ON(x && !panic).  Alternatively you could
> run the table with dry-run sometime between cpu_down and the
> kernel copy.

That is a good idea.  That way, people can use kexec -l to debug.

> 
> > > Appart from that,
> > >
> > > Acked-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> > >
> 
> milton
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27  4:36           ` [PATCH v3] " Milton Miller
  2007-04-27 14:42             ` Luke Browning
@ 2007-04-27 16:22             ` Luke Browning
  2007-04-27 16:59               ` Milton Miller
  1 sibling, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-04-27 16:22 UTC (permalink / raw)
  To: Milton Miller
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

This patch fixes a couple of kexec problems related to 64K page
support in the kernel.  kexec issues a tlbie for each pte.  The
parameters for the tlbie are the page size and the virtual address.
Support was missing for the computation of these two parameters
for 64K pages.  This patch adds that support.

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>


Index: linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.21-rc4.orig/arch/powerpc/mm/hash_native_64.c
+++ linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
@@ -340,31 +340,74 @@ static void native_hpte_invalidate(unsig
 	local_irq_restore(flags);
 }
 
-/*
- * XXX This need fixing based on page size. It's only used by
- * native_hpte_clear() for now which needs fixing too so they
- * make a good pair...
- */
-static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
-
-	va = avpn << 23;
-
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
+static void hpte_decode(hpte_t *hpte, unsigned long slot, 
+			int *psize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc, avpnm_bits;
+		
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
 
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+			/* 4K pages are not represented by LP */
+			if (size == MMU_PAGE_4K)
+				continue;
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
 
-		va |= vpi << PAGE_SHIFT;
+	/*
+	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
+	 * fall under the p<=23 rules for calculating the virtual address.
+	 * In the case of 16M pages, an extra bit is stolen from the AVPN
+	 * field to achieve the requisite 24 bits. 
+	 * 
+	 * You can use kexec -l to debug new page support!
+	 */
+	BUG_ON(hpte_v & 0x4000000000000000UL && !panic);   /* 1T segment */
+	BUG_ON(size == MMU_PAGE_16G && !panic);
+	BUG_ON(size == MMU_PAGE_64K_AP && !panic);
+
+	shift = mmu_psize_defs[size].shift;
+	if (mmu_psize_defs[size].avpnm)
+		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
+	else
+		avpnm_bits = 0;
+	if (shift - avpnm_bits <= 23) {
+		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
+
+		if (shift < 23) {
+			unsigned long vpi, pteg;
+
+			pteg = slot / HPTES_PER_GROUP;
+			if (hpte_v & HPTE_V_SECONDARY)
+				pteg = ~pteg;
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			avpn |= (vpi << mmu_psize_defs[size].shift);
+		}
 	}
 
-	return va;
+	*va = avpn;
+	*psize = size;
 }
 
 /*
@@ -374,15 +417,14 @@ static unsigned long slot2va(unsigned lo
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
- *
- * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
 	unsigned long slot, slots, flags;
 	hpte_t *hptep = htab_address;
-	unsigned long hpte_v;
+	unsigned long hpte_v, va;
 	unsigned long pteg_count;
+	int psize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -408,8 +450,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &va);
 			hptep->v = 0;
-			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
+			__tlbie(va, psize);
 		}
 	}
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3] powerpc: 64K page support for kexec
  2007-04-27 14:42             ` Luke Browning
@ 2007-04-27 16:51               ` Milton Miller
  0 siblings, 0 replies; 28+ messages in thread
From: Milton Miller @ 2007-04-27 16:51 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

On Apr 27, 2007, at 9:42 AM, Luke Browning wrote:
> On Thu, 2007-04-26 at 23:36 -0500, Milton Miller wrote:
>> It appears the distros want to use a similar kernel for
>> their dump kernel.  They would prefer it be the same binary;
>> I'm trying to influence people that it is a softer requirement
>> than not slowing down the primary kernel.
>
> Here's anon-related question.  What is the pSeries strategy for
> automating the capture of the dump image and rebooting to a usable
> kernel.  Has anybody provided a customized initrd for this purpose that
> ultimately reboots the system to the default kernel. Seems like that is
> more important in terms of minimizing downtime than inlining a 
> function.

I'm not involved directly, but my understanding is the distros are
planning to use kdump and makedumpfile or similar, save it to
disk or network, then call the platform reboot (for pSeries, that
would be RTAS).  The platform would then be responsible for all
cleanup and reinitializing the new kernel.

So the page-table clear is part of the path to capture the dump.

>> I think a better way to debug this code is to call it from a
>> debugfs hook or xmon dump command to scan the table and do
>> the computation.  That code would have the full debugger to
>> notice and print the assert.
>
> I am not familiar with debugfs but I suspect that wasn't an option,
> because the system hung immediately.  xmon was not invoked either.

Referring to when you put the trap after the table?   Yes I would
expect it to hang, the 700 program check will bounce to 400,
which will then bounce repeatedly against instruction miss.

The point was to check if you could take down the table before
actually taking it down.  xmon uses the kernel virtual mappings.

>> Having a xmon function to dump the hash table or a slot
>> might be useful for other purposes.
>>
>
> agreed.
>
>> If you think you need the assert, then I ask it be put under
>> an ifdef or it not be triggered when kexec is called with
>> panic=1  (ie BUG_ON(x && !panic).  Alternatively you could
>> run the table with dry-run sometime between cpu_down and the
>> kernel copy.
>
> That is a good idea.  That way, people can use kexec -l to debug.
>
>>>> Appart from that,
>>>>
>>>> Acked-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
>>>>
>>
>> milton
>>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27 16:22             ` [PATCH v4] " Luke Browning
@ 2007-04-27 16:59               ` Milton Miller
  2007-04-27 17:30                 ` Luke Browning
  0 siblings, 1 reply; 28+ messages in thread
From: Milton Miller @ 2007-04-27 16:59 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev


On Apr 27, 2007, at 11:22 AM, Luke Browning wrote:
> +	/*
> +	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
> +	 * fall under the p<=23 rules for calculating the virtual address.
> +	 * In the case of 16M pages, an extra bit is stolen from the AVPN
> +	 * field to achieve the requisite 24 bits.
> +	 *
> +	 * You can use kexec -l to debug new page support!
> +	 */
> +	BUG_ON(hpte_v & 0x4000000000000000UL && !panic);   /* 1T segment */
> +	BUG_ON(size == MMU_PAGE_16G && !panic);
> +	BUG_ON(size == MMU_PAGE_64K_AP && !panic);
> +

I see I've achieved my nefarious goal of making these bugs never happen.

(panic is a function, so you are checking that the staticly linked
non-weak function is available.   If you want to check on when
if its a panic kdump or not, you need to decode the flag and pass
it to kexec_sequence, pass it back here, and update all platforms
for the new parameter).

But I still say the check while clearing the table is too late,
it should be a debug scan before clearing any mappings.

milton

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27 16:59               ` Milton Miller
@ 2007-04-27 17:30                 ` Luke Browning
  2007-04-27 18:23                   ` Haren Myneni
  2007-04-29  8:30                   ` Paul Mackerras
  0 siblings, 2 replies; 28+ messages in thread
From: Luke Browning @ 2007-04-27 17:30 UTC (permalink / raw)
  To: Milton Miller
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

On Fri, 2007-04-27 at 11:59 -0500, Milton Miller wrote:

> 
> I see I've achived my nefarious goal of making these bugs never happen.
> 
> (panic is a function, so you are checking that the staticly linked
> non-weak function is available.   If you want to check on when
> if its a panic kdump or not, you need to decode the flag and pass
> it to kexec_sequence, pass it back here, and update all platforms
> for the new parameter).
> 
> But I still say the check while clearing the table is too late,
> it should be a debug scan before clearing any mappings.

How about the following as an alternative. 

  BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
  BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
  BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));

I don't have time to work on a multi-platform solution.

Regards,
Luke

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27 17:30                 ` Luke Browning
@ 2007-04-27 18:23                   ` Haren Myneni
  2007-04-29  5:35                     ` Milton Miller
  2007-04-29  8:30                   ` Paul Mackerras
  1 sibling, 1 reply; 28+ messages in thread
From: Haren Myneni @ 2007-04-27 18:23 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, Milton Miller, linuxppc-dev, Paul Mackerras,
	Olof Johansson, cbe-oss-dev

Luke Browning wrote:
> On Fri, 2007-04-27 at 11:59 -0500, Milton Miller wrote:
>
>   
>> I see I've achived my nefarious goal of making these bugs never happen.
>>
>> (panic is a function, so you are checking that the staticly linked
>> non-weak function is available.   If you want to check on when
>> if its a panic kdump or not, you need to decode the flag and pass
>> it to kexec_sequence, pass it back here, and update all platforms
>> for the new parameter).
>>
>> But I still say the check while clearing the table is too late,
>> it should be a debug scan before clearing any mappings.
>>     
>
> How about the following as an alternative. 
>
>   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
>   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
>   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));
>   
should be crashing_cpu -  contains the panic cpu ID

> I don't have time to work on a multi-platform solution.
>
> Regards,
> Luke
>
>
>
>
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
>   

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27 18:23                   ` Haren Myneni
@ 2007-04-29  5:35                     ` Milton Miller
  0 siblings, 0 replies; 28+ messages in thread
From: Milton Miller @ 2007-04-29  5:35 UTC (permalink / raw)
  To: Haren Myneni
  Cc: Arnd Bergmann, linuxppc-dev, Paul Mackerras, Olof Johansson,
	cbe-oss-dev

On Apr 27, 2007, at 1:23 PM, Haren Myneni wrote:
> Luke Browning wrote:
>> On Fri, 2007-04-27 at 11:59 -0500, Milton Miller wrote:
>>> (panic is a function, so you are checking that the staticly linked
>>> non-weak function is available.   If you want to check on when
>>> if its a panic kdump or not, you need to decode the flag and pass
>>> it to kexec_sequence, pass it back here, and update all platforms
>>> for the new parameter).
>>>
>>> But I still say the check while clearing the table is too late,
>>> it should be a debug scan before clearing any mappings.
>>
>> How about the following as an alternative.
>>   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
>>   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
>>   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));
>>
> should be crashing_cpu -  contains the panic cpu ID

and that only exists (or is even declared) when CONFIG_KEXEC,
whereas this code is based on MMU.

>> I don't have time to work on a multi-platform solution.
>>
>> Regards,
>> Luke

We could move the comment to fix this next to the MMU definitions,
and drop the BUG.   For that matter, we probably don't need to
define or save slots for the sizes we don't support.  In other
words, tie adding support to including support for this function.

milton

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-27 17:30                 ` Luke Browning
  2007-04-27 18:23                   ` Haren Myneni
@ 2007-04-29  8:30                   ` Paul Mackerras
  2007-04-29  9:31                     ` Benjamin Herrenschmidt
  2007-04-29 13:27                     ` Segher Boessenkool
  1 sibling, 2 replies; 28+ messages in thread
From: Paul Mackerras @ 2007-04-29  8:30 UTC (permalink / raw)
  To: Luke Browning
  Cc: Arnd Bergmann, Milton Miller, linuxppc-dev, Olof Johansson,
	cbe-oss-dev

Luke Browning writes:

> How about the following as an alternative. 
> 
>   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
>   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
>   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));

How much effort would it be to add the code to cope with those
conditions and give the correct answers?  I'd much prefer that to
these BUG_ONs.

Paul.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-29  8:30                   ` Paul Mackerras
@ 2007-04-29  9:31                     ` Benjamin Herrenschmidt
  2007-04-29 13:27                     ` Segher Boessenkool
  1 sibling, 0 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-29  9:31 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Arnd Bergmann, Milton Miller, linuxppc-dev, Olof Johansson,
	cbe-oss-dev

On Sun, 2007-04-29 at 18:30 +1000, Paul Mackerras wrote:
> Luke Browning writes:
> 
> > How about the following as an alternative. 
> > 
> >   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
> >   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
> >   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));
> 
> How much effort would it be to add the code to cope with those
> conditions and give the correct answers?  I'd much prefer that to
> these BUG_ONs.

hrm...

1T segments shouldn't be too hard... 16G pages goes with it pretty much,
and AP ... well... do they work on any implementation anyway ?

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-29  8:30                   ` Paul Mackerras
  2007-04-29  9:31                     ` Benjamin Herrenschmidt
@ 2007-04-29 13:27                     ` Segher Boessenkool
  2007-04-29 22:49                       ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 28+ messages in thread
From: Segher Boessenkool @ 2007-04-29 13:27 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Arnd Bergmann, Milton Miller, linuxppc-dev, Olof Johansson,
	cbe-oss-dev

>>   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
>>   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
>>   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));
>
> How much effort would it be to add the code to cope with those
> conditions and give the correct answers?  I'd much prefer that to
> these BUG_ONs.

The test "& 0x400..." should be "& 0xc00..." since the
"B" field in a page table entry is the top _two_ bits.

Not sure what that tells us about the work involved in
making this code do the right thing under all conditions,
but it would seem to indicate trouble ahead.  Also, the
encodings with bit 0 = 1 are reserved (says arch v2.03),
so there would probably still be a BUG_ON() here.


Segher

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-04-29 13:27                     ` Segher Boessenkool
@ 2007-04-29 22:49                       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2007-04-29 22:49 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Arnd Bergmann, Milton Miller, linuxppc-dev, Paul Mackerras,
	Olof Johansson, cbe-oss-dev

On Sun, 2007-04-29 at 15:27 +0200, Segher Boessenkool wrote:
> >>   BUG_ON((hpte_v & 0x4000000000000000UL) && (crashing_cpus == -1));
> >>   BUG_ON((size == MMU_PAGE_16G) && (crashing_cpus == -1));
> >>   BUG_ON((size == MMU_PAGE_64K_AP) && (crashing_cpus == -1));
> >
> > How much effort would it be to add the code to cope with those
> > conditions and give the correct answers?  I'd much prefer that to
> > these BUG_ONs.
> 
> The test "& 0x400..." should be "& 0xc00..." since the
> "B" filed in a page table entry is the top _two_ bits.

Yup, we need to put some symbolic constants too. We can dig them from
the old never-applied 1T segment patch I did a while ago.

> Not sure what that tells us about the work involved in
> making this code do the right thing under all conditions,
> but it would seem to indicate trouble ahead.  Also, the
> encodings with bit 0 = 1 are reserved (says arch v2.03),
> so there would probably still be a BUG_ON() here.

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4] powerpc: 64K page support for kexec
  2007-04-26 22:32 ` Olof Johansson
@ 2007-05-02 14:19   ` Luke Browning
  2007-05-03 13:45     ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Luke Browning @ 2007-05-02 14:19 UTC (permalink / raw)
  To: Olof Johansson
  Cc: Arnd Bergmann, miltonm, linuxppc-dev, Paul Mackerras, cbe-oss-dev

This patch fixes a couple of kexec problems related to 64K page
support in the kernel.  kexec issues a tlbie for each pte.  The
parameters for the tlbie are the page size and the virtual address.
Support was missing for the computation of these two parameters
for 64K pages.  This patch adds that support.

Same as previous versions without BUG_ON()s.

Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Olof Johansson <olof@lixom.net>

Index: linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
===================================================================
--- linux-2.6.21-rc4.orig/arch/powerpc/mm/hash_native_64.c
+++ linux-2.6.21-rc4/arch/powerpc/mm/hash_native_64.c
@@ -26,6 +26,7 @@
 #include <asm/tlb.h>
 #include <asm/cputable.h>
 #include <asm/udbg.h>
+#include <asm/kexec.h>
 
 #ifdef DEBUG_LOW
 #define DBG_LOW(fmt...) udbg_printf(fmt)
@@ -340,31 +341,70 @@ static void native_hpte_invalidate(unsig
 	local_irq_restore(flags);
 }
 
-/*
- * XXX This need fixing based on page size. It's only used by
- * native_hpte_clear() for now which needs fixing too so they
- * make a good pair...
- */
-static unsigned long slot2va(unsigned long hpte_v, unsigned long slot)
-{
-	unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v);
-	unsigned long va;
-
-	va = avpn << 23;
-
-	if (! (hpte_v & HPTE_V_LARGE)) {
-		unsigned long vpi, pteg;
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
+static void hpte_decode(hpte_t *hpte, unsigned long slot, 
+			int *psize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc, avpnm_bits;
+		
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
 
-		pteg = slot / HPTES_PER_GROUP;
-		if (hpte_v & HPTE_V_SECONDARY)
-			pteg = ~pteg;
+			/* 4K pages are not represented by LP */
+			if (size == MMU_PAGE_4K)
+				continue;
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
 
-		vpi = ((va >> 28) ^ pteg) & htab_hash_mask;
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
 
-		va |= vpi << PAGE_SHIFT;
+	/*
+	 * FIXME, the code below works for 16M, 64K, and 4K pages as these
+	 * fall under the p<=23 rules for calculating the virtual address.
+	 * In the case of 16M pages, an extra bit is stolen from the AVPN
+	 * field to achieve the requisite 24 bits. 
+	 * 
+	 * Does not work for 16G pages or 1 TB segments.
+	 */
+	shift = mmu_psize_defs[size].shift;
+	if (mmu_psize_defs[size].avpnm)
+		avpnm_bits = __ilog2_u64(mmu_psize_defs[size].avpnm) + 1;
+	else
+		avpnm_bits = 0;
+	if (shift - avpnm_bits <= 23) {
+		avpn = HPTE_V_AVPN_VAL(hpte_v) << 23;
+
+		if (shift < 23) {
+			unsigned long vpi, pteg;
+
+			pteg = slot / HPTES_PER_GROUP;
+			if (hpte_v & HPTE_V_SECONDARY)
+				pteg = ~pteg;
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			avpn |= (vpi << mmu_psize_defs[size].shift);
+		}
 	}
 
-	return va;
+	*va = avpn;
+	*psize = size;
 }
 
 /*
@@ -374,15 +414,14 @@ static unsigned long slot2va(unsigned lo
  *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
- *
- * XXX FIXME: 4k only for now !
  */
 static void native_hpte_clear(void)
 {
 	unsigned long slot, slots, flags;
 	hpte_t *hptep = htab_address;
-	unsigned long hpte_v;
+	unsigned long hpte_v, va;
 	unsigned long pteg_count;
+	int psize;
 
 	pteg_count = htab_hash_mask + 1;
 
@@ -408,8 +447,9 @@ static void native_hpte_clear(void)
 		 * already hold the native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &va);
 			hptep->v = 0;
-			__tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K);
+			__tlbie(va, psize);
 		}
 	}
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4] powerpc: 64K page support for kexec
  2007-05-02 14:19   ` [PATCH v4] " Luke Browning
@ 2007-05-03 13:45     ` Arnd Bergmann
  0 siblings, 0 replies; 28+ messages in thread
From: Arnd Bergmann @ 2007-05-03 13:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Olof Johansson, Paul Mackerras, cbe-oss-dev, miltonm

On Wednesday 02 May 2007, Luke Browning wrote:
> This patch fixes a couple of kexec problems related to 64K page
> support in the kernel. =A0kexec issues a tlbie for each pte. =A0The
> parameters for the tlbie are the page size and the virtual address.
> Support was missing for the computation of these two parameters
> for 64K pages. =A0This patch adds that support.
>=20
> Same as previous versions without BUG_ON()s.
>=20
> Signed-off-by: Luke Browning <lukebrowning@us.ibm.com>
> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>

Paul, please include this in your next merge for 2.6.22.

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2007-05-03 13:45 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-24 18:31 [PATCH] 64K page support for kexec Luke Browning
2007-04-24 19:43 ` Olof Johansson
2007-04-24 22:50   ` Benjamin Herrenschmidt
2007-04-24 23:07     ` Olof Johansson
2007-04-25  5:48       ` Milton Miller
2007-04-25 19:35     ` [PATCH v2] powerpc: " Luke Browning
2007-04-25 22:19       ` Benjamin Herrenschmidt
2007-04-26 15:28         ` Luke Browning
2007-04-27  4:36           ` [PATCH v3] " Milton Miller
2007-04-27 14:42             ` Luke Browning
2007-04-27 16:51               ` Milton Miller
2007-04-27 16:22             ` [PATCH v4] " Luke Browning
2007-04-27 16:59               ` Milton Miller
2007-04-27 17:30                 ` Luke Browning
2007-04-27 18:23                   ` Haren Myneni
2007-04-29  5:35                     ` Milton Miller
2007-04-29  8:30                   ` Paul Mackerras
2007-04-29  9:31                     ` Benjamin Herrenschmidt
2007-04-29 13:27                     ` Segher Boessenkool
2007-04-29 22:49                       ` Benjamin Herrenschmidt
2007-04-26  7:15       ` [PATCH v2] " Olof Johansson
2007-04-24 22:48 ` [PATCH] " Benjamin Herrenschmidt
2007-04-25 13:06   ` Luke Browning
2007-04-25 22:11     ` Benjamin Herrenschmidt
  -- strict thread matches above, loose matches on Subject: below --
2007-04-26 22:23 [PATCH v3] powerpc: " Luke Browning
2007-04-26 22:32 ` Olof Johansson
2007-05-02 14:19   ` [PATCH v4] " Luke Browning
2007-05-03 13:45     ` Arnd Bergmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).