* [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
@ 2008-08-27 22:38 Becky Bruce
  2008-08-27 23:43 ` Scott Wood
  2008-09-01  5:19 ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-27 22:38 UTC (permalink / raw)
  To: linuxppc-dev

This rearranges a bit of code, and adds support for
36-bit physical addressing for configs that use a
hashed page table.  The 36b physical support is not
enabled by default on any config - it must be
explicitly enabled via the config system.

This patch *only* expands the page table code to accommodate
large physical addresses on 32-bit systems and enables the
PHYS_64BIT config option for 6xx.  It does *not*
allow you to boot a board with more than about 3.5GB of
RAM - for that, SWIOTLB support is also required (and
coming soon).

Signed-off-by: Becky Bruce <becky.bruce@freescale.com>
---
NOTE: This patch is now based against Kumar's mmu tree,
as we had conflicting edits.  Kumar will pick this up into his
tree if it looks OK.

Cheers,
Becky

 arch/powerpc/include/asm/io.h            |    2 +-
 arch/powerpc/include/asm/page_32.h       |   10 ++++-
 arch/powerpc/include/asm/pgtable-ppc32.h |   30 +++++++++++-
 arch/powerpc/kernel/head_32.S            |    4 +-
 arch/powerpc/kernel/head_fsl_booke.S     |    2 -
 arch/powerpc/mm/hash_low_32.S            |   78 +++++++++++++++++++++++------
 arch/powerpc/mm/pgtable_32.c             |    4 +-
 arch/powerpc/platforms/Kconfig.cputype   |   14 ++++--
 8 files changed, 114 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 77c7fa0..08266d2 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -711,7 +711,7 @@ static inline void * phys_to_virt(unsigned long address)
 /*
  * Change "struct page" to physical address.
  */
-#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
+#define page_to_phys(page)	((phys_addr_t)page_to_pfn(page) << PAGE_SHIFT)
 
 /* We do NOT want virtual merging, it would put too much pressure on
  * our iommu allocator. Instead, we want drivers to be smart enough
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index ebfae53..0b253f6 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -13,10 +13,18 @@
 #define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
 #endif
 
+#ifdef CONFIG_PTE_64BIT
+#define PTE_FLAGS_OFFSET	4	/* offset of PTE flags, in bytes */
+#define LNX_PTE_SIZE		8	/* size of a linux PTE, in bytes */
+#else
+#define PTE_FLAGS_OFFSET	0
+#define LNX_PTE_SIZE		4
+#endif
+
 #ifndef __ASSEMBLY__
 /*
  * The basic type of a PTE - 64 bits for those CPUs with > 32 bit
- * physical addressing.  For now this just the IBM PPC440.
+ * physical addressing.
  */
 #ifdef CONFIG_PTE_64BIT
 typedef unsigned long long pte_basic_t;
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 82bf914..4a7055a 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -369,7 +369,12 @@ extern int icache_44x_need_flush;
 #define _PAGE_RW	0x400	/* software: user write access allowed */
 #define _PAGE_SPECIAL	0x800	/* software: Special page */
 
+#ifdef CONFIG_PTE_64BIT
+/* We never clear the high word of the pte, mask those off */
+#define _PTE_NONE_MASK	(0xffffffff00000000ULL | _PAGE_HASHPTE)
+#else
 #define _PTE_NONE_MASK	_PAGE_HASHPTE
+#endif
 
 #define _PMD_PRESENT	0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
@@ -528,7 +533,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
 
 #define pte_none(pte)		((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
 #define pte_present(pte)	(pte_val(pte) & _PAGE_PRESENT)
-#define pte_clear(mm,addr,ptep)	do { pte_update(ptep, ~0, 0); } while (0)
+#define pte_clear(mm, addr, ptep) \
+	do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
 
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
@@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
 #if _PAGE_HASHPTE != 0
+#ifndef CONFIG_PTE_64BIT
 	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
 #else
+	/*
+	 * We have to do the write of the 64b pte as 2 stores.  This
+	 * code assumes that the entry we're storing to is currently
+	 * not valid and that all callers have the page table lock.
+	 * Having the entry be not valid protects readers who might read
+	 * between the first and second stores.
+	 */
+	unsigned int tmp;
+
+	__asm__ __volatile__("\
+1:	lwarx	%0,0,%4\n\
+	rlwimi	%L2,%0,0,30,30\n\
+	stw	%2,0(%3)\n\
+	eieio\n\
+	stwcx.  %L2,0,%4\n\
+	bne-	1b"
+	: "=&r" (tmp), "=m" (*ptep)
+	: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
+	: "cc");
+#endif	/* CONFIG_PTE_64BIT */
+#else /* _PAGE_HASHPTE == 0 */
 #if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
 	__asm__ __volatile__("\
 		stw%U0%X0 %2,%0\n\
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 8bb6575..a6de6db 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -369,13 +369,13 @@ i##n:								\
 DataAccess:
 	EXCEPTION_PROLOG
 	mfspr	r10,SPRN_DSISR
+	stw	r10,_DSISR(r11)
 	andis.	r0,r10,0xa470		/* weird error? */
 	bne	1f			/* if not, try to put a PTE */
 	mfspr	r4,SPRN_DAR		/* into the hash table */
 	rlwinm	r3,r10,32-15,21,21	/* DSISR_STORE -> _PAGE_RW */
 	bl	hash_page
-1:	stw	r10,_DSISR(r11)
-	mr	r5,r10
+1:	lwz	r5,_DSISR(r11)		/* get DSISR value */
 	mfspr	r4,SPRN_DAR
 	EXC_XFER_EE_LITE(0x300, handle_page_fault)
 
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index fa39cce..9000891 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -422,7 +422,6 @@ skpinv:	addi	r6,r6,1				/* Increment */
  *   r12 is pointer to the pte
  */
 #ifdef CONFIG_PTE_64BIT
-#define PTE_FLAGS_OFFSET	4
 #define FIND_PTE	\
 	rlwinm	r12, r10, 13, 19, 29;	/* Compute pgdir/pmd offset */	\
 	lwzx	r11, r12, r11;		/* Get pgd/pmd entry */		\
@@ -431,7 +430,6 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	rlwimi	r12, r10, 23, 20, 28;	/* Compute pte address */	\
 	lwz	r11, 4(r12);		/* Get pte entry */
 #else
-#define PTE_FLAGS_OFFSET	0
 #define FIND_PTE	\
 	rlwimi	r11, r10, 12, 20, 29;	/* Create L1 (pgdir/pmd) address */	\
 	lwz	r11, 0(r11);		/* Get L1 entry */			\
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index b9ba7d9..d63e20a 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
  * Returns to the caller if the access is illegal or there is no
  * mapping for the address.  Otherwise it places an appropriate PTE
  * in the hash table and returns from the exception.
- * Uses r0, r3 - r8, ctr, lr.
+ * Uses r0, r3 - r8, r10, ctr, lr.
  */
 	.text
 _GLOBAL(hash_page)
@@ -106,9 +106,15 @@ _GLOBAL(hash_page)
 	addi	r5,r5,swapper_pg_dir@l	/* kernel page table */
 	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
 112:	add	r5,r5,r7		/* convert to phys addr */
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
 	lwz	r8,0(r5)		/* get pmd entry */
 	rlwinm.	r8,r8,0,0,19		/* extract address of pte page */
+#else
+	rlwinm	r8,r4,13,19,29		/* Compute pgdir/pmd offset */
+	lwzx	r8,r8,r5		/* Get L1 entry */
+	rlwinm. r8,r8,0,0,20		/* extract pt base address */
+#endif
 #ifdef CONFIG_SMP
 	beq-	hash_page_out		/* return if no mapping */
 #else
@@ -118,7 +124,11 @@ _GLOBAL(hash_page)
 	   to the address following the rfi. */
 	beqlr-
 #endif
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r8,r4,22,20,29		/* insert next 10 bits of address */
+#else
+	rlwimi	r8,r4,23,20,28		/* compute pte address */
+#endif
 	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
 	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
 
@@ -127,9 +137,15 @@ _GLOBAL(hash_page)
 	 * because almost always, there won't be a permission violation
 	 * and there won't already be an HPTE, and thus we will have
 	 * to update the PTE to set _PAGE_HASHPTE.  -- paulus.
+	 *
+	 * If PTE_64BIT is set, the low word is the flags word; use that
+	 * word for locking since it contains all the interesting bits.
 	 */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
 retry:
-	lwarx	r6,0,r8			/* get linux-style pte */
+	lwarx	r6,0,r8			/* get linux-style pte, flag word */
 	andc.	r5,r3,r6		/* check access & ~permission */
 #ifdef CONFIG_SMP
 	bne-	hash_page_out		/* return if access not permitted */
@@ -137,6 +153,11 @@ retry:
 	bnelr-
 #endif
 	or	r5,r0,r6		/* set accessed/dirty bits */
+#ifdef CONFIG_PTE_64BIT
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#endif
 	stwcx.	r5,0,r8			/* attempt to update PTE */
 	bne-	retry			/* retry if someone got there first */
 
@@ -203,9 +224,9 @@ _GLOBAL(add_hash_page)
 	 * we can't take a hash table miss (assuming the code is
 	 * covered by a BAT).  -- paulus
 	 */
-	mfmsr	r10
+	mfmsr	r9
 	SYNC
-	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r9,0,17,15		/* clear bit 16 (MSR_EE) */
 	rlwinm	r0,r0,0,28,26		/* clear MSR_DR */
 	mtmsr	r0
 	SYNC_601
@@ -214,14 +235,14 @@ _GLOBAL(add_hash_page)
 	tophys(r7,0)
 
 #ifdef CONFIG_SMP
-	addis	r9,r7,mmu_hash_lock@ha
-	addi	r9,r9,mmu_hash_lock@l
-10:	lwarx	r0,0,r9			/* take the mmu_hash_lock */
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
+10:	lwarx	r0,0,r6			/* take the mmu_hash_lock */
 	cmpi	0,r0,0
 	bne-	11f
-	stwcx.	r8,0,r9
+	stwcx.	r8,0,r6
 	beq+	12f
-11:	lwz	r0,0(r9)
+11:	lwz	r0,0(r6)
 	cmpi	0,r0,0
 	beq	10b
 	b	11b
@@ -234,10 +255,20 @@ _GLOBAL(add_hash_page)
 	 * HPTE, so we just unlock and return.
 	 */
 	mr	r8,r5
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r8,r4,22,20,29
+#else
+	rlwimi	r8,r4,23,20,28
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
 1:	lwarx	r6,0,r8
 	andi.	r0,r6,_PAGE_HASHPTE
 	bne	9f			/* if HASHPTE already set, done */
+#ifdef CONFIG_PTE_64BIT
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#endif
 	ori	r5,r6,_PAGE_HASHPTE
 	stwcx.	r5,0,r8
 	bne-	1b
@@ -246,13 +277,15 @@ _GLOBAL(add_hash_page)
 
 9:
 #ifdef CONFIG_SMP
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
 	eieio
 	li	r0,0
-	stw	r0,0(r9)		/* clear mmu_hash_lock */
+	stw	r0,0(r6)		/* clear mmu_hash_lock */
 #endif
 
 	/* reenable interrupts and DR */
-	mtmsr	r10
+	mtmsr	r9
 	SYNC_601
 	isync
 
@@ -267,7 +300,8 @@ _GLOBAL(add_hash_page)
  * r5 contains the linux PTE, r6 contains the old value of the
  * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
  * offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off).
+ * -KERNELBASE if it is off).  r10 contains the upper half of
+ * the PTE if CONFIG_PTE_64BIT.
  * On SMP, the caller should have the mmu_hash_lock held.
  * We assume that the caller has (or will) set the _PAGE_HASHPTE
  * bit in the linux PTE in memory.  The value passed in r6 should
@@ -313,6 +347,11 @@ _GLOBAL(create_hpte)
 BEGIN_FTR_SECTION
 	ori	r8,r8,_PAGE_COHERENT	/* set M (coherence required) */
 END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+#ifdef CONFIG_PTE_64BIT
+	/* Put the XPN bits into the PTE */
+	rlwimi	r8,r10,8,20,22
+	rlwimi	r8,r10,2,29,29
+#endif
 
 	/* Construct the high word of the PPC-style PTE (r5) */
 	rlwinm	r5,r3,7,1,24		/* put VSID in 0x7fffff80 bits */
@@ -499,14 +538,18 @@ _GLOBAL(flush_hash_pages)
 	isync
 
 	/* First find a PTE in the range that has _PAGE_HASHPTE set */
+#ifndef CONFIG_PTE_64BIT
 	rlwimi	r5,r4,22,20,29
-1:	lwz	r0,0(r5)
+#else
+	rlwimi	r5,r4,23,20,28
+#endif
+1:	lwz	r0,PTE_FLAGS_OFFSET(r5)
 	cmpwi	cr1,r6,1
 	andi.	r0,r0,_PAGE_HASHPTE
 	bne	2f
 	ble	cr1,19f
 	addi	r4,r4,0x1000
-	addi	r5,r5,4
+	addi	r5,r5,LNX_PTE_SIZE
 	addi	r6,r6,-1
 	b	1b
 
@@ -545,7 +588,10 @@ _GLOBAL(flush_hash_pages)
 	 * already clear, we're done (for this pte).  If not,
 	 * clear it (atomically) and proceed.  -- paulus.
 	 */
-33:	lwarx	r8,0,r5			/* fetch the pte */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r5,r5,PTE_FLAGS_OFFSET
+#endif
+33:	lwarx	r8,0,r5			/* fetch the pte flags word */
 	andi.	r0,r8,_PAGE_HASHPTE
 	beq	8f			/* done if HASHPTE is already clear */
 	rlwinm	r8,r8,0,31,29		/* clear HASHPTE bit */
@@ -590,7 +636,7 @@ _GLOBAL(flush_hash_patch_B)
 
 8:	ble	cr1,9f			/* if all ptes checked */
 81:	addi	r6,r6,-1
-	addi	r5,r5,4			/* advance to next pte */
+	addi	r5,r5,LNX_PTE_SIZE	/* go to next linux pte flag word */
 	addi	r4,r4,0x1000
 	lwz	r0,0(r5)		/* check next pte */
 	cmpwi	cr1,r6,1
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 2001abd..c31d6d2 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -73,7 +73,7 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
 #endif /* HAVE_TLBCAM */
 
 #ifdef CONFIG_PTE_64BIT
-/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */
+/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */
 #define PGDIR_ORDER	1
 #else
 #define PGDIR_ORDER	0
@@ -288,7 +288,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
 }
 
 /*
- * Map in all of physical memory starting at KERNELBASE.
+ * Map in a big chunk of physical memory starting at KERNELBASE.
  */
 void __init mapin_ram(void)
 {
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7f65127..ca5b58b 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
 
 config PTE_64BIT
 	bool
-	depends on 44x || E500
+	depends on 44x || E500 || 6xx
 	default y if 44x
-	default y if E500 && PHYS_64BIT
+	default y if PHYS_64BIT
 
 config PHYS_64BIT
-	bool 'Large physical address support' if E500
-	depends on 44x || E500
+	bool 'Large physical address support' if E500 || 6xx
+	depends on 44x || E500 || 6xx
 	select RESOURCES_64BIT
 	default y if 44x
 	---help---
 	  This option enables kernel support for larger than 32-bit physical
-	  addresses.  This features is not be available on all e500 cores.
+	  addresses.  This features is not be available on all cores.
+
+	  If you have more than 3.5GB of RAM or so, you also need to enable
+	  SWIOTLB under Kernel Options for this to work.  The actual number
+	  is platform-dependent.
 
 	  If in doubt, say N here.
 
-- 
1.5.5.1

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-27 22:38 [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical Becky Bruce
@ 2008-08-27 23:43 ` Scott Wood
  2008-08-28 15:36   ` Becky Bruce
  2008-09-01  5:28   ` Benjamin Herrenschmidt
  2008-09-01  5:19 ` Benjamin Herrenschmidt
  1 sibling, 2 replies; 16+ messages in thread
From: Scott Wood @ 2008-08-27 23:43 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev

Becky Bruce wrote:
>  #if _PAGE_HASHPTE != 0
> +#ifndef CONFIG_PTE_64BIT
>  	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>  #else
> +	/*
> +	 * We have to do the write of the 64b pte as 2 stores.  This
> +	 * code assumes that the entry we're storing to is currently
> +	 * not valid and that all callers have the page table lock.
> +	 * Having the entry be not valid protects readers who might read
> +	 * between the first and second stores.
> +	 */
> +	unsigned int tmp;
> +
> +	__asm__ __volatile__("\
> +1:	lwarx	%0,0,%4\n\
> +	rlwimi	%L2,%0,0,30,30\n\
> +	stw	%2,0(%3)\n\
> +	eieio\n\
> +	stwcx.  %L2,0,%4\n\
> +	bne-	1b"
> +	: "=&r" (tmp), "=m" (*ptep)
> +	: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
> +	: "cc");
> +#endif	/* CONFIG_PTE_64BIT */

Could the stw to the same reservation granule as the stwcx cancel the 
reservation on some implementations?  Plus, if you're assuming that the 
entry is currently invalid and all callers have the page table lock, do 
we need the lwarx/stwcx at all?  At the least, it should check 
PTE_ATOMIC_UPDATES.

%2 should be "+&r", not "r".
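
(For illustration, a sketch of what that constraint change would look
like, keeping the asm body from the patch unchanged: "pte" moves to the
output list as a read-write early-clobber operand, and the operand
numbering happens to stay the same.)

	__asm__ __volatile__("\
1:	lwarx	%0,0,%4\n\
	rlwimi	%L2,%0,0,30,30\n\
	stw	%2,0(%3)\n\
	eieio\n\
	stwcx.  %L2,0,%4\n\
	bne-	1b"
	: "=&r" (tmp), "=m" (*ptep), "+&r" (pte)
	: "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
	: "cc");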

> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> index 7f65127..ca5b58b 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>  
>  config PTE_64BIT
>  	bool
> -	depends on 44x || E500
> +	depends on 44x || E500 || 6xx
>  	default y if 44x
> -	default y if E500 && PHYS_64BIT
> +	default y if PHYS_64BIT

How is this different from PHYS_64BIT?

>  config PHYS_64BIT
> -	bool 'Large physical address support' if E500
> -	depends on 44x || E500
> +	bool 'Large physical address support' if E500 || 6xx

Maybe "if !44x", or have 44x "select" this, rather than listing all 
arches where it's optional.

> +	depends on 44x || E500 || 6xx
>  	select RESOURCES_64BIT
>  	default y if 44x
>  	---help---
>  	  This option enables kernel support for larger than 32-bit physical
> -	  addresses.  This features is not be available on all e500 cores.
> +	  addresses.  This features is not be available on all cores.

"This features is not be"?

-Scott

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-27 23:43 ` Scott Wood
@ 2008-08-28 15:36   ` Becky Bruce
  2008-08-28 16:07     ` Scott Wood
  2008-09-01  5:28   ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 15:36 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev


On Aug 27, 2008, at 6:43 PM, Scott Wood wrote:

> Becky Bruce wrote:
>> #if _PAGE_HASHPTE != 0
>> +#ifndef CONFIG_PTE_64BIT
>> 	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>> #else
>> +	/*
>> +	 * We have to do the write of the 64b pte as 2 stores.  This
>> +	 * code assumes that the entry we're storing to is currently
>> +	 * not valid and that all callers have the page table lock.
>> +	 * Having the entry be not valid protects readers who might read
>> +	 * between the first and second stores.
>> +	 */
>> +	unsigned int tmp;
>> +
>> +	__asm__ __volatile__("\
>> +1:	lwarx	%0,0,%4\n\
>> +	rlwimi	%L2,%0,0,30,30\n\
>> +	stw	%2,0(%3)\n\
>> +	eieio\n\
>> +	stwcx.  %L2,0,%4\n\
>> +	bne-	1b"
>> +	: "=&r" (tmp), "=m" (*ptep)
>> +	: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4),  
>> "m" (*ptep)
>> +	: "cc");
>> +#endif	/* CONFIG_PTE_64BIT */
>
> Could the stw to the same reservation granule as the stwcx cancel  
> the reservation on some implementations?

Not on the same processor.

> Plus, if you're assuming that the entry is currently invalid and all
> callers have the page table lock, do we need the lwarx/stwcx at  
> all?  At the least, it should check PTE_ATOMIC_UPDATES.

I'm pretty sure I went through this in great detail at one point and  
concluded that I did in fact need the lwarx/stwcx.  IIRC, it has to do  
with other non-set_pte_at writers not necessarily holding the page  
table lock. FYI, the existing 32-bit PTE code is doing atomic updates  
as well.

About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page  
table implementations require atomic updates.  Adding it in would  
create another clause in that code, because I would still need to  
order the operations with a 64-bit PTE and I can't call pte_update as  
it only changes the low word of the pte.   I wasn't feeling too keen  
on adding untested pagetable code into the kernel :)  I can add it if  
the peanut gallery wants it, but I'll be marking it with a big fat  
"BUYER BEWARE".


>
>
> %2 should be "+&r", not "r".
>
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/ 
>> platforms/Kconfig.cputype
>> index 7f65127..ca5b58b 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>>  config PTE_64BIT
>> 	bool
>> -	depends on 44x || E500
>> +	depends on 44x || E500 || 6xx
>> 	default y if 44x
>> -	default y if E500 && PHYS_64BIT
>> +	default y if PHYS_64BIT
>
> How is this different from PHYS_64BIT?

One is the width of the PTE and one is the width of a physical  
address.  It's entirely plausible to have a 64-bit PTE because you  
have a bunch of status bits, and only have 32-bit physical  
addressing.  That's why there are 2 options.
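
A rough illustration of the two knobs (the pte_basic_t half is from the
patch's page_32.h hunk; the phys_addr_t half is an approximation of what
the powerpc headers do, shown only to make the point that the two widths
are chosen independently):

/* Width of a Linux PTE */
#ifdef CONFIG_PTE_64BIT
typedef unsigned long long pte_basic_t;
#else
typedef unsigned long pte_basic_t;
#endif

/* Width of a physical address */
#ifdef CONFIG_PHYS_64BIT
typedef u64 phys_addr_t;
#else
typedef u32 phys_addr_t;
#endif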

>
>
>> config PHYS_64BIT
>> -	bool 'Large physical address support' if E500
>> -	depends on 44x || E500
>> +	bool 'Large physical address support' if E500 || 6xx
>
> Maybe "if !44x", or have 44x "select" this, rather than listing all  
> arches where it's optional.

Not sure exactly what you're suggesting here........

>
>
>> +	depends on 44x || E500 || 6xx
>> 	select RESOURCES_64BIT
>> 	default y if 44x
>> 	---help---
>> 	  This option enables kernel support for larger than 32-bit physical
>> -	  addresses.  This features is not be available on all e500 cores.
>> +	  addresses.  This features is not be available on all cores.
>
> "This features is not be"?

Heh, I didn't type that :)  But I can fix it.

Thanks,
B

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 15:36   ` Becky Bruce
@ 2008-08-28 16:07     ` Scott Wood
  2008-08-28 19:37       ` Becky Bruce
  0 siblings, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-28 16:07 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev

Becky Bruce wrote:
> I'm pretty sure I went through this in great detail at one point and 
> concluded that I did in fact need the lwarx/stwcx.  IIRC, it has to do 
> with other non-set_pte_at writers not necessarily holding the page table 
> lock. FYI, the existing 32-bit PTE code is doing atomic updates as well.

But will those updates happen if there isn't already a valid PTE?

> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page table 
> implementations require atomic updates.

Right, I misread it and thought it was being used for non-hashed 
implementations as well.

What happens if you enable 64-bit PTEs on a 603-ish CPU?  The kconfig 
seems to allow it.

>  Adding it in would create 
> another clause in that code, because I would still need to order the 
> operations with a 64-bit PTE and I can't call pte_update as it only 
> changes the low word of the pte.   I wasn't feeling too keen on adding 
> untested pagetable code into the kernel :)

Wimp. :-)

 > I can add it if the peanut
> gallery wants it, but I'll be marking it with a big fat "BUYER BEWARE".

No, there's little point if nothing selects it (or is planned to in the 
near future).

>>> diff --git a/arch/powerpc/platforms/Kconfig.cputype 
>>> b/arch/powerpc/platforms/Kconfig.cputype
>>> index 7f65127..ca5b58b 100644
>>> --- a/arch/powerpc/platforms/Kconfig.cputype
>>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>>>  config PTE_64BIT
>>>     bool
>>> -    depends on 44x || E500
>>> +    depends on 44x || E500 || 6xx
>>>     default y if 44x
>>> -    default y if E500 && PHYS_64BIT
>>> +    default y if PHYS_64BIT
>>
>> How is this different from PHYS_64BIT?
> 
> One is the width of the PTE and one is the width of a physical address.  
> It's entirely plausible to have a 64-bit PTE because you have a bunch of 
> status bits, and only have 32-bit physical addressing.  That's why there 
> are 2 options.

Right, I just didn't see anything that actually selects it independently 
of PHYS_64BIT.  Is this something that's expected to actually happen in 
the future?

The "default y if 44x" line is redundant with the "default y if PHYS_64BIT".

>>> config PHYS_64BIT
>>> -    bool 'Large physical address support' if E500
>>> -    depends on 44x || E500
>>> +    bool 'Large physical address support' if E500 || 6xx
>>
>> Maybe "if !44x", or have 44x "select" this, rather than listing all 
>> arches where it's optional.
> 
> Not sure exactly what you're suggesting here........

It just seems simpler to not conditionalize the bool, but instead have 
CONFIG_44x do "select PHYS_64BIT".  I'd rather avoid another list of 
platforms accumulating in a kconfig dependency.

>>> +    depends on 44x || E500 || 6xx
>>>     select RESOURCES_64BIT
>>>     default y if 44x
>>>     ---help---
>>>       This option enables kernel support for larger than 32-bit physical
>>> -      addresses.  This features is not be available on all e500 cores.
>>> +      addresses.  This features is not be available on all cores.
>>
>> "This features is not be"?
> 
> Heh, I didn't type that :)  But I can fix it.

You didn't type it, but you touched it.  Tag, you're it. :-)

-Scott

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 16:07     ` Scott Wood
@ 2008-08-28 19:37       ` Becky Bruce
  2008-08-28 20:28         ` Scott Wood
  2008-08-28 22:42         ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 19:37 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev list


On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:

> Becky Bruce wrote:
>> I'm pretty sure I went through this in great detail at one point  
>> and concluded that I did in fact need the lwarx/stwcx.  IIRC, it  
>> has to do with other non-set_pte_at writers not necessarily holding  
>> the page table lock. FYI, the existing 32-bit PTE code is doing  
>> atomic updates as well.
>
> But will those updates happen if there isn't already a valid PTE?

I understand what you're saying, I've been here before :)  However, I  
was never able to convince myself that it's safe without the lwarx/ 
stwcx.  There's hashing code that wanks around with the HASHPTE bit  
doing a RMW without holding any lock (other than lwarx/stwcx-ing the  
PTE itself).  And there's definitely code that's fairly far removed  
from the last time you checked that an entry was valid.  I've CC'd Ben  
on this mail - perhaps he can shed some light.  If we don't need the  
atomics, I'm happy to yank them.

Now, it *does* seem like set_pte_at could be optimized for the non-SMP  
case....  I'll have to chew on that one a bit.

>
>
>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page  
>> table implementations require atomic updates.
>
> Right, I misread it and thought it was being used for non-hashed  
> implementations as well.
>
> What happens if you enable 64-bit PTEs on a 603-ish CPU?  The  
> kconfig seems to allow it.

Don't do that :)  That's why the help is there in the Kconfig.   
Otherwise, I have to list out every 74xx part that supports 36-bit  
physical addressing.  In any event, nothing interesting will happen  
other than that you'll waste some space.  The kernel boots fine with a  
non-36b physical u-boot and small amounts of RAM.

>
>
>> Adding it in would create another clause in that code, because I  
>> would still need to order the operations with a 64-bit PTE and I  
>> can't call pte_update as it only changes the low word of the pte.    
>> I wasn't feeling too keen on adding untested pagetable code into  
>> the kernel :)
>
> Wimp. :-)

Pansy :)

>
>
> > I can add it if the peanut
>> gallery wants it, but I'll be marking it with a big fat "BUYER  
>> BEWARE".
>
> No, there's little point if nothing selects it (or is planned to in  
> the near future).

Good :)

>
>
>>>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/ 
>>>> powerpc/platforms/Kconfig.cputype
>>>> index 7f65127..ca5b58b 100644
>>>> --- a/arch/powerpc/platforms/Kconfig.cputype
>>>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>>>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>>>> config PTE_64BIT
>>>>    bool
>>>> -    depends on 44x || E500
>>>> +    depends on 44x || E500 || 6xx
>>>>    default y if 44x
>>>> -    default y if E500 && PHYS_64BIT
>>>> +    default y if PHYS_64BIT
>>>
>>> How is this different from PHYS_64BIT?
>> One is the width of the PTE and one is the width of a physical  
>> address.  It's entirely plausible to have a 64-bit PTE because you  
>> have a bunch of status bits, and only have 32-bit physical  
>> addressing.  That's why there are 2 options.
>
> Right, I just didn't see anything that actually selects it  
> independently of PHYS_64BIT.  Is this something that's expected to  
> actually happen in the future?

There's been talk of making the PTEs always 64-bit even on 32-bit  
platforms.  So it's entirely plausible.

>
>
> The "default y if 44x" line is redundant with the "default y if  
> PHYS_64BIT".

You're right, I'll clean that up.

>
>
>>>> config PHYS_64BIT
>>>> -    bool 'Large physical address support' if E500
>>>> -    depends on 44x || E500
>>>> +    bool 'Large physical address support' if E500 || 6xx
>>>
>>> Maybe "if !44x", or have 44x "select" this, rather than listing  
>>> all arches where it's optional.
>> Not sure exactly what you're suggesting here........
>
> It just seems simpler to not conditionalize the bool, but instead  
> have CONFIG_44x do "select PHYS_64BIT".  I'd rather avoid another  
> list of platforms accumulating in a kconfig dependency.

I'm still not sure where you're going with this - I can remove 44x  
from the conditional part, but we still have to deal with e500 and  
6xx.  In which case you're now setting this in different places for  
difft plats, making it potentially harder to read.  Unless you're  
suggesting allowing the selection of PHYS_64BIT on any platform (in  
which case your comment about what happens if I select this on 603  
doesn't make much sense).....

>
>
>>>> +    depends on 44x || E500 || 6xx
>>>>    select RESOURCES_64BIT
>>>>    default y if 44x
>>>>    ---help---
>>>>      This option enables kernel support for larger than 32-bit  
>>>> physical
>>>> -      addresses.  This features is not be available on all e500  
>>>> cores.
>>>> +      addresses.  This features is not be available on all cores.
>>>
>>> "This features is not be"?
>> Heh, I didn't type that :)  But I can fix it.
>
> You didn't type it, but you touched it.  Tag, you're it. :-)

It's already fixed in my tree  :)

-B

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 19:37       ` Becky Bruce
@ 2008-08-28 20:28         ` Scott Wood
  2008-08-28 21:13           ` Becky Bruce
  2008-08-28 22:42         ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-28 20:28 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev list

Becky Bruce wrote:
> On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:
> 
>> Becky Bruce wrote:
>>> I'm pretty sure I went through this in great detail at one point
>>> and concluded that I did in fact need the lwarx/stwcx.  IIRC, it
>>> has to do with other non-set_pte_at writers not necessarily
>>> holding the page table lock. FYI, the existing 32-bit PTE code is
>>> doing atomic updates as well.
>> 
>> But will those updates happen if there isn't already a valid PTE?
> 
> I understand what you're saying, I've been here before :)  However, I
>  was never able to convince myself that it's safe without the 
> lwarx/stwcx.  There's hashing code that wanks around with the HASHPTE
>  bit doing a RMW without holding any lock (other than lwarx/stwcx-ing
> the PTE itself).

OK.  I was concerned not just about efficiency, but also about the safety
of the "stw" write if there were other modifications going on (even if the
set_pte_at stwcx. fails, the other updater could have lwarx'd and
successfully stwcx'd after the stw and ended up with a mixed PTE), but it
may not be an issue depending on the nature of the updates.

>>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed
>>> page table implementations require atomic updates.
>> 
>> Right, I misread it and thought it was being used for non-hashed 
>> implementations as well.
>> 
>> What happens if you enable 64-bit PTEs on a 603-ish CPU?  The
>> kconfig seems to allow it.
> 
> Don't do that :)  That's why the help is there in the Kconfig. 

People will do it anyway -- and there's multiplatform to consider.

> Otherwise, I have to list out every 74xx part that supports 36-bit 
> physical addressing.  In any event, nothing interesting will happen 
> other than that you'll waste some space.  The kernel boots fine with
> a non-36b physical u-boot and small amounts of RAM.

My concern was the generic code trying to use 64-bit PTEs, and the 603 
TLB miss handlers continuing to assume that the PTEs are 32-bit, and 
loading the wrong thing.

Wasted space alone is an acceptable consequence of turning on things you 
don't need. :-)

> I'm still not sure where you're going with this - I can remove 44x
> from the conditional part, but we still have to deal with e500 and
> 6xx.

You still need it in "depends" (in the absence of a "PHYS_64BIT_CAPABLE" 
or some such), but not "bool '...' if".  It's not a big deal, just a pet 
peeve.

> In which case you're now setting this in different places for difft
> plats, making it potentially harder to read.  Unless you're
> suggesting allowing the selection of PHYS_64BIT on any platform

No, unless the code for all platforms makes it safe to do so.

-Scott

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 20:28         ` Scott Wood
@ 2008-08-28 21:13           ` Becky Bruce
  0 siblings, 0 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 21:13 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev list

Great, so *you* got my email, and I did not.  I love our mailserver!

On Aug 28, 2008, at 3:28 PM, Scott Wood wrote:

> Becky Bruce wrote:
>> On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:
>>> Becky Bruce wrote:
>>>> I'm pretty sure I went through this in great detail at one point
>>>> and concluded that I did in fact need the lwarx/stwcx.  IIRC, it
>>>> has to do with other non-set_pte_at writers not necessarily
>>>> holding the page table lock. FYI, the existing 32-bit PTE code is
>>>> doing atomic updates as well.
>>> But will those updates happen if there isn't already a valid PTE?
>> I understand what you're saying, I've been here before :)  However, I
>> was never able to convince myself that it's safe without the lwarx/ 
>> stwcx.  There's hashing code that wanks around with the HASHPTE
>> bit doing a RMW without holding any lock (other than lwarx/stwcx-ing
>> the PTE itself).
>
> OK.  I was concerned not just about efficiency, but also about the
> safety of the "stw" write if there were other modifications going on
> (even if the set_pte_at stwcx. fails, the other updater could have
> lwarx'd and successfully stwcx'd after the stw and ended up with a
> mixed PTE), but it may not be an issue depending on the nature of the
> updates.

It shouldn't be an issue - set_pte_at() is the only writer of the high  
bits of the PTE, the pte is invalid upon entry to set_pte_at(), and  
none of the potential interfering writers should be turning on the  
valid bit.

>
>
>>>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed
>>>> page table implementations require atomic updates.
>>> Right, I misread it and thought it was being used for non-hashed  
>>> implementations as well.
>>> What happens if you enable 64-bit PTEs on a 603-ish CPU?  The
>>> kconfig seems to allow it.
>> Don't do that :)  That's why the help is there in the Kconfig.
>
> People will do it anyway -- and there's multiplatform to consider.

>
>
>> Otherwise, I have to list out every 74xx part that supports 36-bit  
>> physical addressing.  In any event, nothing interesting will happen  
>> other than that you'll waste some space.  The kernel boots fine with
>> a non-36b physical u-boot and small amounts of RAM.
>
> My concern was the generic code trying to use 64-bit PTEs, and the  
> 603 TLB miss handlers continuing to assume that the PTEs are 32-bit,  
> and loading the wrong thing.
>
> Wasted space alone is an acceptable consequence of turning on things  
> you don't need. :-)

Actually, I'm lying to you - I forgot about the old parts with DTLB/
ITLB handlers.  That code will actually break, and I'd rather not hack
it up to pointlessly accommodate large PTEs.  I do need to fix this in
the Kconfig, even though it's going to be gross.  Thanks for pointing
this out.

>
>
>> I'm still not sure where you're going with this - I can remove 44x
>> from the conditional part, but we still have to deal with e500 and
>> 6xx.
>
> You still need it in "depends" (in the absence of a  
> "PHYS_64BIT_CAPABLE" or some such), but not "bool '...' if".  It's  
> not a big deal, just a pet peeve.

I'll look at making it less peevy :)

>
>
>> In which case you're now setting this in different places for difft
>> plats, making it potentially harder to read.  Unless you're
>> suggesting allowing the selection of PHYS_64BIT on any platform
>
> No, unless the code for all platforms makes it safe to do so.
>

Thanks!
-Becky

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 19:37       ` Becky Bruce
  2008-08-28 20:28         ` Scott Wood
@ 2008-08-28 22:42         ` Benjamin Herrenschmidt
  2008-08-30 16:24           ` Scott Wood
  1 sibling, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-08-28 22:42 UTC (permalink / raw)
  To: Becky Bruce; +Cc: Scott Wood, linuxppc-dev list


> I understand what you're saying, I've been here before :)  However, I  
> was never able to convince myself that it's safe without the lwarx/ 
> stwcx.  There's hashing code that wanks around with the HASHPTE bit  
> doing a RMW without holding any lock (other than lwarx/stwcx-ing the  
> PTE itself).  And there's definitely code that's fairly far removed  
> from the last time you checked that an entry was valid.  I've CC'd Ben  
> on this mail - perhaps he can shed some light.  If we don't need the  
> atomics, I'm happy to yank them.
> 
> Now, it *does* seem like set_pte_at could be optimized for the non-SMP  
> case....  I'll have to chew on that one a bit.

I haven't read the whole discussion nor reviewed the patches yet, so I'm
just answering the above sentence before I head off for the week-end
(and yes, Becky, I _DO_ have reviewing your stuff high on my todo list,
but I've been really swamped these last 2 weeks).

So all generic code always accesses PTEs with the PTE lock held (that
lock can be in various places ... typically for us it's one per PTE
page).

44x and FSL BookE no longer write to the PTEs without that lock anymore
and thus don't require atomic access in set_pte and friends.

Hash based platforms still do because of -one- thing: the hashing code
proper which writes back using lwarx/stwcx. to update _PAGE_ACCESSED,
_PAGE_HASHPTE and _PAGE_DIRTY. The hash code does take a global lock to
avoid double-hashing of the same page but this isn't something we should
use elsewhere.

So on hash based platforms, updates of the PTEs are expected to be done
atomically. Now if you extend the size of the PTE, hopefully those
atomic updates are still necessary but only for the -low- part of the
PTE that contains those bits.
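
As a concrete (if simplified) sketch of that kind of low-word atomic
update - roughly what pte_update() in pgtable-ppc32.h does, with the 405
erratum workaround and the 64-bit-PTE variant (which points the
lwarx/stwcx. at the flags word) omitted:

static inline unsigned long pte_update(pte_t *p, unsigned long clr,
				       unsigned long set)
{
	unsigned long old, tmp;

	/* Atomic RMW of the 32-bit flags word only; the high (physical
	 * address) word is never touched here */
	__asm__ __volatile__("\
1:	lwarx	%0,0,%3\n\
	andc	%1,%0,%4\n\
	or	%1,%1,%5\n\
	stwcx.	%1,0,%3\n\
	bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*p)
	: "r" (p), "r" (clr), "r" (set), "m" (*p)
	: "cc");

	return old;
}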

For the non-SMP case, I think it should be possible to optimize it. The
only thing that can happen at interrupt time is hashing of kernel or
vmalloc/ioremap pages, which shouldn't compete with set_pte on those
pages, so there would be no access races there, but I may be missing
something as it's the morning and I about just woke up :-)

Cheers,
Ben.

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-28 22:42         ` Benjamin Herrenschmidt
@ 2008-08-30 16:24           ` Scott Wood
  2008-08-31  8:22             ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-30 16:24 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list

On Fri, Aug 29, 2008 at 08:42:01AM +1000, Benjamin Herrenschmidt wrote:
> For the non-SMP case, I think it should be possible to optimize it. The
> only thing that can happen at interrupt time is hashing of kernel or
> vmalloc/ioremap pages, which shouldn't compete with set_pte on those
> pages, so there would be no access races there, but I may be missing
> something as it's the morning and I about just woke up :-)

Is that still true with preemptible kernels?

-Scott

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-30 16:24           ` Scott Wood
@ 2008-08-31  8:22             ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-08-31  8:22 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev list

On Sat, 2008-08-30 at 11:24 -0500, Scott Wood wrote:
> On Fri, Aug 29, 2008 at 08:42:01AM +1000, Benjamin Herrenschmidt wrote:
> > For the non-SMP case, I think it should be possible to optimize it. The
> > only thing that can happen at interrupt time is hashing of kernel or
> > vmalloc/ioremap pages, which shouldn't compete with set_pte on those
> > pages, so there would be no access races there, but I may be missing
> > something as it's the morning and I about just woke up :-)
> 
> Is that still true with preemptible kernels?

Those shouldn't be an issue as long as we can't preempt while holding a
spinlock and we do hold the pte lock on any modification... Of course,
-rt is a different matter.

Ben.

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-27 22:38 [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical Becky Bruce
  2008-08-27 23:43 ` Scott Wood
@ 2008-09-01  5:19 ` Benjamin Herrenschmidt
  2008-09-02 16:19   ` Becky Bruce
  1 sibling, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-01  5:19 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev


> +#ifdef CONFIG_PTE_64BIT
> +#define PTE_FLAGS_OFFSET	4	/* offset of PTE flags, in bytes */
> +#define LNX_PTE_SIZE		8	/* size of a linux PTE, in bytes */
> +#else
> +#define PTE_FLAGS_OFFSET	0
> +#define LNX_PTE_SIZE		4
> +#endif

s/LNX_PTE_SIZE/PTE_BYTES or PTE_SIZE, no need for that horrible LNX
prefix. In fact, if it's only used by the asm, then ditch it and
have asm-offsets.c create something based on sizeof(pte_t).

>  #define pte_none(pte)		((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
>  #define pte_present(pte)	(pte_val(pte) & _PAGE_PRESENT)
> -#define pte_clear(mm,addr,ptep)	do { pte_update(ptep, ~0, 0); } while (0)
> +#define pte_clear(mm, addr, ptep) \
> +	do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)

Where does this previous definition of pte_clear come from?  It's bogus
(and it's not like that upstream).  Your "updated" one looks OK though.

But whatever tree has the "previous" one would break hash based ppc32
if merged - or are there other related changes in Kumar's tree that
make the above safe?
 
>  #define pmd_none(pmd)		(!pmd_val(pmd))
>  #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
> @@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
>  			      pte_t *ptep, pte_t pte)
>  {
>  #if _PAGE_HASHPTE != 0
> +#ifndef CONFIG_PTE_64BIT
>  	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>  #else
> +	/*
> +	 * We have to do the write of the 64b pte as 2 stores.  This
> +	 * code assumes that the entry we're storing to is currently
> +	 * not valid and that all callers have the page table lock.
> +	 * Having the entry be not valid protects readers who might read
> +	 * between the first and second stores.
> +	 */
> +	unsigned int tmp;

Do you know for sure the entry isn't valid?  On ppc64, we explicitly
test for that and if it was valid, we clear and flush it.  The generic
code has been going on and off about calling set_pte_at() on an already
valid PTE; I wouldn't rely too much on it guaranteeing it will not
happen.  The 32b PTE code is safe because it preserves _PAGE_HASHPTE.

Note also that once you have (which you don't now) the guarantee
that your previous PTE is invalid, then you can safely do two normal
stores instead of a lwarx/stwcx. loop. In any case, having the stw in
the middle of the loop doesn't sound very useful.

> +	__asm__ __volatile__("\
> +1:	lwarx	%0,0,%4\n\
> +	rlwimi	%L2,%0,0,30,30\n\
> +	stw	%2,0(%3)\n\
> +	eieio\n\
> +	stwcx.  %L2,0,%4\n\
> +	bne-	1b"
> +	: "=&r" (tmp), "=m" (*ptep)
> +	: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
> +	: "cc");
> +#endif	/* CONFIG_PTE_64BIT */
> +#else /* _PAGE_HASHPTE == 0 */
>  #if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
>  	__asm__ __volatile__("\
>  		stw%U0%X0 %2,%0\n\
> diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
> index b9ba7d9..d63e20a 100644
> --- a/arch/powerpc/mm/hash_low_32.S
> +++ b/arch/powerpc/mm/hash_low_32.S
> @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
>   * Returns to the caller if the access is illegal or there is no
>   * mapping for the address.  Otherwise it places an appropriate PTE
>   * in the hash table and returns from the exception.
> - * Uses r0, r3 - r8, ctr, lr.
> + * Uses r0, r3 - r8, r10, ctr, lr.
>   */
>  	.text
>  _GLOBAL(hash_page)
> @@ -106,9 +106,15 @@ _GLOBAL(hash_page)
>  	addi	r5,r5,swapper_pg_dir@l	/* kernel page table */
>  	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
>  112:	add	r5,r5,r7		/* convert to phys addr */
> +#ifndef CONFIG_PTE_64BIT
>  	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
>  	lwz	r8,0(r5)		/* get pmd entry */
>  	rlwinm.	r8,r8,0,0,19		/* extract address of pte page */
> +#else
> +	rlwinm	r8,r4,13,19,29		/* Compute pgdir/pmd offset */
> +	lwzx	r8,r8,r5		/* Get L1 entry */
> +	rlwinm. r8,r8,0,0,20		/* extract pt base address */
> +#endif

Any reason you wrote the above using a different technique?  If you
believe that rlwinm/lwzx is going to be more efficient than rlwimi/lwz,
maybe we should fix the old one ... or am I missing something totally
obvious? :-)

Cheers,
Ben.

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-08-27 23:43 ` Scott Wood
  2008-08-28 15:36   ` Becky Bruce
@ 2008-09-01  5:28   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-01  5:28 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev


> Could the stw to the same reservation granule as the stwcx cancel the 
> reservation on some implementations?

It might I suppose ... In any case, see my replies to Becky.

>   Plus, if you're assuming that the 
> entry is currently invalid and all callers have the page table lock, do 
> we need the lwarx/stwcx at all?  At the least, it should check 
> PTE_ATOMIC_UPDATES.

It shouldn't need atomic operations -if- the current entry is invalid
-and- _PAGE_HASHPTE is clear. (Ie, the current entry is invalid -and- 
the hash table has been updated).

In any case, the stw can just move out of the loop.

It might be worth just doing something along the lines of

	if (pte_val(*ptep) & _PAGE_PRESENT)
		pte_clear(pte);
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(mm, ptep, addr);
	asm v. { "stw ; eieio; stw" };

Cheers,
Ben.

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-09-01  5:19 ` Benjamin Herrenschmidt
@ 2008-09-02 16:19   ` Becky Bruce
  2008-09-02 21:21     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-09-02 16:19 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev


On Sep 1, 2008, at 12:19 AM, Benjamin Herrenschmidt wrote:

Thanks for taking the time to dig through this :)

>
>> +#ifdef CONFIG_PTE_64BIT
>> +#define PTE_FLAGS_OFFSET	4	/* offset of PTE flags, in bytes */
>> +#define LNX_PTE_SIZE		8	/* size of a linux PTE, in bytes */
>> +#else
>> +#define PTE_FLAGS_OFFSET	0
>> +#define LNX_PTE_SIZE		4
>> +#endif
>
> s/LNX_PTE_SIZE/PTE_BYTES or PTE_SIZE, no need for that horrible LNX
> prefix. In fact, if it's only used by the asm, then ditch it and
> have asm-offsets.c create something based on sizeof(pte_t).

The reason I did that was to distinguish between the size of a  
software PTE entry, and the actual size of the hardware entry.  In the  
case of 36b physical, the hardware PTE size is the same as it always  
is; but we've grown the Linux PTE.  We can call it something else, or  
I can change as you suggest; I was worried that the next person to  
come along might get confused.  I suppose there aren't too many of us  
that are crazy enough to muck around in this code, so maybe it's not  
that big of a deal.

>
>
>> #define pte_none(pte)		((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
>> #define pte_present(pte)	(pte_val(pte) & _PAGE_PRESENT)
>> -#define pte_clear(mm,addr,ptep)	do { pte_update(ptep, ~0, 0); }  
>> while (0)
>> +#define pte_clear(mm, addr, ptep) \
>> +	do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
>
> Where does this previous definition of pte_clear come from?  It's
> bogus (and it's not like that upstream).  Your "updated" one looks OK
> though.
>
> But whatever tree has the "previous" one would break hash based ppc32
> if merged - or are there other related changes in Kumar's tree that
> make the above safe?

That's from Kumar's tree of changes for BookE.... he'll need to answer  
that.  I think he put out a patchset with that work; not sure if it  
was correct in this particular or not.

>
>
>> #define pmd_none(pmd)		(!pmd_val(pmd))
>> #define	pmd_bad(pmd)		(pmd_val(pmd) & _PMD_BAD)
>> @@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct  
>> *mm, unsigned long addr,
>> 			      pte_t *ptep, pte_t pte)
>> {
>> #if _PAGE_HASHPTE != 0
>> +#ifndef CONFIG_PTE_64BIT
>> 	pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>> #else
>> +	/*
>> +	 * We have to do the write of the 64b pte as 2 stores.  This
>> +	 * code assumes that the entry we're storing to is currently
>> +	 * not valid and that all callers have the page table lock.
>> +	 * Having the entry be not valid protects readers who might read
>> +	 * between the first and second stores.
>> +	 */
>> +	unsigned int tmp;
>
> Do you know for sure the entry isn't valid?  On ppc64, we explicitly
> test for that and if it was valid, we clear and flush it.  The generic
> code has been going on and off about calling set_pte_at() on an
> already valid PTE; I wouldn't rely too much on it guaranteeing it will
> not happen.  The 32b PTE code is safe because it preserves
> _PAGE_HASHPTE.

IIRC, we discussed this at some point, months ago, and decided this  
was the case. If it *can* be valid, and it's possible to have a reader  
on another cpu, I think we end up having to go through a very gross  
sequence where we have to mark it invalid before we start mucking  
around with it; otherwise a reader can get half-and-half.  Perhaps I'm  
missing a key restriction here?

>
>
> Note also that once you have (which you don't now) the guarantee
> that your previous PTE is invalid, then you can safely do two normal
> stores instead of a lwarx/stwcx. loop. In any case, having the stw in
> the middle of the loop doesn't sound very useful.

If the pte is invalid, and we're SMP, we need the store to the high  
word to occur before the stwcx that sets the valid bit, unless you're  
certain we can't have a reader on another cpu.  If we're not SMP, then  
I agree, let's move that store.

I thought we discussed at some point the possibility that there might  
be an updater on another CPU?  Is this not possible?  If not, I'll  
change it.  The existing code is also lwarx/stwcxing via pte_update();  
is there no reason for that?  We should probably change that as well,  
if that's the case.

>
>
>> +	__asm__ __volatile__("\
>> +1:	lwarx	%0,0,%4\n\
>> +	rlwimi	%L2,%0,0,30,30\n\
>> +	stw	%2,0(%3)\n\
>> +	eieio\n\
>> +	stwcx.  %L2,0,%4\n\
>> +	bne-	1b"
>> +	: "=&r" (tmp), "=m" (*ptep)
>> +	: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4),  
>> "m" (*ptep)
>> +	: "cc");
>> +#endif	/* CONFIG_PTE_64BIT */
>> +#else /* _PAGE_HASHPTE == 0 */
>> #if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
>> 	__asm__ __volatile__("\
>> 		stw%U0%X0 %2,%0\n\
>> diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/ 
>> hash_low_32.S
>> index b9ba7d9..d63e20a 100644
>> --- a/arch/powerpc/mm/hash_low_32.S
>> +++ b/arch/powerpc/mm/hash_low_32.S
>> @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
>>  * Returns to the caller if the access is illegal or there is no
>>  * mapping for the address.  Otherwise it places an appropriate PTE
>>  * in the hash table and returns from the exception.
>> - * Uses r0, r3 - r8, ctr, lr.
>> + * Uses r0, r3 - r8, r10, ctr, lr.
>>  */
>> 	.text
>> _GLOBAL(hash_page)
>> @@ -106,9 +106,15 @@ _GLOBAL(hash_page)
>> 	addi	r5,r5,swapper_pg_dir@l	/* kernel page table */
>> 	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
>> 112:	add	r5,r5,r7		/* convert to phys addr */
>> +#ifndef CONFIG_PTE_64BIT
>> 	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
>> 	lwz	r8,0(r5)		/* get pmd entry */
>> 	rlwinm.	r8,r8,0,0,19		/* extract address of pte page */
>> +#else
>> +	rlwinm	r8,r4,13,19,29		/* Compute pgdir/pmd offset */
>> +	lwzx	r8,r8,r5		/* Get L1 entry */
>> +	rlwinm. r8,r8,0,0,20		/* extract pt base address */
>> +#endif
>
> Any reason you wrote the above using a different technique?  If you
> believe that rlwinm/lwzx is going to be more efficient than rlwimi/lwz,
> maybe we should fix the old one ... or am I missing something totally
> obvious? :-)

Nothing so exciting there - I borrowed this code from the BookE  
version.  I'm happy to change it.

Thanks, Ben!

Cheers,
Becky

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-09-02 16:19   ` Becky Bruce
@ 2008-09-02 21:21     ` Benjamin Herrenschmidt
  2008-09-03 15:10       ` Becky Bruce
  0 siblings, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-02 21:21 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev


> The reason I did that was to distinguish between the size of a  
> software PTE entry, and the actual size of the hardware entry.  In the  
> case of 36b physical, the hardware PTE size is the same as it always  
> is; but we've grown the Linux PTE.  We can call it something else, or  
> I can change as you suggest; I was worried that the next person to  
> come along might get confused.  I suppose there aren't too many of us  
> that are crazy enough to muck around in this code, so maybe it's not  
> that big of a deal.

We already called "PTE" the linux entry everywhere so hopefully there
should be no confusion. We call "HPTE" the hash table ones.

> That's from Kumar's tree of changes for BookE.... he'll need to answer  
> that.  I think he put out a patchset with that work; not sure if it  
> was correct in this particular or not.

Well, unless he did some changes in the area of hash flushing, linux
code must -never- clear _PAGE_HASHPTE on 32 bits. Only the assembly hash
flushing code will do it while also flushing the hash table.

> IIRC, we discussed this at some point, months ago, and decided this  
> was the case. If it *can* be valid, and it's possible to have a reader  
> on another cpu, I think we end up having to go through a very gross  
> sequence where we have to mark it invalid before we start mucking  
> around with it; otherwise a reader can get half-and-half.  Perhaps I'm  
> missing a key restriction here?

No that's correct. You can go out of line with something like

	if (unlikely(pte_valid(*ptep))) {
		pte_clear(ptep);
		eieio(); /* make sure above is visible before
		          * further stw to high word is */
	}
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(ptep, addr) /* or whatever proto */

I'd rather be safe than sorry in that area. ppc64 does something akin
to the above.
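
Put together with the ordered pair of plain stores already used by the
non-hashed 64-bit case, the hashed CONFIG_PTE_64BIT branch of
set_pte_at() might then look roughly like this (a sketch only;
flush_hash_entry()'s prototype is assumed, as noted above):

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pte)
{
	/* Out-of-line safety net: if the old entry was somehow valid,
	 * invalidate it first so nobody can see a mixed high/low pair */
	if (unlikely(pte_val(*ptep) & _PAGE_PRESENT)) {
		pte_clear(mm, addr, ptep);
		__asm__ __volatile__("eieio" : : : "memory");
	}
	/* If the old entry was hashed, flush the hash table entry too
	 * (prototype per the sketch above) */
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(mm, ptep, addr);

	/* Entry is now invalid and unhashed: two ordered plain stores,
	 * high word first; the valid bit arrives with the second store */
	__asm__ __volatile__("\
		stw%U0%X0 %2,%0\n\
		eieio\n\
		stw%U0%X0 %L2,%1"
	: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
	: "r" (pte) : "memory");
}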

> If the pte is invalid, and we're SMP, we need the store to the high  
> word to occur before the stwcx that sets the valid bit, unless you're  
> certain we can't have a reader on another cpu.  If we're not SMP, then  
> I agree, let's move that store.

Yes, the stores must occur in order, but none of them need to be a
stwcx. if your PTE cannot be modified from underneath you.  In any
case, the store to the high word, which isn't atomic, doesn't have to
be in the loop; it should just be before the loop.

> I thought we discussed at some point the possibility that there might  
> be an updater on another CPU?  Is this not possible?  If not, I'll  
> change it.  The existing code is also lwarx/stwcxing via pte_update();  
> is there no reason for that?  We should probably change that as well,  
> if that's the case.

You are holding the PTE lock, so the only code paths that might
eventually perform an update are the hash miss and the hash flush.  The
former will never happen if your PTE is invalid.  The latter will never
happen if your PTE has _PAGE_HASHPTE clear, which it should have if you
do the sequence above properly and if I didn't miss anything (which is
always possible :-)

There is indeed the chance that your PTE -will- be concurrently modified
without a lock by an invalidation if you have _PAGE_HASHPTE set and you
don't first ensure the hash has been flushed. In that case, you do need
an atomic operation. Just move the stw to the top word to be before the
loop then (and still keep the pte_clear bit).

Cheers,
Ben.

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-09-02 21:21     ` Benjamin Herrenschmidt
@ 2008-09-03 15:10       ` Becky Bruce
  2008-09-04  2:53         ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-09-03 15:10 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev


On Sep 2, 2008, at 4:21 PM, Benjamin Herrenschmidt wrote:

>
>> The reason I did that was to distinguish between the size of a
>> software PTE entry, and the actual size of the hardware entry.  In  
>> the
>> case of 36b physical, the hardware PTE size is the same as it always
>> is; but we've grown the Linux PTE.  We can call it something else, or
>> I can change as you suggest; I was worried that the next person to
>> come along might get confused.  I suppose there aren't too many of us
>> that are crazy enough to muck around in this code, so maybe it's not
>> that big of a deal.
>
> We already called "PTE" the linux entry everywhere so hopefully there
> should be no confusion. We call "HPTE" the hash table ones.

Actually, PTE_SIZE is already defined in hash_low_32.S, and it's the  
size of the HPTE.  I'll rename that to HPTE_SIZE and use PTE_SIZE for  
the linux PTE.
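
In other words, something along these lines (a sketch of the renamed
constants; the hash-table entry stays 8 bytes either way, while the
Linux PTE is 8 bytes only with CONFIG_PTE_64BIT):

/* hash_low_32.S / page_32.h after the rename (sketch) */
#define HPTE_SIZE		8	/* size of a hardware hash-table PTE */

#ifdef CONFIG_PTE_64BIT
#define PTE_SIZE		8	/* size of a Linux PTE */
#define PTE_FLAGS_OFFSET	4	/* offset of the flags word */
#else
#define PTE_SIZE		4
#define PTE_FLAGS_OFFSET	0
#endif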

-Becky

* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
  2008-09-03 15:10       ` Becky Bruce
@ 2008-09-04  2:53         ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-04  2:53 UTC (permalink / raw)
  To: Becky Bruce; +Cc: linuxppc-dev

On Wed, 2008-09-03 at 10:10 -0500, Becky Bruce wrote:

> Actually, PTE_SIZE is already defined in hash_low_32.S, and it's the  
> size of the HPTE.  I'll rename that to HPTE_SIZE and use PTE_SIZE for  
> the linux PTE.

Good idea. Thanks.

Cheers,
Ben.
