* [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
@ 2008-08-27 22:38 Becky Bruce
2008-08-27 23:43 ` Scott Wood
2008-09-01 5:19 ` Benjamin Herrenschmidt
0 siblings, 2 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-27 22:38 UTC (permalink / raw)
To: linuxppc-dev
This rearranges a bit of code, and adds support for
36-bit physical addressing for configs that use a
hashed page table. The 36b physical support is not
enabled by default on any config - it must be
explicitly enabled via the config system.
This patch *only* expands the page table code to accommodate
large physical addresses on 32-bit systems and enables the
PHYS_64BIT config option for 6xx. It does *not*
allow you to boot a board with more than about 3.5GB of
RAM - for that, SWIOTLB support is also required (and
coming soon).
Signed-off-by: Becky Bruce <becky.bruce@freescale.com>
---
NOTE: This patch is now based against Kumar's mmu tree,
as we had conflicting edits. Kumar will pick this up into his
tree if it looks OK.
Cheers,
Becky
arch/powerpc/include/asm/io.h | 2 +-
arch/powerpc/include/asm/page_32.h | 10 ++++-
arch/powerpc/include/asm/pgtable-ppc32.h | 30 +++++++++++-
arch/powerpc/kernel/head_32.S | 4 +-
arch/powerpc/kernel/head_fsl_booke.S | 2 -
arch/powerpc/mm/hash_low_32.S | 78 +++++++++++++++++++++++------
arch/powerpc/mm/pgtable_32.c | 4 +-
arch/powerpc/platforms/Kconfig.cputype | 14 ++++--
8 files changed, 114 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 77c7fa0..08266d2 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -711,7 +711,7 @@ static inline void * phys_to_virt(unsigned long address)
/*
* Change "struct page" to physical address.
*/
-#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
+#define page_to_phys(page) ((phys_addr_t)page_to_pfn(page) << PAGE_SHIFT)
/* We do NOT want virtual merging, it would put too much pressure on
* our iommu allocator. Instead, we want drivers to be smart enough
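(Aside, not part of the patch: the cast matters because page_to_pfn()
yields an unsigned long, which is 32 bits here, so shifting it left by
PAGE_SHIFT overflows as soon as the physical address crosses 4GB. A
small stand-alone C sketch of the effect, with the pfn value picked as
an example:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12
	typedef uint64_t phys_addr_t;	/* what CONFIG_PHYS_64BIT provides */

	int main(void)
	{
		unsigned int pfn = 0x100000;	/* first page at 4GB */

		/* 32-bit shift wraps: the high physical bits are lost */
		phys_addr_t bad = pfn << PAGE_SHIFT;
		/* widen before shifting, as the patched macro does */
		phys_addr_t good = (phys_addr_t)pfn << PAGE_SHIFT;

		printf("bad=0x%llx good=0x%llx\n",
		       (unsigned long long)bad, (unsigned long long)good);
		return 0;
	}

This prints bad=0x0 good=0x100000000.)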
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index ebfae53..0b253f6 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -13,10 +13,18 @@
#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES
#endif
+#ifdef CONFIG_PTE_64BIT
+#define PTE_FLAGS_OFFSET 4 /* offset of PTE flags, in bytes */
+#define LNX_PTE_SIZE 8 /* size of a linux PTE, in bytes */
+#else
+#define PTE_FLAGS_OFFSET 0
+#define LNX_PTE_SIZE 4
+#endif
+
#ifndef __ASSEMBLY__
/*
* The basic type of a PTE - 64 bits for those CPUs with > 32 bit
- * physical addressing. For now this just the IBM PPC440.
+ * physical addressing.
*/
#ifdef CONFIG_PTE_64BIT
typedef unsigned long long pte_basic_t;
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 82bf914..4a7055a 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -369,7 +369,12 @@ extern int icache_44x_need_flush;
#define _PAGE_RW 0x400 /* software: user write access allowed */
#define _PAGE_SPECIAL 0x800 /* software: Special page */
+#ifdef CONFIG_PTE_64BIT
+/* We never clear the high word of the pte, mask those off */
+#define _PTE_NONE_MASK (0xffffffff00000000ULL | _PAGE_HASHPTE)
+#else
#define _PTE_NONE_MASK _PAGE_HASHPTE
+#endif
#define _PMD_PRESENT 0
#define _PMD_PRESENT_MASK (PAGE_MASK)
@@ -528,7 +533,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
#define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
#define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT)
-#define pte_clear(mm,addr,ptep) do { pte_update(ptep, ~0, 0); } while (0)
+#define pte_clear(mm, addr, ptep) \
+ do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
#define pmd_none(pmd) (!pmd_val(pmd))
#define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD)
@@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
#if _PAGE_HASHPTE != 0
+#ifndef CONFIG_PTE_64BIT
pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
#else
+ /*
+ * We have to do the write of the 64b pte as 2 stores. This
+ * code assumes that the entry we're storing to is currently
+ * not valid and that all callers have the page table lock.
+ * Having the entry be not valid protects readers who might read
+ * between the first and second stores.
+ */
+ unsigned int tmp;
+
+ __asm__ __volatile__("\
+1: lwarx %0,0,%4\n\
+ rlwimi %L2,%0,0,30,30\n\
+ stw %2,0(%3)\n\
+ eieio\n\
+ stwcx. %L2,0,%4\n\
+ bne- 1b"
+ : "=&r" (tmp), "=m" (*ptep)
+ : "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
+ : "cc");
+#endif /* CONFIG_PTE_64BIT */
+#else /* _PAGE_HASHPTE == 0 */
#if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
__asm__ __volatile__("\
stw%U0%X0 %2,%0\n\
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 8bb6575..a6de6db 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -369,13 +369,13 @@ i##n: \
DataAccess:
EXCEPTION_PROLOG
mfspr r10,SPRN_DSISR
+ stw r10,_DSISR(r11)
andis. r0,r10,0xa470 /* weird error? */
bne 1f /* if not, try to put a PTE */
mfspr r4,SPRN_DAR /* into the hash table */
rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */
bl hash_page
-1: stw r10,_DSISR(r11)
- mr r5,r10
+1: lwz r5,_DSISR(r11) /* get DSISR value */
mfspr r4,SPRN_DAR
EXC_XFER_EE_LITE(0x300, handle_page_fault)
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index fa39cce..9000891 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -422,7 +422,6 @@ skpinv: addi r6,r6,1 /* Increment */
* r12 is pointer to the pte
*/
#ifdef CONFIG_PTE_64BIT
-#define PTE_FLAGS_OFFSET 4
#define FIND_PTE \
rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
@@ -431,7 +430,6 @@ skpinv: addi r6,r6,1 /* Increment */
rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
lwz r11, 4(r12); /* Get pte entry */
#else
-#define PTE_FLAGS_OFFSET 0
#define FIND_PTE \
rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \
lwz r11, 0(r11); /* Get L1 entry */ \
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index b9ba7d9..d63e20a 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
* Returns to the caller if the access is illegal or there is no
* mapping for the address. Otherwise it places an appropriate PTE
* in the hash table and returns from the exception.
- * Uses r0, r3 - r8, ctr, lr.
+ * Uses r0, r3 - r8, r10, ctr, lr.
*/
.text
_GLOBAL(hash_page)
@@ -106,9 +106,15 @@ _GLOBAL(hash_page)
addi r5,r5,swapper_pg_dir@l /* kernel page table */
rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
112: add r5,r5,r7 /* convert to phys addr */
+#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
lwz r8,0(r5) /* get pmd entry */
rlwinm. r8,r8,0,0,19 /* extract address of pte page */
+#else
+ rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */
+ lwzx r8,r8,r5 /* Get L1 entry */
+ rlwinm. r8,r8,0,0,20 /* extract pt base address */
+#endif
#ifdef CONFIG_SMP
beq- hash_page_out /* return if no mapping */
#else
@@ -118,7 +124,11 @@ _GLOBAL(hash_page)
to the address following the rfi. */
beqlr-
#endif
+#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */
+#else
+ rlwimi r8,r4,23,20,28 /* compute pte address */
+#endif
rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
@@ -127,9 +137,15 @@ _GLOBAL(hash_page)
* because almost always, there won't be a permission violation
* and there won't already be an HPTE, and thus we will have
* to update the PTE to set _PAGE_HASHPTE. -- paulus.
+ *
+ * If PTE_64BIT is set, the low word is the flags word; use that
+ * word for locking since it contains all the interesting bits.
*/
+#if (PTE_FLAGS_OFFSET != 0)
+ addi r8,r8,PTE_FLAGS_OFFSET
+#endif
retry:
- lwarx r6,0,r8 /* get linux-style pte */
+ lwarx r6,0,r8 /* get linux-style pte, flag word */
andc. r5,r3,r6 /* check access & ~permission */
#ifdef CONFIG_SMP
bne- hash_page_out /* return if access not permitted */
@@ -137,6 +153,11 @@ retry:
bnelr-
#endif
or r5,r0,r6 /* set accessed/dirty bits */
+#ifdef CONFIG_PTE_64BIT
+ subf r10,r6,r8 /* create false data dependency */
+ subi r10,r10,PTE_FLAGS_OFFSET
+ lwzx r10,r6,r10 /* Get upper PTE word */
+#endif
stwcx. r5,0,r8 /* attempt to update PTE */
bne- retry /* retry if someone got there first */
@@ -203,9 +224,9 @@ _GLOBAL(add_hash_page)
* we can't take a hash table miss (assuming the code is
* covered by a BAT). -- paulus
*/
- mfmsr r10
+ mfmsr r9
SYNC
- rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
SYNC_601
@@ -214,14 +235,14 @@ _GLOBAL(add_hash_page)
tophys(r7,0)
#ifdef CONFIG_SMP
- addis r9,r7,mmu_hash_lock@ha
- addi r9,r9,mmu_hash_lock@l
-10: lwarx r0,0,r9 /* take the mmu_hash_lock */
+ addis r6,r7,mmu_hash_lock@ha
+ addi r6,r6,mmu_hash_lock@l
+10: lwarx r0,0,r6 /* take the mmu_hash_lock */
cmpi 0,r0,0
bne- 11f
- stwcx. r8,0,r9
+ stwcx. r8,0,r6
beq+ 12f
-11: lwz r0,0(r9)
+11: lwz r0,0(r6)
cmpi 0,r0,0
beq 10b
b 11b
@@ -234,10 +255,20 @@ _GLOBAL(add_hash_page)
* HPTE, so we just unlock and return.
*/
mr r8,r5
+#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29
+#else
+ rlwimi r8,r4,23,20,28
+ addi r8,r8,PTE_FLAGS_OFFSET
+#endif
1: lwarx r6,0,r8
andi. r0,r6,_PAGE_HASHPTE
bne 9f /* if HASHPTE already set, done */
+#ifdef CONFIG_PTE_64BIT
+ subf r10,r6,r8 /* create false data dependency */
+ subi r10,r10,PTE_FLAGS_OFFSET
+ lwzx r10,r6,r10 /* Get upper PTE word */
+#endif
ori r5,r6,_PAGE_HASHPTE
stwcx. r5,0,r8
bne- 1b
@@ -246,13 +277,15 @@ _GLOBAL(add_hash_page)
9:
#ifdef CONFIG_SMP
+ addis r6,r7,mmu_hash_lock@ha
+ addi r6,r6,mmu_hash_lock@l
eieio
li r0,0
- stw r0,0(r9) /* clear mmu_hash_lock */
+ stw r0,0(r6) /* clear mmu_hash_lock */
#endif
/* reenable interrupts and DR */
- mtmsr r10
+ mtmsr r9
SYNC_601
isync
@@ -267,7 +300,8 @@ _GLOBAL(add_hash_page)
* r5 contains the linux PTE, r6 contains the old value of the
* linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
* offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off).
+ * -KERNELBASE if it is off). r10 contains the upper half of
+ * the PTE if CONFIG_PTE_64BIT.
* On SMP, the caller should have the mmu_hash_lock held.
* We assume that the caller has (or will) set the _PAGE_HASHPTE
* bit in the linux PTE in memory. The value passed in r6 should
@@ -313,6 +347,11 @@ _GLOBAL(create_hpte)
BEGIN_FTR_SECTION
ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */
END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+#ifdef CONFIG_PTE_64BIT
+ /* Put the XPN bits into the PTE */
+ rlwimi r8,r10,8,20,22
+ rlwimi r8,r10,2,29,29
+#endif
/* Construct the high word of the PPC-style PTE (r5) */
rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
@@ -499,14 +538,18 @@ _GLOBAL(flush_hash_pages)
isync
/* First find a PTE in the range that has _PAGE_HASHPTE set */
+#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,22,20,29
-1: lwz r0,0(r5)
+#else
+ rlwimi r5,r4,23,20,28
+#endif
+1: lwz r0,PTE_FLAGS_OFFSET(r5)
cmpwi cr1,r6,1
andi. r0,r0,_PAGE_HASHPTE
bne 2f
ble cr1,19f
addi r4,r4,0x1000
- addi r5,r5,4
+ addi r5,r5,LNX_PTE_SIZE
addi r6,r6,-1
b 1b
@@ -545,7 +588,10 @@ _GLOBAL(flush_hash_pages)
* already clear, we're done (for this pte). If not,
* clear it (atomically) and proceed. -- paulus.
*/
-33: lwarx r8,0,r5 /* fetch the pte */
+#if (PTE_FLAGS_OFFSET != 0)
+ addi r5,r5,PTE_FLAGS_OFFSET
+#endif
+33: lwarx r8,0,r5 /* fetch the pte flags word */
andi. r0,r8,_PAGE_HASHPTE
beq 8f /* done if HASHPTE is already clear */
rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */
@@ -590,7 +636,7 @@ _GLOBAL(flush_hash_patch_B)
8: ble cr1,9f /* if all ptes checked */
81: addi r6,r6,-1
- addi r5,r5,4 /* advance to next pte */
+ addi r5,r5,LNX_PTE_SIZE /* go to next linux pte flag word */
addi r4,r4,0x1000
lwz r0,0(r5) /* check next pte */
cmpwi cr1,r6,1
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 2001abd..c31d6d2 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -73,7 +73,7 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
#endif /* HAVE_TLBCAM */
#ifdef CONFIG_PTE_64BIT
-/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */
+/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */
#define PGDIR_ORDER 1
#else
#define PGDIR_ORDER 0
@@ -288,7 +288,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
}
/*
- * Map in all of physical memory starting at KERNELBASE.
+ * Map in a big chunk of physical memory starting at KERNELBASE.
*/
void __init mapin_ram(void)
{
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7f65127..ca5b58b 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
config PTE_64BIT
bool
- depends on 44x || E500
+ depends on 44x || E500 || 6xx
default y if 44x
- default y if E500 && PHYS_64BIT
+ default y if PHYS_64BIT
config PHYS_64BIT
- bool 'Large physical address support' if E500
- depends on 44x || E500
+ bool 'Large physical address support' if E500 || 6xx
+ depends on 44x || E500 || 6xx
select RESOURCES_64BIT
default y if 44x
---help---
This option enables kernel support for larger than 32-bit physical
- addresses. This features is not be available on all e500 cores.
+ addresses. This features is not be available on all cores.
+
+ If you have more than 3.5GB of RAM or so, you also need to enable
+ SWIOTLB under Kernel Options for this to work. The actual number
+ is platform-dependent.
If in doubt, say N here.
--
1.5.5.1
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-27 22:38 [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical Becky Bruce
@ 2008-08-27 23:43 ` Scott Wood
2008-08-28 15:36 ` Becky Bruce
2008-09-01 5:28 ` Benjamin Herrenschmidt
2008-09-01 5:19 ` Benjamin Herrenschmidt
1 sibling, 2 replies; 16+ messages in thread
From: Scott Wood @ 2008-08-27 23:43 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev
Becky Bruce wrote:
> #if _PAGE_HASHPTE != 0
> +#ifndef CONFIG_PTE_64BIT
> pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
> #else
> + /*
> + * We have to do the write of the 64b pte as 2 stores. This
> + * code assumes that the entry we're storing to is currently
> + * not valid and that all callers have the page table lock.
> + * Having the entry be not valid protects readers who might read
> + * between the first and second stores.
> + */
> + unsigned int tmp;
> +
> + __asm__ __volatile__("\
> +1: lwarx %0,0,%4\n\
> + rlwimi %L2,%0,0,30,30\n\
> + stw %2,0(%3)\n\
> + eieio\n\
> + stwcx. %L2,0,%4\n\
> + bne- 1b"
> + : "=&r" (tmp), "=m" (*ptep)
> + : "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
> + : "cc");
> +#endif /* CONFIG_PTE_64BIT */
Could the stw to the same reservation granule as the stwcx cancel the
reservation on some implementations? Plus, if you're assuming that the
entry is currently invalid and all callers have the page table lock, do
we need the lwarx/stwcx at all? At the least, it should check
PTE_ATOMIC_UPDATES.
%2 should be "+&r", not "r".
> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> index 7f65127..ca5b58b 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>
> config PTE_64BIT
> bool
> - depends on 44x || E500
> + depends on 44x || E500 || 6xx
> default y if 44x
> - default y if E500 && PHYS_64BIT
> + default y if PHYS_64BIT
How is this different from PHYS_64BIT?
> config PHYS_64BIT
> - bool 'Large physical address support' if E500
> - depends on 44x || E500
> + bool 'Large physical address support' if E500 || 6xx
Maybe "if !44x", or have 44x "select" this, rather than listing all
arches where it's optional.
> + depends on 44x || E500 || 6xx
> select RESOURCES_64BIT
> default y if 44x
> ---help---
> This option enables kernel support for larger than 32-bit physical
> - addresses. This features is not be available on all e500 cores.
> + addresses. This features is not be available on all cores.
"This features is not be"?
-Scott
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-27 23:43 ` Scott Wood
@ 2008-08-28 15:36 ` Becky Bruce
2008-08-28 16:07 ` Scott Wood
2008-09-01 5:28 ` Benjamin Herrenschmidt
1 sibling, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 15:36 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev
On Aug 27, 2008, at 6:43 PM, Scott Wood wrote:
> Becky Bruce wrote:
>> #if _PAGE_HASHPTE != 0
>> +#ifndef CONFIG_PTE_64BIT
>> pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>> #else
>> + /*
>> + * We have to do the write of the 64b pte as 2 stores. This
>> + * code assumes that the entry we're storing to is currently
>> + * not valid and that all callers have the page table lock.
>> + * Having the entry be not valid protects readers who might read
>> + * between the first and second stores.
>> + */
>> + unsigned int tmp;
>> +
>> + __asm__ __volatile__("\
>> +1: lwarx %0,0,%4\n\
>> + rlwimi %L2,%0,0,30,30\n\
>> + stw %2,0(%3)\n\
>> + eieio\n\
>> + stwcx. %L2,0,%4\n\
>> + bne- 1b"
>> + : "=&r" (tmp), "=m" (*ptep)
>> + : "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4),
>> "m" (*ptep)
>> + : "cc");
>> +#endif /* CONFIG_PTE_64BIT */
>
> Could the stw to the same reservation granule as the stwcx cancel
> the reservation on some implementations?
Not on the same processor.
> Plus, if you're assuming that the entry is currently invalid and all
> callers have the page table lock, do we need the lwarx/stwcx at
> all? At the least, it should check PTE_ATOMIC_UPDATES.
I'm pretty sure I went through this in great detail at one point and
concluded that I did in fact need the lwarx/stwcx. IIRC, it has to do
with other non-set_pte_at writers not necessarily holding the page
table lock. FYI, the existing 32-bit PTE code is doing atomic updates
as well.
About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page
table implementations require atomic updates. Adding it in would
create another clause in that code, because I would still need to
order the operations with a 64-bit PTE and I can't call pte_update as
it only changes the low word of the pte. I wasn't feeling too keen
on adding untested pagetable code into the kernel :) I can add it if
the peanut gallery wants it, but I'll be marking it with a big fat
"BUYER BEWARE".
>
>
> %2 should be "+&r", not "r".
>
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/
>> platforms/Kconfig.cputype
>> index 7f65127..ca5b58b 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>> config PTE_64BIT
>> bool
>> - depends on 44x || E500
>> + depends on 44x || E500 || 6xx
>> default y if 44x
>> - default y if E500 && PHYS_64BIT
>> + default y if PHYS_64BIT
>
> How is this different from PHYS_64BIT?
One is the width of the PTE and one is the width of a physical
address. It's entirely plausible to have a 64-bit PTE because you
have a bunch of status bits, and only have 32-bit physical
addressing. That's why there are 2 options.
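To make the distinction concrete, here is a rough sketch of the two
knobs as type definitions (it mirrors what the 32-bit headers do; treat
it as an illustration rather than a quote of the exact code):

	/* CONFIG_PTE_64BIT: how wide a Linux PTE is */
	#ifdef CONFIG_PTE_64BIT
	typedef unsigned long long pte_basic_t;	/* 8-byte Linux PTE */
	#else
	typedef unsigned long pte_basic_t;	/* 4-byte Linux PTE */
	#endif

	/* CONFIG_PHYS_64BIT: how wide a physical address is */
	#ifdef CONFIG_PHYS_64BIT
	typedef unsigned long long phys_addr_t;	/* e.g. 36-bit physical */
	#else
	typedef unsigned long phys_addr_t;
	#endif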
>
>
>> config PHYS_64BIT
>> - bool 'Large physical address support' if E500
>> - depends on 44x || E500
>> + bool 'Large physical address support' if E500 || 6xx
>
> Maybe "if !44x", or have 44x "select" this, rather than listing all
> arches where it's optional.
Not sure exactly what you're suggesting here........
>
>
>> + depends on 44x || E500 || 6xx
>> select RESOURCES_64BIT
>> default y if 44x
>> ---help---
>> This option enables kernel support for larger than 32-bit physical
>> - addresses. This features is not be available on all e500 cores.
>> + addresses. This features is not be available on all cores.
>
> "This features is not be"?
Heh, I didn't type that :) But I can fix it.
Thanks,
B
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 15:36 ` Becky Bruce
@ 2008-08-28 16:07 ` Scott Wood
2008-08-28 19:37 ` Becky Bruce
0 siblings, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-28 16:07 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev
Becky Bruce wrote:
> I'm pretty sure I went through this in great detail at one point and
> concluded that I did in fact need the lwarx/stwcx. IIRC, it has to do
> with other non-set_pte_at writers not necessarily holding the page table
> lock. FYI, the existing 32-bit PTE code is doing atomic updates as well.
But will those updates happen if there isn't already a valid PTE?
> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page table
> implementations require atomic updates.
Right, I misread it and thought it was being used for non-hashed
implementations as well.
What happens if you enable 64-bit PTEs on a 603-ish CPU? The kconfig
seems to allow it.
> Adding it in would create
> another clause in that code, because I would still need to order the
> operations with a 64-bit PTE and I can't call pte_update as it only
> changes the low word of the pte. I wasn't feeling too keen on adding
> untested pagetable code into the kernel :)
Wimp. :-)
> I can add it if the peanut
> gallery wants it, but I'll be marking it with a big fat "BUYER BEWARE".
No, there's little point if nothing selects it (or is planned to in the
near future).
>>> diff --git a/arch/powerpc/platforms/Kconfig.cputype
>>> b/arch/powerpc/platforms/Kconfig.cputype
>>> index 7f65127..ca5b58b 100644
>>> --- a/arch/powerpc/platforms/Kconfig.cputype
>>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>>> config PTE_64BIT
>>> bool
>>> - depends on 44x || E500
>>> + depends on 44x || E500 || 6xx
>>> default y if 44x
>>> - default y if E500 && PHYS_64BIT
>>> + default y if PHYS_64BIT
>>
>> How is this different from PHYS_64BIT?
>
> One is the width of the PTE and one is the width of a physical address.
> It's entirely plausible to have a 64-bit PTE because you have a bunch of
> status bits, and only have 32-bit physical addressing. That's why there
> are 2 options.
Right, I just didn't see anything that actually selects it independently
of PHYS_64BIT. Is this something that's expected to actually happen in
the future?
The "default y if 44x" line is redundant with the "default y if PHYS_64BIT".
>>> config PHYS_64BIT
>>> - bool 'Large physical address support' if E500
>>> - depends on 44x || E500
>>> + bool 'Large physical address support' if E500 || 6xx
>>
>> Maybe "if !44x", or have 44x "select" this, rather than listing all
>> arches where it's optional.
>
> Not sure exactly what you're suggesting here........
It just seems simpler to not conditionalize the bool, but instead have
CONFIG_44x do "select PHYS_64BIT". I'd rather avoid another list of
platforms accumulating in a kconfig dependency.
>>> + depends on 44x || E500 || 6xx
>>> select RESOURCES_64BIT
>>> default y if 44x
>>> ---help---
>>> This option enables kernel support for larger than 32-bit physical
>>> - addresses. This features is not be available on all e500 cores.
>>> + addresses. This features is not be available on all cores.
>>
>> "This features is not be"?
>
> Heh, I didn't type that :) But I can fix it.
You didn't type it, but you touched it. Tag, you're it. :-)
-Scott
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 16:07 ` Scott Wood
@ 2008-08-28 19:37 ` Becky Bruce
2008-08-28 20:28 ` Scott Wood
2008-08-28 22:42 ` Benjamin Herrenschmidt
0 siblings, 2 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 19:37 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev list
On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:
> Becky Bruce wrote:
>> I'm pretty sure I went through this in great detail at one point
>> and concluded that I did in fact need the lwarx/stwcx. IIRC, it
>> has to do with other non-set_pte_at writers not necessarily holding
>> the page table lock. FYI, the existing 32-bit PTE code is doing
>> atomic updates as well.
>
> But will those updates happen if there isn't already a valid PTE?
I understand what you're saying, I've been here before :) However, I
was never able to convince myself that it's safe without the lwarx/
stwcx. There's hashing code that wanks around with the HASHPTE bit
doing a RMW without holding any lock (other than lwarx/stwcx-ing the
PTE itself). And there's definitely code that's fairly far removed
from the last time you checked that an entry was valid. I've CC'd Ben
on this mail - perhaps he can shed some light. If we don't need the
atomics, I'm happy to yank them.
Now, it *does* seem like set_pte_at could be optimized for the non-SMP
case.... I'll have to chew on that one a bit.
>
>
>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed page
>> table implementations require atomic updates.
>
> Right, I misread it and thought it was being used for non-hashed
> implementations as well.
>
> What happens if you enable 64-bit PTEs on a 603-ish CPU? The
> kconfig seems to allow it.
Don't do that :) That's why the help is there in the Kconfig.
Otherwise, I have to list out every 74xx part that supports 36-bit
physical addressing. In any event, nothing interesting will happen
other than that you'll waste some space. The kernel boots fine with a
non-36b physical u-boot and small amounts of RAM.
>
>
>> Adding it in would create another clause in that code, because I
>> would still need to order the operations with a 64-bit PTE and I
>> can't call pte_update as it only changes the low word of the pte.
>> I wasn't feeling too keen on adding untested pagetable code into
>> the kernel :)
>
> Wimp. :-)
Pansy :)
>
>
> > I can add it if the peanut
>> gallery wants it, but I'll be marking it with a big fat "BUYER
>> BEWARE".
>
> No, there's little point if nothing selects it (or is planned to in
> the near future).
Good :)
>
>
>>>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/
>>>> powerpc/platforms/Kconfig.cputype
>>>> index 7f65127..ca5b58b 100644
>>>> --- a/arch/powerpc/platforms/Kconfig.cputype
>>>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>>>> @@ -128,18 +128,22 @@ config FSL_EMB_PERFMON
>>>> config PTE_64BIT
>>>> bool
>>>> - depends on 44x || E500
>>>> + depends on 44x || E500 || 6xx
>>>> default y if 44x
>>>> - default y if E500 && PHYS_64BIT
>>>> + default y if PHYS_64BIT
>>>
>>> How is this different from PHYS_64BIT?
>> One is the width of the PTE and one is the width of a physical
>> address. It's entirely plausible to have a 64-bit PTE because you
>> have a bunch of status bits, and only have 32-bit physical
>> addressing. That's why there are 2 options.
>
> Right, I just didn't see anything that actually selects it
> independently of PHYS_64BIT. Is this something that's expected to
> actually happen in the future?
There's been talk of making the PTEs always 64-bit even on 32-bit
platforms. So it's entirely plausible.
>
>
> The "default y if 44x" line is redundant with the "default y if
> PHYS_64BIT".
You're right, I'll clean that up.
>
>
>>>> config PHYS_64BIT
>>>> - bool 'Large physical address support' if E500
>>>> - depends on 44x || E500
>>>> + bool 'Large physical address support' if E500 || 6xx
>>>
>>> Maybe "if !44x", or have 44x "select" this, rather than listing
>>> all arches where it's optional.
>> Not sure exactly what you're suggesting here........
>
> It just seems simpler to not conditionalize the bool, but instead
> have CONFIG_44x do "select PHYS_64BIT". I'd rather avoid another
> list of platforms accumulating in a kconfig dependency.
I'm still not sure where you're going with this - I can remove 44x
from the conditional part, but we still have to deal with e500 and
6xx. In which case you're now setting this in different places for
difft plats, making it potentially harder to read. Unless you're
suggesting allowing the selection of PHYS_64BIT on any platform (in
which case your comment about what happens if I select this on 603
doesn't make much sense).....
>
>
>>>> + depends on 44x || E500 || 6xx
>>>> select RESOURCES_64BIT
>>>> default y if 44x
>>>> ---help---
>>>> This option enables kernel support for larger than 32-bit
>>>> physical
>>>> - addresses. This features is not be available on all e500
>>>> cores.
>>>> + addresses. This features is not be available on all cores.
>>>
>>> "This features is not be"?
>> Heh, I didn't type that :) But I can fix it.
>
> You didn't type it, but you touched it. Tag, you're it. :-)
It's already fixed in my tree :)
-B
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 19:37 ` Becky Bruce
@ 2008-08-28 20:28 ` Scott Wood
2008-08-28 21:13 ` Becky Bruce
2008-08-28 22:42 ` Benjamin Herrenschmidt
1 sibling, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-28 20:28 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev list
Becky Bruce wrote:
> On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:
>
>> Becky Bruce wrote:
>>> I'm pretty sure I went through this in great detail at one point
>>> and concluded that I did in fact need the lwarx/stwcx. IIRC, it
>>> has to do with other non-set_pte_at writers not necessarily
>>> holding the page table lock. FYI, the existing 32-bit PTE code is
>>> doing atomic updates as well.
>>
>> But will those updates happen if there isn't already a valid PTE?
>
> I understand what you're saying, I've been here before :) However, I
> was never able to convince myself that it's safe without the
> lwarx/stwcx. There's hashing code that wanks around with the HASHPTE
> bit doing a RMW without holding any lock (other than lwarx/stwcx-ing
> the PTE itself).
OK. I was concerned not just about efficiency, but of the safety of the
"stw" write if there were other modifications going on (even if the
set_pte_at stwcx fails, the other updater could have lwarxed and
successfully stwcxed after the stw and ended up with a mixed PTE), but it
may not be an issue depending on the nature of the updates.
>>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed
>>> page table implementations require atomic updates.
>>
>> Right, I misread it and thought it was being used for non-hashed
>> implementations as well.
>>
>> What happens if you enable 64-bit PTEs on a 603-ish CPU? The
>> kconfig seems to allow it.
>
> Don't do that :) That's why the help is there in the Kconfig.
People will do it anyway -- and there's multiplatform to consider.
> Otherwise, I have to list out every 74xx part that supports 36-bit
> physical addressing. In any event, nothing interesting will happen
> other than that you'll waste some space. The kernel boots fine with
> a non-36b physical u-boot and small amounts of RAM.
My concern was the generic code trying to use 64-bit PTEs, and the 603
TLB miss handlers continuing to assume that the PTEs are 32-bit, and
loading the wrong thing.
Wasted space alone is an acceptable consequence of turning on things you
don't need. :-)
> I'm still not sure where you're going with this - I can remove 44x
> from the conditional part, but we still have to deal with e500 and
> 6xx.
You still need it in "depends" (in the absence of a "PHYS_64BIT_CAPABLE"
or some such), but not "bool '...' if". It's not a big deal, just a pet
peeve.
> In which case you're now setting this in different places for difft
> plats, making it potentially harder to read. Unless you're
> suggesting allowing the selection of PHYS_64BIT on any platform
No, unless the code for all platforms makes it safe to do so.
-Scott
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 20:28 ` Scott Wood
@ 2008-08-28 21:13 ` Becky Bruce
0 siblings, 0 replies; 16+ messages in thread
From: Becky Bruce @ 2008-08-28 21:13 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev list
Great, so *you* got my email, and I did not. I love our mailserver!
On Aug 28, 2008, at 3:28 PM, Scott Wood wrote:
> Becky Bruce wrote:
>> On Aug 28, 2008, at 11:07 AM, Scott Wood wrote:
>>> Becky Bruce wrote:
>>>> I'm pretty sure I went through this in great detail at one point
>>>> and concluded that I did in fact need the lwarx/stwcx. IIRC, it
>>>> has to do with other non-set_pte_at writers not necessarily
>>>> holding the page table lock. FYI, the existing 32-bit PTE code is
>>>> doing atomic updates as well.
>>> But will those updates happen if there isn't already a valid PTE?
>> I understand what you're saying, I've been here before :) However, I
>> was never able to convince myself that it's safe without the lwarx/
>> stwcx. There's hashing code that wanks around with the HASHPTE
>> bit doing a RMW without holding any lock (other than lwarx/stwcx-ing
>> the PTE itself).
>
> OK. I was concerned not just about efficiency, but of the safety of
> the "stw" write if there were other modifications going on (even if
> the set_pte_at stwcx fails, the other updater could have lwarxed and
> successfully stwcxed after the stw and ended up with a mixed PTE),
> but it may not be an issue depending on the nature of the updates.
It shouldn't be an issue - set_pte_at() is the only writer of the high
bits of the PTE, the pte is invalid upon entry to set_pte_at(), and
none of the potential interfering writers should be turning on the
valid bit.
>
>
>>>> About PTE_ATOMIC_UPDATES, I didn't add that in because hashed
>>>> page table implementations require atomic updates.
>>> Right, I misread it and thought it was being used for non-hashed
>>> implementations as well.
>>> What happens if you enable 64-bit PTEs on a 603-ish CPU? The
>>> kconfig seems to allow it.
>> Don't do that :) That's why the help is there in the Kconfig.
>
> People will do it anyway -- and there's multiplatform to consider.
>
>
>> Otherwise, I have to list out every 74xx part that supports 36-bit
>> physical addressing. In any event, nothing interesting will happen
>> other than that you'll waste some space. The kernel boots fine with
>> a non-36b physical u-boot and small amounts of RAM.
>
> My concern was the generic code trying to use 64-bit PTEs, and the
> 603 TLB miss handlers continuing to assume that the PTEs are 32-bit,
> and loading the wrong thing.
>
> Wasted space alone is an acceptable consequence of turning on things
> you don't need. :-)
Actually, I'm lying to you - I forgot about the old parts with DTLB/
ITLB handlers. That code will actually break, and I'd rather not hack
it up to pointlessly accomodate large PTEs. I do need to fix this the
Kconfig, even though it's going to be gross. Thanks for pointing this
out.
>
>
>> I'm still not sure where you're going with this - I can remove 44x
>> from the conditional part, but we still have to deal with e500 and
>> 6xx.
>
> You still need it in "depends" (in the absence of a
> "PHYS_64BIT_CAPABLE" or some such), but not "bool '...' if". It's
> not a big deal, just a pet peeve.
I'll look at making it less peevy :)
>
>
>> In which case you're now setting this in different places for difft
>> plats, making it potentially harder to read. Unless you're
>> suggesting allowing the selection of PHYS_64BIT on any platform
>
> No, unless the code for all platforms makes it safe to do so.
>
Thanks!
-Becky
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 19:37 ` Becky Bruce
2008-08-28 20:28 ` Scott Wood
@ 2008-08-28 22:42 ` Benjamin Herrenschmidt
2008-08-30 16:24 ` Scott Wood
1 sibling, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-08-28 22:42 UTC (permalink / raw)
To: Becky Bruce; +Cc: Scott Wood, linuxppc-dev list
> I understand what you're saying, I've been here before :) However, I
> was never able to convince myself that it's safe without the lwarx/
> stwcx. There's hashing code that wanks around with the HASHPTE bit
> doing a RMW without holding any lock (other than lwarx/stwcx-ing the
> PTE itself). And there's definitely code that's fairly far removed
> from the last time you checked that an entry was valid. I've CC'd Ben
> on this mail - perhaps he can shed some light. If we don't need the
> atomics, I'm happy to yank them.
>
> Now, it *does* seem like set_pte_at could be optimized for the non-SMP
> case.... I'll have to chew on that one a bit.
I haven't read the whole discussion nor reviewed the patches yet, so I'm
just answering the above sentence before I head off for the week-end
(and yes, Becky, I _DO_ have reviewing your stuff high on my todo list,
but I've been really swamped those last 2 weeks).
So all generic code always accesses PTEs with the PTE lock held (that
lock can be in various places ... typically for us it's one per PTE
page).
44x and FSL BookE no longer write to the PTEs without that lock, and
thus don't require atomic access in set_pte and friends.
Hash based platforms still do because of -one- thing : the hashing code
proper which writes back using lwarx/stwcx. to update _PAGE_ACCESSED,
_PAGE_HASHPTE and _PAGE_DIRTY. The hash code does take a global lock to
avoid double-hashing of the same page but this isn't something we should
use elsewhere.
So on hash based platforms, updates of the PTEs are expected to be done
atomically. Now if you extend the size of the PTE, hopefully those
atomic updates are still necessary but only for the -low- part of the
PTE that contains those bits.
For the non-SMP case, I think it should be possible to optimize it. The
only thing that can happen at interrupt time is hashing of kernel or
vmalloc/ioremap pages, which shouldn't compete with set_pte on those
pages, so there would be no access races there, but I may be missing
something as it's the morning and I just about woke up :-)
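A minimal sketch of that non-SMP idea (an assumption drawn from the
description above, not code from the patch; the helper name is made up,
and it assumes the big-endian layout with the flags word at offset 4 as
in the 64-bit PTE patch):

	static inline void set_pte_64bit_up(pte_t *ptep, pte_t pte)
	{
		unsigned int *words = (unsigned int *)ptep;

		/* high word first: the physical address bits */
		words[0] = (unsigned int)(pte_val(pte) >> 32);
		/* keep the two stores ordered for anything reading the PTE */
		eieio();
		/* low word last: the flags, possibly including valid */
		words[1] = (unsigned int)pte_val(pte);
	}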
Cheers,
Ben.
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-28 22:42 ` Benjamin Herrenschmidt
@ 2008-08-30 16:24 ` Scott Wood
2008-08-31 8:22 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 16+ messages in thread
From: Scott Wood @ 2008-08-30 16:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list
On Fri, Aug 29, 2008 at 08:42:01AM +1000, Benjamin Herrenschmidt wrote:
> For the non-SMP case, I think it should be possible to optimize it. The
> only thing that can happen at interrupt time is hashing of kernel or
> vmalloc/ioremap pages, which shouldn't compete with set_pte on those
> pages, so there would be no access races there, but I may be missing
> something as it's the morning and I about just woke up :-)
Is that still true with preemptible kernels?
-Scott
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-30 16:24 ` Scott Wood
@ 2008-08-31 8:22 ` Benjamin Herrenschmidt
0 siblings, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-08-31 8:22 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev list
On Sat, 2008-08-30 at 11:24 -0500, Scott Wood wrote:
> On Fri, Aug 29, 2008 at 08:42:01AM +1000, Benjamin Herrenschmidt wrote:
> > For the non-SMP case, I think it should be possible to optimize it. The
> > only thing that can happen at interrupt time is hashing of kernel or
> > vmalloc/ioremap pages, which shouldn't compete with set_pte on those
> > pages, so there would be no access races there, but I may be missing
> > something as it's the morning and I about just woke up :-)
>
> Is that still true with preemptible kernels?
Those shouldn't be an issue as long as we can't preempt while holding a
spinlock and we do hold the pte lock on any modification... Of course,
-rt is a different matter.
Ben.
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-27 22:38 [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical Becky Bruce
2008-08-27 23:43 ` Scott Wood
@ 2008-09-01 5:19 ` Benjamin Herrenschmidt
2008-09-02 16:19 ` Becky Bruce
1 sibling, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-01 5:19 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev
> +#ifdef CONFIG_PTE_64BIT
> +#define PTE_FLAGS_OFFSET 4 /* offset of PTE flags, in bytes */
> +#define LNX_PTE_SIZE 8 /* size of a linux PTE, in bytes */
> +#else
> +#define PTE_FLAGS_OFFSET 0
> +#define LNX_PTE_SIZE 4
> +#endif
s/LNX_PTE_SIZE/PTE_BYTES or PTE_SIZE, no need for that horrible LNX
prefix. In fact, if it's only used by the asm, then ditch it and
have asm-offsets.c create something based on sizeof(pte_t).
> #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
> #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT)
> -#define pte_clear(mm,addr,ptep) do { pte_update(ptep, ~0, 0); } while (0)
> +#define pte_clear(mm, addr, ptep) \
> + do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
Where does this previous definition of pte_clear come from ? It's bogus
(and it's not like that upstream). Your "updated" one looks ok though.
But whatever tree has the "previous" one would break hash based ppc32
if merged, or are there some other related changes in Kumar's tree that
make the above safe ?
> #define pmd_none(pmd) (!pmd_val(pmd))
> #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD)
> @@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, pte_t pte)
> {
> #if _PAGE_HASHPTE != 0
> +#ifndef CONFIG_PTE_64BIT
> pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
> #else
> + /*
> + * We have to do the write of the 64b pte as 2 stores. This
> + * code assumes that the entry we're storing to is currently
> + * not valid and that all callers have the page table lock.
> + * Having the entry be not valid protects readers who might read
> + * between the first and second stores.
> + */
> + unsigned int tmp;
Do you know for sure the entry isn't valid ? On ppc64, we explicitly
test for that and if it was valid, we clear and flush it. The generic
code has been going on and off about calling set_pte_at() on an already
valid PTE, I wouldn't rely too much on it guaranteeing it will not
happen. The 32b PTE code is safe because it preserves _PAGE_HASHPTE.
Note also that once you have (which you don't now) the guarantee
that your previous PTE is invalid, then you can safely do two normal
stores instead of a lwarx/stwcx. loop. In any case, having the stw in
the middle of the loop doesn't sound very useful.
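For illustration, the CONFIG_PTE_64BIT branch with the stw hoisted out
of the loop might read roughly as follows (a sketch only, keeping the
operands as in the patch; Scott's separate point about the %2 operand
constraint would still apply):

	unsigned int tmp;

	__asm__ __volatile__("\
		stw %2,0(%3)\n\
		eieio\n\
	1:	lwarx %0,0,%4\n\
		rlwimi %L2,%0,0,30,30\n\
		stwcx. %L2,0,%4\n\
		bne- 1b"
		: "=&r" (tmp), "=m" (*ptep)
		: "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
		: "cc");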
> + __asm__ __volatile__("\
> +1: lwarx %0,0,%4\n\
> + rlwimi %L2,%0,0,30,30\n\
> + stw %2,0(%3)\n\
> + eieio\n\
> + stwcx. %L2,0,%4\n\
> + bne- 1b"
> + : "=&r" (tmp), "=m" (*ptep)
> + : "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4), "m" (*ptep)
> + : "cc");
> +#endif /* CONFIG_PTE_64BIT */
> +#else /* _PAGE_HASHPTE == 0 */
> #if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
> __asm__ __volatile__("\
> stw%U0%X0 %2,%0\n\
> diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
> index b9ba7d9..d63e20a 100644
> --- a/arch/powerpc/mm/hash_low_32.S
> +++ b/arch/powerpc/mm/hash_low_32.S
> @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
> * Returns to the caller if the access is illegal or there is no
> * mapping for the address. Otherwise it places an appropriate PTE
> * in the hash table and returns from the exception.
> - * Uses r0, r3 - r8, ctr, lr.
> + * Uses r0, r3 - r8, r10, ctr, lr.
> */
> .text
> _GLOBAL(hash_page)
> @@ -106,9 +106,15 @@ _GLOBAL(hash_page)
> addi r5,r5,swapper_pg_dir@l /* kernel page table */
> rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
> 112: add r5,r5,r7 /* convert to phys addr */
> +#ifndef CONFIG_PTE_64BIT
> rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
> lwz r8,0(r5) /* get pmd entry */
> rlwinm. r8,r8,0,0,19 /* extract address of pte page */
> +#else
> + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */
> + lwzx r8,r8,r5 /* Get L1 entry */
> + rlwinm. r8,r8,0,0,20 /* extract pt base address */
> +#endif
Any reason you wrote the above using a different technique ? If you
believe that rlwinm/lwzx is going to be more efficient than rlwimi/lwz,
maybe we should fix the old one ... or am I missing something totally
obvious ? :-)
Cheers,
Ben.
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-08-27 23:43 ` Scott Wood
2008-08-28 15:36 ` Becky Bruce
@ 2008-09-01 5:28 ` Benjamin Herrenschmidt
1 sibling, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-01 5:28 UTC (permalink / raw)
To: Scott Wood; +Cc: linuxppc-dev
> Could the stw to the same reservation granule as the stwcx cancel the
> reservation on some implementations?
It might I suppose ... In any case, see my replies to Becky.
> Plus, if you're assuming that the
> entry is currently invalid and all callers have the page table lock, do
> we need the lwarx/stwcx at all? At the least, it should check
> PTE_ATOMIC_UPDATES.
It shouldn't need atomic operations -if- the current entry is invalid
-and- _PAGE_HASHPTE is clear. (Ie, the current entry is invalid -and-
the hash table has been updated).
In any case, the stw can just move out of the loop.
It might be worth just doing something along the lines of
	if (pte_val(*ptep) & _PAGE_PRESENT)
		pte_clear(pte);
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(mm, ptep, addr);
	asm v. { "stw ; eieio; stw" };
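Spelled out a little more (still just a sketch under the assumptions
above, not the code that got merged; the word-at-a-time stores assume
the big-endian layout with the flags word at offset 4):

	static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
				      pte_t *ptep, pte_t pte)
	{
		unsigned int *words = (unsigned int *)ptep;

		if (pte_val(*ptep) & _PAGE_PRESENT)
			pte_clear(mm, addr, ptep);	/* invalidate first */
		if (pte_val(*ptep) & _PAGE_HASHPTE)
			flush_hash_entry(mm, ptep, addr); /* flush the stale hash entry */

		/* with the entry invalid and unhashed, plain ordered stores do */
		words[0] = (unsigned int)(pte_val(pte) >> 32);	/* high word */
		eieio();
		words[1] = (unsigned int)pte_val(pte);		/* flags word */
	}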
Cheers,
Ben.
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-09-01 5:19 ` Benjamin Herrenschmidt
@ 2008-09-02 16:19 ` Becky Bruce
2008-09-02 21:21 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-09-02 16:19 UTC (permalink / raw)
To: benh; +Cc: linuxppc-dev
On Sep 1, 2008, at 12:19 AM, Benjamin Herrenschmidt wrote:
Thanks for taking the time to dig through this :)
>
>> +#ifdef CONFIG_PTE_64BIT
>> +#define PTE_FLAGS_OFFSET 4 /* offset of PTE flags, in bytes */
>> +#define LNX_PTE_SIZE 8 /* size of a linux PTE, in bytes */
>> +#else
>> +#define PTE_FLAGS_OFFSET 0
>> +#define LNX_PTE_SIZE 4
>> +#endif
>
> s/LNX_PTE_SIZE/PTE_BYTES or PTE_SIZE, no need for that horrible LNX
> prefix. In fact, if it's only used by the asm, then ditch it and
> have asm-offsets.c create something based on sizeof(pte_t).
The reason I did that was to distinguish between the size of a
software PTE entry, and the actual size of the hardware entry. In the
case of 36b physical, the hardware PTE size is the same as it always
is; but we've grown the Linux PTE. We can call it something else, or
I can change as you suggest; I was worried that the next person to
come along might get confused. I suppose there aren't too many of us
that are crazy enough to muck around in this code, so maybe it's not
that big of a deal.
>
>
>> #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
>> #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT)
>> -#define pte_clear(mm,addr,ptep) do { pte_update(ptep, ~0, 0); }
>> while (0)
>> +#define pte_clear(mm, addr, ptep) \
>> + do { pte_update(ptep, ~_PAGE_HASHPTE, 0); } while (0)
>
> Where does this previous definition of pte_clear comes from ? It's
> bogus
> (and it's not like that upstream). Your "updated" ones looks ok
> though.
>
> But whatever tree has the "previous" one would break hash based ppc32
> if merged or is there some other related changes in Kumar tree that
> make the above safe ?
That's from Kumar's tree of changes for BookE.... he'll need to answer
that. I think he put out a patchset with that work; not sure if it
was correct in this particular case or not.
>
>
>> #define pmd_none(pmd) (!pmd_val(pmd))
>> #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD)
>> @@ -664,8 +670,30 @@ static inline void set_pte_at(struct mm_struct
>> *mm, unsigned long addr,
>> pte_t *ptep, pte_t pte)
>> {
>> #if _PAGE_HASHPTE != 0
>> +#ifndef CONFIG_PTE_64BIT
>> pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte) & ~_PAGE_HASHPTE);
>> #else
>> + /*
>> + * We have to do the write of the 64b pte as 2 stores. This
>> + * code assumes that the entry we're storing to is currently
>> + * not valid and that all callers have the page table lock.
>> + * Having the entry be not valid protects readers who might read
>> + * between the first and second stores.
>> + */
>> + unsigned int tmp;
>
>> Do you know for sure the entry isn't valid ? On ppc64, we explicitly
> test for that and if it was valid, we clear and flush it. The generic
> code has been going on and off about calling set_pte_at() on an
> already
> valid PTE, I wouldn't rely too much on it guaranteeing it will not
> happen. The 32b PTE code is safe because it preserves _PAGE_HASHPTE .
IIRC, we discussed this at some point, months ago, and decided this
was the case. If it *can* be valid, and it's possible to have a reader
on another cpu, I think we end up having to go through a very gross
sequence where we have to mark it invalid before we start mucking
around with it; otherwise a reader can get half-and-half. Perhaps I'm
missing a key restriction here?
>
>
> Note also that once you have (which you don't now) the guarantee
> that your previous PTE is invalid, then you can safely do two normal
> stores instead of a lwarx/stwcx. loop. In any case, having the stw in
> the middle of the loop doesn't sound very useful.
If the pte is invalid, and we're SMP, we need the store to the high
word to occur before the stwcx that sets the valid bit, unless you're
certain we can't have a reader on another cpu. If we're not SMP, then
I agree, let's move that store.
I thought we discussed at some point the possibility that there might
be an updater on another CPU? Is this not possible? If not, I'll
change it. The existing code is also lwarx/stwcxing via pte_update();
is there no reason for that? We should probably change that as well,
if that's the case.
>
>
>> + __asm__ __volatile__("\
>> +1: lwarx %0,0,%4\n\
>> + rlwimi %L2,%0,0,30,30\n\
>> + stw %2,0(%3)\n\
>> + eieio\n\
>> + stwcx. %L2,0,%4\n\
>> + bne- 1b"
>> + : "=&r" (tmp), "=m" (*ptep)
>> + : "r" (pte), "r" (ptep), "r" ((unsigned long)(ptep) + 4),
>> "m" (*ptep)
>> + : "cc");
>> +#endif /* CONFIG_PTE_64BIT */
>> +#else /* _PAGE_HASHPTE == 0 */
>> #if defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
>> __asm__ __volatile__("\
>> stw%U0%X0 %2,%0\n\
>> diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/
>> hash_low_32.S
>> index b9ba7d9..d63e20a 100644
>> --- a/arch/powerpc/mm/hash_low_32.S
>> +++ b/arch/powerpc/mm/hash_low_32.S
>> @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync)
>> * Returns to the caller if the access is illegal or there is no
>> * mapping for the address. Otherwise it places an appropriate PTE
>> * in the hash table and returns from the exception.
>> - * Uses r0, r3 - r8, ctr, lr.
>> + * Uses r0, r3 - r8, r10, ctr, lr.
>> */
>> .text
>> _GLOBAL(hash_page)
>> @@ -106,9 +106,15 @@ _GLOBAL(hash_page)
>> addi r5,r5,swapper_pg_dir@l /* kernel page table */
>> rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
>> 112: add r5,r5,r7 /* convert to phys addr */
>> +#ifndef CONFIG_PTE_64BIT
>> rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
>> lwz r8,0(r5) /* get pmd entry */
>> rlwinm. r8,r8,0,0,19 /* extract address of pte page */
>> +#else
>> + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */
>> + lwzx r8,r8,r5 /* Get L1 entry */
>> + rlwinm. r8,r8,0,0,20 /* extract pt base address */
>> +#endif
>
> Any reason you wrote the above using a different technique ? If you
> believe that rlwinm/lwzx is going to be more efficient than rlwimi/
> lwz,
> maybe we should fix the old one ... or am I missing something totally
> obvious ? :-)
Nothing so exciting there - I borrowed this code from the BookE
version. I'm happy to change it.
Thanks, Ben!
Cheers,
Becky
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-09-02 16:19 ` Becky Bruce
@ 2008-09-02 21:21 ` Benjamin Herrenschmidt
2008-09-03 15:10 ` Becky Bruce
0 siblings, 1 reply; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-02 21:21 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev
> The reason I did that was to distinguish between the size of a
> software PTE entry, and the actual size of the hardware entry. In the
> case of 36b physical, the hardware PTE size is the same as it always
> is; but we've grown the Linux PTE. We can call it something else, or
> I can change as you suggest; I was worried that the next person to
> come along might get confused. I suppose there aren't too many of us
> that are crazy enough to muck around in this code, so maybe it's not
> that big of a deal.
We already called "PTE" the linux entry everywhere so hopefully there
should be no confusion. We call "HPTE" the hash table ones.
> That's from Kumar's tree of changes for BookE.... he'll need to answer
> that. I think he put out a patchset with that work; not sure if it
> was correct in this particular or not.
Well, unless he did some changes in the area of hash flushing, linux
code must -never- clear _PAGE_HASHPTE on 32 bits. Only the assembly hash
flushing code will do it while also flushing the hash table.
> IIRC, we discussed this at some point, months ago, and decided this
> was the case. If it *can* be valid, and it's possible to have a reader
> on another cpu, I think we end up having to go through a very gross
> sequence where we have to mark it invalid before we start mucking
> around with it; otherwise a reader can get half-and-half. Perhaps I'm
> missing a key restriction here?
No that's correct. You can go out of line with something like
	if (unlikely(pte_valid(*ptep))) {
		pte_clear(ptep);
		eieio();	/* make sure the above is visible before
				 * the further stw to the high word is done */
	}
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(ptep, addr);	/* or whatever proto */
I'd rather be safe than sorry in that area. ppc64 does something akin
to the above.
> If the pte is invalid, and we're SMP, we need the store to the high
> word to occur before the stwcx that sets the valid bit, unless you're
> certain we can't have a reader on another cpu. If we're not SMP, then
> I agree, let's move that store.
Yes, the stores must occur in order, but none of them needs to be a
stwcx. if your PTE cannot be modified from underneath you. In any case,
the high word, which isn't atomic, doesn't have to be in the loop; it
should just be before the loop.
> I thought we discussed at some point the possibility that there might
> be an updater on another CPU? Is this not possible? If not, I'll
> change it. The existing code is also lwarx/stwcxing via pte_update();
> is there no reason for that? We should probably change that as well,
> if that's the case.
You are holding the PTE lock, so the only code paths that might
eventually perform an update are the hash miss and the hash flush. The
former will never happen if your PTE is invalid. The latter will never
happen if your PTE has _PAGE_HASHPTE clear, which it should, if you
do the sequence above properly and if I didn't miss anything (which is
always possible :-)
There is indeed the chance that your PTE -will- be concurrently modified
without a lock by an invalidation if you have _PAGE_HASHPTE set and you
don't first ensure the hash has been flushed. In that case, you do need
an atomic operation. Just move the stw to the top word to be before the
loop then (and still keep the pte_clear bit).
Cheers,
Ben.
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-09-02 21:21 ` Benjamin Herrenschmidt
@ 2008-09-03 15:10 ` Becky Bruce
2008-09-04 2:53 ` Benjamin Herrenschmidt
0 siblings, 1 reply; 16+ messages in thread
From: Becky Bruce @ 2008-09-03 15:10 UTC (permalink / raw)
To: benh; +Cc: linuxppc-dev
On Sep 2, 2008, at 4:21 PM, Benjamin Herrenschmidt wrote:
>
>> The reason I did that was to distinguish between the size of a
>> software PTE entry, and the actual size of the hardware entry. In
>> the
>> case of 36b physical, the hardware PTE size is the same as it always
>> is; but we've grown the Linux PTE. We can call it something else, or
>> I can change as you suggest; I was worried that the next person to
>> come along might get confused. I suppose there aren't too many of us
>> that are crazy enough to muck around in this code, so maybe it's not
>> that big of a deal.
>
> We already called "PTE" the linux entry everywhere so hopefully there
> should be no confusion. We call "HPTE" the hash table ones.
Actually, PTE_SIZE is already defined in hash_low_32.S, and it's the
size of the HPTE. I'll rename that to HPTE_SIZE and use PTE_SIZE for
the linux PTE.
-Becky
* Re: [PATCH v2] POWERPC: Allow 32-bit pgtable code to support 36-bit physical
2008-09-03 15:10 ` Becky Bruce
@ 2008-09-04 2:53 ` Benjamin Herrenschmidt
0 siblings, 0 replies; 16+ messages in thread
From: Benjamin Herrenschmidt @ 2008-09-04 2:53 UTC (permalink / raw)
To: Becky Bruce; +Cc: linuxppc-dev
On Wed, 2008-09-03 at 10:10 -0500, Becky Bruce wrote:
> Actually, PTE_SIZE is already defined in hash_low_32.S, and it's the
> size of the HPTE. I'll rename that to HPTE_SIZE and use PTE_SIZE for
> the linux PTE.
Good idea. Thanks.
Cheers,
Ben.