From: Balbir Singh <bsingharora@gmail.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
benh@kernel.crashing.org, paulus@samba.org, mpe@ellerman.id.au
Cc: linuxppc-dev@lists.ozlabs.org
Subject: Re: [RFC PATCH] powerpc/mm: Use big endian page table for book3s 64
Date: Mon, 29 Feb 2016 12:59:55 +1100 [thread overview]
Message-ID: <56D3A61B.5040308@gmail.com> (raw)
In-Reply-To: <1456458814-7497-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
On 26/02/16 14:53, Aneesh Kumar K.V wrote:
> This enables us to share the same page table code for
> both radix and hash. Radix use a hardware defined big endian
^uses
> page table
>
> Asm -> C conversion makes it simpler to build code for both little
> and big endian page table.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> Note:
> Any suggestion on how we can do that pte update better so that we can build
> a LE and BE page table kernel will be helpful.
Ideally this should not break software compatibility for VM migration, but might be worth testing. Basically a hypervisor with BE page tables and software endian older kernel instance. Also look for any tools that work off of saved dump images with PTE entries in them - crash/kdump/etc
> arch/powerpc/include/asm/book3s/64/hash.h | 75 ++++++++++++--------
> arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++--
> arch/powerpc/include/asm/page.h | 4 ++
> arch/powerpc/include/asm/pgtable-be-types.h | 104 ++++++++++++++++++++++++++++
> arch/powerpc/mm/hash64_4k.c | 6 +-
> arch/powerpc/mm/hash64_64k.c | 11 +--
> arch/powerpc/mm/hugepage-hash64.c | 5 +-
> arch/powerpc/mm/hugetlbpage-hash64.c | 5 +-
> arch/powerpc/mm/pgtable-hash64.c | 42 +++++------
> 9 files changed, 197 insertions(+), 67 deletions(-)
> create mode 100644 arch/powerpc/include/asm/pgtable-be-types.h
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
> index 9b451cb8294a..9153bda5f395 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -1,6 +1,9 @@
> #ifndef _ASM_POWERPC_BOOK3S_64_HASH_H
> #define _ASM_POWERPC_BOOK3S_64_HASH_H
> #ifdef __KERNEL__
> +#ifndef __ASSEMBLY__
> +#include <asm/cmpxchg.h>
> +#endif
Do we still need PTE_ATOMIC_UPDATE as 1 after these changes?
>
> /*
> * Common bits between 4K and 64K pages in a linux-style PTE.
> @@ -249,27 +252,35 @@ static inline unsigned long pte_update(struct mm_struct *mm,
> unsigned long set,
> int huge)
> {
> - unsigned long old, tmp;
> -
> - __asm__ __volatile__(
> - "1: ldarx %0,0,%3 # pte_update\n\
> - andi. %1,%0,%6\n\
> - bne- 1b \n\
> - andc %1,%0,%4 \n\
> - or %1,%1,%7\n\
> - stdcx. %1,0,%3 \n\
> - bne- 1b"
> - : "=&r" (old), "=&r" (tmp), "=m" (*ptep)
> - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
> - : "cc" );
> + pte_t pte;
> + unsigned long old_pte, new_pte;
> +
> + do {
> +reload:
> + pte = READ_ONCE(*ptep);
> + old_pte = pte_val(pte);
> +
> + /* If PTE busy, retry */
> + if (unlikely(old_pte & _PAGE_BUSY))
> + goto reload;
A loop within another? goto to upward labels can be ugly..
do {
pte = READ_ONCE(*ptep);
old_pte = pte_val(pte);
while (unlikely(old_pte & _PAGE_BUSY)) {
cpu_relax(); /* Do we need this? */
pte = READ_ONCE(*ptep);
old_pte = pte_val(pte);
}
The above four lines can be abstracted further to loop_while_page_busy() if required :)
> + /*
> + * Try to lock the PTE, add ACCESSED and DIRTY if it was
> + * a write access. Since this is 4K insert of 64K page size
> + * also add _PAGE_COMBO
> + */
> + new_pte = (old_pte | set) & ~clr;
> +
> + } while (cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> /* huge pages use the old page table lock */
> if (!huge)
> assert_pte_locked(mm, addr);
>
> - if (old & _PAGE_HASHPTE)
> - hpte_need_flush(mm, addr, ptep, old, huge);
> + if (old_pte & _PAGE_HASHPTE)
> + hpte_need_flush(mm, addr, ptep, old_pte, huge);
>
> - return old;
> + return old_pte;
> }
>
> /*
> @@ -317,22 +328,30 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
> */
> static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
> {
> + pte_t pte;
> + unsigned long old_pte, new_pte;
> unsigned long bits = pte_val(entry) &
> (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC |
> _PAGE_SOFT_DIRTY);
>
> - unsigned long old, tmp;
> -
> - __asm__ __volatile__(
> - "1: ldarx %0,0,%4\n\
> - andi. %1,%0,%6\n\
> - bne- 1b \n\
> - or %0,%3,%0\n\
> - stdcx. %0,0,%4\n\
> - bne- 1b"
> - :"=&r" (old), "=&r" (tmp), "=m" (*ptep)
> - :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
> - :"cc");
> + do {
> +reload:
> + pte = READ_ONCE(*ptep);
> + old_pte = pte_val(pte);
> +
> + /* If PTE busy, retry */
> + if (unlikely(old_pte & _PAGE_BUSY))
> + goto reload;
> + /*
> + * Try to lock the PTE, add ACCESSED and DIRTY if it was
> + * a write access. Since this is 4K insert of 64K page size
> + * also add _PAGE_COMBO
> + */
> + new_pte = old_pte | bits;
> +
> + } while (cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> }
Can we abstract the above code into a smaller abstraction
>
> static inline int pgd_bad(pgd_t pgd)
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 2aa79c864e91..508c3741fd6f 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -299,6 +299,7 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
> */
> static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
> {
> + unsigned long old_ptev;
> pte_t old_pte, new_pte = __pte(0);
>
> while (1) {
> @@ -306,24 +307,25 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
> * Make sure we don't reload from ptep
> */
> old_pte = READ_ONCE(*ptep);
> + old_ptev = pte_val(old_pte);
> /*
> * wait until _PAGE_BUSY is clear then set it atomically
> */
> - if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) {
> + if (unlikely(old_ptev & _PAGE_BUSY)) {
> cpu_relax();
> continue;
> }
> /* If pte is not present return None */
> - if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
> + if (unlikely(!(old_ptev & _PAGE_PRESENT)))
> return __pte(0);
>
> new_pte = pte_mkyoung(old_pte);
> if (writing && pte_write(old_pte))
> new_pte = pte_mkdirty(new_pte);
>
> - if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep,
> - pte_val(old_pte),
> - pte_val(new_pte))) {
> + if (cpu_to_be64(old_ptev) == __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_ptev),
> + cpu_to_be64(pte_val(new_pte)))) {
> break;
> }
> }
The while(1) style here looks cleaner than the do {} while() style above
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index ab3d8977bacd..158574d2acf4 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -288,7 +288,11 @@ extern long long virt_phys_offset;
>
> #ifndef __ASSEMBLY__
>
> +#ifdef CONFIG_PPC_BOOK3S_64
> +#include <asm/pgtable-be-types.h>
> +#else
> #include <asm/pgtable-types.h>
> +#endif
>
> typedef struct { signed long pd; } hugepd_t;
>
> diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
> new file mode 100644
> index 000000000000..20527200d6ae
> --- /dev/null
> +++ b/arch/powerpc/include/asm/pgtable-be-types.h
> @@ -0,0 +1,104 @@
> +#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H
> +#define _ASM_POWERPC_PGTABLE_BE_TYPES_H
> +
> +#ifdef CONFIG_STRICT_MM_TYPECHECKS
> +/* These are used to make use of C type-checking. */
> +
> +/* PTE level */
> +typedef struct { __be64 pte; } pte_t;
> +#define __pte(x) ((pte_t) { cpu_to_be64(x) })
> +static inline unsigned long pte_val(pte_t x)
> +{
> + return be64_to_cpu(x.pte);
> +}
> +
> +/* PMD level */
> +#ifdef CONFIG_PPC64
> +typedef struct { __be64 pmd; } pmd_t;
> +#define __pmd(x) ((pmd_t) { cpu_to_be64(x) })
> +static inline unsigned long pmd_val(pmd_t x)
> +{
> + return be64_to_cpu(x.pmd);
> +}
> +
> +/*
> + * 64 bit hash always use 4 level table. Everybody else use 4 level
> + * only for 4K page size.
> + */
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
> +typedef struct { __be64 pud; } pud_t;
> +#define __pud(x) ((pud_t) { cpu_to_be64(x) })
> +static inline unsigned long pud_val(pud_t x)
> +{
> + return be64_to_cpu(x.pud);
> +}
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC64 */
> +
> +/* PGD level */
> +typedef struct { __be64 pgd; } pgd_t;
> +#define __pgd(x) ((pgd_t) { cpu_to_be64(x) })
> +static inline unsigned long pgd_val(pgd_t x)
> +{
> + return be64_to_cpu(x.pgd);
> +}
> +
> +/* Page protection bits */
> +typedef struct { unsigned long pgprot; } pgprot_t;
> +#define pgprot_val(x) ((x).pgprot)
> +#define __pgprot(x) ((pgprot_t) { (x) })
> +
> +#else
> +
> +/*
> + * .. while these make it easier on the compiler
> + */
> +
> +typedef __be64 pte_t;
> +#define __pte(x) cpu_to_be64(x)
> +static inline unsigned long pte_val(pte_t pte)
> +{
> + return be64_to_cpu(pte);
> +}
> +
> +#ifdef CONFIG_PPC64
> +typedef __be64 pmd_t;
> +#define __pmd(x) cpu_to_be64(x)
> +static inline unsigned long pmd_val(pmd_t pmd)
> +{
> + return be64_to_cpu(pmd);
> +}
> +
> +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
> +typedef __be64 pud_t;
> +#define __pud(x) cpu_to_be64(x)
> +static inline unsigned long pud_val(pud_t pud)
> +{
> + return be64_to_cpu(pud);
> +}
> +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
> +#endif /* CONFIG_PPC64 */
> +
> +typedef __be64 pgd_t;
> +#define __pgd(x) cpu_to_be64(x)
> +static inline unsigned long pgd_val(pgd_t pgd)
> +{
> + return be64_to_cpu(pgd);
> +}
> +
> +typedef unsigned long pgprot_t;
> +#define pgprot_val(x) (x)
> +#define __pgprot(x) (x)
> +
> +#endif /* CONFIG_STRICT_MM_TYPECHECKS */
> +/*
> + * With hash config 64k pages additionally define a bigger "real PTE" type that
> + * gathers the "second half" part of the PTE for pseudo 64k pages
> + */
> +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
> +typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
> +#else
> +typedef struct { pte_t pte; } real_pte_t;
> +#endif
> +
> +#endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
> diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
> index 47d1b26effc6..1a862eb6fef1 100644
> --- a/arch/powerpc/mm/hash64_4k.c
> +++ b/arch/powerpc/mm/hash64_4k.c
> @@ -47,8 +47,10 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
> new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
> if (access & _PAGE_RW)
> new_pte |= _PAGE_DIRTY;
> - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
> - old_pte, new_pte));
> +
> + } while (cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> /*
> * PP bits. _PAGE_USER is already PP bit 0x2, so we only
> * need to add in 0x1 if it's a read-only user page
> diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
> index b2d659cf51c6..976eb5f6e492 100644
> --- a/arch/powerpc/mm/hash64_64k.c
> +++ b/arch/powerpc/mm/hash64_64k.c
> @@ -79,8 +79,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
> new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO;
> if (access & _PAGE_RW)
> new_pte |= _PAGE_DIRTY;
> - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
> - old_pte, new_pte));
> + } while (cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> /*
> * Handle the subpage protection bits
> */
> @@ -254,9 +255,9 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
> new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
> if (access & _PAGE_RW)
> new_pte |= _PAGE_DIRTY;
> - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
> - old_pte, new_pte));
> -
> + } while (cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> rflags = htab_convert_pte_flags(new_pte);
>
> if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
> diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
> index eb2accdd76fd..d9675f1d606e 100644
> --- a/arch/powerpc/mm/hugepage-hash64.c
> +++ b/arch/powerpc/mm/hugepage-hash64.c
> @@ -49,8 +49,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
> new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
> if (access & _PAGE_RW)
> new_pmd |= _PAGE_DIRTY;
> - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
> - old_pmd, new_pmd));
> + } while (cpu_to_be64(old_pmd) != __cmpxchg_u64((unsigned long *)pmdp,
> + cpu_to_be64(old_pmd),
> + cpu_to_be64(new_pmd)));
> rflags = htab_convert_pte_flags(new_pmd);
>
> #if 0
> diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
> index db17ff3f4864..0a1f87ffde8c 100644
> --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> @@ -68,8 +68,9 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
> new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
> if (access & _PAGE_RW)
> new_pte |= _PAGE_DIRTY;
> - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
> - old_pte, new_pte));
> + } while(cpu_to_be64(old_pte) != __cmpxchg_u64((unsigned long *)ptep,
> + cpu_to_be64(old_pte),
> + cpu_to_be64(new_pte)));
> rflags = htab_convert_pte_flags(new_pte);
>
> sz = ((1UL) << shift);
> diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
> index cbd81345fdec..b5e7e8efef1e 100644
> --- a/arch/powerpc/mm/pgtable-hash64.c
> +++ b/arch/powerpc/mm/pgtable-hash64.c
> @@ -281,34 +281,30 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
> pmd_t *pmdp, unsigned long clr,
> unsigned long set)
> {
> -
> - unsigned long old, tmp;
> + pmd_t pmd;
> + unsigned long old_pmd, new_pmd;
>
> #ifdef CONFIG_DEBUG_VM
> WARN_ON(!pmd_trans_huge(*pmdp));
> assert_spin_locked(&mm->page_table_lock);
> #endif
> -
> -#ifdef PTE_ATOMIC_UPDATES
> - __asm__ __volatile__(
> - "1: ldarx %0,0,%3\n\
> - andi. %1,%0,%6\n\
> - bne- 1b \n\
> - andc %1,%0,%4 \n\
> - or %1,%1,%7\n\
> - stdcx. %1,0,%3 \n\
> - bne- 1b"
> - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
> - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
> - : "cc" );
> -#else
> - old = pmd_val(*pmdp);
> - *pmdp = __pmd((old & ~clr) | set);
> -#endif
> - trace_hugepage_update(addr, old, clr, set);
> - if (old & _PAGE_HASHPTE)
> - hpte_do_hugepage_flush(mm, addr, pmdp, old);
> - return old;
> + do {
> +reload:
> + pmd = READ_ONCE(*pmdp);
> + old_pmd = pmd_val(pmd);
> +
> + /* If PTE busy, retry */
> + if (unlikely(old_pmd & _PAGE_BUSY))
> + goto reload;
> + new_pmd = (old_pmd | set) & ~clr;
> +
> + } while (cpu_to_be64(old_pmd) != __cmpxchg_u64((unsigned long *)pmdp,
> + cpu_to_be64(old_pmd),
> + cpu_to_be64(new_pmd)));
> + trace_hugepage_update(addr, old_pmd, clr, set);
> + if (old_pmd & _PAGE_HASHPTE)
> + hpte_do_hugepage_flush(mm, addr, pmdp, old_pmd);
> + return old_pmd;
> }
>
> pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
Balbir Singh.
next prev parent reply other threads:[~2016-02-29 2:00 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-02-26 3:53 [RFC PATCH] powerpc/mm: Use big endian page table for book3s 64 Aneesh Kumar K.V
2016-02-29 1:59 ` Balbir Singh [this message]
2016-02-29 2:58 ` Balbir Singh
2016-03-01 5:31 ` Aneesh Kumar K.V
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=56D3A61B.5040308@gmail.com \
--to=bsingharora@gmail.com \
--cc=aneesh.kumar@linux.vnet.ibm.com \
--cc=benh@kernel.crashing.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=mpe@ellerman.id.au \
--cc=paulus@samba.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.