* [RFC PATCH 09/17] powerpc/mm: Use encode avpn where we need only avpn values
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/mmu-hash64.h | 8 ++++----
arch/powerpc/mm/hash_native_64.c | 10 +++++-----
arch/powerpc/platforms/pseries/lpar.c | 2 +-
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 6ec65b6..aeeee5e 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -237,14 +237,14 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
/*
* This function sets the AVPN and L fields of the HPTE appropriately
- * for the page size
+ * using the base page size and actual page size.
*/
-static inline unsigned long hpte_encode_v(unsigned long vpn,
- int psize, int ssize)
+static inline unsigned long hpte_encode_v(unsigned long vpn, int psize,
+ int apsize, int ssize)
{
unsigned long v;
v = hpte_encode_avpn(vpn, psize, ssize);
- if (psize != MMU_PAGE_4K)
+ if (apsize != MMU_PAGE_4K)
v |= HPTE_V_LARGE;
return v;
}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index da46cd3..4cf361f 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -216,7 +216,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
if (i == HPTES_PER_GROUP)
return -1;
- hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
@@ -327,7 +327,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
int ret = 0;
int actual_psize;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -364,7 +364,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
@@ -427,7 +427,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = hptep->v;
@@ -599,7 +599,7 @@ static void native_flush_hash_range(unsigned long number, int local)
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = hptep->v;
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 9f99847..ca9c2bb 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -121,7 +121,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
"pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
hpte_group, vpn, pa, rflags, vflags, psize);
- hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED))
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 13/17] powerpc/THP: Add code to handle HPTE faults for large pages
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We now have pmd entries covering to 16MB range. To implement THP on powerpc,
we double the size of PMD. The second half is used to deposit the pgtable (PTE page).
We also use the depoisted PTE page for tracking the HPTE information. The information
include [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need
4096 entries. Both will fit in a 4K PTE page.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/mmu-hash64.h | 5 +
arch/powerpc/include/asm/pgtable-ppc64.h | 33 ++----
arch/powerpc/kernel/io-workarounds.c | 2 +-
arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +-
arch/powerpc/kvm/book3s_hv_rm_mmu.c | 5 +-
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/hash_utils_64.c | 12 ++-
arch/powerpc/mm/hugetlbpage.c | 19 +++-
arch/powerpc/mm/largepage-hash64.c | 170 ++++++++++++++++++++++++++++++
arch/powerpc/mm/pgtable.c | 34 ++++++
arch/powerpc/mm/tlb_hash64.c | 2 +-
arch/powerpc/perf/callchain.c | 2 +-
arch/powerpc/platforms/pseries/eeh.c | 2 +-
13 files changed, 248 insertions(+), 41 deletions(-)
create mode 100644 arch/powerpc/mm/largepage-hash64.c
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index aeeee5e..f1024c8 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -319,6 +319,11 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, int local, int ssize,
unsigned int shift, unsigned int mmu_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __hash_page_thp(unsigned long ea, unsigned long access,
+ unsigned long vsid, pmd_t *pmdp, unsigned long trap,
+ int local, int ssize, unsigned int psize);
+#endif
extern void hash_failure_debug(unsigned long ea, unsigned long access,
unsigned long vsid, unsigned long trap,
int ssize, int psize, int lpsize,
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 0da8840..d9579a5 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -350,39 +350,18 @@ static inline void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
return __pgtable_cache_add(shift, sizeof(void *) << shift, ctor);
}
-/*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory. If not found, it returns zero.
- */
-static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
-{
- pgd_t *pg;
- pud_t *pu;
- pmd_t *pm;
- pte_t *pt = NULL;
-
- pg = pgdir + pgd_index(ea);
- if (!pgd_none(*pg)) {
- pu = pud_offset(pg, ea);
- if (!pud_none(*pu)) {
- pm = pmd_offset(pu, ea);
- if (pmd_present(*pm))
- pt = pte_offset_kernel(pm, ea);
- }
- }
- return pt;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE)
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
- unsigned *shift);
+ unsigned *shift, unsigned int *thp);
#else
+pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *thp);
static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
- unsigned *shift)
+ unsigned *shift,
+ unsigned int *thp)
{
if (shift)
*shift = 0;
- return find_linux_pte(pgdir, ea);
+ return find_linux_pte(pgdir, ea, thp);
}
#endif /* !CONFIG_HUGETLB_PAGE */
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7..a37c5d2 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -70,7 +70,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
return NULL;
- ptep = find_linux_pte(init_mm.pgd, vaddr);
+ ptep = find_linux_pte(init_mm.pgd, vaddr, NULL);
if (ptep == NULL)
paddr = 0;
else
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8cc18ab..4f2a7dc 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -683,7 +683,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
rcu_read_lock_sched();
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
- hva, NULL);
+ hva, NULL, NULL);
if (ptep && pte_present(*ptep)) {
pte = kvmppc_read_update_linux_pte(ptep, 1);
if (pte_write(pte))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19c93ba..5a9b7f6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
unsigned long addr = (unsigned long) x;
pte_t *p;
- p = find_linux_pte(swapper_pg_dir, addr);
+ p = find_linux_pte(swapper_pg_dir, addr, NULL);
if (!p || !pte_present(*p))
return NULL;
/* assume we don't have huge pages in vmalloc space... */
@@ -145,6 +145,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unlock_rmap(rmap);
}
+/* FIXME!! check */
static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
int writing, unsigned long *pte_sizep)
{
@@ -152,7 +153,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
unsigned long ps = *pte_sizep;
unsigned int shift;
- ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift, NULL);
if (!ptep)
return __pte(0);
if (shift)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3787b61..6b09f9d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -33,6 +33,7 @@ obj-y += hugetlbpage.o
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += largepage-hash64.o
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a06b55a..3a1752f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -939,7 +939,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
unsigned long vsid;
struct mm_struct *mm;
pte_t *ptep;
- unsigned hugeshift;
+ unsigned hugeshift, thp;
const struct cpumask *tmp;
int rc, user_region = 0, local = 0;
int psize, ssize;
@@ -1005,7 +1005,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#endif /* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
- ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift, &thp);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
return 1;
@@ -1028,6 +1028,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
ssize, hugeshift, psize);
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (thp)
+ return __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, local, ssize, psize);
+#endif
+
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
#else
@@ -1133,7 +1139,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
- ptep = find_linux_pte(pgdir, ea);
+ ptep = find_linux_pte(pgdir, ea, NULL);
if (!ptep)
return;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1a6de0a..bce7a9f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -67,7 +67,8 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
#define hugepd_none(hpd) ((hpd).pd == 0)
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+ unsigned *shift, unsigned int *thp)
{
pgd_t *pg;
pud_t *pu;
@@ -77,6 +78,8 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
if (shift)
*shift = 0;
+ if (thp)
+ *thp = 0;
pg = pgdir + pgd_index(ea);
if (is_hugepd(pg)) {
@@ -91,12 +94,20 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
pm = pmd_offset(pu, ea);
if (is_hugepd(pm))
hpdp = (hugepd_t *)pm;
- else if (!pmd_none(*pm)) {
+ else if (pmd_large(*pm)) {
+ /* THP page */
+ if (thp)
+ *thp = 1;
+ /*
+ * This should be ok, except for few flags
+ * most of the pte, large page pmd bits map
+ */
+ return (pte_t *)pm;
+ } else if (!pmd_none(*pm)) {
return pte_offset_kernel(pm, ea);
}
}
}
-
if (!hpdp)
return NULL;
@@ -614,7 +625,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
unsigned shift;
unsigned long mask;
- ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+ ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift, NULL);
/* Verify it is a huge page else bail. */
if (!ptep || !shift)
diff --git a/arch/powerpc/mm/largepage-hash64.c b/arch/powerpc/mm/largepage-hash64.c
new file mode 100644
index 0000000..2a5fc39
--- /dev/null
+++ b/arch/powerpc/mm/largepage-hash64.c
@@ -0,0 +1,170 @@
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/udbg.h>
+
+/*
+ * A linux huge page PMD was changed and the corresponding hash table entry
+ * neesd to be flushed. FIXME!! there is no batching support yet.
+ *
+ * The linux huge page PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
+ * With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need
+ * 4096 entries. Both will fit in a 4K pgtable_t.
+ */
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+ pmd_t *pmdp, unsigned long trap, int local, int ssize,
+ unsigned int psize)
+{
+ unsigned int index, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long rflags, pa, hidx;
+ unsigned long old_pmd, new_pmd;
+ int ret, lpsize = MMU_PAGE_16M;
+ unsigned long vpn, hash, shift, slot;
+
+ /*
+ * atomically mark the linux large page PMD busy and dirty
+ */
+ do {
+ old_pmd = pmd_val(*pmdp);
+ /* If PMD busy, retry the access */
+ if (unlikely(old_pmd & PMD_HUGE_BUSY))
+ return 0;
+ /* If PMD permissions don't match, take page fault */
+ if (unlikely(access & ~old_pmd))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pmd = old_pmd | PMD_HUGE_BUSY | PMD_HUGE_ACCESSED;
+ if (access & _PAGE_RW)
+ new_pmd |= PMD_HUGE_DIRTY;
+ } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+ old_pmd, new_pmd));
+ /*
+ * derive the rflags. Default enable read (0x2)
+ */
+ rflags = 0x2 | (!(new_pmd & PMD_HUGE_RW));
+ /* PMD_HUGE_EXEC -> HW_NO_EXEC since it's inverted */
+ rflags |= ((new_pmd & PMD_HUGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0 /* FIXME!! */
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+#endif
+ /*
+ * Find the slot index details for this ea, using base page size.
+ */
+ shift = mmu_psize_defs[psize].shift;
+ index = (ea & (HUGE_PAGE_SIZE - 1)) >> shift;
+ BUG_ON(index > 4096);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ /*
+ * The hpte hindex are stored in the pgtable whose address is in the
+ * second half of the PMD
+ */
+ hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
+
+ valid = hpte_slot_array[index] & 0x1;
+ if (unlikely(valid)) {
+ /* update the hpte bits */
+ hidx = hpte_slot_array[index] >> 1;
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+ psize, ssize, local);
+ /*
+ * We failed to update, try to insert a new entry.
+ */
+ if (ret == -1) {
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ hpte_slot_array[index] = 0;
+ valid = 0;
+ }
+ }
+
+ if (likely(!valid)) {
+ unsigned long hpte_group;
+
+ /* insert new entry */
+ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+ hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~PMD_HUGE_HPTEFLAGS) | PMD_HUGE_HASHPTE;
+
+#if 0
+ /* Add in WIMG bits. FIXME!! enabled by default */
+ rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+ _PAGE_COHERENT | _PAGE_GUARDED));
+#endif
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pmd and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *pmdp = __pmd(old_pmd);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ psize, lpsize, old_pmd);
+ return -1;
+ }
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ hpte_slot_array[index] = slot << 1 | 0x1;
+ }
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *pmdp = __pmd(new_pmd & ~PMD_HUGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e173b5e..841271f 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -580,3 +580,37 @@ struct page *pmd_page(pmd_t pmd)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory. If not found, it returns zero.
+ */
+pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *thp)
+{
+ pgd_t *pg;
+ pud_t *pu;
+ pmd_t *pm;
+ pte_t *pt = NULL;
+
+ if (thp)
+ *thp = 0;
+ pg = pgdir + pgd_index(ea);
+ if (!pgd_none(*pg)) {
+ pu = pud_offset(pg, ea);
+ if (!pud_none(*pu)) {
+ pm = pmd_offset(pu, ea);
+ if (pmd_large(*pm)) {
+ /* THP page */
+ if (thp)
+ *thp = 1;
+ /*
+ * This should be ok, except for few flags
+ * most of the pte, large page pmd bits map
+ */
+ return (pte_t *)pm;
+ } else if (pmd_present(*pm))
+ pt = pte_offset_kernel(pm, ea);
+ }
+ }
+ return pt;
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a..9a951d5 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -206,7 +206,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte(mm->pgd, start);
+ pte_t *ptep = find_linux_pte(mm->pgd, start, NULL);
unsigned long pte;
if (ptep == NULL)
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 74d1e78..578cac7 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -125,7 +125,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
if (!pgdir)
return -EFAULT;
- ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+ ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift, NULL);
if (!shift)
shift = PAGE_SHIFT;
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 9a04322..d6f8f0e 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -261,7 +261,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
pte_t *ptep;
unsigned long pa;
- ptep = find_linux_pte(init_mm.pgd, token);
+ ptep = find_linux_pte(init_mm.pgd, token, NULL);
if (!ptep)
return token;
pa = pte_pfn(*ptep) << PAGE_SHIFT;
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 06/17] powerpc/mm: Decode the pte-lp-encoding bits correctly.
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We look at both the segment base page size and actual page size and store
the pte-lp-encodings in an array per base page size.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/machdep.h | 3 +-
arch/powerpc/include/asm/mmu-hash64.h | 12 ++--
arch/powerpc/mm/hash_low_64.S | 18 ++++--
arch/powerpc/mm/hash_native_64.c | 105 ++++++++++++++++++++++++---------
arch/powerpc/mm/hash_utils_64.c | 103 +++++++++++++++++++-------------
arch/powerpc/platforms/pseries/lpar.c | 4 +-
6 files changed, 163 insertions(+), 82 deletions(-)
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 19d9d96..6cee6e0 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -50,7 +50,8 @@ struct machdep_calls {
unsigned long prpn,
unsigned long rflags,
unsigned long vflags,
- int psize, int ssize);
+ int psize, int apsize,
+ int ssize);
long (*hpte_remove)(unsigned long hpte_group);
void (*hpte_removebolted)(unsigned long ea,
int psize, int ssize);
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index c3b3518..6290e26 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -154,7 +154,7 @@ extern unsigned long htab_hash_mask;
struct mmu_psize_def
{
unsigned int shift; /* number of bits */
- unsigned int penc; /* HPTE encoding */
+ unsigned int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
unsigned int tlbiel; /* tlbiel supported for that page size */
unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
@@ -254,16 +254,18 @@ static inline unsigned long hpte_encode_v(unsigned long vpn,
* for the page size. We assume the pa is already "clean" that is properly
* aligned for the requested page size
*/
-static inline unsigned long hpte_encode_r(unsigned long pa, int psize)
+static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
+ int actual_psize)
{
unsigned long r;
/* A 4K page needs no special encoding */
- if (psize == MMU_PAGE_4K)
+ if (actual_psize == MMU_PAGE_4K)
return pa & HPTE_R_RPN;
else {
- unsigned int penc = mmu_psize_defs[psize].penc;
- unsigned int shift = mmu_psize_defs[psize].shift;
+ unsigned int penc = mmu_psize_defs[base_psize].penc[actual_psize];
+ unsigned int shift = mmu_psize_defs[actual_psize].shift;
+ /* FIXME!! replace 12 by LP_SHIFT ? */
return (pa & ~((1ul << shift) - 1)) | (penc << 12);
}
return r;
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index abdd5e2..0e980ac 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -196,7 +196,8 @@ htab_insert_pte:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(htab_call_hpte_insert1)
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
@@ -219,7 +220,8 @@ _GLOBAL(htab_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(htab_call_hpte_insert2)
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
@@ -515,7 +517,8 @@ htab_special_pfn:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(htab_call_hpte_insert1)
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
@@ -542,7 +545,8 @@ _GLOBAL(htab_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(htab_call_hpte_insert2)
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
@@ -840,7 +844,8 @@ ht64_insert_pte:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(ht64_call_hpte_insert1)
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
@@ -863,7 +868,8 @@ _GLOBAL(ht64_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARAM(R9)(r1) /* segment size */
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
_GLOBAL(ht64_call_hpte_insert2)
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ffc1e00..16ba033 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -36,10 +36,14 @@
#endif
#define HPTE_LOCK_BIT 3
+#define LP_SHIFT 12
+#define LP_BITS 8
+#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
+
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long vpn, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
@@ -68,7 +72,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc;
+ penc = mmu_psize_defs[psize].penc[apsize];
va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
@@ -80,7 +84,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
}
}
-static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
@@ -102,7 +106,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc;
+ penc = mmu_psize_defs[psize].penc[apsize];
va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
@@ -114,7 +118,8 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
}
-static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+ int ssize, int local)
{
unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
@@ -125,10 +130,10 @@ static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
- __tlbiel(vpn, psize, ssize);
+ __tlbiel(vpn, psize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -156,7 +161,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep)
static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
+ unsigned long vflags, int psize, int apsize, int ssize)
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
@@ -184,7 +189,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
return -1;
hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize) | rflags;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
@@ -244,6 +249,47 @@ static long native_hpte_remove(unsigned long hpte_group)
return i;
}
+static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
+{
+ unsigned int mask;
+ int i, penc, shift;
+ unsigned int lp = (hptep->r >> LP_SHIFT) & LP_BITS;
+
+#if 0
+ /*
+ * FIXME!! hpte_decode have more tricks. why not
+ * How do we find how many bits need to be used for r and z ?
+ */
+ for (i = 0; i < LP_BITS; i++) {
+ if ((hptep->r & LP_MASK(i+1)) == LP_MASK(i+1))
+ break;
+ }
+ penc = LP_MASK(i+1) >> LP_SHIFT;
+ for (i = 0; i < MMU_PAGE_COUNT; i++) {
+ if (penc == mmu_psize_defs[psize].penc[i])
+ return i;
+ }
+ return -1;
+#else
+ penc = 0;
+ /* is this better ? */
+ for (i = 0; i < MMU_PAGE_COUNT; i++) {
+ /* valid entries have a shift value */
+ if (!mmu_psize_defs[i].shift)
+ continue;
+
+ /* encoding bits per actual page size */
+ shift = mmu_psize_defs[i].shift - 11;
+ if (shift > 9)
+ shift = 9;
+ mask = (1 << shift) - 1;
+ if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+ return i;
+ }
+ return -1;
+#endif
+}
+
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
unsigned long vpn, int psize, int ssize,
int local)
@@ -251,6 +297,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0;
+ int actual_psize;
want_v = hpte_encode_v(vpn, psize, ssize);
@@ -260,6 +307,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
native_lock_hpte(hptep);
hpte_v = hptep->v;
+ actual_psize = hpte_actual_psize(hptep, psize);
/* Even if we miss, we need to invalidate the TLB */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
@@ -274,7 +322,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
native_unlock_hpte(hptep);
/* Ensure it is out of the tlb too. */
- tlbie(vpn, psize, ssize, local);
+ tlbie(vpn, psize, actual_psize, ssize, local);
return ret;
}
@@ -315,6 +363,7 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
int psize, int ssize)
{
+ int actual_psize;
unsigned long vpn;
unsigned long vsid;
long slot;
@@ -327,13 +376,14 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
if (slot == -1)
panic("could not find page to bolt\n");
hptep = htab_address + slot;
+ actual_psize = hpte_actual_psize(hptep, psize);
/* Update the HPTE */
hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
(newpp & (HPTE_R_PP | HPTE_R_N));
/* Ensure it is out of the tlb too. */
- tlbie(vpn, psize, ssize, 0);
+ tlbie(vpn, psize, actual_psize, ssize, 0);
}
static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
@@ -343,6 +393,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
unsigned long hpte_v;
unsigned long want_v;
unsigned long flags;
+ int actual_psize;
local_irq_save(flags);
@@ -352,6 +403,7 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
native_lock_hpte(hptep);
hpte_v = hptep->v;
+ actual_psize = hpte_actual_psize(hptep, psize);
/* Even if we miss, we need to invalidate the TLB */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
@@ -360,23 +412,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
hptep->v = 0;
/* Invalidate the TLB */
- tlbie(vpn, psize, ssize, local);
+ tlbie(vpn, psize, actual_psize, ssize, local);
local_irq_restore(flags);
}
-#define LP_SHIFT 12
-#define LP_BITS 8
-#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
-
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
- int *psize, int *ssize, unsigned long *vpn)
+ int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
unsigned long avpn, pteg, vpi;
unsigned long hpte_r = hpte->r;
unsigned long hpte_v = hpte->v;
unsigned long vsid, seg_off;
- int i, size, shift, penc;
+ int i, size, a_size = MMU_PAGE_4K, shift, penc;
if (!(hpte_v & HPTE_V_LARGE))
size = MMU_PAGE_4K;
@@ -395,12 +443,13 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
-
- if (penc == mmu_psize_defs[size].penc)
- break;
+ for (a_size = 0; a_size < MMU_PAGE_COUNT; a_size++)
+ if (penc == mmu_psize_defs[size].penc[a_size])
+ goto out;
}
}
+out:
/* This works for all page sizes, and for 256M and 1T segments */
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
@@ -433,7 +482,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
default:
*vpn = size = 0;
}
- *psize = size;
+ *psize = size;
+ *apsize = a_size;
}
/*
@@ -451,7 +501,7 @@ static void native_hpte_clear(void)
struct hash_pte *hptep = htab_address;
unsigned long hpte_v;
unsigned long pteg_count;
- int psize, ssize;
+ int psize, apsize, ssize;
pteg_count = htab_hash_mask + 1;
@@ -477,9 +527,9 @@ static void native_hpte_clear(void)
* already hold the native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
- hpte_decode(hptep, slot, &psize, &ssize, &vpn);
+ hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
}
}
@@ -491,6 +541,7 @@ static void native_hpte_clear(void)
/*
* Batched hash table flush, we batch the tlbie's to avoid taking/releasing
* the lock all the time
+ * FIXME!! large page support needed ?
*/
static void native_flush_hash_range(unsigned long number, int local)
{
@@ -540,7 +591,7 @@ static void native_flush_hash_range(unsigned long number, int local)
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
- __tlbiel(vpn, psize, ssize);
+ __tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("ptesync":::"memory");
@@ -557,7 +608,7 @@ static void native_flush_hash_range(unsigned long number, int local)
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bfeab83..48edb46 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -125,7 +125,7 @@ static struct mmu_psize_def mmu_psize_defaults_old[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc[MMU_PAGE_4K] = 0,
.avpnm = 0,
.tlbiel = 0,
},
@@ -139,14 +139,14 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc[MMU_PAGE_4K] = 0,
.avpnm = 0,
.tlbiel = 1,
},
[MMU_PAGE_16M] = {
.shift = 24,
.sllp = SLB_VSID_L,
- .penc = 0,
+ .penc[MMU_PAGE_16M] = 0,
.avpnm = 0x1UL,
.tlbiel = 0,
},
@@ -208,7 +208,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
BUG_ON(!ppc_md.hpte_insert);
ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
- HPTE_V_BOLTED, psize, ssize);
+ HPTE_V_BOLTED, psize, psize, ssize);
if (ret < 0)
break;
@@ -275,6 +275,30 @@ static void __init htab_init_seg_sizes(void)
of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
}
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
static int __init htab_dt_scan_page_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -294,60 +318,57 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
size /= 4;
cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
while(size > 0) {
- unsigned int shift = prop[0];
+ unsigned int base_shift = prop[0];
unsigned int slbenc = prop[1];
unsigned int lpnum = prop[2];
- unsigned int lpenc = 0;
struct mmu_psize_def *def;
- int idx = -1;
+ int idx, base_idx;
size -= 3; prop += 3;
- while(size > 0 && lpnum) {
- if (prop[0] == shift)
- lpenc = prop[1];
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /*
+ * skip the pte encoding also
+ */
prop += 2; size -= 2;
- lpnum--;
+ continue;
}
- switch(shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- if (idx < 0)
- continue;
- def = &mmu_psize_defs[idx];
- def->shift = shift;
- if (shift <= 23)
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
def->avpnm = 0;
else
- def->avpnm = (1 << (shift - 23)) - 1;
+ def->avpnm = (1 << (base_shift - 23)) - 1;
def->sllp = slbenc;
- def->penc = lpenc;
- /* We don't know for sure what's up with tlbiel, so
+ /*
+ * We don't know for sure what's up with tlbiel, so
* for now we only set it for 4K and 64K pages
*/
- if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
def->tlbiel = 1;
else
def->tlbiel = 0;
- DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
- "tlbiel=%d, penc=%d\n",
- idx, shift, def->sllp, def->avpnm, def->tlbiel,
- def->penc);
+ while (size > 0 && lpnum) {
+ unsigned int shift = prop[0];
+ unsigned int penc = prop[1];
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ def->penc[idx] = penc;
+ DBG(" %d: shift=%02x, sllp=%04lx, "
+ "avpnm=%08lx, tlbiel=%d, penc=%d\n",
+ idx, shift, def->sllp, def->avpnm,
+ def->tlbiel, def->penc[idx]);
+ }
}
return 1;
}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 0da39fe..9f99847 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -109,7 +109,7 @@ void vpa_init(int cpu)
static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
unsigned long vpn, unsigned long pa,
unsigned long rflags, unsigned long vflags,
- int psize, int ssize)
+ int psize, int apsize, int ssize)
{
unsigned long lpar_rc;
unsigned long flags;
@@ -122,7 +122,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
hpte_group, vpn, pa, rflags, vflags, psize);
hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize) | rflags;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 02/17] arch/powerpc: Reduce the PTE_INDEX_SIZE
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
This make one PMD cover 16MB range. That helps in easier implementation of THP
on power. THP core code make use of one pmd entry to track the huge page and
the range mapped by a single pmd entry should be equal to the huge page size
supported by the hardware.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pgtable-ppc64-64k.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index be4e287..3c529b4 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -4,10 +4,10 @@
#include <asm-generic/pgtable-nopud.h>
-#define PTE_INDEX_SIZE 12
+#define PTE_INDEX_SIZE 8
#define PMD_INDEX_SIZE 12
#define PUD_INDEX_SIZE 0
-#define PGD_INDEX_SIZE 6
+#define PGD_INDEX_SIZE 10
#ifndef __ASSEMBLY__
#define PTE_TABLE_SIZE (sizeof(real_pte_t) << PTE_INDEX_SIZE)
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 07/17] powerpc: Update tlbie/tlbiel as per ISA doc
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
This make sure we handle Multiple page size segment correctly.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/mm/hash_native_64.c | 52 +++++++++++++++++++++++++++++---------
1 file changed, 40 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 16ba033..da46cd3 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -43,7 +43,7 @@
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void __tlbie(unsigned long vpn, int bpsize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
@@ -63,19 +63,33 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
*/
va &= ~(0xffffULL << 48);
- switch (psize) {
+ switch (bpsize) {
case MMU_PAGE_4K:
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ va |= mmu_psize_defs[apsize].sllp << 6;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ penc = mmu_psize_defs[bpsize].penc[apsize];
+ /* clear out bits after (44) [0....44.....63] */
+ va &= ~((1ul << (64 - 44)) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /* Add AVAL part */
+ if (bpsize != apsize) {
+ /*
+ * MPSS, 64K base page size and 16MB parge page size
+ * We don't need all the bits, but this seems to work.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 56..62 bits of va.
+ */
+ va |= ((vpn >> 2) & 0xfe);
+ }
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
@@ -84,7 +98,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
}
}
-static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int bpsize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
@@ -98,18 +112,32 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
*/
va &= ~(0xffffULL << 48);
- switch (psize) {
+ switch (bpsize) {
case MMU_PAGE_4K:
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ va |= mmu_psize_defs[apsize].sllp << 6;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ penc = mmu_psize_defs[bpsize].penc[apsize];
+ /* clear out bits after(44) [0....44.....63] */
+ va &= ~((1ul << (64 - 44)) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /* Add AVAL part */
+ if (bpsize != apsize) {
+ /*
+ * MPSS, 64K base page size and 16MB parge page size
+ * We don't need all the bits, but this seems to work.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 56..62 bits of va.
+ */
+ va |= ((vpn >> 2) & 0xfe);
+ }
va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
@@ -118,22 +146,22 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
}
-static inline void tlbie(unsigned long vpn, int psize, int apsize,
+static inline void tlbie(unsigned long vpn, int bpsize, int apsize,
int ssize, int local)
{
unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (use_local)
- use_local = mmu_psize_defs[psize].tlbiel;
+ use_local = mmu_psize_defs[bpsize].tlbiel;
if (lock_tlbie && !use_local)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
- __tlbiel(vpn, psize, apsize, ssize);
+ __tlbiel(vpn, bpsize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
- __tlbie(vpn, psize, apsize, ssize);
+ __tlbie(vpn, bpsize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 04/17] mm/THP: Add pmd args to pgtable deposit and withdraw APIs
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
This will be later used by powerpc THP support. In powerpc we want to use
pgtable for storing the hash index values. So instead of adding them to
mm_context list, we would like to store them in the second half of pmd
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/s390/include/asm/pgtable.h | 5 +++--
arch/s390/mm/pgtable.c | 5 +++--
arch/sparc/include/asm/pgtable_64.h | 5 +++--
arch/sparc/mm/tlb.c | 5 +++--
include/asm-generic/pgtable.h | 5 +++--
mm/huge_memory.c | 12 ++++++------
mm/pgtable-generic.c | 5 +++--
7 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index c1d7930..d57436c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1232,10 +1232,11 @@ static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
#define SEGMENT_RW __pgprot(_HPAGE_TYPE_RW)
#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
static inline int pmd_trans_splitting(pmd_t pmd)
{
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ae44d2a..9ab3224 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -920,7 +920,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
}
}
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
{
struct list_head *lh = (struct list_head *) pgtable;
@@ -934,7 +935,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
mm->pmd_huge_pte = pgtable;
}
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
struct list_head *lh;
pgtable_t pgtable;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7870be0..4fa7133 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -855,10 +855,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd);
#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif
/* Encode and de-code a swap entry */
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 3e8fec3..79922f4 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -150,7 +150,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
}
}
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
{
struct list_head *lh = (struct list_head *) pgtable;
@@ -164,7 +165,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
mm->pmd_huge_pte = pgtable;
}
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
struct list_head *lh;
pgtable_t pgtable;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 5cf680a..6f87e9e 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -163,11 +163,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
#endif
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6001ee6..5beb2e2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -739,7 +739,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
page_add_new_anon_rmap(page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
@@ -926,7 +926,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- pgtable_trans_huge_deposit(dst_mm, pgtable);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
dst_mm->nr_ptes++;
ret = 0;
@@ -1091,10 +1091,10 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1373,7 +1373,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1705,7 +1705,7 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
haddr = address;
@@ -2393,7 +2393,7 @@ static void collapse_huge_page(struct mm_struct *mm,
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- pgtable_trans_huge_deposit(mm, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
spin_unlock(&mm->page_table_lock);
*hpage = NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323f..e1a6e4f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
{
assert_spin_locked(&mm->page_table_lock);
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* no "address" argument so destroys page coloring of some arch */
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 14/17] powerpc: support for zerout withdraw.
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Need changes to other archs. This need to be fixed further
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pgtable.h | 3 ++-
arch/powerpc/mm/pgtable.c | 11 ++++++++---
mm/huge_memory.c | 18 ++++++++++++------
3 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 4e49c34..3dfbec9 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -262,7 +262,8 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
+ pmd_t *pmdp, int tozero);
#define __HAVE_ARCH_PMDP_INVALIDATE
extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 841271f..fa5e108 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -355,7 +355,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
/* FIXME!! May be all this should be in pgtable_64.c ? */
#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp, int tozero)
{
pgtable_t pgtable;
unsigned long *pgtable_slot;
@@ -368,8 +368,13 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
* Make sure we are invalidating all the entries. So that
* we fault and create new entries later
*/
- /* zero out the table before returning */
- memset(pgtable, 0, PTE_FRAG_SIZE);
+ /* FIXME!! this is not correct. zero out the table before returning
+ * because we are using this for other things.
+ * zap_huge_pmd
+ */
+ if (tozero)
+ /* Not needed, because we depoist a zeroed table ? */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
return pgtable;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5beb2e2..3777a5b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -781,7 +781,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm->nr_ptes++;
return true;
}
@@ -996,7 +996,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd, 1);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1091,7 +1091,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd, 1);
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
@@ -1373,7 +1373,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ /*
+ * Withdraw the pgtable without zero out, because
+ * the following pmd_get_and_clear will look at
+ * pgtable contents, in case of some architectures
+ * like ppc64
+ */
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd, 0);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1705,7 +1711,7 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd, 1);
pmd_populate(mm, &_pmd, pgtable);
haddr = address;
@@ -2699,7 +2705,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd, 1);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 12/17] powerpc/THP: Implement transparent huge pages for ppc64
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We now have pmd entries covering to 16MB range. To implement THP on powerpc,
we double the size of PMD. The second half is used to deposit the pgtable (PTE page).
We also use the depoisted PTE page for tracking the HPTE information. The information
include [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need
4096 entries. Both will fit in a 4K PTE page.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/page.h | 2 +-
arch/powerpc/include/asm/pgtable-ppc64-64k.h | 3 +-
arch/powerpc/include/asm/pgtable-ppc64.h | 6 +-
arch/powerpc/include/asm/pgtable.h | 247 +++++++++++++++++++
arch/powerpc/mm/init_64.c | 14 ++
arch/powerpc/mm/pgtable.c | 340 ++++++++++++++++++++++++++
arch/powerpc/platforms/Kconfig.cputype | 1 +
7 files changed, 610 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 38e7ff6..b927447 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -40,7 +40,7 @@
#ifdef CONFIG_HUGETLB_PAGE
extern unsigned int HPAGE_SHIFT;
#else
-#define HPAGE_SHIFT PAGE_SHIFT
+#define HPAGE_SHIFT PMD_SHIFT
#endif
#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index 3c529b4..5c5541a 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -33,7 +33,8 @@
#define PGDIR_MASK (~(PGDIR_SIZE-1))
/* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS 0x1ff
+/* PMDs point to PTE table fragments which are 4K aligned. */
+#define PMD_MASKED_BITS 0xfff
/* Bits to mask out from a PGD/PUD to get to the PMD page */
#define PUD_MASKED_BITS 0x1ff
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 658ba7c..0da8840 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -149,8 +149,12 @@
|| (pmd_val(pmd) & PMD_BAD_BITS))
#define pmd_present(pmd) (pmd_val(pmd) != 0)
#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0)
+/*
+ * FIXME PMD_MASKED_BITS should include all of PMD_HUGE_PROTBITS
+ * should only be called for non huge pages.
+ */
#define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd))
+extern struct page *pmd_page(pmd_t pmd);
#define pud_set(pudp, pudval) (pud_val(*(pudp)) = (pudval))
#define pud_none(pud) (!pud_val(pud))
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index fc57855..4e49c34 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -23,6 +23,253 @@ struct mm_struct;
*/
#define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* A large part matches with pte bits */
+#define PMD_HUGE_PROTBITS 0x7ff
+#define PMD_HUGE_PRESENT 0x001 /* software: pte contains a translation */
+#define PMD_HUGE_USER 0x002 /* matches one of the PP bits */
+#define PMD_HUGE_FILE 0x002 /* (!present only) software: pte holds file offset */
+#define PMD_HUGE_EXEC 0x004 /* No execute on POWER4 and newer (we invert) */
+#define PMD_HUGE_SPLITTING 0x008
+#define PMD_HUGE_HASHPTE 0x010
+#define PMD_ISHUGE 0x020
+#define PMD_HUGE_DIRTY 0x080 /* C: page changed */
+#define PMD_HUGE_ACCESSED 0x100 /* R: page referenced */
+#define PMD_HUGE_RW 0x200 /* software: user write access allowed */
+#define PMD_HUGE_BUSY 0x800 /* software: PTE & hash are busy */
+#define PMD_HUGE_HPTEFLAGS (PMD_HUGE_BUSY | PMD_HUGE_HASHPTE)
+/*
+ * We keep both the pmd and pte rpn shift same, eventhough we use only
+ * lower 12 bits for huge page flags at pmd level
+ */
+#define PMD_HUGE_RPN_SHIFT PTE_RPN_SHIFT
+#define HUGE_PAGE_SIZE (ASM_CONST(1) << 24)
+#define HUGE_PAGE_MASK (~(HUGE_PAGE_SIZE - 1))
+
+#ifndef __ASSEMBLY__
+extern void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp);
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd);
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ /*
+ * Only called for huge page pmd
+ */
+// unsigned long val = pmd_val(pmd) & ~PMD_HUGE_PROTBITS;
+ return pmd_val(pmd) >> PMD_HUGE_RPN_SHIFT;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_val(pmd) & PMD_HUGE_ACCESSED;
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ /* Do nothing, mk_pmd() does this part. */
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_val(pmd) & PMD_HUGE_RW;
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+ return (pmd_val(pmd) & (PMD_ISHUGE | PMD_HUGE_PRESENT)) ==
+ (PMD_ISHUGE | PMD_HUGE_PRESENT);
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return (pmd_val(pmd) & (PMD_ISHUGE|PMD_HUGE_SPLITTING)) ==
+ (PMD_ISHUGE|PMD_HUGE_SPLITTING);
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & PMD_ISHUGE;
+}
+
+#define has_transparent_hugepage() 1
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~PMD_HUGE_ACCESSED;
+ return pmd;
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~PMD_HUGE_RW;
+ return pmd;
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ pmd_val(pmd) |= PMD_HUGE_DIRTY;
+ return pmd;
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ pmd_val(pmd) |= PMD_HUGE_ACCESSED;
+ return pmd;
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ pmd_val(pmd) |= PMD_HUGE_RW;
+ return pmd;
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~PMD_HUGE_PRESENT;
+ return pmd;
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+ pmd_val(pmd) |= PMD_HUGE_SPLITTING;
+ return pmd;
+}
+
+extern pgprot_t pmd_pgprot(pmd_t entry);
+
+/*
+ * Set the dirty and/or accessed bits atomically in a linux hugepage PMD, this
+ * function doesn't need to flush the hash entry
+ */
+static inline void __pmdp_set_access_flags(pmd_t *pmdp, pmd_t entry)
+{
+ unsigned long bits = pmd_val(entry) & (PMD_HUGE_DIRTY |
+ PMD_HUGE_ACCESSED |
+ PMD_HUGE_RW | PMD_HUGE_EXEC);
+#ifdef PTE_ATOMIC_UPDATES
+ unsigned long old, tmp;
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%4\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ or %0,%3,%0\n\
+ stdcx. %0,0,%4\n\
+ bne- 1b"
+ :"=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ :"r" (bits), "r" (pmdp), "m" (*pmdp), "i" (PMD_HUGE_BUSY)
+ :"cc");
+#else
+ unsigned long old = pmd_val(*pmdp);
+ *pmdp = __pmd(old | bits);
+#endif
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+ return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~PMD_HUGE_HPTEFLAGS) == 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+static inline unsigned long pmd_hugepage_update(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmdp, unsigned long clr)
+{
+#ifdef PTE_ATOMIC_UPDATES
+ unsigned long old, tmp;
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3 # pmd_hugepage_update\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (PMD_HUGE_BUSY)
+ : "cc" );
+#else
+ unsigned long old = pmd_val(*pmdp);
+ *pmdp = __pmd(old & ~clr);
+#endif
+
+#ifdef CONFIG_PPC_STD_MMU_64 /* FIXME!! do we support anything else ? */
+ /*
+ * FIXME!! How do we find all the hash values
+ */
+ if (old & PMD_HUGE_HASHPTE)
+ hpte_need_hugepage_flush(mm, addr, pmdp);
+#endif
+ return old;
+}
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ unsigned long old;
+
+ if ((pmd_val(*pmdp) & (PMD_HUGE_ACCESSED | PMD_HUGE_HASHPTE)) == 0)
+ return 0;
+ old = pmd_hugepage_update(mm, addr, pmdp, PMD_HUGE_ACCESSED);
+ return ((old & PMD_HUGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ unsigned long old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+ return __pmd(old);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+
+ if ((pmd_val(*pmdp) & PMD_HUGE_RW) == 0)
+ return;
+
+ pmd_hugepage_update(mm, addr, pmdp, PMD_HUGE_RW);
+}
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable);
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp);
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
#ifndef __ASSEMBLY__
#include <asm/tlbflush.h>
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index b378438..398a700 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,12 @@ static void pgd_ctor(void *addr)
static void pmd_ctor(void *addr)
{
+/* FIXME may be we can take size as arg ? */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
memset(addr, 0, PMD_TABLE_SIZE);
+#endif
}
struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -138,7 +143,16 @@ void __pgtable_cache_add(unsigned int index, unsigned long table_size,
void pgtable_cache_init(void)
{
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * we store the pgtable details in the second half of PMD
+ */
+ if (PGT_CACHE(PMD_INDEX_SIZE))
+ pr_err("PMD Page cache already initialized with different size\n");
+ __pgtable_cache_add(PMD_INDEX_SIZE, PMD_TABLE_SIZE * 2, pmd_ctor);
+#else
pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+#endif
if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
panic("Couldn't allocate pgtable caches");
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a..e173b5e 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -31,6 +31,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
+#include <asm/machdep.h>
#include "mmu_decl.h"
@@ -240,3 +241,342 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
}
#endif /* CONFIG_DEBUG_VM */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static pmd_t set_hugepage_access_flags_filter(pmd_t pmd,
+ struct vm_area_struct *vma,
+ int dirty)
+{
+ return pmd;
+}
+
+/*
+ * This is called when relaxing access to a huge page. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+ entry = set_hugepage_access_flags_filter(entry, vma, dirty);
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ __pmdp_set_access_flags(pmdp, entry);
+#if 0 /* FIXME!! We are not supporting SW TLB systems */
+ flush_tlb_hugepage_nohash(vma, address);
+#endif
+ }
+ return changed;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this huge page.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ unsigned long old, tmp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifdef PTE_ATOMIC_UPDATES
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ ori %1,%0,%4 \n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "i" (PMD_HUGE_SPLITTING), "m" (*pmdp), "i" (PMD_HUGE_BUSY)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd(old | PMD_HUGE_SPLITTING);
+#endif
+ /*
+ * If we didn't had the splitting flag set, go and flush the
+ * HPTE entries and serialize against gup fast.
+ */
+ if (!(old & PMD_HUGE_SPLITTING)) {
+#ifdef CONFIG_PPC_STD_MMU_64
+ /* We need to flush the hpte */
+ if (old & PMD_HUGE_HASHPTE)
+ hpte_need_hugepage_flush(vma->vm_mm, address, pmdp);
+#endif
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+/*
+ * FIXME!! pmd_page need to be validated, we may get a different value than expected
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ unsigned long *pgtable_slot;
+ assert_spin_locked(&mm->page_table_lock);
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = pmdp + PTRS_PER_PMD;
+ *pgtable_slot = (unsigned long )pgtable;
+}
+
+/* FIXME!! May be all this should be in pgtable_64.c ? */
+#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ unsigned long *pgtable_slot;
+
+ assert_spin_locked(&mm->page_table_lock);
+ pgtable_slot = pmdp + PTRS_PER_PMD;
+ pgtable = (pgtable_t) *pgtable_slot;
+
+ /* FIXME!
+ * Make sure we are invalidating all the entries. So that
+ * we fault and create new entries later
+ */
+ /* zero out the table before returning */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * Since we are looking at latest ppc64, we don't need to worry about
+ * i/d cache coherency on exec fault
+ */
+static pmd_t set_pmd_filter(pmd_t pmd, unsigned long addr)
+{
+ pmd = __pmd(pmd_val(pmd) & ~PMD_HUGE_HPTEFLAGS);
+ return pmd;
+}
+
+/*
+ * We can make it less convoluted than __set_pte_at, because
+ * we can ignore lot of hardware here, because this is only for
+ * MPSS
+ */
+static inline void __set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd, int percpu)
+{
+ /*
+ * There is nothing in hash page table now, so nothing to
+ * invalidate, set_pte_at is used for adding new entry.
+ * For updating we should use update_hugepage_pmd()
+ */
+ *pmdp = pmd;
+}
+
+/*
+ * set a new huge pmd
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ /*
+ * Note: mm->context.id might not yet have been assigned as
+ * this context might not have been activated yet when this
+ * is called.
+ * FIXME!! catch a pmd update here. Those should actually go via
+ * pmd_hugepage_update.
+ */
+ pmd = set_pmd_filter(pmd, addr);
+
+ __set_pmd_at(mm, addr, pmdp, pmd, 0);
+
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ /* FIXME!! validate it more closely */
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, PMD_HUGE_PRESENT);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+
+/*
+ * A linux huge page PMD was changed and the corresponding hash table entry
+ * neesd to be flushed. FIXME!! there is no batching support yet.
+ *
+ * The linux huge page PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
+ * With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need
+ * 4096 entries. Both will fit in a 4K pgtable_t.
+ */
+void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ int ssize, i;
+ unsigned long s_addr;
+ unsigned int psize, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+ /*
+ * Flush all the hptes mapping this huge page
+ */
+ s_addr = addr & HUGE_PAGE_MASK;
+ /*
+ * The hpte hindex are stored in the pgtable whose address is in the
+ * second half of the PMD
+ */
+ hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
+
+ /* get the base page size */
+ psize = get_slice_psize(mm, s_addr);
+ shift = mmu_psize_defs[psize].shift;
+
+ for (i = 0; i < HUGE_PAGE_SIZE/(1ul << shift); i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_slot_array[i] & 0x1;
+ if (!valid)
+ continue;
+ hidx = hpte_slot_array[i] >> 1;
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+// DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
+ ppc_md.hpte_invalidate(slot, vpn, psize, ssize, 0);
+
+ /* mark the slot array invalid ?? pte variant doesn't do this*/
+// hpte_slot_array[i] = 0x0;
+ }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ unsigned long pmd_prot = 0;
+ unsigned long prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ pmd_prot |= PMD_HUGE_PRESENT;
+ if (prot & _PAGE_USER)
+ pmd_prot |= PMD_HUGE_USER;
+ if (prot & _PAGE_FILE)
+ pmd_prot |= PMD_HUGE_FILE;
+ if (prot & _PAGE_EXEC)
+ pmd_prot |= PMD_HUGE_EXEC;
+
+// WARN_ON(prot & _PAGE_GUARDED);
+// WARN_ON(prot & _PAGE_COHERENT);
+// WARN_ON(prot & _PAGE_NO_CACHE);
+// WARN_ON(prot & _PAGE_WRITETHRU);
+
+ if (prot & _PAGE_DIRTY)
+ pmd_prot |= PMD_HUGE_DIRTY;
+ if (prot & _PAGE_ACCESSED)
+ pmd_prot |= PMD_HUGE_ACCESSED;
+ if (prot & _PAGE_RW)
+ pmd_prot |= PMD_HUGE_RW;
+
+// WARN_ON(prot & _PAGE_BUSY);
+ /*
+ * FIXME!! we need to do some sanity check. But the
+ * values map easily.
+ */
+ pmd_val(pmd) |= pmd_prot;
+ return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ pmd_t pmd;
+
+ pmd_val(pmd) = pfn << PMD_HUGE_RPN_SHIFT;
+ /*
+ * pgtable_t is always 4K aligned, even in case where we use the
+ * pmd_t to store a large page which is 16MB aligned
+ */
+ pmd_val(pmd) |= PMD_ISHUGE;
+ pmd = pmd_set_protbits(pmd, pgprot);
+ return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ /* FIXME!! why are this bits cleared ? */
+ pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
+ PMD_HUGE_RW |
+ PMD_HUGE_EXEC);
+ pmd = pmd_set_protbits(pmd, newprot);
+ return pmd;
+}
+
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ /* FIXME!! fill in later looking at update_mmu_cache */
+}
+
+/*
+ * For huge page we have pfn in the pmd, we use PMD_HUGE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_val(pmd) & PMD_ISHUGE)
+ return pfn_to_page(pmd_pfn(pmd));
+#endif
+ return virt_to_page(pmd_page_vaddr(pmd));
+}
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 72afd28..90ee19b 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
config PPC_BOOK3E_64
bool "Embedded processors"
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 10/17] powerpc/mm: Fix hpte_decode to use the correct decoding for page sizes
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
As per ISA doc, we encode base and actual page size in the LP bits of
PTE. The number of bit used to encode the page sizes depend on actual
page size. ISA doc lists this as
PTE LP actual page size
rrrr rrrz ≥8KB
rrrr rrzz ≥16KB
rrrr rzzz ≥32KB
rrrr zzzz ≥64KB
rrrz zzzz ≥128KB
rrzz zzzz ≥256KB
rzzz zzzz ≥512KB
zzzz zzzz ≥1MB
ISA doc also says
"The values of the “z” bits used to specify each size, along with all possible
values of “r” bits in the LP field, must result in LP values distinct from
other LP values for other sizes."
based on the above update hpte_decode to use the correct decoding for LP bits.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/mm/hash_native_64.c | 26 ++++++++++++++++----------
1 file changed, 16 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4cf361f..d36ddef 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -449,19 +449,14 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
unsigned long avpn, pteg, vpi;
- unsigned long hpte_r = hpte->r;
unsigned long hpte_v = hpte->v;
unsigned long vsid, seg_off;
- int i, size, a_size = MMU_PAGE_4K, shift, penc;
+ int size, a_size = MMU_PAGE_4K, shift, mask;
+ unsigned int lp = (hpte->r >> LP_SHIFT) & LP_BITS;
if (!(hpte_v & HPTE_V_LARGE))
size = MMU_PAGE_4K;
else {
- for (i = 0; i < LP_BITS; i++) {
- if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
- break;
- }
- penc = LP_MASK(i+1) >> LP_SHIFT;
for (size = 0; size < MMU_PAGE_COUNT; size++) {
/* 4K pages are not represented by LP */
@@ -471,12 +466,23 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
- for (a_size = 0; a_size < MMU_PAGE_COUNT; a_size++)
- if (penc == mmu_psize_defs[size].penc[a_size])
+
+ for (a_size = 0; a_size < MMU_PAGE_COUNT; a_size++) {
+ /* valid entries have a shift value */
+ if (!mmu_psize_defs[a_size].shift)
+ continue;
+
+ shift = mmu_psize_defs[a_size].shift - 11;
+ if (shift > 9)
+ shift = 9;
+ mask = (1 << shift) - 1;
+ if ((lp & mask) ==
+ mmu_psize_defs[size].penc[a_size]) {
goto out;
+ }
+ }
}
}
-
out:
/* This works for all page sizes, and for 256M and 1T segments */
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
--
1.7.10
^ permalink raw reply related
* [RFC PATCH 08/17] powerpc: print both base and actual page size on hash failure
From: Aneesh Kumar K.V @ 2013-02-18 10:28 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
In-Reply-To: <1361183295-6958-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/mmu-hash64.h | 3 ++-
arch/powerpc/mm/hash_utils_64.c | 12 +++++++-----
arch/powerpc/mm/hugetlbpage-hash64.c | 2 +-
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 6290e26..6ec65b6 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -321,7 +321,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
unsigned int shift, unsigned int mmu_psize);
extern void hash_failure_debug(unsigned long ea, unsigned long access,
unsigned long vsid, unsigned long trap,
- int ssize, int psize, unsigned long pte);
+ int ssize, int psize, int lpsize,
+ unsigned long pte);
extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
unsigned long pstart, unsigned long prot,
int psize, int ssize);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 48edb46..df48ba5 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -917,14 +917,14 @@ static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
void hash_failure_debug(unsigned long ea, unsigned long access,
unsigned long vsid, unsigned long trap,
- int ssize, int psize, unsigned long pte)
+ int ssize, int psize, int lpsize, unsigned long pte)
{
if (!printk_ratelimit())
return;
pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
ea, access, current->comm);
- pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n",
- trap, vsid, ssize, psize, pte);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
}
/* Result code is:
@@ -1097,7 +1097,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize, psize,
- pte_val(*ptep));
+ psize, pte_val(*ptep));
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
#else
@@ -1175,7 +1175,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize,
- mm->context.user_psize, pte_val(*ptep));
+ mm->context.user_psize,
+ mm->context.user_psize,
+ pte_val(*ptep));
local_irq_restore(flags);
}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index cecad34..af98ee8 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -129,7 +129,7 @@ repeat:
if (unlikely(slot == -2)) {
*ptep = __pte(old_pte);
hash_failure_debug(ea, access, vsid, trap, ssize,
- mmu_psize, old_pte);
+ mmu_psize, mmu_psize, old_pte);
return -1;
}
--
1.7.10
^ permalink raw reply related
* Re: [PATCH v5 00/45] CPU hotplug: stop_machine()-free CPU hotplug
From: Vincent Guittot @ 2013-02-18 10:24 UTC (permalink / raw)
To: Srivatsa S. Bhat
Cc: linux-doc, peterz, fweisbec, linux-kernel, walken, mingo,
linux-arch, Russell King - ARM Linux, xiaoguangrong, wangyun,
paulmck, nikunj, linux-pm, Rusty Russell, rostedt, rjw, namhyung,
tglx, linux-arm-kernel, netdev, oleg, sbw, tj, akpm, linuxppc-dev
In-Reply-To: <511E8F3C.2010406@linux.vnet.ibm.com>
On 15 February 2013 20:40, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> Hi Vincent,
>
> On 02/15/2013 06:58 PM, Vincent Guittot wrote:
>> Hi Srivatsa,
>>
>> I have run some tests with you branch (thanks Paul for the git tree)
>> and you will find results below.
>>
>
> Thank you very much for testing this patchset!
>
>> The tests condition are:
>> - 5 CPUs system in 2 clusters
>> - The test plugs/unplugs CPU2 and it increases the system load each 20
>> plug/unplug sequence with either more cyclictests threads
>> - The test is done with all CPUs online and with only CPU0 and CPU2
>>
>> The main conclusion is that there is no differences with and without
>> your patches with my stress tests. I'm not sure that it was the
>> expected results but the cpu_down is already quite low : 4-5ms in
>> average
>>
>
> Atleast my patchset doesn't perform _worse_ than mainline, with respect
> to cpu_down duration :-)
yes exactly and it has pass more than 400 consecutive plug/unplug on
an ARM platform
>
> So, here is the analysis:
> Stop-machine() doesn't really slow down CPU-down operation, if the rest
> of the CPUs are mostly running in userspace all the time. Because, the
> CPUs running userspace workloads cooperate very eagerly with the stop-machine
> dance - they receive the resched IPI, and allow the per-cpu cpu-stopper
> thread to monopolize the CPU, almost immediately.
>
> The scenario where stop-machine() takes longer to take effect is when
> most of the online CPUs are running in kernelspace, because, then the
> probability that they call preempt_disable() frequently (and hence inhibit
> stop-machine) is higher. That's why, in my tests, I ran genload from LTP
> which generated a lot of system-time (system-time in 'top' indicates activity
> in kernelspace). Hence my patchset showed significant improvement over
> mainline in my tests.
>
ok, I hadn't noticed this important point for the test
> However, your test is very useful too, if we measure a different parameter:
> the latency impact on the workloads running on the system (cyclic test).
> One other important aim of this patchset is to make hotplug as less intrusive
> as possible, for other workloads running on the system. So if you measure
> the cyclictest numbers, I would expect my patchset to show better numbers
> than mainline, when you do cpu-hotplug in parallel (same test that you did).
> Mainline would run stop-machine and hence interrupt the cyclic test tasks
> too often. My patchset wouldn't do that, and hence cyclic test should
> ideally show better numbers.
In fact, I haven't looked at the results as i was more interested by
the load that was generated
>
> I'd really appreciate if you could try that out and let me know how it
> goes.. :-) Thank you very much!
ok, I'm going to try to run a test series
Vincent
>
> Regards,
> Srivatsa S. Bhat
>
>>
>>
>> On 12 February 2013 04:58, Srivatsa S. Bhat
>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>> On 02/12/2013 12:38 AM, Paul E. McKenney wrote:
>>>> On Mon, Feb 11, 2013 at 05:53:41PM +0530, Srivatsa S. Bhat wrote:
>>>>> On 02/11/2013 05:28 PM, Vincent Guittot wrote:
>>>>>> On 8 February 2013 19:09, Srivatsa S. Bhat
>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>
>>>> [ . . . ]
>>>>
>>>>>>> Adding Vincent to CC, who had previously evaluated the performance and
>>>>>>> latency implications of CPU hotplug on ARM platforms, IIRC.
>>>>>>>
>>>>>>
>>>>>> Hi Srivatsa,
>>>>>>
>>>>>> I can try to run some of our stress tests on your patches.
>>>>>
>>>>> Great!
>>>>>
>>>>>> Have you
>>>>>> got a git tree that i can pull ?
>>>>>>
>>>>>
>>>>> Unfortunately, no, none at the moment.. :-(
>>>>
>>>> You do need to create an externally visible git tree.
>>>
>>> Ok, I'll do that soon.
>>>
>>>> In the meantime,
>>>> I have added your series at rcu/bhat.2013.01.21a on -rcu:
>>>>
>>>> git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
>>>>
>>>> This should appear soon on a kernel.org mirror near you. ;-)
>>>>
>>>
>>> Thank you very much, Paul! :-)
>>>
>>> Regards,
>>> Srivatsa S. Bhat
>>>
>
>
^ permalink raw reply
* Re: [PATCH 2/4] powerpc kvm: added multiple TCEs requests support
From: Alexey Kardashevskiy @ 2013-02-18 8:14 UTC (permalink / raw)
To: Paul Mackerras
Cc: kvm, Alexander Graf, kvm-ppc, linux-kernel, linuxppc-dev,
David Gibson
In-Reply-To: <20130215032458.GB25015@drongo>
On 15/02/13 14:24, Paul Mackerras wrote:
> On Mon, Feb 11, 2013 at 11:12:41PM +1100, aik@ozlabs.ru wrote:
>
>> +static long emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
>> + unsigned long ioba, unsigned long tce)
>> +{
>> + unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
>> + struct page *page;
>> + u64 *tbl;
>> +
>> + /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */
>> + /* liobn, stt, stt->window_size); */
>> + if (ioba >= stt->window_size) {
>> + pr_err("%s failed on ioba=%lx\n", __func__, ioba);
>
> Doesn't this give the guest a way to spam the host logs? And in fact
> printk in real mode is potentially problematic. I would just leave
> out this statement.
>
>> + return H_PARAMETER;
>> + }
>> +
>> + page = stt->pages[idx / TCES_PER_PAGE];
>> + tbl = (u64 *)page_address(page);
>
> I would like to see an explanation of why we are confident that
> page_address() will work correctly in real mode, across all the
> combinations of config options that we can have for a ppc64 book3s
> kernel.
It was there before this patch, I just moved it so I would think it has
been explained before :)
There is no combination on PPC to get WANT_PAGE_VIRTUAL enabled.
CONFIG_HIGHMEM is supported for PPC32 only so HASHED_PAGE_VIRTUAL is not
enabled on PPC64 either.
So this definition is supposed to work on PPC64:
#define page_address(page) lowmem_page_address(page)
where lowmem_page_address() is arithmetic operation on a page struct address:
static __always_inline void *lowmem_page_address(const struct page *page)
{
return __va(PFN_PHYS(page_to_pfn(page)));
}
PPC32 will use page_address() from mm/highmem.c, I need some lesson about
memory layout in 32bit but for now I cannot see how it can possibly fail here.
>> +
>> + /* FIXME: Need to validate the TCE itself */
>> + /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
>> + tbl[idx % TCES_PER_PAGE] = tce;
>> +
>> + return H_SUCCESS;
>> +}
>> +
>> +/*
>> + * Real mode handlers
>> */
>> long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>> unsigned long ioba, unsigned long tce)
>> {
>> - struct kvm *kvm = vcpu->kvm;
>> struct kvmppc_spapr_tce_table *stt;
>>
>> - /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
>> - /* liobn, ioba, tce); */
>> + stt = find_tce_table(vcpu, liobn);
>> + /* Didn't find the liobn, put it to userspace */
>> + if (!stt)
>> + return H_TOO_HARD;
>> +
>> + /* Emulated IO */
>> + return emulated_h_put_tce(stt, ioba, tce);
>> +}
>> +
>> +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>> + unsigned long liobn, unsigned long ioba,
>> + unsigned long tce_list, unsigned long npages)
>> +{
>> + struct kvmppc_spapr_tce_table *stt;
>> + long i, ret = 0;
>> + unsigned long *tces;
>> +
>> + stt = find_tce_table(vcpu, liobn);
>> + /* Didn't find the liobn, put it to userspace */
>> + if (!stt)
>> + return H_TOO_HARD;
>>
>> - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
>> - if (stt->liobn == liobn) {
>> - unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
>> - struct page *page;
>> - u64 *tbl;
>> + tces = (void *) get_real_address(vcpu, tce_list, false, NULL, NULL);
>> + if (!tces)
>> + return H_TOO_HARD;
>>
>> - /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */
>> - /* liobn, stt, stt->window_size); */
>> - if (ioba >= stt->window_size)
>> - return H_PARAMETER;
>> + /* Emulated IO */
>> + for (i = 0; (i < npages) && !ret; ++i, ioba += IOMMU_PAGE_SIZE)
>> + ret = emulated_h_put_tce(stt, ioba, tces[i]);
>
> So, tces is a pointer to somewhere inside a real page. Did we check
> somewhere that tces[npages-1] is in the same page as tces[0]? If so,
> I missed it. If we didn't, then we probably should check and do
> something about it.
>
>>
>> - page = stt->pages[idx / TCES_PER_PAGE];
>> - tbl = (u64 *)page_address(page);
>> + return ret;
>> +}
>>
>> - /* FIXME: Need to validate the TCE itself */
>> - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
>> - tbl[idx % TCES_PER_PAGE] = tce;
>> - return H_SUCCESS;
>> - }
>> - }
>> +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
>> + unsigned long liobn, unsigned long ioba,
>> + unsigned long tce_value, unsigned long npages)
>> +{
>> + struct kvmppc_spapr_tce_table *stt;
>> + long i, ret = 0;
>> +
>> + stt = find_tce_table(vcpu, liobn);
>> + /* Didn't find the liobn, put it to userspace */
>> + if (!stt)
>> + return H_TOO_HARD;
>>
>> - /* Didn't find the liobn, punt it to userspace */
>> - return H_TOO_HARD;
>> + /* Emulated IO */
>> + for (i = 0; (i < npages) && !ret; ++i, ioba += IOMMU_PAGE_SIZE)
>> + ret = emulated_h_put_tce(stt, ioba, tce_value);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Virtual mode handlers
>> + */
>> +extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
>> + unsigned long liobn, unsigned long ioba,
>> + unsigned long tce)
>> +{
>> + /* At the moment emulated IO is handled the same way */
>> + return kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
>> +}
>> +
>> +extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>> + unsigned long liobn, unsigned long ioba,
>> + unsigned long tce_list, unsigned long npages)
>> +{
>> + struct kvmppc_spapr_tce_table *stt;
>> + unsigned long *tces;
>> + long ret = 0, i;
>> +
>> + stt = find_tce_table(vcpu, liobn);
>> + /* Didn't find the liobn, put it to userspace */
>> + if (!stt)
>> + return H_TOO_HARD;
>> +
>> + tces = (void *) get_virt_address(vcpu, tce_list, false, NULL, NULL);
>> + if (!tces)
>> + return H_TOO_HARD;
>> +
>> + /* Emulated IO */
>> + for (i = 0; (i < npages) && !ret; ++i, ioba += IOMMU_PAGE_SIZE)
>> + ret = emulated_h_put_tce(stt, ioba, tces[i]);
>
> Same comment here about tces[i] overflowing a page boundary.
>
>> +
>> + return ret;
>> +}
>> +
>> +extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
>> + unsigned long liobn, unsigned long ioba,
>> + unsigned long tce_value, unsigned long npages)
>> +{
>> + /* At the moment emulated IO is handled the same way */
>> + return kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>> }
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 71d0c90..13c8436 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -515,6 +515,29 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>> kvmppc_get_gpr(vcpu, 5),
>> kvmppc_get_gpr(vcpu, 6));
>> break;
>> + case H_PUT_TCE:
>> + ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>> + kvmppc_get_gpr(vcpu, 5),
>> + kvmppc_get_gpr(vcpu, 6));
>> + if (ret == H_TOO_HARD)
>> + return RESUME_HOST;
>> + break;
>> + case H_PUT_TCE_INDIRECT:
>> + ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
>> + kvmppc_get_gpr(vcpu, 5),
>> + kvmppc_get_gpr(vcpu, 6),
>> + kvmppc_get_gpr(vcpu, 7));
>> + if (ret == H_TOO_HARD)
>> + return RESUME_HOST;
>> + break;
>> + case H_STUFF_TCE:
>> + ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>> + kvmppc_get_gpr(vcpu, 5),
>> + kvmppc_get_gpr(vcpu, 6),
>> + kvmppc_get_gpr(vcpu, 7));
>> + if (ret == H_TOO_HARD)
>> + return RESUME_HOST;
>> + break;
>> default:
>> return RESUME_HOST;
>> }
> [snip]
>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> index 70739a0..95614c7 100644
>> --- a/arch/powerpc/kvm/powerpc.c
>> +++ b/arch/powerpc/kvm/powerpc.c
>> @@ -383,6 +383,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>> r = 1;
>> break;
>> #endif
>> + case KVM_CAP_PPC_MULTITCE:
>> + r = 1;
>> + break;
>> default:
>> r = 0;
>> break;
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index e6e5d4b..26e2b271 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -635,6 +635,7 @@ struct kvm_ppc_smmu_info {
>> #define KVM_CAP_IRQFD_RESAMPLE 82
>> #define KVM_CAP_PPC_BOOKE_WATCHDOG 83
>> #define KVM_CAP_PPC_HTAB_FD 84
>> +#define KVM_CAP_PPC_MULTITCE 87
>
> The capability should be described in
> Documentation/virtual/kvm/api.txt.
Is it enough description?
===
4.79 KVM_CAP_PPC_MULTITCE
Architectures: ppc
Parameters: none
Returns: 0 on success; -1 on error
This capability enables the guest to put/remove multiple TCE entries
per hypercall which significanly accelerates DMA operations for PPC KVM
guests.
When this capability is enabled, H_PUT_TCE_INDIRECT and H_STUFF_TCE are
expected to occur rather than H_PUT_TCE which supports only one TCE entry
per call.
===
--
Alexey
^ permalink raw reply
* RE: [PATCH][UPSTEAM] powerpc/mpic: add irq_set_wake support
From: Wang Dongsheng-B40534 @ 2013-02-18 2:52 UTC (permalink / raw)
To: Kumar Gala, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1359601823-9861-1-git-send-email-dongsheng.wang@freescale.com>
Hi Kumar,
Could you ack this patch?
Thanks.
> -----Original Message-----
> From: Wang Dongsheng-B40534
> Sent: Thursday, January 31, 2013 11:10 AM
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Wang Dongsheng-B40534
> Subject: [PATCH][UPSTEAM] powerpc/mpic: add irq_set_wake support
>=20
> Add irq_set_wake support. Just add IRQF_NO_SUSPEND to desc->action->flag.
> So the wake up interrupt will not be disable in suspend_device_irqs.
>=20
> Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
> ---
> arch/powerpc/sysdev/mpic.c | 15 +++++++++++++++
> 1 files changed, 15 insertions(+), 0 deletions(-)
>=20
> diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
> index 9c6e535..2ed0220 100644
> --- a/arch/powerpc/sysdev/mpic.c
> +++ b/arch/powerpc/sysdev/mpic.c
> @@ -920,6 +920,18 @@ int mpic_set_irq_type(struct irq_data *d, unsigned
> int flow_type)
> return IRQ_SET_MASK_OK_NOCOPY;
> }
>=20
> +static int mpic_irq_set_wake(struct irq_data *d, unsigned int on)
> +{
> + struct irq_desc *desc =3D container_of(d, struct irq_desc, irq_data);
> +
> + if (on)
> + desc->action->flags |=3D IRQF_NO_SUSPEND;
> + else
> + desc->action->flags &=3D ~IRQF_NO_SUSPEND;
> +
> + return 0;
> +}
> +
> void mpic_set_vector(unsigned int virq, unsigned int vector)
> {
> struct mpic *mpic =3D mpic_from_irq(virq);
> @@ -957,6 +969,7 @@ static struct irq_chip mpic_irq_chip =3D {
> .irq_unmask =3D mpic_unmask_irq,
> .irq_eoi =3D mpic_end_irq,
> .irq_set_type =3D mpic_set_irq_type,
> + .irq_set_wake =3D mpic_irq_set_wake,
> };
>=20
> #ifdef CONFIG_SMP
> @@ -971,6 +984,7 @@ static struct irq_chip mpic_tm_chip =3D {
> .irq_mask =3D mpic_mask_tm,
> .irq_unmask =3D mpic_unmask_tm,
> .irq_eoi =3D mpic_end_irq,
> + .irq_set_wake =3D mpic_irq_set_wake,
> };
>=20
> #ifdef CONFIG_MPIC_U3_HT_IRQS
> @@ -981,6 +995,7 @@ static struct irq_chip mpic_irq_ht_chip =3D {
> .irq_unmask =3D mpic_unmask_ht_irq,
> .irq_eoi =3D mpic_end_ht_irq,
> .irq_set_type =3D mpic_set_irq_type,
> + .irq_set_wake =3D mpic_irq_set_wake,
> };
> #endif /* CONFIG_MPIC_U3_HT_IRQS */
>=20
> --
> 1.7.5.1
^ permalink raw reply
* Re: [PATCH] i2c: Remove unneeded xxx_set_drvdata(..., NULL) calls
From: Peter Korsgaard @ 2013-02-17 15:12 UTC (permalink / raw)
To: Doug Anderson
Cc: Wolfram Sang, Tony Lindgren, Linus Walleij, Thierry Reding,
Sekhar Nori, linux-i2c, Guan Xuetao, Kevin Hilman, Sonic Zhang,
linux-arm-kernel, Deepak Sikri, Havard Skinnemoen, Marek Vasut,
Pawel Moll, Stephen Warren, Sascha Hauer, Uwe Kleine-König,
Rob Herring, uclinux-dist-devel, Jean Delvare, Lars-Peter Clausen,
Ben Dooks (embedded platforms), Barry Song, linux-omap,
Mika Westerberg, Oskar Schirmer, Fabio Estevam,
davinci-linux-open-source, Shawn Guo, Jim Cromie,
Greg Kroah-Hartman, Tomoya MORINAGA, linux-kernel, Kyungmin Park,
Viresh Kumar, Karol Lewandowski, Jiri Kosina, STEricsson,
Joe Perches, Andrew Morton, Alessandro Rubini, linuxppc-dev,
Alexander Stein
In-Reply-To: <1360970315-32116-1-git-send-email-dianders@chromium.org>
>>>>> "Doug" == Doug Anderson <dianders@chromium.org> writes:
Doug> There is simply no reason to be manually setting the private driver
Doug> data to NULL in the remove/fail to probe cases. This is just extra
Doug> cruft code that can be removed.
Doug> A few notes:
Doug> * Nothing relies on drvdata being set to NULL.
Doug> * The __device_release_driver() function eventually calls
Doug> dev_set_drvdata(dev, NULL) anyway, so there's no need to do it
Doug> twice.
Doug> * I verified that there were no cases where xxx_get_drvdata() was
Doug> being called in these drivers and checking for / relying on the NULL
Doug> return value.
Doug> This could be cleaned up kernel-wide but for now just take the baby
Doug> step and remove from the i2c subsystem.
Doug> Reported-by: Wolfram Sang <wsa@the-dreams.de>
Doug> Reported-by: Stephen Warren <swarren@wwwdotorg.org>
Doug> Signed-off-by: Doug Anderson <dianders@chromium.org>
For i2c-ocores.c + i2c-mux-gpio.c:
Acked-by: Peter Korsgaard <jacmet@sunsite.dk>
--
Bye, Peter Korsgaard
^ permalink raw reply
* Re: [PATCH 2/2] of: use platform_device_add
From: Fabio Estevam @ 2013-02-17 13:18 UTC (permalink / raw)
To: Shawn Guo
Cc: Jason Gunthorpe, linux-kernel, Rob Herring, Greg Kroah-Hartman,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <20130217074317.GB16632@S2101-09.ap.freescale.net>
On Sun, Feb 17, 2013 at 4:43 AM, Shawn Guo <shawn.guo@linaro.org> wrote:
> On Sun, Feb 17, 2013 at 11:03:35AM +0800, Shawn Guo wrote:
>> On Fri, Jan 18, 2013 at 01:40:00AM +0000, Grant Likely wrote:
>> > This allows platform_device_add a chance to call insert_resource on all
>> > of the resources from OF. At a minimum this fills in proc/iomem and
>> > presumably makes resource tracking and conflict detection work better.
>> > However, it has the side effect of moving all OF generated platform
>> > devices from /sys/devices to /sys/devices/platform/. It /shouldn't/
>> > break userspace because userspace is not supposed to depend on the full
>> > path (because userspace always does what it is supposed to, right?).
>> >
>> > This may cause breakage if either:
>> > 1) any two nodes in a given device tree have overlapping & staggered
>> > regions (ie. 0x80..0xbf and 0xa0..0xdf; where one is not contained
>> > within the other). In this case one of the devices will fail to
>> > register and an exception will be needed in platform_device_add() to
>> > complain but not fail.
>>
>> Grant,
>>
>> The patch introduce a regression on imx6q boot.
>
> It also breaks all of_amba_device users.
>
> of_amba_device_create() --> amba_device_add() --> request_resource()
> and fails.
Yes, correct: amba-pl011 does not register anymore after this patch,
which causes the serial console to be not functional.
^ permalink raw reply
* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-17 12:40 UTC (permalink / raw)
To: Geert Uytterhoeven; +Cc: linuxppc-dev
In-Reply-To: <CAMuHMdUTQWpQH=UaHc=iZPiJrry7_VqOjtm7bX+jW6r3vOge8A@mail.gmail.com>
Geert Uytterhoeven wrote:
> Hi Phileas,
>
> On Sun, Feb 17, 2013 at 12:12 AM, Phileas Fogg <phileas-fogg@mail.ru> wrote:
>> I found new clues about the problem.
>>
>> Normally the device tree memory segment is allocated at the top of the boot
>> memory region. The boot memory size on the PS3 console is 128MB.
>>
>>
>> root@ps3-linux:~# kexec -l loader.ps3
>> segment[0].mem:0x131d000 memsz:262144
>> segment[1].mem:0x135d000 memsz:36864
>> segment[2].mem:0x7fff000 memsz:4096
>>
>> And the device tree is located at address 0x7fff000, it's the last page of
>> the boot memory.
>>
>> I changed the kexec-tools and made it store the device tree just after the
>> purgatory code which is located at address 0x135d000. Like here:
>>
>>
>> root@ps3-linux:~# kexec -l loader.ps3
>> segment[0].mem:0x131d000 memsz:262144
>> segment[1].mem:0x135d000 memsz:36864
>> segment[2].mem:0x1366000 memsz:4096 <---- new address of device tree
>> segment
>>
>> And now the sha256 verification is always successful for the FreeBSD loader
>> too.
>> But still no idea what actually corrupts the device tree segment when it's
>> located at the top of the boot memory region. And why it happens on Linux
>> 3.7 and Linux 3.8 but not on Linux 3.3.8.
>
> Have you looked at the actual data that ends up being written there?
> It may give a clue...
>
> Gr{oetje,eeting}s,
>
> Geert
>
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
>
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
> -- Linus Torvalds
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
i was able to dump the device tree data from the purgatory code and compared the
original DT which i dumped from kexec-tools and the one from purgatory.
About 20 bytes at the end of the string table of the device tree were corrupted.
Large part of the new data are 0s.
regards
^ permalink raw reply
* Re: [PATCH 2/2] of: use platform_device_add
From: Grant Likely @ 2013-02-17 10:49 UTC (permalink / raw)
To: Russell King - ARM Linux
Cc: Linux Kernel Mailing List, Rob Herring, Jason Gunthorpe,
Greg Kroah-Hartman, Shawn Guo, linuxppc-dev@lists.ozlabs.org,
linux-arm-kernel@lists.infradead.org
In-Reply-To: <20130217101958.GR17833@n2100.arm.linux.org.uk>
On Sun, Feb 17, 2013 at 10:19 AM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Sun, Feb 17, 2013 at 03:43:20PM +0800, Shawn Guo wrote:
> > The patch introduce a regression on imx6q boot. The IOMUXC block on
> > imx6q is special. It acts not only a pin controller but also a system
> > controller with a bunch of system level registers in there. That's why
> > we currently have the following two nodes in imx6q device tree with the
> > same start "reg" address, which work with drivers/mfd/syscon.c and
> > drivers/pinctrl/pinctrl-imx6q.c respectively.
> >
> > gpr: iomuxc-gpr@020e0000 {
> > compatible = "fsl,imx6q-iomuxc-gpr", "syscon";
> > reg = <0x020e0000 0x38>;
> > };
> >
> > iomuxc: iomuxc@020e0000 {
> > compatible = "fsl,imx6q-iomuxc";
> > reg = <0x020e0000 0x4000>;
> > };
> >
> > With the patch in place, pinctrl-imx6q fails to register like below.
> >
> > syscon 20e0000.iomuxc: syscon regmap start 0x20e0000 end 0x20e3fff registered
> > imx6q-pinctrl 20e0000.iomuxc: can't request region for resource [mem 0x020e0000-0x020e3fff]
> > imx6q-pinctrl: probe of 20e0000.iomuxc failed with error -16
Strictly you're not supposed to do that with the device tree. There
shouldn't be two nodes using the same overlapping memory region unless
they are parent/child. That rule has been around for a long time, but
the core never checked for it. What /should/ happen is the two drivers
should be cooperating for the register region and only one device
driver probe sets up both behaviours.
However, neither is it okay to just break the existing device trees.
Best thing to do I think is to deprecate one of the nodes.
>> It also breaks all of_amba_device users.
>>
>> of_amba_device_create() --> amba_device_add() --> request_resource()
>> and fails.
>
> Presumably that's because we no longer know what the parent resource
> is supposed to be?
Hmmm, it looks that way, yes. Currently the OF code is using
iomem_resource as the parent for all amba_device_add() calls
(driver/of/platform.c), but if the parent node gets registered as a
platform device and it has the resources then the parenthood chain
doesn't match up. It isn't immediately obvious to me how to fix this.
I'm going to drop the patch from my tree. I could use some help
figuring out what the correct behaviour really should be here.
g.
--
Grant Likely, B.Sc., P.Eng.
Secret Lab Technologies Ltd.
^ permalink raw reply
* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Geert Uytterhoeven @ 2013-02-17 8:53 UTC (permalink / raw)
To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <51201276.8020104@mail.ru>
Hi Phileas,
On Sun, Feb 17, 2013 at 12:12 AM, Phileas Fogg <phileas-fogg@mail.ru> wrote:
> I found new clues about the problem.
>
> Normally the device tree memory segment is allocated at the top of the boot
> memory region. The boot memory size on the PS3 console is 128MB.
>
>
> root@ps3-linux:~# kexec -l loader.ps3
> segment[0].mem:0x131d000 memsz:262144
> segment[1].mem:0x135d000 memsz:36864
> segment[2].mem:0x7fff000 memsz:4096
>
> And the device tree is located at address 0x7fff000, it's the last page of
> the boot memory.
>
> I changed the kexec-tools and made it store the device tree just after the
> purgatory code which is located at address 0x135d000. Like here:
>
>
> root@ps3-linux:~# kexec -l loader.ps3
> segment[0].mem:0x131d000 memsz:262144
> segment[1].mem:0x135d000 memsz:36864
> segment[2].mem:0x1366000 memsz:4096 <---- new address of device tree
> segment
>
> And now the sha256 verification is always successful for the FreeBSD loader
> too.
> But still no idea what actually corrupts the device tree segment when it's
> located at the top of the boot memory region. And why it happens on Linux
> 3.7 and Linux 3.8 but not on Linux 3.3.8.
Have you looked at the actual data that ends up being written there?
It may give a clue...
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
^ permalink raw reply
* Re: [PATCH 2/2] of: use platform_device_add
From: Russell King - ARM Linux @ 2013-02-17 10:19 UTC (permalink / raw)
To: Shawn Guo
Cc: Jason Gunthorpe, linux-kernel, Rob Herring, Greg Kroah-Hartman,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <20130217074317.GB16632@S2101-09.ap.freescale.net>
On Sun, Feb 17, 2013 at 03:43:20PM +0800, Shawn Guo wrote:
> It also breaks all of_amba_device users.
>
> of_amba_device_create() --> amba_device_add() --> request_resource()
> and fails.
Presumably that's because we no longer know what the parent resource
is supposed to be?
^ permalink raw reply
* Re: [PATCH 2/2] of: use platform_device_add
From: Shawn Guo @ 2013-02-17 7:43 UTC (permalink / raw)
To: Grant Likely
Cc: linux-kernel, Rob Herring, Jason Gunthorpe, Greg Kroah-Hartman,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <20130217030331.GA15048@S2101-09.ap.freescale.net>
On Sun, Feb 17, 2013 at 11:03:35AM +0800, Shawn Guo wrote:
> On Fri, Jan 18, 2013 at 01:40:00AM +0000, Grant Likely wrote:
> > This allows platform_device_add a chance to call insert_resource on all
> > of the resources from OF. At a minimum this fills in proc/iomem and
> > presumably makes resource tracking and conflict detection work better.
> > However, it has the side effect of moving all OF generated platform
> > devices from /sys/devices to /sys/devices/platform/. It /shouldn't/
> > break userspace because userspace is not supposed to depend on the full
> > path (because userspace always does what it is supposed to, right?).
> >
> > This may cause breakage if either:
> > 1) any two nodes in a given device tree have overlapping & staggered
> > regions (ie. 0x80..0xbf and 0xa0..0xdf; where one is not contained
> > within the other). In this case one of the devices will fail to
> > register and an exception will be needed in platform_device_add() to
> > complain but not fail.
>
> Grant,
>
> The patch introduce a regression on imx6q boot.
It also breaks all of_amba_device users.
of_amba_device_create() --> amba_device_add() --> request_resource()
and fails.
Shawn
^ permalink raw reply
* Re: [PATCH] arch/powerpc/kernel: using %12.12s instead of %12s for avoiding memory overflow.
From: Chen Gang @ 2013-02-17 4:00 UTC (permalink / raw)
To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <5100B53C.3030109@asianux.com>
Hello relative members:
please give a glance to this patch, when you have time.
thanks.
:-)
gchen.
于 2013年01月24日 12:14, Chen Gang 写道:
>
> for tmp_part->header.name:
> it is "Terminating null required only for names < 12 chars".
> so need to limit the %.12s for it in printk
>
> additional info:
>
> %12s limit the width, not for the original string output length
> if name length is more than 12, it still can be fully displayed.
> if name length is less than 12, the ' ' will be filled before name.
>
> %.12s truly limit the original string output length (precision)
>
>
> Signed-off-by: Chen Gang <gang.chen@asianux.com>
> ---
> arch/powerpc/kernel/nvram_64.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index bec1e93..57bf6d2 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -202,7 +202,7 @@ static void __init nvram_print_partitions(char * label)
> printk(KERN_WARNING "--------%s---------\n", label);
> printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n");
> list_for_each_entry(tmp_part, &nvram_partitions, partition) {
> - printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12s\n",
> + printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12.12s\n",
> tmp_part->index, tmp_part->header.signature,
> tmp_part->header.checksum, tmp_part->header.length,
> tmp_part->header.name);
>
--
Chen Gang
Asianux Corporation
^ permalink raw reply
* Re: [PATCH 2/2] of: use platform_device_add
From: Shawn Guo @ 2013-02-17 3:03 UTC (permalink / raw)
To: Grant Likely
Cc: linux-kernel, Rob Herring, Jason Gunthorpe, Greg Kroah-Hartman,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <1358473200-17886-2-git-send-email-grant.likely@secretlab.ca>
On Fri, Jan 18, 2013 at 01:40:00AM +0000, Grant Likely wrote:
> This allows platform_device_add a chance to call insert_resource on all
> of the resources from OF. At a minimum this fills in proc/iomem and
> presumably makes resource tracking and conflict detection work better.
> However, it has the side effect of moving all OF generated platform
> devices from /sys/devices to /sys/devices/platform/. It /shouldn't/
> break userspace because userspace is not supposed to depend on the full
> path (because userspace always does what it is supposed to, right?).
>
> This may cause breakage if either:
> 1) any two nodes in a given device tree have overlapping & staggered
> regions (ie. 0x80..0xbf and 0xa0..0xdf; where one is not contained
> within the other). In this case one of the devices will fail to
> register and an exception will be needed in platform_device_add() to
> complain but not fail.
Grant,
The patch introduce a regression on imx6q boot. The IOMUXC block on
imx6q is special. It acts not only a pin controller but also a system
controller with a bunch of system level registers in there. That's why
we currently have the following two nodes in imx6q device tree with the
same start "reg" address, which work with drivers/mfd/syscon.c and
drivers/pinctrl/pinctrl-imx6q.c respectively.
gpr: iomuxc-gpr@020e0000 {
compatible = "fsl,imx6q-iomuxc-gpr", "syscon";
reg = <0x020e0000 0x38>;
};
iomuxc: iomuxc@020e0000 {
compatible = "fsl,imx6q-iomuxc";
reg = <0x020e0000 0x4000>;
};
With the patch in place, pinctrl-imx6q fails to register like below.
syscon 20e0000.iomuxc: syscon regmap start 0x20e0000 end 0x20e3fff registered
imx6q-pinctrl 20e0000.iomuxc: can't request region for resource [mem 0x020e0000-0x020e3fff]
imx6q-pinctrl: probe of 20e0000.iomuxc failed with error -16
Shawn
> 2) any device calls request_mem_region() on a region larger than
> specified in the device tree. In this case the device node may be
> wrong, or the driver is overreaching. In either case I'd like to know
> about any problems and fix them.
^ permalink raw reply
* RE: [PATCH] powerpc/85xx: dts - add ranges property for SEC
From: Liu Po-B43644 @ 2013-02-17 2:40 UTC (permalink / raw)
To: Kumar Gala, Phillips Kim-R1AAHA; +Cc: linuxppc-dev@ozlabs.org
In-Reply-To: <3D277CC6-2C2C-46E7-B0F8-FAECED8BC868@kernel.crashing.org>
Hi Kim,
Thank you for the fixing.=20
Best regards,
Liu Po
- 8038
-----Original Message-----
From: Kumar Gala [mailto:galak@kernel.crashing.org]=20
Sent: Wednesday, February 13, 2013 1:27 AM
To: Phillips Kim-R1AAHA
Cc: Liu Po-B43644; linuxppc-dev@ozlabs.org
Subject: Re: [PATCH] powerpc/85xx: dts - add ranges property for SEC
On Jan 18, 2013, at 2:40 PM, Kim Phillips wrote:
> On Fri, 18 Jan 2013 17:16:13 +0800
> Po Liu <po.liu@freescale.com> wrote:
>=20
>> This facilitates getting the physical address of the SEC node.
>>=20
>> Signed-off-by: Liu po <po.liu@freescale.com>
>> ---
> Reviewed-by: Kim Phillips <kim.phillips@freescale.com>
>=20
> Kim
This was missing a trailing ';', so wondering if it was ever tested?
I fixed when I applied.
applied.
- k
^ permalink raw reply
* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-16 23:12 UTC (permalink / raw)
To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <511F652F.4090508@mail.ru>
I found new clues about the problem.
Normally the device tree memory segment is allocated at the top of the boot
memory region. The boot memory size on the PS3 console is 128MB.
root@ps3-linux:~# kexec -l loader.ps3
segment[0].mem:0x131d000 memsz:262144
segment[1].mem:0x135d000 memsz:36864
segment[2].mem:0x7fff000 memsz:4096
And the device tree is located at address 0x7fff000, it's the last page of the
boot memory.
I changed the kexec-tools and made it store the device tree just after the
purgatory code which is located at address 0x135d000. Like here:
root@ps3-linux:~# kexec -l loader.ps3
segment[0].mem:0x131d000 memsz:262144
segment[1].mem:0x135d000 memsz:36864
segment[2].mem:0x1366000 memsz:4096 <---- new address of device tree segment
And now the sha256 verification is always successful for the FreeBSD loader too.
But still no idea what actually corrupts the device tree segment when it's
located at the top of the boot memory region. And why it happens on Linux 3.7
and Linux 3.8 but not on Linux 3.3.8.
regards
^ permalink raw reply
* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-16 22:14 UTC (permalink / raw)
To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <511F652F.4090508@mail.ru>
Phileas Fogg wrote:
> I was able to capture the debug output from the purgatory code and it's very odd.
>
> This the SHA256 digest calculated by kexec-tools:
>
> root@ps3-linux:~# kexec -l loader.ps3
> Warning: append= option is not passed. Using the first kernel root partition
> Modified cmdline:
> Unable to find /proc/device-tree//chosen/linux,stdout-path, printing from
> purgatory is diabled
> segment[0].mem:0x131d000 memsz:262144
> segment[1].mem:0x135d000 memsz:36864
> segment[2].mem:0x7fff000 memsz:4096
> sha256_digest: 77 d5 30 a7 67 5f 67 93 f1 e0 ce 84 bd 4e 1b ec 3c 4a 9e 86 5c a1
> 33 87 9e b1 5f c8 91 ce e8 61
>
>
> And this is the debug output i'm always getting from the purgatory code:
>
> I'm in purgatory
> sha256 digests do not match :(
> digest: fd 4f df a8 af 5b e1 6b bc 51 5d b8 ab be 75 fb 76 fd 64 64 26
> 3e a8 9f 46 ec 91 de 05 4e 72 78
> sha256_digest: 00 39 e3 b2 45 0d 20 68 74 c2 4e ee e4 4a cf ec c3 78 4f 1c 65 ff
> a8 76 73 68 5d 01 70 0b b6 50
>
> regards
>
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
I was able to analyze the problem more and found out that the device tree memory
region gets corrupted. I slightly modified kexec-tools and made it first compute
a checksum of the first segment only where the new kernel is located.
And the checksum was always verified as correct in the purgatoroy code.
Then i made kexec-tools compute the checksum of the 3rd segment only where a
device tree is stored. And this time the verify function in the purgatory failed
always.
Output form the purgatory code:
--------------------------------
I'm in purgatory
sha256 digests do not match :(
digest: e3 b0 c4 42 98 fc 1c 14 9a fb f4 c8 99 6f b9 24 27 ae 41 e4 64
9b 93 4c a4 95 99 1b 78 52 b8 55
sha256_digest: 57 08 81 e7 62 c3 22 2f d9 1d 94 a5 d0 f7 53 8f fe 69 64 84 4d 71
2d aa e2 07 45 b3 78 79 6e 26
sha256_regions:
start=0x0000000007fff000 len=0x0000000000001000
The sha256_digest is actually the correct SHA256 checksum precomputed by
kexec-tools when the new kernel was given to the old kernel.
I will try to analyze the problem more later.
regards
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox