* [PATCH v3 17/45] powerpc/mm: PTE_ATOMIC_UPDATES is only for 40x
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Only 40x still uses PTE_ATOMIC_UPDATES.
40x cannot not select CONFIG_PTE64_BIT.
Drop handling of PTE_ATOMIC_UPDATES:
- In nohash/64
- In nohash/32 for CONFIG_PTE_64BIT
Keep PTE_ATOMIC_UPDATES only for nohash/32 for !CONFIG_PTE_64BIT
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/nohash/32/pgtable.h | 17 ------------
arch/powerpc/include/asm/nohash/64/pgtable.h | 28 +-------------------
2 files changed, 1 insertion(+), 44 deletions(-)
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b04ba257fddb..523c4c3876c5 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -262,25 +262,8 @@ static inline unsigned long long pte_update(pte_t *p,
unsigned long clr,
unsigned long set)
{
-#ifdef PTE_ATOMIC_UPDATES
- unsigned long long old;
- unsigned long tmp;
-
- __asm__ __volatile__("\
-1: lwarx %L0,0,%4\n\
- lwzx %0,0,%3\n\
- andc %1,%L0,%5\n\
- or %1,%1,%6\n"
- PPC405_ERR77(0,%3)
-" stwcx. %1,0,%4\n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" ((unsigned long)(p) + 4), "r" (clr), "r" (set), "m" (*p)
- : "cc" );
-#else /* PTE_ATOMIC_UPDATES */
unsigned long long old = pte_val(*p);
*p = __pte((old & ~(unsigned long long)clr) | set);
-#endif /* !PTE_ATOMIC_UPDATES */
#ifdef CONFIG_44x
if ((old & _PAGE_USER) && (old & _PAGE_EXEC))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 9a33b8bd842d..9c703b140d64 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -211,22 +211,9 @@ static inline unsigned long pte_update(struct mm_struct *mm,
unsigned long set,
int huge)
{
-#ifdef PTE_ATOMIC_UPDATES
- unsigned long old, tmp;
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3 # pte_update\n\
- andc %1,%0,%4 \n\
- or %1,%1,%6\n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*ptep)
- : "r" (ptep), "r" (clr), "m" (*ptep), "r" (set)
- : "cc" );
-#else
unsigned long old = pte_val(*ptep);
*ptep = __pte((old & ~clr) | set);
-#endif
+
/* huge pages use the old page table lock */
if (!huge)
assert_pte_locked(mm, addr);
@@ -310,21 +297,8 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long bits = pte_val(entry) &
(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
-#ifdef PTE_ATOMIC_UPDATES
- unsigned long old, tmp;
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%4\n\
- or %0,%3,%0\n\
- stdcx. %0,0,%4\n\
- bne- 1b"
- :"=&r" (old), "=&r" (tmp), "=m" (*ptep)
- :"r" (bits), "r" (ptep), "m" (*ptep)
- :"cc");
-#else
unsigned long old = pte_val(*ptep);
*ptep = __pte(old | bits);
-#endif
flush_tlb_page(vma, address);
}
--
2.25.0
^ permalink raw reply related
* [PATCH v3 16/45] powerpc/mm: Fix conditions to perform MMU specific management by blocks on PPC32.
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Setting init mem to NX shall depend on sinittext being mapped by
block, not on stext being mapped by block.
Setting text and rodata to RO shall depend on stext being mapped by
block, not on sinittext being mapped by block.
Fixes: 63b2bc619565 ("powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/pgtable_32.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 9934659cb871..bd0cb6e3573e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -185,7 +185,7 @@ void mark_initmem_nx(void)
unsigned long numpages = PFN_UP((unsigned long)_einittext) -
PFN_DOWN((unsigned long)_sinittext);
- if (v_block_mapped((unsigned long)_stext + 1))
+ if (v_block_mapped((unsigned long)_sinittext))
mmu_mark_initmem_nx();
else
change_page_attr(page, numpages, PAGE_KERNEL);
@@ -197,7 +197,7 @@ void mark_rodata_ro(void)
struct page *page;
unsigned long numpages;
- if (v_block_mapped((unsigned long)_sinittext)) {
+ if (v_block_mapped((unsigned long)_stext + 1)) {
mmu_mark_rodata_ro();
ptdump_check_wx();
return;
--
2.25.0
^ permalink raw reply related
* [PATCH v3 15/45] powerpc/mm: Allocate static page tables for fixmap
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Allocate static page tables for the fixmap area. This allows
setting mappings through page tables before memblock is ready.
That's needed to use early_ioremap() early and to use standard
page mappings with fixmap.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/fixmap.h | 4 ++++
arch/powerpc/kernel/setup_32.c | 2 +-
arch/powerpc/mm/pgtable_32.c | 16 ++++++++++++++++
3 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index 2ef155a3c821..ccbe2e83c950 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -86,6 +86,10 @@ enum fixed_addresses {
#define __FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXMAP_ALIGNED_SIZE (ALIGN(FIXADDR_TOP, PGDIR_SIZE) - \
+ ALIGN_DOWN(FIXADDR_START, PGDIR_SIZE))
+#define FIXMAP_PTE_SIZE (FIXMAP_ALIGNED_SIZE / PGDIR_SIZE * PTE_TABLE_SIZE)
+
#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NCG
#define FIXMAP_PAGE_IO PAGE_KERNEL_NCG
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 305ca89d856f..fee167d2701f 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -80,7 +80,7 @@ notrace void __init machine_init(u64 dt_ptr)
/* Configure static keys first, now that we're relocated. */
setup_feature_keys();
- early_ioremap_setup();
+ early_ioremap_init();
/* Enable early debugging if any specified (see udbg.h) */
udbg_early_init();
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index f62de06e3d07..9934659cb871 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -29,11 +29,27 @@
#include <asm/fixmap.h>
#include <asm/setup.h>
#include <asm/sections.h>
+#include <asm/early_ioremap.h>
#include <mm/mmu_decl.h>
extern char etext[], _stext[], _sinittext[], _einittext[];
+static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
+
+notrace void __init early_ioremap_init(void)
+{
+ unsigned long addr = ALIGN_DOWN(FIXADDR_START, PGDIR_SIZE);
+ pte_t *ptep = (pte_t *)early_fixmap_pagetable;
+ pmd_t *pmdp = pmd_ptr_k(addr);
+
+ for (; (s32)(FIXADDR_TOP - addr) > 0;
+ addr += PGDIR_SIZE, ptep += PTRS_PER_PTE, pmdp++)
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ early_ioremap_setup();
+}
+
static void __init *early_alloc_pgtable(unsigned long size)
{
void *ptr = memblock_alloc(size, size);
--
2.25.0
^ permalink raw reply related
* [PATCH v3 14/45] powerpc/32s: Don't warn when mapping RO data ROX.
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Mapping RO data as ROX is not an issue since that data
cannot be modified to introduce an exploit.
PPC64 accepts to have RO data mapped ROX, as a trade off
between kernel size and strictness of protection.
On PPC32, kernel size is even more critical as amount of
memory is usually small.
Depending on the number of available IBATs, the last IBATs
might overflow the end of text. Only warn if it crosses
the end of RO data.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/book3s32/mmu.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 39ba53ca5bb5..a9b2cbc74797 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -187,6 +187,7 @@ void mmu_mark_initmem_nx(void)
int i;
unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
+ unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
unsigned long size;
if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
@@ -201,9 +202,10 @@ void mmu_mark_initmem_nx(void)
size = block_size(base, top);
size = max(size, 128UL << 10);
if ((top - base) > size) {
- if (strict_kernel_rwx_enabled())
- pr_warn("Kernel _etext not properly aligned\n");
size <<= 1;
+ if (strict_kernel_rwx_enabled() && base + size > border)
+ pr_warn("Some RW data is getting mapped X. "
+ "Adjust CONFIG_DATA_SHIFT to avoid that.\n");
}
setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
base += size;
--
2.25.0
^ permalink raw reply related
* [PATCH v3 13/45] powerpc/ptdump: Handle hugepd at PGD level
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
The 8xx is about to map kernel linear space and IMMR using huge
pages.
In order to display those pages properly, ptdump needs to handle
hugepd tables at PGD level.
For the time being do it only at PGD level. Further patches may
add handling of hugepd tables at lower level for other platforms
when needed in the future.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v3: notepage() now takes page size instead of page shift
---
arch/powerpc/mm/ptdump/ptdump.c | 29 ++++++++++++++++++++++++++---
1 file changed, 26 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 98d82dcf6f0b..5fc880e30175 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -23,6 +23,7 @@
#include <linux/const.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
+#include <asm/hugetlb.h>
#include <mm/mmu_decl.h>
@@ -269,6 +270,26 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
}
}
+static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
+ int pdshift, int level)
+{
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+ unsigned int i;
+ int shift = hugepd_shift(*phpd);
+ int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;
+
+ if (start & ((1 << shift) - 1))
+ return;
+
+ for (i = 0; i < ptrs_per_hpd; i++) {
+ unsigned long addr = start + (i << shift);
+ pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
+
+ note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
+ }
+#endif
+}
+
static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
{
pmd_t *pmd = pmd_offset(pud, 0);
@@ -312,11 +333,13 @@ static void walk_pagetables(struct pg_state *st)
* the hash pagetable.
*/
for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
- if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
+ if (pgd_none(*pgd) || pgd_is_leaf(*pgd))
+ note_page(st, addr, 1, pgd_val(*pgd), PGDIR_SIZE);
+ else if (is_hugepd(__hugepd(pgd_val(*pgd))))
+ walk_hugepd(st, (hugepd_t *)pgd, addr, PGDIR_SHIFT, 1);
+ else
/* pgd exists */
walk_pud(st, pgd, addr);
- else
- note_page(st, addr, 1, pgd_val(*pgd), PGDIR_SIZE);
}
}
--
2.25.0
^ permalink raw reply related
* [PATCH v3 06/45] powerpc/kasan: Declare kasan_init_region() weak
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
In order to alloc sub-arches to alloc KASAN regions using optimised
methods (Huge pages on 8xx, BATs on BOOK3S, ...), declare
kasan_init_region() weak.
Also make kasan_init_shadow_page_tables() accessible from outside,
so that it can be called from the specific kasan_init_region()
functions if needed.
And populate remaining KASAN address space only once performed
the region mapping, to allow 8xx to allocate hugepd instead of
standard page tables for mapping via 8M hugepages.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/kasan.h | 3 +++
arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++----------
2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 4769bbf7173a..107a24c3f7b3 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -34,5 +34,8 @@ static inline void kasan_init(void) { }
static inline void kasan_late_init(void) { }
#endif
+int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end);
+int kasan_init_region(void *start, size_t size);
+
#endif /* __ASSEMBLY */
#endif
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 10481d904fea..76d418af4ce8 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -28,7 +28,7 @@ static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
__set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
}
-static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
+int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
{
pmd_t *pmd;
unsigned long k_cur, k_next;
@@ -52,7 +52,7 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
return 0;
}
-static int __init kasan_init_region(void *start, size_t size)
+int __init __weak kasan_init_region(void *start, size_t size)
{
unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
@@ -122,14 +122,6 @@ static void __init kasan_mmu_init(void)
int ret;
struct memblock_region *reg;
- if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ||
- IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
- ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
-
- if (ret)
- panic("kasan: kasan_init_shadow_page_tables() failed");
- }
-
for_each_memblock(memory, reg) {
phys_addr_t base = reg->base;
phys_addr_t top = min(base + reg->size, total_lowmem);
@@ -141,6 +133,15 @@ static void __init kasan_mmu_init(void)
if (ret)
panic("kasan: kasan_init_region() failed");
}
+
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ||
+ IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ if (ret)
+ panic("kasan: kasan_init_shadow_page_tables() failed");
+ }
+
}
void __init kasan_init(void)
--
2.25.0
^ permalink raw reply related
* [PATCH v3 12/45] powerpc/ptdump: Properly handle non standard page size
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
In order to properly display information regardless of the page size,
it is necessary to take into account real page size.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Fixes: cabe8138b23c ("powerpc: dump as a single line areas mapping a single physical page.")
Cc: stable@vger.kernel.org
---
v3: Fixed sizes which were shifted one level (went unoticed on PPC32 as PMD and PUD level don't exist)
---
arch/powerpc/mm/ptdump/ptdump.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 1f97668853e3..98d82dcf6f0b 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -60,6 +60,7 @@ struct pg_state {
unsigned long start_address;
unsigned long start_pa;
unsigned long last_pa;
+ unsigned long page_size;
unsigned int level;
u64 current_flags;
bool check_wx;
@@ -168,9 +169,9 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
#endif
pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
- if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) {
+ if (st->start_pa == st->last_pa && st->start_address + st->page_size != addr) {
pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa);
- delta = PAGE_SIZE >> 10;
+ delta = st->page_size >> 10;
} else {
pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
delta = (addr - st->start_address) >> 10;
@@ -195,7 +196,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
}
static void note_page(struct pg_state *st, unsigned long addr,
- unsigned int level, u64 val)
+ unsigned int level, u64 val, unsigned long page_size)
{
u64 flag = val & pg_level[level].mask;
u64 pa = val & PTE_RPN_MASK;
@@ -207,6 +208,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
st->start_address = addr;
st->start_pa = pa;
st->last_pa = pa;
+ st->page_size = page_size;
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
/*
* Dump the section of virtual memory when:
@@ -218,7 +220,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
*/
} else if (flag != st->current_flags || level != st->level ||
addr >= st->marker[1].start_address ||
- (pa != st->last_pa + PAGE_SIZE &&
+ (pa != st->last_pa + st->page_size &&
(pa != st->start_pa || st->start_pa != st->last_pa))) {
/* Check the PTE flags */
@@ -246,6 +248,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
st->start_address = addr;
st->start_pa = pa;
st->last_pa = pa;
+ st->page_size = page_size;
st->current_flags = flag;
st->level = level;
} else {
@@ -261,7 +264,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
addr = start + i * PAGE_SIZE;
- note_page(st, addr, 4, pte_val(*pte));
+ note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
}
}
@@ -278,7 +281,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
/* pmd exists */
walk_pte(st, pmd, addr);
else
- note_page(st, addr, 3, pmd_val(*pmd));
+ note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
}
}
@@ -294,7 +297,7 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
/* pud exists */
walk_pmd(st, pud, addr);
else
- note_page(st, addr, 2, pud_val(*pud));
+ note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
}
}
@@ -313,7 +316,7 @@ static void walk_pagetables(struct pg_state *st)
/* pgd exists */
walk_pud(st, pgd, addr);
else
- note_page(st, addr, 1, pgd_val(*pgd));
+ note_page(st, addr, 1, pgd_val(*pgd), PGDIR_SIZE);
}
}
@@ -368,7 +371,7 @@ static int ptdump_show(struct seq_file *m, void *v)
/* Traverse kernel page tables */
walk_pagetables(&st);
- note_page(&st, 0, 0, 0);
+ note_page(&st, 0, 0, 0, 0);
return 0;
}
--
2.25.0
^ permalink raw reply related
* [PATCH v3 09/45] powerpc/ptdump: Add _PAGE_COHERENT flag
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
For platforms using shared.c (4xx, Book3e, Book3s/32),
also handle the _PAGE_COHERENT flag with corresponds to the
M bit of the WIMG flags.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/ptdump/shared.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index dab5d8028a9b..634b83aa3487 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -40,6 +40,11 @@ static const struct flag_info flag_array[] = {
.val = _PAGE_NO_CACHE,
.set = "i",
.clear = " ",
+ }, {
+ .mask = _PAGE_COHERENT,
+ .val = _PAGE_COHERENT,
+ .set = "m",
+ .clear = " ",
}, {
.mask = _PAGE_GUARDED,
.val = _PAGE_GUARDED,
--
2.25.0
^ permalink raw reply related
* [PATCH v3 07/45] powerpc/ptdump: Limit size of flags text to 1/2 chars on PPC32
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
In order to have all flags fit on a 80 chars wide screen,
reduce the flags to 1 char (2 where ambiguous).
No cache is 'i'
User is 'ur' (Supervisor would be sr)
Shared (for 8xx) becomes 'sh' (it was 'user' when not shared but
that was ambiguous because that's not entirely right)
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/ptdump/8xx.c | 33 ++++++++++++++++---------------
arch/powerpc/mm/ptdump/shared.c | 35 +++++++++++++++++----------------
2 files changed, 35 insertions(+), 33 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 9e2d8e847d6e..ca9ce94672f5 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -12,9 +12,9 @@
static const struct flag_info flag_array[] = {
{
.mask = _PAGE_SH,
- .val = 0,
- .set = "user",
- .clear = " ",
+ .val = _PAGE_SH,
+ .set = "sh",
+ .clear = " ",
}, {
.mask = _PAGE_RO | _PAGE_NA,
.val = 0,
@@ -30,37 +30,38 @@ static const struct flag_info flag_array[] = {
}, {
.mask = _PAGE_EXEC,
.val = _PAGE_EXEC,
- .set = " X ",
- .clear = " ",
+ .set = "x",
+ .clear = " ",
}, {
.mask = _PAGE_PRESENT,
.val = _PAGE_PRESENT,
- .set = "present",
- .clear = " ",
+ .set = "p",
+ .clear = " ",
}, {
.mask = _PAGE_GUARDED,
.val = _PAGE_GUARDED,
- .set = "guarded",
- .clear = " ",
+ .set = "g",
+ .clear = " ",
}, {
.mask = _PAGE_DIRTY,
.val = _PAGE_DIRTY,
- .set = "dirty",
- .clear = " ",
+ .set = "d",
+ .clear = " ",
}, {
.mask = _PAGE_ACCESSED,
.val = _PAGE_ACCESSED,
- .set = "accessed",
- .clear = " ",
+ .set = "a",
+ .clear = " ",
}, {
.mask = _PAGE_NO_CACHE,
.val = _PAGE_NO_CACHE,
- .set = "no cache",
- .clear = " ",
+ .set = "i",
+ .clear = " ",
}, {
.mask = _PAGE_SPECIAL,
.val = _PAGE_SPECIAL,
- .set = "special",
+ .set = "s",
+ .clear = " ",
}
};
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index f7ed2f187cb0..44a8a64a664f 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -13,8 +13,8 @@ static const struct flag_info flag_array[] = {
{
.mask = _PAGE_USER,
.val = _PAGE_USER,
- .set = "user",
- .clear = " ",
+ .set = "ur",
+ .clear = " ",
}, {
.mask = _PAGE_RW,
.val = _PAGE_RW,
@@ -23,42 +23,43 @@ static const struct flag_info flag_array[] = {
}, {
.mask = _PAGE_EXEC,
.val = _PAGE_EXEC,
- .set = " X ",
- .clear = " ",
+ .set = "x",
+ .clear = " ",
}, {
.mask = _PAGE_PRESENT,
.val = _PAGE_PRESENT,
- .set = "present",
- .clear = " ",
+ .set = "p",
+ .clear = " ",
}, {
.mask = _PAGE_GUARDED,
.val = _PAGE_GUARDED,
- .set = "guarded",
- .clear = " ",
+ .set = "g",
+ .clear = " ",
}, {
.mask = _PAGE_DIRTY,
.val = _PAGE_DIRTY,
- .set = "dirty",
- .clear = " ",
+ .set = "d",
+ .clear = " ",
}, {
.mask = _PAGE_ACCESSED,
.val = _PAGE_ACCESSED,
- .set = "accessed",
- .clear = " ",
+ .set = "a",
+ .clear = " ",
}, {
.mask = _PAGE_WRITETHRU,
.val = _PAGE_WRITETHRU,
- .set = "write through",
- .clear = " ",
+ .set = "w",
+ .clear = " ",
}, {
.mask = _PAGE_NO_CACHE,
.val = _PAGE_NO_CACHE,
- .set = "no cache",
- .clear = " ",
+ .set = "i",
+ .clear = " ",
}, {
.mask = _PAGE_SPECIAL,
.val = _PAGE_SPECIAL,
- .set = "special",
+ .set = "s",
+ .clear = " ",
}
};
--
2.25.0
^ permalink raw reply related
* [PATCH v3 05/45] powerpc/kasan: Refactor update of early shadow mappings
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
kasan_remap_early_shadow_ro() and kasan_unmap_early_shadow_vmalloc()
are both updating the early shadow mapping: the first one sets
the mapping read-only while the other clears the mapping.
Refactor and create kasan_update_early_region()
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/kasan/kasan_init_32.c | 39 +++++++++++++--------------
1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 91e2ade75192..10481d904fea 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -79,45 +79,42 @@ static int __init kasan_init_region(void *start, size_t size)
return 0;
}
-static void __init kasan_remap_early_shadow_ro(void)
+static void __init
+kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte)
{
- pgprot_t prot = kasan_prot_ro();
- unsigned long k_start = KASAN_SHADOW_START;
- unsigned long k_end = KASAN_SHADOW_END;
unsigned long k_cur;
phys_addr_t pa = __pa(kasan_early_shadow_page);
- kasan_populate_pte(kasan_early_shadow_pte, prot);
-
- for (k_cur = k_start & PAGE_MASK; k_cur != k_end; k_cur += PAGE_SIZE) {
+ for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_ptr_k(k_cur);
pte_t *ptep = pte_offset_kernel(pmd, k_cur);
if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
continue;
- __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+ __set_pte_at(&init_mm, k_cur, ptep, pte, 0);
}
- flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ flush_tlb_kernel_range(k_start, k_end);
}
-static void __init kasan_unmap_early_shadow_vmalloc(void)
+static void __init kasan_remap_early_shadow_ro(void)
{
- unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START);
- unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
- unsigned long k_cur;
+ pgprot_t prot = kasan_prot_ro();
phys_addr_t pa = __pa(kasan_early_shadow_page);
- for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
- pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
- pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+ kasan_populate_pte(kasan_early_shadow_pte, prot);
- if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
- continue;
+ kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END,
+ pfn_pte(PHYS_PFN(pa), prot));
+}
- __set_pte_at(&init_mm, k_cur, ptep, __pte(0), 0);
- }
- flush_tlb_kernel_range(k_start, k_end);
+static void __init kasan_unmap_early_shadow_vmalloc(void)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
+
+ kasan_update_early_region(k_start, k_end, __pte(0));
}
static void __init kasan_mmu_init(void)
--
2.25.0
^ permalink raw reply related
* [PATCH v3 08/45] powerpc/ptdump: Reorder flags
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Reorder flags in a more logical way:
- Page size (huge) first
- User
- RWX
- Present
- WIMG
- Special
- Dirty and Accessed
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/ptdump/8xx.c | 30 +++++++++++++++---------------
arch/powerpc/mm/ptdump/shared.c | 30 +++++++++++++++---------------
2 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index ca9ce94672f5..a3169677dced 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -11,11 +11,6 @@
static const struct flag_info flag_array[] = {
{
- .mask = _PAGE_SH,
- .val = _PAGE_SH,
- .set = "sh",
- .clear = " ",
- }, {
.mask = _PAGE_RO | _PAGE_NA,
.val = 0,
.set = "rw",
@@ -37,11 +32,26 @@ static const struct flag_info flag_array[] = {
.val = _PAGE_PRESENT,
.set = "p",
.clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "i",
+ .clear = " ",
}, {
.mask = _PAGE_GUARDED,
.val = _PAGE_GUARDED,
.set = "g",
.clear = " ",
+ }, {
+ .mask = _PAGE_SH,
+ .val = _PAGE_SH,
+ .set = "sh",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "s",
+ .clear = " ",
}, {
.mask = _PAGE_DIRTY,
.val = _PAGE_DIRTY,
@@ -52,16 +62,6 @@ static const struct flag_info flag_array[] = {
.val = _PAGE_ACCESSED,
.set = "a",
.clear = " ",
- }, {
- .mask = _PAGE_NO_CACHE,
- .val = _PAGE_NO_CACHE,
- .set = "i",
- .clear = " ",
- }, {
- .mask = _PAGE_SPECIAL,
- .val = _PAGE_SPECIAL,
- .set = "s",
- .clear = " ",
}
};
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index 44a8a64a664f..dab5d8028a9b 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -30,21 +30,6 @@ static const struct flag_info flag_array[] = {
.val = _PAGE_PRESENT,
.set = "p",
.clear = " ",
- }, {
- .mask = _PAGE_GUARDED,
- .val = _PAGE_GUARDED,
- .set = "g",
- .clear = " ",
- }, {
- .mask = _PAGE_DIRTY,
- .val = _PAGE_DIRTY,
- .set = "d",
- .clear = " ",
- }, {
- .mask = _PAGE_ACCESSED,
- .val = _PAGE_ACCESSED,
- .set = "a",
- .clear = " ",
}, {
.mask = _PAGE_WRITETHRU,
.val = _PAGE_WRITETHRU,
@@ -55,11 +40,26 @@ static const struct flag_info flag_array[] = {
.val = _PAGE_NO_CACHE,
.set = "i",
.clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "g",
+ .clear = " ",
}, {
.mask = _PAGE_SPECIAL,
.val = _PAGE_SPECIAL,
.set = "s",
.clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "d",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "a",
+ .clear = " ",
}
};
--
2.25.0
^ permalink raw reply related
* [PATCH v3 11/45] powerpc/ptdump: Standardise display of BAT flags
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Display BAT flags the same way as page flags: rwx and wimg
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/ptdump/bats.c | 37 ++++++++++++++---------------------
1 file changed, 15 insertions(+), 22 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index d6c660f63d71..cebb58c7e289 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -15,12 +15,12 @@
static char *pp_601(int k, int pp)
{
if (pp == 0)
- return k ? "NA" : "RWX";
+ return k ? " " : "rwx";
if (pp == 1)
- return k ? "ROX" : "RWX";
+ return k ? "r x" : "rwx";
if (pp == 2)
- return k ? "RWX" : "RWX";
- return k ? "ROX" : "ROX";
+ return "rwx";
+ return "r x";
}
static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
@@ -48,12 +48,9 @@ static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, pp));
- if (lower & _PAGE_WRITETHRU)
- seq_puts(m, "write through ");
- if (lower & _PAGE_NO_CACHE)
- seq_puts(m, "no cache ");
- if (lower & _PAGE_COHERENT)
- seq_puts(m, "coherent ");
+ seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " ");
+ seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " ");
+ seq_puts(m, lower & _PAGE_COHERENT ? "m " : " ");
seq_puts(m, "\n");
}
@@ -101,20 +98,16 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool
seq_puts(m, "Kernel/User ");
if (lower & BPP_RX)
- seq_puts(m, is_d ? "RO " : "EXEC ");
+ seq_puts(m, is_d ? "r " : " x ");
else if (lower & BPP_RW)
- seq_puts(m, is_d ? "RW " : "EXEC ");
+ seq_puts(m, is_d ? "rw " : " x ");
else
- seq_puts(m, is_d ? "NA " : "NX ");
-
- if (lower & _PAGE_WRITETHRU)
- seq_puts(m, "write through ");
- if (lower & _PAGE_NO_CACHE)
- seq_puts(m, "no cache ");
- if (lower & _PAGE_COHERENT)
- seq_puts(m, "coherent ");
- if (lower & _PAGE_GUARDED)
- seq_puts(m, "guarded ");
+ seq_puts(m, is_d ? " " : " ");
+
+ seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " ");
+ seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " ");
+ seq_puts(m, lower & _PAGE_COHERENT ? "m " : " ");
+ seq_puts(m, lower & _PAGE_GUARDED ? "g " : " ");
seq_puts(m, "\n");
}
--
2.25.0
^ permalink raw reply related
* [PATCH v3 10/45] powerpc/ptdump: Display size of BATs
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Display the size of areas mapped with BATs.
For that, the size display for pages is refactorised.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: Add missing include of linux/seq_file.h (Thanks to kbuild test robot)
---
arch/powerpc/mm/ptdump/bats.c | 4 ++++
arch/powerpc/mm/ptdump/ptdump.c | 23 ++++++++++++++---------
arch/powerpc/mm/ptdump/ptdump.h | 3 +++
3 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index d3a5d6b318d1..d6c660f63d71 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -10,6 +10,8 @@
#include <asm/pgtable.h>
#include <asm/cpu_has_feature.h>
+#include "ptdump.h"
+
static char *pp_601(int k, int pp)
{
if (pp == 0)
@@ -42,6 +44,7 @@ static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
#else
seq_printf(m, "0x%08x ", pbn);
#endif
+ pt_dump_size(m, size);
seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, pp));
@@ -88,6 +91,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool
#else
seq_printf(m, "0x%08x ", brpn);
#endif
+ pt_dump_size(m, size);
if (k == 1)
seq_puts(m, "User ");
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index d92bb8ea229c..1f97668853e3 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -112,6 +112,19 @@ static struct addr_marker address_markers[] = {
seq_putc(m, c); \
})
+void pt_dump_size(struct seq_file *m, unsigned long size)
+{
+ static const char units[] = "KMGTPE";
+ const char *unit = units;
+
+ /* Work out what appropriate unit to use */
+ while (!(size & 1023) && unit[1]) {
+ size >>= 10;
+ unit++;
+ }
+ pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
+}
+
static void dump_flag_info(struct pg_state *st, const struct flag_info
*flag, u64 pte, int num)
{
@@ -146,8 +159,6 @@ static void dump_flag_info(struct pg_state *st, const struct flag_info
static void dump_addr(struct pg_state *st, unsigned long addr)
{
- static const char units[] = "KMGTPE";
- const char *unit = units;
unsigned long delta;
#ifdef CONFIG_PPC64
@@ -164,13 +175,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
delta = (addr - st->start_address) >> 10;
}
- /* Work out what appropriate unit to use */
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
- }
- pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit);
-
+ pt_dump_size(st->seq, delta);
}
static void note_prot_wx(struct pg_state *st, unsigned long addr)
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
index 5d513636de73..154efae96ae0 100644
--- a/arch/powerpc/mm/ptdump/ptdump.h
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/types.h>
+#include <linux/seq_file.h>
struct flag_info {
u64 mask;
@@ -17,3 +18,5 @@ struct pgtable_level {
};
extern struct pgtable_level pg_level[5];
+
+void pt_dump_size(struct seq_file *m, unsigned long delta);
--
2.25.0
^ permalink raw reply related
* [PATCH v3 00/45] Use hugepages to map kernel mem on 8xx
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
The main purpose of this big series is to:
- reorganise huge page handling to avoid using mm_slices.
- use huge pages to map kernel memory on the 8xx.
The 8xx supports 4 page sizes: 4k, 16k, 512k and 8M.
It uses 2 Level page tables, PGD having 1024 entries, each entry
covering 4M address space. Then each page table has 1024 entries.
At the time being, page sizes are managed in PGD entries, implying
the use of mm_slices as it can't mix several pages of the same size
in one page table.
The first purpose of this series is to reorganise things so that
standard page tables can also handle 512k pages. This is done by
adding a new _PAGE_HUGE flag which will be copied into the Level 1
entry in the TLB miss handler. That done, we have 2 types of pages:
- PGD entries to regular page tables handling 4k/16k and 512k pages
- PGD entries to hugepd tables handling 8M pages.
There is no need to mix 8M pages with other sizes, because a 8M page
will use more than what a single PGD covers.
Then comes the second purpose of this series. At the time being, the
8xx has implemented special handling in the TLB miss handlers in order
to transparently map kernel linear address space and the IMMR using
huge pages by building the TLB entries in assembly at the time of the
exception.
As mm_slices is only for user space pages, and also because it would
anyway not be convenient to slice kernel address space, it was not
possible to use huge pages for kernel address space. But after step
one of the series, it is now more flexible to use huge pages.
This series drop all assembly 'just in time' handling of huge pages
and use huge pages in page tables instead.
Once the above is done, then comes the cherry on cake:
- Use huge pages for KASAN shadow mapping
- Allow pinned TLBs with strict kernel rwx
- Allow pinned TLBs with debug pagealloc
Then, last but not least, those modifications for the 8xx allows the
following improvement on book3s/32:
- Mapping KASAN shadow with BATs
- Allowing BATs with debug pagealloc
All this allows to considerably simplify TLB miss handlers and associated
initialisation. The overhead of reading page tables is negligible
compared to the reduction of the miss handlers.
While we were at touching pte_update(), some cleanup was done
there too.
Tested widely on 8xx and 832x. Boot tested on QEMU MAC99.
Changes in v3:
- Fixed the handling of leaf pages page size which didn't build on PPC64 and was invisibily bogus on PPC32 (patch 12)
Changes in v2:
- Selecting HUGETLBFS instead of HUGETLB_PAGE which leads to link failure.
- Rebase on latest powerpc/merge branch
- Reworked the way TLB 28 to 31 are pinned because it was not working.
Christophe Leroy (45):
powerpc/kasan: Fix error detection on memory allocation
powerpc/kasan: Fix issues by lowering KASAN_SHADOW_END
powerpc/kasan: Fix shadow pages allocation failure
powerpc/kasan: Remove unnecessary page table locking
powerpc/kasan: Refactor update of early shadow mappings
powerpc/kasan: Declare kasan_init_region() weak
powerpc/ptdump: Limit size of flags text to 1/2 chars on PPC32
powerpc/ptdump: Reorder flags
powerpc/ptdump: Add _PAGE_COHERENT flag
powerpc/ptdump: Display size of BATs
powerpc/ptdump: Standardise display of BAT flags
powerpc/ptdump: Properly handle non standard page size
powerpc/ptdump: Handle hugepd at PGD level
powerpc/32s: Don't warn when mapping RO data ROX.
powerpc/mm: Allocate static page tables for fixmap
powerpc/mm: Fix conditions to perform MMU specific management by
blocks on PPC32.
powerpc/mm: PTE_ATOMIC_UPDATES is only for 40x
powerpc/mm: Refactor pte_update() on nohash/32
powerpc/mm: Refactor pte_update() on book3s/32
powerpc/mm: Standardise __ptep_test_and_clear_young() params between
PPC32 and PPC64
powerpc/mm: Standardise pte_update() prototype between PPC32 and PPC64
powerpc/mm: Create a dedicated pte_update() for 8xx
powerpc/mm: Reduce hugepd size for 8M hugepages on 8xx
powerpc/8xx: Drop CONFIG_8xx_COPYBACK option
powerpc/8xx: Prepare handlers for _PAGE_HUGE for 512k pages.
powerpc/8xx: Manage 512k huge pages as standard pages.
powerpc/8xx: Only 8M pages are hugepte pages now
powerpc/8xx: MM_SLICE is not needed anymore
powerpc/8xx: Move PPC_PIN_TLB options into 8xx Kconfig
powerpc/8xx: Add function to set pinned TLBs
powerpc/8xx: Don't set IMMR map anymore at boot
powerpc/8xx: Always pin TLBs at startup.
powerpc/8xx: Drop special handling of Linear and IMMR mappings in I/D
TLB handlers
powerpc/8xx: Remove now unused TLB miss functions
powerpc/8xx: Move DTLB perf handling closer.
powerpc/mm: Don't be too strict with _etext alignment on PPC32
powerpc/8xx: Refactor kernel address boundary comparison
powerpc/8xx: Add a function to early map kernel via huge pages
powerpc/8xx: Map IMMR with a huge page
powerpc/8xx: Map linear memory with huge pages
powerpc/8xx: Allow STRICT_KERNEL_RwX with pinned TLB
powerpc/8xx: Allow large TLBs with DEBUG_PAGEALLOC
powerpc/8xx: Implement dedicated kasan_init_region()
powerpc/32s: Allow mapping with BATs with DEBUG_PAGEALLOC
powerpc/32s: Implement dedicated kasan_init_region()
arch/powerpc/Kconfig | 62 +--
arch/powerpc/configs/adder875_defconfig | 1 -
arch/powerpc/configs/ep88xc_defconfig | 1 -
arch/powerpc/configs/mpc866_ads_defconfig | 1 -
arch/powerpc/configs/mpc885_ads_defconfig | 1 -
arch/powerpc/configs/tqm8xx_defconfig | 1 -
arch/powerpc/include/asm/book3s/32/pgtable.h | 78 ++--
arch/powerpc/include/asm/fixmap.h | 4 +
arch/powerpc/include/asm/hugetlb.h | 4 -
arch/powerpc/include/asm/kasan.h | 10 +-
.../include/asm/nohash/32/hugetlb-8xx.h | 32 +-
arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 74 +---
arch/powerpc/include/asm/nohash/32/pgtable.h | 104 +++--
arch/powerpc/include/asm/nohash/32/pte-8xx.h | 4 +-
arch/powerpc/include/asm/nohash/32/slice.h | 20 -
arch/powerpc/include/asm/nohash/64/pgtable.h | 28 +-
arch/powerpc/include/asm/nohash/pgtable.h | 2 +-
arch/powerpc/include/asm/pgtable.h | 2 +
arch/powerpc/include/asm/slice.h | 2 -
arch/powerpc/kernel/head_8xx.S | 354 ++++++++----------
arch/powerpc/kernel/setup_32.c | 2 +-
arch/powerpc/kernel/vmlinux.lds.S | 3 +-
arch/powerpc/mm/book3s32/mmu.c | 12 +-
arch/powerpc/mm/hugetlbpage.c | 41 +-
arch/powerpc/mm/init_32.c | 12 +-
arch/powerpc/mm/kasan/8xx.c | 74 ++++
arch/powerpc/mm/kasan/Makefile | 2 +
arch/powerpc/mm/kasan/book3s_32.c | 57 +++
arch/powerpc/mm/kasan/kasan_init_32.c | 88 ++---
arch/powerpc/mm/mmu_decl.h | 4 +
arch/powerpc/mm/nohash/8xx.c | 226 ++++++-----
arch/powerpc/mm/pgtable.c | 34 +-
arch/powerpc/mm/pgtable_32.c | 22 +-
arch/powerpc/mm/ptdump/8xx.c | 52 +--
arch/powerpc/mm/ptdump/bats.c | 41 +-
arch/powerpc/mm/ptdump/ptdump.c | 71 +++-
arch/powerpc/mm/ptdump/ptdump.h | 3 +
arch/powerpc/mm/ptdump/shared.c | 58 +--
arch/powerpc/perf/8xx-pmu.c | 10 -
arch/powerpc/platforms/8xx/Kconfig | 50 ++-
arch/powerpc/platforms/Kconfig.cputype | 2 +-
arch/powerpc/sysdev/cpm_common.c | 2 +
42 files changed, 853 insertions(+), 798 deletions(-)
delete mode 100644 arch/powerpc/include/asm/nohash/32/slice.h
create mode 100644 arch/powerpc/mm/kasan/8xx.c
create mode 100644 arch/powerpc/mm/kasan/book3s_32.c
--
2.25.0
^ permalink raw reply
* [PATCH v3 02/45] powerpc/kasan: Fix issues by lowering KASAN_SHADOW_END
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
At the time being, KASAN_SHADOW_END is 0x100000000, which
is 0 in 32 bits representation.
This leads to a couple of issues:
- kasan_remap_early_shadow_ro() does nothing because the comparison
k_cur < k_end is always false.
- In ptdump, address comparison for markers display fails and the
marker's name is printed at the start of the KASAN area instead of
being printed at the end.
However, there is no need to shadow the KASAN shadow area itself,
so the KASAN shadow area can stop shadowing memory at the start
of itself.
With a PAGE_OFFSET set to 0xc0000000, KASAN shadow area is then going
from 0xf8000000 to 0xff000000.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Fixes: cbd18991e24f ("powerpc/mm: Fix an Oops in kasan_mmu_init()")
Cc: stable@vger.kernel.org
---
arch/powerpc/include/asm/kasan.h | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index fbff9ff9032e..fc900937f653 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -23,9 +23,7 @@
#define KASAN_SHADOW_OFFSET ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET)
-#define KASAN_SHADOW_END 0UL
-
-#define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START)
+#define KASAN_SHADOW_END (-(-KASAN_SHADOW_START >> KASAN_SHADOW_SCALE_SHIFT))
#ifdef CONFIG_KASAN
void kasan_early_init(void);
--
2.25.0
^ permalink raw reply related
* [PATCH v3 01/45] powerpc/kasan: Fix error detection on memory allocation
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
In case (k_start & PAGE_MASK) doesn't equal (kstart), 'va' will never be
NULL allthough 'block' is NULL
Check the return of memblock_alloc() directly instead of
the resulting address in the loop.
Fixes: 509cd3f2b473 ("powerpc/32: Simplify KASAN init")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/kasan/kasan_init_32.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index cbcad369fcb2..8b15fe09b967 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -76,15 +76,14 @@ static int __init kasan_init_region(void *start, size_t size)
return ret;
block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+ if (!block)
+ return -ENOMEM;
for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_ptr_k(k_cur);
void *va = block + k_cur - k_start;
pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
- if (!va)
- return -ENOMEM;
-
__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
}
flush_tlb_kernel_range(k_start, k_end);
--
2.25.0
^ permalink raw reply related
* [PATCH v3 04/45] powerpc/kasan: Remove unnecessary page table locking
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Commit 45ff3c559585 ("powerpc/kasan: Fix parallel loading of
modules.") added spinlocks to manage parallele module loading.
Since then commit 47febbeeec44 ("powerpc/32: Force KASAN_VMALLOC for
modules") converted the module loading to KASAN_VMALLOC.
The spinlocking has then become unneeded and can be removed to
simplify kasan_init_shadow_page_tables()
Also remove inclusion of linux/moduleloader.h and linux/vmalloc.h
which are not needed anymore since the removal of modules management.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/kasan/kasan_init_32.c | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index b7c287adfd59..91e2ade75192 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -5,9 +5,7 @@
#include <linux/kasan.h>
#include <linux/printk.h>
#include <linux/memblock.h>
-#include <linux/moduleloader.h>
#include <linux/sched/task.h>
-#include <linux/vmalloc.h>
#include <asm/pgalloc.h>
#include <asm/code-patching.h>
#include <mm/mmu_decl.h>
@@ -34,31 +32,22 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned
{
pmd_t *pmd;
unsigned long k_cur, k_next;
- pte_t *new = NULL;
pmd = pmd_ptr_k(k_start);
for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
+ pte_t *new;
+
k_next = pgd_addr_end(k_cur, k_end);
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
- if (!new)
- new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+ new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
if (!new)
return -ENOMEM;
kasan_populate_pte(new, PAGE_KERNEL);
-
- smp_wmb(); /* See comment in __pte_alloc */
-
- spin_lock(&init_mm.page_table_lock);
- /* Has another populated it ? */
- if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) {
- pmd_populate_kernel(&init_mm, pmd, new);
- new = NULL;
- }
- spin_unlock(&init_mm.page_table_lock);
+ pmd_populate_kernel(&init_mm, pmd, new);
}
return 0;
}
--
2.25.0
^ permalink raw reply related
* [PATCH v3 03/45] powerpc/kasan: Fix shadow pages allocation failure
From: Christophe Leroy @ 2020-05-11 11:25 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1589196133.git.christophe.leroy@csgroup.eu>
Doing kasan pages allocation in MMU_init is too early, kernel doesn't
have access yet to the entire memory space and memblock_alloc() fails
when the kernel is a bit big.
Do it from kasan_init() instead.
Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/kasan.h | 2 --
arch/powerpc/mm/init_32.c | 2 --
arch/powerpc/mm/kasan/kasan_init_32.c | 4 +++-
3 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index fc900937f653..4769bbf7173a 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -27,12 +27,10 @@
#ifdef CONFIG_KASAN
void kasan_early_init(void);
-void kasan_mmu_init(void);
void kasan_init(void);
void kasan_late_init(void);
#else
static inline void kasan_init(void) { }
-static inline void kasan_mmu_init(void) { }
static inline void kasan_late_init(void) { }
#endif
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 872df48ae41b..a6991ef8727d 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -170,8 +170,6 @@ void __init MMU_init(void)
btext_unmap();
#endif
- kasan_mmu_init();
-
setup_kup();
/* Shortly after that, the entire linear mapping will be available */
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 8b15fe09b967..b7c287adfd59 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -131,7 +131,7 @@ static void __init kasan_unmap_early_shadow_vmalloc(void)
flush_tlb_kernel_range(k_start, k_end);
}
-void __init kasan_mmu_init(void)
+static void __init kasan_mmu_init(void)
{
int ret;
struct memblock_region *reg;
@@ -159,6 +159,8 @@ void __init kasan_mmu_init(void)
void __init kasan_init(void)
{
+ kasan_mmu_init();
+
kasan_remap_early_shadow_ro();
clear_page(kasan_early_shadow_page);
--
2.25.0
^ permalink raw reply related
* Re: [PATCH] powerpc/kvm: silence kmemleak false positives
From: Michael Ellerman @ 2020-05-11 11:15 UTC (permalink / raw)
To: Qian Cai; +Cc: linux-kernel, kvm-ppc, Qian Cai, catalin.marinas, linuxppc-dev
In-Reply-To: <20200509015538.3183-1-cai@lca.pw>
Qian Cai <cai@lca.pw> writes:
> kvmppc_pmd_alloc() and kvmppc_pte_alloc() allocate some memory but then
> pud_populate() and pmd_populate() will use __pa() to reference the newly
> allocated memory. The same is in xive_native_provision_pages().
>
> Since kmemleak is unable to track the physical memory resulting in false
> positives, silence those by using kmemleak_ignore().
There is kmemleak_alloc_phys(), which according to the docs can be used
for tracking a phys address.
Did you try that?
cheers
> unreferenced object 0xc000201c382a1000 (size 4096):
> comm "qemu-kvm", pid 124828, jiffies 4295733767 (age 341.250s)
> hex dump (first 32 bytes):
> c0 00 20 09 f4 60 03 87 c0 00 20 10 72 a0 03 87 .. ..`.... .r...
> c0 00 20 0e 13 a0 03 87 c0 00 20 1b dc c0 03 87 .. ....... .....
> backtrace:
> [<000000004cc2790f>] kvmppc_create_pte+0x838/0xd20 [kvm_hv]
> kvmppc_pmd_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:366
> (inlined by) kvmppc_create_pte at arch/powerpc/kvm/book3s_64_mmu_radix.c:590
> [<00000000d123c49a>] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv]
> [<00000000bb549087>] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv]
> [<0000000086dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv]
> [<000000005ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv]
> [<00000000d22162ff>] kvmppc_vcpu_run+0x34/0x48 [kvm]
> [<00000000d6953bc4>] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm]
> [<000000002543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm]
> [<0000000048155cd6>] ksys_ioctl+0xd8/0x130
> [<0000000041ffeaa7>] sys_ioctl+0x28/0x40
> [<000000004afc4310>] system_call_exception+0x114/0x1e0
> [<00000000fb70a873>] system_call_common+0xf0/0x278
> unreferenced object 0xc0002001f0c03900 (size 256):
> comm "qemu-kvm", pid 124830, jiffies 4295735235 (age 326.570s)
> hex dump (first 32 bytes):
> c0 00 20 10 fa a0 03 87 c0 00 20 10 fa a1 03 87 .. ....... .....
> c0 00 20 10 fa a2 03 87 c0 00 20 10 fa a3 03 87 .. ....... .....
> backtrace:
> [<0000000023f675b8>] kvmppc_create_pte+0x854/0xd20 [kvm_hv]
> kvmppc_pte_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:356
> (inlined by) kvmppc_create_pte at arch/powerpc/kvm/book3s_64_mmu_radix.c:593
> [<00000000d123c49a>] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv]
> [<00000000bb549087>] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv]
> [<0000000086dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv]
> [<000000005ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv]
> [<00000000d22162ff>] kvmppc_vcpu_run+0x34/0x48 [kvm]
> [<00000000d6953bc4>] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm]
> [<000000002543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm]
> [<0000000048155cd6>] ksys_ioctl+0xd8/0x130
> [<0000000041ffeaa7>] sys_ioctl+0x28/0x40
> [<000000004afc4310>] system_call_exception+0x114/0x1e0
> [<00000000fb70a873>] system_call_common+0xf0/0x278
> unreferenced object 0xc000201b53e90000 (size 65536):
> comm "qemu-kvm", pid 124557, jiffies 4295650285 (age 364.370s)
> hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> backtrace:
> [<00000000acc2fb77>] xive_native_alloc_vp_block+0x168/0x210
> xive_native_provision_pages at arch/powerpc/sysdev/xive/native.c:645
> (inlined by) xive_native_alloc_vp_block at arch/powerpc/sysdev/xive/native.c:674
> [<000000004d5c7964>] kvmppc_xive_compute_vp_id+0x20c/0x3b0 [kvm]
> [<0000000055317cd2>] kvmppc_xive_connect_vcpu+0xa4/0x4a0 [kvm]
> [<0000000093dfc014>] kvm_arch_vcpu_ioctl+0x388/0x508 [kvm]
> [<00000000d25aea0f>] kvm_vcpu_ioctl+0x15c/0x950 [kvm]
> [<0000000048155cd6>] ksys_ioctl+0xd8/0x130
> [<0000000041ffeaa7>] sys_ioctl+0x28/0x40
> [<000000004afc4310>] system_call_exception+0x114/0x1e0
> [<00000000fb70a873>] system_call_common+0xf0/0x278
>
> Signed-off-by: Qian Cai <cai@lca.pw>
> ---
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 16 ++++++++++++++--
> arch/powerpc/sysdev/xive/native.c | 4 ++++
> 2 files changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index aa12cd4078b3..bc6c1aa3d0e9 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -353,7 +353,13 @@ static struct kmem_cache *kvm_pmd_cache;
>
> static pte_t *kvmppc_pte_alloc(void)
> {
> - return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
> + pte_t *pte;
> +
> + pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
> + /* pmd_populate() will only reference _pa(pte). */
> + kmemleak_ignore(pte);
> +
> + return pte;
> }
>
> static void kvmppc_pte_free(pte_t *ptep)
> @@ -363,7 +369,13 @@ static void kvmppc_pte_free(pte_t *ptep)
>
> static pmd_t *kvmppc_pmd_alloc(void)
> {
> - return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
> + pmd_t *pmd;
> +
> + pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
> + /* pud_populate() will only reference _pa(pmd). */
> + kmemleak_ignore(pmd);
> +
> + return pmd;
> }
>
> static void kvmppc_pmd_free(pmd_t *pmdp)
> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
> index 5218fdc4b29a..2d19f28967a6 100644
> --- a/arch/powerpc/sysdev/xive/native.c
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -18,6 +18,7 @@
> #include <linux/delay.h>
> #include <linux/cpumask.h>
> #include <linux/mm.h>
> +#include <linux/kmemleak.h>
>
> #include <asm/machdep.h>
> #include <asm/prom.h>
> @@ -647,6 +648,9 @@ static bool xive_native_provision_pages(void)
> pr_err("Failed to allocate provisioning page\n");
> return false;
> }
> + /* Kmemleak is unable to track the physical address. */
> + kmemleak_ignore(p);
> +
> opal_xive_donate_page(chip, __pa(p));
> }
> return true;
> --
> 2.21.0 (Apple Git-122.2)
^ permalink raw reply
* Re: [PATCH 02/31] arm64: fix the flush_icache_range arguments in machine_kexec
From: Catalin Marinas @ 2020-05-11 11:00 UTC (permalink / raw)
To: Will Deacon
Cc: linux-ia64, linux-sh, Roman Zippel, linux-mips, linux-mm,
sparclinux, linux-riscv, Christoph Hellwig, linux-arch,
linux-c6x-dev, linux-hexagon, x86, linux-xtensa, Arnd Bergmann,
Jessica Yu, linux-um, linux-m68k, openrisc, linux-arm-kernel,
Michal Simek, linux-kernel, james.morse, linux-alpha,
linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200511075115.GA16134@willie-the-truck>
On Mon, May 11, 2020 at 08:51:15AM +0100, Will Deacon wrote:
> On Sun, May 10, 2020 at 09:54:41AM +0200, Christoph Hellwig wrote:
> > The second argument is the end "pointer", not the length.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> > arch/arm64/kernel/machine_kexec.c | 1 +
> > 1 file changed, 1 insertion(+)
> >
> > diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> > index 8e9c924423b4e..a0b144cfaea71 100644
> > --- a/arch/arm64/kernel/machine_kexec.c
> > +++ b/arch/arm64/kernel/machine_kexec.c
> > @@ -177,6 +177,7 @@ void machine_kexec(struct kimage *kimage)
> > * the offline CPUs. Therefore, we must use the __* variant here.
> > */
> > __flush_icache_range((uintptr_t)reboot_code_buffer,
> > + (uintptr_t)reboot_code_buffer +
> > arm64_relocate_new_kernel_size);
>
> Urgh, well spotted. It's annoyingly different from __flush_dcache_area().
>
> But now I'm wondering what this code actually does... the loop condition
> in invalidate_icache_by_line works with 64-bit arithmetic, so we could
> spend a /very/ long time here afaict.
I think it goes through the loop only once. The 'b.lo' saves us here.
OTOH, there is no I-cache maintenance done.
> It's also a bit annoying that we do a bunch of redundant D-cache
> maintenance too. Should we use invalidate_icache_range() here instead?
Since we have the __flush_dcache_area() above it for cleaning to PoC, we
could use invalidate_icache_range() here. We probably didn't have this
function at the time, it was added for KVM (commit 4fee94736603cd6).
> (and why does that thing need to toggle uaccess)?
invalidate_icache_range() doesn't need to, it works on the kernel linear
map.
__flush_icache_range() doesn't need to either, that's a side-effect of
the fall-through implementation.
Anyway, I think Christoph's patch needs to go in with a fixes tag:
Fixes: d28f6df1305a ("arm64/kexec: Add core kexec support")
Cc: <stable@vger.kernel.org> # 4.8.x-
and we'll change these functions/helpers going forward for arm64.
Happy to pick this up via the arm64 for-next/fixes branch.
--
Catalin
^ permalink raw reply
* [PATCH v2] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Nicholas Piggin @ 2020-05-11 10:19 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Nicholas Piggin, Alan Modra
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return predictor.
Solve this by instead resuming execution at the signal trampoline which
then calls the signal handler. qtrace-tools link_stack checker confirms
the entire user/kernel/vdso cycle is balanced after this patch, whereas
it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it break
inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Cc: Alan Modra <amodra@gmail.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Since v1:
- Fix for the legacy on-stack trampoline path (TRAMP_TRACEBACK was not
updated to 4).
- Tested legacy case (must also disable no-exec stack to test this).
arch/powerpc/include/asm/ppc-opcode.h | 1 +
arch/powerpc/kernel/signal_64.c | 22 ++++++++++++----------
arch/powerpc/kernel/vdso64/sigtramp.S | 13 +++++--------
3 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..747b37f1ce09 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -329,6 +329,7 @@
#define PPC_INST_BLR 0x4e800020
#define PPC_INST_BLRL 0x4e800021
#define PPC_INST_BCTR 0x4e800420
+#define PPC_INST_BCTRL 0x4e800421
#define PPC_INST_MULLD 0x7c0001d2
#define PPC_INST_MULLW 0x7c0001d6
#define PPC_INST_MULHWU 0x7c000016
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index adfde59cf4ba..6bb14cdf256f 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -40,8 +40,8 @@
#define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
#define FP_REGS_SIZE sizeof(elf_fpregset_t)
-#define TRAMP_TRACEBACK 3
-#define TRAMP_SIZE 6
+#define TRAMP_TRACEBACK 4
+#define TRAMP_SIZE 7
/*
* When we have signals to deliver, we set up on the user stack,
@@ -603,13 +603,15 @@ static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
int i;
long err = 0;
+ /* bctrl # call the handler */
+ err |= __put_user(PPC_INST_BCTRL, &tramp[0]);
/* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */
err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) |
- (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]);
+ (__SIGNAL_FRAMESIZE & 0xffff), &tramp[1]);
/* li r0, __NR_[rt_]sigreturn| */
- err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[1]);
+ err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[2]);
/* sc */
- err |= __put_user(PPC_INST_SC, &tramp[2]);
+ err |= __put_user(PPC_INST_SC, &tramp[3]);
/* Minimal traceback info */
for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
@@ -867,12 +869,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
- regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
+ regs->nip = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
} else {
err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
if (err)
goto badframe;
- regs->link = (unsigned long) &frame->tramp[0];
+ regs->nip = (unsigned long) &frame->tramp[0];
}
/* Allocate a dummy caller frame for the signal handler. */
@@ -881,8 +883,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
/* Set up "regs" so we "return" to the signal handler. */
if (is_elf2_task()) {
- regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
- regs->gpr[12] = regs->nip;
+ regs->ctr = (unsigned long) ksig->ka.sa.sa_handler;
+ regs->gpr[12] = regs->ctr;
} else {
/* Handler is *really* a pointer to the function descriptor for
* the signal routine. The first entry in the function
@@ -892,7 +894,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
func_descr_t __user *funct_desc_ptr =
(func_descr_t __user *) ksig->ka.sa.sa_handler;
- err |= get_user(regs->nip, &funct_desc_ptr->entry);
+ err |= get_user(regs->ctr, &funct_desc_ptr->entry);
err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
}
diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
index a8cc0409d7d2..bbf68cd01088 100644
--- a/arch/powerpc/kernel/vdso64/sigtramp.S
+++ b/arch/powerpc/kernel/vdso64/sigtramp.S
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
* Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
*/
+#include <asm/cache.h> /* IFETCH_ALIGN_BYTES */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/unistd.h>
@@ -14,21 +15,17 @@
.text
-/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from
- the return address to get an address in the middle of the presumed
- call instruction. Since we don't have a call here, we artificially
- extend the range covered by the unwind info by padding before the
- real start. */
- nop
.balign 8
+ .balign IFETCH_ALIGN_BYTES
V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
-.Lsigrt_start = . - 4
+.Lsigrt_start:
+ bctrl /* call the handler */
addi r1, r1, __SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
.Lsigrt_end:
V_FUNCTION_END(__kernel_sigtramp_rt64)
-/* The ".balign 8" above and the following zeros mimic the old stack
+/* The .balign 8 above and the following zeros mimic the old stack
trampoline layout. The last magic value is the ucontext pointer,
chosen in such a way that older libgcc unwind code returns a zero
for a sigcontext pointer. */
--
2.23.0
^ permalink raw reply related
* Re: powerpc/pci: [PATCH 1/1]: PCIE PHB reset
From: Oliver O'Halloran @ 2020-05-11 10:17 UTC (permalink / raw)
To: wenxiong; +Cc: Brian King, linuxppc-dev, wenxiong
In-Reply-To: <1588857037-25950-1-git-send-email-wenxiong@linux.vnet.ibm.com>
On Fri, May 8, 2020 at 12:36 AM <wenxiong@linux.vnet.ibm.com> wrote:
>
> From: Wen Xiong <wenxiong@linux.vnet.ibm.com>
>
> Several device drivers hit EEH(Extended Error handling) when triggering
> kdump on Pseries PowerVM. This patch implemented a reset of the PHBs
> in pci general code. PHB reset stop all PCI transactions from previous
> kernel. We have tested the patch in several enviroments:
> - direct slot adapters
> - adapters under the switch
> - a VF adapter in PowerVM
> - a VF adapter/adapter in KVM guest.
>
> Signed-off-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
> ---
> arch/powerpc/platforms/pseries/pci.c | 153 +++++++++++++++++++++++++++
> 1 file changed, 153 insertions(+)
>
> diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
> index 911534b89c85..aac7f00696d2 100644
> --- a/arch/powerpc/platforms/pseries/pci.c
> +++ b/arch/powerpc/platforms/pseries/pci.c
> @@ -11,6 +11,8 @@
> #include <linux/kernel.h>
> #include <linux/pci.h>
> #include <linux/string.h>
> +#include <linux/crash_dump.h>
> +#include <linux/delay.h>
>
> #include <asm/eeh.h>
> #include <asm/pci-bridge.h>
> @@ -354,3 +356,154 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
>
> return 0;
> }
> +
> +/**
> + * pseries_get_pdn_addr - Retrieve PHB address
> + * @pe: EEH PE
> + *
> + * Retrieve the assocated PHB address. Actually, there're 2 RTAS
> + * function calls dedicated for the purpose. We need implement
> + * it through the new function and then the old one. Besides,
> + * you should make sure the config address is figured out from
> + * FDT node before calling the function.
> + *
> + */
> +static int pseries_get_pdn_addr(struct pci_controller *phb)
> +{
> + int ret = -1;
> + int rets[3];
> + int ibm_get_config_addr_info;
> + int ibm_get_config_addr_info2;
> + int config_addr = 0;
> + struct pci_dn *root_pdn, *pdn;
> +
> + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
> + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
> +
> + root_pdn = PCI_DN(phb->dn);
> + pdn = list_first_entry(&root_pdn->child_list, struct pci_dn, list);
> + config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
> +
> + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
> + /*
> + * First of all, we need to make sure there has one PE
> + * associated with the device. Otherwise, PE address is
> + * meaningless.
> + */
> + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
> + config_addr, BUID_HI(pdn->phb->buid),
> + BUID_LO(pdn->phb->buid), 1);
> + if (ret || (rets[0] == 0)) {
> + pr_warn("%s: Failed to get address for PHB#%x-PE# "
> + "option=%d config_addr=%x\n",
> + __func__, pdn->phb->global_number, 1, rets[0]);
> + return -1;
> + }
> +
> + /* Retrieve the associated PE config address */
> + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
> + config_addr, BUID_HI(pdn->phb->buid),
> + BUID_LO(pdn->phb->buid), 0);
> + if (ret) {
> + pr_warn("%s: Failed to get address for PHB#%x-PE# "
> + "option=%d config_addr=%x\n",
> + __func__, pdn->phb->global_number, 0, rets[0]);
> + return -1;
> + }
> + return rets[0];
> + }
> +
> + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
> + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
> + config_addr, BUID_HI(pdn->phb->buid),
> + BUID_LO(pdn->phb->buid), 0);
> + if (ret || rets[0]) {
> + pr_warn("%s: Failed to get address for PHB#%x-PE# "
> + "config_addr=%x\n",
> + __func__, pdn->phb->global_number, rets[0]);
> + return -1;
> + }
> + return rets[0];
> + }
> +
> + return ret;
> +}
I'd be nice if we could reduce the amount of duplication between this
function and eeh_pseries_get_pe_addr(). I was planning to re-working
the EEH version though so that this is fine for now.
> +
> +static int __init pseries_phb_reset(void)
> +{
> + struct pci_controller *phb;
> + int config_addr;
> + int ibm_set_slot_reset;
> + int ibm_configure_pe;
> + int ret;
> +
> + if (is_kdump_kernel() || reset_devices) {
> + pr_info("Issue PHB reset ...\n");
> + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
> + ibm_configure_pe = rtas_token("ibm,configure-pe");
> +
> + if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE ||
> + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) {
> + pr_info("%s: EEH functionality not supported\n",
> + __func__);
> + }
> +
> + list_for_each_entry(phb, &hose_list, list_node) {
> + config_addr = pseries_get_pdn_addr(phb);
> + if (config_addr == -1)
> + continue;
Considering we already cache the buid in the pci_controller we could
also cache the config_addr. That said, considering all this runs
precisely once at boot I'm not that bothered by calling
pseries_get_pdn_addr() again in each loop.
> +
> + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
> + config_addr, BUID_HI(phb->buid),
> + BUID_LO(phb->buid), EEH_RESET_FUNDAMENTAL);
> +
> + /* If fundamental-reset not supported, try hot-reset */
> + if (ret == -8)
> + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
> + config_addr, BUID_HI(phb->buid),
> + BUID_LO(phb->buid), EEH_RESET_HOT);
> +
> + if (ret) {
> + pr_err("%s: fail with rtas_call fundamental reset=%d\n",
> + __func__, ret);
> + continue;
> + }
> + }
> + msleep(EEH_PE_RST_SETTLE_TIME);
> +
> + list_for_each_entry(phb, &hose_list, list_node) {
> + config_addr = pseries_get_pdn_addr(phb);
> + if (config_addr == -1)
> + continue;
> +
> + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
> + config_addr, BUID_HI(phb->buid),
> + BUID_LO(phb->buid), EEH_RESET_DEACTIVATE);
> + if (ret) {
> + pr_err("%s: fail with rtas_call deactive=%d\n",
> + __func__, ret);
> + continue;
> + }
> + }
> + msleep(EEH_PE_RST_SETTLE_TIME);
> +
> + list_for_each_entry(phb, &hose_list, list_node) {
> + config_addr = pseries_get_pdn_addr(phb);
> + if (config_addr == -1)
> + continue;
> +
> + ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
> + config_addr, BUID_HI(phb->buid),
> + BUID_LO(phb->buid));
> + if (ret) {
> + pr_err("%s: fail with rtas_call configure_pe =%d\n",
> + __func__, ret);
> + continue;
> + }
> + }
> + }
> +
> + return 0;
> +}
> +postcore_initcall(pseries_phb_reset);
You probably should use machine_postcore_initcall(pseries,
pseries_phb_reset); so that this only gets run on pseries. Without the
machine type specifier it'll run on PowerNV too.
> +
> --
> 2.18.1
>
^ permalink raw reply
* Re: [PATCH v4 11/16] powerpc/64s: machine check interrupt update NMI accounting
From: Michael Ellerman @ 2020-05-11 9:50 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev, kbuild test robot; +Cc: kbuild-all
In-Reply-To: <1589010505.dk8cddftjn.astroid@bobo.none>
Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from kbuild test robot's message of May 9, 2020 1:13 pm:
>> Hi Nicholas,
>>
>> I love your patch! Yet something to improve:
>
> ...
>
>> 1419 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
>> 1420 pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
>> 1421 #else
>> 1422 pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
>> 1423 #endif
>> 1424 #ifdef CONFIG_PPC64
>>> 1425 pr_cont("IRQMASK: %lx IN_NMI:%d IN_MCE:%d", regs->softe, (int)get_paca()->in_nmi, (int)get_paca()->in_mce);
>
> Oh I meant to get rid of that hunk, it crept back in :(
>
> mpe if you could please take it out if you're merging this.
Yep. I just came here to tell you I'd dropped that hunk :)
> It was quite useful for debugging this stuff, I might do a proper patch
> for this, but for now not necessary (it doesn't matter for "normal"
> crashes only crash crashes).
Yeah would be good to print more of those flags.
cheers
^ permalink raw reply
* Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics
From: Emanuele Giuseppe Esposito @ 2020-05-11 9:37 UTC (permalink / raw)
To: Paolo Bonzini, Jonathan Adams
Cc: linux-s390, kvm list, David Hildenbrand, Cornelia Huck,
Emanuele Giuseppe Esposito, LKML, kvm-ppc, linux-mips,
Christian Borntraeger, Alexander Viro, David Rientjes,
linux-fsdevel, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <29982969-92f6-b6d0-aeae-22edb401e3ac@redhat.com>
On 5/8/20 11:44 AM, Paolo Bonzini wrote:
> So in general I'd say the sources/values model holds up. We certainly
> want to:
>
> - switch immediately to callbacks instead of the type constants (so that
> core statsfs code only does signed/unsigned)
>
> - add a field to distinguish cumulative and floating properties (and use
> it to determine the default file mode)
>
> - add a new argument to statsfs_create_source and statsfs_create_values
> that makes it not create directories and files respectively
>
> - add a new API to look for a statsfs_value recursively in all the
> subordinate sources, and pass the source/value pair to a callback
> function; and reimplement recursive aggregation and clear in terms of
> this function.
Ok I will apply this, thank you for all the suggestions.
I will post the v3 patchset in the next few weeks.
In the meanwhile, I wrote the documentation you asked (even though it's
going to change in v3), you can find it here:
https://github.com/esposem/linux/commit/dfa92f270f1aed73d5f3b7f12640b2a1635c711f
Thank you,
Emanuele
^ permalink raw reply
* [PATCH v3 2/2] powerpc/64s/hash: add torture_hpt kernel boot option to increase hash faults
From: Nicholas Piggin @ 2020-05-11 9:20 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20200511092040.1339667-1-npiggin@gmail.com>
This option increases the number of hash misses by limiting the number of
kernel HPT entries, by accessing the address immediately after installing
the PTE, then removing it again (except in the case of CI entries that
must not be accessed, these are removed on the next hash fault).
This helps stress test difficult to hit paths in the kernel.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
v3:
- Remove the dead code that didn't work on QEMU, and comment it instead.
.../admin-guide/kernel-parameters.txt | 9 ++++
arch/powerpc/include/asm/book3s/64/mmu-hash.h | 11 ++++
arch/powerpc/mm/book3s64/hash_4k.c | 3 ++
arch/powerpc/mm/book3s64/hash_64k.c | 8 +++
arch/powerpc/mm/book3s64/hash_utils.c | 54 ++++++++++++++++++-
5 files changed, 84 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8dd4260746dc..f51ed836954f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -876,6 +876,15 @@
them frequently to increase the rate of SLB faults
on kernel addresses.
+ torture_hpt [PPC]
+ Limits the number of kernel HPT entries in the hash
+ page table to increase the rate of hash page table
+ faults on kernel addresses.
+
+ This may hang when run on processors / emulators which
+ do not have a TLB, or flush it more often than
+ required, QEMU seems to have problems.
+
disable= [IPV6]
See Documentation/networking/ipv6.txt.
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 758de1e0f676..3d50ab629bde 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -8,6 +8,7 @@
* PPC64 rework.
*/
+#include <linux/jump_label.h>
#include <asm/page.h>
#include <asm/bug.h>
#include <asm/asm-const.h>
@@ -324,6 +325,16 @@ static inline bool torture_slb(void)
return static_branch_unlikely(&torture_slb_key);
}
+extern bool torture_hpt_enabled;
+DECLARE_STATIC_KEY_FALSE(torture_hpt_key);
+static inline bool torture_hpt(void)
+{
+ return static_branch_unlikely(&torture_hpt_key);
+}
+
+void hpt_do_torture(unsigned long ea, unsigned long access,
+ unsigned long rflags, unsigned long hpte_group);
+
/*
* This computes the AVPN and B fields of the first dword of a HPTE,
* for use when we want to match an existing PTE. The bottom 7 bits
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
index 22e787123cdf..54e4ff8c558d 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -118,6 +118,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (torture_hpt())
+ hpt_do_torture(ea, access, rflags, hpte_group);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
index 7084ce2951e6..19ea0fc145a9 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -216,6 +216,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
new_pte |= H_PAGE_HASHPTE;
+ if (torture_hpt())
+ hpt_do_torture(ea, access, rflags, hpte_group);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -327,7 +330,12 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+ if (torture_hpt())
+ hpt_do_torture(ea, access, rflags, hpte_group);
}
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 9c487b5782ef..ffe364269b8a 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -353,8 +353,12 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
return ret;
}
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __read_mostly = false;
bool torture_slb_enabled __read_mostly = false;
+bool torture_hpt_enabled __read_mostly = false;
+
+/* per-CPU array allocated if we enable torture_hpt. */
+static unsigned long *torture_hpt_last_group;
static int __init parse_disable_1tb_segments(char *p)
{
@@ -370,6 +374,13 @@ static int __init parse_torture_slb(char *p)
}
early_param("torture_slb", parse_torture_slb);
+static int __init parse_torture_hpt(char *p)
+{
+ torture_hpt_enabled = true;
+ return 0;
+}
+early_param("torture_hpt", parse_torture_hpt);
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
@@ -863,6 +874,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
}
DEFINE_STATIC_KEY_FALSE(torture_slb_key);
+DEFINE_STATIC_KEY_FALSE(torture_hpt_key);
static void __init htab_initialize(void)
{
@@ -882,6 +894,15 @@ static void __init htab_initialize(void)
if (torture_slb_enabled)
static_branch_enable(&torture_slb_key);
+ if (torture_hpt_enabled) {
+ unsigned long tmp;
+ static_branch_enable(&torture_hpt_key);
+ tmp = memblock_phys_alloc_range(sizeof(unsigned long) * NR_CPUS,
+ 0,
+ 0, MEMBLOCK_ALLOC_ANYWHERE);
+ memset((void *)tmp, 0xff, sizeof(unsigned long) * NR_CPUS);
+ torture_hpt_last_group = __va(tmp);
+ }
/*
* Calculate the required size of the htab. We want the number of
@@ -1901,6 +1922,37 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
return slot;
}
+void hpt_do_torture(unsigned long ea, unsigned long access,
+ unsigned long rflags, unsigned long hpte_group)
+{
+ unsigned long last_group;
+ int cpu = raw_smp_processor_id();
+
+ last_group = torture_hpt_last_group[cpu];
+ if (last_group != -1UL) {
+ while (mmu_hash_ops.hpte_remove(last_group) != -1)
+ ;
+ torture_hpt_last_group[cpu] = -1UL;
+ }
+
+ if (ea >= PAGE_OFFSET) {
+ /*
+ * We would really like to prefetch here to get the TLB loaded,
+ * then remove the PTE before returning to userspace, to
+ * increase the hash fault rate.
+ *
+ * Unfortunately QEMU TCG does not model the TLB in a way that
+ * makes this possible, and systemsim (mambo) emulator does not
+ * bring in TLBs with prefetches (although loads/stores do
+ * work for non-CI PTEs).
+ *
+ * So remember this PTE and clear it on the next hash fault.
+ */
+ torture_hpt_last_group[cpu] = hpte_group;
+ }
+}
+
+
#ifdef CONFIG_DEBUG_PAGEALLOC
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
--
2.23.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox